linux代碼版本:linux4.4
glibc代碼版本:glibc-2.26
導讀:在linux核心态也搞了好幾年了,公司的新平台也都轉向了使用者态,發展趨勢也是linux的工作量越來越少,更多的工作也将聚焦在業務上。其實無論是在核心态還是使用者态程式設計,對于嵌入式程式設計來說,沒有本質差別,嵌入式産品上也不太會去做資料庫等之類的應用,大多還是基于裝置檔案操作、程序通信、socket等實作業務邏輯。在核心态面臨的踩記憶體當機問題,使用者态一樣會面臨。核心态用kdb,使用者态用gdb,當熟悉核心的一些知識後,再做使用者态程式設計,感覺要容易一些,當你打開一個檔案操作時,看到open、read、write、select就會想到核心的實作。尤其是gdb功能比kdb強大太多了,有種将刀劍換成沖鋒槍的感覺。再發生踩記憶體,死的也隻是該程序,不再是當機,調試起來效率高得多。使用者态的接口也就那麼些,很快看完了。相對于一直搞使用者态程式設計的同僚,搞過核心就總想看一下使用者态接口的實作。下面從fork()函數開始。
一、fork 函數在哪?
正要看 fork 函數的實作,就遇到了大麻煩,在核心代碼裡怎麼也找不到 fork 函數的實體。原以為能夠直接搜尋到 fork 函數,然後裡面再調用系統調用進入核心态,全局搜尋後也沒有找到 fork 函數定義,隻在 fork.c 裡面找到了
/*
 * fork() system call entry point as found in the kernel's fork.c.
 * With CONFIG_MMU it forwards to _do_fork() with SIGCHLD as the only
 * flag and zero/NULL for every other argument; without an MMU it
 * fails with -EINVAL.
 */
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
#else
/* can not support in nommu mode */
return -EINVAL;
#endif
}
再看 SYSCALL_DEFINE0 定義
/*
 * Declares a zero-argument system call: emits SYSCALL_METADATA for
 * _<sname> and then opens the definition of
 * "asmlinkage long sys_<sname>(void)"; the function body is whatever
 * follows the macro invocation.
 */
#define SYSCALL_DEFINE0(sname) \
SYSCALL_METADATA(_##sname, 0); \
asmlinkage long sys_##sname(void)
展開說就是
asmlinkage long sys_fork(void);
然後搜尋 sys_fork
#define __NR_fork 2
__SYSCALL(__NR_fork, sys_fork)
直接注冊成了系統調用,然後就再也找不到 fork 究竟在哪裡定義了。代碼搜不到,那就隻能寫一段代碼,用 gdb 跟蹤一下,
Breakpoint 1, main (argc=1, argv=0x7fffffffe4f8) at test.c:30
30 pid = fork();
(gdb) s
__libc_fork () at ../sysdeps/nptl/fork.c:49
49 ../sysdeps/nptl/fork.c: No such file or directory.
(gdb) bt
#0 __libc_fork () at ../sysdeps/nptl/fork.c:49
#1 0x0000555555554b54 in main (argc=1, argv=0x7fffffffe4f8) at test.c:30
(gdb) s
61 in ../sysdeps/nptl/fork.c
(gdb) s
66 in ../sysdeps/nptl/fork.c
很明顯,調用到了 __libc_fork ,也就是在 glibc 裡面,然後發現下面的調用
/*
 * Issue the fork itself: use the architecture-provided ARCH_FORK ()
 * when it is defined.  The #else branch aborts the build with #error,
 * so the plain INLINE_SYSCALL (fork, 0) line after it is never
 * actually compiled.
 */
#ifdef ARCH_FORK
pid = ARCH_FORK ();
#else
# error "ARCH_FORK must be defined so that the CLONE_SETTID flag is used"
pid = INLINE_SYSCALL (fork, 0);
#endif
看下ia64下的定義
/*
 * Statement-expression wrapper around a system call (ia64 variant).
 * Expands DO_INLINE_SYSCALL_NCS with the syscall number __NR_<name>;
 * when _r10 is -1 the value in _retval is stored into errno and the
 * expression evaluates to -1, otherwise it evaluates to _retval.
 * _r10 and _retval are not declared here (presumably introduced by
 * DO_INLINE_SYSCALL_NCS - confirm against its definition).
 */
#define INLINE_SYSCALL(name, nr, args...) \
({ \
DO_INLINE_SYSCALL_NCS (__NR_##name, nr, args) \
if (_r10 == -1) \
{ \
__set_errno (_retval); \
_retval = -1; \
} \
_retval; })
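INLINE_SYSCALL (fork, 0) 調用的是 __NR_fork 系統調用,正好對應核心的 sys_fork 函數!不過要注意:上面 #else 分支裡有 #error,也就是說 glibc 要求各架構必須定義 ARCH_FORK,實際執行的是 ARCH_FORK ()。例如在 x86_64 上,ARCH_FORK 被定義為帶 CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD 标志的 clone 系統調用,進入核心後走的是 sys_clone,最終同樣落到 _do_fork 。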
其實無論 do_fork 、sys_fork、sys_vfork 最終調用的都是 _do_fork ,隻是傳遞參數不同而已,
/*
 * Thin wrapper that forwards all five arguments unchanged to
 * _do_fork() and passes 0 for the extra trailing tls argument.
 * Returns whatever _do_fork() returns.
 */
long do_fork(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr)
{
return _do_fork(clone_flags, stack_start, stack_size,
parent_tidptr, child_tidptr, 0);
}
#endif
/*
 * Create a kernel thread.
 *
 * Calls _do_fork() with CLONE_VM and CLONE_UNTRACED added to the
 * caller's flags.  The thread function fn and its argument arg are
 * passed in the stack_start and stack_size argument positions, and
 * both tid pointers are NULL.  Returns the value of _do_fork().
 */
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
(unsigned long)arg, NULL, NULL, 0);
}
/*
 * fork() system call, only built when the architecture defines
 * __ARCH_WANT_SYS_FORK.  With CONFIG_MMU it calls _do_fork() with
 * SIGCHLD as the only flag; without an MMU it returns -EINVAL.
 */
#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
#else
/* can not support in nommu mode */
return -EINVAL;
#endif
}
#endif
/*
 * vfork() system call, only built when the architecture defines
 * __ARCH_WANT_SYS_VFORK.  Same call as fork() above except that
 * CLONE_VFORK and CLONE_VM are added to SIGCHLD in the flags.
 */
#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
0, NULL, NULL, 0);
}
#endif
上面可以看出,vfork 相比 fork 多了個CLONE_VFORK 和 CLONE_VM 的傳參
下面看 _do_fork :
/*
 * Common implementation behind fork, vfork, do_fork and kernel_thread.
 *
 * Picks the ptrace event to report (if any), builds the child with
 * copy_process(), wakes it up and, for CLONE_VFORK, waits on the vfork
 * completion before returning.
 *
 * Returns the child's pid number as computed by pid_vnr(), or the
 * error code extracted from copy_process()'s ERR_PTR on failure.
 */
long _do_fork(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr,
unsigned long tls)
{
struct task_struct *p;
int trace = 0;
long nr;
/*
* Determine whether and which event to report to ptracer. When
* called from kernel_thread or CLONE_UNTRACED is explicitly
* requested, no event is reported; otherwise, report if the event
* for the type of forking is enabled.
*/
/*
 * kernel_thread() always sets CLONE_UNTRACED and therefore skips this
 * block; fork and vfork do not set it and therefore enter it.
 */
if (!(clone_flags & CLONE_UNTRACED)) {
/* vfork sets CLONE_VFORK */
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
/*
 * fork and vfork both pass SIGCHLD in the CSIGNAL bits; any other
 * value there is reported as a clone event instead of a fork event.
 */
else if ((clone_flags & CSIGNAL) != SIGCHLD)
trace = PTRACE_EVENT_CLONE;
else
trace = PTRACE_EVENT_FORK;
/* Drop the event again unless it is enabled for the current task. */
if (likely(!ptrace_event_enabled(current, trace)))
trace = 0;
}
/* Build the child by copying the parent's task_struct. */
p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
*/
if (!IS_ERR(p)) {
struct completion vfork;
struct pid *pid;
cpufreq_task_times_alloc(p);
trace_sched_process_fork(current, p);
pid = get_task_pid(p, PIDTYPE_PID);
nr = pid_vnr(pid);
/* Store the child's pid number through the parent's parent_tidptr. */
if (clone_flags & CLONE_PARENT_SETTID)
put_user(nr, parent_tidptr);
/*
 * For vfork: hook the on-stack completion into the child and take a
 * reference on its task_struct before it is woken up.
 */
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
get_task_struct(p);
}
/* Wake up the newly created task. */
wake_up_new_task(p);
/* forking complete and child started to run, tell ptracer */
if (unlikely(trace))
ptrace_event_pid(trace, pid);
/*
 * For vfork: block here until the vfork completion is signalled,
 * which is why the new child gets to run before the parent continues.
 */
if (clone_flags & CLONE_VFORK) {
if (!wait_for_vfork_done(p, &vfork))
ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}
put_pid(pid);
} else {
nr = PTR_ERR(p);
}
return nr;
}
很明顯,通過 copy_process 複制父程序并完成各項初始化後,會喚醒子程序;如果是 vfork ,父程序還會通過 wait_for_vfork_done 等待子程序(直到子程序退出或調用 exec 釋放共享的記憶體空間為止),這也就是 vfork 建立的子程序先執行的原因。
copy_process 函數完成了主要工作,還是比較複雜的。下表是傳遞的 flags 的一些解釋:
Flag name | Description |
---|---|
CLONE_VM | Shares the memory descriptor and all Page Tables . |
CLONE_FS | Shares the table that identifies the root directory and the current working directory, as well as the value of the bitmask used to mask the initial file permissions of a new file (the so-called file umask ). |
CLONE_FILES | Shares the table that identifies the open files . |
CLONE_SIGHAND | Shares the tables that identify the signal handlers and the blocked and pending signals . If this flag is true, the CLONE_VM flag must also be set. |
CLONE_PTRACE | If traced, the parent wants the child to be traced too. Furthermore, the debugger may want to trace the child on its own; in this case, the kernel forces the flag to 1. |
CLONE_VFORK | Set when the system call issued is a vfork( ) . |
CLONE_PARENT | Sets the parent of the child (parent and real_parent fields in the process descriptor) to the parent of the calling process. |
CLONE_THREAD | Inserts the child into the same thread group of the parent, and forces the child to share the signal descriptor of the parent. The child's tgid and group_leader fields are set accordingly. If this flag is true, the CLONE_SIGHAND flag must also be set. |
CLONE_NEWNS | Set if the clone needs its own namespace, that is, its own view of the mounted filesystems ; it is not possible to specify both CLONE_NEWNS and CLONE_FS . |
CLONE_SYSVSEM | Shares the System V IPC undoable semaphore operations . |
CLONE_SETTLS | Creates a new Thread Local Storage (TLS) segment for the lightweight process; the segment is described in the structure pointed to by the tls parameter. |
CLONE_PARENT_SETTID | Writes the PID of the child into the User Mode variable of the parent pointed to by the ptid parameter. |
CLONE_CHILD_CLEARTID | When set, the kernel sets up a mechanism to be triggered when the child process will exit or when it will start executing a new program. In these cases, the kernel will clear the User Mode variable pointed to by the ctid parameter and will awaken any process waiting for this event. |
CLONE_DETACHED | A legacy flag ignored by the kernel. |
CLONE_UNTRACED | Set by the kernel to override the value of the CLONE_PTRACE flag (used for disabling tracing of kernel threads ). |
CLONE_CHILD_SETTID | Writes the PID of the child into the User Mode variable of the child pointed to by the ctid parameter. |
CLONE_STOPPED | Forces the child to start in the TASK_STOPPED state. |
/*
 * copy_process - build the task_struct for a new child of 'current'.
 *
 * Rejects contradictory clone_flags combinations, duplicates the
 * current task_struct, then copies or shares each resource through the
 * copy_*() helpers according to clone_flags, allocates a pid and links
 * the new task into the process lists.  The new task is made visible
 * but is not woken up here.
 *
 * Returns the new task on success or ERR_PTR(-errno) on failure; on
 * failure the bad_fork_* labels undo the completed steps in reverse
 * order.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
int trace,
unsigned long tls,
int node)
{
int retval;
struct task_struct *p;
void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
/*
 * Argument sanity checks.  CLONE_NEWNS asks for a private mount
 * namespace while CLONE_FS asks to share the parent's filesystem
 * information (root and current directory): the two contradict.
 */
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
/*
 * Likewise a new user namespace cannot be combined with sharing the
 * parent's root and current directory.
 */
if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
return ERR_PTR(-EINVAL);
/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
*/
/* CLONE_THREAD (same thread group as the parent) requires CLONE_SIGHAND. */
if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
return ERR_PTR(-EINVAL);
/*
* Shared signal handlers imply shared VM. By way of the above,
* thread groups also imply shared VM. Blocking this case allows
* for various simplifications in other code.
*/
/* Sharing the signal handler table requires sharing the address space. */
if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
return ERR_PTR(-EINVAL);
/*
* Siblings of global init remain as zombies on exit since they are
* not reaped by their parent (swapper). To solve this and to avoid
* multi-rooted process trees, prevent global and container-inits
* from creating siblings.
*/
/*
 * A caller flagged SIGNAL_UNKILLABLE may not use CLONE_PARENT (which
 * would make the child a sibling of the caller).
 */
if ((clone_flags & CLONE_PARENT) &&
current->signal->flags & SIGNAL_UNKILLABLE)
return ERR_PTR(-EINVAL);
/*
* If the new process will be in a different pid or user namespace
* do not allow it to share a thread group with the forking task.
*/
/* A new user/pid namespace cannot be combined with CLONE_THREAD. */
if (clone_flags & CLONE_THREAD) {
if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
(task_active_pid_ns(current) !=
current->nsproxy->pid_ns_for_children))
return ERR_PTR(-EINVAL);
}
/*
 * Security check: ask the Linux Security Module (LSM) layer, the hook
 * framework SELinux is built on, whether the current task may create
 * a new task.
 */
retval = security_task_create(clone_flags);
if (retval)
goto fork_out;
retval = -ENOMEM;
p = dup_task_struct(current, node);
/*
 * NOTE: the braced block that follows is the body of
 * dup_task_struct(orig, node), pasted inline by the article so it can
 * be read in place.  It is not part of copy_process() itself: 'orig'
 * is dup_task_struct's first parameter ('current' in the call above)
 * and its 'return' statements return from dup_task_struct.
 */
{
struct task_struct *tsk;
unsigned long *stack;
int err;
/* _do_fork() passes NUMA_NO_NODE here, so fork and vfork take this branch. */
if (node == NUMA_NO_NODE)
node = tsk_fork_get_node(orig);
/* Allocate the task_struct. */
tsk = alloc_task_struct_node(node);
if (!tsk)
return NULL;
/* Allocate the stack. */
stack = alloc_thread_stack_node(tsk, node);
if (!stack)
goto free_tsk;
/* Copy the parent's task_struct into the newly allocated one. */
err = arch_dup_task_struct(tsk, orig);
if (err)
goto free_stack;
/* Point the new task_struct at its own stack. */
tsk->stack = stack;
err = kaiser_map_thread_stack(tsk->stack);
if (err)
goto free_stack;
#ifdef CONFIG_SECCOMP
/*
* We must handle setting up seccomp filters once we're under
* the sighand lock in case orig has changed between now and
* then. Until then, filter must be NULL to avoid messing up
* the usage counts on the error path calling free_task.
*/
tsk->seccomp.filter = NULL;
#endif
/* Copy the parent's thread_info and point its task pointer at tsk. */
setup_thread_stack(tsk, orig);
/*
 * Clear the user-return-notifier flag on the new task (the article's
 * author notes that its purpose was not yet understood).
 */
clear_user_return_notifier(tsk);
/* Clear the need-resched flag on the new task. */
clear_tsk_need_resched(tsk);
/* Write the magic number at the stack end, used to detect stack overflow. */
set_task_stack_end_magic(tsk);
#ifdef CONFIG_CC_STACKPROTECTOR
tsk->stack_canary = get_random_long();
#endif
/*
* One for us, one for whoever does the "release_task()" (usually
* parent)
*/
/* Hence the usage count starts at 2. */
atomic_set(&tsk->usage, 2);
#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
#endif
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
account_kernel_stack(stack, 1);
kcov_task_init(tsk);
return tsk;
free_stack:
free_thread_stack(stack);
free_tsk:
free_task_struct(tsk);
return NULL;
}
/* End of the inlined dup_task_struct() body; copy_process() continues. */
if (!p)
goto fork_out;
cpufreq_task_times_init(p);
/*
* This _must_ happen before we call free_task(), i.e. before we jump
* to any of the bad_fork_* labels. This is to avoid freeing
* p->set_child_tid which is (ab)used as a kthread's data pointer for
* kernel threads (PF_KTHREAD).
*/
/* Remember where to store the child's tid, if CLONE_CHILD_SETTID was given. */
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
* Clear TID on mm_release()?
*/
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
ftrace_graph_init_task(p);
/*
 * Original note: "initialise the spinlock".  The helper is defined
 * elsewhere; by its name it sets up the task's rt-mutex state.
 */
rt_mutex_init_task(p);
#ifdef CONFIG_PROVE_LOCKING
DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
retval = -EAGAIN;
/* Has this user reached its RLIMIT_NPROC process limit? */
if (atomic_read(&p->real_cred->user->processes) >=
task_rlimit(p, RLIMIT_NPROC)) {
/* Only INIT_USER or a suitably capable caller may exceed the limit. */
if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
goto bad_fork_free;
}
current->flags &= ~PF_NPROC_EXCEEDED;
/* Copy the credentials for the new task. */
retval = copy_creds(p, clone_flags);
if (retval < 0)
goto bad_fork_free;
/*
* If multiple threads are within copy_process(), then this check
* triggers too late. This doesn't hurt, the check is only there
* to stop root fork bombs.
*/
retval = -EAGAIN;
/* Has the system-wide thread limit been reached? */
if (nr_threads >= max_threads)
goto bad_fork_cleanup_count;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
rcu_copy_process(p);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
init_sigpending(&p->pending);
p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
prev_cputime_init(&p->prev_cputime);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
seqlock_init(&p->vtime_seqlock);
p->vtime_snap = 0;
p->vtime_snap_whence = VTIME_SLEEPING;
#endif
#if defined(SPLIT_RSS_COUNTING)
memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif
p->default_timer_slack_ns = current->timer_slack_ns;
task_io_accounting_init(&p->ioac);
acct_clear_integrals(p);
posix_cpu_timers_init(p);
p->io_context = NULL;
p->audit_context = NULL;
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
goto bad_fork_cleanup_threadgroup_lock;
}
#endif
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
p->hardirqs_enabled = 0;
p->hardirq_enable_ip = 0;
p->hardirq_enable_event = 0;
p->hardirq_disable_ip = _THIS_IP_;
p->hardirq_disable_event = 0;
p->softirqs_enabled = 1;
p->softirq_enable_ip = _THIS_IP_;
p->softirq_enable_event = 0;
p->softirq_disable_ip = 0;
p->softirq_disable_event = 0;
p->hardirq_context = 0;
p->softirq_context = 0;
#endif
p->pagefault_disabled = 0;
#ifdef CONFIG_LOCKDEP
p->lockdep_depth = 0; /* no locks held yet */
p->curr_chain_key = 0;
p->lockdep_recursion = 0;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_BCACHE
p->sequential_io = 0;
p->sequential_io_avg = 0;
#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
if (retval)
goto bad_fork_cleanup_policy;
retval = perf_event_init_task(p);
if (retval)
goto bad_fork_cleanup_policy;
retval = audit_alloc(p);
if (retval)
goto bad_fork_cleanup_perf;
/* copy all the process information */
shm_init_task(p);
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_audit;
retval = copy_files(clone_flags, p);
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p);
if (retval)
goto bad_fork_cleanup_files;
retval = copy_sighand(clone_flags, p);
if (retval)
goto bad_fork_cleanup_fs;
retval = copy_signal(clone_flags, p);
if (retval)
goto bad_fork_cleanup_sighand;
retval = copy_mm(clone_flags, p);
if (retval)
goto bad_fork_cleanup_signal;
retval = copy_namespaces(clone_flags, p);
if (retval)
goto bad_fork_cleanup_mm;
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
if (retval)
goto bad_fork_cleanup_io;
if (pid != &init_struct_pid) {
pid = alloc_pid(p->nsproxy->pid_ns_for_children);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
goto bad_fork_cleanup_io;
}
}
#ifdef CONFIG_BLOCK
p->plug = NULL;
#endif
#ifdef CONFIG_FUTEX
p->robust_list = NULL;
#ifdef CONFIG_COMPAT
p->compat_robust_list = NULL;
#endif
INIT_LIST_HEAD(&p->pi_state_list);
p->pi_state_cache = NULL;
#endif
/*
* sigaltstack should be cleared when sharing the same VM
*/
if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
p->sas_ss_sp = p->sas_ss_size = 0;
/*
* Syscall tracing and stepping should be turned off in the
* child regardless of CLONE_PTRACE.
*/
user_disable_single_step(p);
clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
clear_all_latency_tracing(p);
/* ok, now we should be set up.. */
p->pid = pid_nr(pid);
if (clone_flags & CLONE_THREAD) {
p->exit_signal = -1;
p->group_leader = current->group_leader;
p->tgid = current->tgid;
} else {
if (clone_flags & CLONE_PARENT)
p->exit_signal = current->group_leader->exit_signal;
else
p->exit_signal = (clone_flags & CSIGNAL);
p->group_leader = p;
p->tgid = p->pid;
}
p->nr_dirtied = 0;
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
p->dirty_paused_when = 0;
p->pdeath_signal = 0;
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
threadgroup_change_begin(current);
/*
* Ensure that the cgroup subsystem policies allow the new process to be
* forked. It should be noted the the new process's css_set can be changed
* between here and cgroup_post_fork() if an organisation operation is in
* progress.
*/
retval = cgroup_can_fork(p, cgrp_ss_priv);
if (retval)
goto bad_fork_free_pid;
/*
* From this point on we must avoid any synchronous user-space
* communication until we take the tasklist-lock. In particular, we do
* not want user-space to be able to predict the process start-time by
* stalling fork(2) after we recorded the start_time but before it is
* visible to the system.
*/
p->start_time = ktime_get_ns();
p->real_start_time = ktime_get_boot_ns();
/*
* Make it visible to the rest of the system, but dont wake it up yet.
* Need tasklist lock for parent etc handling!
*/
write_lock_irq(&tasklist_lock);
/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
p->parent_exec_id = current->parent_exec_id;
} else {
p->real_parent = current;
p->parent_exec_id = current->self_exec_id;
}
spin_lock(&current->sighand->siglock);
/*
* Copy seccomp details explicitly here, in case they were changed
* before holding sighand lock.
*/
copy_seccomp(p);
/*
* Process group and session signals need to be delivered to just the
* parent before the fork or both the parent and the child after the
* fork. Restart if a signal comes in before we add the new process to
* it's process group.
* A fatal signal pending means that current will exit, so the new
* thread can't slip out of an OOM kill (or normal SIGKILL).
*/
recalc_sigpending();
if (signal_pending(current)) {
retval = -ERESTARTNOINTR;
goto bad_fork_cancel_cgroup;
}
if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
retval = -ENOMEM;
goto bad_fork_cancel_cgroup;
}
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
init_task_pid(p, PIDTYPE_PID, pid);
if (thread_group_leader(p)) {
init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
init_task_pid(p, PIDTYPE_SID, task_session(current));
if (is_child_reaper(pid)) {
ns_of_pid(pid)->child_reaper = p;
p->signal->flags |= SIGNAL_UNKILLABLE;
}
p->signal->leader_pid = pid;
p->signal->tty = tty_kref_get(current->signal->tty);
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
attach_pid(p, PIDTYPE_PGID);
attach_pid(p, PIDTYPE_SID);
__this_cpu_inc(process_counts);
} else {
current->signal->nr_threads++;
atomic_inc(&current->signal->live);
atomic_inc(&current->signal->sigcnt);
list_add_tail_rcu(&p->thread_group,
&p->group_leader->thread_group);
list_add_tail_rcu(&p->thread_node,
&p->signal->thread_head);
}
attach_pid(p, PIDTYPE_PID);
nr_threads++;
}
total_forks++;
spin_unlock(&current->sighand->siglock);
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p, cgrp_ss_priv);
threadgroup_change_end(current);
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
uprobe_copy_process(p, clone_flags);
return p;
/* Error unwinding: each label undoes one setup step, in reverse order. */
bad_fork_cancel_cgroup:
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
cgroup_cancel_fork(p, cgrp_ss_priv);
bad_fork_free_pid:
threadgroup_change_end(current);
if (pid != &init_struct_pid)
free_pid(pid);
bad_fork_cleanup_io:
if (p->io_context)
exit_io_context(p);
bad_fork_cleanup_namespaces:
exit_task_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm)
mmput(p->mm);
bad_fork_cleanup_signal:
if (!(clone_flags & CLONE_THREAD))
free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
exit_fs(p); /* blocking */
bad_fork_cleanup_files:
exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
exit_sem(p);
bad_fork_cleanup_audit:
audit_free(p);
bad_fork_cleanup_perf:
perf_event_free_task(p);
bad_fork_cleanup_policy:
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
#endif
delayacct_tsk_free(p);
bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes);
exit_creds(p);
bad_fork_free:
free_task(p);
fork_out:
return ERR_PTR(retval);
}
進來先判斷了幾種不合理的情況然後建立子程序 :
1. 子程序獨立的命名空間和共享的檔案系統(以及根目錄、目前目錄)是沖突的
2. 子程序獨立的使用者空間和共享的檔案系統(以及根目錄、目前目錄)是沖突的
3. 子程序設定了和父程序歸屬相同的線程組,就必須共享信号處理表
4. 子程序共享父程序的信号處理表,那麼就必須共享 虛拟記憶體 區域和頁表
5. 目前程序忽略緻命信号(SIGNAL_UNKILLABLE),就不能傳遞 CLONE_PARENT(變父子為兄弟關系)
6. 不能既設定新的使用者命名空間(或 PID 命名空間),又設定屬于同一線程組
7. 接下來進行 LSM(SELinux) 判斷,看目前任務是否能建立新任務
8. dup_task_struct 複制父程序的 task_struct (申請子程序task_struct 、 棧 ,并複制父程序的 task_struct )
9. 判斷是否超過使用者程序限制,以及是否超過系統程序數限制
10. 接下來初始化子程序自己的一些資源,并根據傳遞的 CLONE_* 來 copy_* ,目前沒有對 task_struct 結構體做研究,各成員的作用也不太清楚,日後研究後再補充
copy的好多個資源,特别摘出來 copy_mm
/*
 * Set up the memory descriptor of the new task tsk.
 *
 * Resets tsk's fault and context-switch counters and clears its mm
 * pointers.  If the current task has no mm, nothing more is done.
 * With CLONE_VM the current mm is shared (mm_users is incremented);
 * otherwise a separate mm is obtained from dup_mm().
 *
 * Returns 0 on success or -ENOMEM when dup_mm() fails.
 */
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
struct mm_struct *mm, *oldmm;
int retval;
tsk->min_flt = tsk->maj_flt = 0;
tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
#endif
tsk->mm = NULL;
tsk->active_mm = NULL;
/*
* Are we cloning a kernel thread?
*
* We need to steal a active VM for that..
*/
oldmm = current->mm;
/* No mm on the current task: leave tsk->mm and tsk->active_mm NULL. */
if (!oldmm)
return 0;
/* initialize the new vmacache entries */
vmacache_flush(tsk);
/* CLONE_VM: share the parent's mm and just take another user reference. */
if (clone_flags & CLONE_VM) {
atomic_inc(&oldmm->mm_users);
mm = oldmm;
goto good_mm;
}
/* No CLONE_VM: give the child its own mm via dup_mm(). */
retval = -ENOMEM;
mm = dup_mm(tsk);
if (!mm)
goto fail_nomem;
good_mm:
tsk->mm = mm;
tsk->active_mm = mm;
return 0;
fail_nomem:
return retval;
}
當設定了 CLONE_VM ,也就是共享記憶體空間,直接将父程序的 mm 指派過來,vfork 及 kernel_thread 都設定了該标志。kernel_thread 建立核心線程,當然沒有獨立的記憶體空間,而vfork是為了 exec ,不需要獨立的記憶體空間,和父程序共享。而 fork 沒有該标志,通過 dup_mm 來建立自己的記憶體空間,其實還是複制父程序的,但是多了重要步驟,将頁表權限修改為 不可寫 ,這樣一旦要寫入,就會觸發權限異常,
通過 do_page_fault 中的 do_wp_page 完成 copy on write 功能!