Process Management

Process: a program in the midst of execution.

Thread: the object of activity within a process.

Virtualization mechanisms

Virtual processor: gives each process the illusion of monopolizing the CPU, even though many processes share one processor.

Virtual memory: gives each process the illusion of owning all of memory; the threads within one process share that virtual memory.

1. The Process Descriptor and the Task Structure

Processes are kept on a circular doubly linked list (the task list); each element is a struct task_struct, called the process descriptor, defined in <linux/sched.h>.

struct task_struct {
    volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
    void *stack;
    atomic_t usage;
    unsigned int flags;     /* per process flags, defined below */
    unsigned int ptrace;

#ifdef CONFIG_SMP
    struct task_struct *wake_entry;
    int on_cpu;
#endif
    int on_rq;

    int prio, static_prio, normal_prio;
    unsigned int rt_priority;
    const struct sched_class *sched_class;
    struct sched_entity se;
    struct sched_rt_entity rt;

#ifdef CONFIG_PREEMPT_NOTIFIERS
    /* list of struct preempt_notifier: */
    struct hlist_head preempt_notifiers;
#endif

    /*
     * fpu_counter contains the number of consecutive context switches
     * that the FPU is used. If this is over a threshold, the lazy fpu
     * saving becomes unlazy to save the trap. This is an unsigned char
     * so that after 256 times the counter wraps and the behavior turns
     * lazy again; this to deal with bursty apps that only use FPU for
     * a short time
     */
    unsigned char fpu_counter;
#ifdef CONFIG_BLK_DEV_IO_TRACE
    unsigned int btrace_seq;
#endif

    unsigned int policy;
    cpumask_t cpus_allowed;

#ifdef CONFIG_PREEMPT_RCU
    int rcu_read_lock_nesting;
    char rcu_read_unlock_special;
#if defined(CONFIG_RCU_BOOST) && defined(CONFIG_TREE_PREEMPT_RCU)
    int rcu_boosted;
#endif /* #if defined(CONFIG_RCU_BOOST) && defined(CONFIG_TREE_PREEMPT_RCU) */
    struct list_head rcu_node_entry;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TREE_PREEMPT_RCU
    struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
    struct rt_mutex *rcu_boost_mutex;
#endif /* #ifdef CONFIG_RCU_BOOST */

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
    struct sched_info sched_info;
#endif

    struct list_head tasks;
#ifdef CONFIG_SMP
    struct plist_node pushable_tasks;
#endif

    struct mm_struct *mm, *active_mm;
#ifdef CONFIG_COMPAT_BRK
    unsigned brk_randomized:1;
#endif
#if defined(SPLIT_RSS_COUNTING)
    struct task_rss_stat rss_stat;
#endif
/* task state */
    int exit_state;
    int exit_code, exit_signal;
    int pdeath_signal;          /* The signal sent when the parent dies */
    unsigned int group_stop;    /* GROUP_STOP_*, siglock protected */
    /* ??? */
    unsigned int personality;
    unsigned did_exec:1;
    unsigned in_execve:1;       /* Tell the LSMs that the process is doing an
                                 * execve */
    unsigned in_iowait:1;

    /* Revert to default priority/policy when forking */
    unsigned sched_reset_on_fork:1;
    unsigned sched_contributes_to_load:1;

    pid_t pid;
    pid_t tgid;

#ifdef CONFIG_CC_STACKPROTECTOR
    /* Canary value for the -fstack-protector gcc feature */
    unsigned long stack_canary;
#endif

    /*
     * pointers to (original) parent process, youngest child, younger sibling,
     * older sibling, respectively. (p->father can be replaced with
     * p->real_parent->pid)
     */
    struct task_struct *real_parent; /* real parent process */
    struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */
    /*
     * children/sibling forms the list of my natural children
     */
    struct list_head children;  /* list of my children */
    struct list_head sibling;   /* linkage in my parent's children list */
    struct task_struct *group_leader; /* threadgroup leader */

    /*
     * ptraced is the list of tasks this task is using ptrace on.
     * This includes both natural children and PTRACE_ATTACH targets.
     * p->ptrace_entry is p's link on the p->parent->ptraced list.
     */
    struct list_head ptraced;
    struct list_head ptrace_entry;

    /* PID/PID hash table linkage. */
    struct pid_link pids[PIDTYPE_MAX];
    struct list_head thread_group;

    struct completion *vfork_done;  /* for vfork() */
    int __user *set_child_tid;      /* CLONE_CHILD_SETTID */
    int __user *clear_child_tid;    /* CLONE_CHILD_CLEARTID */

    cputime_t utime, stime, utimescaled, stimescaled;
    cputime_t gtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
    cputime_t prev_utime, prev_stime;
#endif
    unsigned long nvcsw, nivcsw;     /* context switch counts */
    struct timespec start_time;      /* monotonic time */
    struct timespec real_start_time; /* boot based time */
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
    unsigned long min_flt, maj_flt;

    struct task_cputime cputime_expires;
    struct list_head cpu_timers[3];

/* process credentials */
    const struct cred __rcu *real_cred; /* objective and real subjective task
                                         * credentials (COW) */
    const struct cred __rcu *cred;      /* effective (overridable) subjective task
                                         * credentials (COW) */
    struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */

    char comm[TASK_COMM_LEN]; /* executable name excluding path
                                 - access with [gs]et_task_comm (which lock
                                   it with task_lock())
                                 - initialized normally by setup_new_exec */
/* file system info */
    int link_count, total_link_count;
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
    struct sysv_sem sysvsem;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
/* hung task detection */
    unsigned long last_switch_count;
#endif
/* CPU-specific state of this task */
    struct thread_struct thread;
/* filesystem information */
    struct fs_struct *fs;
/* open file information */
    struct files_struct *files;
/* namespaces */
    struct nsproxy *nsproxy;
/* signal handlers */
    struct signal_struct *signal;
    struct sighand_struct *sighand;

    sigset_t blocked, real_blocked;
    sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
    struct sigpending pending;

    unsigned long sas_ss_sp;
    size_t sas_ss_size;
    int (*notifier)(void *priv);
    void *notifier_data;
    sigset_t *notifier_mask;
    struct audit_context *audit_context;
#ifdef CONFIG_AUDITSYSCALL
    uid_t loginuid;
    unsigned int sessionid;
#endif
    seccomp_t seccomp;

/* Thread group tracking */
    u32 parent_exec_id;
    u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
 * mempolicy */
    spinlock_t alloc_lock;

#ifdef CONFIG_GENERIC_HARDIRQS
    /* IRQ handler threads */
    struct irqaction *irqaction;
#endif

    /* Protection of the PI data structures: */
    raw_spinlock_t pi_lock;

#ifdef CONFIG_RT_MUTEXES
    /* PI waiters blocked on a rt_mutex held by this task */
    struct plist_head pi_waiters;
    /* Deadlock detection and priority inheritance handling */
    struct rt_mutex_waiter *pi_blocked_on;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
    /* mutex deadlock detection */
    struct mutex_waiter *blocked_on;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
    unsigned int irq_events;
    unsigned long hardirq_enable_ip;
    unsigned long hardirq_disable_ip;
    unsigned int hardirq_enable_event;
    unsigned int hardirq_disable_event;
    int hardirqs_enabled;
    int hardirq_context;
    unsigned long softirq_disable_ip;
    unsigned long softirq_enable_ip;
    unsigned int softirq_disable_event;
    unsigned int softirq_enable_event;
    int softirqs_enabled;
    int softirq_context;
#endif
#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 48UL
    u64 curr_chain_key;
    int lockdep_depth;
    unsigned int lockdep_recursion;
    struct held_lock held_locks[MAX_LOCK_DEPTH];
    gfp_t lockdep_reclaim_gfp;
#endif

/* journalling filesystem info */
    void *journal_info;

/* stacked block device info */
    struct bio_list *bio_list;

#ifdef CONFIG_BLOCK
/* stack plugging */
    struct blk_plug *plug;
#endif

/* VM state */
    struct reclaim_state *reclaim_state;

    struct backing_dev_info *backing_dev_info;

    struct io_context *io_context;

    unsigned long ptrace_message;
    siginfo_t *last_siginfo; /* For ptrace use. */
    struct task_io_accounting ioac;
#if defined(CONFIG_TASK_XACCT)
    u64 acct_rss_mem1;      /* accumulated rss usage */
    u64 acct_vm_mem1;       /* accumulated virtual memory usage */
    cputime_t acct_timexpd; /* stime + utime since last update */
#endif
#ifdef CONFIG_CPUSETS
    nodemask_t mems_allowed; /* Protected by alloc_lock */
    int mems_allowed_change_disable;
    int cpuset_mem_spread_rotor;
    int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
    /* Control Group info protected by css_set_lock */
    struct css_set __rcu *cgroups;
    /* cg_list protected by css_set_lock and tsk->alloc_lock */
    struct list_head cg_list;
#endif
#ifdef CONFIG_FUTEX
    struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
    struct compat_robust_list_head __user *compat_robust_list;
#endif
    struct list_head pi_state_list;
    struct futex_pi_state *pi_state_cache;
#endif
#ifdef CONFIG_PERF_EVENTS
    struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
    struct mutex perf_event_mutex;
    struct list_head perf_event_list;
#endif
#ifdef CONFIG_NUMA
    struct mempolicy *mempolicy; /* Protected by alloc_lock */
    short il_next;
    short pref_node_fork;
#endif
    atomic_t fs_excl;   /* holding fs exclusive resources */
    struct rcu_head rcu;

    /*
     * cache last used pipe for splice
     */
    struct pipe_inode_info *splice_pipe;
#ifdef CONFIG_TASK_DELAY_ACCT
    struct task_delay_info *delays;
#endif
#ifdef CONFIG_FAULT_INJECTION
    int make_it_fail;
#endif
    struct prop_local_single dirties;
#ifdef CONFIG_LATENCYTOP
    int latency_record_count;
    struct latency_record latency_record[LT_SAVECOUNT];
#endif
    /*
     * time slack values; these are used to round up poll() and
     * select() etc timeout values. These are in nanoseconds.
     */
    unsigned long timer_slack_ns;
    unsigned long default_timer_slack_ns;

    struct list_head *scm_work_list;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
    /* Index of current stored address in ret_stack */
    int curr_ret_stack;
    /* Stack of return addresses for return function tracing */
    struct ftrace_ret_stack *ret_stack;
    /* time stamp for last schedule */
    unsigned long long ftrace_timestamp;
    /*
     * Number of functions that haven't been traced
     * because of depth overrun.
     */
    atomic_t trace_overrun;
    /* Pause for the tracing */
    atomic_t tracing_graph_pause;
#endif
#ifdef CONFIG_TRACING
    /* state flags for use by tracers */
    unsigned long trace;
    /* bitmask and counter of trace recursion */
    unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
#ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */
    struct memcg_batch_info {
        int do_batch;             /* incremented when batch uncharge started */
        struct mem_cgroup *memcg; /* target memcg of uncharge */
        unsigned long nr_pages;   /* uncharged usage */
        unsigned long memsw_nr_pages; /* uncharged mem+swap usage */
    } memcg_batch;
#endif
#ifdef CONFIG_HAVE_HW_BREAKPOINT
    atomic_t ptrace_bp_refcnt;
#endif
};

task_struct

The task_struct is allocated by the slab allocator. Since 2.6 the slab allocator creates the task_struct dynamically, so all that must live on the stack itself is a new struct thread_info, placed at the bottom of the kernel stack (for stacks that grow down) or at the top (for stacks that grow up).

It is declared in the header <asm/thread_info.h>; after some digging, the ARM version turns out to be arch/arm/include/asm/thread_info.h:

/*
 * low level task data that entry.S needs immediate access to.
 * __switch_to() assumes cpu_context follows immediately after cpu_domain.
 */
struct thread_info {
    unsigned long flags;                 /* low level flags */
    int preempt_count;                   /* 0 => preemptable, <0 => bug */
    mm_segment_t addr_limit;             /* address limit */
    struct task_struct *task;            /* main task structure */
    struct exec_domain *exec_domain;     /* execution domain */
    __u32 cpu;                           /* cpu */
    __u32 cpu_domain;                    /* cpu domain */
    struct cpu_context_save cpu_context; /* cpu context */
    __u32 syscall;                       /* syscall number */
    __u8 used_cp[16];                    /* thread used copro */
    unsigned long tp_value;
    struct crunch_state crunchstate;
    union fp_state fpstate __attribute__((aligned(8)));
    union vfp_state vfpstate;
#ifdef CONFIG_ARM_THUMBEE
    unsigned long thumbee_state;         /* ThumbEE Handler Base register */
#endif
    struct restart_block restart_block;
};

thread_info

The thread_info is allocated at the far end of the kernel stack; its task member holds a pointer to that task's actual task_struct.
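
Because thread_info sits at a fixed, THREAD_SIZE-aligned spot on the kernel stack, the kernel can locate it cheaply by masking the stack pointer. The following minimal sketch mirrors the ARM implementation of this era (other architectures of the time used the same masking trick); treat it as illustrative rather than authoritative:

/* Sketch: find thread_info by masking the low bits off the stack pointer.
 * Assumes a THREAD_SIZE-byte, THREAD_SIZE-aligned kernel stack. */
static inline struct thread_info *current_thread_info(void)
{
    register unsigned long sp asm ("sp");
    return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
}

/* current then simply dereferences the task field: */
#define get_current() (current_thread_info()->task)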

2. The Process Identifier (PID)

The PID is a number, in practice of type int. The kernel stores each process's PID in its process descriptor.

The default maximum PID (32768) is defined in <linux/threads.h>; the ceiling can be raised by writing to /proc/sys/kernel/pid_max.
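
As a quick check from user space, the current ceiling can be read back out of procfs; a minimal sketch, with error handling pared down:

#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/proc/sys/kernel/pid_max", "r");
    long pid_max;

    if (f && fscanf(f, "%ld", &pid_max) == 1)
        printf("pid_max = %ld\n", pid_max);   /* 32768 unless raised */
    if (f)
        fclose(f);
    return 0;
}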

2.1 Process State

TASK_RUNNING: the process is runnable; it is either running or on a run queue waiting to run.

TASK_INTERRUPTIBLE: the process is sleeping (blocked), waiting for some condition; it wakes when the condition arrives or when it receives a signal.

TASK_UNINTERRUPTIBLE: like the above, except that the process does not wake up even if it receives a signal.

__TASK_TRACED: the process is being traced by another process, such as a debugger via ptrace.

__TASK_STOPPED: process execution has stopped; the task is neither running nor eligible to run.

2.2 Setting the Process State

A task's state is adjusted with:

set_task_state(task, state);        /* set task 'task' to state 'state' */
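
A typical use is the canonical sleep loop: the task marks itself sleeping before each check of the condition, so no wake-up can slip in between the check and schedule(). A minimal sketch, assuming a wait queue q that some other code path wakes with wake_up(&q) once condition becomes true:

/* sleep until 'condition' is true, waking early on signals */
DEFINE_WAIT(wait);

add_wait_queue(&q, &wait);
while (!condition) {
    prepare_to_wait(&q, &wait, TASK_INTERRUPTIBLE);
    if (signal_pending(current))
        break;              /* woken by a signal; let the caller handle it */
    schedule();             /* sleep until woken */
}
finish_wait(&q, &wait);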

Process context: when a program executes a system call or triggers an exception, it traps into kernel space. The kernel is then executing on behalf of the process and is said to be in process context.

The process family tree: all processes that share the same parent are called siblings.

Every task_struct contains a pointer to the parent's task_struct, named parent, and a list of child processes, named children.

/* obtain the process descriptor of the parent */
struct task_struct *my_parent = current->parent;

/* iterate over the children in turn */
struct task_struct *task;
struct list_head *list;

list_for_each(list, &current->children) {
    task = list_entry(list, struct task_struct, sibling);
    /* task now points to one of the current children */
}

/* follow the parent pointers all the way up to init */
struct task_struct *task;
for (task = current; task != &init_task; task = task->parent)
    ;

/* the next and the previous task in the task list */
list_entry(task->tasks.next, struct task_struct, tasks)
list_entry(task->tasks.prev, struct task_struct, tasks)

/* iterate over every process in the system */
struct task_struct *task;
for_each_process(task) {
    /* print the name and PID of each task */
    printk("%s[%d]\n", task->comm, task->pid);
}
/* note: this is expensive on a system with many processes */

Traversing the task list

3. Process Creation

First, fork() creates a child process by duplicating the current process. The child differs from the parent only in its PID and PPID, plus certain resources and statistics (such as pending signals) that are not inherited.

exec() then reads an executable file and loads it into the address space, where it begins to run.

Copy-on-write: a technique that defers, or avoids altogether, copying the data. Instead of duplicating the whole process address space, the parent and child share a single copy.

The data is copied only when it is written to; until then, it is shared read-only.
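
A small user-space demonstration of the semantics (a hypothetical demo program): the fork() itself copies nothing up front, yet once the child writes, each process sees its own private copy:

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    int x = 1;                /* shared read-only after fork() */
    pid_t pid = fork();

    if (pid == 0) {           /* child: this write triggers the page copy */
        x = 2;
        printf("child:  x = %d\n", x);   /* prints 2 */
        exit(0);
    }
    wait(NULL);
    printf("parent: x = %d\n", x);       /* still prints 1 */
    return 0;
}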

3.1 fork()

Linux implements fork() via the clone() system call. do_fork() performs most of the work of creation; it is defined in kernel/fork.c.

That function calls copy_process() and then sets the new process running.

/*
 * Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 */
long do_fork(unsigned long clone_flags,
             unsigned long stack_start,
             struct pt_regs *regs,
             unsigned long stack_size,
             int __user *parent_tidptr,
             int __user *child_tidptr)
{
    struct task_struct *p;
    int trace = 0;
    long nr;

    /*
     * Do some preliminary argument and permissions checking before we
     * actually start allocating stuff
     */
    if (clone_flags & CLONE_NEWUSER) {
        if (clone_flags & CLONE_THREAD)
            return -EINVAL;
        /* hopefully this check will go away when userns support is
         * complete
         */
        if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
                !capable(CAP_SETGID))
            return -EPERM;
    }

    /*
     * When called from kernel_thread, don't do user tracing stuff.
     */
    if (likely(user_mode(regs)))
        trace = tracehook_prepare_clone(clone_flags);

    /* 1) Call copy_process(), then let the new process run */
    p = copy_process(clone_flags, stack_start, regs, stack_size,
                     child_tidptr, NULL, trace);
    /*
     * Do this prior waking up the new thread - the thread pointer
     * might get invalid after that point, if the thread exits quickly.
     */
    if (!IS_ERR(p)) {
        struct completion vfork;

        trace_sched_process_fork(current, p);

        nr = task_pid_vnr(p);

        if (clone_flags & CLONE_PARENT_SETTID)
            put_user(nr, parent_tidptr);

        if (clone_flags & CLONE_VFORK) {
            p->vfork_done = &vfork;
            init_completion(&vfork);
        }

        audit_finish_fork(p);
        tracehook_report_clone(regs, clone_flags, nr, p);

        /*
         * We set PF_STARTING at creation in case tracing wants to
         * use this to distinguish a fully live task from one that
         * hasn't gotten to tracehook_report_clone() yet. Now we
         * clear it and set the child going.
         */
        p->flags &= ~PF_STARTING;

        wake_up_new_task(p);

        tracehook_report_clone_complete(trace, regs,
                                        clone_flags, nr, p);

        if (clone_flags & CLONE_VFORK) {
            freezer_do_not_count();
            wait_for_completion(&vfork);
            freezer_count();
            tracehook_report_vfork_done(p, nr);
        }
    } else {
        nr = PTR_ERR(p);
    }
    return nr;
}

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
                                        unsigned long stack_start,
                                        struct pt_regs *regs,
                                        unsigned long stack_size,
                                        int __user *child_tidptr,
                                        struct pid *pid,
                                        int trace)
{
    int retval;
    struct task_struct *p;
    int cgroup_callbacks_done = 0;

    if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
        return ERR_PTR(-EINVAL);

    /*
     * Thread groups must share signals as well, and detached threads
     * can only be started up within the thread group.
     */
    if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
        return ERR_PTR(-EINVAL);

    /*
     * Shared signal handlers imply shared VM. By way of the above,
     * thread groups also imply shared VM. Blocking this case allows
     * for various simplifications in other code.
     */
    if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
        return ERR_PTR(-EINVAL);

    /*
     * Siblings of global init remain as zombies on exit since they are
     * not reaped by their parent (swapper). To solve this and to avoid
     * multi-rooted process trees, prevent global and container-inits
     * from creating siblings.
     */
    if ((clone_flags & CLONE_PARENT) &&
            current->signal->flags & SIGNAL_UNKILLABLE)
        return ERR_PTR(-EINVAL);

    retval = security_task_create(clone_flags);
    if (retval)
        goto fork_out;

    /* 1) Call dup_task_struct() to create a kernel stack, a thread_info and
     * a task_struct for the new process, with values identical to those of
     * the current task. At this point the child and parent descriptors
     * are exactly the same.
     */
    retval = -ENOMEM;
    p = dup_task_struct(current);
    if (!p)
        goto fork_out;

    ftrace_graph_init_task(p);

    rt_mutex_init_task(p);

#ifdef CONFIG_PROVE_LOCKING
    DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
    DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
    /* The first if checks whether the user's process count exceeds the
     * limit; the second whether the user has sufficient privileges.
     */
    retval = -EAGAIN;
    if (atomic_read(&p->real_cred->user->processes) >=
            task_rlimit(p, RLIMIT_NPROC)) {
        if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
                p->real_cred->user != INIT_USER)
            goto bad_fork_free;
    }

    retval = copy_creds(p, clone_flags);
    if (retval < 0)
        goto bad_fork_free;

    /*
     * If multiple threads are within copy_process(), then this check
     * triggers too late. This doesn't hurt, the check is only there
     * to stop root fork bombs.
     * 2) Check that, after this child is created, the current user does
     * not exceed the number of processes allotted to it.
     */
    retval = -EAGAIN;
    if (nr_threads >= max_threads)
        goto bad_fork_cleanup_count;

    /* make sure the exec_domain module is loaded */
    if (!try_module_get(task_thread_info(p)->exec_domain->module))
        goto bad_fork_cleanup_count;

    p->did_exec = 0;
    delayacct_tsk_init(p);  /* Must remain after dup_task_struct() */
    /* 5) Call copy_flags() to update the flags member of task_struct:
     * the PF_SUPERPRIV flag (superuser privileges) is cleared, and the
     * PF_FORKNOEXEC flag (process has not yet called exec()) is set.
     */
    copy_flags(clone_flags, p);
    INIT_LIST_HEAD(&p->children);
    INIT_LIST_HEAD(&p->sibling);
    rcu_copy_process(p);
    p->vfork_done = NULL;
    spin_lock_init(&p->alloc_lock);

    init_sigpending(&p->pending);

    p->utime = cputime_zero;
    p->stime = cputime_zero;
    p->gtime = cputime_zero;
    p->utimescaled = cputime_zero;
    p->stimescaled = cputime_zero;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
    p->prev_utime = cputime_zero;
    p->prev_stime = cputime_zero;
#endif
#if defined(SPLIT_RSS_COUNTING)
    memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif

    p->default_timer_slack_ns = current->timer_slack_ns;

    task_io_accounting_init(&p->ioac);
    acct_clear_integrals(p);

    posix_cpu_timers_init(p);

    do_posix_clock_monotonic_gettime(&p->start_time);
    p->real_start_time = p->start_time;
    monotonic_to_bootbased(&p->real_start_time);
    p->io_context = NULL;
    p->audit_context = NULL;
    if (clone_flags & CLONE_THREAD)
        threadgroup_fork_read_lock(current);
    cgroup_fork(p);
#ifdef CONFIG_NUMA
    p->mempolicy = mpol_dup(p->mempolicy);
    if (IS_ERR(p->mempolicy)) {
        retval = PTR_ERR(p->mempolicy);
        p->mempolicy = NULL;
        goto bad_fork_cleanup_cgroup;
    }
    mpol_fix_fork_child_flag(p);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
    p->irq_events = 0;
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
    p->hardirqs_enabled = 1;
#else
    p->hardirqs_enabled = 0;
#endif
    p->hardirq_enable_ip = 0;
    p->hardirq_enable_event = 0;
    p->hardirq_disable_ip = _THIS_IP_;
    p->hardirq_disable_event = 0;
    p->softirqs_enabled = 1;
    p->softirq_enable_ip = _THIS_IP_;
    p->softirq_enable_event = 0;
    p->softirq_disable_ip = 0;
    p->softirq_disable_event = 0;
    p->hardirq_context = 0;
    p->softirq_context = 0;
#endif
#ifdef CONFIG_LOCKDEP
    p->lockdep_depth = 0; /* no locks held yet */
    p->curr_chain_key = 0;
    p->lockdep_recursion = 0;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
    p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
    p->memcg_batch.do_batch = 0;
    p->memcg_batch.memcg = NULL;
#endif

    /* Perform scheduler related setup. Assign this task to a CPU. */
    sched_fork(p);

    retval = perf_event_init_task(p);
    if (retval)
        goto bad_fork_cleanup_policy;

    if ((retval = audit_alloc(p)))
        goto bad_fork_cleanup_policy;
    /* In the calls below, depending on which flags were passed in, each
     * resource is either duplicated or shared with the parent.
     */
    /* copy all the process information */
    if ((retval = copy_semundo(clone_flags, p)))
        goto bad_fork_cleanup_audit;
    if ((retval = copy_files(clone_flags, p)))
        goto bad_fork_cleanup_semundo;
    if ((retval = copy_fs(clone_flags, p)))
        goto bad_fork_cleanup_files;
    if ((retval = copy_sighand(clone_flags, p)))
        goto bad_fork_cleanup_fs;
    if ((retval = copy_signal(clone_flags, p)))
        goto bad_fork_cleanup_sighand;
    if ((retval = copy_mm(clone_flags, p)))
        goto bad_fork_cleanup_signal;
    if ((retval = copy_namespaces(clone_flags, p)))
        goto bad_fork_cleanup_mm;
    if ((retval = copy_io(clone_flags, p)))
        goto bad_fork_cleanup_namespaces;

    /* set up the values in the child's kernel stack and registers */
    retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
    if (retval)
        goto bad_fork_cleanup_io;

    /* 6) Allocate a valid PID for the new process */
    if (pid != &init_struct_pid) {
        retval = -ENOMEM;
        pid = alloc_pid(p->nsproxy->pid_ns);
        if (!pid)
            goto bad_fork_cleanup_io;
    }

    p->pid = pid_nr(pid);
    p->tgid = p->pid;
    if (clone_flags & CLONE_THREAD)
        p->tgid = current->tgid;

    p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
    /*
     * Clear TID on mm_release()?
     */
    p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
#ifdef CONFIG_BLOCK
    p->plug = NULL;
#endif
#ifdef CONFIG_FUTEX
    p->robust_list = NULL;
#ifdef CONFIG_COMPAT
    p->compat_robust_list = NULL;
#endif
    INIT_LIST_HEAD(&p->pi_state_list);
    p->pi_state_cache = NULL;
#endif
    /*
     * sigaltstack should be cleared when sharing the same VM
     */
    if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
        p->sas_ss_sp = p->sas_ss_size = 0;

    /*
     * Syscall tracing and stepping should be turned off in the
     * child regardless of CLONE_PTRACE.
     */
    user_disable_single_step(p);
    clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
    clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
    clear_all_latency_tracing(p);

    /* ok, now we should be set up.. */
    p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
    p->pdeath_signal = 0;
    p->exit_state = 0;

    /*
     * Ok, make it visible to the rest of the system.
     * We dont wake it up yet.
     */
    p->group_leader = p;
    INIT_LIST_HEAD(&p->thread_group);

    /* Now that the task is set up, run cgroup callbacks if
     * necessary. We need to run them before the task is visible
     * on the tasklist. */
    cgroup_fork_callbacks(p);
    cgroup_callbacks_done = 1;

    /* Need tasklist lock for parent etc handling! */
    write_lock_irq(&tasklist_lock);

    /* CLONE_PARENT re-uses the old parent */
    if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
        p->real_parent = current->real_parent;
        p->parent_exec_id = current->parent_exec_id;
    } else {
        p->real_parent = current;
        p->parent_exec_id = current->self_exec_id;
    }

    spin_lock(&current->sighand->siglock);

    /*
     * Process group and session signals need to be delivered to just the
     * parent before the fork or both the parent and the child after the
     * fork. Restart if a signal comes in before we add the new process to
     * it's process group.
     * A fatal signal pending means that current will exit, so the new
     * thread can't slip out of an OOM kill (or normal SIGKILL).
     */
    recalc_sigpending();
    if (signal_pending(current)) {
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
        retval = -ERESTARTNOINTR;
        goto bad_fork_free_pid;
    }

    if (clone_flags & CLONE_THREAD) {
        current->signal->nr_threads++;
        atomic_inc(&current->signal->live);
        atomic_inc(&current->signal->sigcnt);
        p->group_leader = current->group_leader;
        list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
    }

    /* 8) Clean up and return a pointer to the new child */
    if (likely(p->pid)) {
        tracehook_finish_clone(p, clone_flags, trace);

        if (thread_group_leader(p)) {
            if (is_child_reaper(pid))
                p->nsproxy->pid_ns->child_reaper = p;

            p->signal->leader_pid = pid;
            p->signal->tty = tty_kref_get(current->signal->tty);
            attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
            attach_pid(p, PIDTYPE_SID, task_session(current));
            list_add_tail(&p->sibling, &p->real_parent->children);
            list_add_tail_rcu(&p->tasks, &init_task.tasks);
            __this_cpu_inc(process_counts);
        }
        attach_pid(p, PIDTYPE_PID, pid);
        nr_threads++;
    }

    total_forks++;
    spin_unlock(&current->sighand->siglock);
    write_unlock_irq(&tasklist_lock);
    proc_fork_connector(p);
    cgroup_post_fork(p);
    if (clone_flags & CLONE_THREAD)
        threadgroup_fork_read_unlock(current);
    perf_event_fork(p);
    return p;

bad_fork_free_pid:
    if (pid != &init_struct_pid)
        free_pid(pid);
bad_fork_cleanup_io:
    if (p->io_context)
        exit_io_context(p);
bad_fork_cleanup_namespaces:
    exit_task_namespaces(p);
bad_fork_cleanup_mm:
    if (p->mm) {
        task_lock(p);
        if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
            atomic_dec(&p->mm->oom_disable_count);
        task_unlock(p);
        mmput(p->mm);
    }
bad_fork_cleanup_signal:
    if (!(clone_flags & CLONE_THREAD))
        free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
    __cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
    exit_fs(p); /* blocking */
bad_fork_cleanup_files:
    exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
    exit_sem(p);
bad_fork_cleanup_audit:
    audit_free(p);
bad_fork_cleanup_policy:
    perf_event_free_task(p);
#ifdef CONFIG_NUMA
    mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
    if (clone_flags & CLONE_THREAD)
        threadgroup_fork_read_unlock(current);
    cgroup_exit(p, cgroup_callbacks_done);
    delayacct_tsk_free(p);
    module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
    atomic_dec(&p->cred->user->processes);
    exit_creds(p);
bad_fork_free:
    free_task(p);
fork_out:
    return ERR_PTR(retval);
}

do_fork() and copy_process()

3.2 Creating Threads

Thread creation is similar to process creation, except that clone() is passed flags indicating which resources the parent and child are to share:

clone(CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND, 0);

These flags are defined in <linux/sched.h>:

CLONE_FILES           Parent and child share open files
CLONE_FS              Parent and child share filesystem information
CLONE_IDLETASK        Set PID to zero (used only by the idle tasks)
CLONE_NEWNS           Create a new namespace for the child
CLONE_PARENT          Child is to have the same parent as its parent
CLONE_PTRACE          Continue tracing the child
CLONE_SETTID          Write the TID back to user space
CLONE_SETTLS          Create a new TLS (thread-local storage) for the child
CLONE_SIGHAND         Parent and child share signal handlers and blocked signals
CLONE_SYSVSEM         Parent and child share System V SEM_UNDO semantics
CLONE_THREAD          Parent and child are placed in the same thread group
CLONE_VFORK           vfork() was used, so the parent sleeps until the child wakes it
CLONE_UNTRACED        Do not let the tracing process force CLONE_PTRACE on the child
CLONE_STOP            Start the process in the TASK_STOPPED state
CLONE_CHILD_CLEARTID  Clear the TID in the child
CLONE_CHILD_SETTID    Set the TID in the child
CLONE_PARENT_SETTID   Set the TID in the parent
CLONE_VM              Parent and child share address space

clone() parameter flags
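
For illustration, the same flags can be exercised from user space through the clone(2) wrapper. This sketch creates a thread-like child sharing the address space, filesystem info, open files and signal handlers (stack size and error handling kept minimal):

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

#define STACK_SIZE (64 * 1024)

static int worker(void *arg)
{
    printf("child running, pid %d\n", getpid());
    return 0;
}

int main(void)
{
    char *stack = malloc(STACK_SIZE);
    /* the child's stack grows down, so pass the high end */
    pid_t pid = clone(worker, stack + STACK_SIZE,
                      CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | SIGCHLD,
                      NULL);

    waitpid(pid, NULL, 0);
    free(stack);
    return 0;
}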

3.3 Kernel Threads

Kernel threads differ from normal processes in that they have no address space of their own and run only in kernel space; like any other process, however, they are schedulable and preemptable.

Running ps -ef on a Linux system shows the kernel threads. The interface is declared in <linux/kthread.h>. A new kernel thread is created with:

struct task_struct *kthread_create(int (*threadfn)(void *data),
                                   void *data,
                                   const char namefmt[],
                                   ...)

threadfn: the function the thread runs
data: the argument passed to threadfn
namefmt: the thread's name, a printf()-style format string consuming the variable arguments

The new thread is created in an unrunnable state; it does not start running until explicitly woken with wake_up_process().

A function that performs both steps in one call:

struct task_struct *kthread_run(int (*threadfn)(void *data),
                                void *data,
                                const char namefmt[],
                                ...)

Once started, a kernel thread runs until it calls do_exit(), or until kthread_stop() is called with the task_struct address returned at creation time:

int kthread_stop(struct task_struct *k)
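
Putting the interface together, here is a minimal sketch of a module that starts a worker at load time and stops it at unload; the names (demo_init, demo-worker and so on) are invented for illustration:

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *worker;

static int worker_fn(void *data)
{
    /* loop until someone calls kthread_stop() on us */
    while (!kthread_should_stop())
        msleep(1000);        /* the periodic work would go here */
    return 0;                /* return value is picked up by kthread_stop() */
}

static int __init demo_init(void)
{
    /* kthread_run() = kthread_create() + wake_up_process() */
    worker = kthread_run(worker_fn, NULL, "demo-worker");
    return IS_ERR(worker) ? PTR_ERR(worker) : 0;
}

static void __exit demo_exit(void)
{
    kthread_stop(worker);    /* blocks until worker_fn returns */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");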

4. Process Termination

Most of the work of tearing a process down is handled by do_exit(), defined in kernel/exit.c:

NORET_TYPE void do_exit(long code)
{
    struct task_struct *tsk = current;
    int group_dead;

    profile_task_exit(tsk);

    WARN_ON(atomic_read(&tsk->fs_excl));
    WARN_ON(blk_needs_flush_plug(tsk));

    if (unlikely(in_interrupt()))
        panic("Aiee, killing interrupt handler!");
    if (unlikely(!tsk->pid))
        panic("Attempted to kill the idle task!");

    /*
     * If do_exit is called because this processes oopsed, it's possible
     * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
     * continuing. Amongst other possible reasons, this is to prevent
     * mm_release()->clear_child_tid() from writing to a user-controlled
     * kernel address.
     */
    set_fs(USER_DS);

    tracehook_report_exit(&code);

    validate_creds_for_do_exit(tsk);

    /*
     * We're taking recursive faults here in do_exit. Safest is to just
     * leave this task alone and wait for reboot.
     */
    if (unlikely(tsk->flags & PF_EXITING)) {
        printk(KERN_ALERT
               "Fixing recursive fault but reboot is needed!\n");
        /*
         * We can do this unlocked here. The futex code uses
         * this flag just to verify whether the pi state
         * cleanup has been done or not. In the worst case it
         * loops once more. We pretend that the cleanup was
         * done as there is no way to return. Either the
         * OWNER_DIED bit is set by now or we push the blocked
         * task into the wait for ever nirwana as well.
         */
        tsk->flags |= PF_EXITPIDONE;
        set_current_state(TASK_UNINTERRUPTIBLE);
        schedule();
    }

    exit_irq_thread();

    /* 1) Set the PF_EXITING flag in the flags member of the task_struct */
    exit_signals(tsk);  /* sets PF_EXITING */

    /*
     * tsk->flags are checked in the futex code to protect against
     * an exiting task cleaning up the robust pi futexes.
     */
    smp_mb();
    raw_spin_unlock_wait(&tsk->pi_lock);

    if (unlikely(in_atomic()))
        printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
               current->comm, task_pid_nr(current),
               preempt_count());

    acct_update_integrals(tsk);
    /* sync mm's RSS info before statistics gathering */
    if (tsk->mm)
        sync_mm_rss(tsk, tsk->mm);
    /* 2) Remove any pending kernel timers (cf. del_timer_sync()) */
    group_dead = atomic_dec_and_test(&tsk->signal->live);
    if (group_dead) {
        hrtimer_cancel(&tsk->signal->real_timer);
        exit_itimers(tsk->signal);
        if (tsk->mm)
            setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
    }
    acct_collect(code, group_dead);
    if (group_dead)
        tty_audit_exit();
    if (unlikely(tsk->audit_context))
        audit_free(tsk);

    tsk->exit_code = code;
    taskstats_exit(tsk, group_dead);

    /* 4) Call exit_mm() to release the mm_struct held by the process;
     * if no other process is using it, it is freed for good */
    exit_mm(tsk);

    if (group_dead)
        acct_process();
    trace_sched_process_exit(tsk);

    /* 5) If the process is queued waiting on an IPC semaphore, dequeue it */
    exit_sem(tsk);
    /* 6) Decrement the usage counts of the file descriptors and of the
     * filesystem data. If a count drops to zero, no process is using the
     * resource and it can be released */
    exit_files(tsk);
    exit_fs(tsk);
    check_stack_usage();
    exit_thread();

    /*
     * Flush inherited counters to the parent - before the parent
     * gets woken up by child-exit notifications.
     *
     * because of cgroup mode, must be called before cgroup_exit()
     */
    /* 7) The exit code supplied by exit(), or whatever the kernel
     * mechanism dictates, has been stored in the exit_code member of
     * task_struct; it is kept there for the parent to retrieve */
    perf_event_exit_task(tsk);

    cgroup_exit(tsk, 1);

    if (group_dead)
        disassociate_ctty(1);

    module_put(task_thread_info(tsk)->exec_domain->module);

    proc_exit_connector(tsk);

    /*
     * FIXME: do that only when needed, using sched_exit tracepoint
     */
    ptrace_put_breakpoints(tsk);

    /* 8) Signal the parent, reparent any children to another thread in
     * the thread group or to the init process, and set the task's exit
     * state to EXIT_ZOMBIE */
    exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA
    task_lock(tsk);
    mpol_put(tsk->mempolicy);
    tsk->mempolicy = NULL;
    task_unlock(tsk);
#endif
#ifdef CONFIG_FUTEX
    if (unlikely(current->pi_state_cache))
        kfree(current->pi_state_cache);
#endif
    /*
     * Make sure we are holding no locks:
     */
    debug_check_no_locks_held(tsk);
    /*
     * We can do this unlocked here. The futex code uses this flag
     * just to verify whether the pi state cleanup has been done
     * or not. In the worst case it loops once more.
     */
    tsk->flags |= PF_EXITPIDONE;

    if (tsk->io_context)
        exit_io_context(tsk);

    if (tsk->splice_pipe)
        __free_pipe_info(tsk->splice_pipe);

    validate_creds_for_do_exit(tsk);

    preempt_disable();
    exit_rcu();
    /* causes final put_task_struct in finish_task_switch(). */
    tsk->state = TASK_DEAD;
    /* 9) Call schedule() to switch to a new process. Being EXIT_ZOMBIE,
     * this process will never be scheduled again, so this is the last
     * code the task ever executes */
    schedule();
    BUG();
    /* Avoid "noreturn function does return". */
    for (;;)
        cpu_relax();    /* For when BUG is null */
}

do_exit

4.1 Removing the Process Descriptor

After do_exit() completes, the task_struct is not released until the parent has obtained the information about its terminated child, or has notified the kernel that it is not interested in it.
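From user space this is the familiar wait() contract; a minimal sketch of a parent reaping its child's exit code, at which point the kernel is free to release the child's descriptor:

#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    pid_t pid = fork();

    if (pid == 0)
        _exit(7);                /* child: takes the do_exit() path, becomes a zombie */

    int status;
    waitpid(pid, &status, 0);    /* collect the exit code; the kernel can
                                  * now release_task() the child */
    printf("child exited with %d\n", WEXITSTATUS(status));
    return 0;
}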

When the descriptor is finally released, release_task() is invoked:

void release_task(struct task_struct *p)
{
    struct task_struct *leader;
    int zap_leader;
repeat:
    tracehook_prepare_release_task(p);
    /* don't need to get the RCU readlock here - the process is dead and
     * can't be modifying its own credentials. But shut RCU-lockdep up */
    rcu_read_lock();
    atomic_dec(&__task_cred(p)->user->processes);
    rcu_read_unlock();

    proc_flush_task(p);

    write_lock_irq(&tasklist_lock);
    tracehook_finish_release_task(p);
    /* 1) This calls __unhash_process(), which in turn calls detach_pid()
     * to remove the process from the pidhash and from the task list.
     * 2) It releases any remaining resources used by the now-dead process
     * and finalizes statistics and bookkeeping. */
    __exit_signal(p);

    /*
     * If we are the last non-leader member of the thread
     * group, and the leader is zombie, then notify the
     * group leader's parent process. (if it wants notification.)
     */
    /* 3) If this is the last process in the thread group and the leader
     * is already a zombie, release_task() notifies the zombie leader's
     * parent */
    zap_leader = 0;
    leader = p->group_leader;
    if (leader != p && thread_group_empty(leader) &&
            leader->exit_state == EXIT_ZOMBIE) {
        BUG_ON(task_detached(leader));
        do_notify_parent(leader, leader->exit_signal);
        /*
         * If we were the last child thread and the leader has
         * exited already, and the leader's parent ignores SIGCHLD,
         * then we are the one who should release the leader.
         *
         * do_notify_parent() will have marked it self-reaping in
         * that case.
         */
        zap_leader = task_detached(leader);

        /*
         * This maintains the invariant that release_task()
         * only runs on a task in EXIT_DEAD, just for sanity.
         */
        if (zap_leader)
            leader->exit_state = EXIT_DEAD;
    }

    write_unlock_irq(&tasklist_lock);
    /* 4) Free the pages holding the process kernel stack and thread_info,
     * and return the task_struct to the slab cache */
    release_thread(p);
    call_rcu(&p->rcu, delayed_put_task_struct);

    p = leader;
    if (unlikely(zap_leader))
        goto repeat;
}

release_task
