Blame - kernel/fork.c - LeafOS-Devices/android_kernel_realme_mt6785

blob: 876b31cd822d2669b5831b31707aabfdba676304 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/kernel/fork.c
				3	*
				4	* Copyright (C) 1991, 1992 Linus Torvalds
				5	*/
				6
				7	/*
				8	* 'fork.c' contains the help-routines for the 'fork' system call
				9	* (see also entry.S and others).
				10	* Fork is rather simple, once you get the hang of it, but the memory
				11	* management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
				12	*/
				13
				14	#include <linux/config.h>
				15	#include <linux/slab.h>
				16	#include <linux/init.h>
				17	#include <linux/unistd.h>
				18	#include <linux/smp_lock.h>
				19	#include <linux/module.h>
				20	#include <linux/vmalloc.h>
				21	#include <linux/completion.h>
				22	#include <linux/namespace.h>
				23	#include <linux/personality.h>
				24	#include <linux/mempolicy.h>
				25	#include <linux/sem.h>
				26	#include <linux/file.h>
				27	#include <linux/key.h>
				28	#include <linux/binfmts.h>
				29	#include <linux/mman.h>
				30	#include <linux/fs.h>
				31	#include <linux/cpu.h>
				32	#include <linux/cpuset.h>
				33	#include <linux/security.h>
				34	#include <linux/swap.h>
				35	#include <linux/syscalls.h>
				36	#include <linux/jiffies.h>
				37	#include <linux/futex.h>
				38	#include <linux/ptrace.h>
				39	#include <linux/mount.h>
				40	#include <linux/audit.h>
				41	#include <linux/profile.h>
				42	#include <linux/rmap.h>
				43	#include <linux/acct.h>
				44
				45	#include <asm/pgtable.h>
				46	#include <asm/pgalloc.h>
				47	#include <asm/uaccess.h>
				48	#include <asm/mmu_context.h>
				49	#include <asm/cacheflush.h>
				50	#include <asm/tlbflush.h>
				51
				52	/*
				53	* Counters protected by write_lock_irq(&tasklist_lock)
				54	*/
				55	unsigned long total_forks; /* Handle normal Linux uptimes. */
				56	int nr_threads; /* The idle threads do not count.. */
				57
				58	int max_threads; /* tunable limit on nr_threads */
				59
				60	DEFINE_PER_CPU(unsigned long, process_counts) = 0;
				61
				62	__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
				63
				64	EXPORT_SYMBOL(tasklist_lock);
				65
				66	int nr_processes(void)
				67	{
				68	int cpu;
				69	int total = 0;
				70
				71	for_each_online_cpu(cpu)
				72	total += per_cpu(process_counts, cpu);
				73
				74	return total;
				75	}
				76
				77	#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
				78	# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
				79	# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
				80	static kmem_cache_t *task_struct_cachep;
				81	#endif
				82
				83	/* SLAB cache for signal_struct structures (tsk->signal) */
				84	kmem_cache_t *signal_cachep;
				85
				86	/* SLAB cache for sighand_struct structures (tsk->sighand) */
				87	kmem_cache_t *sighand_cachep;
				88
				89	/* SLAB cache for files_struct structures (tsk->files) */
				90	kmem_cache_t *files_cachep;
				91
				92	/* SLAB cache for fs_struct structures (tsk->fs) */
				93	kmem_cache_t *fs_cachep;
				94
				95	/* SLAB cache for vm_area_struct structures */
				96	kmem_cache_t *vm_area_cachep;
				97
				98	/* SLAB cache for mm_struct structures (tsk->mm) */
				99	static kmem_cache_t *mm_cachep;
				100
				101	void free_task(struct task_struct *tsk)
				102	{
				103	free_thread_info(tsk->thread_info);
				104	free_task_struct(tsk);
				105	}
				106	EXPORT_SYMBOL(free_task);
				107
				108	void __put_task_struct(struct task_struct *tsk)
				109	{
				110	WARN_ON(!(tsk->exit_state & (EXIT_DEAD \| EXIT_ZOMBIE)));
				111	WARN_ON(atomic_read(&tsk->usage));
				112	WARN_ON(tsk == current);
				113
				114	if (unlikely(tsk->audit_context))
				115	audit_free(tsk);
				116	security_task_free(tsk);
				117	free_uid(tsk->user);
				118	put_group_info(tsk->group_info);
				119
				120	if (!profile_handoff_task(tsk))
				121	free_task(tsk);
				122	}
				123
				124	void __init fork_init(unsigned long mempages)
				125	{
				126	#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
				127	#ifndef ARCH_MIN_TASKALIGN
				128	#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
				129	#endif
				130	/* create a slab on which task_structs can be allocated */
				131	task_struct_cachep =
				132	kmem_cache_create("task_struct", sizeof(struct task_struct),
				133	ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL);
				134	#endif
				135
				136	/*
				137	* The default maximum number of threads is set to a safe
				138	* value: the thread structures can take up at most half
				139	* of memory.
				140	*/
				141	max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
				142
				143	/*
				144	* we need to allow at least 20 threads to boot a system
				145	*/
				146	if(max_threads < 20)
				147	max_threads = 20;
				148
				149	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
				150	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
				151	init_task.signal->rlim[RLIMIT_SIGPENDING] =
				152	init_task.signal->rlim[RLIMIT_NPROC];
				153	}
				154
				155	static struct task_struct dup_task_struct(struct task_struct orig)
				156	{
				157	struct task_struct *tsk;
				158	struct thread_info *ti;
				159
				160	prepare_to_copy(orig);
				161
				162	tsk = alloc_task_struct();
				163	if (!tsk)
				164	return NULL;
				165
				166	ti = alloc_thread_info(tsk);
				167	if (!ti) {
				168	free_task_struct(tsk);
				169	return NULL;
				170	}
				171
				172	ti = orig->thread_info;
				173	tsk = orig;
				174	tsk->thread_info = ti;
				175	ti->task = tsk;
				176
				177	/* One for us, one for whoever does the "release_task()" (usually parent) */
				178	atomic_set(&tsk->usage,2);
				179	return tsk;
				180	}
				181
				182	#ifdef CONFIG_MMU
				183	static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
				184	{
				185	struct vm_area_struct * mpnt, tmp, *pprev;
				186	struct rb_node *rb_link, rb_parent;
				187	int retval;
				188	unsigned long charge;
				189	struct mempolicy *pol;
				190
				191	down_write(&oldmm->mmap_sem);
				192	flush_cache_mm(current->mm);
				193	mm->locked_vm = 0;
				194	mm->mmap = NULL;
				195	mm->mmap_cache = NULL;
				196	mm->free_area_cache = oldmm->mmap_base;
Wolfgang Wander	1363c3c	2005-06-21 17:14:49 -0700	[diff] [blame^]	197	mm->cached_hole_size = ~0UL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	198	mm->map_count = 0;
				199	set_mm_counter(mm, rss, 0);
				200	set_mm_counter(mm, anon_rss, 0);
				201	cpus_clear(mm->cpu_vm_mask);
				202	mm->mm_rb = RB_ROOT;
				203	rb_link = &mm->mm_rb.rb_node;
				204	rb_parent = NULL;
				205	pprev = &mm->mmap;
				206
				207	for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
				208	struct file *file;
				209
				210	if (mpnt->vm_flags & VM_DONTCOPY) {
				211	__vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
				212	-vma_pages(mpnt));
				213	continue;
				214	}
				215	charge = 0;
				216	if (mpnt->vm_flags & VM_ACCOUNT) {
				217	unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
				218	if (security_vm_enough_memory(len))
				219	goto fail_nomem;
				220	charge = len;
				221	}
				222	tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
				223	if (!tmp)
				224	goto fail_nomem;
				225	tmp = mpnt;
				226	pol = mpol_copy(vma_policy(mpnt));
				227	retval = PTR_ERR(pol);
				228	if (IS_ERR(pol))
				229	goto fail_nomem_policy;
				230	vma_set_policy(tmp, pol);
				231	tmp->vm_flags &= ~VM_LOCKED;
				232	tmp->vm_mm = mm;
				233	tmp->vm_next = NULL;
				234	anon_vma_link(tmp);
				235	file = tmp->vm_file;
				236	if (file) {
				237	struct inode *inode = file->f_dentry->d_inode;
				238	get_file(file);
				239	if (tmp->vm_flags & VM_DENYWRITE)
				240	atomic_dec(&inode->i_writecount);
				241
				242	/* insert tmp into the share list, just after mpnt */
				243	spin_lock(&file->f_mapping->i_mmap_lock);
				244	tmp->vm_truncate_count = mpnt->vm_truncate_count;
				245	flush_dcache_mmap_lock(file->f_mapping);
				246	vma_prio_tree_add(tmp, mpnt);
				247	flush_dcache_mmap_unlock(file->f_mapping);
				248	spin_unlock(&file->f_mapping->i_mmap_lock);
				249	}
				250
				251	/*
				252	* Link in the new vma and copy the page table entries:
				253	* link in first so that swapoff can see swap entries,
				254	* and try_to_unmap_one's find_vma find the new vma.
				255	*/
				256	spin_lock(&mm->page_table_lock);
				257	*pprev = tmp;
				258	pprev = &tmp->vm_next;
				259
				260	__vma_link_rb(mm, tmp, rb_link, rb_parent);
				261	rb_link = &tmp->vm_rb.rb_right;
				262	rb_parent = &tmp->vm_rb;
				263
				264	mm->map_count++;
				265	retval = copy_page_range(mm, current->mm, tmp);
				266	spin_unlock(&mm->page_table_lock);
				267
				268	if (tmp->vm_ops && tmp->vm_ops->open)
				269	tmp->vm_ops->open(tmp);
				270
				271	if (retval)
				272	goto out;
				273	}
				274	retval = 0;
				275
				276	out:
				277	flush_tlb_mm(current->mm);
				278	up_write(&oldmm->mmap_sem);
				279	return retval;
				280	fail_nomem_policy:
				281	kmem_cache_free(vm_area_cachep, tmp);
				282	fail_nomem:
				283	retval = -ENOMEM;
				284	vm_unacct_memory(charge);
				285	goto out;
				286	}
				287
				288	static inline int mm_alloc_pgd(struct mm_struct * mm)
				289	{
				290	mm->pgd = pgd_alloc(mm);
				291	if (unlikely(!mm->pgd))
				292	return -ENOMEM;
				293	return 0;
				294	}
				295
				296	static inline void mm_free_pgd(struct mm_struct * mm)
				297	{
				298	pgd_free(mm->pgd);
				299	}
				300	#else
				301	#define dup_mmap(mm, oldmm) (0)
				302	#define mm_alloc_pgd(mm) (0)
				303	#define mm_free_pgd(mm)
				304	#endif /* CONFIG_MMU */
				305
				306	__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
				307
				308	#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
				309	#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
				310
				311	#include <linux/init_task.h>
				312
				313	static struct mm_struct * mm_init(struct mm_struct * mm)
				314	{
				315	atomic_set(&mm->mm_users, 1);
				316	atomic_set(&mm->mm_count, 1);
				317	init_rwsem(&mm->mmap_sem);
				318	INIT_LIST_HEAD(&mm->mmlist);
				319	mm->core_waiters = 0;
				320	mm->nr_ptes = 0;
				321	spin_lock_init(&mm->page_table_lock);
				322	rwlock_init(&mm->ioctx_list_lock);
				323	mm->ioctx_list = NULL;
				324	mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
				325	mm->free_area_cache = TASK_UNMAPPED_BASE;
Wolfgang Wander	1363c3c	2005-06-21 17:14:49 -0700	[diff] [blame^]	326	mm->cached_hole_size = ~0UL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	327
				328	if (likely(!mm_alloc_pgd(mm))) {
				329	mm->def_flags = 0;
				330	return mm;
				331	}
				332	free_mm(mm);
				333	return NULL;
				334	}
				335
				336	/*
				337	* Allocate and initialize an mm_struct.
				338	*/
				339	struct mm_struct * mm_alloc(void)
				340	{
				341	struct mm_struct * mm;
				342
				343	mm = allocate_mm();
				344	if (mm) {
				345	memset(mm, 0, sizeof(*mm));
				346	mm = mm_init(mm);
				347	}
				348	return mm;
				349	}
				350
				351	/*
				352	* Called when the last reference to the mm
				353	* is dropped: either by a lazy thread or by
				354	* mmput. Free the page directory and the mm.
				355	*/
				356	void fastcall __mmdrop(struct mm_struct *mm)
				357	{
				358	BUG_ON(mm == &init_mm);
				359	mm_free_pgd(mm);
				360	destroy_context(mm);
				361	free_mm(mm);
				362	}
				363
				364	/*
				365	* Decrement the use count and release all resources for an mm.
				366	*/
				367	void mmput(struct mm_struct *mm)
				368	{
				369	if (atomic_dec_and_test(&mm->mm_users)) {
				370	exit_aio(mm);
				371	exit_mmap(mm);
				372	if (!list_empty(&mm->mmlist)) {
				373	spin_lock(&mmlist_lock);
				374	list_del(&mm->mmlist);
				375	spin_unlock(&mmlist_lock);
				376	}
				377	put_swap_token(mm);
				378	mmdrop(mm);
				379	}
				380	}
				381	EXPORT_SYMBOL_GPL(mmput);
				382
				383	/**
				384	* get_task_mm - acquire a reference to the task's mm
				385	*
				386	* Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning
				387	* this kernel workthread has transiently adopted a user mm with use_mm,
				388	* to do its AIO) is not set and if so returns a reference to it, after
				389	* bumping up the use count. User must release the mm via mmput()
				390	* after use. Typically used by /proc and ptrace.
				391	*/
				392	struct mm_struct get_task_mm(struct task_struct task)
				393	{
				394	struct mm_struct *mm;
				395
				396	task_lock(task);
				397	mm = task->mm;
				398	if (mm) {
				399	if (task->flags & PF_BORROWED_MM)
				400	mm = NULL;
				401	else
				402	atomic_inc(&mm->mm_users);
				403	}
				404	task_unlock(task);
				405	return mm;
				406	}
				407	EXPORT_SYMBOL_GPL(get_task_mm);
				408
				409	/* Please note the differences between mmput and mm_release.
				410	* mmput is called whenever we stop holding onto a mm_struct,
				411	* error success whatever.
				412	*
				413	* mm_release is called after a mm_struct has been removed
				414	* from the current process.
				415	*
				416	* This difference is important for error handling, when we
				417	* only half set up a mm_struct for a new process and need to restore
				418	* the old one. Because we mmput the new mm_struct before
				419	* restoring the old one. . .
				420	* Eric Biederman 10 January 1998
				421	*/
				422	void mm_release(struct task_struct tsk, struct mm_struct mm)
				423	{
				424	struct completion *vfork_done = tsk->vfork_done;
				425
				426	/* Get rid of any cached register state */
				427	deactivate_mm(tsk, mm);
				428
				429	/* notify parent sleeping on vfork() */
				430	if (vfork_done) {
				431	tsk->vfork_done = NULL;
				432	complete(vfork_done);
				433	}
				434	if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) {
				435	u32 __user * tidptr = tsk->clear_child_tid;
				436	tsk->clear_child_tid = NULL;
				437
				438	/*
				439	* We don't check the error code - if userspace has
				440	* not set up a proper pointer then tough luck.
				441	*/
				442	put_user(0, tidptr);
				443	sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
				444	}
				445	}
				446
				447	static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
				448	{
				449	struct mm_struct * mm, *oldmm;
				450	int retval;
				451
				452	tsk->min_flt = tsk->maj_flt = 0;
				453	tsk->nvcsw = tsk->nivcsw = 0;
				454
				455	tsk->mm = NULL;
				456	tsk->active_mm = NULL;
				457
				458	/*
				459	* Are we cloning a kernel thread?
				460	*
				461	* We need to steal a active VM for that..
				462	*/
				463	oldmm = current->mm;
				464	if (!oldmm)
				465	return 0;
				466
				467	if (clone_flags & CLONE_VM) {
				468	atomic_inc(&oldmm->mm_users);
				469	mm = oldmm;
				470	/*
				471	* There are cases where the PTL is held to ensure no
				472	* new threads start up in user mode using an mm, which
				473	* allows optimizing out ipis; the tlb_gather_mmu code
				474	* is an example.
				475	*/
				476	spin_unlock_wait(&oldmm->page_table_lock);
				477	goto good_mm;
				478	}
				479
				480	retval = -ENOMEM;
				481	mm = allocate_mm();
				482	if (!mm)
				483	goto fail_nomem;
				484
				485	/* Copy the current MM stuff.. */
				486	memcpy(mm, oldmm, sizeof(*mm));
				487	if (!mm_init(mm))
				488	goto fail_nomem;
				489
				490	if (init_new_context(tsk,mm))
				491	goto fail_nocontext;
				492
				493	retval = dup_mmap(mm, oldmm);
				494	if (retval)
				495	goto free_pt;
				496
				497	mm->hiwater_rss = get_mm_counter(mm,rss);
				498	mm->hiwater_vm = mm->total_vm;
				499
				500	good_mm:
				501	tsk->mm = mm;
				502	tsk->active_mm = mm;
				503	return 0;
				504
				505	free_pt:
				506	mmput(mm);
				507	fail_nomem:
				508	return retval;
				509
				510	fail_nocontext:
				511	/*
				512	* If init_new_context() failed, we cannot use mmput() to free the mm
				513	* because it calls destroy_context()
				514	*/
				515	mm_free_pgd(mm);
				516	free_mm(mm);
				517	return retval;
				518	}
				519
				520	static inline struct fs_struct __copy_fs_struct(struct fs_struct old)
				521	{
				522	struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
				523	/* We don't need to lock fs - think why ;-) */
				524	if (fs) {
				525	atomic_set(&fs->count, 1);
				526	rwlock_init(&fs->lock);
				527	fs->umask = old->umask;
				528	read_lock(&old->lock);
				529	fs->rootmnt = mntget(old->rootmnt);
				530	fs->root = dget(old->root);
				531	fs->pwdmnt = mntget(old->pwdmnt);
				532	fs->pwd = dget(old->pwd);
				533	if (old->altroot) {
				534	fs->altrootmnt = mntget(old->altrootmnt);
				535	fs->altroot = dget(old->altroot);
				536	} else {
				537	fs->altrootmnt = NULL;
				538	fs->altroot = NULL;
				539	}
				540	read_unlock(&old->lock);
				541	}
				542	return fs;
				543	}
				544
				545	struct fs_struct copy_fs_struct(struct fs_struct old)
				546	{
				547	return __copy_fs_struct(old);
				548	}
				549
				550	EXPORT_SYMBOL_GPL(copy_fs_struct);
				551
				552	static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
				553	{
				554	if (clone_flags & CLONE_FS) {
				555	atomic_inc(&current->fs->count);
				556	return 0;
				557	}
				558	tsk->fs = __copy_fs_struct(current->fs);
				559	if (!tsk->fs)
				560	return -ENOMEM;
				561	return 0;
				562	}
				563
				564	static int count_open_files(struct files_struct *files, int size)
				565	{
				566	int i;
				567
				568	/* Find the last open fd */
				569	for (i = size/(8*sizeof(long)); i > 0; ) {
				570	if (files->open_fds->fds_bits[--i])
				571	break;
				572	}
				573	i = (i+1) * 8 * sizeof(long);
				574	return i;
				575	}
				576
				577	static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
				578	{
				579	struct files_struct oldf, newf;
				580	struct file old_fds, new_fds;
				581	int open_files, size, i, error = 0, expand;
				582
				583	/*
				584	* A background process may not have any files ...
				585	*/
				586	oldf = current->files;
				587	if (!oldf)
				588	goto out;
				589
				590	if (clone_flags & CLONE_FILES) {
				591	atomic_inc(&oldf->count);
				592	goto out;
				593	}
				594
				595	/*
				596	* Note: we may be using current for both targets (See exec.c)
				597	* This works because we cache current->files (old) as oldf. Don't
				598	* break this.
				599	*/
				600	tsk->files = NULL;
				601	error = -ENOMEM;
				602	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
				603	if (!newf)
				604	goto out;
				605
				606	atomic_set(&newf->count, 1);
				607
				608	spin_lock_init(&newf->file_lock);
				609	newf->next_fd = 0;
				610	newf->max_fds = NR_OPEN_DEFAULT;
				611	newf->max_fdset = __FD_SETSIZE;
				612	newf->close_on_exec = &newf->close_on_exec_init;
				613	newf->open_fds = &newf->open_fds_init;
				614	newf->fd = &newf->fd_array[0];
				615
				616	spin_lock(&oldf->file_lock);
				617
				618	open_files = count_open_files(oldf, oldf->max_fdset);
				619	expand = 0;
				620
				621	/*
				622	* Check whether we need to allocate a larger fd array or fd set.
				623	* Note: we're not a clone task, so the open count won't change.
				624	*/
				625	if (open_files > newf->max_fdset) {
				626	newf->max_fdset = 0;
				627	expand = 1;
				628	}
				629	if (open_files > newf->max_fds) {
				630	newf->max_fds = 0;
				631	expand = 1;
				632	}
				633
				634	/* if the old fdset gets grown now, we'll only copy up to "size" fds */
				635	if (expand) {
				636	spin_unlock(&oldf->file_lock);
				637	spin_lock(&newf->file_lock);
				638	error = expand_files(newf, open_files-1);
				639	spin_unlock(&newf->file_lock);
				640	if (error < 0)
				641	goto out_release;
				642	spin_lock(&oldf->file_lock);
				643	}
				644
				645	old_fds = oldf->fd;
				646	new_fds = newf->fd;
				647
				648	memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
				649	memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);
				650
				651	for (i = open_files; i != 0; i--) {
				652	struct file f = old_fds++;
				653	if (f) {
				654	get_file(f);
				655	} else {
				656	/*
				657	* The fd may be claimed in the fd bitmap but not yet
				658	* instantiated in the files array if a sibling thread
				659	* is partway through open(). So make sure that this
				660	* fd is available to the new process.
				661	*/
				662	FD_CLR(open_files - i, newf->open_fds);
				663	}
				664	*new_fds++ = f;
				665	}
				666	spin_unlock(&oldf->file_lock);
				667
				668	/* compute the remainder to be cleared */
				669	size = (newf->max_fds - open_files) * sizeof(struct file *);
				670
				671	/* This is long word aligned thus could use a optimized version */
				672	memset(new_fds, 0, size);
				673
				674	if (newf->max_fdset > open_files) {
				675	int left = (newf->max_fdset-open_files)/8;
				676	int start = open_files / (8 * sizeof(unsigned long));
				677
				678	memset(&newf->open_fds->fds_bits[start], 0, left);
				679	memset(&newf->close_on_exec->fds_bits[start], 0, left);
				680	}
				681
				682	tsk->files = newf;
				683	error = 0;
				684	out:
				685	return error;
				686
				687	out_release:
				688	free_fdset (newf->close_on_exec, newf->max_fdset);
				689	free_fdset (newf->open_fds, newf->max_fdset);
				690	free_fd_array(newf->fd, newf->max_fds);
				691	kmem_cache_free(files_cachep, newf);
				692	goto out;
				693	}
				694
				695	/*
				696	* Helper to unshare the files of the current task.
				697	* We don't want to expose copy_files internals to
				698	* the exec layer of the kernel.
				699	*/
				700
				701	int unshare_files(void)
				702	{
				703	struct files_struct *files = current->files;
				704	int rc;
				705
				706	if(!files)
				707	BUG();
				708
				709	/* This can race but the race causes us to copy when we don't
				710	need to and drop the copy */
				711	if(atomic_read(&files->count) == 1)
				712	{
				713	atomic_inc(&files->count);
				714	return 0;
				715	}
				716	rc = copy_files(0, current);
				717	if(rc)
				718	current->files = files;
				719	return rc;
				720	}
				721
				722	EXPORT_SYMBOL(unshare_files);
				723
				724	static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
				725	{
				726	struct sighand_struct *sig;
				727
				728	if (clone_flags & (CLONE_SIGHAND \| CLONE_THREAD)) {
				729	atomic_inc(&current->sighand->count);
				730	return 0;
				731	}
				732	sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
				733	tsk->sighand = sig;
				734	if (!sig)
				735	return -ENOMEM;
				736	spin_lock_init(&sig->siglock);
				737	atomic_set(&sig->count, 1);
				738	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
				739	return 0;
				740	}
				741
				742	static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
				743	{
				744	struct signal_struct *sig;
				745	int ret;
				746
				747	if (clone_flags & CLONE_THREAD) {
				748	atomic_inc(&current->signal->count);
				749	atomic_inc(&current->signal->live);
				750	return 0;
				751	}
				752	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
				753	tsk->signal = sig;
				754	if (!sig)
				755	return -ENOMEM;
				756
				757	ret = copy_thread_group_keys(tsk);
				758	if (ret < 0) {
				759	kmem_cache_free(signal_cachep, sig);
				760	return ret;
				761	}
				762
				763	atomic_set(&sig->count, 1);
				764	atomic_set(&sig->live, 1);
				765	init_waitqueue_head(&sig->wait_chldexit);
				766	sig->flags = 0;
				767	sig->group_exit_code = 0;
				768	sig->group_exit_task = NULL;
				769	sig->group_stop_count = 0;
				770	sig->curr_target = NULL;
				771	init_sigpending(&sig->shared_pending);
				772	INIT_LIST_HEAD(&sig->posix_timers);
				773
				774	sig->it_real_value = sig->it_real_incr = 0;
				775	sig->real_timer.function = it_real_fn;
				776	sig->real_timer.data = (unsigned long) tsk;
				777	init_timer(&sig->real_timer);
				778
				779	sig->it_virt_expires = cputime_zero;
				780	sig->it_virt_incr = cputime_zero;
				781	sig->it_prof_expires = cputime_zero;
				782	sig->it_prof_incr = cputime_zero;
				783
				784	sig->tty = current->signal->tty;
				785	sig->pgrp = process_group(current);
				786	sig->session = current->signal->session;
				787	sig->leader = 0; /* session leadership doesn't inherit */
				788	sig->tty_old_pgrp = 0;
				789
				790	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
				791	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
				792	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
				793	sig->sched_time = 0;
				794	INIT_LIST_HEAD(&sig->cpu_timers[0]);
				795	INIT_LIST_HEAD(&sig->cpu_timers[1]);
				796	INIT_LIST_HEAD(&sig->cpu_timers[2]);
				797
				798	task_lock(current->group_leader);
				799	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
				800	task_unlock(current->group_leader);
				801
				802	if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
				803	/*
				804	* New sole thread in the process gets an expiry time
				805	* of the whole CPU time limit.
				806	*/
				807	tsk->it_prof_expires =
				808	secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
				809	}
				810
				811	return 0;
				812	}
				813
				814	static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
				815	{
				816	unsigned long new_flags = p->flags;
				817
				818	new_flags &= ~PF_SUPERPRIV;
				819	new_flags \|= PF_FORKNOEXEC;
				820	if (!(clone_flags & CLONE_PTRACE))
				821	p->ptrace = 0;
				822	p->flags = new_flags;
				823	}
				824
				825	asmlinkage long sys_set_tid_address(int __user *tidptr)
				826	{
				827	current->clear_child_tid = tidptr;
				828
				829	return current->pid;
				830	}
				831
				832	/*
				833	* This creates a new process as a copy of the old one,
				834	* but does not actually start it yet.
				835	*
				836	* It copies the registers, and all the appropriate
				837	* parts of the process environment (as per the clone
				838	* flags). The actual kick-off is left to the caller.
				839	*/
				840	static task_t *copy_process(unsigned long clone_flags,
				841	unsigned long stack_start,
				842	struct pt_regs *regs,
				843	unsigned long stack_size,
				844	int __user *parent_tidptr,
				845	int __user *child_tidptr,
				846	int pid)
				847	{
				848	int retval;
				849	struct task_struct *p = NULL;
				850
				851	if ((clone_flags & (CLONE_NEWNS\|CLONE_FS)) == (CLONE_NEWNS\|CLONE_FS))
				852	return ERR_PTR(-EINVAL);
				853
				854	/*
				855	* Thread groups must share signals as well, and detached threads
				856	* can only be started up within the thread group.
				857	*/
				858	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
				859	return ERR_PTR(-EINVAL);
				860
				861	/*
				862	* Shared signal handlers imply shared VM. By way of the above,
				863	* thread groups also imply shared VM. Blocking this case allows
				864	* for various simplifications in other code.
				865	*/
				866	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
				867	return ERR_PTR(-EINVAL);
				868
				869	retval = security_task_create(clone_flags);
				870	if (retval)
				871	goto fork_out;
				872
				873	retval = -ENOMEM;
				874	p = dup_task_struct(current);
				875	if (!p)
				876	goto fork_out;
				877
				878	retval = -EAGAIN;
				879	if (atomic_read(&p->user->processes) >=
				880	p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
				881	if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
				882	p->user != &root_user)
				883	goto bad_fork_free;
				884	}
				885
				886	atomic_inc(&p->user->__count);
				887	atomic_inc(&p->user->processes);
				888	get_group_info(p->group_info);
				889
				890	/*
				891	* If multiple threads are within copy_process(), then this check
				892	* triggers too late. This doesn't hurt, the check is only there
				893	* to stop root fork bombs.
				894	*/
				895	if (nr_threads >= max_threads)
				896	goto bad_fork_cleanup_count;
				897
				898	if (!try_module_get(p->thread_info->exec_domain->module))
				899	goto bad_fork_cleanup_count;
				900
				901	if (p->binfmt && !try_module_get(p->binfmt->module))
				902	goto bad_fork_cleanup_put_domain;
				903
				904	p->did_exec = 0;
				905	copy_flags(clone_flags, p);
				906	p->pid = pid;
				907	retval = -EFAULT;
				908	if (clone_flags & CLONE_PARENT_SETTID)
				909	if (put_user(p->pid, parent_tidptr))
				910	goto bad_fork_cleanup;
				911
				912	p->proc_dentry = NULL;
				913
				914	INIT_LIST_HEAD(&p->children);
				915	INIT_LIST_HEAD(&p->sibling);
				916	p->vfork_done = NULL;
				917	spin_lock_init(&p->alloc_lock);
				918	spin_lock_init(&p->proc_lock);
				919
				920	clear_tsk_thread_flag(p, TIF_SIGPENDING);
				921	init_sigpending(&p->pending);
				922
				923	p->utime = cputime_zero;
				924	p->stime = cputime_zero;
				925	p->sched_time = 0;
				926	p->rchar = 0; /* I/O counter: bytes read */
				927	p->wchar = 0; /* I/O counter: bytes written */
				928	p->syscr = 0; /* I/O counter: read syscalls */
				929	p->syscw = 0; /* I/O counter: write syscalls */
				930	acct_clear_integrals(p);
				931
				932	p->it_virt_expires = cputime_zero;
				933	p->it_prof_expires = cputime_zero;
				934	p->it_sched_expires = 0;
				935	INIT_LIST_HEAD(&p->cpu_timers[0]);
				936	INIT_LIST_HEAD(&p->cpu_timers[1]);
				937	INIT_LIST_HEAD(&p->cpu_timers[2]);
				938
				939	p->lock_depth = -1; /* -1 = no lock */
				940	do_posix_clock_monotonic_gettime(&p->start_time);
				941	p->security = NULL;
				942	p->io_context = NULL;
				943	p->io_wait = NULL;
				944	p->audit_context = NULL;
				945	#ifdef CONFIG_NUMA
				946	p->mempolicy = mpol_copy(p->mempolicy);
				947	if (IS_ERR(p->mempolicy)) {
				948	retval = PTR_ERR(p->mempolicy);
				949	p->mempolicy = NULL;
				950	goto bad_fork_cleanup;
				951	}
				952	#endif
				953
				954	p->tgid = p->pid;
				955	if (clone_flags & CLONE_THREAD)
				956	p->tgid = current->tgid;
				957
				958	if ((retval = security_task_alloc(p)))
				959	goto bad_fork_cleanup_policy;
				960	if ((retval = audit_alloc(p)))
				961	goto bad_fork_cleanup_security;
				962	/* copy all the process information */
				963	if ((retval = copy_semundo(clone_flags, p)))
				964	goto bad_fork_cleanup_audit;
				965	if ((retval = copy_files(clone_flags, p)))
				966	goto bad_fork_cleanup_semundo;
				967	if ((retval = copy_fs(clone_flags, p)))
				968	goto bad_fork_cleanup_files;
				969	if ((retval = copy_sighand(clone_flags, p)))
				970	goto bad_fork_cleanup_fs;
				971	if ((retval = copy_signal(clone_flags, p)))
				972	goto bad_fork_cleanup_sighand;
				973	if ((retval = copy_mm(clone_flags, p)))
				974	goto bad_fork_cleanup_signal;
				975	if ((retval = copy_keys(clone_flags, p)))
				976	goto bad_fork_cleanup_mm;
				977	if ((retval = copy_namespace(clone_flags, p)))
				978	goto bad_fork_cleanup_keys;
				979	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
				980	if (retval)
				981	goto bad_fork_cleanup_namespace;
				982
				983	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
				984	/*
				985	* Clear TID on mm_release()?
				986	*/
				987	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
				988
				989	/*
				990	* Syscall tracing should be turned off in the child regardless
				991	* of CLONE_PTRACE.
				992	*/
				993	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
				994
				995	/* Our parent execution domain becomes current domain
				996	These must match for thread signalling to apply */
				997
				998	p->parent_exec_id = p->self_exec_id;
				999
				1000	/* ok, now we should be set up.. */
				1001	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
				1002	p->pdeath_signal = 0;
				1003	p->exit_state = 0;
				1004
				1005	/* Perform scheduler related setup */
				1006	sched_fork(p);
				1007
				1008	/*
				1009	* Ok, make it visible to the rest of the system.
				1010	* We dont wake it up yet.
				1011	*/
				1012	p->group_leader = p;
				1013	INIT_LIST_HEAD(&p->ptrace_children);
				1014	INIT_LIST_HEAD(&p->ptrace_list);
				1015
				1016	/* Need tasklist lock for parent etc handling! */
				1017	write_lock_irq(&tasklist_lock);
				1018
				1019	/*
				1020	* The task hasn't been attached yet, so cpus_allowed mask cannot
				1021	* have changed. The cpus_allowed mask of the parent may have
				1022	* changed after it was copied first time, and it may then move to
				1023	* another CPU - so we re-copy it here and set the child's CPU to
				1024	* the parent's CPU. This avoids alot of nasty races.
				1025	*/
				1026	p->cpus_allowed = current->cpus_allowed;
				1027	set_task_cpu(p, smp_processor_id());
				1028
				1029	/*
				1030	* Check for pending SIGKILL! The new thread should not be allowed
				1031	* to slip out of an OOM kill. (or normal SIGKILL.)
				1032	*/
				1033	if (sigismember(&current->pending.signal, SIGKILL)) {
				1034	write_unlock_irq(&tasklist_lock);
				1035	retval = -EINTR;
				1036	goto bad_fork_cleanup_namespace;
				1037	}
				1038
				1039	/* CLONE_PARENT re-uses the old parent */
				1040	if (clone_flags & (CLONE_PARENT\|CLONE_THREAD))
				1041	p->real_parent = current->real_parent;
				1042	else
				1043	p->real_parent = current;
				1044	p->parent = p->real_parent;
				1045
				1046	if (clone_flags & CLONE_THREAD) {
				1047	spin_lock(&current->sighand->siglock);
				1048	/*
				1049	* Important: if an exit-all has been started then
				1050	* do not create this new thread - the whole thread
				1051	* group is supposed to exit anyway.
				1052	*/
				1053	if (current->signal->flags & SIGNAL_GROUP_EXIT) {
				1054	spin_unlock(&current->sighand->siglock);
				1055	write_unlock_irq(&tasklist_lock);
				1056	retval = -EAGAIN;
				1057	goto bad_fork_cleanup_namespace;
				1058	}
				1059	p->group_leader = current->group_leader;
				1060
				1061	if (current->signal->group_stop_count > 0) {
				1062	/*
				1063	* There is an all-stop in progress for the group.
				1064	* We ourselves will stop as soon as we check signals.
				1065	* Make the new thread part of that group stop too.
				1066	*/
				1067	current->signal->group_stop_count++;
				1068	set_tsk_thread_flag(p, TIF_SIGPENDING);
				1069	}
				1070
				1071	if (!cputime_eq(current->signal->it_virt_expires,
				1072	cputime_zero) \|\|
				1073	!cputime_eq(current->signal->it_prof_expires,
				1074	cputime_zero) \|\|
				1075	current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY \|\|
				1076	!list_empty(&current->signal->cpu_timers[0]) \|\|
				1077	!list_empty(&current->signal->cpu_timers[1]) \|\|
				1078	!list_empty(&current->signal->cpu_timers[2])) {
				1079	/*
				1080	* Have child wake up on its first tick to check
				1081	* for process CPU timers.
				1082	*/
				1083	p->it_prof_expires = jiffies_to_cputime(1);
				1084	}
				1085
				1086	spin_unlock(&current->sighand->siglock);
				1087	}
				1088
				1089	SET_LINKS(p);
				1090	if (unlikely(p->ptrace & PT_PTRACED))
				1091	__ptrace_link(p, current->parent);
				1092
				1093	cpuset_fork(p);
				1094
				1095	attach_pid(p, PIDTYPE_PID, p->pid);
				1096	attach_pid(p, PIDTYPE_TGID, p->tgid);
				1097	if (thread_group_leader(p)) {
				1098	attach_pid(p, PIDTYPE_PGID, process_group(p));
				1099	attach_pid(p, PIDTYPE_SID, p->signal->session);
				1100	if (p->pid)
				1101	__get_cpu_var(process_counts)++;
				1102	}
				1103
				1104	nr_threads++;
				1105	total_forks++;
				1106	write_unlock_irq(&tasklist_lock);
				1107	retval = 0;
				1108
				1109	fork_out:
				1110	if (retval)
				1111	return ERR_PTR(retval);
				1112	return p;
				1113
				1114	bad_fork_cleanup_namespace:
				1115	exit_namespace(p);
				1116	bad_fork_cleanup_keys:
				1117	exit_keys(p);
				1118	bad_fork_cleanup_mm:
				1119	if (p->mm)
				1120	mmput(p->mm);
				1121	bad_fork_cleanup_signal:
				1122	exit_signal(p);
				1123	bad_fork_cleanup_sighand:
				1124	exit_sighand(p);
				1125	bad_fork_cleanup_fs:
				1126	exit_fs(p); /* blocking */
				1127	bad_fork_cleanup_files:
				1128	exit_files(p); /* blocking */
				1129	bad_fork_cleanup_semundo:
				1130	exit_sem(p);
				1131	bad_fork_cleanup_audit:
				1132	audit_free(p);
				1133	bad_fork_cleanup_security:
				1134	security_task_free(p);
				1135	bad_fork_cleanup_policy:
				1136	#ifdef CONFIG_NUMA
				1137	mpol_free(p->mempolicy);
				1138	#endif
				1139	bad_fork_cleanup:
				1140	if (p->binfmt)
				1141	module_put(p->binfmt->module);
				1142	bad_fork_cleanup_put_domain:
				1143	module_put(p->thread_info->exec_domain->module);
				1144	bad_fork_cleanup_count:
				1145	put_group_info(p->group_info);
				1146	atomic_dec(&p->user->processes);
				1147	free_uid(p->user);
				1148	bad_fork_free:
				1149	free_task(p);
				1150	goto fork_out;
				1151	}
				1152
				1153	struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
				1154	{
				1155	memset(regs, 0, sizeof(struct pt_regs));
				1156	return regs;
				1157	}
				1158
				1159	task_t * __devinit fork_idle(int cpu)
				1160	{
				1161	task_t *task;
				1162	struct pt_regs regs;
				1163
				1164	task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
				1165	if (!task)
				1166	return ERR_PTR(-ENOMEM);
				1167	init_idle(task, cpu);
				1168	unhash_process(task);
				1169	return task;
				1170	}
				1171
				1172	static inline int fork_traceflag (unsigned clone_flags)
				1173	{
				1174	if (clone_flags & CLONE_UNTRACED)
				1175	return 0;
				1176	else if (clone_flags & CLONE_VFORK) {
				1177	if (current->ptrace & PT_TRACE_VFORK)
				1178	return PTRACE_EVENT_VFORK;
				1179	} else if ((clone_flags & CSIGNAL) != SIGCHLD) {
				1180	if (current->ptrace & PT_TRACE_CLONE)
				1181	return PTRACE_EVENT_CLONE;
				1182	} else if (current->ptrace & PT_TRACE_FORK)
				1183	return PTRACE_EVENT_FORK;
				1184
				1185	return 0;
				1186	}
				1187
				1188	/*
				1189	* Ok, this is the main fork-routine.
				1190	*
				1191	* It copies the process, and if successful kick-starts
				1192	* it and waits for it to finish using the VM if required.
				1193	*/
				1194	long do_fork(unsigned long clone_flags,
				1195	unsigned long stack_start,
				1196	struct pt_regs *regs,
				1197	unsigned long stack_size,
				1198	int __user *parent_tidptr,
				1199	int __user *child_tidptr)
				1200	{
				1201	struct task_struct *p;
				1202	int trace = 0;
				1203	long pid = alloc_pidmap();
				1204
				1205	if (pid < 0)
				1206	return -EAGAIN;
				1207	if (unlikely(current->ptrace)) {
				1208	trace = fork_traceflag (clone_flags);
				1209	if (trace)
				1210	clone_flags \|= CLONE_PTRACE;
				1211	}
				1212
				1213	p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
				1214	/*
				1215	* Do this prior waking up the new thread - the thread pointer
				1216	* might get invalid after that point, if the thread exits quickly.
				1217	*/
				1218	if (!IS_ERR(p)) {
				1219	struct completion vfork;
				1220
				1221	if (clone_flags & CLONE_VFORK) {
				1222	p->vfork_done = &vfork;
				1223	init_completion(&vfork);
				1224	}
				1225
				1226	if ((p->ptrace & PT_PTRACED) \|\| (clone_flags & CLONE_STOPPED)) {
				1227	/*
				1228	* We'll start up with an immediate SIGSTOP.
				1229	*/
				1230	sigaddset(&p->pending.signal, SIGSTOP);
				1231	set_tsk_thread_flag(p, TIF_SIGPENDING);
				1232	}
				1233
				1234	if (!(clone_flags & CLONE_STOPPED))
				1235	wake_up_new_task(p, clone_flags);
				1236	else
				1237	p->state = TASK_STOPPED;
				1238
				1239	if (unlikely (trace)) {
				1240	current->ptrace_message = pid;
				1241	ptrace_notify ((trace << 8) \| SIGTRAP);
				1242	}
				1243
				1244	if (clone_flags & CLONE_VFORK) {
				1245	wait_for_completion(&vfork);
				1246	if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
				1247	ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) \| SIGTRAP);
				1248	}
				1249	} else {
				1250	free_pidmap(pid);
				1251	pid = PTR_ERR(p);
				1252	}
				1253	return pid;
				1254	}
				1255
				1256	void __init proc_caches_init(void)
				1257	{
				1258	sighand_cachep = kmem_cache_create("sighand_cache",
				1259	sizeof(struct sighand_struct), 0,
				1260	SLAB_HWCACHE_ALIGN\|SLAB_PANIC, NULL, NULL);
				1261	signal_cachep = kmem_cache_create("signal_cache",
				1262	sizeof(struct signal_struct), 0,
				1263	SLAB_HWCACHE_ALIGN\|SLAB_PANIC, NULL, NULL);
				1264	files_cachep = kmem_cache_create("files_cache",
				1265	sizeof(struct files_struct), 0,
				1266	SLAB_HWCACHE_ALIGN\|SLAB_PANIC, NULL, NULL);
				1267	fs_cachep = kmem_cache_create("fs_cache",
				1268	sizeof(struct fs_struct), 0,
				1269	SLAB_HWCACHE_ALIGN\|SLAB_PANIC, NULL, NULL);
				1270	vm_area_cachep = kmem_cache_create("vm_area_struct",
				1271	sizeof(struct vm_area_struct), 0,
				1272	SLAB_PANIC, NULL, NULL);
				1273	mm_cachep = kmem_cache_create("mm_struct",
				1274	sizeof(struct mm_struct), 0,
				1275	SLAB_HWCACHE_ALIGN\|SLAB_PANIC, NULL, NULL);
				1276	}