Blame - mm/oom_kill.c - LeafOS-Devices/android_kernel_samsung_exynos9820

blob: 7a5c0b229c6ae17fbf9b448952c89def3ca97e4c [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/mm/oom_kill.c
				3	*
				4	* Copyright (C) 1998,2000 Rik van Riel
				5	* Thanks go out to Claus Fischer for some serious inspiration and
				6	* for goading me into coding this file...
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	7	* Copyright (C) 2010 Google, Inc.
				8	* Rewritten by David Rientjes
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	9	*
				10	* The routines in this file are used to kill a process when
Paul Jackson	a49335c	2005-09-06 15:18:09 -0700	[diff] [blame]	11	* we're seriously out of memory. This gets called from __alloc_pages()
				12	* in mm/page_alloc.c when we really run out of memory.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	13	*
				14	* Since we won't call these routines often (on a well-configured
				15	* machine) this file will double as a 'coding guide' and a signpost
				16	* for newbie kernel hackers. It features several pointers to major
				17	* kernel subsystems and hints as to where to find out what things do.
				18	*/
				19
Alexey Dobriyan	8ac773b	2006-10-19 23:28:32 -0700	[diff] [blame]	20	#include <linux/oom.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	21	#include <linux/mm.h>
Alexey Dobriyan	4e950f6	2007-07-30 02:36:13 +0400	[diff] [blame]	22	#include <linux/err.h>
Tejun Heo	5a0e3ad	2010-03-24 17:04:11 +0900	[diff] [blame]	23	#include <linux/gfp.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	24	#include <linux/sched.h>
Ingo Molnar	6e84f31	2017-02-08 18:51:29 +0100	[diff] [blame]	25	#include <linux/sched/mm.h>
Ingo Molnar	f7ccbae	2017-02-08 18:51:30 +0100	[diff] [blame]	26	#include <linux/sched/coredump.h>
Ingo Molnar	2993002	2017-02-08 18:51:36 +0100	[diff] [blame]	27	#include <linux/sched/task.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	28	#include <linux/swap.h>
				29	#include <linux/timex.h>
				30	#include <linux/jiffies.h>
Paul Jackson	ef08e3b	2005-09-06 15:18:13 -0700	[diff] [blame]	31	#include <linux/cpuset.h>
Paul Gortmaker	b95f1b31	2011-10-16 02:01:52 -0400	[diff] [blame]	32	#include <linux/export.h>
Martin Schwidefsky	8bc719d	2006-09-25 23:31:20 -0700	[diff] [blame]	33	#include <linux/notifier.h>
Pavel Emelianov	c7ba5c9	2008-02-07 00:13:58 -0800	[diff] [blame]	34	#include <linux/memcontrol.h>
David Rientjes	6f48d0eb	2010-08-09 17:18:52 -0700	[diff] [blame]	35	#include <linux/mempolicy.h>
David Howells	5cd9c58	2008-08-14 11:37:28 +0100	[diff] [blame]	36	#include <linux/security.h>
David Rientjes	edd4554	2011-03-22 16:30:12 -0700	[diff] [blame]	37	#include <linux/ptrace.h>
David Rientjes	f660daa	2011-10-31 17:07:07 -0700	[diff] [blame]	38	#include <linux/freezer.h>
KAMEZAWA Hiroyuki	43d2b11	2012-01-10 15:08:09 -0800	[diff] [blame]	39	#include <linux/ftrace.h>
David Rientjes	dc3f21e	2012-03-21 16:33:47 -0700	[diff] [blame]	40	#include <linux/ratelimit.h>
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	41	#include <linux/kthread.h>
				42	#include <linux/init.h>
Michal Hocko	4d4bbd8	2017-10-03 16:14:50 -0700	[diff] [blame]	43	#include <linux/mmu_notifier.h>
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	44
				45	#include <asm/tlb.h>
				46	#include "internal.h"
KAMEZAWA Hiroyuki	43d2b11	2012-01-10 15:08:09 -0800	[diff] [blame]	47
				48	#define CREATE_TRACE_POINTS
				49	#include <trace/events/oom.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	50
/*
 * OOM tunables. Only sysctl_oom_dump_tasks is consumed in this part of the
 * file: when non-zero (the default), dump_header() also prints the per-task
 * table. NOTE(review): the other two are presumably exposed through sysctl
 * and read further down in this file -- confirm.
 */
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;

/* Mutex taken by the OOM reaper around its reaping attempt. */
DEFINE_MUTEX(oom_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	56
David Rientjes	6f48d0eb	2010-08-09 17:18:52 -0700	[diff] [blame]	57	#ifdef CONFIG_NUMA
				58	/**
				59	* has_intersects_mems_allowed() - check task eligiblity for kill
Oleg Nesterov	ad96244	2014-01-21 15:50:00 -0800	[diff] [blame]	60	* @start: task struct of which task to consider
David Rientjes	6f48d0eb	2010-08-09 17:18:52 -0700	[diff] [blame]	61	* @mask: nodemask passed to page allocator for mempolicy ooms
				62	*
				63	* Task eligibility is determined by whether or not a candidate task, @tsk,
				64	* shares the same mempolicy nodes as current if it is bound by such a policy
				65	* and whether or not it has the same set of allowed cpuset nodes.
KOSAKI Motohiro	495789a	2009-09-21 17:03:14 -0700	[diff] [blame]	66	*/
Oleg Nesterov	ad96244	2014-01-21 15:50:00 -0800	[diff] [blame]	67	static bool has_intersects_mems_allowed(struct task_struct *start,
David Rientjes	6f48d0eb	2010-08-09 17:18:52 -0700	[diff] [blame]	68	const nodemask_t *mask)
KOSAKI Motohiro	495789a	2009-09-21 17:03:14 -0700	[diff] [blame]	69	{
Oleg Nesterov	ad96244	2014-01-21 15:50:00 -0800	[diff] [blame]	70	struct task_struct *tsk;
				71	bool ret = false;
KOSAKI Motohiro	495789a	2009-09-21 17:03:14 -0700	[diff] [blame]	72
Oleg Nesterov	ad96244	2014-01-21 15:50:00 -0800	[diff] [blame]	73	rcu_read_lock();
Oleg Nesterov	1da4db0	2014-01-21 15:49:58 -0800	[diff] [blame]	74	for_each_thread(start, tsk) {
David Rientjes	6f48d0eb	2010-08-09 17:18:52 -0700	[diff] [blame]	75	if (mask) {
				76	/*
				77	* If this is a mempolicy constrained oom, tsk's
				78	* cpuset is irrelevant. Only return true if its
				79	* mempolicy intersects current, otherwise it may be
				80	* needlessly killed.
				81	*/
Oleg Nesterov	ad96244	2014-01-21 15:50:00 -0800	[diff] [blame]	82	ret = mempolicy_nodemask_intersects(tsk, mask);
David Rientjes	6f48d0eb	2010-08-09 17:18:52 -0700	[diff] [blame]	83	} else {
				84	/*
				85	* This is not a mempolicy constrained oom, so only
				86	* check the mems of tsk's cpuset.
				87	*/
Oleg Nesterov	ad96244	2014-01-21 15:50:00 -0800	[diff] [blame]	88	ret = cpuset_mems_allowed_intersects(current, tsk);
David Rientjes	6f48d0eb	2010-08-09 17:18:52 -0700	[diff] [blame]	89	}
Oleg Nesterov	ad96244	2014-01-21 15:50:00 -0800	[diff] [blame]	90	if (ret)
				91	break;
Oleg Nesterov	1da4db0	2014-01-21 15:49:58 -0800	[diff] [blame]	92	}
Oleg Nesterov	ad96244	2014-01-21 15:50:00 -0800	[diff] [blame]	93	rcu_read_unlock();
KOSAKI Motohiro	df1090a	2010-08-09 17:19:39 -0700	[diff] [blame]	94
Oleg Nesterov	ad96244	2014-01-21 15:50:00 -0800	[diff] [blame]	95	return ret;
KOSAKI Motohiro	495789a	2009-09-21 17:03:14 -0700	[diff] [blame]	96	}
David Rientjes	6f48d0eb	2010-08-09 17:18:52 -0700	[diff] [blame]	97	#else
				98	static bool has_intersects_mems_allowed(struct task_struct *tsk,
				99	const nodemask_t *mask)
				100	{
				101	return true;
				102	}
				103	#endif /* CONFIG_NUMA */
KOSAKI Motohiro	495789a	2009-09-21 17:03:14 -0700	[diff] [blame]	104
David Rientjes	6f48d0eb	2010-08-09 17:18:52 -0700	[diff] [blame]	105	/*
				106	* The process p may have detached its own ->mm while exiting or through
				107	* use_mm(), but one or more of its subthreads may still have a valid
				108	* pointer. Return p, or any of its subthreads with a valid ->mm, with
				109	* task_lock() held.
				110	*/
KAMEZAWA Hiroyuki	158e0a2	2010-08-10 18:03:00 -0700	[diff] [blame]	111	struct task_struct find_lock_task_mm(struct task_struct p)
Oleg Nesterov	dd8e8f4	2010-08-09 17:18:45 -0700	[diff] [blame]	112	{
Oleg Nesterov	1da4db0	2014-01-21 15:49:58 -0800	[diff] [blame]	113	struct task_struct *t;
Oleg Nesterov	dd8e8f4	2010-08-09 17:18:45 -0700	[diff] [blame]	114
Oleg Nesterov	4d4048b	2014-01-21 15:50:01 -0800	[diff] [blame]	115	rcu_read_lock();
				116
Oleg Nesterov	1da4db0	2014-01-21 15:49:58 -0800	[diff] [blame]	117	for_each_thread(p, t) {
Oleg Nesterov	dd8e8f4	2010-08-09 17:18:45 -0700	[diff] [blame]	118	task_lock(t);
				119	if (likely(t->mm))
Oleg Nesterov	4d4048b	2014-01-21 15:50:01 -0800	[diff] [blame]	120	goto found;
Oleg Nesterov	dd8e8f4	2010-08-09 17:18:45 -0700	[diff] [blame]	121	task_unlock(t);
Oleg Nesterov	1da4db0	2014-01-21 15:49:58 -0800	[diff] [blame]	122	}
Oleg Nesterov	4d4048b	2014-01-21 15:50:01 -0800	[diff] [blame]	123	t = NULL;
				124	found:
				125	rcu_read_unlock();
Oleg Nesterov	dd8e8f4	2010-08-09 17:18:45 -0700	[diff] [blame]	126
Oleg Nesterov	4d4048b	2014-01-21 15:50:01 -0800	[diff] [blame]	127	return t;
Oleg Nesterov	dd8e8f4	2010-08-09 17:18:45 -0700	[diff] [blame]	128	}
				129
Yaowei Bai	db2a0dd	2015-11-06 16:28:06 -0800	[diff] [blame]	130	/*
				131	* order == -1 means the oom kill is required by sysrq, otherwise only
				132	* for display purposes.
				133	*/
				134	static inline bool is_sysrq_oom(struct oom_control *oc)
				135	{
				136	return oc->order == -1;
				137	}
				138
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	139	static inline bool is_memcg_oom(struct oom_control *oc)
				140	{
				141	return oc->memcg != NULL;
				142	}
				143
KOSAKI Motohiro	ab290ad	2010-08-09 17:19:35 -0700	[diff] [blame]	144	/* return true if the task is not adequate as candidate victim task. */
David Rientjes	e85bfd3	2010-09-22 13:05:10 -0700	[diff] [blame]	145	static bool oom_unkillable_task(struct task_struct *p,
Johannes Weiner	2314b42	2014-12-10 15:44:33 -0800	[diff] [blame]	146	struct mem_cgroup memcg, const nodemask_t nodemask)
KOSAKI Motohiro	ab290ad	2010-08-09 17:19:35 -0700	[diff] [blame]	147	{
				148	if (is_global_init(p))
				149	return true;
				150	if (p->flags & PF_KTHREAD)
				151	return true;
				152
				153	/* When mem_cgroup_out_of_memory() and p is not member of the group */
Johannes Weiner	72835c8	2012-01-12 17:18:32 -0800	[diff] [blame]	154	if (memcg && !task_in_mem_cgroup(p, memcg))
KOSAKI Motohiro	ab290ad	2010-08-09 17:19:35 -0700	[diff] [blame]	155	return true;
				156
				157	/* p may not have freeable memory in nodemask */
				158	if (!has_intersects_mems_allowed(p, nodemask))
				159	return true;
				160
				161	return false;
				162	}
				163
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	164	/**
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	165	* oom_badness - heuristic function to determine which candidate task to kill
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	166	* @p: task struct of which task we should calculate
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	167	* @totalpages: total present RAM allowed for page allocation
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	168	*
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	169	* The heuristic for determining which task to kill is made to be as simple and
				170	* predictable as possible. The goal is to return the highest value for the
				171	* task consuming the most memory to avoid subsequent oom failures.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	172	*/
David Rientjes	a7f638f	2012-05-29 15:06:47 -0700	[diff] [blame]	173	unsigned long oom_badness(struct task_struct p, struct mem_cgroup memcg,
				174	const nodemask_t *nodemask, unsigned long totalpages)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	175	{
David Rientjes	1e11ad8	2012-06-08 13:21:26 -0700	[diff] [blame]	176	long points;
David Rientjes	61eafb0	2012-06-20 12:52:58 -0700	[diff] [blame]	177	long adj;
KOSAKI Motohiro	28b83c5	2009-09-21 17:03:13 -0700	[diff] [blame]	178
Johannes Weiner	72835c8	2012-01-12 17:18:32 -0800	[diff] [blame]	179	if (oom_unkillable_task(p, memcg, nodemask))
KOSAKI Motohiro	26ebc98	2010-08-09 17:19:37 -0700	[diff] [blame]	180	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	181
Oleg Nesterov	dd8e8f4	2010-08-09 17:18:45 -0700	[diff] [blame]	182	p = find_lock_task_mm(p);
				183	if (!p)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	184	return 0;
				185
Michal Hocko	bb8a4b7	2016-05-20 16:57:18 -0700	[diff] [blame]	186	/*
				187	* Do not even consider tasks which are explicitly marked oom
Michal Hocko	b18dc5f	2016-07-28 15:44:46 -0700	[diff] [blame]	188	* unkillable or have been already oom reaped or the are in
				189	* the middle of vfork
Michal Hocko	bb8a4b7	2016-05-20 16:57:18 -0700	[diff] [blame]	190	*/
David Rientjes	a9c58b90	2012-12-11 16:02:54 -0800	[diff] [blame]	191	adj = (long)p->signal->oom_score_adj;
Michal Hocko	bb8a4b7	2016-05-20 16:57:18 -0700	[diff] [blame]	192	if (adj == OOM_SCORE_ADJ_MIN \|\|
Michal Hocko	862e307	2016-10-07 16:58:57 -0700	[diff] [blame]	193	test_bit(MMF_OOM_SKIP, &p->mm->flags) \|\|
Michal Hocko	b18dc5f	2016-07-28 15:44:46 -0700	[diff] [blame]	194	in_vfork(p)) {
Michal Hocko	5aecc85	2011-11-15 14:36:07 -0800	[diff] [blame]	195	task_unlock(p);
				196	return 0;
				197	}
				198
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	199	/*
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	200	* The baseline for the badness score is the proportion of RAM that each
KOSAKI Motohiro	f755a04	2011-04-27 15:26:50 -0700	[diff] [blame]	201	* task's rss, pagetable and swap space use.
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	202	*/
Kirill A. Shutemov	dc6c9a3	2015-02-11 15:26:50 -0800	[diff] [blame]	203	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
				204	atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
Andrew Morton	97c2c9b8	2006-04-18 22:20:38 -0700	[diff] [blame]	205	task_unlock(p);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	206
				207	/*
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	208	* Root processes get 3% bonus, just like the __vm_enough_memory()
				209	* implementation used by LSMs.
Hugh Dickins	7ba3485	2007-01-05 16:37:03 -0800	[diff] [blame]	210	*/
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	211	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
David Rientjes	778c14a	2014-01-30 15:46:11 -0800	[diff] [blame]	212	points -= (points * 3) / 100;
Hugh Dickins	7ba3485	2007-01-05 16:37:03 -0800	[diff] [blame]	213
David Rientjes	61eafb0	2012-06-20 12:52:58 -0700	[diff] [blame]	214	/* Normalize to oom_score_adj units */
				215	adj *= totalpages / 1000;
				216	points += adj;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	217
David Rientjes	f19e8aa	2010-09-22 13:04:52 -0700	[diff] [blame]	218	/*
David Rientjes	a7f638f	2012-05-29 15:06:47 -0700	[diff] [blame]	219	* Never return 0 for an eligible task regardless of the root bonus and
				220	* oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
David Rientjes	f19e8aa	2010-09-22 13:04:52 -0700	[diff] [blame]	221	*/
David Rientjes	1e11ad8	2012-06-08 13:21:26 -0700	[diff] [blame]	222	return points > 0 ? points : 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	223	}
				224
/* Kind of restriction in effect for an oom, as classified by constrained_alloc(). */
enum oom_constraint {
	CONSTRAINT_NONE,		/* no restriction detected */
	CONSTRAINT_CPUSET,		/* a zone in the zonelist is disallowed by cpuset */
	CONSTRAINT_MEMORY_POLICY,	/* nodemask does not cover all N_MEMORY nodes */
	CONSTRAINT_MEMCG,		/* oom is confined to a memory cgroup */
};
				231
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	232	/*
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	233	* Determine the type of allocation constraint.
				234	*/
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	235	static enum oom_constraint constrained_alloc(struct oom_control *oc)
KAMEZAWA Hiroyuki	4365a56	2009-12-15 16:45:33 -0800	[diff] [blame]	236	{
Mel Gorman	54a6eb5	2008-04-28 02:12:16 -0700	[diff] [blame]	237	struct zone *zone;
Mel Gorman	dd1a239	2008-04-28 02:12:17 -0700	[diff] [blame]	238	struct zoneref *z;
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	239	enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	240	bool cpuset_limited = false;
				241	int nid;
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	242
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	243	if (is_memcg_oom(oc)) {
				244	oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1;
				245	return CONSTRAINT_MEMCG;
				246	}
				247
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	248	/* Default to all available memory */
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	249	oc->totalpages = totalram_pages + total_swap_pages;
				250
				251	if (!IS_ENABLED(CONFIG_NUMA))
				252	return CONSTRAINT_NONE;
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	253
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	254	if (!oc->zonelist)
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	255	return CONSTRAINT_NONE;
KAMEZAWA Hiroyuki	4365a56	2009-12-15 16:45:33 -0800	[diff] [blame]	256	/*
				257	* Reach here only when __GFP_NOFAIL is used. So, we should avoid
				258	* to kill current.We have to random task kill in this case.
				259	* Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
				260	*/
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	261	if (oc->gfp_mask & __GFP_THISNODE)
KAMEZAWA Hiroyuki	4365a56	2009-12-15 16:45:33 -0800	[diff] [blame]	262	return CONSTRAINT_NONE;
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	263
KAMEZAWA Hiroyuki	4365a56	2009-12-15 16:45:33 -0800	[diff] [blame]	264	/*
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	265	* This is not a __GFP_THISNODE allocation, so a truncated nodemask in
				266	* the page allocator means a mempolicy is in effect. Cpuset policy
				267	* is enforced in get_page_from_freelist().
KAMEZAWA Hiroyuki	4365a56	2009-12-15 16:45:33 -0800	[diff] [blame]	268	*/
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	269	if (oc->nodemask &&
				270	!nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	271	oc->totalpages = total_swap_pages;
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	272	for_each_node_mask(nid, *oc->nodemask)
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	273	oc->totalpages += node_spanned_pages(nid);
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	274	return CONSTRAINT_MEMORY_POLICY;
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	275	}
KAMEZAWA Hiroyuki	4365a56	2009-12-15 16:45:33 -0800	[diff] [blame]	276
				277	/* Check this allocation failure is caused by cpuset's wall function */
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	278	for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
				279	high_zoneidx, oc->nodemask)
				280	if (!cpuset_zone_allowed(zone, oc->gfp_mask))
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	281	cpuset_limited = true;
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	282
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	283	if (cpuset_limited) {
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	284	oc->totalpages = total_swap_pages;
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	285	for_each_node_mask(nid, cpuset_current_mems_allowed)
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	286	oc->totalpages += node_spanned_pages(nid);
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	287	return CONSTRAINT_CPUSET;
				288	}
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	289	return CONSTRAINT_NONE;
				290	}
				291
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	292	static int oom_evaluate_task(struct task_struct task, void arg)
David Rientjes	462607e	2012-07-31 16:43:40 -0700	[diff] [blame]	293	{
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	294	struct oom_control *oc = arg;
				295	unsigned long points;
				296
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	297	if (oom_unkillable_task(task, NULL, oc->nodemask))
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	298	goto next;
David Rientjes	462607e	2012-07-31 16:43:40 -0700	[diff] [blame]	299
				300	/*
				301	* This task already has access to memory reserves and is being killed.
Michal Hocko	a373966	2016-07-28 15:45:01 -0700	[diff] [blame]	302	* Don't allow any other task to have access to the reserves unless
Michal Hocko	862e307	2016-10-07 16:58:57 -0700	[diff] [blame]	303	* the task has MMF_OOM_SKIP because chances that it would release
Michal Hocko	a373966	2016-07-28 15:45:01 -0700	[diff] [blame]	304	* any memory is quite low.
David Rientjes	462607e	2012-07-31 16:43:40 -0700	[diff] [blame]	305	*/
Michal Hocko	862e307	2016-10-07 16:58:57 -0700	[diff] [blame]	306	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
				307	if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	308	goto next;
				309	goto abort;
Michal Hocko	a373966	2016-07-28 15:45:01 -0700	[diff] [blame]	310	}
David Rientjes	462607e	2012-07-31 16:43:40 -0700	[diff] [blame]	311
David Rientjes	e1e12d2	2012-12-11 16:02:56 -0800	[diff] [blame]	312	/*
				313	* If task is allocating a lot of memory and has been marked to be
				314	* killed first if it triggers an oom, then select it.
				315	*/
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	316	if (oom_task_origin(task)) {
				317	points = ULONG_MAX;
				318	goto select;
				319	}
David Rientjes	e1e12d2	2012-12-11 16:02:56 -0800	[diff] [blame]	320
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	321	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
				322	if (!points \|\| points < oc->chosen_points)
				323	goto next;
				324
				325	/* Prefer thread group leaders for display purposes */
				326	if (points == oc->chosen_points && thread_group_leader(oc->chosen))
				327	goto next;
				328	select:
				329	if (oc->chosen)
				330	put_task_struct(oc->chosen);
				331	get_task_struct(task);
				332	oc->chosen = task;
				333	oc->chosen_points = points;
				334	next:
				335	return 0;
				336	abort:
				337	if (oc->chosen)
				338	put_task_struct(oc->chosen);
				339	oc->chosen = (void *)-1UL;
				340	return 1;
David Rientjes	462607e	2012-07-31 16:43:40 -0700	[diff] [blame]	341	}
				342
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	343	/*
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	344	* Simple selection loop. We choose the process with the highest number of
				345	* 'points'. In case scan was aborted, oc->chosen is set to -1.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	346	*/
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	347	static void select_bad_process(struct oom_control *oc)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	348	{
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	349	if (is_memcg_oom(oc))
				350	mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
				351	else {
				352	struct task_struct *p;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	353
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	354	rcu_read_lock();
				355	for_each_process(p)
				356	if (oom_evaluate_task(p, oc))
				357	break;
				358	rcu_read_unlock();
Oleg Nesterov	1da4db0	2014-01-21 15:49:58 -0800	[diff] [blame]	359	}
Oleg Nesterov	972c4ea	2006-09-29 02:01:12 -0700	[diff] [blame]	360
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	361	oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	362	}
				363
				364	/**
Randy Dunlap	1b578df	2008-03-19 17:00:42 -0700	[diff] [blame]	365	* dump_tasks - dump current memory state of all system tasks
Wanpeng Li	dad7557	2012-06-20 12:53:01 -0700	[diff] [blame]	366	* @memcg: current's memory controller, if constrained
David Rientjes	e85bfd3	2010-09-22 13:05:10 -0700	[diff] [blame]	367	* @nodemask: nodemask passed to page allocator for mempolicy ooms
Randy Dunlap	1b578df	2008-03-19 17:00:42 -0700	[diff] [blame]	368	*
David Rientjes	e85bfd3	2010-09-22 13:05:10 -0700	[diff] [blame]	369	* Dumps the current memory state of all eligible tasks. Tasks not in the same
				370	* memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
				371	* are not shown.
David Rientjes	de34d96	2012-07-31 16:42:56 -0700	[diff] [blame]	372	* State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
				373	* swapents, oom_score_adj value, and name.
David Rientjes	fef1bdd	2008-02-07 00:14:07 -0800	[diff] [blame]	374	*/
Johannes Weiner	2314b42	2014-12-10 15:44:33 -0800	[diff] [blame]	375	static void dump_tasks(struct mem_cgroup memcg, const nodemask_t nodemask)
David Rientjes	fef1bdd	2008-02-07 00:14:07 -0800	[diff] [blame]	376	{
KOSAKI Motohiro	c55db95	2010-08-09 17:18:46 -0700	[diff] [blame]	377	struct task_struct *p;
				378	struct task_struct *task;
David Rientjes	fef1bdd	2008-02-07 00:14:07 -0800	[diff] [blame]	379
Kirill A. Shutemov	dc6c9a3	2015-02-11 15:26:50 -0800	[diff] [blame]	380	pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n");
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	381	rcu_read_lock();
KOSAKI Motohiro	c55db95	2010-08-09 17:18:46 -0700	[diff] [blame]	382	for_each_process(p) {
Johannes Weiner	72835c8	2012-01-12 17:18:32 -0800	[diff] [blame]	383	if (oom_unkillable_task(p, memcg, nodemask))
David Rientjes	fef1bdd	2008-02-07 00:14:07 -0800	[diff] [blame]	384	continue;
				385
KOSAKI Motohiro	c55db95	2010-08-09 17:18:46 -0700	[diff] [blame]	386	task = find_lock_task_mm(p);
				387	if (!task) {
David Rientjes	6d2661e	2009-05-28 14:34:19 -0700	[diff] [blame]	388	/*
David Rientjes	74ab7f1	2010-08-09 17:18:46 -0700	[diff] [blame]	389	* This is a kthread or all of p's threads have already
				390	* detached their mm's. There's no need to report
KOSAKI Motohiro	c55db95	2010-08-09 17:18:46 -0700	[diff] [blame]	391	* them; they can't be oom killed anyway.
David Rientjes	6d2661e	2009-05-28 14:34:19 -0700	[diff] [blame]	392	*/
David Rientjes	6d2661e	2009-05-28 14:34:19 -0700	[diff] [blame]	393	continue;
				394	}
KOSAKI Motohiro	c55db95	2010-08-09 17:18:46 -0700	[diff] [blame]	395
Kirill A. Shutemov	dc6c9a3	2015-02-11 15:26:50 -0800	[diff] [blame]	396	pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n",
Eric W. Biederman	078de5f	2012-02-08 07:00:08 -0800	[diff] [blame]	397	task->pid, from_kuid(&init_user_ns, task_uid(task)),
				398	task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
Kirill A. Shutemov	e1f56c8	2013-11-14 14:30:48 -0800	[diff] [blame]	399	atomic_long_read(&task->mm->nr_ptes),
Kirill A. Shutemov	dc6c9a3	2015-02-11 15:26:50 -0800	[diff] [blame]	400	mm_nr_pmds(task->mm),
David Rientjes	de34d96	2012-07-31 16:42:56 -0700	[diff] [blame]	401	get_mm_counter(task->mm, MM_SWAPENTS),
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	402	task->signal->oom_score_adj, task->comm);
KOSAKI Motohiro	c55db95	2010-08-09 17:18:46 -0700	[diff] [blame]	403	task_unlock(task);
				404	}
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	405	rcu_read_unlock();
David Rientjes	fef1bdd	2008-02-07 00:14:07 -0800	[diff] [blame]	406	}
				407
Vladimir Davydov	2a966b7	2016-07-26 15:22:33 -0700	[diff] [blame]	408	static void dump_header(struct oom_control oc, struct task_struct p)
David Rientjes	1b604d7	2009-12-14 17:57:47 -0800	[diff] [blame]	409	{
David Rientjes	299c517	2017-02-24 14:55:42 -0800	[diff] [blame]	410	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=",
				411	current->comm, oc->gfp_mask, &oc->gfp_mask);
				412	if (oc->nodemask)
				413	pr_cont("%*pbl", nodemask_pr_args(oc->nodemask));
				414	else
				415	pr_cont("(null)");
				416	pr_cont(", order=%d, oom_score_adj=%hd\n",
				417	oc->order, current->signal->oom_score_adj);
Michal Hocko	9254990	2016-10-07 16:59:33 -0700	[diff] [blame]	418	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
				419	pr_warn("COMPACTION is disabled!!!\n");
Vlastimil Babka	a0795cd	2016-03-15 14:56:05 -0700	[diff] [blame]	420
David Rientjes	da39da3	2015-11-05 18:48:05 -0800	[diff] [blame]	421	cpuset_print_current_mems_allowed();
David Rientjes	1b604d7	2009-12-14 17:57:47 -0800	[diff] [blame]	422	dump_stack();
Vladimir Davydov	2a966b7	2016-07-26 15:22:33 -0700	[diff] [blame]	423	if (oc->memcg)
				424	mem_cgroup_print_oom_info(oc->memcg, p);
Sha Zhengju	58cf188	2013-02-22 16:32:05 -0800	[diff] [blame]	425	else
David Rientjes	299c517	2017-02-24 14:55:42 -0800	[diff] [blame]	426	show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
David Rientjes	1b604d7	2009-12-14 17:57:47 -0800	[diff] [blame]	427	if (sysctl_oom_dump_tasks)
Vladimir Davydov	2a966b7	2016-07-26 15:22:33 -0700	[diff] [blame]	428	dump_tasks(oc->memcg, oc->nodemask);
David Rientjes	1b604d7	2009-12-14 17:57:47 -0800	[diff] [blame]	429	}
				430
Michal Hocko	5695be1	2014-10-20 18:12:32 +0200	[diff] [blame]	431	/*
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	432	* Number of OOM victims in flight
Michal Hocko	5695be1	2014-10-20 18:12:32 +0200	[diff] [blame]	433	*/
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	434	static atomic_t oom_victims = ATOMIC_INIT(0);
				435	static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
Michal Hocko	5695be1	2014-10-20 18:12:32 +0200	[diff] [blame]	436
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	437	static bool oom_killer_disabled __read_mostly;
Michal Hocko	5695be1	2014-10-20 18:12:32 +0200	[diff] [blame]	438
Michal Hocko	bc448e8	2016-03-25 14:20:30 -0700	[diff] [blame]	439	#define K(x) ((x) << (PAGE_SHIFT-10))
				440
Michal Hocko	3ef22df	2016-05-19 17:13:12 -0700	[diff] [blame]	441	/*
				442	* task->mm can be NULL if the task is the exited group leader. So to
				443	* determine whether the task is using a particular mm, we examine all the
				444	* task's threads: if one of those is using this mm then this task was also
				445	* using it.
				446	*/
Michal Hocko	44a70ade	2016-07-28 15:44:43 -0700	[diff] [blame]	447	bool process_shares_mm(struct task_struct p, struct mm_struct mm)
Michal Hocko	3ef22df	2016-05-19 17:13:12 -0700	[diff] [blame]	448	{
				449	struct task_struct *t;
				450
				451	for_each_thread(p, t) {
				452	struct mm_struct *t_mm = READ_ONCE(t->mm);
				453	if (t_mm)
				454	return t_mm == mm;
				455	}
				456	return false;
				457	}
				458
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	459	#ifdef CONFIG_MMU
				460	/*
				461	* OOM Reaper kernel thread which tries to reap the memory used by the OOM
				462	* victim (if that is possible) to help the OOM killer to move on.
				463	*/
				464	static struct task_struct *oom_reaper_th;
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	465	static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
Vladimir Davydov	29c696e	2016-03-25 14:20:39 -0700	[diff] [blame]	466	static struct task_struct *oom_reaper_list;
Michal Hocko	0304926	2016-03-25 14:20:33 -0700	[diff] [blame]	467	static DEFINE_SPINLOCK(oom_reaper_lock);
				468
David Rientjes	2270dfc	2018-05-11 16:02:04 -0700	[diff] [blame]	469	void __oom_reap_task_mm(struct mm_struct *mm)
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	470	{
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	471	struct vm_area_struct *vma;
David Rientjes	2270dfc	2018-05-11 16:02:04 -0700	[diff] [blame]	472
				473	/*
				474	* Tell all users of get_user/copy_from_user etc... that the content
				475	* is no longer stable. No barriers really needed because unmapping
				476	* should imply barriers already and the reader would hit a page fault
				477	* if it stumbled over a reaped memory.
				478	*/
				479	set_bit(MMF_UNSTABLE, &mm->flags);
				480
				481	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
				482	if (!can_madv_dontneed_vma(vma))
				483	continue;
				484
				485	/*
				486	* Only anonymous pages have a good chance to be dropped
				487	* without additional steps which we cannot afford as we
				488	* are OOM already.
				489	*
				490	* We do not even care about fs backed pages because all
				491	* which are reclaimable have already been reclaimed and
				492	* we do not want to block exit_mmap by keeping mm ref
				493	* count elevated without a good reason.
				494	*/
				495	if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
				496	struct mmu_gather tlb;
				497
				498	tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
				499	unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
				500	NULL);
				501	tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
				502	}
				503	}
				504	}
				505
				506	static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
				507	{
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	508	bool ret = true;
				509
Michal Hocko	36324a9	2016-03-25 14:20:27 -0700	[diff] [blame]	510	/*
Michal Hocko	e2fe145	2016-05-27 14:27:35 -0700	[diff] [blame]	511	* We have to make sure to not race with the victim exit path
				512	* and cause premature new oom victim selection:
David Rientjes	2270dfc	2018-05-11 16:02:04 -0700	[diff] [blame]	513	* oom_reap_task_mm exit_mm
Michal Hocko	e5e3f4c	2016-07-26 15:24:50 -0700	[diff] [blame]	514	* mmget_not_zero
Michal Hocko	e2fe145	2016-05-27 14:27:35 -0700	[diff] [blame]	515	* mmput
				516	* atomic_dec_and_test
				517	* exit_oom_victim
				518	* [...]
				519	* out_of_memory
				520	* select_bad_process
				521	* # no TIF_MEMDIE task selects new victim
				522	* unmap_page_range # frees some memory
				523	*/
				524	mutex_lock(&oom_lock);
				525
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	526	if (!down_read_trylock(&mm->mmap_sem)) {
				527	ret = false;
Roman Gushchin	422580c	2017-07-10 15:49:05 -0700	[diff] [blame]	528	trace_skip_task_reaping(tsk->pid);
Tetsuo Handa	7ebffa4	2016-10-07 16:58:45 -0700	[diff] [blame]	529	goto unlock_oom;
Michal Hocko	e5e3f4c	2016-07-26 15:24:50 -0700	[diff] [blame]	530	}
				531
				532	/*
Michal Hocko	4d4bbd8	2017-10-03 16:14:50 -0700	[diff] [blame]	533	* If the mm has notifiers then we would need to invalidate them around
				534	* unmap_page_range and that is risky because notifiers can sleep and
				535	* what they do is basically nondeterministic. So let's have a short
				536	* sleep to give the oom victim some more time.
				537	* TODO: we really want to get rid of this ugly hack and make sure that
				538	* notifiers cannot block for unbounded amount of time and add
				539	* mmu_notifier_invalidate_range_{start,end} around unmap_page_range
				540	*/
				541	if (mm_has_notifiers(mm)) {
				542	up_read(&mm->mmap_sem);
				543	schedule_timeout_idle(HZ);
				544	goto unlock_oom;
				545	}
				546
				547	/*
Andrea Arcangeli	2129258	2017-09-06 16:25:00 -0700	[diff] [blame]	548	* MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
				549	* work on the mm anymore. The check for MMF_OOM_SKIP must run
				550	* under mmap_sem for reading because it serializes against the
				551	* down_write();up_write() cycle in exit_mmap().
Michal Hocko	e5e3f4c	2016-07-26 15:24:50 -0700	[diff] [blame]	552	*/
Andrea Arcangeli	2129258	2017-09-06 16:25:00 -0700	[diff] [blame]	553	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
Michal Hocko	e5e3f4c	2016-07-26 15:24:50 -0700	[diff] [blame]	554	up_read(&mm->mmap_sem);
Roman Gushchin	422580c	2017-07-10 15:49:05 -0700	[diff] [blame]	555	trace_skip_task_reaping(tsk->pid);
Tetsuo Handa	7ebffa4	2016-10-07 16:58:45 -0700	[diff] [blame]	556	goto unlock_oom;
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	557	}
				558
Roman Gushchin	422580c	2017-07-10 15:49:05 -0700	[diff] [blame]	559	trace_start_task_reaping(tsk->pid);
				560
David Rientjes	2270dfc	2018-05-11 16:02:04 -0700	[diff] [blame]	561	__oom_reap_task_mm(mm);
Michal Hocko	3f70dc3	2016-10-07 16:59:06 -0700	[diff] [blame]	562
Michal Hocko	bc448e8	2016-03-25 14:20:30 -0700	[diff] [blame]	563	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
				564	task_pid_nr(tsk), tsk->comm,
				565	K(get_mm_counter(mm, MM_ANONPAGES)),
				566	K(get_mm_counter(mm, MM_FILEPAGES)),
				567	K(get_mm_counter(mm, MM_SHMEMPAGES)));
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	568	up_read(&mm->mmap_sem);
Michal Hocko	36324a9	2016-03-25 14:20:27 -0700	[diff] [blame]	569
Roman Gushchin	422580c	2017-07-10 15:49:05 -0700	[diff] [blame]	570	trace_finish_task_reaping(tsk->pid);
Michal Hocko	e5e3f4c	2016-07-26 15:24:50 -0700	[diff] [blame]	571	unlock_oom:
				572	mutex_unlock(&oom_lock);
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	573	return ret;
				574	}
				575
Michal Hocko	bc448e8	2016-03-25 14:20:30 -0700	[diff] [blame]	576	#define MAX_OOM_REAP_RETRIES 10
Michal Hocko	36324a9	2016-03-25 14:20:27 -0700	[diff] [blame]	577	static void oom_reap_task(struct task_struct *tsk)
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	578	{
				579	int attempts = 0;
Michal Hocko	26db62f	2016-10-07 16:58:51 -0700	[diff] [blame]	580	struct mm_struct *mm = tsk->signal->oom_mm;
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	581
				582	/* Retry the down_read_trylock(mmap_sem) a few times */
David Rientjes	2270dfc	2018-05-11 16:02:04 -0700	[diff] [blame]	583	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	584	schedule_timeout_idle(HZ/10);
				585
Tetsuo Handa	7ebffa4	2016-10-07 16:58:45 -0700	[diff] [blame]	586	if (attempts <= MAX_OOM_REAP_RETRIES)
				587	goto done;
Michal Hocko	11a410d	2016-07-28 15:44:58 -0700	[diff] [blame]	588
Tetsuo Handa	7ebffa4	2016-10-07 16:58:45 -0700	[diff] [blame]	589	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
				590	task_pid_nr(tsk), tsk->comm);
Tetsuo Handa	7ebffa4	2016-10-07 16:58:45 -0700	[diff] [blame]	591	debug_show_all_locks();
Michal Hocko	bc448e8	2016-03-25 14:20:30 -0700	[diff] [blame]	592
Tetsuo Handa	7ebffa4	2016-10-07 16:58:45 -0700	[diff] [blame]	593	done:
Michal Hocko	449d777	2016-05-19 17:13:15 -0700	[diff] [blame]	594	tsk->oom_reaper_list = NULL;
Michal Hocko	449d777	2016-05-19 17:13:15 -0700	[diff] [blame]	595
Michal Hocko	26db62f	2016-10-07 16:58:51 -0700	[diff] [blame]	596	/*
				597	* Hide this mm from OOM killer because it has been either reaped or
				598	* somebody can't call up_write(mmap_sem).
				599	*/
Michal Hocko	862e307	2016-10-07 16:58:57 -0700	[diff] [blame]	600	set_bit(MMF_OOM_SKIP, &mm->flags);
Michal Hocko	26db62f	2016-10-07 16:58:51 -0700	[diff] [blame]	601
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	602	/* Drop a reference taken by wake_oom_reaper */
Michal Hocko	36324a9	2016-03-25 14:20:27 -0700	[diff] [blame]	603	put_task_struct(tsk);
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	604	}
				605
				606	static int oom_reaper(void *unused)
				607	{
				608	while (true) {
Michal Hocko	0304926	2016-03-25 14:20:33 -0700	[diff] [blame]	609	struct task_struct *tsk = NULL;
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	610
Vladimir Davydov	29c696e	2016-03-25 14:20:39 -0700	[diff] [blame]	611	wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
Michal Hocko	0304926	2016-03-25 14:20:33 -0700	[diff] [blame]	612	spin_lock(&oom_reaper_lock);
Vladimir Davydov	29c696e	2016-03-25 14:20:39 -0700	[diff] [blame]	613	if (oom_reaper_list != NULL) {
				614	tsk = oom_reaper_list;
				615	oom_reaper_list = tsk->oom_reaper_list;
Michal Hocko	0304926	2016-03-25 14:20:33 -0700	[diff] [blame]	616	}
				617	spin_unlock(&oom_reaper_lock);
				618
				619	if (tsk)
				620	oom_reap_task(tsk);
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	621	}
				622
				623	return 0;
				624	}
				625
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	626	static void wake_oom_reaper(struct task_struct *tsk)
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	627	{
Michal Hocko	af8e15c	2016-04-01 14:31:34 -0700	[diff] [blame]	628	if (!oom_reaper_th)
				629	return;
				630
Tetsuo Handa	7317854	2019-02-01 14:20:31 -0800	[diff] [blame]	631	/* mm is already queued? */
				632	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	633	return;
				634
Michal Hocko	36324a9	2016-03-25 14:20:27 -0700	[diff] [blame]	635	get_task_struct(tsk);
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	636
Michal Hocko	0304926	2016-03-25 14:20:33 -0700	[diff] [blame]	637	spin_lock(&oom_reaper_lock);
Vladimir Davydov	29c696e	2016-03-25 14:20:39 -0700	[diff] [blame]	638	tsk->oom_reaper_list = oom_reaper_list;
				639	oom_reaper_list = tsk;
Michal Hocko	0304926	2016-03-25 14:20:33 -0700	[diff] [blame]	640	spin_unlock(&oom_reaper_lock);
Roman Gushchin	422580c	2017-07-10 15:49:05 -0700	[diff] [blame]	641	trace_wake_reaper(tsk->pid);
Michal Hocko	0304926	2016-03-25 14:20:33 -0700	[diff] [blame]	642	wake_up(&oom_reaper_wait);
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	643	}
				644
				645	static int __init oom_init(void)
				646	{
				647	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
				648	if (IS_ERR(oom_reaper_th)) {
				649	pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
				650	PTR_ERR(oom_reaper_th));
				651	oom_reaper_th = NULL;
				652	}
				653	return 0;
				654	}
				655	subsys_initcall(oom_init)
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	656	#else
				657	static inline void wake_oom_reaper(struct task_struct *tsk)
				658	{
				659	}
				660	#endif /* CONFIG_MMU */
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	661
Michal Hocko	49550b6	2015-02-11 15:26:12 -0800	[diff] [blame]	662	/**
Johannes Weiner	16e9519	2015-06-24 16:57:07 -0700	[diff] [blame]	663	* mark_oom_victim - mark the given task as OOM victim
Michal Hocko	49550b6	2015-02-11 15:26:12 -0800	[diff] [blame]	664	* @tsk: task to mark
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	665	*
Johannes Weiner	dc56401	2015-06-24 16:57:19 -0700	[diff] [blame]	666	* Has to be called with oom_lock held and never after
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	667	* oom has been disabled already.
Michal Hocko	26db62f	2016-10-07 16:58:51 -0700	[diff] [blame]	668	*
				669	* tsk->mm has to be non NULL and caller has to guarantee it is stable (either
				670	* under task_lock or operate on the current).
Michal Hocko	49550b6	2015-02-11 15:26:12 -0800	[diff] [blame]	671	*/
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	672	static void mark_oom_victim(struct task_struct *tsk)
Michal Hocko	49550b6	2015-02-11 15:26:12 -0800	[diff] [blame]	673	{
Michal Hocko	26db62f	2016-10-07 16:58:51 -0700	[diff] [blame]	674	struct mm_struct *mm = tsk->mm;
				675
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	676	WARN_ON(oom_killer_disabled);
				677	/* OOM killer might race with memcg OOM */
				678	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
				679	return;
Michal Hocko	26db62f	2016-10-07 16:58:51 -0700	[diff] [blame]	680
Michal Hocko	26db62f	2016-10-07 16:58:51 -0700	[diff] [blame]	681	/* oom_mm is bound to the signal struct life time. */
Michal Hocko	55fe469	2017-12-14 15:33:15 -0800	[diff] [blame]	682	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
Vegard Nossum	f1f1007	2017-02-27 14:30:07 -0800	[diff] [blame]	683	mmgrab(tsk->signal->oom_mm);
Michal Hocko	55fe469	2017-12-14 15:33:15 -0800	[diff] [blame]	684	set_bit(MMF_OOM_VICTIM, &mm->flags);
				685	}
Michal Hocko	26db62f	2016-10-07 16:58:51 -0700	[diff] [blame]	686
Michal Hocko	63a8ca9	2015-02-11 15:26:15 -0800	[diff] [blame]	687	/*
				688	* Make sure that the task is woken up from uninterruptible sleep
				689	* if it is frozen because OOM killer wouldn't be able to free
				690	* any memory and livelock. freezing_slow_path will tell the freezer
				691	* that TIF_MEMDIE tasks should be ignored.
				692	*/
				693	__thaw_task(tsk);
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	694	atomic_inc(&oom_victims);
Roman Gushchin	422580c	2017-07-10 15:49:05 -0700	[diff] [blame]	695	trace_mark_victim(tsk->pid);
Michal Hocko	49550b6	2015-02-11 15:26:12 -0800	[diff] [blame]	696	}
				697
				698	/**
Johannes Weiner	16e9519	2015-06-24 16:57:07 -0700	[diff] [blame]	699	* exit_oom_victim - note the exit of an OOM victim
Michal Hocko	49550b6	2015-02-11 15:26:12 -0800	[diff] [blame]	700	*/
Tetsuo Handa	3853120	2016-10-07 16:59:03 -0700	[diff] [blame]	701	void exit_oom_victim(void)
Michal Hocko	49550b6	2015-02-11 15:26:12 -0800	[diff] [blame]	702	{
Tetsuo Handa	3853120	2016-10-07 16:59:03 -0700	[diff] [blame]	703	clear_thread_flag(TIF_MEMDIE);
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	704
Johannes Weiner	c38f102	2015-06-24 16:57:13 -0700	[diff] [blame]	705	if (!atomic_dec_return(&oom_victims))
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	706	wake_up_all(&oom_victims_wait);
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	707	}
				708
				709	/**
Michal Hocko	7d2e7a2	2016-10-07 16:59:00 -0700	[diff] [blame]	710	* oom_killer_enable - enable OOM killer
				711	*/
				712	void oom_killer_enable(void)
				713	{
				714	oom_killer_disabled = false;
Michal Hocko	d75da00	2017-05-03 14:54:57 -0700	[diff] [blame]	715	pr_info("OOM killer enabled.\n");
Michal Hocko	7d2e7a2	2016-10-07 16:59:00 -0700	[diff] [blame]	716	}
				717
				718	/**
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	719	* oom_killer_disable - disable OOM killer
Michal Hocko	7d2e7a2	2016-10-07 16:59:00 -0700	[diff] [blame]	720	* @timeout: maximum timeout to wait for oom victims in jiffies
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	721	*
				722	* Forces all page allocations to fail rather than trigger OOM killer.
Michal Hocko	7d2e7a2	2016-10-07 16:59:00 -0700	[diff] [blame]	723	* Will block and wait until all OOM victims are killed or the given
				724	* timeout expires.
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	725	*
				726	* The function cannot be called when there are runnable user tasks because
				727	* the userspace would see unexpected allocation failures as a result. Any
				728	* new usage of this function should be consulted with MM people.
				729	*
				730	* Returns true if successful and false if the OOM killer cannot be
				731	* disabled.
				732	*/
Michal Hocko	7d2e7a2	2016-10-07 16:59:00 -0700	[diff] [blame]	733	bool oom_killer_disable(signed long timeout)
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	734	{
Michal Hocko	7d2e7a2	2016-10-07 16:59:00 -0700	[diff] [blame]	735	signed long ret;
				736
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	737	/*
Tetsuo Handa	6afcf28	2016-03-17 14:20:45 -0700	[diff] [blame]	738	* Make sure to not race with an ongoing OOM killer. Check that the
				739	* current is not killed (possibly due to sharing the victim's memory).
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	740	*/
Tetsuo Handa	6afcf28	2016-03-17 14:20:45 -0700	[diff] [blame]	741	if (mutex_lock_killable(&oom_lock))
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	742	return false;
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	743	oom_killer_disabled = true;
Johannes Weiner	dc56401	2015-06-24 16:57:19 -0700	[diff] [blame]	744	mutex_unlock(&oom_lock);
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	745
Michal Hocko	7d2e7a2	2016-10-07 16:59:00 -0700	[diff] [blame]	746	ret = wait_event_interruptible_timeout(oom_victims_wait,
				747	!atomic_read(&oom_victims), timeout);
				748	if (ret <= 0) {
				749	oom_killer_enable();
				750	return false;
				751	}
Michal Hocko	d75da00	2017-05-03 14:54:57 -0700	[diff] [blame]	752	pr_info("OOM killer disabled.\n");
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	753
				754	return true;
				755	}
				756
Michal Hocko	1af8bb4	2016-07-28 15:44:52 -0700	[diff] [blame]	757	static inline bool __task_will_free_mem(struct task_struct *task)
				758	{
				759	struct signal_struct *sig = task->signal;
				760
				761	/*
				762	* A coredumping process may sleep for an extended period in exit_mm(),
				763	* so the oom killer cannot assume that the process will promptly exit
				764	* and release memory.
				765	*/
				766	if (sig->flags & SIGNAL_GROUP_COREDUMP)
				767	return false;
				768
				769	if (sig->flags & SIGNAL_GROUP_EXIT)
				770	return true;
				771
				772	if (thread_group_empty(task) && (task->flags & PF_EXITING))
				773	return true;
				774
				775	return false;
				776	}
				777
				778	/*
				779	* Checks whether the given task is dying or exiting and likely to
				780	* release its address space. This means that all threads and processes
				781	* sharing the same mm have to be killed or exiting.
Michal Hocko	091f362	2016-07-28 15:45:04 -0700	[diff] [blame]	782	* Caller has to make sure that task->mm is stable (hold task_lock or
				783	* it operates on the current).
Michal Hocko	1af8bb4	2016-07-28 15:44:52 -0700	[diff] [blame]	784	*/
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	785	static bool task_will_free_mem(struct task_struct *task)
Michal Hocko	1af8bb4	2016-07-28 15:44:52 -0700	[diff] [blame]	786	{
Michal Hocko	091f362	2016-07-28 15:45:04 -0700	[diff] [blame]	787	struct mm_struct *mm = task->mm;
Michal Hocko	1af8bb4	2016-07-28 15:44:52 -0700	[diff] [blame]	788	struct task_struct *p;
Geert Uytterhoeven	f33e6f0	2016-08-11 15:33:09 -0700	[diff] [blame]	789	bool ret = true;
Michal Hocko	1af8bb4	2016-07-28 15:44:52 -0700	[diff] [blame]	790
Michal Hocko	091f362	2016-07-28 15:45:04 -0700	[diff] [blame]	791	/*
				792	* Skip tasks without mm because it might have passed its exit_mm and
				793	* exit_oom_victim. oom_reaper could have rescued that but do not rely
				794	* on that for now. We can consider find_lock_task_mm in future.
				795	*/
				796	if (!mm)
				797	return false;
				798
Michal Hocko	1af8bb4	2016-07-28 15:44:52 -0700	[diff] [blame]	799	if (!__task_will_free_mem(task))
				800	return false;
				801
				802	/*
Michal Hocko	696453e	2016-07-28 15:44:55 -0700	[diff] [blame]	803	* This task has already been drained by the oom reaper so there are
				804	* only small chances it will free some more
				805	*/
Michal Hocko	862e307	2016-10-07 16:58:57 -0700	[diff] [blame]	806	if (test_bit(MMF_OOM_SKIP, &mm->flags))
Michal Hocko	696453e	2016-07-28 15:44:55 -0700	[diff] [blame]	807	return false;
Michal Hocko	696453e	2016-07-28 15:44:55 -0700	[diff] [blame]	808
Michal Hocko	091f362	2016-07-28 15:45:04 -0700	[diff] [blame]	809	if (atomic_read(&mm->mm_users) <= 1)
Michal Hocko	1af8bb4	2016-07-28 15:44:52 -0700	[diff] [blame]	810	return true;
Michal Hocko	1af8bb4	2016-07-28 15:44:52 -0700	[diff] [blame]	811
				812	/*
Michal Hocko	5870c2e	2016-10-07 16:57:32 -0700	[diff] [blame]	813	* Make sure that all tasks which share the mm with the given tasks
				814	* are dying as well to make sure that a) nobody pins its mm and
				815	* b) the task is also reapable by the oom reaper.
Michal Hocko	1af8bb4	2016-07-28 15:44:52 -0700	[diff] [blame]	816	*/
				817	rcu_read_lock();
				818	for_each_process(p) {
				819	if (!process_shares_mm(p, mm))
				820	continue;
				821	if (same_thread_group(task, p))
				822	continue;
				823	ret = __task_will_free_mem(p);
				824	if (!ret)
				825	break;
				826	}
				827	rcu_read_unlock();
Michal Hocko	1af8bb4	2016-07-28 15:44:52 -0700	[diff] [blame]	828
				829	return ret;
				830	}
				831
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	832	static void oom_kill_process(struct oom_control *oc, const char *message)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	833	{
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	834	struct task_struct *p = oc->chosen;
				835	unsigned int points = oc->chosen_points;
Linus Torvalds	52d3c03	2011-03-14 15:17:07 -0700	[diff] [blame]	836	struct task_struct *victim = p;
David Rientjes	5e9d834	2010-08-09 17:18:51 -0700	[diff] [blame]	837	struct task_struct *child;
Oleg Nesterov	1da4db0	2014-01-21 15:49:58 -0800	[diff] [blame]	838	struct task_struct *t;
David Rientjes	647f2bd	2012-03-21 16:33:46 -0700	[diff] [blame]	839	struct mm_struct *mm;
Linus Torvalds	52d3c03	2011-03-14 15:17:07 -0700	[diff] [blame]	840	unsigned int victim_points = 0;
David Rientjes	dc3f21e	2012-03-21 16:33:47 -0700	[diff] [blame]	841	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
				842	DEFAULT_RATELIMIT_BURST);
Tetsuo Handa	bb29902	2016-03-25 14:20:44 -0700	[diff] [blame]	843	bool can_oom_reap = true;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	844
Nick Piggin	50ec3bb	2006-09-25 23:31:29 -0700	[diff] [blame]	845	/*
				846	* If the task is already exiting, don't alarm the sysadmin or kill
Michal Hocko	cd04ae1	2017-09-06 16:24:50 -0700	[diff] [blame]	847	* its children or threads, just give it access to memory reserves
				848	* so it can die quickly
Nick Piggin	50ec3bb	2006-09-25 23:31:29 -0700	[diff] [blame]	849	*/
Michal Hocko	091f362	2016-07-28 15:45:04 -0700	[diff] [blame]	850	task_lock(p);
Michal Hocko	1af8bb4	2016-07-28 15:44:52 -0700	[diff] [blame]	851	if (task_will_free_mem(p)) {
Johannes Weiner	16e9519	2015-06-24 16:57:07 -0700	[diff] [blame]	852	mark_oom_victim(p);
Michal Hocko	1af8bb4	2016-07-28 15:44:52 -0700	[diff] [blame]	853	wake_oom_reaper(p);
Michal Hocko	091f362	2016-07-28 15:45:04 -0700	[diff] [blame]	854	task_unlock(p);
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	855	put_task_struct(p);
David Rientjes	2a1c9b1	2012-03-21 16:33:46 -0700	[diff] [blame]	856	return;
Nick Piggin	50ec3bb	2006-09-25 23:31:29 -0700	[diff] [blame]	857	}
Michal Hocko	091f362	2016-07-28 15:45:04 -0700	[diff] [blame]	858	task_unlock(p);
Nick Piggin	50ec3bb	2006-09-25 23:31:29 -0700	[diff] [blame]	859
David Rientjes	dc3f21e	2012-03-21 16:33:47 -0700	[diff] [blame]	860	if (__ratelimit(&oom_rs))
Vladimir Davydov	2a966b7	2016-07-26 15:22:33 -0700	[diff] [blame]	861	dump_header(oc, p);
David Rientjes	8447d95	2012-03-21 16:33:47 -0700	[diff] [blame]	862
Wang Long	f0d6647	2015-06-24 16:58:01 -0700	[diff] [blame]	863	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
David Rientjes	5e9d834	2010-08-09 17:18:51 -0700	[diff] [blame]	864	message, task_pid_nr(p), p->comm, points);
Nick Piggin	f3af38d	2006-12-06 20:31:51 -0800	[diff] [blame]	865
David Rientjes	5e9d834	2010-08-09 17:18:51 -0700	[diff] [blame]	866	/*
				867	* If any of p's children has a different mm and is eligible for kill,
David Rientjes	1123983	2011-07-25 17:12:17 -0700	[diff] [blame]	868	* the one with the highest oom_badness() score is sacrificed for its
David Rientjes	5e9d834	2010-08-09 17:18:51 -0700	[diff] [blame]	869	* parent. This attempts to lose the minimal amount of work done while
				870	* still freeing memory.
				871	*/
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	872	read_lock(&tasklist_lock);
Shakeel Butt	43f7e8b	2019-02-01 14:20:54 -0800	[diff] [blame]	873
				874	/*
				875	* The task 'p' might have already exited before reaching here. The
				876	* put_task_struct() will free task_struct 'p' while the loop still tries
				877	* to access the field of 'p', so, get an extra reference.
				878	*/
				879	get_task_struct(p);
Oleg Nesterov	1da4db0	2014-01-21 15:49:58 -0800	[diff] [blame]	880	for_each_thread(p, t) {
David Rientjes	5e9d834	2010-08-09 17:18:51 -0700	[diff] [blame]	881	list_for_each_entry(child, &t->children, sibling) {
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	882	unsigned int child_points;
David Rientjes	5e9d834	2010-08-09 17:18:51 -0700	[diff] [blame]	883
Oleg Nesterov	4d7b339	2015-11-05 18:48:26 -0800	[diff] [blame]	884	if (process_shares_mm(child, p->mm))
David Rientjes	edd4554	2011-03-22 16:30:12 -0700	[diff] [blame]	885	continue;
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	886	/*
				887	* oom_badness() returns 0 if the thread is unkillable
				888	*/
Vladimir Davydov	2a966b7	2016-07-26 15:22:33 -0700	[diff] [blame]	889	child_points = oom_badness(child,
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	890	oc->memcg, oc->nodemask, oc->totalpages);
David Rientjes	5e9d834	2010-08-09 17:18:51 -0700	[diff] [blame]	891	if (child_points > victim_points) {
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	892	put_task_struct(victim);
David Rientjes	5e9d834	2010-08-09 17:18:51 -0700	[diff] [blame]	893	victim = child;
				894	victim_points = child_points;
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	895	get_task_struct(victim);
David Rientjes	5e9d834	2010-08-09 17:18:51 -0700	[diff] [blame]	896	}
Oleg Nesterov	dd8e8f4	2010-08-09 17:18:45 -0700	[diff] [blame]	897	}
Oleg Nesterov	1da4db0	2014-01-21 15:49:58 -0800	[diff] [blame]	898	}
Shakeel Butt	43f7e8b	2019-02-01 14:20:54 -0800	[diff] [blame]	899	put_task_struct(p);
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	900	read_unlock(&tasklist_lock);
Oleg Nesterov	dd8e8f4	2010-08-09 17:18:45 -0700	[diff] [blame]	901
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	902	p = find_lock_task_mm(victim);
				903	if (!p) {
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	904	put_task_struct(victim);
David Rientjes	647f2bd	2012-03-21 16:33:46 -0700	[diff] [blame]	905	return;
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	906	} else if (victim != p) {
				907	get_task_struct(p);
				908	put_task_struct(victim);
				909	victim = p;
				910	}
David Rientjes	647f2bd	2012-03-21 16:33:46 -0700	[diff] [blame]	911
Tetsuo Handa	880b768	2015-11-05 18:47:51 -0800	[diff] [blame]	912	/* Get a reference to safely compare mm after task_unlock(victim) */
David Rientjes	647f2bd	2012-03-21 16:33:46 -0700	[diff] [blame]	913	mm = victim->mm;
Vegard Nossum	f1f1007	2017-02-27 14:30:07 -0800	[diff] [blame]	914	mmgrab(mm);
Konstantin Khlebnikov	8e675f7	2017-07-06 15:40:28 -0700	[diff] [blame]	915
				916	/* Raise event before sending signal: task reaper must see this */
				917	count_vm_event(OOM_KILL);
				918	count_memcg_event_mm(mm, OOM_KILL);
				919
Tetsuo Handa	426fb5e	2015-11-05 18:47:44 -0800	[diff] [blame]	920	/*
Michal Hocko	cd04ae1	2017-09-06 16:24:50 -0700	[diff] [blame]	921	* We should send SIGKILL before granting access to memory reserves
				922	* in order to prevent the OOM victim from depleting the memory
				923	* reserves from the user space under its control.
Tetsuo Handa	426fb5e	2015-11-05 18:47:44 -0800	[diff] [blame]	924	*/
				925	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
Johannes Weiner	16e9519	2015-06-24 16:57:07 -0700	[diff] [blame]	926	mark_oom_victim(victim);
Jerome Marchand	eca56ff	2016-01-14 15:19:26 -0800	[diff] [blame]	927	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
David Rientjes	647f2bd	2012-03-21 16:33:46 -0700	[diff] [blame]	928	task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
				929	K(get_mm_counter(victim->mm, MM_ANONPAGES)),
Jerome Marchand	eca56ff	2016-01-14 15:19:26 -0800	[diff] [blame]	930	K(get_mm_counter(victim->mm, MM_FILEPAGES)),
				931	K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
David Rientjes	647f2bd	2012-03-21 16:33:46 -0700	[diff] [blame]	932	task_unlock(victim);
				933
				934	/*
				935	* Kill all user processes sharing victim->mm in other thread groups, if
				936	* any. They don't get access to memory reserves, though, to avoid
				937	* depletion of all memory. This prevents mm->mmap_sem livelock when an
				938	* oom killed thread cannot exit because it requires the semaphore and
				939	* it's contended by another thread trying to allocate memory itself.
				940	* That thread will now get access to memory reserves since it has a
				941	* pending fatal signal.
				942	*/
Oleg Nesterov	4d4048b	2014-01-21 15:50:01 -0800	[diff] [blame]	943	rcu_read_lock();
Oleg Nesterov	c319025	2015-11-05 18:48:23 -0800	[diff] [blame]	944	for_each_process(p) {
Oleg Nesterov	4d7b339	2015-11-05 18:48:26 -0800	[diff] [blame]	945	if (!process_shares_mm(p, mm))
Oleg Nesterov	c319025	2015-11-05 18:48:23 -0800	[diff] [blame]	946	continue;
				947	if (same_thread_group(p, victim))
				948	continue;
Michal Hocko	1b51e65	2016-10-07 16:59:09 -0700	[diff] [blame]	949	if (is_global_init(p)) {
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	950	can_oom_reap = false;
Michal Hocko	862e307	2016-10-07 16:58:57 -0700	[diff] [blame]	951	set_bit(MMF_OOM_SKIP, &mm->flags);
Michal Hocko	a373966	2016-07-28 15:45:01 -0700	[diff] [blame]	952	pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
				953	task_pid_nr(victim), victim->comm,
				954	task_pid_nr(p), p->comm);
Oleg Nesterov	c319025	2015-11-05 18:48:23 -0800	[diff] [blame]	955	continue;
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	956	}
Michal Hocko	1b51e65	2016-10-07 16:59:09 -0700	[diff] [blame]	957	/*
				958	* No use_mm() user needs to read from the userspace so we are
				959	* ok to reap it.
				960	*/
				961	if (unlikely(p->flags & PF_KTHREAD))
				962	continue;
Oleg Nesterov	c319025	2015-11-05 18:48:23 -0800	[diff] [blame]	963	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
				964	}
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	965	rcu_read_unlock();
David Rientjes	647f2bd	2012-03-21 16:33:46 -0700	[diff] [blame]	966
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	967	if (can_oom_reap)
Michal Hocko	36324a9	2016-03-25 14:20:27 -0700	[diff] [blame]	968	wake_oom_reaper(victim);
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame]	969
Tetsuo Handa	880b768	2015-11-05 18:47:51 -0800	[diff] [blame]	970	mmdrop(mm);
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	971	put_task_struct(victim);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	972	}
David Rientjes	647f2bd	2012-03-21 16:33:46 -0700	[diff] [blame]	973	#undef K
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	974
David Rientjes	309ed88	2010-08-09 17:18:54 -0700	[diff] [blame]	975	/*
				976	* Determines whether the kernel must panic because of the panic_on_oom sysctl.
				977	*/
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	978	static void check_panic_on_oom(struct oom_control *oc,
				979	enum oom_constraint constraint)
David Rientjes	309ed88	2010-08-09 17:18:54 -0700	[diff] [blame]	980	{
				981	if (likely(!sysctl_panic_on_oom))
				982	return;
				983	if (sysctl_panic_on_oom != 2) {
				984	/*
				985	* panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
				986	* does not panic for cpuset, mempolicy, or memcg allocation
				987	* failures.
				988	*/
				989	if (constraint != CONSTRAINT_NONE)
				990	return;
				991	}
David Rientjes	071a4be	2015-09-08 15:00:42 -0700	[diff] [blame]	992	/* Do not panic for oom kills triggered by sysrq */
Yaowei Bai	db2a0dd	2015-11-06 16:28:06 -0800	[diff] [blame]	993	if (is_sysrq_oom(oc))
David Rientjes	071a4be	2015-09-08 15:00:42 -0700	[diff] [blame]	994	return;
Vladimir Davydov	2a966b7	2016-07-26 15:22:33 -0700	[diff] [blame]	995	dump_header(oc, NULL);
David Rientjes	309ed88	2010-08-09 17:18:54 -0700	[diff] [blame]	996	panic("Out of memory: %s panic_on_oom is enabled\n",
				997	sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
				998	}
				999
Martin Schwidefsky	8bc719d	2006-09-25 23:31:20 -0700	[diff] [blame]	1000	static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
				1001
				1002	int register_oom_notifier(struct notifier_block *nb)
				1003	{
				1004	return blocking_notifier_chain_register(&oom_notify_list, nb);
				1005	}
				1006	EXPORT_SYMBOL_GPL(register_oom_notifier);
				1007
				1008	int unregister_oom_notifier(struct notifier_block *nb)
				1009	{
				1010	return blocking_notifier_chain_unregister(&oom_notify_list, nb);
				1011	}
				1012	EXPORT_SYMBOL_GPL(unregister_oom_notifier);
				1013
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1014	/**
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	1015	* out_of_memory - kill the "best" process when we run out of memory
				1016	* @oc: pointer to struct oom_control
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1017	*
				1018	* If we run out of memory, we have the choice between either
				1019	* killing a random task (bad), letting the system crash (worse)
				1020	* OR try to be smart about which process to kill. Note that we
				1021	* don't have to be perfect here, we just have to be good.
				1022	*/
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	1023	bool out_of_memory(struct oom_control *oc)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1024	{
Martin Schwidefsky	8bc719d	2006-09-25 23:31:20 -0700	[diff] [blame]	1025	unsigned long freed = 0;
David Rientjes	e365893	2010-08-09 17:18:55 -0700	[diff] [blame]	1026	enum oom_constraint constraint = CONSTRAINT_NONE;
Martin Schwidefsky	8bc719d	2006-09-25 23:31:20 -0700	[diff] [blame]	1027
Johannes Weiner	dc56401	2015-06-24 16:57:19 -0700	[diff] [blame]	1028	if (oom_killer_disabled)
				1029	return false;
				1030
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	1031	if (!is_memcg_oom(oc)) {
				1032	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
				1033	if (freed > 0)
				1034	/* Got some memory back in the last second. */
				1035	return true;
				1036	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1037
David Rientjes	7b98c2e	2010-08-09 17:18:48 -0700	[diff] [blame]	1038	/*
David Rientjes	9ff4868	2012-12-11 16:01:30 -0800	[diff] [blame]	1039	* If current has a pending SIGKILL or is exiting, then automatically
				1040	* select it. The goal is to allow it to allocate so that it may
				1041	* quickly exit and free its memory.
David Rientjes	7b98c2e	2010-08-09 17:18:48 -0700	[diff] [blame]	1042	*/
Michal Hocko	091f362	2016-07-28 15:45:04 -0700	[diff] [blame]	1043	if (task_will_free_mem(current)) {
Johannes Weiner	16e9519	2015-06-24 16:57:07 -0700	[diff] [blame]	1044	mark_oom_victim(current);
Michal Hocko	1af8bb4	2016-07-28 15:44:52 -0700	[diff] [blame]	1045	wake_oom_reaper(current);
David Rientjes	75e8f8b	2015-09-08 15:00:47 -0700	[diff] [blame]	1046	return true;
David Rientjes	7b98c2e	2010-08-09 17:18:48 -0700	[diff] [blame]	1047	}
				1048
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	1049	/*
Michal Hocko	3da88fb	2016-05-19 17:13:09 -0700	[diff] [blame]	1050	* The OOM killer does not compensate for IO-less reclaim.
				1051	* pagefault_out_of_memory lost its gfp context so we have to
				1052	* make sure exclude 0 mask - all other users should have at least
Tetsuo Handa	7bd49f0	2019-09-23 15:37:08 -0700	[diff] [blame]	1053	* ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
				1054	* invoke the OOM killer even if it is a GFP_NOFS allocation.
Michal Hocko	3da88fb	2016-05-19 17:13:09 -0700	[diff] [blame]	1055	*/
Tetsuo Handa	7bd49f0	2019-09-23 15:37:08 -0700	[diff] [blame]	1056	if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
Michal Hocko	3da88fb	2016-05-19 17:13:09 -0700	[diff] [blame]	1057	return true;
				1058
				1059	/*
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	1060	* Check if there were limitations on the allocation (only relevant for
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	1061	* NUMA and memcg) that may require different handling.
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	1062	*/
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	1063	constraint = constrained_alloc(oc);
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	1064	if (constraint != CONSTRAINT_MEMORY_POLICY)
				1065	oc->nodemask = NULL;
Vladimir Davydov	2a966b7	2016-07-26 15:22:33 -0700	[diff] [blame]	1066	check_panic_on_oom(oc, constraint);
David Rientjes	0aad4b3	2010-08-09 17:18:59 -0700	[diff] [blame]	1067
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	1068	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
				1069	current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
David Rientjes	121d1ba	2012-07-31 16:42:55 -0700	[diff] [blame]	1070	current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	1071	get_task_struct(current);
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	1072	oc->chosen = current;
				1073	oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
David Rientjes	75e8f8b	2015-09-08 15:00:47 -0700	[diff] [blame]	1074	return true;
David Rientjes	0aad4b3	2010-08-09 17:18:59 -0700	[diff] [blame]	1075	}
				1076
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	1077	select_bad_process(oc);
David Rientjes	0aad4b3	2010-08-09 17:18:59 -0700	[diff] [blame]	1078	/* Found nothing?!?! Either we hang forever, or we panic. */
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	1079	if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
Vladimir Davydov	2a966b7	2016-07-26 15:22:33 -0700	[diff] [blame]	1080	dump_header(oc, NULL);
David Rientjes	0aad4b3	2010-08-09 17:18:59 -0700	[diff] [blame]	1081	panic("Out of memory and no killable processes...\n");
				1082	}
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	1083	if (oc->chosen && oc->chosen != (void *)-1UL) {
				1084	oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
				1085	"Memory cgroup out of memory");
David Rientjes	75e8f8b	2015-09-08 15:00:47 -0700	[diff] [blame]	1086	/*
				1087	* Give the killed process a good chance to exit before trying
				1088	* to allocate memory again.
				1089	*/
David Rientjes	4f774b9	2012-07-31 16:42:37 -0700	[diff] [blame]	1090	schedule_timeout_killable(1);
David Rientjes	75e8f8b	2015-09-08 15:00:47 -0700	[diff] [blame]	1091	}
Vladimir Davydov	7c5f64f	2016-10-07 16:57:23 -0700	[diff] [blame]	1092	return !!oc->chosen;
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	1093	}
				1094
David Rientjes	e365893	2010-08-09 17:18:55 -0700	[diff] [blame]	1095	/*
				1096	* The pagefault handler calls here because it is out of memory, so kill a
Vladimir Davydov	798fd75	2016-07-26 15:22:30 -0700	[diff] [blame]	1097	* memory-hogging task. If oom_lock is held by somebody else, a parallel oom
				1098	* killing is already in progress so do nothing.
David Rientjes	e365893	2010-08-09 17:18:55 -0700	[diff] [blame]	1099	*/
				1100	void pagefault_out_of_memory(void)
				1101	{
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	1102	struct oom_control oc = {
				1103	.zonelist = NULL,
				1104	.nodemask = NULL,
Vladimir Davydov	2a966b7	2016-07-26 15:22:33 -0700	[diff] [blame]	1105	.memcg = NULL,
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	1106	.gfp_mask = 0,
				1107	.order = 0,
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	1108	};
				1109
Johannes Weiner	4942642	2013-10-16 13:46:59 -0700	[diff] [blame]	1110	if (mem_cgroup_oom_synchronize(true))
Johannes Weiner	dc56401	2015-06-24 16:57:19 -0700	[diff] [blame]	1111	return;
Johannes Weiner	3812c8c	2013-09-12 15:13:44 -0700	[diff] [blame]	1112
Johannes Weiner	dc56401	2015-06-24 16:57:19 -0700	[diff] [blame]	1113	if (!mutex_trylock(&oom_lock))
				1114	return;
Tetsuo Handa	a104808	2016-10-07 17:00:49 -0700	[diff] [blame]	1115	out_of_memory(&oc);
Johannes Weiner	dc56401	2015-06-24 16:57:19 -0700	[diff] [blame]	1116	mutex_unlock(&oom_lock);
David Rientjes	e365893	2010-08-09 17:18:55 -0700	[diff] [blame]	1117	}