/*
 *  linux/mm/oom_kill.c
 *
 *  Copyright (C)  1998,2000  Rik van Riel
 *	Thanks go out to Claus Fischer for some serious inspiration and
 *	for goading me into coding this file...
 *
 *  The routines in this file are used to kill a process when
 *  we're seriously out of memory. This gets called from __alloc_pages()
 *  in mm/page_alloc.c when we really run out of memory.
 *
 *  Since we won't call these routines often (on a well-configured
 *  machine) this file will double as a 'coding guide' and a signpost
 *  for newbie kernel hackers. It features several pointers to major
 *  kernel subsystems and hints as to where to find out what things do.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>

/* #define DEBUG */

/**
 * badness - calculate a numeric value for how bad this task has been
 * @p: task struct of the task we should calculate the badness of
 * @uptime: current uptime in seconds
 *
 * The formula used is relatively simple and documented inline in the
 * function. The main rationale is that we want to select a good task
 * to kill when we run out of memory.
 *
 * Good in this context means that:
 * 1) we lose the minimum amount of work done
 * 2) we recover a large amount of memory
 * 3) we don't kill anything innocent of eating tons of memory
 * 4) we want to kill the minimum number of processes (one)
 * 5) we try to kill the process the user expects us to kill; this
 *    algorithm has been meticulously tuned to meet the principle
 *    of least surprise ... (be careful when you change it)
 */

unsigned long badness(struct task_struct *p, unsigned long uptime)
{
	unsigned long points, cpu_time, run_time, s;
	struct list_head *tsk;

	if (!p->mm)
		return 0;

	/*
	 * The memory size of the process is the basis for the badness.
	 */
	points = p->mm->total_vm;

	/*
	 * Processes which fork a lot of child processes are likely
	 * a good choice. We add the vmsize of the children if they
	 * have their own mm. This prevents forking servers from flooding
	 * the machine with an endless number of children.
	 */
	list_for_each(tsk, &p->children) {
		struct task_struct *chld;
		chld = list_entry(tsk, struct task_struct, sibling);
		if (chld->mm != p->mm && chld->mm)
			points += chld->mm->total_vm;
	}

	/*
	 * CPU time is in tens of seconds and run time is in thousands
	 * of seconds. There is no particular reason for this other than
	 * that it turned out to work very well in practice.
	 */
	cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))
		>> (SHIFT_HZ + 3);

	if (uptime >= p->start_time.tv_sec)
		run_time = (uptime - p->start_time.tv_sec) >> 10;
	else
		run_time = 0;
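	/*
	 * Illustrative arithmetic (assuming HZ = 100, so SHIFT_HZ = 7;
	 * the numbers are examples only): 1000 seconds of CPU time is
	 * 100000 jiffies, and 100000 >> (7 + 3) is about 97, so cpu_time
	 * really is on the order of tens of seconds.  Likewise run_time,
	 * (uptime - start time) >> 10, counts roughly thousands of seconds.
	 */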

	s = int_sqrt(cpu_time);
	if (s)
		points /= s;
	s = int_sqrt(int_sqrt(run_time));
	if (s)
		points /= s;

	/*
	 * Niced processes are most likely less important, so double
	 * their badness points.
	 */
	if (task_nice(p) > 0)
		points *= 2;

	/*
	 * Superuser processes are usually more important, so we make it
	 * less likely that we kill those.
	 */
	if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||
				p->uid == 0 || p->euid == 0)
		points /= 4;

	/*
	 * We don't want to kill a process with direct hardware access.
	 * Not only could that mess up the hardware, but usually users
	 * tend to only have this flag set on applications they think
	 * of as important.
	 */
	if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
		points /= 4;

	/*
	 * Adjust the score by oomkilladj.
	 */
	if (p->oomkilladj) {
		if (p->oomkilladj > 0)
			points <<= p->oomkilladj;
		else
			points >>= -(p->oomkilladj);
	}
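	/*
	 * For example (illustrative values only): oomkilladj = +2
	 * quadruples the score and oomkilladj = -2 quarters it.
	 * OOM_DISABLE never reaches this shift, because
	 * select_bad_process() skips such tasks before calling badness().
	 */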

#ifdef DEBUG
	printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n",
		p->pid, p->comm, points);
#endif
	return points;
}
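
/*
 * Rough shape of the heuristic above, as a worked example with purely
 * illustrative numbers: a task with total_vm = 40000 pages, enough CPU
 * time that int_sqrt(cpu_time) = 4 and enough run time that
 * int_sqrt(int_sqrt(run_time)) = 2 scores 40000 / 4 / 2 = 5000 points;
 * being niced would double that to 10000, while running as root would
 * instead divide it by 4 down to 1250.
 */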

/*
 * Simple selection loop. We choose the process with the highest
 * number of 'points'. We expect the caller will lock the tasklist.
 *
 * Returns NULL if no killable task was found, ERR_PTR(-1UL) if a task
 * is already releasing memory and the caller should simply wait, or
 * the chosen victim otherwise.
 *
 * (not docbooked, we don't want this one cluttering up the manual)
 */
static struct task_struct *select_bad_process(void)
{
	unsigned long maxpoints = 0;
	struct task_struct *g, *p;
	struct task_struct *chosen = NULL;
	struct timespec uptime;

	do_posix_clock_monotonic_gettime(&uptime);
	do_each_thread(g, p) {
		unsigned long points;
		int releasing;

		/* skip the init task with pid == 1 */
		if (p->pid == 1)
			continue;
		if (p->oomkilladj == OOM_DISABLE)
			continue;
		/* If p's nodes don't overlap ours, it won't help to kill p. */
		if (!cpuset_excl_nodes_overlap(p))
			continue;

		/*
		 * This task is in the process of releasing memory, so wait
		 * for it to finish before killing some other task by mistake.
		 */
		releasing = test_tsk_thread_flag(p, TIF_MEMDIE) ||
						p->flags & PF_EXITING;
		if (releasing && !(p->flags & PF_DEAD))
			return ERR_PTR(-1UL);
		if (p->flags & PF_SWAPOFF)
			return p;

		points = badness(p, uptime.tv_sec);
		if (points > maxpoints || !chosen) {
			chosen = p;
			maxpoints = points;
		}
	} while_each_thread(g, p);
	return chosen;
}

/**
 * We must be careful to never send SIGKILL to a process with
 * CAP_SYS_RAWIO set; send SIGTERM instead (but it's unlikely that
 * we select a process with CAP_SYS_RAWIO set).
 */
static void __oom_kill_task(task_t *p)
{
	if (p->pid == 1) {
		WARN_ON(1);
		printk(KERN_WARNING "tried to kill init!\n");
		return;
	}

	task_lock(p);
	if (!p->mm || p->mm == &init_mm) {
		WARN_ON(1);
		printk(KERN_WARNING "tried to kill an mm-less task!\n");
		task_unlock(p);
		return;
	}
	task_unlock(p);
	printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n",
							p->pid, p->comm);

	/*
	 * We give our sacrificial lamb high priority and access to
	 * all the memory it needs. That way it should be able to
	 * exit() and clear out its resources quickly...
	 */
	p->time_slice = HZ;
	set_tsk_thread_flag(p, TIF_MEMDIE);

	force_sig(SIGKILL, p);
}
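
/*
 * Why TIF_MEMDIE helps (see the allocator in mm/page_alloc.c): a task
 * with TIF_MEMDIE set is allowed to allocate from the memory reserves,
 * ignoring the usual watermarks, so the victim can make the forward
 * progress it needs to exit and release its memory.
 */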

static struct mm_struct *oom_kill_task(task_t *p)
{
	struct mm_struct *mm = get_task_mm(p);
	task_t *g, *q;

	if (!mm)
		return NULL;
	if (mm == &init_mm) {
		mmput(mm);
		return NULL;
	}

	__oom_kill_task(p);
	/*
	 * kill all processes that share the ->mm (i.e. all threads),
	 * but are in a different thread group
	 */
	do_each_thread(g, q)
		if (q->mm == mm && q->tgid != p->tgid)
			__oom_kill_task(q);
	while_each_thread(g, q);

	return mm;
}

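/*
 * Kill the selected task, but prefer one of its children that has a
 * separate mm: the assumption is that sacrificing a child is less
 * disruptive than taking down the parent outright, while still freeing
 * memory when a fork-happy service is the culprit.
 */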
static struct mm_struct *oom_kill_process(struct task_struct *p)
{
	struct mm_struct *mm;
	struct task_struct *c;
	struct list_head *tsk;

	/* Try to kill a child first */
	list_for_each(tsk, &p->children) {
		c = list_entry(tsk, struct task_struct, sibling);
		if (c->mm == p->mm)
			continue;
		mm = oom_kill_task(c);
		if (mm)
			return mm;
	}
	return oom_kill_task(p);
}

/**
 * out_of_memory - kill the "best" process when we run out of memory
 *
 * If we run out of memory, we have the choice between either
 * killing a random task (bad), letting the system crash (worse)
 * or trying to be smart about which process to kill. Note that we
 * don't have to be perfect here, we just have to be good.
 */
void out_of_memory(gfp_t gfp_mask, int order)
{
	struct mm_struct *mm = NULL;
	task_t *p;

	if (printk_ratelimit()) {
		printk("oom-killer: gfp_mask=0x%x, order=%d\n",
			gfp_mask, order);
		show_mem();
	}

	read_lock(&tasklist_lock);
retry:
	p = select_bad_process();

	if (PTR_ERR(p) == -1UL)
		goto out;

	/* Found nothing?!?! Either we hang forever, or we panic. */
	if (!p) {
		read_unlock(&tasklist_lock);
		panic("Out of memory and no killable processes...\n");
	}

	mm = oom_kill_process(p);
	if (!mm)
		goto retry;

 out:
	read_unlock(&tasklist_lock);
	if (mm)
		mmput(mm);

	/*
	 * Give "p" a good chance of killing itself before we
	 * retry allocating memory.
	 */
	schedule_timeout_interruptible(1);
}
305