1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * linux/mm/oom_kill.c
4 *
5 * Copyright (C) 1998,2000 Rik van Riel
6 * Thanks go out to Claus Fischer for some serious inspiration and
7 * for goading me into coding this file...
8 * Copyright (C) 2010 Google, Inc.
9 * Rewritten by David Rientjes
10 *
11 * The routines in this file are used to kill a process when
12 * we're seriously out of memory. This gets called from __alloc_pages()
13 * in mm/page_alloc.c when we really run out of memory.
14 *
15 * Since we won't call these routines often (on a well-configured
16 * machine) this file will double as a 'coding guide' and a signpost
17 * for newbie kernel hackers. It features several pointers to major
18 * kernel subsystems and hints as to where to find out what things do.
19 */
20
21 #include <linux/oom.h>
22 #include <linux/mm.h>
23 #include <linux/err.h>
24 #include <linux/gfp.h>
25 #include <linux/sched.h>
26 #include <linux/sched/mm.h>
27 #include <linux/sched/coredump.h>
28 #include <linux/sched/task.h>
29 #include <linux/sched/debug.h>
30 #include <linux/swap.h>
31 #include <linux/syscalls.h>
32 #include <linux/timex.h>
33 #include <linux/jiffies.h>
34 #include <linux/cpuset.h>
35 #include <linux/export.h>
36 #include <linux/notifier.h>
37 #include <linux/memcontrol.h>
38 #include <linux/mempolicy.h>
39 #include <linux/security.h>
40 #include <linux/ptrace.h>
41 #include <linux/freezer.h>
42 #include <linux/ftrace.h>
43 #include <linux/ratelimit.h>
44 #include <linux/kthread.h>
45 #include <linux/init.h>
46 #include <linux/mmu_notifier.h>
47 #include <linux/cred.h>
48 #include <linux/nmi.h>
49
50 #include <asm/tlb.h>
51 #include "internal.h"
52 #include "slab.h"
53
54 #define CREATE_TRACE_POINTS
55 #include <trace/events/oom.h>
56
57 static int sysctl_panic_on_oom;
58 static int sysctl_oom_kill_allocating_task;
59 static int sysctl_oom_dump_tasks = 1;
60
61 /*
62 * Serializes oom killer invocations (out_of_memory()) from all contexts to
63 * prevent over-eager oom killing (e.g. when the oom killer is invoked
64 * from different domains).
65 *
66 * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled
67 * and mark_oom_victim().
68 */
69 DEFINE_MUTEX(oom_lock);
70 /* Serializes oom_score_adj and oom_score_adj_min updates */
71 DEFINE_MUTEX(oom_adj_mutex);
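/*
 * For reference, userspace tunes these values through procfs, and the comment
 * above is what serializes those updates.  A minimal userspace sketch
 * (set_oom_score_adj() is a hypothetical helper, error handling trimmed);
 * valid values range from OOM_SCORE_ADJ_MIN (-1000) to OOM_SCORE_ADJ_MAX
 * (1000):
 *
 *	#include <stdio.h>
 *
 *	static int set_oom_score_adj(int pid, int adj)
 *	{
 *		char path[64];
 *		FILE *f;
 *
 *		snprintf(path, sizeof(path), "/proc/%d/oom_score_adj", pid);
 *		f = fopen(path, "w");
 *		if (!f)
 *			return -1;
 *		fprintf(f, "%d\n", adj);
 *		return fclose(f);
 *	}
 */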
72
73 static inline bool is_memcg_oom(struct oom_control *oc)
74 {
75 return oc->memcg != NULL;
76 }
77
78 #ifdef CONFIG_NUMA
79 /**
80 * oom_cpuset_eligible() - check task eligibility for kill
81 * @start: task struct of the task to consider
82 * @oc: pointer to struct oom_control
83 *
84 * Task eligibility is determined by whether or not a candidate task, @start,
85 * shares the same mempolicy nodes as current if it is bound by such a policy
86 * and whether or not it has the same set of allowed cpuset nodes.
87 *
88 * This function is assuming oom-killer context and 'current' has triggered
89 * the oom-killer.
90 */
91 static bool oom_cpuset_eligible(struct task_struct *start,
92 struct oom_control *oc)
93 {
94 struct task_struct *tsk;
95 bool ret = false;
96 const nodemask_t *mask = oc->nodemask;
97
98 rcu_read_lock();
99 for_each_thread(start, tsk) {
100 if (mask) {
101 /*
102 * If this is a mempolicy constrained oom, tsk's
103 * cpuset is irrelevant. Only return true if its
104 * mempolicy intersects current, otherwise it may be
105 * needlessly killed.
106 */
107 ret = mempolicy_in_oom_domain(tsk, mask);
108 } else {
109 /*
110 * This is not a mempolicy constrained oom, so only
111 * check the mems of tsk's cpuset.
112 */
113 ret = cpuset_mems_allowed_intersects(current, tsk);
114 }
115 if (ret)
116 break;
117 }
118 rcu_read_unlock();
119
120 return ret;
121 }
122 #else
123 static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
124 {
125 return true;
126 }
127 #endif /* CONFIG_NUMA */
128
129 /*
130 * The process p may have detached its own ->mm while exiting or through
131 * kthread_use_mm(), but one or more of its subthreads may still have a valid
132 * pointer. Return p, or any of its subthreads with a valid ->mm, with
133 * task_lock() held.
134 */
135 struct task_struct *find_lock_task_mm(struct task_struct *p)
136 {
137 struct task_struct *t;
138
139 rcu_read_lock();
140
141 for_each_thread(p, t) {
142 task_lock(t);
143 if (likely(t->mm))
144 goto found;
145 task_unlock(t);
146 }
147 t = NULL;
148 found:
149 rcu_read_unlock();
150
151 return t;
152 }
153
154 /*
155 * order == -1 means the oom kill is required by sysrq; otherwise the order
156 * is only used for display purposes.
157 */
158 static inline bool is_sysrq_oom(struct oom_control *oc)
159 {
160 return oc->order == -1;
161 }
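/*
 * For context (an assumption about the callers, not derived from this file):
 * the usual way to get here with order == -1 is the sysrq 'f' handler
 * (writing 'f' to /proc/sysrq-trigger), which forces an OOM kill even when
 * the allocator would not have triggered one on its own.
 */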
162
163 /* return true if the task is not adequate as candidate victim task. */
164 static bool oom_unkillable_task(struct task_struct *p)
165 {
166 if (is_global_init(p))
167 return true;
168 if (p->flags & PF_KTHREAD)
169 return true;
170 return false;
171 }
172
173 /*
174 * Check whether the amount of unreclaimable slab is greater than
175 * all user memory (LRU pages).
176 * dump_unreclaimable_slab() could help in the case that the oom is
177 * due to too much unreclaimable slab used by the kernel.
178 */
179 static bool should_dump_unreclaim_slab(void)
180 {
181 unsigned long nr_lru;
182
183 nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
184 global_node_page_state(NR_INACTIVE_ANON) +
185 global_node_page_state(NR_ACTIVE_FILE) +
186 global_node_page_state(NR_INACTIVE_FILE) +
187 global_node_page_state(NR_ISOLATED_ANON) +
188 global_node_page_state(NR_ISOLATED_FILE) +
189 global_node_page_state(NR_UNEVICTABLE);
190
191 return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
192 }
193
194 /**
195 * oom_badness - heuristic function to determine which candidate task to kill
196 * @p: task struct of the task whose badness we should calculate
197 * @totalpages: total present RAM allowed for page allocation
198 *
199 * The heuristic for determining which task to kill is made to be as simple and
200 * predictable as possible. The goal is to return the highest value for the
201 * task consuming the most memory to avoid subsequent oom failures.
202 */
203 long oom_badness(struct task_struct *p, unsigned long totalpages)
204 {
205 long points;
206 long adj;
207
208 if (oom_unkillable_task(p))
209 return LONG_MIN;
210
211 p = find_lock_task_mm(p);
212 if (!p)
213 return LONG_MIN;
214
215 /*
216 * Do not even consider tasks which are explicitly marked oom
217 * unkillable, have already been oom reaped, or are in
218 * the middle of a vfork.
219 */
220 adj = (long)p->signal->oom_score_adj;
221 if (adj == OOM_SCORE_ADJ_MIN ||
222 test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
223 in_vfork(p)) {
224 task_unlock(p);
225 return LONG_MIN;
226 }
227
228 /*
229 * The baseline for the badness score is the proportion of RAM that each
230 * task's rss, pagetable and swap space use.
231 */
232 points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
233 mm_pgtables_bytes(p->mm) / PAGE_SIZE;
234 task_unlock(p);
235
236 /* Normalize to oom_score_adj units */
237 adj *= totalpages / 1000;
238 points += adj;
239
240 return points;
241 }
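/*
 * Worked example of the heuristic above, with made-up numbers: a task with
 * 100000 resident pages, 20000 swap entries and 1024 pages worth of page
 * tables starts at 100000 + 20000 + 1024 = 121024 points.  On a machine
 * where totalpages is 1048576 (4GiB of RAM with 4KiB pages and no swap),
 * an oom_score_adj of 500 adds 500 * (1048576 / 1000) = 524000 for a total
 * of 645024, while an oom_score_adj of -500 would subtract the same amount
 * and push the task well below its peers.
 */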
242
243 static const char * const oom_constraint_text[] = {
244 [CONSTRAINT_NONE] = "CONSTRAINT_NONE",
245 [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
246 [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
247 [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
248 };
249
250 /*
251 * Determine the type of allocation constraint.
252 */
253 static enum oom_constraint constrained_alloc(struct oom_control *oc)
254 {
255 struct zone *zone;
256 struct zoneref *z;
257 enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask);
258 bool cpuset_limited = false;
259 int nid;
260
261 if (is_memcg_oom(oc)) {
262 oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
263 return CONSTRAINT_MEMCG;
264 }
265
266 /* Default to all available memory */
267 oc->totalpages = totalram_pages() + total_swap_pages;
268
269 if (!IS_ENABLED(CONFIG_NUMA))
270 return CONSTRAINT_NONE;
271
272 if (!oc->zonelist)
273 return CONSTRAINT_NONE;
274 /*
275 * We reach here only when __GFP_NOFAIL is used, so we should avoid
276 * killing current. We have to kill a random task in this case.
277 * Hopefully this would be CONSTRAINT_THISNODE, but there is no way to handle it now.
278 */
279 if (oc->gfp_mask & __GFP_THISNODE)
280 return CONSTRAINT_NONE;
281
282 /*
283 * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
284 * the page allocator means a mempolicy is in effect. Cpuset policy
285 * is enforced in get_page_from_freelist().
286 */
287 if (oc->nodemask &&
288 !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
289 oc->totalpages = total_swap_pages;
290 for_each_node_mask(nid, *oc->nodemask)
291 oc->totalpages += node_present_pages(nid);
292 return CONSTRAINT_MEMORY_POLICY;
293 }
294
295 /* Check whether this allocation failure is caused by cpuset's wall function */
296 for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
297 highest_zoneidx, oc->nodemask)
298 if (!cpuset_zone_allowed(zone, oc->gfp_mask))
299 cpuset_limited = true;
300
301 if (cpuset_limited) {
302 oc->totalpages = total_swap_pages;
303 for_each_node_mask(nid, cpuset_current_mems_allowed)
304 oc->totalpages += node_present_pages(nid);
305 return CONSTRAINT_CPUSET;
306 }
307 return CONSTRAINT_NONE;
308 }
309
310 static int oom_evaluate_task(struct task_struct *task, void *arg)
311 {
312 struct oom_control *oc = arg;
313 long points;
314
315 if (oom_unkillable_task(task))
316 goto next;
317
318 /* p may not have freeable memory in nodemask */
319 if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
320 goto next;
321
322 /*
323 * This task already has access to memory reserves and is being killed.
324 * Don't allow any other task to have access to the reserves unless
325 * the task has MMF_OOM_SKIP because the chances that it would release
326 * any memory are quite low.
327 */
328 if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
329 if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
330 goto next;
331 goto abort;
332 }
333
334 /*
335 * If task is allocating a lot of memory and has been marked to be
336 * killed first if it triggers an oom, then select it.
337 */
338 if (oom_task_origin(task)) {
339 points = LONG_MAX;
340 goto select;
341 }
342
343 points = oom_badness(task, oc->totalpages);
344 if (points == LONG_MIN || points < oc->chosen_points)
345 goto next;
346
347 select:
348 if (oc->chosen)
349 put_task_struct(oc->chosen);
350 get_task_struct(task);
351 oc->chosen = task;
352 oc->chosen_points = points;
353 next:
354 return 0;
355 abort:
356 if (oc->chosen)
357 put_task_struct(oc->chosen);
358 oc->chosen = (void *)-1UL;
359 return 1;
360 }
361
362 /*
363 * Simple selection loop. We choose the process with the highest number of
364 * 'points'. In case the scan was aborted, oc->chosen is set to -1.
365 */
366 static void select_bad_process(struct oom_control *oc)
367 {
368 oc->chosen_points = LONG_MIN;
369
370 if (is_memcg_oom(oc))
371 mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
372 else {
373 struct task_struct *p;
374
375 rcu_read_lock();
376 for_each_process(p)
377 if (oom_evaluate_task(p, oc))
378 break;
379 rcu_read_unlock();
380 }
381 }
382
383 static int dump_task(struct task_struct *p, void *arg)
384 {
385 struct oom_control *oc = arg;
386 struct task_struct *task;
387
388 if (oom_unkillable_task(p))
389 return 0;
390
391 /* p may not have freeable memory in nodemask */
392 if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
393 return 0;
394
395 task = find_lock_task_mm(p);
396 if (!task) {
397 /*
398 * All of p's threads have already detached their mm's. There's
399 * no need to report them; they can't be oom killed anyway.
400 */
401 return 0;
402 }
403
404 pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
405 task->pid, from_kuid(&init_user_ns, task_uid(task)),
406 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
407 mm_pgtables_bytes(task->mm),
408 get_mm_counter(task->mm, MM_SWAPENTS),
409 task->signal->oom_score_adj, task->comm);
410 task_unlock(task);
411
412 return 0;
413 }
414
415 /**
416 * dump_tasks - dump current memory state of all system tasks
417 * @oc: pointer to struct oom_control
418 *
419 * Dumps the current memory state of all eligible tasks. Tasks not in the same
420 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
421 * are not shown.
422 * State information includes task's pid, uid, tgid, vm size, rss,
423 * pgtables_bytes, swapents, oom_score_adj value, and name.
424 */
425 static void dump_tasks(struct oom_control *oc)
426 {
427 pr_info("Tasks state (memory values in pages):\n");
428 pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
429
430 if (is_memcg_oom(oc))
431 mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
432 else {
433 struct task_struct *p;
434 int i = 0;
435
436 rcu_read_lock();
437 for_each_process(p) {
438 /* Avoid potential softlockup warning */
439 if ((++i & 1023) == 0)
440 touch_softlockup_watchdog();
441 dump_task(p, oc);
442 }
443 rcu_read_unlock();
444 }
445 }
446
447 static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
448 {
449 /* one line summary of the oom killer context. */
450 pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
451 oom_constraint_text[oc->constraint],
452 nodemask_pr_args(oc->nodemask));
453 cpuset_print_current_mems_allowed();
454 mem_cgroup_print_oom_context(oc->memcg, victim);
455 pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
456 from_kuid(&init_user_ns, task_uid(victim)));
457 }
458
459 static void dump_header(struct oom_control *oc, struct task_struct *p)
460 {
461 pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
462 current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
463 current->signal->oom_score_adj);
464 if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
465 pr_warn("COMPACTION is disabled!!!\n");
466
467 dump_stack();
468 if (is_memcg_oom(oc))
469 mem_cgroup_print_oom_meminfo(oc->memcg);
470 else {
471 __show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask));
472 if (should_dump_unreclaim_slab())
473 dump_unreclaimable_slab();
474 }
475 if (sysctl_oom_dump_tasks)
476 dump_tasks(oc);
477 if (p)
478 dump_oom_summary(oc, p);
479 }
480
481 /*
482 * Number of OOM victims in flight
483 */
484 static atomic_t oom_victims = ATOMIC_INIT(0);
485 static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
486
487 static bool oom_killer_disabled __read_mostly;
488
489 /*
490 * task->mm can be NULL if the task is the exited group leader. So to
491 * determine whether the task is using a particular mm, we examine all the
492 * task's threads: if one of those is using this mm then this task was also
493 * using it.
494 */
495 bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
496 {
497 struct task_struct *t;
498
499 for_each_thread(p, t) {
500 struct mm_struct *t_mm = READ_ONCE(t->mm);
501 if (t_mm)
502 return t_mm == mm;
503 }
504 return false;
505 }
506
507 #ifdef CONFIG_MMU
508 /*
509 * OOM Reaper kernel thread which tries to reap the memory used by the OOM
510 * victim (if that is possible) to help the OOM killer to move on.
511 */
512 static struct task_struct *oom_reaper_th;
513 static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
514 static struct task_struct *oom_reaper_list;
515 static DEFINE_SPINLOCK(oom_reaper_lock);
516
517 static bool __oom_reap_task_mm(struct mm_struct *mm)
518 {
519 struct vm_area_struct *vma;
520 bool ret = true;
521 VMA_ITERATOR(vmi, mm, 0);
522
523 /*
524 * Tell all users of get_user/copy_from_user etc... that the content
525 * is no longer stable. No barriers are really needed because unmapping
526 * should imply barriers already and the reader would hit a page fault
527 * if it stumbled over reaped memory.
528 */
529 set_bit(MMF_UNSTABLE, &mm->flags);
530
531 for_each_vma(vmi, vma) {
532 if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP))
533 continue;
534
535 /*
536 * Only anonymous pages have a good chance to be dropped
537 * without additional steps which we cannot afford as we
538 * are OOM already.
539 *
540 * We do not even care about fs backed pages because all
541 * which are reclaimable have already been reclaimed and
542 * we do not want to block exit_mmap by keeping mm ref
543 * count elevated without a good reason.
544 */
545 if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
546 struct mmu_notifier_range range;
547 struct mmu_gather tlb;
548
549 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
550 mm, vma->vm_start,
551 vma->vm_end);
552 tlb_gather_mmu(&tlb, mm);
553 if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
554 tlb_finish_mmu(&tlb);
555 ret = false;
556 continue;
557 }
558 unmap_page_range(&tlb, vma, range.start, range.end, NULL);
559 mmu_notifier_invalidate_range_end(&range);
560 tlb_finish_mmu(&tlb);
561 }
562 }
563
564 return ret;
565 }
566
567 /*
568 * Reaps the address space of the given task.
569 *
570 * Returns true on success and false if only part (or none) of the address
571 * space could be reclaimed, in which case the caller should retry later.
572 */
573 static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
574 {
575 bool ret = true;
576
577 if (!mmap_read_trylock(mm)) {
578 trace_skip_task_reaping(tsk->pid);
579 return false;
580 }
581
582 /*
583 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
584 * work on the mm anymore. The check for MMF_OOM_SKIP must run
585 * under mmap_lock for reading because it serializes against the
586 * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
587 */
588 if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
589 trace_skip_task_reaping(tsk->pid);
590 goto out_unlock;
591 }
592
593 trace_start_task_reaping(tsk->pid);
594
595 /* failed to reap part of the address space. Try again later */
596 ret = __oom_reap_task_mm(mm);
597 if (!ret)
598 goto out_finish;
599
600 pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
601 task_pid_nr(tsk), tsk->comm,
602 K(get_mm_counter(mm, MM_ANONPAGES)),
603 K(get_mm_counter(mm, MM_FILEPAGES)),
604 K(get_mm_counter(mm, MM_SHMEMPAGES)));
605 out_finish:
606 trace_finish_task_reaping(tsk->pid);
607 out_unlock:
608 mmap_read_unlock(mm);
609
610 return ret;
611 }
612
613 #define MAX_OOM_REAP_RETRIES 10
614 static void oom_reap_task(struct task_struct *tsk)
615 {
616 int attempts = 0;
617 struct mm_struct *mm = tsk->signal->oom_mm;
618
619 /* Retry the mmap_read_trylock(mm) a few times */
620 while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
621 schedule_timeout_idle(HZ/10);
622
623 if (attempts <= MAX_OOM_REAP_RETRIES ||
624 test_bit(MMF_OOM_SKIP, &mm->flags))
625 goto done;
626
627 pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
628 task_pid_nr(tsk), tsk->comm);
629 sched_show_task(tsk);
630 debug_show_all_locks();
631
632 done:
633 tsk->oom_reaper_list = NULL;
634
635 /*
636 * Hide this mm from OOM killer because it has been either reaped or
637 * somebody can't call mmap_write_unlock(mm).
638 */
639 set_bit(MMF_OOM_SKIP, &mm->flags);
640
641 /* Drop a reference taken by queue_oom_reaper */
642 put_task_struct(tsk);
643 }
644
645 static int oom_reaper(void *unused)
646 {
647 set_freezable();
648
649 while (true) {
650 struct task_struct *tsk = NULL;
651
652 wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
653 spin_lock_irq(&oom_reaper_lock);
654 if (oom_reaper_list != NULL) {
655 tsk = oom_reaper_list;
656 oom_reaper_list = tsk->oom_reaper_list;
657 }
658 spin_unlock_irq(&oom_reaper_lock);
659
660 if (tsk)
661 oom_reap_task(tsk);
662 }
663
664 return 0;
665 }
666
667 static void wake_oom_reaper(struct timer_list *timer)
668 {
669 struct task_struct *tsk = container_of(timer, struct task_struct,
670 oom_reaper_timer);
671 struct mm_struct *mm = tsk->signal->oom_mm;
672 unsigned long flags;
673
674 /* The victim managed to terminate on its own - see exit_mmap */
675 if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
676 put_task_struct(tsk);
677 return;
678 }
679
680 spin_lock_irqsave(&oom_reaper_lock, flags);
681 tsk->oom_reaper_list = oom_reaper_list;
682 oom_reaper_list = tsk;
683 spin_unlock_irqrestore(&oom_reaper_lock, flags);
684 trace_wake_reaper(tsk->pid);
685 wake_up(&oom_reaper_wait);
686 }
687
688 /*
689 * Give the OOM victim time to exit naturally before invoking the oom reaper.
690 * The timer's timeout is arbitrary... the longer it is, the longer the worst
691 * case scenario for the OOM can take. If it is too small, the oom_reaper can
692 * get in the way and release resources needed by the process exit path.
693 * e.g. The futex robust list can sit in Anon|Private memory that gets reaped
694 * before the exit path is able to wake the futex waiters.
695 */
696 #define OOM_REAPER_DELAY (2*HZ)
697 static void queue_oom_reaper(struct task_struct *tsk)
698 {
699 /* mm is already queued? */
700 if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
701 return;
702
703 get_task_struct(tsk);
704 timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
705 tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
706 add_timer(&tsk->oom_reaper_timer);
707 }
708
709 #ifdef CONFIG_SYSCTL
710 static struct ctl_table vm_oom_kill_table[] = {
711 {
712 .procname = "panic_on_oom",
713 .data = &sysctl_panic_on_oom,
714 .maxlen = sizeof(sysctl_panic_on_oom),
715 .mode = 0644,
716 .proc_handler = proc_dointvec_minmax,
717 .extra1 = SYSCTL_ZERO,
718 .extra2 = SYSCTL_TWO,
719 },
720 {
721 .procname = "oom_kill_allocating_task",
722 .data = &sysctl_oom_kill_allocating_task,
723 .maxlen = sizeof(sysctl_oom_kill_allocating_task),
724 .mode = 0644,
725 .proc_handler = proc_dointvec,
726 },
727 {
728 .procname = "oom_dump_tasks",
729 .data = &sysctl_oom_dump_tasks,
730 .maxlen = sizeof(sysctl_oom_dump_tasks),
731 .mode = 0644,
732 .proc_handler = proc_dointvec,
733 },
734 {}
735 };
736 #endif
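/*
 * These knobs surface as /proc/sys/vm/panic_on_oom (0, 1 or 2),
 * /proc/sys/vm/oom_kill_allocating_task and /proc/sys/vm/oom_dump_tasks.
 * A minimal userspace sketch for flipping one of them (illustrative only,
 * set_oom_dump_tasks() is a hypothetical helper):
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static int set_oom_dump_tasks(int enable)
 *	{
 *		int fd = open("/proc/sys/vm/oom_dump_tasks", O_WRONLY);
 *
 *		if (fd < 0)
 *			return -1;
 *		if (write(fd, enable ? "1" : "0", 1) != 1) {
 *			close(fd);
 *			return -1;
 *		}
 *		return close(fd);
 *	}
 */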
737
738 static int __init oom_init(void)
739 {
740 oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
741 #ifdef CONFIG_SYSCTL
742 register_sysctl_init("vm", vm_oom_kill_table);
743 #endif
744 return 0;
745 }
746 subsys_initcall(oom_init)
747 #else
748 static inline void queue_oom_reaper(struct task_struct *tsk)
749 {
750 }
751 #endif /* CONFIG_MMU */
752
753 /**
754 * mark_oom_victim - mark the given task as OOM victim
755 * @tsk: task to mark
756 *
757 * Has to be called with oom_lock held and never after
758 * oom has been disabled already.
759 *
760 * tsk->mm has to be non-NULL and the caller has to guarantee it is stable
761 * (either hold task_lock or operate on current).
762 */
763 static void mark_oom_victim(struct task_struct *tsk)
764 {
765 const struct cred *cred;
766 struct mm_struct *mm = tsk->mm;
767
768 WARN_ON(oom_killer_disabled);
769 /* OOM killer might race with memcg OOM */
770 if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
771 return;
772
773 /* oom_mm is bound to the signal struct lifetime. */
774 if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
775 mmgrab(tsk->signal->oom_mm);
776
777 /*
778 * Make sure that the task is woken up from uninterruptible sleep
779 * if it is frozen, because otherwise the OOM killer would not be able to
780 * free any memory and would livelock. freezing_slow_path will tell the
781 * freezer that TIF_MEMDIE tasks should be ignored.
782 */
783 __thaw_task(tsk);
784 atomic_inc(&oom_victims);
785 cred = get_task_cred(tsk);
786 trace_mark_victim(tsk, cred->uid.val);
787 put_cred(cred);
788 }
789
790 /**
791 * exit_oom_victim - note the exit of an OOM victim
792 */
793 void exit_oom_victim(void)
794 {
795 clear_thread_flag(TIF_MEMDIE);
796
797 if (!atomic_dec_return(&oom_victims))
798 wake_up_all(&oom_victims_wait);
799 }
800
801 /**
802 * oom_killer_enable - enable OOM killer
803 */
804 void oom_killer_enable(void)
805 {
806 oom_killer_disabled = false;
807 pr_info("OOM killer enabled.\n");
808 }
809
810 /**
811 * oom_killer_disable - disable OOM killer
812 * @timeout: maximum timeout to wait for oom victims in jiffies
813 *
814 * Forces all page allocations to fail rather than trigger OOM killer.
815 * Will block and wait until all OOM victims are killed or the given
816 * timeout expires.
817 *
818 * The function cannot be called when there are runnable user tasks because
819 * userspace would see unexpected allocation failures as a result. Any
820 * new usage of this function should be discussed with the MM people.
821 *
822 * Returns true if successful and false if the OOM killer cannot be
823 * disabled.
824 */
825 bool oom_killer_disable(signed long timeout)
826 {
827 signed long ret;
828
829 /*
830 * Make sure to not race with an ongoing OOM killer. Check that the
831 * current is not killed (possibly due to sharing the victim's memory).
832 */
833 if (mutex_lock_killable(&oom_lock))
834 return false;
835 oom_killer_disabled = true;
836 mutex_unlock(&oom_lock);
837
838 ret = wait_event_interruptible_timeout(oom_victims_wait,
839 !atomic_read(&oom_victims), timeout);
840 if (ret <= 0) {
841 oom_killer_enable();
842 return false;
843 }
844 pr_info("OOM killer disabled.\n");
845
846 return true;
847 }
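/*
 * Expected call pattern (a simplified sketch; do_frozen_work() and the
 * 20 second timeout are placeholders, not taken from this file): every
 * successful disable must be paired with oom_killer_enable() once the
 * caller no longer needs allocations to fail instead of OOM-killing.
 *
 *	if (!oom_killer_disable(msecs_to_jiffies(20 * MSEC_PER_SEC)))
 *		return -EBUSY;
 *	ret = do_frozen_work();
 *	oom_killer_enable();
 */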
848
849 static inline bool __task_will_free_mem(struct task_struct *task)
850 {
851 struct signal_struct *sig = task->signal;
852
853 /*
854 * A coredumping process may sleep for an extended period in
855 * coredump_task_exit(), so the oom killer cannot assume that
856 * the process will promptly exit and release memory.
857 */
858 if (sig->core_state)
859 return false;
860
861 if (sig->flags & SIGNAL_GROUP_EXIT)
862 return true;
863
864 if (thread_group_empty(task) && (task->flags & PF_EXITING))
865 return true;
866
867 return false;
868 }
869
870 /*
871 * Checks whether the given task is dying or exiting and likely to
872 * release its address space. This means that all threads and processes
873 * sharing the same mm have to be killed or exiting.
874 * Caller has to make sure that task->mm is stable (hold task_lock or
875 * it operates on the current).
876 */
877 static bool task_will_free_mem(struct task_struct *task)
878 {
879 struct mm_struct *mm = task->mm;
880 struct task_struct *p;
881 bool ret = true;
882
883 /*
884 * Skip tasks without an mm because the task might have passed exit_mm and
885 * exit_oom_victim already. oom_reaper could have rescued that but do not rely
886 * on that for now. We can consider using find_lock_task_mm in the future.
887 */
888 if (!mm)
889 return false;
890
891 if (!__task_will_free_mem(task))
892 return false;
893
894 /*
895 * This task has already been drained by the oom reaper so there are
896 * only small chances it will free some more
897 */
898 if (test_bit(MMF_OOM_SKIP, &mm->flags))
899 return false;
900
901 if (atomic_read(&mm->mm_users) <= 1)
902 return true;
903
904 /*
905 * Make sure that all tasks which share the mm with the given task
906 * are dying as well to make sure that a) nobody pins its mm and
907 * b) the task is also reapable by the oom reaper.
908 */
909 rcu_read_lock();
910 for_each_process(p) {
911 if (!process_shares_mm(p, mm))
912 continue;
913 if (same_thread_group(task, p))
914 continue;
915 ret = __task_will_free_mem(p);
916 if (!ret)
917 break;
918 }
919 rcu_read_unlock();
920
921 return ret;
922 }
923
924 static void __oom_kill_process(struct task_struct *victim, const char *message)
925 {
926 struct task_struct *p;
927 struct mm_struct *mm;
928 bool can_oom_reap = true;
929
930 p = find_lock_task_mm(victim);
931 if (!p) {
932 pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
933 message, task_pid_nr(victim), victim->comm);
934 put_task_struct(victim);
935 return;
936 } else if (victim != p) {
937 get_task_struct(p);
938 put_task_struct(victim);
939 victim = p;
940 }
941
942 /* Get a reference to safely compare mm after task_unlock(victim) */
943 mm = victim->mm;
944 mmgrab(mm);
945
946 /* Raise event before sending signal: task reaper must see this */
947 count_vm_event(OOM_KILL);
948 memcg_memory_event_mm(mm, MEMCG_OOM_KILL);
949
950 /*
951 * We should send SIGKILL before granting access to memory reserves
952 * in order to prevent the OOM victim from depleting the memory
953 * reserves from the user space under its control.
954 */
955 do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
956 mark_oom_victim(victim);
957 pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
958 message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
959 K(get_mm_counter(mm, MM_ANONPAGES)),
960 K(get_mm_counter(mm, MM_FILEPAGES)),
961 K(get_mm_counter(mm, MM_SHMEMPAGES)),
962 from_kuid(&init_user_ns, task_uid(victim)),
963 mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
964 task_unlock(victim);
965
966 /*
967 * Kill all user processes sharing victim->mm in other thread groups, if
968 * any. They don't get access to memory reserves, though, to avoid
969 * depletion of all memory. This prevents mm->mmap_lock livelock when an
970 * oom-killed thread cannot exit because it requires the semaphore and
971 * it's contended by another thread trying to allocate memory itself.
972 * That thread will now get access to memory reserves since it has a
973 * pending fatal signal.
974 */
975 rcu_read_lock();
976 for_each_process(p) {
977 if (!process_shares_mm(p, mm))
978 continue;
979 if (same_thread_group(p, victim))
980 continue;
981 if (is_global_init(p)) {
982 can_oom_reap = false;
983 set_bit(MMF_OOM_SKIP, &mm->flags);
984 pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
985 task_pid_nr(victim), victim->comm,
986 task_pid_nr(p), p->comm);
987 continue;
988 }
989 /*
990 * No kthread_use_mm() user needs to read from the userspace so
991 * we are ok to reap it.
992 */
993 if (unlikely(p->flags & PF_KTHREAD))
994 continue;
995 do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
996 }
997 rcu_read_unlock();
998
999 if (can_oom_reap)
1000 queue_oom_reaper(victim);
1001
1002 mmdrop(mm);
1003 put_task_struct(victim);
1004 }
1005
1006 /*
1007 * Kill provided task unless it's secured by setting
1008 * oom_score_adj to OOM_SCORE_ADJ_MIN.
1009 */
1010 static int oom_kill_memcg_member(struct task_struct *task, void *message)
1011 {
1012 if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
1013 !is_global_init(task)) {
1014 get_task_struct(task);
1015 __oom_kill_process(task, message);
1016 }
1017 return 0;
1018 }
1019
1020 static void oom_kill_process(struct oom_control *oc, const char *message)
1021 {
1022 struct task_struct *victim = oc->chosen;
1023 struct mem_cgroup *oom_group;
1024 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
1025 DEFAULT_RATELIMIT_BURST);
1026
1027 /*
1028 * If the task is already exiting, don't alarm the sysadmin or kill
1029 * its children or threads, just give it access to memory reserves
1030 * so it can die quickly
1031 */
1032 task_lock(victim);
1033 if (task_will_free_mem(victim)) {
1034 mark_oom_victim(victim);
1035 queue_oom_reaper(victim);
1036 task_unlock(victim);
1037 put_task_struct(victim);
1038 return;
1039 }
1040 task_unlock(victim);
1041
1042 if (__ratelimit(&oom_rs))
1043 dump_header(oc, victim);
1044
1045 /*
1046 * Do we need to kill the entire memory cgroup?
1047 * Or even one of the ancestor memory cgroups?
1048 * Check this out before killing the victim task.
1049 */
1050 oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
1051
1052 __oom_kill_process(victim, message);
1053
1054 /*
1055 * If necessary, kill all tasks in the selected memory cgroup.
1056 */
1057 if (oom_group) {
1058 memcg_memory_event(oom_group, MEMCG_OOM_GROUP_KILL);
1059 mem_cgroup_print_oom_group(oom_group);
1060 mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
1061 (void *)message);
1062 mem_cgroup_put(oom_group);
1063 }
1064 }
1065
1066 /*
1067 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
1068 */
1069 static void check_panic_on_oom(struct oom_control *oc)
1070 {
1071 if (likely(!sysctl_panic_on_oom))
1072 return;
1073 if (sysctl_panic_on_oom != 2) {
1074 /*
1075 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
1076 * does not panic for cpuset, mempolicy, or memcg allocation
1077 * failures.
1078 */
1079 if (oc->constraint != CONSTRAINT_NONE)
1080 return;
1081 }
1082 /* Do not panic for oom kills triggered by sysrq */
1083 if (is_sysrq_oom(oc))
1084 return;
1085 dump_header(oc, NULL);
1086 panic("Out of memory: %s panic_on_oom is enabled\n",
1087 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
1088 }
1089
1090 static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
1091
1092 int register_oom_notifier(struct notifier_block *nb)
1093 {
1094 return blocking_notifier_chain_register(&oom_notify_list, nb);
1095 }
1096 EXPORT_SYMBOL_GPL(register_oom_notifier);
1097
1098 int unregister_oom_notifier(struct notifier_block *nb)
1099 {
1100 return blocking_notifier_chain_unregister(&oom_notify_list, nb);
1101 }
1102 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
1103
1104 /**
1105 * out_of_memory - kill the "best" process when we run out of memory
1106 * @oc: pointer to struct oom_control
1107 *
1108 * If we run out of memory, we have the choice between either
1109 * killing a random task (bad), letting the system crash (worse)
1110 * OR trying to be smart about which process to kill. Note that we
1111 * don't have to be perfect here, we just have to be good.
1112 */
1113 bool out_of_memory(struct oom_control *oc)
1114 {
1115 unsigned long freed = 0;
1116
1117 if (oom_killer_disabled)
1118 return false;
1119
1120 if (!is_memcg_oom(oc)) {
1121 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
1122 if (freed > 0 && !is_sysrq_oom(oc))
1123 /* Got some memory back in the last second. */
1124 return true;
1125 }
1126
1127 /*
1128 * If current has a pending SIGKILL or is exiting, then automatically
1129 * select it. The goal is to allow it to allocate so that it may
1130 * quickly exit and free its memory.
1131 */
1132 if (task_will_free_mem(current)) {
1133 mark_oom_victim(current);
1134 queue_oom_reaper(current);
1135 return true;
1136 }
1137
1138 /*
1139 * The OOM killer does not compensate for IO-less reclaim.
1140 * But mem_cgroup_oom() has to invoke the OOM killer even
1141 * if it is a GFP_NOFS allocation.
1142 */
1143 if (!(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
1144 return true;
1145
1146 /*
1147 * Check if there were limitations on the allocation (only relevant for
1148 * NUMA and memcg) that may require different handling.
1149 */
1150 oc->constraint = constrained_alloc(oc);
1151 if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
1152 oc->nodemask = NULL;
1153 check_panic_on_oom(oc);
1154
1155 if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
1156 current->mm && !oom_unkillable_task(current) &&
1157 oom_cpuset_eligible(current, oc) &&
1158 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
1159 get_task_struct(current);
1160 oc->chosen = current;
1161 oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
1162 return true;
1163 }
1164
1165 select_bad_process(oc);
1166 /* Found nothing?!?! */
1167 if (!oc->chosen) {
1168 dump_header(oc, NULL);
1169 pr_warn("Out of memory and no killable processes...\n");
1170 /*
1171 * If we got here due to an actual allocation at the
1172 * system level, we cannot survive this and will enter
1173 * an endless loop in the allocator. Bail out now.
1174 */
1175 if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
1176 panic("System is deadlocked on memory\n");
1177 }
1178 if (oc->chosen && oc->chosen != (void *)-1UL)
1179 oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
1180 "Memory cgroup out of memory");
1181 return !!oc->chosen;
1182 }
1183
1184 /*
1185 * The pagefault handler calls here because some allocation has failed. We have
1186 * to take care of the memcg OOM here because this is the only safe context without
1187 * any locks held, but we let the oom killer triggered from the allocation
1188 * context take care of the global OOM.
1189 */
1190 void pagefault_out_of_memory(void)
1191 {
1192 static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL,
1193 DEFAULT_RATELIMIT_BURST);
1194
1195 if (mem_cgroup_oom_synchronize(true))
1196 return;
1197
1198 if (fatal_signal_pending(current))
1199 return;
1200
1201 if (__ratelimit(&pfoom_rs))
1202 pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
1203 }
1204
1205 SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
1206 {
1207 #ifdef CONFIG_MMU
1208 struct mm_struct *mm = NULL;
1209 struct task_struct *task;
1210 struct task_struct *p;
1211 unsigned int f_flags;
1212 bool reap = false;
1213 long ret = 0;
1214
1215 if (flags)
1216 return -EINVAL;
1217
1218 task = pidfd_get_task(pidfd, &f_flags);
1219 if (IS_ERR(task))
1220 return PTR_ERR(task);
1221
1222 /*
1223 * Make sure to choose a thread which still has a reference to mm
1224 * during the group exit
1225 */
1226 p = find_lock_task_mm(task);
1227 if (!p) {
1228 ret = -ESRCH;
1229 goto put_task;
1230 }
1231
1232 mm = p->mm;
1233 mmgrab(mm);
1234
1235 if (task_will_free_mem(p))
1236 reap = true;
1237 else {
1238 /* Error only if the work has not been done already */
1239 if (!test_bit(MMF_OOM_SKIP, &mm->flags))
1240 ret = -EINVAL;
1241 }
1242 task_unlock(p);
1243
1244 if (!reap)
1245 goto drop_mm;
1246
1247 if (mmap_read_lock_killable(mm)) {
1248 ret = -EINTR;
1249 goto drop_mm;
1250 }
1251 /*
1252 * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
1253 * that a possible change in exit_mmap is seen.
1254 */
1255 if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
1256 ret = -EAGAIN;
1257 mmap_read_unlock(mm);
1258
1259 drop_mm:
1260 mmdrop(mm);
1261 put_task:
1262 put_task_struct(task);
1263 return ret;
1264 #else
1265 return -ENOSYS;
1266 #endif /* CONFIG_MMU */
1267 }
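/*
 * A minimal userspace sketch of the intended use: kill a process through
 * its pidfd and then ask the kernel to reap its address space without
 * waiting for the exit to finish.  kill_and_reap() is a hypothetical
 * helper; it assumes the installed headers define SYS_pidfd_open,
 * SYS_pidfd_send_signal and SYS_process_mrelease, and error handling is
 * trimmed.
 *
 *	#include <signal.h>
 *	#include <sys/syscall.h>
 *	#include <sys/types.h>
 *	#include <unistd.h>
 *
 *	static int kill_and_reap(pid_t pid)
 *	{
 *		int pidfd = syscall(SYS_pidfd_open, pid, 0);
 *
 *		if (pidfd < 0)
 *			return -1;
 *		syscall(SYS_pidfd_send_signal, pidfd, SIGKILL, NULL, 0);
 *		syscall(SYS_process_mrelease, pidfd, 0);
 *		return close(pidfd);
 *	}
 */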
1268