/*
 * linux/mm/oom_kill.c
 *
 * Copyright (C) 1998,2000 Rik van Riel
 *	Thanks go out to Claus Fischer for some serious inspiration and
 *	for goading me into coding this file...
 * Copyright (C) 2010 Google, Inc.
 *	Rewritten by David Rientjes
 *
 * The routines in this file are used to kill a process when
 * we're seriously out of memory. This gets called from __alloc_pages()
 * in mm/page_alloc.c when we really run out of memory.
 *
 * Since we won't call these routines often (on a well-configured
 * machine) this file will double as a 'coding guide' and a signpost
 * for newbie kernel hackers. It features several pointers to major
 * kernel subsystems and hints as to where to find out what things do.
 */

#include <linux/oom.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
#include <linux/export.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
#include <linux/mempolicy.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/freezer.h>
#include <linux/ftrace.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/init.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/oom.h>

int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;

DEFINE_MUTEX(oom_lock);

#ifdef CONFIG_NUMA
/**
 * has_intersects_mems_allowed() - check task eligibility for kill
 * @start: task struct of which task to consider
 * @mask: nodemask passed to page allocator for mempolicy ooms
 *
 * Task eligibility is determined by whether or not a candidate task, @tsk,
 * shares the same mempolicy nodes as current if it is bound by such a policy
 * and whether or not it has the same set of allowed cpuset nodes.
 */
static bool has_intersects_mems_allowed(struct task_struct *start,
					const nodemask_t *mask)
{
	struct task_struct *tsk;
	bool ret = false;

	rcu_read_lock();
	for_each_thread(start, tsk) {
		if (mask) {
			/*
			 * If this is a mempolicy constrained oom, tsk's
			 * cpuset is irrelevant. Only return true if its
			 * mempolicy intersects current, otherwise it may be
			 * needlessly killed.
			 */
			ret = mempolicy_nodemask_intersects(tsk, mask);
		} else {
			/*
			 * This is not a mempolicy constrained oom, so only
			 * check the mems of tsk's cpuset.
			 */
			ret = cpuset_mems_allowed_intersects(current, tsk);
		}
		if (ret)
			break;
	}
	rcu_read_unlock();

	return ret;
}
#else
static bool has_intersects_mems_allowed(struct task_struct *tsk,
					const nodemask_t *mask)
{
	return true;
}
#endif /* CONFIG_NUMA */

/*
 * The process p may have detached its own ->mm while exiting or through
 * use_mm(), but one or more of its subthreads may still have a valid
 * pointer. Return p, or any of its subthreads with a valid ->mm, with
 * task_lock() held.
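 *
 * The returned thread is left locked; the caller is expected to release it
 * with task_unlock() once it is done looking at ->mm.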
 */
struct task_struct *find_lock_task_mm(struct task_struct *p)
{
	struct task_struct *t;

	rcu_read_lock();

	for_each_thread(p, t) {
		task_lock(t);
		if (likely(t->mm))
			goto found;
		task_unlock(t);
	}
	t = NULL;
found:
	rcu_read_unlock();

	return t;
}

/*
 * order == -1 means the oom kill is required by sysrq, otherwise only
 * for display purposes.
 */
static inline bool is_sysrq_oom(struct oom_control *oc)
{
	return oc->order == -1;
}

static inline bool is_memcg_oom(struct oom_control *oc)
{
	return oc->memcg != NULL;
}

/* return true if the task is not adequate as a candidate victim task. */
static bool oom_unkillable_task(struct task_struct *p,
		struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
	if (is_global_init(p))
		return true;
	if (p->flags & PF_KTHREAD)
		return true;

	/* When mem_cgroup_out_of_memory() and p is not a member of the group */
	if (memcg && !task_in_mem_cgroup(p, memcg))
		return true;

	/* p may not have freeable memory in nodemask */
	if (!has_intersects_mems_allowed(p, nodemask))
		return true;

	return false;
}

/**
 * oom_badness - heuristic function to determine which candidate task to kill
 * @p: task struct of the task whose badness score we calculate
 * @memcg: task must be a member of this memory controller to be eligible,
 *         if set (memcg ooms)
 * @nodemask: nodemask passed to page allocator for mempolicy ooms
 * @totalpages: total present RAM allowed for page allocation
 *
 * The heuristic for determining which task to kill is made to be as simple and
 * predictable as possible. The goal is to return the highest value for the
 * task consuming the most memory to avoid subsequent oom failures.
 */
unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
			  const nodemask_t *nodemask, unsigned long totalpages)
{
	long points;
	long adj;

	if (oom_unkillable_task(p, memcg, nodemask))
		return 0;

	p = find_lock_task_mm(p);
	if (!p)
		return 0;

	/*
	 * Do not even consider tasks which are explicitly marked oom
	 * unkillable, have already been oom reaped, or are in the middle
	 * of a vfork.
	 */
	adj = (long)p->signal->oom_score_adj;
	if (adj == OOM_SCORE_ADJ_MIN ||
			test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
			in_vfork(p)) {
		task_unlock(p);
		return 0;
	}

	/*
	 * The baseline for the badness score is the proportion of RAM that each
	 * task's rss, pagetable and swap space use.
	 */
	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
		atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
	task_unlock(p);

	/*
	 * Root processes get 3% bonus, just like the __vm_enough_memory()
	 * implementation used by LSMs.
	 */
	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
		points -= (points * 3) / 100;

	/* Normalize to oom_score_adj units */
	adj *= totalpages / 1000;
	points += adj;

	/*
	 * Never return 0 for an eligible task regardless of the root bonus and
	 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
	 */
	return points > 0 ? points : 1;
}

enum oom_constraint {
	CONSTRAINT_NONE,
	CONSTRAINT_CPUSET,
	CONSTRAINT_MEMORY_POLICY,
	CONSTRAINT_MEMCG,
};

/*
 * Determine the type of allocation constraint.
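 *
 * Returns CONSTRAINT_MEMCG for memcg charge failures, CONSTRAINT_MEMORY_POLICY
 * when the allocation is restricted by a mempolicy nodemask, CONSTRAINT_CPUSET
 * when it is restricted by the current cpuset, and CONSTRAINT_NONE otherwise.
 * As a side effect, oc->totalpages is set to the amount of memory (plus swap)
 * relevant to the detected constraint.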
 */
static enum oom_constraint constrained_alloc(struct oom_control *oc)
{
	struct zone *zone;
	struct zoneref *z;
	enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
	bool cpuset_limited = false;
	int nid;

	if (is_memcg_oom(oc)) {
		oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1;
		return CONSTRAINT_MEMCG;
	}

	/* Default to all available memory */
	oc->totalpages = totalram_pages + total_swap_pages;

	if (!IS_ENABLED(CONFIG_NUMA))
		return CONSTRAINT_NONE;

	if (!oc->zonelist)
		return CONSTRAINT_NONE;
	/*
	 * We reach this point for a __GFP_THISNODE allocation only when
	 * __GFP_NOFAIL is also used. We should avoid killing current and
	 * instead fall back to killing a random task. Ideally this would be
	 * handled as CONSTRAINT_THISNODE, but there is no way to do that yet.
	 */
	if (oc->gfp_mask & __GFP_THISNODE)
		return CONSTRAINT_NONE;

	/*
	 * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
	 * the page allocator means a mempolicy is in effect. Cpuset policy
	 * is enforced in get_page_from_freelist().
	 */
	if (oc->nodemask &&
	    !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
		oc->totalpages = total_swap_pages;
		for_each_node_mask(nid, *oc->nodemask)
			oc->totalpages += node_spanned_pages(nid);
		return CONSTRAINT_MEMORY_POLICY;
	}

	/* Check whether this allocation failure is caused by the cpuset's wall function */
	for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
			high_zoneidx, oc->nodemask)
		if (!cpuset_zone_allowed(zone, oc->gfp_mask))
			cpuset_limited = true;

	if (cpuset_limited) {
		oc->totalpages = total_swap_pages;
		for_each_node_mask(nid, cpuset_current_mems_allowed)
			oc->totalpages += node_spanned_pages(nid);
		return CONSTRAINT_CPUSET;
	}
	return CONSTRAINT_NONE;
}

static int oom_evaluate_task(struct task_struct *task, void *arg)
{
	struct oom_control *oc = arg;
	unsigned long points;

	if (oom_unkillable_task(task, NULL, oc->nodemask))
		goto next;

	/*
	 * This task already has access to memory reserves and is being killed.
	 * Don't allow any other task to have access to the reserves unless
	 * the task has MMF_OOM_SKIP, because the chances that it would release
	 * any memory are quite low.
	 */
	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
			goto next;
		goto abort;
	}

	/*
	 * If the task is allocating a lot of memory and has been marked to be
	 * killed first if it triggers an oom, then select it.
	 */
	if (oom_task_origin(task)) {
		points = ULONG_MAX;
		goto select;
	}

	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
	if (!points || points < oc->chosen_points)
		goto next;

	/* Prefer thread group leaders for display purposes */
	if (points == oc->chosen_points && thread_group_leader(oc->chosen))
		goto next;
select:
	if (oc->chosen)
		put_task_struct(oc->chosen);
	get_task_struct(task);
	oc->chosen = task;
	oc->chosen_points = points;
next:
	return 0;
abort:
	if (oc->chosen)
		put_task_struct(oc->chosen);
	oc->chosen = (void *)-1UL;
	return 1;
}

/*
 * Simple selection loop. We choose the process with the highest number of
 * 'points'. If the scan was aborted, oc->chosen is set to -1.
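 * The chosen task's score is then normalized (points * 1000 / totalpages) so
 * that it is expressed in roughly thousandths of the allowed memory, matching
 * the units of oom_score_adj.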
 */
static void select_bad_process(struct oom_control *oc)
{
	if (is_memcg_oom(oc))
		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
	else {
		struct task_struct *p;

		rcu_read_lock();
		for_each_process(p)
			if (oom_evaluate_task(p, oc))
				break;
		rcu_read_unlock();
	}

	oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
}

/**
 * dump_tasks - dump current memory state of all system tasks
 * @memcg: current's memory controller, if constrained
 * @nodemask: nodemask passed to page allocator for mempolicy ooms
 *
 * Dumps the current memory state of all eligible tasks. Tasks not in the same
 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
 * are not shown.
 * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
 * nr_pmds, swapents, oom_score_adj value, and name.
 */
static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
	struct task_struct *p;
	struct task_struct *task;

	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds swapents oom_score_adj name\n");
	rcu_read_lock();
	for_each_process(p) {
		if (oom_unkillable_task(p, memcg, nodemask))
			continue;

		task = find_lock_task_mm(p);
		if (!task) {
			/*
			 * This is a kthread or all of p's threads have already
			 * detached their mm's. There's no need to report
			 * them; they can't be oom killed anyway.
			 */
			continue;
		}

		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu         %5hd %s\n",
			task->pid, from_kuid(&init_user_ns, task_uid(task)),
			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
			atomic_long_read(&task->mm->nr_ptes),
			mm_nr_pmds(task->mm),
			get_mm_counter(task->mm, MM_SWAPENTS),
			task->signal->oom_score_adj, task->comm);
		task_unlock(task);
	}
	rcu_read_unlock();
}

static void dump_header(struct oom_control *oc, struct task_struct *p)
{
	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=",
		current->comm, oc->gfp_mask, &oc->gfp_mask);
	if (oc->nodemask)
		pr_cont("%*pbl", nodemask_pr_args(oc->nodemask));
	else
		pr_cont("(null)");
	pr_cont(", order=%d, oom_score_adj=%hd\n",
		oc->order, current->signal->oom_score_adj);
	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
		pr_warn("COMPACTION is disabled!!!\n");

	cpuset_print_current_mems_allowed();
	dump_stack();
	if (oc->memcg)
		mem_cgroup_print_oom_info(oc->memcg, p);
	else
		show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
	if (sysctl_oom_dump_tasks)
		dump_tasks(oc->memcg, oc->nodemask);
}

/*
 * Number of OOM victims in flight
 */
static atomic_t oom_victims = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);

static bool oom_killer_disabled __read_mostly;

#define K(x) ((x) << (PAGE_SHIFT-10))

/*
 * task->mm can be NULL if the task is the exited group leader. So to
 * determine whether the task is using a particular mm, we examine all the
 * task's threads: if one of those is using this mm then this task was also
 * using it.
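 *
 * All live threads of a thread group share one mm, so comparing the first
 * non-NULL ->mm we find against the target is sufficient.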
 */
bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
{
	struct task_struct *t;

	for_each_thread(p, t) {
		struct mm_struct *t_mm = READ_ONCE(t->mm);
		if (t_mm)
			return t_mm == mm;
	}
	return false;
}

#ifdef CONFIG_MMU
/*
 * OOM Reaper kernel thread which tries to reap the memory used by the OOM
 * victim (if that is possible) to help the OOM killer to move on.
 */
static struct task_struct *oom_reaper_th;
static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);

static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
	struct mmu_gather tlb;
	struct vm_area_struct *vma;
	bool ret = true;

	/*
	 * We have to make sure to not race with the victim exit path
	 * and cause premature new oom victim selection:
	 * __oom_reap_task_mm		exit_mm
	 *   mmget_not_zero
	 *				  mmput
	 *				    atomic_dec_and_test
	 *				  exit_oom_victim
	 *				[...]
	 *				out_of_memory
	 *				  select_bad_process
	 *				    # no TIF_MEMDIE task selects new victim
	 *  unmap_page_range # frees some memory
	 */
	mutex_lock(&oom_lock);

	if (!down_read_trylock(&mm->mmap_sem)) {
		ret = false;
		trace_skip_task_reaping(tsk->pid);
		goto unlock_oom;
	}

	/*
	 * If the mm has notifiers then we would need to invalidate them around
	 * unmap_page_range and that is risky because notifiers can sleep and
	 * what they do is basically nondeterministic. So let's have a short
	 * sleep to give the oom victim some more time.
	 * TODO: we really want to get rid of this ugly hack and make sure that
	 * notifiers cannot block for an unbounded amount of time and add
	 * mmu_notifier_invalidate_range_{start,end} around unmap_page_range.
	 */
	if (mm_has_notifiers(mm)) {
		up_read(&mm->mmap_sem);
		schedule_timeout_idle(HZ);
		goto unlock_oom;
	}

	/*
	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
	 * under mmap_sem for reading because it serializes against the
	 * down_write();up_write() cycle in exit_mmap().
	 */
	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
		up_read(&mm->mmap_sem);
		trace_skip_task_reaping(tsk->pid);
		goto unlock_oom;
	}

	trace_start_task_reaping(tsk->pid);

	/*
	 * Tell all users of get_user/copy_from_user etc... that the content
	 * is no longer stable. No barriers really needed because unmapping
	 * should imply barriers already and the reader would hit a page fault
	 * if it stumbled over reaped memory.
	 */
	set_bit(MMF_UNSTABLE, &mm->flags);

	tlb_gather_mmu(&tlb, mm, 0, -1);
	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
		if (!can_madv_dontneed_vma(vma))
			continue;

		/*
		 * Only anonymous pages have a good chance to be dropped
		 * without additional steps which we cannot afford as we
		 * are OOM already.
		 *
		 * We do not even care about fs backed pages because all
		 * that are reclaimable have already been reclaimed and
		 * we do not want to block exit_mmap by keeping mm ref
		 * count elevated without a good reason.
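		 *
		 * In practice this means only private mappings are unmapped
		 * below; VM_SHARED ones, including shmem backed shared
		 * anonymous memory, are left alone.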
		 */
		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
			unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
					 NULL);
	}
	tlb_finish_mmu(&tlb, 0, -1);
	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
			task_pid_nr(tsk), tsk->comm,
			K(get_mm_counter(mm, MM_ANONPAGES)),
			K(get_mm_counter(mm, MM_FILEPAGES)),
			K(get_mm_counter(mm, MM_SHMEMPAGES)));
	up_read(&mm->mmap_sem);

	trace_finish_task_reaping(tsk->pid);
unlock_oom:
	mutex_unlock(&oom_lock);
	return ret;
}

#define MAX_OOM_REAP_RETRIES 10
static void oom_reap_task(struct task_struct *tsk)
{
	int attempts = 0;
	struct mm_struct *mm = tsk->signal->oom_mm;

	/* Retry the down_read_trylock(mmap_sem) a few times */
	while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
		schedule_timeout_idle(HZ/10);

	if (attempts <= MAX_OOM_REAP_RETRIES)
		goto done;

	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
		task_pid_nr(tsk), tsk->comm);
	debug_show_all_locks();

done:
	tsk->oom_reaper_list = NULL;

	/*
	 * Hide this mm from the OOM killer because it has either been reaped
	 * or somebody is stuck unable to call up_write(mmap_sem).
	 */
	set_bit(MMF_OOM_SKIP, &mm->flags);

	/* Drop a reference taken by wake_oom_reaper */
	put_task_struct(tsk);
}

static int oom_reaper(void *unused)
{
	while (true) {
		struct task_struct *tsk = NULL;

		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
		spin_lock(&oom_reaper_lock);
		if (oom_reaper_list != NULL) {
			tsk = oom_reaper_list;
			oom_reaper_list = tsk->oom_reaper_list;
		}
		spin_unlock(&oom_reaper_lock);

		if (tsk)
			oom_reap_task(tsk);
	}

	return 0;
}

static void wake_oom_reaper(struct task_struct *tsk)
{
	if (!oom_reaper_th)
		return;

	/* tsk is already queued? */
	if (tsk == oom_reaper_list || tsk->oom_reaper_list)
		return;

	get_task_struct(tsk);

	spin_lock(&oom_reaper_lock);
	tsk->oom_reaper_list = oom_reaper_list;
	oom_reaper_list = tsk;
	spin_unlock(&oom_reaper_lock);
	trace_wake_reaper(tsk->pid);
	wake_up(&oom_reaper_wait);
}

static int __init oom_init(void)
{
	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
	if (IS_ERR(oom_reaper_th)) {
		pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
				PTR_ERR(oom_reaper_th));
		oom_reaper_th = NULL;
	}
	return 0;
}
subsys_initcall(oom_init)
#else
static inline void wake_oom_reaper(struct task_struct *tsk)
{
}
#endif /* CONFIG_MMU */

/**
 * mark_oom_victim - mark the given task as OOM victim
 * @tsk: task to mark
 *
 * Has to be called with oom_lock held and never after the OOM killer has
 * already been disabled.
 *
 * tsk->mm has to be non-NULL and the caller has to guarantee it is stable
 * (either under task_lock or by operating on current).
 */
static void mark_oom_victim(struct task_struct *tsk)
{
	struct mm_struct *mm = tsk->mm;

	WARN_ON(oom_killer_disabled);
	/* OOM killer might race with memcg OOM */
	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
		return;

	/*
	 * oom_mm is bound to the signal struct lifetime.
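	 * Grabbing a reference here keeps the mm pointer valid for the oom
	 * reaper even after the victim has detached its own ->mm in exit_mm();
	 * the cmpxchg() makes sure it is installed and pinned only once.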
	 */
	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
		mmgrab(tsk->signal->oom_mm);

	/*
	 * Make sure that the task is woken up from uninterruptible sleep
	 * if it is frozen, because otherwise the OOM killer would not be able
	 * to free any memory and would livelock. freezing_slow_path will tell
	 * the freezer that TIF_MEMDIE tasks should be ignored.
	 */
	__thaw_task(tsk);
	atomic_inc(&oom_victims);
	trace_mark_victim(tsk->pid);
}

/**
 * exit_oom_victim - note the exit of an OOM victim
 */
void exit_oom_victim(void)
{
	clear_thread_flag(TIF_MEMDIE);

	if (!atomic_dec_return(&oom_victims))
		wake_up_all(&oom_victims_wait);
}

/**
 * oom_killer_enable - enable OOM killer
 */
void oom_killer_enable(void)
{
	oom_killer_disabled = false;
	pr_info("OOM killer enabled.\n");
}

/**
 * oom_killer_disable - disable OOM killer
 * @timeout: maximum timeout to wait for oom victims in jiffies
 *
 * Forces all page allocations to fail rather than trigger the OOM killer.
 * Will block and wait until all OOM victims are killed or the given
 * timeout expires.
 *
 * The function cannot be called when there are runnable user tasks because
 * userspace would see unexpected allocation failures as a result. Any new
 * use of this function should be discussed with the MM people.
 *
 * Returns true if successful and false if the OOM killer cannot be
 * disabled.
 */
bool oom_killer_disable(signed long timeout)
{
	signed long ret;

	/*
	 * Make sure to not race with an ongoing OOM killer. Check that
	 * current is not killed (possibly due to sharing the victim's memory).
	 */
	if (mutex_lock_killable(&oom_lock))
		return false;
	oom_killer_disabled = true;
	mutex_unlock(&oom_lock);

	ret = wait_event_interruptible_timeout(oom_victims_wait,
			!atomic_read(&oom_victims), timeout);
	if (ret <= 0) {
		oom_killer_enable();
		return false;
	}
	pr_info("OOM killer disabled.\n");

	return true;
}

static inline bool __task_will_free_mem(struct task_struct *task)
{
	struct signal_struct *sig = task->signal;

	/*
	 * A coredumping process may sleep for an extended period in exit_mm(),
	 * so the oom killer cannot assume that the process will promptly exit
	 * and release memory.
	 */
	if (sig->flags & SIGNAL_GROUP_COREDUMP)
		return false;

	if (sig->flags & SIGNAL_GROUP_EXIT)
		return true;

	if (thread_group_empty(task) && (task->flags & PF_EXITING))
		return true;

	return false;
}

/*
 * Checks whether the given task is dying or exiting and likely to
 * release its address space. This means that all threads and processes
 * sharing the same mm have to be killed or exiting.
 * The caller has to make sure that task->mm is stable (hold task_lock or
 * operate on current).
 */
static bool task_will_free_mem(struct task_struct *task)
{
	struct mm_struct *mm = task->mm;
	struct task_struct *p;
	bool ret = true;

	/*
	 * Skip tasks without an mm because they might have already passed
	 * exit_mm and exit_oom_victim. The oom_reaper could have rescued
	 * that, but do not rely on it for now. We can consider using
	 * find_lock_task_mm() in the future.
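	 * (find_lock_task_mm() would let us pick the mm up from another live
	 * thread of the group, at the cost of taking the task lock here.)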
	 */
	if (!mm)
		return false;

	if (!__task_will_free_mem(task))
		return false;

	/*
	 * This task has already been drained by the oom reaper, so there is
	 * only a small chance it will free any more.
	 */
	if (test_bit(MMF_OOM_SKIP, &mm->flags))
		return false;

	if (atomic_read(&mm->mm_users) <= 1)
		return true;

	/*
	 * Make sure that all tasks which share the mm with the given task
	 * are dying as well to make sure that a) nobody pins its mm and
	 * b) the task is also reapable by the oom reaper.
	 */
	rcu_read_lock();
	for_each_process(p) {
		if (!process_shares_mm(p, mm))
			continue;
		if (same_thread_group(task, p))
			continue;
		ret = __task_will_free_mem(p);
		if (!ret)
			break;
	}
	rcu_read_unlock();

	return ret;
}

static void oom_kill_process(struct oom_control *oc, const char *message)
{
	struct task_struct *p = oc->chosen;
	unsigned int points = oc->chosen_points;
	struct task_struct *victim = p;
	struct task_struct *child;
	struct task_struct *t;
	struct mm_struct *mm;
	unsigned int victim_points = 0;
	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
					      DEFAULT_RATELIMIT_BURST);
	bool can_oom_reap = true;

	/*
	 * If the task is already exiting, don't alarm the sysadmin or kill
	 * its children or threads, just give it access to memory reserves
	 * so it can die quickly.
	 */
	task_lock(p);
	if (task_will_free_mem(p)) {
		mark_oom_victim(p);
		wake_oom_reaper(p);
		task_unlock(p);
		put_task_struct(p);
		return;
	}
	task_unlock(p);

	if (__ratelimit(&oom_rs))
		dump_header(oc, p);

	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
		message, task_pid_nr(p), p->comm, points);

	/*
	 * If any of p's children has a different mm and is eligible for kill,
	 * the one with the highest oom_badness() score is sacrificed for its
	 * parent. This attempts to lose the minimal amount of work done while
	 * still freeing memory.
	 */
	read_lock(&tasklist_lock);
	for_each_thread(p, t) {
		list_for_each_entry(child, &t->children, sibling) {
			unsigned int child_points;

			if (process_shares_mm(child, p->mm))
				continue;
			/*
			 * oom_badness() returns 0 if the thread is unkillable
			 */
			child_points = oom_badness(child,
				oc->memcg, oc->nodemask, oc->totalpages);
			if (child_points > victim_points) {
				put_task_struct(victim);
				victim = child;
				victim_points = child_points;
				get_task_struct(victim);
			}
		}
	}
	read_unlock(&tasklist_lock);

	p = find_lock_task_mm(victim);
	if (!p) {
		put_task_struct(victim);
		return;
	} else if (victim != p) {
		get_task_struct(p);
		put_task_struct(victim);
		victim = p;
	}

	/* Get a reference to safely compare mm after task_unlock(victim) */
	mm = victim->mm;
	mmgrab(mm);

	/* Raise event before sending signal: task reaper must see this */
	count_vm_event(OOM_KILL);
	count_memcg_event_mm(mm, OOM_KILL);

	/*
	 * We should send SIGKILL before granting access to memory reserves
	 * in order to prevent the OOM victim from depleting the memory
	 * reserves from the user space under its control.
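	 * The access is granted by mark_oom_victim() below: the page allocator
	 * recognizes oom victims and lets them dip into the reserves so they
	 * can exit quickly.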
	 */
	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
	mark_oom_victim(victim);
	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
		task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
		K(get_mm_counter(victim->mm, MM_ANONPAGES)),
		K(get_mm_counter(victim->mm, MM_FILEPAGES)),
		K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
	task_unlock(victim);

	/*
	 * Kill all user processes sharing victim->mm in other thread groups, if
	 * any. They don't get access to memory reserves, though, to avoid
	 * depletion of all memory. This prevents mm->mmap_sem livelock when an
	 * oom killed thread cannot exit because it requires the semaphore and
	 * it's contended by another thread trying to allocate memory itself.
	 * That thread will now get access to memory reserves since it has a
	 * pending fatal signal.
	 */
	rcu_read_lock();
	for_each_process(p) {
		if (!process_shares_mm(p, mm))
			continue;
		if (same_thread_group(p, victim))
			continue;
		if (is_global_init(p)) {
			can_oom_reap = false;
			set_bit(MMF_OOM_SKIP, &mm->flags);
			pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
					task_pid_nr(victim), victim->comm,
					task_pid_nr(p), p->comm);
			continue;
		}
		/*
		 * No use_mm() user needs to read from userspace, so we are
		 * OK to reap it.
		 */
		if (unlikely(p->flags & PF_KTHREAD))
			continue;
		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
	}
	rcu_read_unlock();

	if (can_oom_reap)
		wake_oom_reaper(victim);

	mmdrop(mm);
	put_task_struct(victim);
}
#undef K

/*
 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
 */
static void check_panic_on_oom(struct oom_control *oc,
			       enum oom_constraint constraint)
{
	if (likely(!sysctl_panic_on_oom))
		return;
	if (sysctl_panic_on_oom != 2) {
		/*
		 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
		 * does not panic for cpuset, mempolicy, or memcg allocation
		 * failures.
		 */
		if (constraint != CONSTRAINT_NONE)
			return;
	}
	/* Do not panic for oom kills triggered by sysrq */
	if (is_sysrq_oom(oc))
		return;
	dump_header(oc, NULL);
	panic("Out of memory: %s panic_on_oom is enabled\n",
		sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}

static BLOCKING_NOTIFIER_HEAD(oom_notify_list);

int register_oom_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_oom_notifier);

int unregister_oom_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_oom_notifier);

/**
 * out_of_memory - kill the "best" process when we run out of memory
 * @oc: pointer to struct oom_control
 *
 * If we run out of memory, we have the choice between either
 * killing a random task (bad), letting the system crash (worse)
 * OR try to be smart about which process to kill. Note that we
 * don't have to be perfect here, we just have to be good.
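 *
 * Returns false if the OOM killer is disabled or if no eligible victim could
 * be selected; returns true otherwise.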
 */
bool out_of_memory(struct oom_control *oc)
{
	unsigned long freed = 0;
	enum oom_constraint constraint = CONSTRAINT_NONE;

	if (oom_killer_disabled)
		return false;

	if (!is_memcg_oom(oc)) {
		blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
		if (freed > 0)
			/* Got some memory back in the last second. */
			return true;
	}

	/*
	 * If current has a pending SIGKILL or is exiting, then automatically
	 * select it. The goal is to allow it to allocate so that it may
	 * quickly exit and free its memory.
	 */
	if (task_will_free_mem(current)) {
		mark_oom_victim(current);
		wake_oom_reaper(current);
		return true;
	}

	/*
	 * The OOM killer does not compensate for IO-less reclaim.
	 * pagefault_out_of_memory lost its gfp context so we have to
	 * make sure to exclude the 0 mask - all other users should have at
	 * least ___GFP_DIRECT_RECLAIM to get here.
	 */
	if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
		return true;

	/*
	 * Check if there were limitations on the allocation (only relevant for
	 * NUMA and memcg) that may require different handling.
	 */
	constraint = constrained_alloc(oc);
	if (constraint != CONSTRAINT_MEMORY_POLICY)
		oc->nodemask = NULL;
	check_panic_on_oom(oc, constraint);

	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
		get_task_struct(current);
		oc->chosen = current;
		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
		return true;
	}

	select_bad_process(oc);
	/* Found nothing?!?! Either we hang forever, or we panic. */
	if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
		dump_header(oc, NULL);
		panic("Out of memory and no killable processes...\n");
	}
	if (oc->chosen && oc->chosen != (void *)-1UL) {
		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
				 "Memory cgroup out of memory");
		/*
		 * Give the killed process a good chance to exit before trying
		 * to allocate memory again.
		 */
		schedule_timeout_killable(1);
	}
	return !!oc->chosen;
}

/*
 * The pagefault handler calls here because it is out of memory, so kill a
 * memory-hogging task. If oom_lock is held by somebody else, a parallel oom
 * killing is already in progress so do nothing.
 */
void pagefault_out_of_memory(void)
{
	struct oom_control oc = {
		.zonelist = NULL,
		.nodemask = NULL,
		.memcg = NULL,
		.gfp_mask = 0,
		.order = 0,
	};

	if (mem_cgroup_oom_synchronize(true))
		return;

	if (!mutex_trylock(&oom_lock))
		return;
	out_of_memory(&oc);
	mutex_unlock(&oom_lock);
}