/*
 *  linux/mm/oom_kill.c
 *
 *  Copyright (C)  1998,2000  Rik van Riel
 *      Thanks go out to Claus Fischer for some serious inspiration and
 *      for goading me into coding this file...
 *
 *  The routines in this file are used to kill a process when
 *  we're seriously out of memory. This gets called from __alloc_pages()
 *  in mm/page_alloc.c when we really run out of memory.
 *
 *  Since we won't call these routines often (on a well-configured
 *  machine) this file will double as a 'coding guide' and a signpost
 *  for newbie kernel hackers. It features several pointers to major
 *  kernel subsystems and hints as to where to find out what things do.
 */

#include <linux/oom.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
#include <linux/security.h>

int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks;
static DEFINE_SPINLOCK(zone_scan_lock);
/* #define DEBUG */

/*
 * Do the allowed memory nodes of any of the target task's threads
 * overlap ours?
 */
static int has_intersects_mems_allowed(struct task_struct *tsk)
{
        struct task_struct *t;

        t = tsk;
        do {
                if (cpuset_mems_allowed_intersects(current, t))
                        return 1;
                t = next_thread(t);
        } while (t != tsk);

        return 0;
}

/**
 * badness - calculate a numeric value for how bad this task has been
 * @p: task struct of the task whose badness we calculate
 * @uptime: current uptime in seconds
 *
 * The formula used is relatively simple and documented inline in the
 * function. The main rationale is that we want to select a good task
 * to kill when we run out of memory.
 *
 * Good in this context means that:
 * 1) we lose the minimum amount of work done
 * 2) we recover a large amount of memory
 * 3) we don't kill anything innocent of eating tons of memory
 * 4) we want to kill the minimum number of processes (one)
 * 5) we try to kill the process the user expects us to kill, this
 *    algorithm has been meticulously tuned to meet the principle
 *    of least surprise ... (be careful when you change it)
 */

unsigned long badness(struct task_struct *p, unsigned long uptime)
{
        unsigned long points, cpu_time, run_time;
        struct mm_struct *mm;
        struct task_struct *child;
        int oom_adj = p->signal->oom_adj;
        struct task_cputime task_time;
        unsigned long utime;
        unsigned long stime;

        if (oom_adj == OOM_DISABLE)
                return 0;

        task_lock(p);
        mm = p->mm;
        if (!mm) {
                task_unlock(p);
                return 0;
        }

        /*
         * The memory size of the process is the basis for the badness.
         */
        points = mm->total_vm;

        /*
         * After this unlock we can no longer dereference local variable `mm'
         */
        task_unlock(p);

        /*
         * swapoff can easily use up all memory, so kill those first.
         */
        if (p->flags & PF_OOM_ORIGIN)
                return ULONG_MAX;

        /*
         * Processes which fork a lot of child processes are likely
         * a good choice. We add half the vmsize of the children if they
         * have their own mm. This keeps forking servers from flooding the
         * machine with an endless amount of children. In case a single
         * child is eating the vast majority of memory, adding only half
         * to the parent still makes the child our kill candidate of choice.
         */
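        /*
         * Illustrative example (numbers assumed, not taken from a real
         * workload): if the parent has total_vm == 10000 pages and one
         * child with its own mm has total_vm == 50000 pages, the loop
         * below adds 50000/2 + 1 == 25001 points to the parent.  The
         * child itself, scored on its full 50000 pages, still ends up
         * with the higher badness and remains the preferred victim.
         */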
        list_for_each_entry(child, &p->children, sibling) {
                task_lock(child);
                if (child->mm != mm && child->mm)
                        points += child->mm->total_vm/2 + 1;
                task_unlock(child);
        }

        /*
         * CPU time is in tens of seconds and run time is in thousands
         * of seconds. There is no particular reason for this other than
         * that it turned out to work very well in practice.
         */
        thread_group_cputime(p, &task_time);
        utime = cputime_to_jiffies(task_time.utime);
        stime = cputime_to_jiffies(task_time.stime);
        cpu_time = (utime + stime) >> (SHIFT_HZ + 3);

        if (uptime >= p->start_time.tv_sec)
                run_time = (uptime - p->start_time.tv_sec) >> 10;
        else
                run_time = 0;

        if (cpu_time)
                points /= int_sqrt(cpu_time);
        if (run_time)
                points /= int_sqrt(int_sqrt(run_time));

        /*
         * Niced processes are most likely less important, so double
         * their badness points.
         */
        if (task_nice(p) > 0)
                points *= 2;

        /*
         * Superuser processes are usually more important, so we make it
         * less likely that we kill those.
         */
        if (has_capability_noaudit(p, CAP_SYS_ADMIN) ||
            has_capability_noaudit(p, CAP_SYS_RESOURCE))
                points /= 4;

        /*
         * We don't want to kill a process with direct hardware access.
         * Not only could that mess up the hardware, but usually users
         * tend to only have this flag set on applications they think
         * of as important.
         */
        if (has_capability_noaudit(p, CAP_SYS_RAWIO))
                points /= 4;

        /*
         * If p's nodes don't overlap ours, it may still help to kill p
         * because p may have allocated or otherwise mapped memory on
         * this node before. However it will be less likely.
         */
        if (!has_intersects_mems_allowed(p))
                points /= 8;

        /*
         * Adjust the score by oom_adj.
         */
        if (oom_adj) {
                if (oom_adj > 0) {
                        if (!points)
                                points = 1;
                        points <<= oom_adj;
                } else
                        points >>= -(oom_adj);
        }

#ifdef DEBUG
        printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n",
                p->pid, p->comm, points);
#endif
        return points;
}

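/*
 * A rough worked example of the heuristic above, with assumed numbers
 * and before the nice/capability adjustments: a task mapping
 * total_vm == 100000 pages that has consumed about 1800 seconds of CPU
 * (cpu_time ~= 1800 / 8 == 225, when HZ is a power of two) and has been
 * running for about a day (run_time == 86400 >> 10 == 84) scores
 *
 *      100000 / int_sqrt(225) == 100000 / 15 == 6666
 *      6666 / int_sqrt(int_sqrt(84)) == 6666 / 3 == 2222
 *
 * points.  An oom_adj of +2 would then shift this up to
 * 2222 << 2 == 8888, while an oom_adj of -2 would shift it down to 555.
 */
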
/*
 * Determine the type of allocation constraint.
 */
#ifdef CONFIG_NUMA
static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
                                    gfp_t gfp_mask, nodemask_t *nodemask)
{
        struct zone *zone;
        struct zoneref *z;
        enum zone_type high_zoneidx = gfp_zone(gfp_mask);

        /*
         * We reach here with __GFP_THISNODE only when __GFP_NOFAIL is
         * also used, so we should avoid killing current and instead
         * fall back to killing some other task.  Ideally this would be
         * CONSTRAINT_THISNODE, but there is no way to handle that yet.
         */
        if (gfp_mask & __GFP_THISNODE)
                return CONSTRAINT_NONE;

        /*
         * The nodemask here is the nodemask passed to alloc_pages().
         * Cpuset doesn't use this nodemask for its hardwall/softwall/
         * hierarchy feature; mempolicy is currently the only user of
         * the nodemask here.  Check whether the mempolicy's nodemask
         * contains all of N_HIGH_MEMORY.
         */
        if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask))
                return CONSTRAINT_MEMORY_POLICY;

        /* Check whether this allocation failure is caused by cpuset's wall function */
        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                        high_zoneidx, nodemask)
                if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
                        return CONSTRAINT_CPUSET;

        return CONSTRAINT_NONE;
}
#else
static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
                                gfp_t gfp_mask, nodemask_t *nodemask)
{
        return CONSTRAINT_NONE;
}
#endif

/*
 * Simple selection loop. We choose the process with the highest
 * number of 'points'. We expect the caller will lock the tasklist.
 *
 * (not docbooked, we don't want this one cluttering up the manual)
 */
static struct task_struct *select_bad_process(unsigned long *ppoints,
                                                struct mem_cgroup *mem)
{
        struct task_struct *p;
        struct task_struct *chosen = NULL;
        struct timespec uptime;
        *ppoints = 0;

        do_posix_clock_monotonic_gettime(&uptime);
        for_each_process(p) {
                unsigned long points;

                /*
                 * skip kernel threads and tasks which have already released
                 * their mm.
                 */
                if (!p->mm)
                        continue;
                /* skip the init task */
                if (is_global_init(p))
                        continue;
                if (mem && !task_in_mem_cgroup(p, mem))
                        continue;

                /*
                 * This task already has access to memory reserves and is
                 * being killed. Don't allow any other task access to the
                 * memory reserve.
                 *
                 * Note: this may have a chance of deadlock if it gets
                 * blocked waiting for another task which itself is waiting
                 * for memory. Is there a better alternative?
                 */
                if (test_tsk_thread_flag(p, TIF_MEMDIE))
                        return ERR_PTR(-1UL);

                /*
                 * This is in the process of releasing memory so wait for it
                 * to finish before killing some other task by mistake.
                 *
                 * However, if p is the current task, we allow the 'kill' to
                 * go ahead if it is exiting: this will simply set TIF_MEMDIE,
                 * which will allow it to gain access to memory reserves in
                 * the process of exiting and releasing its resources.
                 * Otherwise we could get an easy OOM deadlock.
                 */
                if (p->flags & PF_EXITING) {
                        if (p != current)
                                return ERR_PTR(-1UL);

                        chosen = p;
                        *ppoints = ULONG_MAX;
                }

                if (p->signal->oom_adj == OOM_DISABLE)
                        continue;

                points = badness(p, uptime.tv_sec);
                if (points > *ppoints || !chosen) {
                        chosen = p;
                        *ppoints = points;
                }
        }

        return chosen;
}

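/*
 * For the callers below: a non-NULL, non-error return from
 * select_bad_process() is the chosen victim, NULL means no killable
 * task was found, and ERR_PTR(-1UL) means another task is already
 * exiting or has TIF_MEMDIE set, so the caller should back off rather
 * than pick a second victim.
 */
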
/**
 * dump_tasks - dump current memory state of all system tasks
 * @mem: target memory controller
 *
 * Dumps the current memory state of all system tasks, excluding kernel threads.
 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
 * score, and name.
 *
 * If @mem is non-NULL, only tasks that are members of the mem_cgroup are
 * shown.
 *
 * Call with tasklist_lock read-locked.
 */
static void dump_tasks(const struct mem_cgroup *mem)
{
        struct task_struct *g, *p;

        printk(KERN_INFO "[ pid ]   uid  tgid total_vm      rss cpu oom_adj "
               "name\n");
        do_each_thread(g, p) {
                struct mm_struct *mm;

                if (mem && !task_in_mem_cgroup(p, mem))
                        continue;
                if (!thread_group_leader(p))
                        continue;

                task_lock(p);
                mm = p->mm;
                if (!mm) {
                        /*
                         * total_vm and rss sizes do not exist for tasks with no
                         * mm so there's no need to report them; they can't be
                         * oom killed anyway.
                         */
                        task_unlock(p);
                        continue;
                }
                printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d     %3d %s\n",
                       p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
                       get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj,
                       p->comm);
                task_unlock(p);
        } while_each_thread(g, p);
}

static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
                        struct mem_cgroup *mem)
{
        pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
                "oom_adj=%d\n",
                current->comm, gfp_mask, order, current->signal->oom_adj);
        task_lock(current);
        cpuset_print_task_mems_allowed(current);
        task_unlock(current);
        dump_stack();
        mem_cgroup_print_oom_info(mem, p);
        show_mem();
        if (sysctl_oom_dump_tasks)
                dump_tasks(mem);
}

#define K(x) ((x) << (PAGE_SHIFT-10))

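/*
 * K() converts a page count to kilobytes for the messages below.  As an
 * illustration (assuming the common PAGE_SHIFT of 12, i.e. 4kB pages),
 * K(x) is x << 2, so an mm with total_vm == 25000 pages is reported as
 * vsz:100000kB.
 */
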
/*
 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAWIO,
 * though it's unlikely that we select a process with CAP_SYS_RAWIO
 * set.
 */
static void __oom_kill_task(struct task_struct *p, int verbose)
{
        if (is_global_init(p)) {
                WARN_ON(1);
                printk(KERN_WARNING "tried to kill init!\n");
                return;
        }

        task_lock(p);
        if (!p->mm) {
                WARN_ON(1);
                printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n",
                        task_pid_nr(p), p->comm);
                task_unlock(p);
                return;
        }

        if (verbose)
                printk(KERN_ERR "Killed process %d (%s) "
                        "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
                        task_pid_nr(p), p->comm,
                        K(p->mm->total_vm),
                        K(get_mm_counter(p->mm, anon_rss)),
                        K(get_mm_counter(p->mm, file_rss)));
        task_unlock(p);

        /*
         * We give our sacrificial lamb high priority and access to
         * all the memory it needs. That way it should be able to
         * exit() and clear out its resources quickly...
         */
        p->rt.time_slice = HZ;
        set_tsk_thread_flag(p, TIF_MEMDIE);

        force_sig(SIGKILL, p);
}

static int oom_kill_task(struct task_struct *p)
{
        /* WARNING: mm may not be dereferenced since we did not obtain its
         * value from get_task_mm(p). This is OK since all we need to do is
         * compare mm to q->mm below.
         *
         * Furthermore, even if mm contains a non-NULL value, p->mm may
         * change to NULL at any time since we do not hold task_lock(p).
         * However, this is of no concern to us.
         */
        if (!p->mm || p->signal->oom_adj == OOM_DISABLE)
                return 1;

        __oom_kill_task(p, 1);

        return 0;
}

static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
                            unsigned long points, struct mem_cgroup *mem,
                            const char *message)
{
        struct task_struct *c;

        if (printk_ratelimit())
                dump_header(p, gfp_mask, order, mem);

        /*
         * If the task is already exiting, don't alarm the sysadmin or kill
         * its children or threads, just set TIF_MEMDIE so it can die quickly
         */
        if (p->flags & PF_EXITING) {
                __oom_kill_task(p, 0);
                return 0;
        }

        printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
                message, task_pid_nr(p), p->comm, points);

        /* Try to kill a child first */
        list_for_each_entry(c, &p->children, sibling) {
                if (c->mm == p->mm)
                        continue;
                if (!oom_kill_task(c))
                        return 0;
        }
        return oom_kill_task(p);
}

#ifdef CONFIG_CGROUP_MEM_RES_CTLR
void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
{
        unsigned long points = 0;
        struct task_struct *p;

        read_lock(&tasklist_lock);
retry:
        p = select_bad_process(&points, mem);
        if (PTR_ERR(p) == -1UL)
                goto out;

        if (!p)
                p = current;

        if (oom_kill_process(p, gfp_mask, 0, points, mem,
                                "Memory cgroup out of memory"))
                goto retry;
out:
        read_unlock(&tasklist_lock);
}
#endif

static BLOCKING_NOTIFIER_HEAD(oom_notify_list);

int register_oom_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_oom_notifier);

int unregister_oom_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_oom_notifier);

/*
 * Try to acquire the OOM killer lock for the zones in zonelist.  Returns zero
 * if a parallel OOM killing is already taking place that includes a zone in
 * the zonelist.  Otherwise, locks all zones in the zonelist and returns 1.
 */
int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
{
        struct zoneref *z;
        struct zone *zone;
        int ret = 1;

        spin_lock(&zone_scan_lock);
        for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
                if (zone_is_oom_locked(zone)) {
                        ret = 0;
                        goto out;
                }
        }

        for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
                /*
                 * Lock each zone in the zonelist under zone_scan_lock so a
                 * parallel invocation of try_set_zone_oom() doesn't succeed
                 * when it shouldn't.
                 */
                zone_set_flag(zone, ZONE_OOM_LOCKED);
        }

out:
        spin_unlock(&zone_scan_lock);
        return ret;
}

/*
 * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
 * allocation attempts with zonelists containing them may now recall the OOM
 * killer, if necessary.
 */
void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
{
        struct zoneref *z;
        struct zone *zone;

        spin_lock(&zone_scan_lock);
        for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
                zone_clear_flag(zone, ZONE_OOM_LOCKED);
        }
        spin_unlock(&zone_scan_lock);
}

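/*
 * A minimal sketch of how a caller is expected to pair the two helpers
 * above around an OOM kill (the real call site lives in mm/page_alloc.c
 * and adds more error handling):
 *
 *      if (try_set_zone_oom(zonelist, gfp_mask)) {
 *              out_of_memory(zonelist, gfp_mask, order, nodemask);
 *              clear_zonelist_oom(zonelist, gfp_mask);
 *      }
 *
 * so that at most one OOM kill is in flight for any given zone.
 */
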
/*
 * Must be called with tasklist_lock held for read.
 */
static void __out_of_memory(gfp_t gfp_mask, int order)
{
        struct task_struct *p;
        unsigned long points;

        if (sysctl_oom_kill_allocating_task)
                if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
                                "Out of memory (oom_kill_allocating_task)"))
                        return;
retry:
        /*
         * Rambo mode: Shoot down a process and hope it solves whatever
         * issues we may have.
         */
        p = select_bad_process(&points, NULL);

        if (PTR_ERR(p) == -1UL)
                return;

        /* Found nothing?!?! Either we hang forever, or we panic. */
        if (!p) {
                read_unlock(&tasklist_lock);
                dump_header(NULL, gfp_mask, order, NULL);
                panic("Out of memory and no killable processes...\n");
        }

        if (oom_kill_process(p, gfp_mask, order, points, NULL,
                             "Out of memory"))
                goto retry;
}

/*
 * The pagefault handler calls into here because it is out of memory but
 * doesn't know exactly how or why.
 */
void pagefault_out_of_memory(void)
{
        unsigned long freed = 0;

        blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
        if (freed > 0)
                /* Got some memory back in the last second. */
                return;

        /*
         * If this came in from a memcg, the memcg's own OOM killer has
         * already been invoked, so it is not worth going for a
         * system-wide OOM.
         */
        if (mem_cgroup_oom_called(current))
                goto rest_and_return;

        if (sysctl_panic_on_oom)
                panic("out of memory from page fault. panic_on_oom is selected.\n");

        read_lock(&tasklist_lock);
        __out_of_memory(0, 0); /* unknown gfp_mask and order */
        read_unlock(&tasklist_lock);

        /*
         * Give "p" a good chance of killing itself before we
         * retry allocating memory.
         */
rest_and_return:
        if (!test_thread_flag(TIF_MEMDIE))
                schedule_timeout_uninterruptible(1);
}

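/*
 * For reference, the sysctl_panic_on_oom modes as implemented in
 * pagefault_out_of_memory() above and out_of_memory() below (the knob
 * is normally exposed as /proc/sys/vm/panic_on_oom):
 *
 *      0 - kill a task and try to carry on (the default),
 *      1 - panic on a system-wide OOM, but only kill a task when the
 *          failure is confined to a cpuset or mempolicy,
 *      2 - always panic, regardless of constraint.
 */
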
/**
 * out_of_memory - kill the "best" process when we run out of memory
 * @zonelist: zonelist pointer
 * @gfp_mask: memory allocation flags
 * @order: amount of memory being requested as a power of 2
 * @nodemask: nodemask passed to the page allocator
 *
 * If we run out of memory, we have the choice between either
 * killing a random task (bad), letting the system crash (worse)
 * OR trying to be smart about which process to kill. Note that we
 * don't have to be perfect here, we just have to be good.
 */
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
                int order, nodemask_t *nodemask)
{
        unsigned long freed = 0;
        enum oom_constraint constraint;

        blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
        if (freed > 0)
                /* Got some memory back in the last second. */
                return;

        if (sysctl_panic_on_oom == 2) {
                dump_header(NULL, gfp_mask, order, NULL);
                panic("out of memory. Compulsory panic_on_oom is selected.\n");
        }

        /*
         * Check if there were limitations on the allocation (only relevant for
         * NUMA) that may require different handling.
         */
        constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
        read_lock(&tasklist_lock);

        switch (constraint) {
        case CONSTRAINT_MEMORY_POLICY:
                oom_kill_process(current, gfp_mask, order, 0, NULL,
                                "No available memory (MPOL_BIND)");
                break;

        case CONSTRAINT_NONE:
                if (sysctl_panic_on_oom) {
                        dump_header(NULL, gfp_mask, order, NULL);
                        panic("out of memory. panic_on_oom is selected\n");
                }
                /* Fall-through */
        case CONSTRAINT_CPUSET:
                __out_of_memory(gfp_mask, order);
                break;
        }

        read_unlock(&tasklist_lock);

        /*
         * Give "p" a good chance of killing itself before we
         * retry allocating memory, unless "p" is current.
         */
        if (!test_thread_flag(TIF_MEMDIE))
                schedule_timeout_uninterruptible(1);
}