/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>

#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5

/*
 * Statistics for the memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as rss */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
	struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
};

/*
 * Used for accounting with irqs disabled, so there is no need to bump the
 * preempt count.
 */
static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
		enum mem_cgroup_stat_index idx, int val)
{
	stat->count[idx] += val;
}

static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 ret = 0;
	for_each_possible_cpu(cpu)
		ret += stat->cpustat[cpu].count[idx];
	return ret;
}

/*
 * Per-zone information for the memory controller.
 */
struct mem_cgroup_per_zone {
	/*
	 * spin_lock to protect the per-cgroup LRU
	 */
	spinlock_t		lru_lock;
	struct list_head	lists[NR_LRU_LISTS];
	unsigned long		count[NR_LRU_LISTS];
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark.
 * Maybe even add a low water mark, such that no reclaim occurs from a cgroup
 * that is at its low water mark; this is a feature that will be implemented
 * much later.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * Per-cgroup active and inactive lists, similar to the
	 * per-zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;

	int	prev_priority;	/* for recording reclaim priority */
	/*
	 * statistics.
	 */
	struct mem_cgroup_stat stat;
};
static struct mem_cgroup init_mem_cgroup;

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
	NR_CHARGE_TYPE,
};

/* only for here (for easy reading.) */
#define PCGF_CACHE	(1UL << PCG_CACHE)
#define PCGF_USED	(1UL << PCG_USED)
#define PCGF_ACTIVE	(1UL << PCG_ACTIVE)
#define PCGF_LOCK	(1UL << PCG_LOCK)
#define PCGF_FILE	(1UL << PCG_FILE)
static const unsigned long
pcg_default_flags[NR_CHARGE_TYPE] = {
	PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK,	/* File Cache */
	PCGF_ACTIVE | PCGF_USED | PCGF_LOCK,		/* Anon */
	PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
	0,						/* FORCE */
};

/*
 * Always modified under the lru lock, so there is no need to
 * preempt_disable().
 */
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
					 struct page_cgroup *pc,
					 bool charge)
{
	int val = (charge) ? 1 : -1;
	struct mem_cgroup_stat *stat = &mem->stat;
	struct mem_cgroup_stat_cpu *cpustat;

	VM_BUG_ON(!irqs_disabled());

	cpustat = &stat->cpustat[smp_processor_id()];
	if (PageCgroupCache(pc))
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
	else
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);

	if (charge)
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
	else
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
}

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);

	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
					enum lru_list idx)
{
	int nid, zid;
	struct mem_cgroup_per_zone *mz;
	u64 total = 0;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
		}
	return total;
}

static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
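	 * Callers must therefore be prepared for a NULL return.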
	 */
	if (unlikely(!p))
		return NULL;

	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}

static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
			struct page_cgroup *pc)
{
	int lru = LRU_BASE;

	if (PageCgroupUnevictable(pc))
		lru = LRU_UNEVICTABLE;
	else {
		if (PageCgroupActive(pc))
			lru += LRU_ACTIVE;
		if (PageCgroupFile(pc))
			lru += LRU_FILE;
	}

	MEM_CGROUP_ZSTAT(mz, lru) -= 1;

	mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
	list_del(&pc->lru);
}

static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
				struct page_cgroup *pc)
{
	int lru = LRU_BASE;

	if (PageCgroupUnevictable(pc))
		lru = LRU_UNEVICTABLE;
	else {
		if (PageCgroupActive(pc))
			lru += LRU_ACTIVE;
		if (PageCgroupFile(pc))
			lru += LRU_FILE;
	}

	MEM_CGROUP_ZSTAT(mz, lru) += 1;
	list_add(&pc->lru, &mz->lists[lru]);

	mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
}

static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
	int active = PageCgroupActive(pc);
	int file = PageCgroupFile(pc);
	int unevictable = PageCgroupUnevictable(pc);
	enum lru_list from = unevictable ? LRU_UNEVICTABLE :
				(LRU_FILE * !!file + !!active);

	if (lru == from)
		return;

	MEM_CGROUP_ZSTAT(mz, from) -= 1;
	/*
	 * While this is done under mz->lru_lock, other flags, which are not
	 * related to the LRU, may be modified outside the lock.
	 * We have to use atomic set/clear flags.
	 */
	if (is_unevictable_lru(lru)) {
		ClearPageCgroupActive(pc);
		SetPageCgroupUnevictable(pc);
	} else {
		if (is_active_lru(lru))
			SetPageCgroupActive(pc);
		else
			ClearPageCgroupActive(pc);
		ClearPageCgroupUnevictable(pc);
	}

	MEM_CGROUP_ZSTAT(mz, lru) += 1;
	list_move(&pc->lru, &mz->lists[lru]);
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;

	task_lock(task);
	ret = task->mm && mm_match_cgroup(task->mm, mem);
	task_unlock(task);
	return ret;
}

/*
 * This routine assumes that the appropriate zone's lru lock is already held.
 */
void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;

	if (mem_cgroup_subsys.disabled)
		return;

	/*
	 * We cannot lock_page_cgroup while holding the zone's lru_lock,
	 * because other holders of lock_page_cgroup can be interrupted
	 * with an attempt to rotate_reclaimable_page.  But we cannot
	 * safely get to page_cgroup without it, so just try_lock it:
	 * mem_cgroup_isolate_pages allows for a page left on the wrong list.
	 */
	pc = lookup_page_cgroup(page);
	if (!trylock_page_cgroup(pc))
		return;
	if (pc && PageCgroupUsed(pc)) {
		mz = page_cgroup_zoneinfo(pc);
		spin_lock_irqsave(&mz->lru_lock, flags);
		__mem_cgroup_move_lists(pc, lru);
		spin_unlock_irqrestore(&mz->lru_lock, flags);
	}
	unlock_page_cgroup(pc);
}

/*
 * Calculate mapped_ratio under the memory controller. This will be used in
 * vmscan.c for determining whether we have to reclaim mapped pages.
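 *
 * The value returned is roughly
 *	rss * 100 / (usage_in_pages + 1)
 * i.e. the percentage of this cgroup's charged pages that are accounted
 * as rss.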
 */
int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
{
	long total, rss;

	/*
	 * usage is recorded in bytes. But, here, we assume the number of
	 * physical pages can be represented by "long" on any arch.
	 */
	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
	return (int)((rss * 100L) / total);
}

/*
 * prev_priority control...this will be used in the memory reclaim path.
 */
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
	return mem->prev_priority;
}

void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	if (priority < mem->prev_priority)
		mem->prev_priority = priority;
}

void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	mem->prev_priority = priority;
}

/*
 * Calculate the number of pages to be scanned in this priority/zone.
 * See also vmscan.c
 *
 * priority starts from "DEF_PRIORITY" and is decremented in each loop.
 * (see include/linux/mmzone.h)
 */

long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
					int priority, enum lru_list lru)
{
	long nr_pages;
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);

	nr_pages = MEM_CGROUP_ZSTAT(mz, lru);

	return (nr_pages >> priority);
}

unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active, int file)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = z->zone_pgdat->node_id;
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;
	int lru = LRU_FILE * !!file + !!active;

	BUG_ON(!mem_cont);
	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
	src = &mz->lists[lru];

	spin_lock(&mz->lru_lock);
	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
		if (scan >= nr_to_scan)
			break;
		if (unlikely(!PageCgroupUsed(pc)))
			continue;
		page = pc->page;

		if (unlikely(!PageLRU(page)))
			continue;

		/*
		 * TODO: play better with lumpy reclaim, grabbing anything.
		 */
		if (PageUnevictable(page) ||
		    (PageActive(page) && !active) ||
		    (!PageActive(page) && active)) {
			__mem_cgroup_move_lists(pc, page_lru(page));
			continue;
		}

		scan++;
		list_move(&pc->lru, &pc_list);

		if (__isolate_lru_page(page, mode, file) == 0) {
			list_move(&page->lru, dst);
			nr_taken++;
		}
	}

	list_splice(&pc_list, src);
	spin_unlock(&mz->lru_lock);

	*scanned = scan;
	return nr_taken;
}

/*
 * Charge the memory controller for page usage.
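 *
 * The charge path: look up the page's page_cgroup, charge the cgroup's
 * res_counter (reclaiming from the group and retrying, up to
 * MEM_CGROUP_RECLAIM_RETRIES unsuccessful attempts, when the limit is hit),
 * then link the page_cgroup onto the per-zone LRU list.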
 * Returns
 *	0	if the charge was successful
 *	< 0	if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype,
				struct mem_cgroup *memcg)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;

	pc = lookup_page_cgroup(page);
	/* can happen at boot */
	if (unlikely(!pc))
		return 0;
	prefetchw(pc);
	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set; if so, charge the init_mm (happens for pagecache usage).
	 */

	if (likely(!memcg)) {
		rcu_read_lock();
		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!mem)) {
			rcu_read_unlock();
			return 0;
		}
		/*
		 * For every charge from the cgroup, increment the reference
		 * count.
		 */
		css_get(&mem->css);
		rcu_read_unlock();
	} else {
		mem = memcg;
		css_get(&memcg->css);
	}

	while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
		if (!(gfp_mask & __GFP_WAIT))
			goto out;

		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
			continue;

		/*
		 * try_to_free_mem_cgroup_pages() might not give us a full
		 * picture of reclaim. Some pages are reclaimed and might be
		 * moved to swap cache or just unmapped from the cgroup.
		 * Check the limit again to see if the reclaim reduced the
		 * current usage of the cgroup before giving up.
		 */
		if (res_counter_check_under_limit(&mem->res))
			continue;

		if (!nr_retries--) {
			mem_cgroup_out_of_memory(mem, gfp_mask);
			goto out;
		}
	}

	lock_page_cgroup(pc);
	if (unlikely(PageCgroupUsed(pc))) {
		unlock_page_cgroup(pc);
		res_counter_uncharge(&mem->res, PAGE_SIZE);
		css_put(&mem->css);

		goto done;
	}
	pc->mem_cgroup = mem;
	/*
	 * If a page is accounted as page cache, insert it on the inactive
	 * list. If anon, insert it on the active list.
	 */
	pc->flags = pcg_default_flags[ctype];

	mz = page_cgroup_zoneinfo(pc);

	spin_lock_irqsave(&mz->lru_lock, flags);
	__mem_cgroup_add_list(mz, pc);
	spin_unlock_irqrestore(&mz->lru_lock, flags);
	unlock_page_cgroup(pc);

done:
	return 0;
out:
	css_put(&mem->css);
	return -ENOMEM;
}

int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
{
	if (mem_cgroup_subsys.disabled)
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * If the page is already mapped, we don't have to account for it.
	 * For page cache, page->mapping points to an address_space.
	 * But page->mapping may hold a stale anon_vma pointer; detect that
	 * with a PageAnon() check. A newly mapped anonymous page's
	 * page->mapping is NULL.
	 */
	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
		return 0;
	if (unlikely(!mm))
		mm = &init_mm;
	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
}

int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask)
{
	if (mem_cgroup_subsys.disabled)
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * Corner case handling. This is usually called from
	 * add_to_page_cache().
	 * But some filesystems (shmem) precharge the page before calling it
	 * and then call add_to_page_cache() with GFP_NOWAIT.
	 *
	 * In the GFP_NOWAIT case the page may already be pre-charged before
	 * add_to_page_cache() is called (see shmem.c); check for that here
	 * and avoid charging twice. (It works, but at a slightly larger
	 * cost.)
	 */
	if (!(gfp_mask & __GFP_WAIT)) {
		struct page_cgroup *pc;

		pc = lookup_page_cgroup(page);
		if (!pc)
			return 0;
		lock_page_cgroup(pc);
		if (PageCgroupUsed(pc)) {
			unlock_page_cgroup(pc);
			return 0;
		}
		unlock_page_cgroup(pc);
	}

	if (unlikely(!mm))
		mm = &init_mm;

	if (page_is_file_cache(page))
		return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
	else
		return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
}

/*
 * uncharge if !page_mapped(page)
 */
static void
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem;
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;

	if (mem_cgroup_subsys.disabled)
		return;

	/*
	 * Check if our page_cgroup is valid
	 */
	pc = lookup_page_cgroup(page);
	if (unlikely(!pc || !PageCgroupUsed(pc)))
		return;

	lock_page_cgroup(pc);
	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))
	     || !PageCgroupUsed(pc)) {
		/* This happens when racing with zap_pte_range() or
		 * do_swap_page(). */
		unlock_page_cgroup(pc);
		return;
	}
	ClearPageCgroupUsed(pc);
	mem = pc->mem_cgroup;

	mz = page_cgroup_zoneinfo(pc);
	spin_lock_irqsave(&mz->lru_lock, flags);
	__mem_cgroup_remove_list(mz, pc);
	spin_unlock_irqrestore(&mz->lru_lock, flags);
	unlock_page_cgroup(pc);

	res_counter_uncharge(&mem->res, PAGE_SIZE);
	css_put(&mem->css);

	return;
}

void mem_cgroup_uncharge_page(struct page *page)
{
	/* early check. */
	if (page_mapped(page))
		return;
	if (page->mapping && !PageAnon(page))
		return;
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_uncharge_cache_page(struct page *page)
{
	VM_BUG_ON(page_mapped(page));
	VM_BUG_ON(page->mapping);
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}

/*
 * Before starting migration, account against the new page.
 */
int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem = NULL;
	enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
	int ret = 0;

	if (mem_cgroup_subsys.disabled)
		return 0;

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		mem = pc->mem_cgroup;
		css_get(&mem->css);
		if (PageCgroupCache(pc)) {
			if (page_is_file_cache(page))
				ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
			else
				ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
		}
	}
	unlock_page_cgroup(pc);
	if (mem) {
		ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
			ctype, mem);
		css_put(&mem->css);
	}
	return ret;
}

/* remove the redundant charge if migration failed */
void mem_cgroup_end_migration(struct page *newpage)
{
	/*
	 * On success, page->mapping is not NULL.
	 * Special rollback care is necessary when
	 * 1. migration fails
	 *    (newpage->mapping is cleared in this case)
	 * 2. the newpage was moved but is not remapped again because the task
	 *    exits and the newpage is obsolete. In this case, the newpage may
	 *    be a swapcache page. So, we just always call
	 *    mem_cgroup_uncharge_page() to avoid a mess; the page_cgroup will
	 *    be removed if it is unnecessary. File cache pages are still on
	 *    the radix tree, so we don't care about them here.
	 */
	if (!newpage->mapping)
		__mem_cgroup_uncharge_common(newpage,
					 MEM_CGROUP_CHARGE_TYPE_FORCE);
	else if (PageAnon(newpage))
		mem_cgroup_uncharge_page(newpage);
}

/*
 * A call to try to shrink memory usage under the specified memory controller.
 * This is typically used to reclaim pages for shmem, reducing the side
 * effects of page allocation from shmem, which may be used by some
 * mem_cgroup.
 */
int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
{
	struct mem_cgroup *mem;
	int progress = 0;
	int retry = MEM_CGROUP_RECLAIM_RETRIES;

	if (mem_cgroup_subsys.disabled)
		return 0;
	if (!mm)
		return 0;

	rcu_read_lock();
	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (unlikely(!mem)) {
		rcu_read_unlock();
		return 0;
	}
	css_get(&mem->css);
	rcu_read_unlock();

	do {
		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
		progress += res_counter_check_under_limit(&mem->res);
	} while (!progress && --retry);

	css_put(&mem->css);
	if (!retry)
		return -ENOMEM;
	return 0;
}

int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
{
	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
	int progress;
	int ret = 0;

	while (res_counter_set_limit(&memcg->res, val)) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		if (!retry_count) {
			ret = -EBUSY;
			break;
		}
		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL);
		if (!progress)
			retry_count--;
	}
	return ret;
}

/*
 * This routine traverses the page_cgroups on the given list and drops them
 * all. *And* this routine doesn't reclaim the pages themselves; it just
 * removes the page_cgroups.
 */
#define FORCE_UNCHARGE_BATCH	(128)
static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
			    struct mem_cgroup_per_zone *mz,
			    enum lru_list lru)
{
	struct page_cgroup *pc;
	struct page *page;
	int count = FORCE_UNCHARGE_BATCH;
	unsigned long flags;
	struct list_head *list;

	list = &mz->lists[lru];

	spin_lock_irqsave(&mz->lru_lock, flags);
	while (!list_empty(list)) {
		pc = list_entry(list->prev, struct page_cgroup, lru);
		page = pc->page;
		if (!PageCgroupUsed(pc))
			break;
		get_page(page);
		spin_unlock_irqrestore(&mz->lru_lock, flags);
		/*
		 * Check if this page is on the LRU. A !LRU page can be found
		 * if it's under page migration.
		 */
		if (PageLRU(page)) {
			__mem_cgroup_uncharge_common(page,
					MEM_CGROUP_CHARGE_TYPE_FORCE);
			put_page(page);
			if (--count <= 0) {
				count = FORCE_UNCHARGE_BATCH;
				cond_resched();
			}
		} else {
			spin_lock_irqsave(&mz->lru_lock, flags);
			break;
		}
		spin_lock_irqsave(&mz->lru_lock, flags);
	}
	spin_unlock_irqrestore(&mz->lru_lock, flags);
}

/*
 * Make the mem_cgroup's charge 0 if there is no task attached.
 * This enables deleting this mem_cgroup.
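 *
 * The uncharge is done in batches of FORCE_UNCHARGE_BATCH pages, with
 * cond_resched() in between, so emptying a large group may take a while.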
 */
static int mem_cgroup_force_empty(struct mem_cgroup *mem)
{
	int ret = -EBUSY;
	int node, zid;

	css_get(&mem->css);
	/*
	 * The page reclaim code (kswapd etc.) may move pages between the
	 * active and inactive lists while we are not holding a lock, so we
	 * have to loop here until all lists are empty.
	 */
	while (mem->res.usage > 0) {
		if (atomic_read(&mem->css.cgroup->count) > 0)
			goto out;
		/* This is for making all *used* pages to be on LRU. */
		lru_add_drain_all();
		for_each_node_state(node, N_POSSIBLE)
			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
				struct mem_cgroup_per_zone *mz;
				enum lru_list l;
				mz = mem_cgroup_zoneinfo(mem, node, zid);
				for_each_lru(l)
					mem_cgroup_force_empty_list(mem, mz, l);
			}
		cond_resched();
	}
	ret = 0;
out:
	css_put(&mem->css);
	return ret;
}

static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
	return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
				    cft->private);
}
/*
 * The user of this function is...
 * RES_LIMIT.
 */
static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
			    const char *buffer)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
	unsigned long long val;
	int ret;

	switch (cft->private) {
	case RES_LIMIT:
		/* This function does all the necessary parsing; reuse it. */
		ret = res_counter_memparse_write_strategy(buffer, &val);
		if (!ret)
			ret = mem_cgroup_resize_limit(memcg, val);
		break;
	default:
		ret = -EINVAL; /* should be BUG() ? */
		break;
	}
	return ret;
}

static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
	struct mem_cgroup *mem;

	mem = mem_cgroup_from_cont(cont);
	switch (event) {
	case RES_MAX_USAGE:
		res_counter_reset_max(&mem->res);
		break;
	case RES_FAILCNT:
		res_counter_reset_failcnt(&mem->res);
		break;
	}
	return 0;
}

static int mem_force_empty_write(struct cgroup *cont, unsigned int event)
{
	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont));
}

static const struct mem_cgroup_stat_desc {
	const char *msg;
	u64 unit;
} mem_cgroup_stat_desc[] = {
	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
	[MEM_CGROUP_STAT_PGPGIN_COUNT] = { "pgpgin", 1, },
	[MEM_CGROUP_STAT_PGPGOUT_COUNT] = { "pgpgout", 1, },
};

static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
				 struct cgroup_map_cb *cb)
{
	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
	struct mem_cgroup_stat *stat = &mem_cont->stat;
	int i;

	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
		s64 val;

		val = mem_cgroup_read_stat(stat, i);
		val *= mem_cgroup_stat_desc[i].unit;
		cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
	}
	/* showing # of pages on each LRU list */
	{
		unsigned long active_anon, inactive_anon;
		unsigned long active_file, inactive_file;
		unsigned long unevictable;

		inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_INACTIVE_ANON);
		active_anon = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_ACTIVE_ANON);
		inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_INACTIVE_FILE);
		active_file = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_ACTIVE_FILE);
		unevictable = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_UNEVICTABLE);

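		/* the per-zone LRU counters are kept in pages; report bytes */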
		cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
		cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
		cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
		cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
		cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);

	}
	return 0;
}

static struct cftype mem_cgroup_files[] = {
	{
		.name = "usage_in_bytes",
		.private = RES_USAGE,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "max_usage_in_bytes",
		.private = RES_MAX_USAGE,
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "limit_in_bytes",
		.private = RES_LIMIT,
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "failcnt",
		.private = RES_FAILCNT,
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "force_empty",
		.trigger = mem_force_empty_write,
	},
	{
		.name = "stat",
		.read_map = mem_control_stat_show,
	},
};

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup_per_zone *mz;
	enum lru_list l;
	int zone, tmp = node;
	/*
	 * This routine is called for each possible node, but it is a BUG to
	 * call kmalloc() against an offline node.
	 *
	 * TODO: this routine can waste a lot of memory on nodes which will
	 * never be onlined. It would be better to use a memory hotplug
	 * callback function.
	 */
	if (!node_state(node, N_NORMAL_MEMORY))
		tmp = -1;
	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
	if (!pn)
		return 1;

	mem->info.nodeinfo[node] = pn;
	memset(pn, 0, sizeof(*pn));

	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
		mz = &pn->zoneinfo[zone];
		spin_lock_init(&mz->lru_lock);
		for_each_lru(l)
			INIT_LIST_HEAD(&mz->lists[l]);
	}
	return 0;
}

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	kfree(mem->info.nodeinfo[node]);
}

static struct mem_cgroup *mem_cgroup_alloc(void)
{
	struct mem_cgroup *mem;

	if (sizeof(*mem) < PAGE_SIZE)
		mem = kmalloc(sizeof(*mem), GFP_KERNEL);
	else
		mem = vmalloc(sizeof(*mem));

	if (mem)
		memset(mem, 0, sizeof(*mem));
	return mem;
}

static void mem_cgroup_free(struct mem_cgroup *mem)
{
	if (sizeof(*mem) < PAGE_SIZE)
		kfree(mem);
	else
		vfree(mem);
}


static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct mem_cgroup *mem;
	int node;

	if (unlikely((cont->parent) == NULL)) {
		mem = &init_mem_cgroup;
	} else {
		mem = mem_cgroup_alloc();
		if (!mem)
			return ERR_PTR(-ENOMEM);
	}

	res_counter_init(&mem->res);

	for_each_node_state(node, N_POSSIBLE)
		if (alloc_mem_cgroup_per_zone_info(mem, node))
			goto free_out;

	return &mem->css;
free_out:
	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);
	if (cont->parent != NULL)
		mem_cgroup_free(mem);
	return ERR_PTR(-ENOMEM);
}

static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
					struct cgroup *cont)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	mem_cgroup_force_empty(mem);
}

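/*
 * Called at rmdir, after mem_cgroup_pre_destroy() has drained the group's
 * charges: free the per-node zoneinfo and then the mem_cgroup itself.
 */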
static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	int node;
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);

	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);

	mem_cgroup_free(mem);
}

static int mem_cgroup_populate(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	return cgroup_add_files(cont, ss, mem_cgroup_files,
					ARRAY_SIZE(mem_cgroup_files));
}

static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
{
	struct mm_struct *mm;
	struct mem_cgroup *mem, *old_mem;

	mm = get_task_mm(p);
	if (mm == NULL)
		return;

	mem = mem_cgroup_from_cont(cont);
	old_mem = mem_cgroup_from_cont(old_cont);

	/*
	 * Only thread group leaders are allowed to migrate, the mm_struct is
	 * in effect owned by the leader.
	 */
	if (!thread_group_leader(p))
		goto out;

out:
	mmput(mm);
}

struct cgroup_subsys mem_cgroup_subsys = {
	.name = "memory",
	.subsys_id = mem_cgroup_subsys_id,
	.create = mem_cgroup_create,
	.pre_destroy = mem_cgroup_pre_destroy,
	.destroy = mem_cgroup_destroy,
	.populate = mem_cgroup_populate,
	.attach = mem_cgroup_move_task,
	.early_init = 0,
};