/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>

#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
static struct kmem_cache *page_cgroup_cache __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5

/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as rss */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
	struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
};

/*
 * For accounting under irq disable, no need for increment preempt count.
 */
static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx, int val)
{
	int cpu = smp_processor_id();
	stat->cpustat[cpu].count[idx] += val;
}

static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 ret = 0;
	for_each_possible_cpu(cpu)
		ret += stat->cpustat[cpu].count[idx];
	return ret;
}

/*
 * per-zone information in memory controller.
 */

enum mem_cgroup_zstat_index {
	MEM_CGROUP_ZSTAT_ACTIVE,
	MEM_CGROUP_ZSTAT_INACTIVE,

	NR_MEM_CGROUP_ZSTAT,
};

struct mem_cgroup_per_zone {
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	spinlock_t		lru_lock;
	struct list_head	active_list;
	struct list_head	inactive_list;
	unsigned long		count[NR_MEM_CGROUP_ZSTAT];
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};
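/*
 * Usage note for the statistics helpers above: counters are kept per-CPU
 * and only summed up when read, e.g. (a sketch)
 *
 *	s64 rss = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
 *
 * Because a charge and its matching uncharge may run on different CPUs, an
 * individual per-CPU slot can go negative; only the summed value is
 * meaningful.
 */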
/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;

	int	prev_priority;	/* for recording reclaim priority */
	/*
	 * statistics.
	 */
	struct mem_cgroup_stat stat;
};
static struct mem_cgroup init_mem_cgroup;

/*
 * We use the lower bit of the page->page_cgroup pointer as a bit spin
 * lock. We need to ensure that page->page_cgroup is at least two
 * byte aligned (based on comments from Nick Piggin). But since
 * bit_spin_lock doesn't actually set that lock bit in a non-debug
 * uniprocessor kernel, we should avoid setting it here too.
 */
#define PAGE_CGROUP_LOCK_BIT	0x0
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)
#else
#define PAGE_CGROUP_LOCK	0x0
#endif

/*
 * A page_cgroup page is associated with every page descriptor. The
 * page_cgroup helps us identify information about the cgroup
 */
struct page_cgroup {
	struct list_head lru;		/* per cgroup LRU list */
	struct page *page;
	struct mem_cgroup *mem_cgroup;
	int flags;
};
#define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
#define PAGE_CGROUP_FLAG_ACTIVE	(0x2)	/* page is active in this cgroup */

static int page_cgroup_nid(struct page_cgroup *pc)
{
	return page_to_nid(pc->page);
}

static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
{
	return page_zonenum(pc->page);
}

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
};
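/*
 * Example of the page->page_cgroup encoding (a sketch, relying only on the
 * definitions above): bit 0 (PAGE_CGROUP_LOCK_BIT) doubles as a bit_spin_lock,
 * and the remaining bits hold the struct page_cgroup pointer, which therefore
 * must be at least two-byte aligned. Readers simply mask the lock bit off:
 *
 *	pc = (struct page_cgroup *)(page->page_cgroup & ~PAGE_CGROUP_LOCK);
 */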
/*
 * Always modified under lru lock. Then, not necessary to preempt_disable()
 */
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
					bool charge)
{
	int val = (charge) ? 1 : -1;
	struct mem_cgroup_stat *stat = &mem->stat;

	VM_BUG_ON(!irqs_disabled());
	if (flags & PAGE_CGROUP_FLAG_CACHE)
		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val);
	else
		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);

	if (charge)
		__mem_cgroup_stat_add_safe(stat,
				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
	else
		__mem_cgroup_stat_add_safe(stat,
				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
}

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);

	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
					enum mem_cgroup_zstat_index idx)
{
	int nid, zid;
	struct mem_cgroup_per_zone *mz;
	u64 total = 0;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
		}
	return total;
}

static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}

static inline int page_cgroup_locked(struct page *page)
{
	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}

static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
{
	VM_BUG_ON(!page_cgroup_locked(page));
	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
}

struct page_cgroup *page_get_page_cgroup(struct page *page)
{
	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
}

static void lock_page_cgroup(struct page *page)
{
	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}

static int try_lock_page_cgroup(struct page *page)
{
	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}

static void unlock_page_cgroup(struct page *page)
{
	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}

static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
			struct page_cgroup *pc)
{
	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;

	if (from)
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
	else
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;

	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
	list_del(&pc->lru);
}

static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
				struct page_cgroup *pc)
{
	int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;

	if (!to) {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
		list_add(&pc->lru, &mz->inactive_list);
	} else {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
		list_add(&pc->lru, &mz->active_list);
	}
	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
}
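/*
 * Bookkeeping note for the LRU helpers above and below: the
 * PAGE_CGROUP_FLAG_ACTIVE bit in pc->flags says which per-zone list a
 * page_cgroup sits on, and MEM_CGROUP_ZSTAT() mirrors the length of each
 * list, so every add/remove/move updates both together under mz->lru_lock.
 */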
static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);

	if (from)
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
	else
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;

	if (active) {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
		list_move(&pc->lru, &mz->active_list);
	} else {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
		pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
		list_move(&pc->lru, &mz->inactive_list);
	}
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;

	task_lock(task);
	ret = task->mm && mm_match_cgroup(task->mm, mem);
	task_unlock(task);
	return ret;
}

/*
 * This routine assumes that the appropriate zone's lru lock is already held
 */
void mem_cgroup_move_lists(struct page *page, bool active)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;

	if (mem_cgroup_subsys.disabled)
		return;

	/*
	 * We cannot lock_page_cgroup while holding zone's lru_lock,
	 * because other holders of lock_page_cgroup can be interrupted
	 * with an attempt to rotate_reclaimable_page.  But we cannot
	 * safely get to page_cgroup without it, so just try_lock it:
	 * mem_cgroup_isolate_pages allows for page left on wrong list.
	 */
	if (!try_lock_page_cgroup(page))
		return;

	pc = page_get_page_cgroup(page);
	if (pc) {
		mz = page_cgroup_zoneinfo(pc);
		spin_lock_irqsave(&mz->lru_lock, flags);
		__mem_cgroup_move_lists(pc, active);
		spin_unlock_irqrestore(&mz->lru_lock, flags);
	}
	unlock_page_cgroup(page);
}

/*
 * Calculate mapped_ratio under memory controller. This will be used in
 * vmscan.c for determining whether we have to reclaim mapped pages.
 */
int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
{
	long total, rss;

	/*
	 * usage is recorded in bytes. But, here, we assume the number of
	 * physical pages can be represented by "long" on any arch.
	 */
	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
	return (int)((rss * 100L) / total);
}

/*
 * This function is called from vmscan.c. In the page reclaiming loop the
 * balance between the active and inactive lists is calculated. For memory
 * controller page reclaiming, we should use the mem_cgroup's imbalance
 * rather than the zone's global lru imbalance.
 */
long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
{
	unsigned long active, inactive;
	/* active and inactive are the number of pages. 'long' is ok.*/
	active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
	inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE);
	return (long) (active / (inactive + 1));
}
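/*
 * Worked example for mem_cgroup_calc_mapped_ratio() above (hypothetical
 * numbers): with usage equivalent to 400 pages and rss = 100 pages,
 * total = 400 + 1 = 401 and the function returns (100 * 100) / 401 = 24,
 * i.e. roughly a quarter of the charged pages are mapped as rss.
 */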
/*
 * prev_priority control...this will be used in memory reclaim path.
 */
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
	return mem->prev_priority;
}

void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	if (priority < mem->prev_priority)
		mem->prev_priority = priority;
}

void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	mem->prev_priority = priority;
}

/*
 * Calculate # of pages to be scanned in this priority/zone.
 * See also vmscan.c
 *
 * priority starts from "DEF_PRIORITY" and decremented in each loop.
 * (see include/linux/mmzone.h)
 */

long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
				   struct zone *zone, int priority)
{
	long nr_active;
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);

	nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE);
	return (nr_active >> priority);
}

long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
					struct zone *zone, int priority)
{
	long nr_inactive;
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);

	nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
	return (nr_inactive >> priority);
}

unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = z->zone_pgdat->node_id;
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;

	BUG_ON(!mem_cont);
	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
	if (active)
		src = &mz->active_list;
	else
		src = &mz->inactive_list;

	spin_lock(&mz->lru_lock);
	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
		if (scan >= nr_to_scan)
			break;
		page = pc->page;

		if (unlikely(!PageLRU(page)))
			continue;

		if (PageActive(page) && !active) {
			__mem_cgroup_move_lists(pc, true);
			continue;
		}
		if (!PageActive(page) && active) {
			__mem_cgroup_move_lists(pc, false);
			continue;
		}

		scan++;
		list_move(&pc->lru, &pc_list);

		if (__isolate_lru_page(page, mode) == 0) {
			list_move(&page->lru, dst);
			nr_taken++;
		}
	}

	list_splice(&pc_list, src);
	spin_unlock(&mz->lru_lock);

	*scanned = scan;
	return nr_taken;
}
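/*
 * The charge path below retries reclaim before failing. A rough sketch of
 * the loop for a __GFP_WAIT allocation:
 *
 *	while the res_counter charge fails:
 *		reclaim from this cgroup;
 *		if progress was made, or usage fell back under the limit,
 *		try the charge again;
 *		after MEM_CGROUP_RECLAIM_RETRIES (5) fruitless rounds,
 *		invoke the per-cgroup OOM killer and give up.
 */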
/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype,
				struct mem_cgroup *memcg)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
	unsigned long flags;
	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct mem_cgroup_per_zone *mz;

	pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
	if (unlikely(pc == NULL))
		goto err;

	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set, if so charge the init_mm (happens for pagecache usage).
	 */
	if (likely(!memcg)) {
		rcu_read_lock();
		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
		/*
		 * For every charge from the cgroup, increment reference count
		 */
		css_get(&mem->css);
		rcu_read_unlock();
	} else {
		mem = memcg;
		css_get(&memcg->css);
	}

	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
		if (!(gfp_mask & __GFP_WAIT))
			goto out;

		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
			continue;

		/*
		 * try_to_free_mem_cgroup_pages() might not give us a full
		 * picture of reclaim. Some pages are reclaimed and might be
		 * moved to swap cache or just unmapped from the cgroup.
		 * Check the limit again to see if the reclaim reduced the
		 * current usage of the cgroup before giving up
		 */
		if (res_counter_check_under_limit(&mem->res))
			continue;

		if (!nr_retries--) {
			mem_cgroup_out_of_memory(mem, gfp_mask);
			goto out;
		}
	}

	pc->mem_cgroup = mem;
	pc->page = page;
	/*
	 * If a page is accounted as a page cache, insert to inactive list.
	 * If anon, insert to active list.
	 */
	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
		pc->flags = PAGE_CGROUP_FLAG_CACHE;
	else
		pc->flags = PAGE_CGROUP_FLAG_ACTIVE;

	lock_page_cgroup(page);
	if (unlikely(page_get_page_cgroup(page))) {
		unlock_page_cgroup(page);
		res_counter_uncharge(&mem->res, PAGE_SIZE);
		css_put(&mem->css);
		kmem_cache_free(page_cgroup_cache, pc);
		goto done;
	}
	page_assign_page_cgroup(page, pc);

	mz = page_cgroup_zoneinfo(pc);
	spin_lock_irqsave(&mz->lru_lock, flags);
	__mem_cgroup_add_list(mz, pc);
	spin_unlock_irqrestore(&mz->lru_lock, flags);

	unlock_page_cgroup(page);
done:
	return 0;
out:
	css_put(&mem->css);
	kmem_cache_free(page_cgroup_cache, pc);
err:
	return -ENOMEM;
}

int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
{
	if (mem_cgroup_subsys.disabled)
		return 0;

	/*
	 * If already mapped, we don't have to account.
	 * If page cache, page->mapping has address_space.
	 * But page->mapping may hold an out-of-use anon_vma pointer;
	 * detect that case with the PageAnon() check. A newly mapped
	 * anonymous page's page->mapping is NULL.
	 */
	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
		return 0;
	if (unlikely(!mm))
		mm = &init_mm;
	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
}
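/*
 * Typical call site for mem_cgroup_charge() (a sketch; the actual caller
 * lives in the anonymous-fault path of mm/memory.c): the new page is charged
 * before it is mapped, and the fault backs out if the cgroup is over its
 * limit:
 *
 *	if (mem_cgroup_charge(page, mm, GFP_KERNEL))
 *		goto oom;
 */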
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask)
{
	if (mem_cgroup_subsys.disabled)
		return 0;

	/*
	 * Corner case handling. This is usually called from
	 * add_to_page_cache(). But some FS (shmem) precharges this page
	 * before calling it and calls add_to_page_cache() with GFP_NOWAIT.
	 *
	 * For the GFP_NOWAIT case, the page may be pre-charged before calling
	 * add_to_page_cache(). (See shmem.c) Check it here and avoid charging
	 * twice. (It works but has to pay a bit larger cost.)
	 */
	if (!(gfp_mask & __GFP_WAIT)) {
		struct page_cgroup *pc;

		lock_page_cgroup(page);
		pc = page_get_page_cgroup(page);
		if (pc) {
			VM_BUG_ON(pc->page != page);
			VM_BUG_ON(!pc->mem_cgroup);
			unlock_page_cgroup(page);
			return 0;
		}
		unlock_page_cgroup(page);
	}

	if (unlikely(!mm))
		mm = &init_mm;

	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
}

/*
 * uncharge if !page_mapped(page)
 */
static void
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem;
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;

	if (mem_cgroup_subsys.disabled)
		return;

	/*
	 * Check if our page_cgroup is valid
	 */
	lock_page_cgroup(page);
	pc = page_get_page_cgroup(page);
	if (unlikely(!pc))
		goto unlock;

	VM_BUG_ON(pc->page != page);

	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
	    && ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
		|| page_mapped(page)))
		goto unlock;

	mz = page_cgroup_zoneinfo(pc);
	spin_lock_irqsave(&mz->lru_lock, flags);
	__mem_cgroup_remove_list(mz, pc);
	spin_unlock_irqrestore(&mz->lru_lock, flags);

	page_assign_page_cgroup(page, NULL);
	unlock_page_cgroup(page);

	mem = pc->mem_cgroup;
	res_counter_uncharge(&mem->res, PAGE_SIZE);
	css_put(&mem->css);

	kmem_cache_free(page_cgroup_cache, pc);
	return;
unlock:
	unlock_page_cgroup(page);
}

void mem_cgroup_uncharge_page(struct page *page)
{
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_uncharge_cache_page(struct page *page)
{
	VM_BUG_ON(page_mapped(page));
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}

/*
 * Before starting migration, account against new page.
 */
int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem = NULL;
	enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
	int ret = 0;

	if (mem_cgroup_subsys.disabled)
		return 0;

	lock_page_cgroup(page);
	pc = page_get_page_cgroup(page);
	if (pc) {
		mem = pc->mem_cgroup;
		css_get(&mem->css);
		if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
			ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
	}
	unlock_page_cgroup(page);
	if (mem) {
		ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
			ctype, mem);
		css_put(&mem->css);
	}
	return ret;
}
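/*
 * Migration note: mem_cgroup_prepare_migration() above pre-charges the new
 * page to the same cgroup, and with the same cache/mapped type, as the old
 * page, so a successful migration leaves the cgroup's usage unchanged.
 * mem_cgroup_end_migration() below only has to drop that pre-charge when the
 * new page ends up unused (failed migration, or the owner exited before the
 * page could be remapped).
 */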
/* remove redundant charge if migration failed*/
void mem_cgroup_end_migration(struct page *newpage)
{
	/*
	 * At success, page->mapping is not NULL.
	 * special rollback care is necessary when
	 * 1. at migration failure. (newpage->mapping is cleared in this case)
	 * 2. the newpage was moved but not remapped again because the task
	 *    exits and the newpage is obsolete. In this case, the new page
	 *    may be a swapcache. So, we just call mem_cgroup_uncharge_page()
	 *    always for avoiding mess. The page_cgroup will be removed if
	 *    unnecessary. File cache pages are still on the radix-tree; don't
	 *    care about them.
	 */
	if (!newpage->mapping)
		__mem_cgroup_uncharge_common(newpage,
					 MEM_CGROUP_CHARGE_TYPE_FORCE);
	else if (PageAnon(newpage))
		mem_cgroup_uncharge_page(newpage);
}

/*
 * A call to try to shrink memory usage under specified resource controller.
 * This is typically used for page reclaiming for shmem for reducing side
 * effect of page allocation from shmem, which is used by some mem_cgroup.
 */
int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
{
	struct mem_cgroup *mem;
	int progress = 0;
	int retry = MEM_CGROUP_RECLAIM_RETRIES;

	if (mem_cgroup_subsys.disabled)
		return 0;

	rcu_read_lock();
	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
	css_get(&mem->css);
	rcu_read_unlock();

	do {
		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
	} while (!progress && --retry);

	css_put(&mem->css);
	if (!retry)
		return -ENOMEM;
	return 0;
}

int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
{
	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
	int progress;
	int ret = 0;

	while (res_counter_set_limit(&memcg->res, val)) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		if (!retry_count) {
			ret = -EBUSY;
			break;
		}
		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL);
		if (!progress)
			retry_count--;
	}
	return ret;
}

/*
 * This routine traverses the page_cgroups on the given list and drops them
 * all. *And* this routine doesn't reclaim the page itself, it just removes
 * the page_cgroup.
 */
#define FORCE_UNCHARGE_BATCH	(128)
static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
			    struct mem_cgroup_per_zone *mz,
			    int active)
{
	struct page_cgroup *pc;
	struct page *page;
	int count = FORCE_UNCHARGE_BATCH;
	unsigned long flags;
	struct list_head *list;

	if (active)
		list = &mz->active_list;
	else
		list = &mz->inactive_list;

	spin_lock_irqsave(&mz->lru_lock, flags);
	while (!list_empty(list)) {
		pc = list_entry(list->prev, struct page_cgroup, lru);
		page = pc->page;
		get_page(page);
		spin_unlock_irqrestore(&mz->lru_lock, flags);
		/*
		 * Check if this page is on LRU. !LRU page can be found
		 * if it's under page migration.
		 */
		if (PageLRU(page)) {
			__mem_cgroup_uncharge_common(page,
					MEM_CGROUP_CHARGE_TYPE_FORCE);
			put_page(page);
			if (--count <= 0) {
				count = FORCE_UNCHARGE_BATCH;
				cond_resched();
			}
		} else
			cond_resched();
		spin_lock_irqsave(&mz->lru_lock, flags);
	}
	spin_unlock_irqrestore(&mz->lru_lock, flags);
}
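/*
 * Note on mem_cgroup_force_empty_list() above: mz->lru_lock is dropped
 * around each uncharge (the page is pinned with get_page() first), and
 * cond_resched() runs after every FORCE_UNCHARGE_BATCH (128) uncharged
 * pages, so emptying a large cgroup does not hog the CPU.
 */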
/*
 * make mem_cgroup's charge to be 0 if there is no task.
 * This enables deleting this mem_cgroup.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *mem)
{
	int ret = -EBUSY;
	int node, zid;

	css_get(&mem->css);
	/*
	 * page reclaim code (kswapd etc..) will move pages between
	 * active_list <-> inactive_list while we don't take a lock.
	 * So, we have to do loop here until all lists are empty.
	 */
	while (mem->res.usage > 0) {
		if (atomic_read(&mem->css.cgroup->count) > 0)
			goto out;
		for_each_node_state(node, N_POSSIBLE)
			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
				struct mem_cgroup_per_zone *mz;
				mz = mem_cgroup_zoneinfo(mem, node, zid);
				/* drop all page_cgroup in active_list */
				mem_cgroup_force_empty_list(mem, mz, 1);
				/* drop all page_cgroup in inactive_list */
				mem_cgroup_force_empty_list(mem, mz, 0);
			}
	}
	ret = 0;
out:
	css_put(&mem->css);
	return ret;
}

static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
	return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
				    cft->private);
}
/*
 * The user of this function is...
 * RES_LIMIT.
 */
static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
			    const char *buffer)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
	unsigned long long val;
	int ret;

	switch (cft->private) {
	case RES_LIMIT:
		/* This function does all necessary parse...reuse it */
		ret = res_counter_memparse_write_strategy(buffer, &val);
		if (!ret)
			ret = mem_cgroup_resize_limit(memcg, val);
		break;
	default:
		ret = -EINVAL; /* should be BUG() ? */
		break;
	}
	return ret;
}

static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
	struct mem_cgroup *mem;

	mem = mem_cgroup_from_cont(cont);
	switch (event) {
	case RES_MAX_USAGE:
		res_counter_reset_max(&mem->res);
		break;
	case RES_FAILCNT:
		res_counter_reset_failcnt(&mem->res);
		break;
	}
	return 0;
}

static int mem_force_empty_write(struct cgroup *cont, unsigned int event)
{
	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont));
}

static const struct mem_cgroup_stat_desc {
	const char *msg;
	u64 unit;
} mem_cgroup_stat_desc[] = {
	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
	[MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
	[MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
};

static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
				 struct cgroup_map_cb *cb)
{
	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
	struct mem_cgroup_stat *stat = &mem_cont->stat;
	int i;

	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
		s64 val;

		val = mem_cgroup_read_stat(stat, i);
		val *= mem_cgroup_stat_desc[i].unit;
		cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
	}
	/* showing # of active pages */
	{
		unsigned long active, inactive;

		inactive = mem_cgroup_get_all_zonestat(mem_cont,
						MEM_CGROUP_ZSTAT_INACTIVE);
		active = mem_cgroup_get_all_zonestat(mem_cont,
						MEM_CGROUP_ZSTAT_ACTIVE);
		cb->fill(cb, "active", (active) * PAGE_SIZE);
		cb->fill(cb, "inactive", (inactive) * PAGE_SIZE);
	}
	return 0;
}
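/*
 * Example memory.stat output produced by mem_control_stat_show() above
 * (values are illustrative): "cache" and "rss" are scaled to bytes via the
 * unit table, "pgpgin"/"pgpgout" are raw event counts, and
 * "active"/"inactive" report the LRU sizes in bytes:
 *
 *	cache 8192
 *	rss 4096
 *	pgpgin 10
 *	pgpgout 7
 *	active 4096
 *	inactive 8192
 */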
static struct cftype mem_cgroup_files[] = {
	{
		.name = "usage_in_bytes",
		.private = RES_USAGE,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "max_usage_in_bytes",
		.private = RES_MAX_USAGE,
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "limit_in_bytes",
		.private = RES_LIMIT,
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "failcnt",
		.private = RES_FAILCNT,
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "force_empty",
		.trigger = mem_force_empty_write,
	},
	{
		.name = "stat",
		.read_map = mem_control_stat_show,
	},
};

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup_per_zone *mz;
	int zone, tmp = node;
	/*
	 * This routine is called against possible nodes.
	 * But it's a bug to call kmalloc() against an offline node.
	 *
	 * TODO: this routine can waste much memory for nodes which will
	 *       never be onlined. It's better to use a memory hotplug
	 *       callback function.
	 */
	if (!node_state(node, N_NORMAL_MEMORY))
		tmp = -1;
	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
	if (!pn)
		return 1;

	mem->info.nodeinfo[node] = pn;
	memset(pn, 0, sizeof(*pn));

	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
		mz = &pn->zoneinfo[zone];
		INIT_LIST_HEAD(&mz->active_list);
		INIT_LIST_HEAD(&mz->inactive_list);
		spin_lock_init(&mz->lru_lock);
	}
	return 0;
}

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	kfree(mem->info.nodeinfo[node]);
}

static struct mem_cgroup *mem_cgroup_alloc(void)
{
	struct mem_cgroup *mem;

	if (sizeof(*mem) < PAGE_SIZE)
		mem = kmalloc(sizeof(*mem), GFP_KERNEL);
	else
		mem = vmalloc(sizeof(*mem));

	if (mem)
		memset(mem, 0, sizeof(*mem));
	return mem;
}

static void mem_cgroup_free(struct mem_cgroup *mem)
{
	if (sizeof(*mem) < PAGE_SIZE)
		kfree(mem);
	else
		vfree(mem);
}

static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct mem_cgroup *mem;
	int node;

	if (unlikely((cont->parent) == NULL)) {
		mem = &init_mem_cgroup;
		page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
	} else {
		mem = mem_cgroup_alloc();
		if (!mem)
			return ERR_PTR(-ENOMEM);
	}

	res_counter_init(&mem->res);

	for_each_node_state(node, N_POSSIBLE)
		if (alloc_mem_cgroup_per_zone_info(mem, node))
			goto free_out;

	return &mem->css;
free_out:
	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);
	if (cont->parent != NULL)
		mem_cgroup_free(mem);
	return ERR_PTR(-ENOMEM);
}

static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
					struct cgroup *cont)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	mem_cgroup_force_empty(mem);
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	int node;
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);

	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);

	mem_cgroup_free(mem_cgroup_from_cont(cont));
}

static int mem_cgroup_populate(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	return cgroup_add_files(cont, ss, mem_cgroup_files,
					ARRAY_SIZE(mem_cgroup_files));
}
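/*
 * Note on the attach callback below: in this version it performs only
 * sanity checks (same-cgroup and thread-group-leader tests) and does not
 * move any charges; pages stay accounted to the cgroup that originally
 * charged them even after the task migrates.
 */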
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
{
	struct mm_struct *mm;
	struct mem_cgroup *mem, *old_mem;

	mm = get_task_mm(p);
	if (mm == NULL)
		return;

	mem = mem_cgroup_from_cont(cont);
	old_mem = mem_cgroup_from_cont(old_cont);

	if (mem == old_mem)
		goto out;

	/*
	 * Only thread group leaders are allowed to migrate, the mm_struct is
	 * in effect owned by the leader
	 */
	if (!thread_group_leader(p))
		goto out;

out:
	mmput(mm);
}

struct cgroup_subsys mem_cgroup_subsys = {
	.name = "memory",
	.subsys_id = mem_cgroup_subsys_id,
	.create = mem_cgroup_create,
	.pre_destroy = mem_cgroup_pre_destroy,
	.destroy = mem_cgroup_destroy,
	.populate = mem_cgroup_populate,
	.attach = mem_cgroup_move_task,
	.early_init = 0,
};
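/*
 * Administration example (a sketch of the cgroup filesystem interface this
 * file implements; the mount point and group name are arbitrary):
 *
 *	# mount -t cgroup -o memory none /cgroups
 *	# mkdir /cgroups/grp0
 *	# echo 64M > /cgroups/grp0/memory.limit_in_bytes
 *	# echo $$ > /cgroups/grp0/tasks
 *	# cat /cgroups/grp0/memory.stat
 *
 * "64M" is accepted because RES_LIMIT writes are parsed by
 * res_counter_memparse_write_strategy() above.
 */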