1 /* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * This program is free software; you can redistribute it and/or modify 10 * it under the terms of the GNU General Public License as published by 11 * the Free Software Foundation; either version 2 of the License, or 12 * (at your option) any later version. 13 * 14 * This program is distributed in the hope that it will be useful, 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 * GNU General Public License for more details. 18 */ 19 20 #include <linux/res_counter.h> 21 #include <linux/memcontrol.h> 22 #include <linux/cgroup.h> 23 #include <linux/mm.h> 24 #include <linux/smp.h> 25 #include <linux/page-flags.h> 26 #include <linux/backing-dev.h> 27 #include <linux/bit_spinlock.h> 28 #include <linux/rcupdate.h> 29 #include <linux/slab.h> 30 #include <linux/swap.h> 31 #include <linux/spinlock.h> 32 #include <linux/fs.h> 33 #include <linux/seq_file.h> 34 #include <linux/vmalloc.h> 35 36 #include <asm/uaccess.h> 37 38 struct cgroup_subsys mem_cgroup_subsys; 39 static const int MEM_CGROUP_RECLAIM_RETRIES = 5; 40 static struct kmem_cache *page_cgroup_cache; 41 42 /* 43 * Statistics for memory cgroup. 44 */ 45 enum mem_cgroup_stat_index { 46 /* 47 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 48 */ 49 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 50 MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ 51 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 52 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 53 54 MEM_CGROUP_STAT_NSTATS, 55 }; 56 57 struct mem_cgroup_stat_cpu { 58 s64 count[MEM_CGROUP_STAT_NSTATS]; 59 } ____cacheline_aligned_in_smp; 60 61 struct mem_cgroup_stat { 62 struct mem_cgroup_stat_cpu cpustat[NR_CPUS]; 63 }; 64 65 /* 66 * For accounting under irq disable, no need for increment preempt count. 67 */ 68 static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat, 69 enum mem_cgroup_stat_index idx, int val) 70 { 71 int cpu = smp_processor_id(); 72 stat->cpustat[cpu].count[idx] += val; 73 } 74 75 static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, 76 enum mem_cgroup_stat_index idx) 77 { 78 int cpu; 79 s64 ret = 0; 80 for_each_possible_cpu(cpu) 81 ret += stat->cpustat[cpu].count[idx]; 82 return ret; 83 } 84 85 /* 86 * per-zone information in memory controller. 87 */ 88 89 enum mem_cgroup_zstat_index { 90 MEM_CGROUP_ZSTAT_ACTIVE, 91 MEM_CGROUP_ZSTAT_INACTIVE, 92 93 NR_MEM_CGROUP_ZSTAT, 94 }; 95 96 struct mem_cgroup_per_zone { 97 /* 98 * spin_lock to protect the per cgroup LRU 99 */ 100 spinlock_t lru_lock; 101 struct list_head active_list; 102 struct list_head inactive_list; 103 unsigned long count[NR_MEM_CGROUP_ZSTAT]; 104 }; 105 /* Macro for accessing counter */ 106 #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 107 108 struct mem_cgroup_per_node { 109 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 110 }; 111 112 struct mem_cgroup_lru_info { 113 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 114 }; 115 116 /* 117 * The memory controller data structure. The memory controller controls both 118 * page cache and RSS per cgroup. We would eventually like to provide 119 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 120 * to help the administrator determine what knobs to tune. 121 * 122 * TODO: Add a water mark for the memory controller. Reclaim will begin when 123 * we hit the water mark. May be even add a low water mark, such that 124 * no reclaim occurs from a cgroup at it's low water mark, this is 125 * a feature that will be implemented much later in the future. 126 */ 127 struct mem_cgroup { 128 struct cgroup_subsys_state css; 129 /* 130 * the counter to account for memory usage 131 */ 132 struct res_counter res; 133 /* 134 * Per cgroup active and inactive list, similar to the 135 * per zone LRU lists. 136 */ 137 struct mem_cgroup_lru_info info; 138 139 int prev_priority; /* for recording reclaim priority */ 140 /* 141 * statistics. 142 */ 143 struct mem_cgroup_stat stat; 144 }; 145 static struct mem_cgroup init_mem_cgroup; 146 147 /* 148 * We use the lower bit of the page->page_cgroup pointer as a bit spin 149 * lock. We need to ensure that page->page_cgroup is at least two 150 * byte aligned (based on comments from Nick Piggin). But since 151 * bit_spin_lock doesn't actually set that lock bit in a non-debug 152 * uniprocessor kernel, we should avoid setting it here too. 153 */ 154 #define PAGE_CGROUP_LOCK_BIT 0x0 155 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) 156 #define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) 157 #else 158 #define PAGE_CGROUP_LOCK 0x0 159 #endif 160 161 /* 162 * A page_cgroup page is associated with every page descriptor. The 163 * page_cgroup helps us identify information about the cgroup 164 */ 165 struct page_cgroup { 166 struct list_head lru; /* per cgroup LRU list */ 167 struct page *page; 168 struct mem_cgroup *mem_cgroup; 169 int ref_cnt; /* cached, mapped, migrating */ 170 int flags; 171 }; 172 #define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ 173 #define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ 174 175 static int page_cgroup_nid(struct page_cgroup *pc) 176 { 177 return page_to_nid(pc->page); 178 } 179 180 static enum zone_type page_cgroup_zid(struct page_cgroup *pc) 181 { 182 return page_zonenum(pc->page); 183 } 184 185 enum charge_type { 186 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 187 MEM_CGROUP_CHARGE_TYPE_MAPPED, 188 }; 189 190 /* 191 * Always modified under lru lock. Then, not necessary to preempt_disable() 192 */ 193 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, 194 bool charge) 195 { 196 int val = (charge)? 1 : -1; 197 struct mem_cgroup_stat *stat = &mem->stat; 198 199 VM_BUG_ON(!irqs_disabled()); 200 if (flags & PAGE_CGROUP_FLAG_CACHE) 201 __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val); 202 else 203 __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); 204 205 if (charge) 206 __mem_cgroup_stat_add_safe(stat, 207 MEM_CGROUP_STAT_PGPGIN_COUNT, 1); 208 else 209 __mem_cgroup_stat_add_safe(stat, 210 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 211 } 212 213 static struct mem_cgroup_per_zone * 214 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 215 { 216 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 217 } 218 219 static struct mem_cgroup_per_zone * 220 page_cgroup_zoneinfo(struct page_cgroup *pc) 221 { 222 struct mem_cgroup *mem = pc->mem_cgroup; 223 int nid = page_cgroup_nid(pc); 224 int zid = page_cgroup_zid(pc); 225 226 return mem_cgroup_zoneinfo(mem, nid, zid); 227 } 228 229 static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, 230 enum mem_cgroup_zstat_index idx) 231 { 232 int nid, zid; 233 struct mem_cgroup_per_zone *mz; 234 u64 total = 0; 235 236 for_each_online_node(nid) 237 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 238 mz = mem_cgroup_zoneinfo(mem, nid, zid); 239 total += MEM_CGROUP_ZSTAT(mz, idx); 240 } 241 return total; 242 } 243 244 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 245 { 246 return container_of(cgroup_subsys_state(cont, 247 mem_cgroup_subsys_id), struct mem_cgroup, 248 css); 249 } 250 251 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 252 { 253 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 254 struct mem_cgroup, css); 255 } 256 257 static inline int page_cgroup_locked(struct page *page) 258 { 259 return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); 260 } 261 262 static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) 263 { 264 VM_BUG_ON(!page_cgroup_locked(page)); 265 page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK); 266 } 267 268 struct page_cgroup *page_get_page_cgroup(struct page *page) 269 { 270 return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK); 271 } 272 273 static void lock_page_cgroup(struct page *page) 274 { 275 bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); 276 } 277 278 static int try_lock_page_cgroup(struct page *page) 279 { 280 return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); 281 } 282 283 static void unlock_page_cgroup(struct page *page) 284 { 285 bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); 286 } 287 288 static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, 289 struct page_cgroup *pc) 290 { 291 int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; 292 293 if (from) 294 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; 295 else 296 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; 297 298 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); 299 list_del_init(&pc->lru); 300 } 301 302 static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, 303 struct page_cgroup *pc) 304 { 305 int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; 306 307 if (!to) { 308 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; 309 list_add(&pc->lru, &mz->inactive_list); 310 } else { 311 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; 312 list_add(&pc->lru, &mz->active_list); 313 } 314 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true); 315 } 316 317 static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) 318 { 319 int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; 320 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); 321 322 if (from) 323 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; 324 else 325 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; 326 327 if (active) { 328 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; 329 pc->flags |= PAGE_CGROUP_FLAG_ACTIVE; 330 list_move(&pc->lru, &mz->active_list); 331 } else { 332 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; 333 pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE; 334 list_move(&pc->lru, &mz->inactive_list); 335 } 336 } 337 338 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 339 { 340 int ret; 341 342 task_lock(task); 343 ret = task->mm && mm_match_cgroup(task->mm, mem); 344 task_unlock(task); 345 return ret; 346 } 347 348 /* 349 * This routine assumes that the appropriate zone's lru lock is already held 350 */ 351 void mem_cgroup_move_lists(struct page *page, bool active) 352 { 353 struct page_cgroup *pc; 354 struct mem_cgroup_per_zone *mz; 355 unsigned long flags; 356 357 /* 358 * We cannot lock_page_cgroup while holding zone's lru_lock, 359 * because other holders of lock_page_cgroup can be interrupted 360 * with an attempt to rotate_reclaimable_page. But we cannot 361 * safely get to page_cgroup without it, so just try_lock it: 362 * mem_cgroup_isolate_pages allows for page left on wrong list. 363 */ 364 if (!try_lock_page_cgroup(page)) 365 return; 366 367 pc = page_get_page_cgroup(page); 368 if (pc) { 369 mz = page_cgroup_zoneinfo(pc); 370 spin_lock_irqsave(&mz->lru_lock, flags); 371 __mem_cgroup_move_lists(pc, active); 372 spin_unlock_irqrestore(&mz->lru_lock, flags); 373 } 374 unlock_page_cgroup(page); 375 } 376 377 /* 378 * Calculate mapped_ratio under memory controller. This will be used in 379 * vmscan.c for deteremining we have to reclaim mapped pages. 380 */ 381 int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) 382 { 383 long total, rss; 384 385 /* 386 * usage is recorded in bytes. But, here, we assume the number of 387 * physical pages can be represented by "long" on any arch. 388 */ 389 total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L; 390 rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); 391 return (int)((rss * 100L) / total); 392 } 393 394 /* 395 * This function is called from vmscan.c. In page reclaiming loop. balance 396 * between active and inactive list is calculated. For memory controller 397 * page reclaiming, we should use using mem_cgroup's imbalance rather than 398 * zone's global lru imbalance. 399 */ 400 long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem) 401 { 402 unsigned long active, inactive; 403 /* active and inactive are the number of pages. 'long' is ok.*/ 404 active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE); 405 inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE); 406 return (long) (active / (inactive + 1)); 407 } 408 409 /* 410 * prev_priority control...this will be used in memory reclaim path. 411 */ 412 int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 413 { 414 return mem->prev_priority; 415 } 416 417 void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) 418 { 419 if (priority < mem->prev_priority) 420 mem->prev_priority = priority; 421 } 422 423 void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) 424 { 425 mem->prev_priority = priority; 426 } 427 428 /* 429 * Calculate # of pages to be scanned in this priority/zone. 430 * See also vmscan.c 431 * 432 * priority starts from "DEF_PRIORITY" and decremented in each loop. 433 * (see include/linux/mmzone.h) 434 */ 435 436 long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem, 437 struct zone *zone, int priority) 438 { 439 long nr_active; 440 int nid = zone->zone_pgdat->node_id; 441 int zid = zone_idx(zone); 442 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); 443 444 nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE); 445 return (nr_active >> priority); 446 } 447 448 long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, 449 struct zone *zone, int priority) 450 { 451 long nr_inactive; 452 int nid = zone->zone_pgdat->node_id; 453 int zid = zone_idx(zone); 454 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); 455 456 nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE); 457 return (nr_inactive >> priority); 458 } 459 460 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 461 struct list_head *dst, 462 unsigned long *scanned, int order, 463 int mode, struct zone *z, 464 struct mem_cgroup *mem_cont, 465 int active) 466 { 467 unsigned long nr_taken = 0; 468 struct page *page; 469 unsigned long scan; 470 LIST_HEAD(pc_list); 471 struct list_head *src; 472 struct page_cgroup *pc, *tmp; 473 int nid = z->zone_pgdat->node_id; 474 int zid = zone_idx(z); 475 struct mem_cgroup_per_zone *mz; 476 477 BUG_ON(!mem_cont); 478 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 479 if (active) 480 src = &mz->active_list; 481 else 482 src = &mz->inactive_list; 483 484 485 spin_lock(&mz->lru_lock); 486 scan = 0; 487 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 488 if (scan >= nr_to_scan) 489 break; 490 page = pc->page; 491 492 if (unlikely(!PageLRU(page))) 493 continue; 494 495 if (PageActive(page) && !active) { 496 __mem_cgroup_move_lists(pc, true); 497 continue; 498 } 499 if (!PageActive(page) && active) { 500 __mem_cgroup_move_lists(pc, false); 501 continue; 502 } 503 504 scan++; 505 list_move(&pc->lru, &pc_list); 506 507 if (__isolate_lru_page(page, mode) == 0) { 508 list_move(&page->lru, dst); 509 nr_taken++; 510 } 511 } 512 513 list_splice(&pc_list, src); 514 spin_unlock(&mz->lru_lock); 515 516 *scanned = scan; 517 return nr_taken; 518 } 519 520 /* 521 * Charge the memory controller for page usage. 522 * Return 523 * 0 if the charge was successful 524 * < 0 if the cgroup is over its limit 525 */ 526 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 527 gfp_t gfp_mask, enum charge_type ctype) 528 { 529 struct mem_cgroup *mem; 530 struct page_cgroup *pc; 531 unsigned long flags; 532 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 533 struct mem_cgroup_per_zone *mz; 534 535 if (mem_cgroup_subsys.disabled) 536 return 0; 537 538 /* 539 * Should page_cgroup's go to their own slab? 540 * One could optimize the performance of the charging routine 541 * by saving a bit in the page_flags and using it as a lock 542 * to see if the cgroup page already has a page_cgroup associated 543 * with it 544 */ 545 retry: 546 lock_page_cgroup(page); 547 pc = page_get_page_cgroup(page); 548 /* 549 * The page_cgroup exists and 550 * the page has already been accounted. 551 */ 552 if (pc) { 553 VM_BUG_ON(pc->page != page); 554 VM_BUG_ON(pc->ref_cnt <= 0); 555 556 pc->ref_cnt++; 557 unlock_page_cgroup(page); 558 goto done; 559 } 560 unlock_page_cgroup(page); 561 562 pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask); 563 if (pc == NULL) 564 goto err; 565 566 /* 567 * We always charge the cgroup the mm_struct belongs to. 568 * The mm_struct's mem_cgroup changes on task migration if the 569 * thread group leader migrates. It's possible that mm is not 570 * set, if so charge the init_mm (happens for pagecache usage). 571 */ 572 if (!mm) 573 mm = &init_mm; 574 575 rcu_read_lock(); 576 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 577 /* 578 * For every charge from the cgroup, increment reference count 579 */ 580 css_get(&mem->css); 581 rcu_read_unlock(); 582 583 while (res_counter_charge(&mem->res, PAGE_SIZE)) { 584 if (!(gfp_mask & __GFP_WAIT)) 585 goto out; 586 587 if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) 588 continue; 589 590 /* 591 * try_to_free_mem_cgroup_pages() might not give us a full 592 * picture of reclaim. Some pages are reclaimed and might be 593 * moved to swap cache or just unmapped from the cgroup. 594 * Check the limit again to see if the reclaim reduced the 595 * current usage of the cgroup before giving up 596 */ 597 if (res_counter_check_under_limit(&mem->res)) 598 continue; 599 600 if (!nr_retries--) { 601 mem_cgroup_out_of_memory(mem, gfp_mask); 602 goto out; 603 } 604 } 605 606 pc->ref_cnt = 1; 607 pc->mem_cgroup = mem; 608 pc->page = page; 609 pc->flags = PAGE_CGROUP_FLAG_ACTIVE; 610 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) 611 pc->flags = PAGE_CGROUP_FLAG_CACHE; 612 613 lock_page_cgroup(page); 614 if (page_get_page_cgroup(page)) { 615 unlock_page_cgroup(page); 616 /* 617 * Another charge has been added to this page already. 618 * We take lock_page_cgroup(page) again and read 619 * page->cgroup, increment refcnt.... just retry is OK. 620 */ 621 res_counter_uncharge(&mem->res, PAGE_SIZE); 622 css_put(&mem->css); 623 kmem_cache_free(page_cgroup_cache, pc); 624 goto retry; 625 } 626 page_assign_page_cgroup(page, pc); 627 628 mz = page_cgroup_zoneinfo(pc); 629 spin_lock_irqsave(&mz->lru_lock, flags); 630 __mem_cgroup_add_list(mz, pc); 631 spin_unlock_irqrestore(&mz->lru_lock, flags); 632 633 unlock_page_cgroup(page); 634 done: 635 return 0; 636 out: 637 css_put(&mem->css); 638 kmem_cache_free(page_cgroup_cache, pc); 639 err: 640 return -ENOMEM; 641 } 642 643 int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) 644 { 645 return mem_cgroup_charge_common(page, mm, gfp_mask, 646 MEM_CGROUP_CHARGE_TYPE_MAPPED); 647 } 648 649 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 650 gfp_t gfp_mask) 651 { 652 if (!mm) 653 mm = &init_mm; 654 return mem_cgroup_charge_common(page, mm, gfp_mask, 655 MEM_CGROUP_CHARGE_TYPE_CACHE); 656 } 657 658 /* 659 * Uncharging is always a welcome operation, we never complain, simply 660 * uncharge. 661 */ 662 void mem_cgroup_uncharge_page(struct page *page) 663 { 664 struct page_cgroup *pc; 665 struct mem_cgroup *mem; 666 struct mem_cgroup_per_zone *mz; 667 unsigned long flags; 668 669 if (mem_cgroup_subsys.disabled) 670 return; 671 672 /* 673 * Check if our page_cgroup is valid 674 */ 675 lock_page_cgroup(page); 676 pc = page_get_page_cgroup(page); 677 if (!pc) 678 goto unlock; 679 680 VM_BUG_ON(pc->page != page); 681 VM_BUG_ON(pc->ref_cnt <= 0); 682 683 if (--(pc->ref_cnt) == 0) { 684 mz = page_cgroup_zoneinfo(pc); 685 spin_lock_irqsave(&mz->lru_lock, flags); 686 __mem_cgroup_remove_list(mz, pc); 687 spin_unlock_irqrestore(&mz->lru_lock, flags); 688 689 page_assign_page_cgroup(page, NULL); 690 unlock_page_cgroup(page); 691 692 mem = pc->mem_cgroup; 693 res_counter_uncharge(&mem->res, PAGE_SIZE); 694 css_put(&mem->css); 695 696 kmem_cache_free(page_cgroup_cache, pc); 697 return; 698 } 699 700 unlock: 701 unlock_page_cgroup(page); 702 } 703 704 /* 705 * Returns non-zero if a page (under migration) has valid page_cgroup member. 706 * Refcnt of page_cgroup is incremented. 707 */ 708 int mem_cgroup_prepare_migration(struct page *page) 709 { 710 struct page_cgroup *pc; 711 712 if (mem_cgroup_subsys.disabled) 713 return 0; 714 715 lock_page_cgroup(page); 716 pc = page_get_page_cgroup(page); 717 if (pc) 718 pc->ref_cnt++; 719 unlock_page_cgroup(page); 720 return pc != NULL; 721 } 722 723 void mem_cgroup_end_migration(struct page *page) 724 { 725 mem_cgroup_uncharge_page(page); 726 } 727 728 /* 729 * We know both *page* and *newpage* are now not-on-LRU and PG_locked. 730 * And no race with uncharge() routines because page_cgroup for *page* 731 * has extra one reference by mem_cgroup_prepare_migration. 732 */ 733 void mem_cgroup_page_migration(struct page *page, struct page *newpage) 734 { 735 struct page_cgroup *pc; 736 struct mem_cgroup_per_zone *mz; 737 unsigned long flags; 738 739 lock_page_cgroup(page); 740 pc = page_get_page_cgroup(page); 741 if (!pc) { 742 unlock_page_cgroup(page); 743 return; 744 } 745 746 mz = page_cgroup_zoneinfo(pc); 747 spin_lock_irqsave(&mz->lru_lock, flags); 748 __mem_cgroup_remove_list(mz, pc); 749 spin_unlock_irqrestore(&mz->lru_lock, flags); 750 751 page_assign_page_cgroup(page, NULL); 752 unlock_page_cgroup(page); 753 754 pc->page = newpage; 755 lock_page_cgroup(newpage); 756 page_assign_page_cgroup(newpage, pc); 757 758 mz = page_cgroup_zoneinfo(pc); 759 spin_lock_irqsave(&mz->lru_lock, flags); 760 __mem_cgroup_add_list(mz, pc); 761 spin_unlock_irqrestore(&mz->lru_lock, flags); 762 763 unlock_page_cgroup(newpage); 764 } 765 766 /* 767 * This routine traverse page_cgroup in given list and drop them all. 768 * This routine ignores page_cgroup->ref_cnt. 769 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 770 */ 771 #define FORCE_UNCHARGE_BATCH (128) 772 static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, 773 struct mem_cgroup_per_zone *mz, 774 int active) 775 { 776 struct page_cgroup *pc; 777 struct page *page; 778 int count = FORCE_UNCHARGE_BATCH; 779 unsigned long flags; 780 struct list_head *list; 781 782 if (active) 783 list = &mz->active_list; 784 else 785 list = &mz->inactive_list; 786 787 spin_lock_irqsave(&mz->lru_lock, flags); 788 while (!list_empty(list)) { 789 pc = list_entry(list->prev, struct page_cgroup, lru); 790 page = pc->page; 791 get_page(page); 792 spin_unlock_irqrestore(&mz->lru_lock, flags); 793 mem_cgroup_uncharge_page(page); 794 put_page(page); 795 if (--count <= 0) { 796 count = FORCE_UNCHARGE_BATCH; 797 cond_resched(); 798 } 799 spin_lock_irqsave(&mz->lru_lock, flags); 800 } 801 spin_unlock_irqrestore(&mz->lru_lock, flags); 802 } 803 804 /* 805 * make mem_cgroup's charge to be 0 if there is no task. 806 * This enables deleting this mem_cgroup. 807 */ 808 static int mem_cgroup_force_empty(struct mem_cgroup *mem) 809 { 810 int ret = -EBUSY; 811 int node, zid; 812 813 if (mem_cgroup_subsys.disabled) 814 return 0; 815 816 css_get(&mem->css); 817 /* 818 * page reclaim code (kswapd etc..) will move pages between 819 * active_list <-> inactive_list while we don't take a lock. 820 * So, we have to do loop here until all lists are empty. 821 */ 822 while (mem->res.usage > 0) { 823 if (atomic_read(&mem->css.cgroup->count) > 0) 824 goto out; 825 for_each_node_state(node, N_POSSIBLE) 826 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 827 struct mem_cgroup_per_zone *mz; 828 mz = mem_cgroup_zoneinfo(mem, node, zid); 829 /* drop all page_cgroup in active_list */ 830 mem_cgroup_force_empty_list(mem, mz, 1); 831 /* drop all page_cgroup in inactive_list */ 832 mem_cgroup_force_empty_list(mem, mz, 0); 833 } 834 } 835 ret = 0; 836 out: 837 css_put(&mem->css); 838 return ret; 839 } 840 841 static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp) 842 { 843 *tmp = memparse(buf, &buf); 844 if (*buf != '\0') 845 return -EINVAL; 846 847 /* 848 * Round up the value to the closest page size 849 */ 850 *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT; 851 return 0; 852 } 853 854 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 855 { 856 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, 857 cft->private); 858 } 859 860 static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 861 struct file *file, const char __user *userbuf, 862 size_t nbytes, loff_t *ppos) 863 { 864 return res_counter_write(&mem_cgroup_from_cont(cont)->res, 865 cft->private, userbuf, nbytes, ppos, 866 mem_cgroup_write_strategy); 867 } 868 869 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 870 { 871 struct mem_cgroup *mem; 872 873 mem = mem_cgroup_from_cont(cont); 874 switch (event) { 875 case RES_MAX_USAGE: 876 res_counter_reset_max(&mem->res); 877 break; 878 case RES_FAILCNT: 879 res_counter_reset_failcnt(&mem->res); 880 break; 881 } 882 return 0; 883 } 884 885 static int mem_force_empty_write(struct cgroup *cont, unsigned int event) 886 { 887 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont)); 888 } 889 890 static const struct mem_cgroup_stat_desc { 891 const char *msg; 892 u64 unit; 893 } mem_cgroup_stat_desc[] = { 894 [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, 895 [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, 896 [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, 897 [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, 898 }; 899 900 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 901 struct cgroup_map_cb *cb) 902 { 903 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 904 struct mem_cgroup_stat *stat = &mem_cont->stat; 905 int i; 906 907 for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { 908 s64 val; 909 910 val = mem_cgroup_read_stat(stat, i); 911 val *= mem_cgroup_stat_desc[i].unit; 912 cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); 913 } 914 /* showing # of active pages */ 915 { 916 unsigned long active, inactive; 917 918 inactive = mem_cgroup_get_all_zonestat(mem_cont, 919 MEM_CGROUP_ZSTAT_INACTIVE); 920 active = mem_cgroup_get_all_zonestat(mem_cont, 921 MEM_CGROUP_ZSTAT_ACTIVE); 922 cb->fill(cb, "active", (active) * PAGE_SIZE); 923 cb->fill(cb, "inactive", (inactive) * PAGE_SIZE); 924 } 925 return 0; 926 } 927 928 static struct cftype mem_cgroup_files[] = { 929 { 930 .name = "usage_in_bytes", 931 .private = RES_USAGE, 932 .read_u64 = mem_cgroup_read, 933 }, 934 { 935 .name = "max_usage_in_bytes", 936 .private = RES_MAX_USAGE, 937 .trigger = mem_cgroup_reset, 938 .read_u64 = mem_cgroup_read, 939 }, 940 { 941 .name = "limit_in_bytes", 942 .private = RES_LIMIT, 943 .write = mem_cgroup_write, 944 .read_u64 = mem_cgroup_read, 945 }, 946 { 947 .name = "failcnt", 948 .private = RES_FAILCNT, 949 .trigger = mem_cgroup_reset, 950 .read_u64 = mem_cgroup_read, 951 }, 952 { 953 .name = "force_empty", 954 .trigger = mem_force_empty_write, 955 }, 956 { 957 .name = "stat", 958 .read_map = mem_control_stat_show, 959 }, 960 }; 961 962 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 963 { 964 struct mem_cgroup_per_node *pn; 965 struct mem_cgroup_per_zone *mz; 966 int zone, tmp = node; 967 /* 968 * This routine is called against possible nodes. 969 * But it's BUG to call kmalloc() against offline node. 970 * 971 * TODO: this routine can waste much memory for nodes which will 972 * never be onlined. It's better to use memory hotplug callback 973 * function. 974 */ 975 if (!node_state(node, N_NORMAL_MEMORY)) 976 tmp = -1; 977 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 978 if (!pn) 979 return 1; 980 981 mem->info.nodeinfo[node] = pn; 982 memset(pn, 0, sizeof(*pn)); 983 984 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 985 mz = &pn->zoneinfo[zone]; 986 INIT_LIST_HEAD(&mz->active_list); 987 INIT_LIST_HEAD(&mz->inactive_list); 988 spin_lock_init(&mz->lru_lock); 989 } 990 return 0; 991 } 992 993 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 994 { 995 kfree(mem->info.nodeinfo[node]); 996 } 997 998 static struct mem_cgroup *mem_cgroup_alloc(void) 999 { 1000 struct mem_cgroup *mem; 1001 1002 if (sizeof(*mem) < PAGE_SIZE) 1003 mem = kmalloc(sizeof(*mem), GFP_KERNEL); 1004 else 1005 mem = vmalloc(sizeof(*mem)); 1006 1007 if (mem) 1008 memset(mem, 0, sizeof(*mem)); 1009 return mem; 1010 } 1011 1012 static void mem_cgroup_free(struct mem_cgroup *mem) 1013 { 1014 if (sizeof(*mem) < PAGE_SIZE) 1015 kfree(mem); 1016 else 1017 vfree(mem); 1018 } 1019 1020 1021 static struct cgroup_subsys_state * 1022 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 1023 { 1024 struct mem_cgroup *mem; 1025 int node; 1026 1027 if (unlikely((cont->parent) == NULL)) { 1028 mem = &init_mem_cgroup; 1029 page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC); 1030 } else { 1031 mem = mem_cgroup_alloc(); 1032 if (!mem) 1033 return ERR_PTR(-ENOMEM); 1034 } 1035 1036 res_counter_init(&mem->res); 1037 1038 for_each_node_state(node, N_POSSIBLE) 1039 if (alloc_mem_cgroup_per_zone_info(mem, node)) 1040 goto free_out; 1041 1042 return &mem->css; 1043 free_out: 1044 for_each_node_state(node, N_POSSIBLE) 1045 free_mem_cgroup_per_zone_info(mem, node); 1046 if (cont->parent != NULL) 1047 mem_cgroup_free(mem); 1048 return ERR_PTR(-ENOMEM); 1049 } 1050 1051 static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 1052 struct cgroup *cont) 1053 { 1054 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 1055 mem_cgroup_force_empty(mem); 1056 } 1057 1058 static void mem_cgroup_destroy(struct cgroup_subsys *ss, 1059 struct cgroup *cont) 1060 { 1061 int node; 1062 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 1063 1064 for_each_node_state(node, N_POSSIBLE) 1065 free_mem_cgroup_per_zone_info(mem, node); 1066 1067 mem_cgroup_free(mem_cgroup_from_cont(cont)); 1068 } 1069 1070 static int mem_cgroup_populate(struct cgroup_subsys *ss, 1071 struct cgroup *cont) 1072 { 1073 if (mem_cgroup_subsys.disabled) 1074 return 0; 1075 return cgroup_add_files(cont, ss, mem_cgroup_files, 1076 ARRAY_SIZE(mem_cgroup_files)); 1077 } 1078 1079 static void mem_cgroup_move_task(struct cgroup_subsys *ss, 1080 struct cgroup *cont, 1081 struct cgroup *old_cont, 1082 struct task_struct *p) 1083 { 1084 struct mm_struct *mm; 1085 struct mem_cgroup *mem, *old_mem; 1086 1087 if (mem_cgroup_subsys.disabled) 1088 return; 1089 1090 mm = get_task_mm(p); 1091 if (mm == NULL) 1092 return; 1093 1094 mem = mem_cgroup_from_cont(cont); 1095 old_mem = mem_cgroup_from_cont(old_cont); 1096 1097 if (mem == old_mem) 1098 goto out; 1099 1100 /* 1101 * Only thread group leaders are allowed to migrate, the mm_struct is 1102 * in effect owned by the leader 1103 */ 1104 if (!thread_group_leader(p)) 1105 goto out; 1106 1107 out: 1108 mmput(mm); 1109 } 1110 1111 struct cgroup_subsys mem_cgroup_subsys = { 1112 .name = "memory", 1113 .subsys_id = mem_cgroup_subsys_id, 1114 .create = mem_cgroup_create, 1115 .pre_destroy = mem_cgroup_pre_destroy, 1116 .destroy = mem_cgroup_destroy, 1117 .populate = mem_cgroup_populate, 1118 .attach = mem_cgroup_move_task, 1119 .early_init = 0, 1120 }; 1121