/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>

#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys;
static const int MEM_CGROUP_RECLAIM_RETRIES = 5;

/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,	/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,	/* # of pages charged as rss */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
	struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
};

/*
 * Accounting happens with interrupts disabled, so there is no need to
 * bump the preempt count here.
 */
static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx, int val)
{
	int cpu = smp_processor_id();
	stat->cpustat[cpu].count[idx] += val;
}

static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 ret = 0;
	for_each_possible_cpu(cpu)
		ret += stat->cpustat[cpu].count[idx];
	return ret;
}

/*
 * Per-zone information kept by the memory controller.
 */

enum mem_cgroup_zstat_index {
	MEM_CGROUP_ZSTAT_ACTIVE,
	MEM_CGROUP_ZSTAT_INACTIVE,

	NR_MEM_CGROUP_ZSTAT,
};

struct mem_cgroup_per_zone {
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	spinlock_t		lru_lock;
	struct list_head	active_list;
	struct list_head	inactive_list;
	unsigned long		count[NR_MEM_CGROUP_ZSTAT];
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
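/*
 * Per-node wrapper around the per-zone LRU info above.  One of these is
 * allocated for every possible node (see alloc_mem_cgroup_per_zone_info()),
 * so a cgroup's pages can be tracked per (node, zone) pair just like the
 * global LRU lists.
 */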
struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark.  Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup which is at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;

	int	prev_priority;	/* for recording reclaim priority */
	/*
	 * statistics.
	 */
	struct mem_cgroup_stat stat;
};
static struct mem_cgroup init_mem_cgroup;

/*
 * We use the lower bit of the page->page_cgroup pointer as a bit spin
 * lock.  We need to ensure that page->page_cgroup is at least two
 * byte aligned (based on comments from Nick Piggin).  But since
 * bit_spin_lock doesn't actually set that lock bit in a non-debug
 * uniprocessor kernel, we should avoid setting it here too.
 */
#define PAGE_CGROUP_LOCK_BIT	0x0
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)
#else
#define PAGE_CGROUP_LOCK	0x0
#endif

/*
 * A page_cgroup is associated with every page descriptor.  The
 * page_cgroup helps us identify which cgroup the page belongs to.
 */
struct page_cgroup {
	struct list_head lru;		/* per cgroup LRU list */
	struct page *page;
	struct mem_cgroup *mem_cgroup;
	int ref_cnt;			/* cached, mapped, migrating */
	int flags;
};
#define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
#define PAGE_CGROUP_FLAG_ACTIVE	(0x2)	/* page is active in this cgroup */

static int page_cgroup_nid(struct page_cgroup *pc)
{
	return page_to_nid(pc->page);
}

static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
{
	return page_zonenum(pc->page);
}
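/*
 * How a page is being charged: as page cache or as an anonymous/RSS
 * mapping.  The type only decides which statistics counter is updated and
 * whether PAGE_CGROUP_FLAG_CACHE is set on the page_cgroup; the charge
 * against the res_counter is the same in both cases.
 */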
enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
};

/*
 * Always modified under the lru lock, so there is no need to
 * preempt_disable() here.
 */
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
					bool charge)
{
	int val = (charge) ? 1 : -1;
	struct mem_cgroup_stat *stat = &mem->stat;

	VM_BUG_ON(!irqs_disabled());
	if (flags & PAGE_CGROUP_FLAG_CACHE)
		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val);
	else
		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
}

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);

	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
					enum mem_cgroup_zstat_index idx)
{
	int nid, zid;
	struct mem_cgroup_per_zone *mz;
	u64 total = 0;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
		}
	return total;
}

static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

static struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}

void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
{
	struct mem_cgroup *mem;

	mem = mem_cgroup_from_task(p);
	css_get(&mem->css);
	mm->mem_cgroup = mem;
}

void mm_free_cgroup(struct mm_struct *mm)
{
	css_put(&mm->mem_cgroup->css);
}

static inline int page_cgroup_locked(struct page *page)
{
	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}

static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
{
	VM_BUG_ON(!page_cgroup_locked(page));
	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
}

struct page_cgroup *page_get_page_cgroup(struct page *page)
{
	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
}

static void lock_page_cgroup(struct page *page)
{
	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}

static int try_lock_page_cgroup(struct page *page)
{
	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}

static void unlock_page_cgroup(struct page *page)
{
	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}

static void __mem_cgroup_remove_list(struct page_cgroup *pc)
{
	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);

	if (from)
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
	else
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;

	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
	list_del_init(&pc->lru);
}
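/*
 * Add @pc to the active or inactive LRU of its zone, according to
 * PAGE_CGROUP_FLAG_ACTIVE, and update the per-zone and per-cpu statistics.
 * The caller must hold mz->lru_lock with interrupts disabled, as for
 * __mem_cgroup_remove_list() above.
 */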
static void __mem_cgroup_add_list(struct page_cgroup *pc)
{
	int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);

	if (!to) {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
		list_add(&pc->lru, &mz->inactive_list);
	} else {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
		list_add(&pc->lru, &mz->active_list);
	}
	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
}

static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);

	if (from)
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
	else
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;

	if (active) {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
		list_move(&pc->lru, &mz->active_list);
	} else {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
		pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
		list_move(&pc->lru, &mz->inactive_list);
	}
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;

	task_lock(task);
	ret = task->mm && mm_match_cgroup(task->mm, mem);
	task_unlock(task);
	return ret;
}

/*
 * This routine assumes that the appropriate zone's lru lock is already held.
 */
void mem_cgroup_move_lists(struct page *page, bool active)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;

	/*
	 * We cannot lock_page_cgroup while holding zone's lru_lock,
	 * because other holders of lock_page_cgroup can be interrupted
	 * with an attempt to rotate_reclaimable_page.  But we cannot
	 * safely get to page_cgroup without it, so just try_lock it:
	 * mem_cgroup_isolate_pages allows for page left on wrong list.
	 */
	if (!try_lock_page_cgroup(page))
		return;

	pc = page_get_page_cgroup(page);
	if (pc) {
		mz = page_cgroup_zoneinfo(pc);
		spin_lock_irqsave(&mz->lru_lock, flags);
		__mem_cgroup_move_lists(pc, active);
		spin_unlock_irqrestore(&mz->lru_lock, flags);
	}
	unlock_page_cgroup(page);
}

/*
 * Calculate the mapped_ratio for this memory cgroup.  It is used by
 * vmscan.c to determine whether we have to reclaim mapped pages.
 */
int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
{
	long total, rss;

	/*
	 * usage is recorded in bytes. But, here, we assume the number of
	 * physical pages can be represented by "long" on any arch.
	 */
	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
	return (int)((rss * 100L) / total);
}

/*
 * This function is called from vmscan.c.  In the page reclaiming loop, the
 * balance between the active and inactive lists is calculated.  For memory
 * controller reclaim we should use the mem_cgroup's imbalance rather than
 * the zone's global LRU imbalance.
 */
long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
{
	unsigned long active, inactive;
	/* active and inactive are the number of pages. 'long' is ok. */
	active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
	inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE);
	return (long) (active / (inactive + 1));
}
/*
 * prev_priority control: these helpers let the memory reclaim path record
 * and query the reclaim priority for this cgroup (see the callers in
 * vmscan.c).
 */
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
	return mem->prev_priority;
}

void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	if (priority < mem->prev_priority)
		mem->prev_priority = priority;
}

void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	mem->prev_priority = priority;
}

/*
 * Calculate # of pages to be scanned in this priority/zone.
 * See also vmscan.c
 *
 * priority starts from "DEF_PRIORITY" and is decremented in each loop.
 * (see include/linux/mmzone.h)
 */

long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
				   struct zone *zone, int priority)
{
	long nr_active;
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);

	nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE);
	return (nr_active >> priority);
}

long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
					struct zone *zone, int priority)
{
	long nr_inactive;
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);

	nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
	return (nr_inactive >> priority);
}

/*
 * Isolate up to @nr_to_scan pages from this cgroup's per-zone LRU
 * (active or inactive, as requested) onto @dst, much like
 * isolate_lru_pages() does for the global LRU.  Returns the number of
 * pages taken and reports the number scanned via @scanned.
 */
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = z->zone_pgdat->node_id;
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;

	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
	if (active)
		src = &mz->active_list;
	else
		src = &mz->inactive_list;

	spin_lock(&mz->lru_lock);
	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
		if (scan >= nr_to_scan)
			break;
		page = pc->page;

		if (unlikely(!PageLRU(page)))
			continue;

		if (PageActive(page) && !active) {
			__mem_cgroup_move_lists(pc, true);
			continue;
		}
		if (!PageActive(page) && active) {
			__mem_cgroup_move_lists(pc, false);
			continue;
		}

		scan++;
		list_move(&pc->lru, &pc_list);

		if (__isolate_lru_page(page, mode) == 0) {
			list_move(&page->lru, dst);
			nr_taken++;
		}
	}

	list_splice(&pc_list, src);
	spin_unlock(&mz->lru_lock);

	*scanned = scan;
	return nr_taken;
}
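/*
 * Charging protocol: the page's page_cgroup pointer is protected by the
 * per-page bit spinlock taken via lock_page_cgroup().  A new page_cgroup
 * is allocated and charged against the res_counter while that lock is
 * dropped, so if a racing charge installed one in the meantime we back
 * out our charge and retry, taking a reference on the existing
 * page_cgroup instead.
 */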
/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
	unsigned long flags;
	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_subsys.disabled)
		return 0;

	/*
	 * Should page_cgroups get their own slab?
	 * One could optimize the performance of the charging routine
	 * by saving a bit in the page_flags and using it as a lock
	 * to see if the cgroup page already has a page_cgroup associated
	 * with it.
	 */
retry:
	lock_page_cgroup(page);
	pc = page_get_page_cgroup(page);
	/*
	 * The page_cgroup exists and
	 * the page has already been accounted.
	 */
	if (pc) {
		VM_BUG_ON(pc->page != page);
		VM_BUG_ON(pc->ref_cnt <= 0);

		pc->ref_cnt++;
		unlock_page_cgroup(page);
		goto done;
	}
	unlock_page_cgroup(page);

	pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
	if (pc == NULL)
		goto err;

	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set; if so, charge the init_mm (happens for pagecache usage).
	 */
	if (!mm)
		mm = &init_mm;

	rcu_read_lock();
	mem = rcu_dereference(mm->mem_cgroup);
	/*
	 * For every charge from the cgroup, increment reference count
	 */
	css_get(&mem->css);
	rcu_read_unlock();

	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
		if (!(gfp_mask & __GFP_WAIT))
			goto out;

		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
			continue;

		/*
		 * try_to_free_mem_cgroup_pages() might not give us a full
		 * picture of reclaim. Some pages are reclaimed and might be
		 * moved to swap cache or just unmapped from the cgroup.
		 * Check the limit again to see if the reclaim reduced the
		 * current usage of the cgroup before giving up.
		 */
		if (res_counter_check_under_limit(&mem->res))
			continue;

		if (!nr_retries--) {
			mem_cgroup_out_of_memory(mem, gfp_mask);
			goto out;
		}
		congestion_wait(WRITE, HZ/10);
	}

	pc->ref_cnt = 1;
	pc->mem_cgroup = mem;
	pc->page = page;
	pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
		pc->flags |= PAGE_CGROUP_FLAG_CACHE;

	lock_page_cgroup(page);
	if (page_get_page_cgroup(page)) {
		unlock_page_cgroup(page);
		/*
		 * Another charge has been added to this page already.
		 * We take lock_page_cgroup(page) again and read
		 * page->cgroup, increment refcnt.... just retry is OK.
		 */
		res_counter_uncharge(&mem->res, PAGE_SIZE);
		css_put(&mem->css);
		kfree(pc);
		goto retry;
	}
	page_assign_page_cgroup(page, pc);

	mz = page_cgroup_zoneinfo(pc);
	spin_lock_irqsave(&mz->lru_lock, flags);
	__mem_cgroup_add_list(pc);
	spin_unlock_irqrestore(&mz->lru_lock, flags);

	unlock_page_cgroup(page);
done:
	return 0;
out:
	css_put(&mem->css);
	kfree(pc);
err:
	return -ENOMEM;
}
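/*
 * mem_cgroup_charge() charges a page that is being mapped into user space
 * (anonymous or file-backed rss); mem_cgroup_cache_charge() charges a page
 * cache page and falls back to init_mm when no mm is available.
 */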
int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
{
	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask)
{
	if (!mm)
		mm = &init_mm;
	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_CACHE);
}

/*
 * Uncharging is always a welcome operation: we never complain, we
 * simply uncharge.
 */
void mem_cgroup_uncharge_page(struct page *page)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem;
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;

	if (mem_cgroup_subsys.disabled)
		return;

	/*
	 * Check if our page_cgroup is valid.
	 */
	lock_page_cgroup(page);
	pc = page_get_page_cgroup(page);
	if (!pc)
		goto unlock;

	VM_BUG_ON(pc->page != page);
	VM_BUG_ON(pc->ref_cnt <= 0);

	if (--(pc->ref_cnt) == 0) {
		mz = page_cgroup_zoneinfo(pc);
		spin_lock_irqsave(&mz->lru_lock, flags);
		__mem_cgroup_remove_list(pc);
		spin_unlock_irqrestore(&mz->lru_lock, flags);

		page_assign_page_cgroup(page, NULL);
		unlock_page_cgroup(page);

		mem = pc->mem_cgroup;
		res_counter_uncharge(&mem->res, PAGE_SIZE);
		css_put(&mem->css);

		kfree(pc);
		return;
	}

unlock:
	unlock_page_cgroup(page);
}

/*
 * Returns non-zero if a page (under migration) has a valid page_cgroup.
 * The page_cgroup's refcnt is incremented.
 */
int mem_cgroup_prepare_migration(struct page *page)
{
	struct page_cgroup *pc;

	if (mem_cgroup_subsys.disabled)
		return 0;

	lock_page_cgroup(page);
	pc = page_get_page_cgroup(page);
	if (pc)
		pc->ref_cnt++;
	unlock_page_cgroup(page);
	return pc != NULL;
}

void mem_cgroup_end_migration(struct page *page)
{
	mem_cgroup_uncharge_page(page);
}

/*
 * We know both *page* and *newpage* are now not-on-LRU and PG_locked.
 * And there is no race with the uncharge() routines because the
 * page_cgroup for *page* holds an extra reference taken by
 * mem_cgroup_prepare_migration.
 */
void mem_cgroup_page_migration(struct page *page, struct page *newpage)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;

	lock_page_cgroup(page);
	pc = page_get_page_cgroup(page);
	if (!pc) {
		unlock_page_cgroup(page);
		return;
	}

	mz = page_cgroup_zoneinfo(pc);
	spin_lock_irqsave(&mz->lru_lock, flags);
	__mem_cgroup_remove_list(pc);
	spin_unlock_irqrestore(&mz->lru_lock, flags);

	page_assign_page_cgroup(page, NULL);
	unlock_page_cgroup(page);

	pc->page = newpage;
	lock_page_cgroup(newpage);
	page_assign_page_cgroup(newpage, pc);

	mz = page_cgroup_zoneinfo(pc);
	spin_lock_irqsave(&mz->lru_lock, flags);
	__mem_cgroup_add_list(pc);
	spin_unlock_irqrestore(&mz->lru_lock, flags);

	unlock_page_cgroup(newpage);
}
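/*
 * memory.force_empty support: the helpers below let an administrator (or
 * rmdir via pre_destroy) drop every remaining charge so that an empty,
 * taskless cgroup can be deleted.
 */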
/*
 * This routine traverses the page_cgroups on the given list and drops
 * them all.  It ignores page_cgroup->ref_cnt, and it does not reclaim
 * the pages themselves; it only removes their page_cgroups.
 */
#define FORCE_UNCHARGE_BATCH	(128)
static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
			    struct mem_cgroup_per_zone *mz,
			    int active)
{
	struct page_cgroup *pc;
	struct page *page;
	int count = FORCE_UNCHARGE_BATCH;
	unsigned long flags;
	struct list_head *list;

	if (active)
		list = &mz->active_list;
	else
		list = &mz->inactive_list;

	spin_lock_irqsave(&mz->lru_lock, flags);
	while (!list_empty(list)) {
		pc = list_entry(list->prev, struct page_cgroup, lru);
		page = pc->page;
		/* pin the page so it cannot go away while lru_lock is dropped */
		get_page(page);
		spin_unlock_irqrestore(&mz->lru_lock, flags);
		mem_cgroup_uncharge_page(page);
		put_page(page);
		if (--count <= 0) {
			count = FORCE_UNCHARGE_BATCH;
			cond_resched();
		}
		spin_lock_irqsave(&mz->lru_lock, flags);
	}
	spin_unlock_irqrestore(&mz->lru_lock, flags);
}

/*
 * Make the mem_cgroup's charge 0 if it contains no tasks.
 * This enables deleting this mem_cgroup.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *mem)
{
	int ret = -EBUSY;
	int node, zid;

	if (mem_cgroup_subsys.disabled)
		return 0;

	css_get(&mem->css);
	/*
	 * The page reclaim code (kswapd etc..) will move pages between
	 * active_list <-> inactive_list while we don't take a lock.
	 * So, we have to loop here until all the lists are empty.
	 */
	while (mem->res.usage > 0) {
		if (atomic_read(&mem->css.cgroup->count) > 0)
			goto out;
		for_each_node_state(node, N_POSSIBLE)
			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
				struct mem_cgroup_per_zone *mz;
				mz = mem_cgroup_zoneinfo(mem, node, zid);
				/* drop all page_cgroup in active_list */
				mem_cgroup_force_empty_list(mem, mz, 1);
				/* drop all page_cgroup in inactive_list */
				mem_cgroup_force_empty_list(mem, mz, 0);
			}
	}
	ret = 0;
out:
	css_put(&mem->css);
	return ret;
}

static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
{
	*tmp = memparse(buf, &buf);
	if (*buf != '\0')
		return -EINVAL;

	/*
	 * Round up the value to the closest page size.
	 */
	*tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
	return 0;
}

static ssize_t mem_cgroup_read(struct cgroup *cont,
			struct cftype *cft, struct file *file,
			char __user *userbuf, size_t nbytes, loff_t *ppos)
{
	return res_counter_read(&mem_cgroup_from_cont(cont)->res,
				cft->private, userbuf, nbytes, ppos,
				NULL);
}

static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
				struct file *file, const char __user *userbuf,
				size_t nbytes, loff_t *ppos)
{
	return res_counter_write(&mem_cgroup_from_cont(cont)->res,
				cft->private, userbuf, nbytes, ppos,
				mem_cgroup_write_strategy);
}
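/*
 * Writing anything to memory.force_empty tries to drop all charges in
 * this cgroup; it only succeeds if the cgroup has no tasks left.
 */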
static ssize_t mem_force_empty_write(struct cgroup *cont,
				struct cftype *cft, struct file *file,
				const char __user *userbuf,
				size_t nbytes, loff_t *ppos)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	int ret = mem_cgroup_force_empty(mem);
	if (!ret)
		ret = nbytes;
	return ret;
}

/*
 * Note: This should be removed once cgroups support write-only files.
 */
static ssize_t mem_force_empty_read(struct cgroup *cont,
				struct cftype *cft,
				struct file *file, char __user *userbuf,
				size_t nbytes, loff_t *ppos)
{
	return -EINVAL;
}

static const struct mem_cgroup_stat_desc {
	const char *msg;
	u64 unit;
} mem_cgroup_stat_desc[] = {
	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
};

static int mem_control_stat_show(struct seq_file *m, void *arg)
{
	struct cgroup *cont = m->private;
	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
	struct mem_cgroup_stat *stat = &mem_cont->stat;
	int i;

	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
		s64 val;

		val = mem_cgroup_read_stat(stat, i);
		val *= mem_cgroup_stat_desc[i].unit;
		seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg,
				(long long)val);
	}
	/* showing # of active pages */
	{
		unsigned long active, inactive;

		inactive = mem_cgroup_get_all_zonestat(mem_cont,
						MEM_CGROUP_ZSTAT_INACTIVE);
		active = mem_cgroup_get_all_zonestat(mem_cont,
						MEM_CGROUP_ZSTAT_ACTIVE);
		seq_printf(m, "active %ld\n", (active) * PAGE_SIZE);
		seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE);
	}
	return 0;
}

static const struct file_operations mem_control_stat_file_operations = {
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int mem_control_stat_open(struct inode *unused, struct file *file)
{
	/* XXX __d_cont */
	struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;

	file->f_op = &mem_control_stat_file_operations;
	return single_open(file, mem_control_stat_show, cont);
}

static struct cftype mem_cgroup_files[] = {
	{
		.name = "usage_in_bytes",
		.private = RES_USAGE,
		.read = mem_cgroup_read,
	},
	{
		.name = "limit_in_bytes",
		.private = RES_LIMIT,
		.write = mem_cgroup_write,
		.read = mem_cgroup_read,
	},
	{
		.name = "failcnt",
		.private = RES_FAILCNT,
		.read = mem_cgroup_read,
	},
	{
		.name = "force_empty",
		.write = mem_force_empty_write,
		.read = mem_force_empty_read,
	},
	{
		.name = "stat",
		.open = mem_control_stat_open,
	},
};
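/*
 * The files above show up in every memory cgroup directory with a
 * "memory." prefix, e.g. memory.limit_in_bytes.  Limits written there go
 * through mem_cgroup_write_strategy(), so memparse() suffixes such as
 * "4M" are accepted and the value is rounded up to a whole page.
 */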
static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup_per_zone *mz;
	int zone, tmp = node;
	/*
	 * This routine is called against possible nodes.
	 * But it's a BUG to call kmalloc() against an offline node.
	 *
	 * TODO: this routine can waste much memory for nodes which will
	 *       never be onlined. It's better to use a memory hotplug
	 *       callback function.
	 */
	if (!node_state(node, N_NORMAL_MEMORY))
		tmp = -1;
	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
	if (!pn)
		return 1;

	mem->info.nodeinfo[node] = pn;
	memset(pn, 0, sizeof(*pn));

	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
		mz = &pn->zoneinfo[zone];
		INIT_LIST_HEAD(&mz->active_list);
		INIT_LIST_HEAD(&mz->inactive_list);
		spin_lock_init(&mz->lru_lock);
	}
	return 0;
}

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	kfree(mem->info.nodeinfo[node]);
}

static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct mem_cgroup *mem;
	int node;

	if (unlikely((cont->parent) == NULL)) {
		mem = &init_mem_cgroup;
		init_mm.mem_cgroup = mem;
	} else
		mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);

	if (mem == NULL)
		return ERR_PTR(-ENOMEM);

	res_counter_init(&mem->res);

	memset(&mem->info, 0, sizeof(mem->info));

	for_each_node_state(node, N_POSSIBLE)
		if (alloc_mem_cgroup_per_zone_info(mem, node))
			goto free_out;

	return &mem->css;
free_out:
	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);
	if (cont->parent != NULL)
		kfree(mem);
	return ERR_PTR(-ENOMEM);
}

static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
					struct cgroup *cont)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	mem_cgroup_force_empty(mem);
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	int node;
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);

	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);

	kfree(mem_cgroup_from_cont(cont));
}

static int mem_cgroup_populate(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	if (mem_cgroup_subsys.disabled)
		return 0;
	return cgroup_add_files(cont, ss, mem_cgroup_files,
					ARRAY_SIZE(mem_cgroup_files));
}

static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
{
	struct mm_struct *mm;
	struct mem_cgroup *mem, *old_mem;

	if (mem_cgroup_subsys.disabled)
		return;

	mm = get_task_mm(p);
	if (mm == NULL)
		return;

	mem = mem_cgroup_from_cont(cont);
	old_mem = mem_cgroup_from_cont(old_cont);

	if (mem == old_mem)
		goto out;

	/*
	 * Only thread group leaders are allowed to migrate; the mm_struct
	 * is in effect owned by the leader.
	 */
	if (!thread_group_leader(p))
		goto out;

	css_get(&mem->css);
	rcu_assign_pointer(mm->mem_cgroup, mem);
	css_put(&old_mem->css);

out:
	mmput(mm);
}

struct cgroup_subsys mem_cgroup_subsys = {
	.name = "memory",
	.subsys_id = mem_cgroup_subsys_id,
	.create = mem_cgroup_create,
	.pre_destroy = mem_cgroup_pre_destroy,
	.destroy = mem_cgroup_destroy,
	.populate = mem_cgroup_populate,
	.attach = mem_cgroup_move_task,
	.early_init = 0,
};