memcontrol.c: diff of mm/memcontrol.c between a9dd0a83104c01269ea36a9b4ec42b51edf85427 (old, per-zone memcg structures) and ef8f2327996b5c20f11420f64e439e87c7a01604 (new, per-node memcg structures)
1/* memcontrol.c - Memory Controller
2 *
3 * Copyright IBM Corporation, 2007
4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5 *
6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org>
8 *

--- 118 unchanged lines hidden (view full) ---

127#define SOFTLIMIT_EVENTS_TARGET 1024
128#define NUMAINFO_EVENTS_TARGET 1024
129
130/*
131 * Cgroups above their limits are maintained in a RB-Tree, independent of
132 * their hierarchy representation
133 */
134
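The hunk below swaps the per-zone soft-limit tree container for a per-node one. A minimal userspace sketch of the two layouts, assuming stub typedefs in place of the kernel's rb_root and spinlock_t, illustrative (non-kernel) values for MAX_NR_ZONES and MAX_NUMNODES, and shortened stand-in struct names:

#include <stdio.h>

/* Stand-ins for kernel types and constants (illustrative only). */
typedef struct { void *rb_node; } rb_root_t;
typedef struct { int locked; } spinlock_stub_t;
#define MAX_NR_ZONES 4
#define MAX_NUMNODES 8

/* Before: one tree per (node, zone) pair. */
struct tree_per_zone {
	rb_root_t rb_root;
	spinlock_stub_t lock;
};
struct tree_per_node_old {
	struct tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

/* After: one tree per node. */
struct tree_per_node_new {
	rb_root_t rb_root;
	spinlock_stub_t lock;
};

int main(void)
{
	printf("trees before: %d, after: %d\n",
	       MAX_NUMNODES * MAX_NR_ZONES, MAX_NUMNODES);
	return 0;
}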
135struct mem_cgroup_tree_per_zone {
135struct mem_cgroup_tree_per_node {
136 struct rb_root rb_root;
137 spinlock_t lock;
138};
139
140struct mem_cgroup_tree_per_node {
141 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
142};
143
144struct mem_cgroup_tree {
145 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
146};
147
148static struct mem_cgroup_tree soft_limit_tree __read_mostly;
149
150/* for OOM */
151struct mem_cgroup_eventfd_list {

--- 217 unchanged lines hidden (view full) ---

369 while (memcg && !(memcg->css.flags & CSS_ONLINE))
370 memcg = parent_mem_cgroup(memcg);
371 if (memcg)
372 ino = cgroup_ino(memcg->css.cgroup);
373 rcu_read_unlock();
374 return ino;
375}
376
140struct mem_cgroup_tree {
141 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
142};
143
144static struct mem_cgroup_tree soft_limit_tree __read_mostly;
145
146/* for OOM */
147struct mem_cgroup_eventfd_list {

--- 217 unchanged lines hidden (view full) ---

365 while (memcg && !(memcg->css.flags & CSS_ONLINE))
366 memcg = parent_mem_cgroup(memcg);
367 if (memcg)
368 ino = cgroup_ino(memcg->css.cgroup);
369 rcu_read_unlock();
370 return ino;
371}
372
377static struct mem_cgroup_per_zone *
378mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
373static struct mem_cgroup_per_node *
374mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
379{
380 int nid = page_to_nid(page);
375{
376 int nid = page_to_nid(page);
381 int zid = page_zonenum(page);
382
377
383 return &memcg->nodeinfo[nid]->zoneinfo[zid];
378 return memcg->nodeinfo[nid];
384}
385
379}
380
386static struct mem_cgroup_tree_per_zone *
387soft_limit_tree_node_zone(int nid, int zid)
381static struct mem_cgroup_tree_per_node *
382soft_limit_tree_node(int nid)
388{
383{
389 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
384 return soft_limit_tree.rb_tree_per_node[nid];
390}
391
385}
386
392static struct mem_cgroup_tree_per_zone *
387static struct mem_cgroup_tree_per_node *
393soft_limit_tree_from_page(struct page *page)
394{
395 int nid = page_to_nid(page);
388soft_limit_tree_from_page(struct page *page)
389{
390 int nid = page_to_nid(page);
396 int zid = page_zonenum(page);
397
391
398 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
392 return soft_limit_tree.rb_tree_per_node[nid];
399}
400
393}
394
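The three helpers above collapse the (node, zone) lookup into a plain node lookup: a page's zone index no longer selects a soft-limit tree or nodeinfo slot. A rough userspace model of that indexing change, with an invented page_stub and tree_stub in place of the kernel types:

#include <stddef.h>
#include <stdio.h>

#define MAX_NR_ZONES 4
#define MAX_NUMNODES 8

/* Stub "page": just enough to carry node and zone indices. */
struct page_stub { int nid; int zid; };

struct tree_stub { int dummy; };

/* Before: trees indexed by node, then zone. */
static struct tree_stub trees_old[MAX_NUMNODES][MAX_NR_ZONES];
/* After: trees indexed by node only. */
static struct tree_stub trees_new[MAX_NUMNODES];

static struct tree_stub *tree_from_page_old(const struct page_stub *p)
{
	return &trees_old[p->nid][p->zid];
}

static struct tree_stub *tree_from_page_new(const struct page_stub *p)
{
	return &trees_new[p->nid];	/* zone index dropped */
}

int main(void)
{
	struct page_stub p = { .nid = 1, .zid = 2 };

	printf("old slot: %p, new slot: %p\n",
	       (void *)tree_from_page_old(&p),
	       (void *)tree_from_page_new(&p));
	return 0;
}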
401static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
402 struct mem_cgroup_tree_per_zone *mctz,
395static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
396 struct mem_cgroup_tree_per_node *mctz,
403 unsigned long new_usage_in_excess)
404{
405 struct rb_node **p = &mctz->rb_root.rb_node;
406 struct rb_node *parent = NULL;
397 unsigned long new_usage_in_excess)
398{
399 struct rb_node **p = &mctz->rb_root.rb_node;
400 struct rb_node *parent = NULL;
407 struct mem_cgroup_per_zone *mz_node;
401 struct mem_cgroup_per_node *mz_node;
408
409 if (mz->on_tree)
410 return;
411
412 mz->usage_in_excess = new_usage_in_excess;
413 if (!mz->usage_in_excess)
414 return;
415 while (*p) {
416 parent = *p;
402
403 if (mz->on_tree)
404 return;
405
406 mz->usage_in_excess = new_usage_in_excess;
407 if (!mz->usage_in_excess)
408 return;
409 while (*p) {
410 parent = *p;
417 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
411 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
418 tree_node);
419 if (mz->usage_in_excess < mz_node->usage_in_excess)
420 p = &(*p)->rb_left;
 421 /*
 422 * We can't avoid mem cgroups that are over their soft
 423 * limit by the same amount
 424 */
425 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
426 p = &(*p)->rb_right;
427 }
428 rb_link_node(&mz->tree_node, parent, p);
429 rb_insert_color(&mz->tree_node, &mctz->rb_root);
430 mz->on_tree = true;
431}
432
412 tree_node);
413 if (mz->usage_in_excess < mz_node->usage_in_excess)
414 p = &(*p)->rb_left;
415 /*
416 * We can't avoid mem cgroups that are over their soft
417 * limit by the same amount
418 */
419 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
420 p = &(*p)->rb_right;
421 }
422 rb_link_node(&mz->tree_node, parent, p);
423 rb_insert_color(&mz->tree_node, &mctz->rb_root);
424 mz->on_tree = true;
425}
426
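As the comment in the hunk above notes, several cgroups can exceed their soft limit by the same amount, so equal keys are inserted to the right rather than dropped; reclaim later takes the rightmost node as the biggest offender (see __mem_cgroup_largest_soft_limit_node further down). A simplified sketch of that ordering using a plain unbalanced binary tree (the kernel uses a rebalancing rbtree) and made-up excess values:

#include <stdio.h>
#include <stdlib.h>

/* Toy node: the kernel version embeds an rb_node and gets rebalanced. */
struct toy_mz {
	unsigned long usage_in_excess;
	struct toy_mz *left, *right;
};

/* Insert keyed by usage_in_excess; ties go right, mirroring the hunk. */
static void toy_insert(struct toy_mz **root, struct toy_mz *mz)
{
	while (*root) {
		if (mz->usage_in_excess < (*root)->usage_in_excess)
			root = &(*root)->left;
		else	/* >= : equal excess is still inserted, to the right */
			root = &(*root)->right;
	}
	*root = mz;
}

/* Rightmost node has the largest excess, like rb_last() in the kernel. */
static struct toy_mz *toy_largest(struct toy_mz *root)
{
	if (!root)
		return NULL;
	while (root->right)
		root = root->right;
	return root;
}

int main(void)
{
	struct toy_mz a = { .usage_in_excess = 10 };
	struct toy_mz b = { .usage_in_excess = 40 };
	struct toy_mz c = { .usage_in_excess = 40 };	/* same excess as b */
	struct toy_mz *root = NULL;

	toy_insert(&root, &a);
	toy_insert(&root, &b);
	toy_insert(&root, &c);
	printf("largest excess: %lu\n", toy_largest(root)->usage_in_excess);
	return 0;
}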
433static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
434 struct mem_cgroup_tree_per_zone *mctz)
427static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
428 struct mem_cgroup_tree_per_node *mctz)
435{
436 if (!mz->on_tree)
437 return;
438 rb_erase(&mz->tree_node, &mctz->rb_root);
439 mz->on_tree = false;
440}
441
429{
430 if (!mz->on_tree)
431 return;
432 rb_erase(&mz->tree_node, &mctz->rb_root);
433 mz->on_tree = false;
434}
435
442static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
443 struct mem_cgroup_tree_per_zone *mctz)
436static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
437 struct mem_cgroup_tree_per_node *mctz)
444{
445 unsigned long flags;
446
447 spin_lock_irqsave(&mctz->lock, flags);
448 __mem_cgroup_remove_exceeded(mz, mctz);
449 spin_unlock_irqrestore(&mctz->lock, flags);
450}
451

--- 7 unchanged lines hidden (view full) ---

459 excess = nr_pages - soft_limit;
460
461 return excess;
462}
463
464static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
465{
466 unsigned long excess;
438{
439 unsigned long flags;
440
441 spin_lock_irqsave(&mctz->lock, flags);
442 __mem_cgroup_remove_exceeded(mz, mctz);
443 spin_unlock_irqrestore(&mctz->lock, flags);
444}
445

--- 7 unchanged lines hidden (view full) ---

453 excess = nr_pages - soft_limit;
454
455 return excess;
456}
457
458static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
459{
460 unsigned long excess;
467 struct mem_cgroup_per_zone *mz;
468 struct mem_cgroup_tree_per_zone *mctz;
461 struct mem_cgroup_per_node *mz;
462 struct mem_cgroup_tree_per_node *mctz;
469
470 mctz = soft_limit_tree_from_page(page);
471 /*
472 * Necessary to update all ancestors when hierarchy is used.
473 * because their event counter is not touched.
474 */
475 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
463
464 mctz = soft_limit_tree_from_page(page);
465 /*
466 * Necessary to update all ancestors when hierarchy is used.
467 * because their event counter is not touched.
468 */
469 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
476 mz = mem_cgroup_page_zoneinfo(memcg, page);
470 mz = mem_cgroup_page_nodeinfo(memcg, page);
477 excess = soft_limit_excess(memcg);
478 /*
479 * We have to update the tree if mz is on RB-tree or
480 * mem is over its softlimit.
481 */
482 if (excess || mz->on_tree) {
483 unsigned long flags;
484

--- 8 unchanged lines hidden (view full) ---

493 __mem_cgroup_insert_exceeded(mz, mctz, excess);
494 spin_unlock_irqrestore(&mctz->lock, flags);
495 }
496 }
497}
498
499static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
500{
471 excess = soft_limit_excess(memcg);
472 /*
473 * We have to update the tree if mz is on RB-tree or
474 * mem is over its softlimit.
475 */
476 if (excess || mz->on_tree) {
477 unsigned long flags;
478

--- 8 unchanged lines hidden (view full) ---

487 __mem_cgroup_insert_exceeded(mz, mctz, excess);
488 spin_unlock_irqrestore(&mctz->lock, flags);
489 }
490 }
491}
492
493static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
494{
501 struct mem_cgroup_tree_per_zone *mctz;
502 struct mem_cgroup_per_zone *mz;
503 int nid, zid;
495 struct mem_cgroup_tree_per_node *mctz;
496 struct mem_cgroup_per_node *mz;
497 int nid;
504
505 for_each_node(nid) {
498
499 for_each_node(nid) {
506 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
507 mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
508 mctz = soft_limit_tree_node_zone(nid, zid);
509 mem_cgroup_remove_exceeded(mz, mctz);
510 }
500 mz = mem_cgroup_nodeinfo(memcg, nid);
501 mctz = soft_limit_tree_node(nid);
502 mem_cgroup_remove_exceeded(mz, mctz);
511 }
512}
513
503 }
504}
505
514static struct mem_cgroup_per_zone *
515__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
506static struct mem_cgroup_per_node *
507__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
516{
517 struct rb_node *rightmost = NULL;
508{
509 struct rb_node *rightmost = NULL;
518 struct mem_cgroup_per_zone *mz;
510 struct mem_cgroup_per_node *mz;
519
520retry:
521 mz = NULL;
522 rightmost = rb_last(&mctz->rb_root);
523 if (!rightmost)
524 goto done; /* Nothing to reclaim from */
525
511
512retry:
513 mz = NULL;
514 rightmost = rb_last(&mctz->rb_root);
515 if (!rightmost)
516 goto done; /* Nothing to reclaim from */
517
526 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
518 mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node);
527 /*
528 * Remove the node now but someone else can add it back,
 529 * we will add it back at the end of reclaim to its correct
530 * position in the tree.
531 */
532 __mem_cgroup_remove_exceeded(mz, mctz);
533 if (!soft_limit_excess(mz->memcg) ||
534 !css_tryget_online(&mz->memcg->css))
535 goto retry;
536done:
537 return mz;
538}
539
519 /*
520 * Remove the node now but someone else can add it back,
 521 * we will add it back at the end of reclaim to its correct
522 * position in the tree.
523 */
524 __mem_cgroup_remove_exceeded(mz, mctz);
525 if (!soft_limit_excess(mz->memcg) ||
526 !css_tryget_online(&mz->memcg->css))
527 goto retry;
528done:
529 return mz;
530}
531
540static struct mem_cgroup_per_zone *
541mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
532static struct mem_cgroup_per_node *
533mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
542{
534{
543 struct mem_cgroup_per_zone *mz;
535 struct mem_cgroup_per_node *mz;
544
545 spin_lock_irq(&mctz->lock);
546 mz = __mem_cgroup_largest_soft_limit_node(mctz);
547 spin_unlock_irq(&mctz->lock);
548 return mz;
549}
550
551/*

--- 77 unchanged lines hidden (view full) ---

629
630 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
631}
632
633unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
634 int nid, unsigned int lru_mask)
635{
636 unsigned long nr = 0;
536
537 spin_lock_irq(&mctz->lock);
538 mz = __mem_cgroup_largest_soft_limit_node(mctz);
539 spin_unlock_irq(&mctz->lock);
540 return mz;
541}
542
543/*

--- 77 unchanged lines hidden (view full) ---

621
622 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
623}
624
625unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
626 int nid, unsigned int lru_mask)
627{
628 unsigned long nr = 0;
637 int zid;
629 struct mem_cgroup_per_node *mz;
630 enum lru_list lru;
638
639 VM_BUG_ON((unsigned)nid >= nr_node_ids);
640
631
632 VM_BUG_ON((unsigned)nid >= nr_node_ids);
633
641 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
642 struct mem_cgroup_per_zone *mz;
643 enum lru_list lru;
644
645 for_each_lru(lru) {
646 if (!(BIT(lru) & lru_mask))
647 continue;
648 mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
649 nr += mz->lru_size[lru];
650 }
634 for_each_lru(lru) {
635 if (!(BIT(lru) & lru_mask))
636 continue;
637 mz = mem_cgroup_nodeinfo(memcg, nid);
638 nr += mz->lru_size[lru];
651 }
652 return nr;
653}
654
655static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
656 unsigned int lru_mask)
657{
658 unsigned long nr = 0;

--- 136 unchanged lines hidden (view full) ---

795 if (prev)
796 goto out;
797 return root;
798 }
799
800 rcu_read_lock();
801
802 if (reclaim) {
639 }
640 return nr;
641}
642
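With a single set of per-node counters, mem_cgroup_node_nr_lru_pages sums lru_size[] once per node instead of once per zone, still filtering lists through the caller's lru_mask. A small sketch of that mask-and-sum pattern with invented LRU names and counts:

#include <stdio.h>

enum toy_lru { TOY_INACTIVE_ANON, TOY_ACTIVE_ANON,
	       TOY_INACTIVE_FILE, TOY_ACTIVE_FILE, TOY_NR_LRU };
#define BIT(n) (1UL << (n))

/* Per-node LRU sizes; the numbers are made up for illustration. */
static unsigned long lru_size[TOY_NR_LRU] = { 100, 20, 300, 50 };

static unsigned long node_nr_lru_pages(unsigned long lru_mask)
{
	unsigned long nr = 0;
	int lru;

	for (lru = 0; lru < TOY_NR_LRU; lru++) {
		if (!(BIT(lru) & lru_mask))
			continue;	/* list not requested by the caller */
		nr += lru_size[lru];
	}
	return nr;
}

int main(void)
{
	/* Count only the file LRUs, as a caller might do via the mask. */
	unsigned long mask = BIT(TOY_INACTIVE_FILE) | BIT(TOY_ACTIVE_FILE);

	printf("file pages: %lu\n", node_nr_lru_pages(mask));
	return 0;
}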
643static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
644 unsigned int lru_mask)
645{
646 unsigned long nr = 0;

--- 136 unchanged lines hidden (view full) ---

783 if (prev)
784 goto out;
785 return root;
786 }
787
788 rcu_read_lock();
789
790 if (reclaim) {
803 struct mem_cgroup_per_zone *mz;
791 struct mem_cgroup_per_node *mz;
804
792
805 mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
793 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
806 iter = &mz->iter[reclaim->priority];
807
808 if (prev && reclaim->generation != iter->generation)
809 goto out_unlock;
810
811 while (1) {
812 pos = READ_ONCE(iter->position);
813 if (!pos || css_tryget(&pos->css))

--- 82 unchanged lines hidden (view full) ---

896 if (prev && prev != root)
897 css_put(&prev->css);
898}
899
900static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
901{
902 struct mem_cgroup *memcg = dead_memcg;
903 struct mem_cgroup_reclaim_iter *iter;
794 iter = &mz->iter[reclaim->priority];
795
796 if (prev && reclaim->generation != iter->generation)
797 goto out_unlock;
798
799 while (1) {
800 pos = READ_ONCE(iter->position);
801 if (!pos || css_tryget(&pos->css))

--- 82 unchanged lines hidden (view full) ---

884 if (prev && prev != root)
885 css_put(&prev->css);
886}
887
888static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
889{
890 struct mem_cgroup *memcg = dead_memcg;
891 struct mem_cgroup_reclaim_iter *iter;
904 struct mem_cgroup_per_zone *mz;
905 int nid, zid;
892 struct mem_cgroup_per_node *mz;
893 int nid;
906 int i;
907
908 while ((memcg = parent_mem_cgroup(memcg))) {
909 for_each_node(nid) {
894 int i;
895
896 while ((memcg = parent_mem_cgroup(memcg))) {
897 for_each_node(nid) {
910 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
911 mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
912 for (i = 0; i <= DEF_PRIORITY; i++) {
913 iter = &mz->iter[i];
914 cmpxchg(&iter->position,
915 dead_memcg, NULL);
916 }
898 mz = mem_cgroup_nodeinfo(memcg, nid);
899 for (i = 0; i <= DEF_PRIORITY; i++) {
900 iter = &mz->iter[i];
901 cmpxchg(&iter->position,
902 dead_memcg, NULL);
917 }
918 }
919 }
920}
921
922/*
923 * Iteration constructs for visiting all cgroups (under a tree). If
924 * loops are exited prematurely (break), mem_cgroup_iter_break() must

--- 15 unchanged lines hidden (view full) ---

940 * @zone: zone of the page
941 *
942 * This function is only safe when following the LRU page isolation
943 * and putback protocol: the LRU lock must be held, and the page must
944 * either be PageLRU() or the caller must have isolated/allocated it.
945 */
946struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
947{
903 }
904 }
905 }
906}
907
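invalidate_reclaim_iterators now walks one set of per-node iterators for each ancestor and relies on cmpxchg() so a cached position is cleared only if it still points at the dying memcg. A rough userspace analogue using C11 atomics instead of the kernel cmpxchg; the array layout and names are invented for the sketch:

#include <stdatomic.h>
#include <stdio.h>

#define TOY_NR_NODES 2
#define TOY_NR_PRIO 3

struct toy_memcg { const char *name; };

/* One cached iterator position per (node, priority), like iter->position. */
static _Atomic(struct toy_memcg *) position[TOY_NR_NODES][TOY_NR_PRIO];

static void toy_invalidate(struct toy_memcg *dead)
{
	int nid, prio;

	for (nid = 0; nid < TOY_NR_NODES; nid++) {
		for (prio = 0; prio < TOY_NR_PRIO; prio++) {
			struct toy_memcg *expected = dead;

			/* Clear only if the slot still caches the dead memcg. */
			atomic_compare_exchange_strong(&position[nid][prio],
						       &expected, NULL);
		}
	}
}

int main(void)
{
	struct toy_memcg dead = { "dying" }, other = { "live" };

	position[0][0] = &dead;
	position[1][2] = &other;
	toy_invalidate(&dead);
	printf("slot[0][0]=%p, slot[1][2] keeps %s\n",
	       (void *)atomic_load(&position[0][0]),
	       atomic_load(&position[1][2])->name);
	return 0;
}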
908/*
909 * Iteration constructs for visiting all cgroups (under a tree). If
910 * loops are exited prematurely (break), mem_cgroup_iter_break() must

--- 15 unchanged lines hidden (view full) ---

926 * @zone: zone of the page
927 *
928 * This function is only safe when following the LRU page isolation
929 * and putback protocol: the LRU lock must be held, and the page must
930 * either be PageLRU() or the caller must have isolated/allocated it.
931 */
932struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
933{
948 struct mem_cgroup_per_zone *mz;
934 struct mem_cgroup_per_node *mz;
949 struct mem_cgroup *memcg;
950 struct lruvec *lruvec;
951
952 if (mem_cgroup_disabled()) {
953 lruvec = &pgdat->lruvec;
954 goto out;
955 }
956
957 memcg = page->mem_cgroup;
958 /*
959 * Swapcache readahead pages are added to the LRU - and
960 * possibly migrated - before they are charged.
961 */
962 if (!memcg)
963 memcg = root_mem_cgroup;
964
935 struct mem_cgroup *memcg;
936 struct lruvec *lruvec;
937
938 if (mem_cgroup_disabled()) {
939 lruvec = &pgdat->lruvec;
940 goto out;
941 }
942
943 memcg = page->mem_cgroup;
944 /*
945 * Swapcache readahead pages are added to the LRU - and
946 * possibly migrated - before they are charged.
947 */
948 if (!memcg)
949 memcg = root_mem_cgroup;
950
965 mz = mem_cgroup_page_zoneinfo(memcg, page);
951 mz = mem_cgroup_page_nodeinfo(memcg, page);
966 lruvec = &mz->lruvec;
967out:
968 /*
969 * Since a node can be onlined after the mem_cgroup was created,
970 * we have to be prepared to initialize lruvec->zone here;
971 * and if offlined then reonlined, we need to reinitialize it.
972 */
973 if (unlikely(lruvec->pgdat != pgdat))

--- 10 unchanged lines hidden (view full) ---

984 *
985 * This function must be called under lru_lock, just before a page is added
986 * to or just after a page is removed from an lru list (that ordering being
987 * so as to allow it to check that lru_size 0 is consistent with list_empty).
988 */
989void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
990 enum zone_type zid, int nr_pages)
991{
952 lruvec = &mz->lruvec;
953out:
954 /*
955 * Since a node can be onlined after the mem_cgroup was created,
956 * we have to be prepared to initialize lruvec->zone here;
957 * and if offlined then reonlined, we need to reinitialize it.
958 */
959 if (unlikely(lruvec->pgdat != pgdat))

--- 10 unchanged lines hidden (view full) ---

970 *
971 * This function must be called under lru_lock, just before a page is added
972 * to or just after a page is removed from an lru list (that ordering being
973 * so as to allow it to check that lru_size 0 is consistent with list_empty).
974 */
975void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
976 enum zone_type zid, int nr_pages)
977{
992 struct mem_cgroup_per_zone *mz;
978 struct mem_cgroup_per_node *mz;
993 unsigned long *lru_size;
994 long size;
995 bool empty;
996
997 __update_lru_size(lruvec, lru, zid, nr_pages);
998
999 if (mem_cgroup_disabled())
1000 return;
1001
979 unsigned long *lru_size;
980 long size;
981 bool empty;
982
983 __update_lru_size(lruvec, lru, zid, nr_pages);
984
985 if (mem_cgroup_disabled())
986 return;
987
1002 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
988 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1003 lru_size = mz->lru_size + lru;
1004 empty = list_empty(lruvec->lists + lru);
1005
1006 if (nr_pages < 0)
1007 *lru_size += nr_pages;
1008
1009 size = *lru_size;
1010 if (WARN_ONCE(size < 0 || empty != !size,

--- 376 unchanged lines hidden (view full) ---

1387#else
1388int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1389{
1390 return 0;
1391}
1392#endif
1393
1394static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
989 lru_size = mz->lru_size + lru;
990 empty = list_empty(lruvec->lists + lru);
991
992 if (nr_pages < 0)
993 *lru_size += nr_pages;
994
995 size = *lru_size;
996 if (WARN_ONCE(size < 0 || empty != !size,

--- 376 unchanged lines hidden (view full) ---

1373#else
1374int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1375{
1376 return 0;
1377}
1378#endif
1379
1380static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1395 struct zone *zone,
1381 pg_data_t *pgdat,
1396 gfp_t gfp_mask,
1397 unsigned long *total_scanned)
1398{
1399 struct mem_cgroup *victim = NULL;
1400 int total = 0;
1401 int loop = 0;
1402 unsigned long excess;
1403 unsigned long nr_scanned;
1404 struct mem_cgroup_reclaim_cookie reclaim = {
1382 gfp_t gfp_mask,
1383 unsigned long *total_scanned)
1384{
1385 struct mem_cgroup *victim = NULL;
1386 int total = 0;
1387 int loop = 0;
1388 unsigned long excess;
1389 unsigned long nr_scanned;
1390 struct mem_cgroup_reclaim_cookie reclaim = {
1405 .zone = zone,
1391 .pgdat = pgdat,
1406 .priority = 0,
1407 };
1408
1409 excess = soft_limit_excess(root_memcg);
1410
1411 while (1) {
1412 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1413 if (!victim) {

--- 14 unchanged lines hidden (view full) ---

1428 */
1429 if (total >= (excess >> 2) ||
1430 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1431 break;
1432 }
1433 continue;
1434 }
1435 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1392 .priority = 0,
1393 };
1394
1395 excess = soft_limit_excess(root_memcg);
1396
1397 while (1) {
1398 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1399 if (!victim) {

--- 14 unchanged lines hidden (view full) ---

1414 */
1415 if (total >= (excess >> 2) ||
1416 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1417 break;
1418 }
1419 continue;
1420 }
1421 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1436 zone, &nr_scanned);
1422 pgdat, &nr_scanned);
1437 *total_scanned += nr_scanned;
1438 if (!soft_limit_excess(root_memcg))
1439 break;
1440 }
1441 mem_cgroup_iter_break(root_memcg, victim);
1442 return total;
1443}
1444

--- 1110 unchanged lines hidden (view full) ---

2555 } while (retry_count);
2556
2557 if (!ret && enlarge)
2558 memcg_oom_recover(memcg);
2559
2560 return ret;
2561}
2562
1423 *total_scanned += nr_scanned;
1424 if (!soft_limit_excess(root_memcg))
1425 break;
1426 }
1427 mem_cgroup_iter_break(root_memcg, victim);
1428 return total;
1429}
1430
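mem_cgroup_soft_reclaim now receives the pgdat rather than a zone, but its bail-out rule is unchanged: stop once roughly a quarter of the original excess has been reclaimed or after MEM_CGROUP_MAX_RECLAIM_LOOPS passes. A loose sketch of that stop condition with invented numbers and a stubbed reclaim step (in the kernel the check sits on the no-victim path of the iterator loop; the sketch flattens it):

#include <stdio.h>

#define TOY_MAX_RECLAIM_LOOPS 100	/* stand-in for the kernel constant */

/* Stub: pretend each pass reclaims a fixed number of pages. */
static unsigned long toy_shrink_node(void)
{
	return 16;
}

static unsigned long toy_soft_reclaim(unsigned long excess)
{
	unsigned long total = 0;
	int loop = 0;

	while (1) {
		loop++;
		/* Stop after a quarter of the excess, or after too many loops. */
		if (total >= (excess >> 2) || loop > TOY_MAX_RECLAIM_LOOPS)
			break;
		total += toy_shrink_node();
	}
	return total;
}

int main(void)
{
	printf("reclaimed %lu of excess 1000\n", toy_soft_reclaim(1000));
	return 0;
}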

--- 1110 unchanged lines hidden (view full) ---

2541 } while (retry_count);
2542
2543 if (!ret && enlarge)
2544 memcg_oom_recover(memcg);
2545
2546 return ret;
2547}
2548
2563unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2549unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
2564 gfp_t gfp_mask,
2565 unsigned long *total_scanned)
2566{
2567 unsigned long nr_reclaimed = 0;
2550 gfp_t gfp_mask,
2551 unsigned long *total_scanned)
2552{
2553 unsigned long nr_reclaimed = 0;
2568 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
2554 struct mem_cgroup_per_node *mz, *next_mz = NULL;
2569 unsigned long reclaimed;
2570 int loop = 0;
2555 unsigned long reclaimed;
2556 int loop = 0;
2571 struct mem_cgroup_tree_per_zone *mctz;
2557 struct mem_cgroup_tree_per_node *mctz;
2572 unsigned long excess;
2573 unsigned long nr_scanned;
2574
2575 if (order > 0)
2576 return 0;
2577
2558 unsigned long excess;
2559 unsigned long nr_scanned;
2560
2561 if (order > 0)
2562 return 0;
2563
2578 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
2564 mctz = soft_limit_tree_node(pgdat->node_id);
2579 /*
2580 * This loop can run a while, specially if mem_cgroup's continuously
2581 * keep exceeding their soft limit and putting the system under
2582 * pressure
2583 */
2584 do {
2585 if (next_mz)
2586 mz = next_mz;
2587 else
2588 mz = mem_cgroup_largest_soft_limit_node(mctz);
2589 if (!mz)
2590 break;
2591
2592 nr_scanned = 0;
2565 /*
2566 * This loop can run a while, specially if mem_cgroup's continuously
2567 * keep exceeding their soft limit and putting the system under
2568 * pressure
2569 */
2570 do {
2571 if (next_mz)
2572 mz = next_mz;
2573 else
2574 mz = mem_cgroup_largest_soft_limit_node(mctz);
2575 if (!mz)
2576 break;
2577
2578 nr_scanned = 0;
2593 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
2579 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
2594 gfp_mask, &nr_scanned);
2595 nr_reclaimed += reclaimed;
2596 *total_scanned += nr_scanned;
2597 spin_lock_irq(&mctz->lock);
2598 __mem_cgroup_remove_exceeded(mz, mctz);
2599
2600 /*
2601 * If we failed to reclaim anything from this memory cgroup

--- 604 unchanged lines hidden (view full) ---

3206
3207 for_each_mem_cgroup_tree(mi, memcg)
3208 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
3209 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
3210 }
3211
3212#ifdef CONFIG_DEBUG_VM
3213 {
2580 gfp_mask, &nr_scanned);
2581 nr_reclaimed += reclaimed;
2582 *total_scanned += nr_scanned;
2583 spin_lock_irq(&mctz->lock);
2584 __mem_cgroup_remove_exceeded(mz, mctz);
2585
2586 /*
2587 * If we failed to reclaim anything from this memory cgroup

--- 604 unchanged lines hidden (view full) ---

3192
3193 for_each_mem_cgroup_tree(mi, memcg)
3194 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
3195 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
3196 }
3197
3198#ifdef CONFIG_DEBUG_VM
3199 {
3214 int nid, zid;
3215 struct mem_cgroup_per_zone *mz;
3200 pg_data_t *pgdat;
3201 struct mem_cgroup_per_node *mz;
3216 struct zone_reclaim_stat *rstat;
3217 unsigned long recent_rotated[2] = {0, 0};
3218 unsigned long recent_scanned[2] = {0, 0};
3219
3202 struct zone_reclaim_stat *rstat;
3203 unsigned long recent_rotated[2] = {0, 0};
3204 unsigned long recent_scanned[2] = {0, 0};
3205
3220 for_each_online_node(nid)
3221 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3222 mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
3223 rstat = &mz->lruvec.reclaim_stat;
3206 for_each_online_pgdat(pgdat) {
3207 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3208 rstat = &mz->lruvec.reclaim_stat;
3224
3209
3225 recent_rotated[0] += rstat->recent_rotated[0];
3226 recent_rotated[1] += rstat->recent_rotated[1];
3227 recent_scanned[0] += rstat->recent_scanned[0];
3228 recent_scanned[1] += rstat->recent_scanned[1];
3229 }
3210 recent_rotated[0] += rstat->recent_rotated[0];
3211 recent_rotated[1] += rstat->recent_rotated[1];
3212 recent_scanned[0] += rstat->recent_scanned[0];
3213 recent_scanned[1] += rstat->recent_scanned[1];
3214 }
3230 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3231 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3232 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3233 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
3234 }
3235#endif
3236
3237 return 0;

--- 863 unchanged lines hidden (view full) ---

4101 * Caller must hold rcu_read_lock().
4102 */
4103struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
4104{
4105 WARN_ON_ONCE(!rcu_read_lock_held());
4106 return idr_find(&mem_cgroup_idr, id);
4107}
4108
3215 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3216 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3217 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3218 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
3219 }
3220#endif
3221
3222 return 0;

--- 863 unchanged lines hidden (view full) ---

4086 * Caller must hold rcu_read_lock().
4087 */
4088struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
4089{
4090 WARN_ON_ONCE(!rcu_read_lock_held());
4091 return idr_find(&mem_cgroup_idr, id);
4092}
4093
4109static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4094static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4110{
4111 struct mem_cgroup_per_node *pn;
4095{
4096 struct mem_cgroup_per_node *pn;
4112 struct mem_cgroup_per_zone *mz;
4113 int zone, tmp = node;
4097 int tmp = node;
4114 /*
4115 * This routine is called against possible nodes.
4116 * But it's BUG to call kmalloc() against offline node.
4117 *
4118 * TODO: this routine can waste much memory for nodes which will
4119 * never be onlined. It's better to use memory hotplug callback
4120 * function.
4121 */
4122 if (!node_state(node, N_NORMAL_MEMORY))
4123 tmp = -1;
4124 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4125 if (!pn)
4126 return 1;
4127
4098 /*
4099 * This routine is called against possible nodes.
4100 * But it's BUG to call kmalloc() against offline node.
4101 *
4102 * TODO: this routine can waste much memory for nodes which will
4103 * never be onlined. It's better to use memory hotplug callback
4104 * function.
4105 */
4106 if (!node_state(node, N_NORMAL_MEMORY))
4107 tmp = -1;
4108 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4109 if (!pn)
4110 return 1;
4111
4128 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4129 mz = &pn->zoneinfo[zone];
4130 lruvec_init(&mz->lruvec);
4131 mz->usage_in_excess = 0;
4132 mz->on_tree = false;
4133 mz->memcg = memcg;
4134 }
4112 lruvec_init(&pn->lruvec);
4113 pn->usage_in_excess = 0;
4114 pn->on_tree = false;
4115 pn->memcg = memcg;
4116
4135 memcg->nodeinfo[node] = pn;
4136 return 0;
4137}
4138
4117 memcg->nodeinfo[node] = pn;
4118 return 0;
4119}
4120
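With the per-zone array gone, alloc_mem_cgroup_per_node_info sets up a single lruvec and excess record per node. A compact userspace sketch of the flattened allocation: field names follow the hunk, the types are stubs, and the NUMA-aware kzalloc_node() is replaced by plain calloc():

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define TOY_MAX_NODES 4

struct toy_lruvec { int lists; };

/* Flattened per-node info: one lruvec and one excess record per node. */
struct toy_per_node {
	struct toy_lruvec lruvec;
	unsigned long usage_in_excess;
	bool on_tree;
};

struct toy_memcg {
	struct toy_per_node *nodeinfo[TOY_MAX_NODES];
};

static int toy_alloc_per_node_info(struct toy_memcg *memcg, int node)
{
	/* kzalloc_node() in the kernel; plain zeroed allocation here. */
	struct toy_per_node *pn = calloc(1, sizeof(*pn));

	if (!pn)
		return 1;
	pn->usage_in_excess = 0;
	pn->on_tree = false;
	memcg->nodeinfo[node] = pn;
	return 0;
}

int main(void)
{
	struct toy_memcg memcg = { { NULL } };
	int node;

	for (node = 0; node < TOY_MAX_NODES; node++)
		if (toy_alloc_per_node_info(&memcg, node))
			return 1;
	printf("allocated %d per-node records\n", TOY_MAX_NODES);
	for (node = 0; node < TOY_MAX_NODES; node++)
		free(memcg.nodeinfo[node]);
	return 0;
}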
4139static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4121static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4140{
4141 kfree(memcg->nodeinfo[node]);
4142}
4143
4144static void mem_cgroup_free(struct mem_cgroup *memcg)
4145{
4146 int node;
4147
4148 memcg_wb_domain_exit(memcg);
4149 for_each_node(node)
4122{
4123 kfree(memcg->nodeinfo[node]);
4124}
4125
4126static void mem_cgroup_free(struct mem_cgroup *memcg)
4127{
4128 int node;
4129
4130 memcg_wb_domain_exit(memcg);
4131 for_each_node(node)
4150 free_mem_cgroup_per_zone_info(memcg, node);
4132 free_mem_cgroup_per_node_info(memcg, node);
4151 free_percpu(memcg->stat);
4152 kfree(memcg);
4153}
4154
4155static struct mem_cgroup *mem_cgroup_alloc(void)
4156{
4157 struct mem_cgroup *memcg;
4158 size_t size;

--- 12 unchanged lines hidden (view full) ---

4171 if (memcg->id.id < 0)
4172 goto fail;
4173
4174 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4175 if (!memcg->stat)
4176 goto fail;
4177
4178 for_each_node(node)
4133 free_percpu(memcg->stat);
4134 kfree(memcg);
4135}
4136
4137static struct mem_cgroup *mem_cgroup_alloc(void)
4138{
4139 struct mem_cgroup *memcg;
4140 size_t size;

--- 12 unchanged lines hidden (view full) ---

4153 if (memcg->id.id < 0)
4154 goto fail;
4155
4156 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4157 if (!memcg->stat)
4158 goto fail;
4159
4160 for_each_node(node)
4179 if (alloc_mem_cgroup_per_zone_info(memcg, node))
4161 if (alloc_mem_cgroup_per_node_info(memcg, node))
4180 goto fail;
4181
4182 if (memcg_wb_domain_init(memcg, GFP_KERNEL))
4183 goto fail;
4184
4185 INIT_WORK(&memcg->high_work, high_work_func);
4186 memcg->last_scanned_node = MAX_NUMNODES;
4187 INIT_LIST_HEAD(&memcg->oom_notify);

--- 1586 unchanged lines hidden (view full) ---

5774 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
5775
5776 for_each_possible_cpu(cpu)
5777 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
5778 drain_local_stock);
5779
5780 for_each_node(node) {
5781 struct mem_cgroup_tree_per_node *rtpn;
4162 goto fail;
4163
4164 if (memcg_wb_domain_init(memcg, GFP_KERNEL))
4165 goto fail;
4166
4167 INIT_WORK(&memcg->high_work, high_work_func);
4168 memcg->last_scanned_node = MAX_NUMNODES;
4169 INIT_LIST_HEAD(&memcg->oom_notify);

--- 1586 unchanged lines hidden (view full) ---

5756 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
5757
5758 for_each_possible_cpu(cpu)
5759 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
5760 drain_local_stock);
5761
5762 for_each_node(node) {
5763 struct mem_cgroup_tree_per_node *rtpn;
5782 int zone;
5783
5784 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
5785 node_online(node) ? node : NUMA_NO_NODE);
5786
5764
5765 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
5766 node_online(node) ? node : NUMA_NO_NODE);
5767
5787 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5788 struct mem_cgroup_tree_per_zone *rtpz;
5789
5790 rtpz = &rtpn->rb_tree_per_zone[zone];
5791 rtpz->rb_root = RB_ROOT;
5792 spin_lock_init(&rtpz->lock);
5793 }
5768 rtpn->rb_root = RB_ROOT;
5769 spin_lock_init(&rtpn->lock);
5794 soft_limit_tree.rb_tree_per_node[node] = rtpn;
5795 }
5796
5797 return 0;
5798}
5799subsys_initcall(mem_cgroup_init);
5800
5801#ifdef CONFIG_MEMCG_SWAP

--- 264 unchanged lines hidden ---
5770 soft_limit_tree.rb_tree_per_node[node] = rtpn;
5771 }
5772
5773 return 0;
5774}
5775subsys_initcall(mem_cgroup_init);
5776
5777#ifdef CONFIG_MEMCG_SWAP

--- 264 unchanged lines hidden ---