xref: /openbmc/linux/mm/memcontrol.c (revision 9d56dd3b)
1 /* memcontrol.c - Memory Controller
2  *
3  * Copyright IBM Corporation, 2007
4  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5  *
6  * Copyright 2007 OpenVZ SWsoft Inc
7  * Author: Pavel Emelianov <xemul@openvz.org>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  */
19 
20 #include <linux/res_counter.h>
21 #include <linux/memcontrol.h>
22 #include <linux/cgroup.h>
23 #include <linux/mm.h>
24 #include <linux/pagemap.h>
25 #include <linux/smp.h>
26 #include <linux/page-flags.h>
27 #include <linux/backing-dev.h>
28 #include <linux/bit_spinlock.h>
29 #include <linux/rcupdate.h>
30 #include <linux/limits.h>
31 #include <linux/mutex.h>
32 #include <linux/rbtree.h>
33 #include <linux/slab.h>
34 #include <linux/swap.h>
35 #include <linux/spinlock.h>
36 #include <linux/fs.h>
37 #include <linux/seq_file.h>
38 #include <linux/vmalloc.h>
39 #include <linux/mm_inline.h>
40 #include <linux/page_cgroup.h>
41 #include <linux/cpu.h>
42 #include "internal.h"
43 
44 #include <asm/uaccess.h>
45 
46 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
47 #define MEM_CGROUP_RECLAIM_RETRIES	5
48 struct mem_cgroup *root_mem_cgroup __read_mostly;
49 
50 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
51 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
52 int do_swap_account __read_mostly;
53 static int really_do_swap_account __initdata = 1; /* for remember boot option*/
54 #else
55 #define do_swap_account		(0)
56 #endif
57 
58 #define SOFTLIMIT_EVENTS_THRESH (1000)
59 
60 /*
61  * Statistics for memory cgroup.
62  */
63 enum mem_cgroup_stat_index {
64 	/*
65 	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
66 	 */
67 	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
68 	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
69 	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
70 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
71 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
72 	MEM_CGROUP_STAT_EVENTS,	/* sum of pagein + pageout for internal use */
73 	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
74 
75 	MEM_CGROUP_STAT_NSTATS,
76 };
77 
78 struct mem_cgroup_stat_cpu {
79 	s64 count[MEM_CGROUP_STAT_NSTATS];
80 } ____cacheline_aligned_in_smp;
81 
82 struct mem_cgroup_stat {
83 	struct mem_cgroup_stat_cpu cpustat[0];
84 };
85 
86 static inline void
87 __mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
88 				enum mem_cgroup_stat_index idx)
89 {
90 	stat->count[idx] = 0;
91 }
92 
93 static inline s64
94 __mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
95 				enum mem_cgroup_stat_index idx)
96 {
97 	return stat->count[idx];
98 }
99 
100 /*
101  * For accounting under irq disable, no need for increment preempt count.
102  */
103 static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
104 		enum mem_cgroup_stat_index idx, int val)
105 {
106 	stat->count[idx] += val;
107 }
108 
109 static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
110 		enum mem_cgroup_stat_index idx)
111 {
112 	int cpu;
113 	s64 ret = 0;
114 	for_each_possible_cpu(cpu)
115 		ret += stat->cpustat[cpu].count[idx];
116 	return ret;
117 }
118 
119 static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
120 {
121 	s64 ret;
122 
123 	ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
124 	ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
125 	return ret;
126 }
127 
128 /*
129  * per-zone information in memory controller.
130  */
131 struct mem_cgroup_per_zone {
132 	/*
133 	 * spin_lock to protect the per cgroup LRU
134 	 */
135 	struct list_head	lists[NR_LRU_LISTS];
136 	unsigned long		count[NR_LRU_LISTS];
137 
138 	struct zone_reclaim_stat reclaim_stat;
139 	struct rb_node		tree_node;	/* RB tree node */
140 	unsigned long long	usage_in_excess;/* Set to the value by which */
141 						/* the soft limit is exceeded*/
142 	bool			on_tree;
143 	struct mem_cgroup	*mem;		/* Back pointer, we cannot */
144 						/* use container_of	   */
145 };
146 /* Macro for accessing counter */
147 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
148 
149 struct mem_cgroup_per_node {
150 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
151 };
152 
153 struct mem_cgroup_lru_info {
154 	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
155 };
156 
157 /*
158  * Cgroups above their limits are maintained in a RB-Tree, independent of
159  * their hierarchy representation
160  */
161 
162 struct mem_cgroup_tree_per_zone {
163 	struct rb_root rb_root;
164 	spinlock_t lock;
165 };
166 
167 struct mem_cgroup_tree_per_node {
168 	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
169 };
170 
171 struct mem_cgroup_tree {
172 	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
173 };
174 
175 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
176 
177 /*
178  * The memory controller data structure. The memory controller controls both
179  * page cache and RSS per cgroup. We would eventually like to provide
180  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
181  * to help the administrator determine what knobs to tune.
182  *
183  * TODO: Add a water mark for the memory controller. Reclaim will begin when
184  * we hit the water mark. May be even add a low water mark, such that
185  * no reclaim occurs from a cgroup at it's low water mark, this is
186  * a feature that will be implemented much later in the future.
187  */
188 struct mem_cgroup {
189 	struct cgroup_subsys_state css;
190 	/*
191 	 * the counter to account for memory usage
192 	 */
193 	struct res_counter res;
194 	/*
195 	 * the counter to account for mem+swap usage.
196 	 */
197 	struct res_counter memsw;
198 	/*
199 	 * Per cgroup active and inactive list, similar to the
200 	 * per zone LRU lists.
201 	 */
202 	struct mem_cgroup_lru_info info;
203 
204 	/*
205 	  protect against reclaim related member.
206 	*/
207 	spinlock_t reclaim_param_lock;
208 
209 	int	prev_priority;	/* for recording reclaim priority */
210 
211 	/*
212 	 * While reclaiming in a hierarchy, we cache the last child we
213 	 * reclaimed from.
214 	 */
215 	int last_scanned_child;
216 	/*
217 	 * Should the accounting and control be hierarchical, per subtree?
218 	 */
219 	bool use_hierarchy;
220 	unsigned long	last_oom_jiffies;
221 	atomic_t	refcnt;
222 
223 	unsigned int	swappiness;
224 
225 	/* set when res.limit == memsw.limit */
226 	bool		memsw_is_minimum;
227 
228 	/*
229 	 * statistics. This must be placed at the end of memcg.
230 	 */
231 	struct mem_cgroup_stat stat;
232 };
233 
234 /*
235  * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
236  * limit reclaim to prevent infinite loops, if they ever occur.
237  */
238 #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
239 #define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)
240 
241 enum charge_type {
242 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
243 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
244 	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
245 	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
246 	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
247 	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
248 	NR_CHARGE_TYPE,
249 };
250 
251 /* only for here (for easy reading.) */
252 #define PCGF_CACHE	(1UL << PCG_CACHE)
253 #define PCGF_USED	(1UL << PCG_USED)
254 #define PCGF_LOCK	(1UL << PCG_LOCK)
255 /* Not used, but added here for completeness */
256 #define PCGF_ACCT	(1UL << PCG_ACCT)
257 
258 /* for encoding cft->private value on file */
259 #define _MEM			(0)
260 #define _MEMSWAP		(1)
261 #define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
262 #define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
263 #define MEMFILE_ATTR(val)	((val) & 0xffff)
264 
265 /*
266  * Reclaim flags for mem_cgroup_hierarchical_reclaim
267  */
268 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
269 #define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
270 #define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
271 #define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
272 #define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
273 #define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
274 
275 static void mem_cgroup_get(struct mem_cgroup *mem);
276 static void mem_cgroup_put(struct mem_cgroup *mem);
277 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
278 static void drain_all_stock_async(void);
279 
280 static struct mem_cgroup_per_zone *
281 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
282 {
283 	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
284 }
285 
286 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
287 {
288 	return &mem->css;
289 }
290 
291 static struct mem_cgroup_per_zone *
292 page_cgroup_zoneinfo(struct page_cgroup *pc)
293 {
294 	struct mem_cgroup *mem = pc->mem_cgroup;
295 	int nid = page_cgroup_nid(pc);
296 	int zid = page_cgroup_zid(pc);
297 
298 	if (!mem)
299 		return NULL;
300 
301 	return mem_cgroup_zoneinfo(mem, nid, zid);
302 }
303 
304 static struct mem_cgroup_tree_per_zone *
305 soft_limit_tree_node_zone(int nid, int zid)
306 {
307 	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
308 }
309 
310 static struct mem_cgroup_tree_per_zone *
311 soft_limit_tree_from_page(struct page *page)
312 {
313 	int nid = page_to_nid(page);
314 	int zid = page_zonenum(page);
315 
316 	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
317 }
318 
319 static void
320 __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
321 				struct mem_cgroup_per_zone *mz,
322 				struct mem_cgroup_tree_per_zone *mctz,
323 				unsigned long long new_usage_in_excess)
324 {
325 	struct rb_node **p = &mctz->rb_root.rb_node;
326 	struct rb_node *parent = NULL;
327 	struct mem_cgroup_per_zone *mz_node;
328 
329 	if (mz->on_tree)
330 		return;
331 
332 	mz->usage_in_excess = new_usage_in_excess;
333 	if (!mz->usage_in_excess)
334 		return;
335 	while (*p) {
336 		parent = *p;
337 		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
338 					tree_node);
339 		if (mz->usage_in_excess < mz_node->usage_in_excess)
340 			p = &(*p)->rb_left;
341 		/*
342 		 * We can't avoid mem cgroups that are over their soft
343 		 * limit by the same amount
344 		 */
345 		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
346 			p = &(*p)->rb_right;
347 	}
348 	rb_link_node(&mz->tree_node, parent, p);
349 	rb_insert_color(&mz->tree_node, &mctz->rb_root);
350 	mz->on_tree = true;
351 }
352 
353 static void
354 __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
355 				struct mem_cgroup_per_zone *mz,
356 				struct mem_cgroup_tree_per_zone *mctz)
357 {
358 	if (!mz->on_tree)
359 		return;
360 	rb_erase(&mz->tree_node, &mctz->rb_root);
361 	mz->on_tree = false;
362 }
363 
364 static void
365 mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
366 				struct mem_cgroup_per_zone *mz,
367 				struct mem_cgroup_tree_per_zone *mctz)
368 {
369 	spin_lock(&mctz->lock);
370 	__mem_cgroup_remove_exceeded(mem, mz, mctz);
371 	spin_unlock(&mctz->lock);
372 }
373 
374 static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
375 {
376 	bool ret = false;
377 	int cpu;
378 	s64 val;
379 	struct mem_cgroup_stat_cpu *cpustat;
380 
381 	cpu = get_cpu();
382 	cpustat = &mem->stat.cpustat[cpu];
383 	val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
384 	if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
385 		__mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
386 		ret = true;
387 	}
388 	put_cpu();
389 	return ret;
390 }
391 
392 static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
393 {
394 	unsigned long long excess;
395 	struct mem_cgroup_per_zone *mz;
396 	struct mem_cgroup_tree_per_zone *mctz;
397 	int nid = page_to_nid(page);
398 	int zid = page_zonenum(page);
399 	mctz = soft_limit_tree_from_page(page);
400 
401 	/*
402 	 * Necessary to update all ancestors when hierarchy is used.
403 	 * because their event counter is not touched.
404 	 */
405 	for (; mem; mem = parent_mem_cgroup(mem)) {
406 		mz = mem_cgroup_zoneinfo(mem, nid, zid);
407 		excess = res_counter_soft_limit_excess(&mem->res);
408 		/*
409 		 * We have to update the tree if mz is on RB-tree or
410 		 * mem is over its softlimit.
411 		 */
412 		if (excess || mz->on_tree) {
413 			spin_lock(&mctz->lock);
414 			/* if on-tree, remove it */
415 			if (mz->on_tree)
416 				__mem_cgroup_remove_exceeded(mem, mz, mctz);
417 			/*
418 			 * Insert again. mz->usage_in_excess will be updated.
419 			 * If excess is 0, no tree ops.
420 			 */
421 			__mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
422 			spin_unlock(&mctz->lock);
423 		}
424 	}
425 }
426 
427 static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
428 {
429 	int node, zone;
430 	struct mem_cgroup_per_zone *mz;
431 	struct mem_cgroup_tree_per_zone *mctz;
432 
433 	for_each_node_state(node, N_POSSIBLE) {
434 		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
435 			mz = mem_cgroup_zoneinfo(mem, node, zone);
436 			mctz = soft_limit_tree_node_zone(node, zone);
437 			mem_cgroup_remove_exceeded(mem, mz, mctz);
438 		}
439 	}
440 }
441 
442 static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
443 {
444 	return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
445 }
446 
447 static struct mem_cgroup_per_zone *
448 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
449 {
450 	struct rb_node *rightmost = NULL;
451 	struct mem_cgroup_per_zone *mz;
452 
453 retry:
454 	mz = NULL;
455 	rightmost = rb_last(&mctz->rb_root);
456 	if (!rightmost)
457 		goto done;		/* Nothing to reclaim from */
458 
459 	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
460 	/*
461 	 * Remove the node now but someone else can add it back,
462 	 * we will to add it back at the end of reclaim to its correct
463 	 * position in the tree.
464 	 */
465 	__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
466 	if (!res_counter_soft_limit_excess(&mz->mem->res) ||
467 		!css_tryget(&mz->mem->css))
468 		goto retry;
469 done:
470 	return mz;
471 }
472 
473 static struct mem_cgroup_per_zone *
474 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
475 {
476 	struct mem_cgroup_per_zone *mz;
477 
478 	spin_lock(&mctz->lock);
479 	mz = __mem_cgroup_largest_soft_limit_node(mctz);
480 	spin_unlock(&mctz->lock);
481 	return mz;
482 }
483 
484 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
485 					 bool charge)
486 {
487 	int val = (charge) ? 1 : -1;
488 	struct mem_cgroup_stat *stat = &mem->stat;
489 	struct mem_cgroup_stat_cpu *cpustat;
490 	int cpu = get_cpu();
491 
492 	cpustat = &stat->cpustat[cpu];
493 	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
494 	put_cpu();
495 }
496 
497 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
498 					 struct page_cgroup *pc,
499 					 bool charge)
500 {
501 	int val = (charge) ? 1 : -1;
502 	struct mem_cgroup_stat *stat = &mem->stat;
503 	struct mem_cgroup_stat_cpu *cpustat;
504 	int cpu = get_cpu();
505 
506 	cpustat = &stat->cpustat[cpu];
507 	if (PageCgroupCache(pc))
508 		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
509 	else
510 		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
511 
512 	if (charge)
513 		__mem_cgroup_stat_add_safe(cpustat,
514 				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
515 	else
516 		__mem_cgroup_stat_add_safe(cpustat,
517 				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
518 	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
519 	put_cpu();
520 }
521 
522 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
523 					enum lru_list idx)
524 {
525 	int nid, zid;
526 	struct mem_cgroup_per_zone *mz;
527 	u64 total = 0;
528 
529 	for_each_online_node(nid)
530 		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
531 			mz = mem_cgroup_zoneinfo(mem, nid, zid);
532 			total += MEM_CGROUP_ZSTAT(mz, idx);
533 		}
534 	return total;
535 }
536 
537 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
538 {
539 	return container_of(cgroup_subsys_state(cont,
540 				mem_cgroup_subsys_id), struct mem_cgroup,
541 				css);
542 }
543 
544 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
545 {
546 	/*
547 	 * mm_update_next_owner() may clear mm->owner to NULL
548 	 * if it races with swapoff, page migration, etc.
549 	 * So this can be called with p == NULL.
550 	 */
551 	if (unlikely(!p))
552 		return NULL;
553 
554 	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
555 				struct mem_cgroup, css);
556 }
557 
558 static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
559 {
560 	struct mem_cgroup *mem = NULL;
561 
562 	if (!mm)
563 		return NULL;
564 	/*
565 	 * Because we have no locks, mm->owner's may be being moved to other
566 	 * cgroup. We use css_tryget() here even if this looks
567 	 * pessimistic (rather than adding locks here).
568 	 */
569 	rcu_read_lock();
570 	do {
571 		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
572 		if (unlikely(!mem))
573 			break;
574 	} while (!css_tryget(&mem->css));
575 	rcu_read_unlock();
576 	return mem;
577 }
578 
579 /*
580  * Call callback function against all cgroup under hierarchy tree.
581  */
582 static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
583 			  int (*func)(struct mem_cgroup *, void *))
584 {
585 	int found, ret, nextid;
586 	struct cgroup_subsys_state *css;
587 	struct mem_cgroup *mem;
588 
589 	if (!root->use_hierarchy)
590 		return (*func)(root, data);
591 
592 	nextid = 1;
593 	do {
594 		ret = 0;
595 		mem = NULL;
596 
597 		rcu_read_lock();
598 		css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
599 				   &found);
600 		if (css && css_tryget(css))
601 			mem = container_of(css, struct mem_cgroup, css);
602 		rcu_read_unlock();
603 
604 		if (mem) {
605 			ret = (*func)(mem, data);
606 			css_put(&mem->css);
607 		}
608 		nextid = found + 1;
609 	} while (!ret && css);
610 
611 	return ret;
612 }
613 
614 static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
615 {
616 	return (mem == root_mem_cgroup);
617 }
618 
619 /*
620  * Following LRU functions are allowed to be used without PCG_LOCK.
621  * Operations are called by routine of global LRU independently from memcg.
622  * What we have to take care of here is validness of pc->mem_cgroup.
623  *
624  * Changes to pc->mem_cgroup happens when
625  * 1. charge
626  * 2. moving account
627  * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
628  * It is added to LRU before charge.
629  * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
630  * When moving account, the page is not on LRU. It's isolated.
631  */
632 
633 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
634 {
635 	struct page_cgroup *pc;
636 	struct mem_cgroup_per_zone *mz;
637 
638 	if (mem_cgroup_disabled())
639 		return;
640 	pc = lookup_page_cgroup(page);
641 	/* can happen while we handle swapcache. */
642 	if (!TestClearPageCgroupAcctLRU(pc))
643 		return;
644 	VM_BUG_ON(!pc->mem_cgroup);
645 	/*
646 	 * We don't check PCG_USED bit. It's cleared when the "page" is finally
647 	 * removed from global LRU.
648 	 */
649 	mz = page_cgroup_zoneinfo(pc);
650 	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
651 	if (mem_cgroup_is_root(pc->mem_cgroup))
652 		return;
653 	VM_BUG_ON(list_empty(&pc->lru));
654 	list_del_init(&pc->lru);
655 	return;
656 }
657 
658 void mem_cgroup_del_lru(struct page *page)
659 {
660 	mem_cgroup_del_lru_list(page, page_lru(page));
661 }
662 
663 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
664 {
665 	struct mem_cgroup_per_zone *mz;
666 	struct page_cgroup *pc;
667 
668 	if (mem_cgroup_disabled())
669 		return;
670 
671 	pc = lookup_page_cgroup(page);
672 	/*
673 	 * Used bit is set without atomic ops but after smp_wmb().
674 	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
675 	 */
676 	smp_rmb();
677 	/* unused or root page is not rotated. */
678 	if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
679 		return;
680 	mz = page_cgroup_zoneinfo(pc);
681 	list_move(&pc->lru, &mz->lists[lru]);
682 }
683 
684 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
685 {
686 	struct page_cgroup *pc;
687 	struct mem_cgroup_per_zone *mz;
688 
689 	if (mem_cgroup_disabled())
690 		return;
691 	pc = lookup_page_cgroup(page);
692 	VM_BUG_ON(PageCgroupAcctLRU(pc));
693 	/*
694 	 * Used bit is set without atomic ops but after smp_wmb().
695 	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
696 	 */
697 	smp_rmb();
698 	if (!PageCgroupUsed(pc))
699 		return;
700 
701 	mz = page_cgroup_zoneinfo(pc);
702 	MEM_CGROUP_ZSTAT(mz, lru) += 1;
703 	SetPageCgroupAcctLRU(pc);
704 	if (mem_cgroup_is_root(pc->mem_cgroup))
705 		return;
706 	list_add(&pc->lru, &mz->lists[lru]);
707 }
708 
709 /*
710  * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to
711  * lru because the page may.be reused after it's fully uncharged (because of
712  * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge
713  * it again. This function is only used to charge SwapCache. It's done under
714  * lock_page and expected that zone->lru_lock is never held.
715  */
716 static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
717 {
718 	unsigned long flags;
719 	struct zone *zone = page_zone(page);
720 	struct page_cgroup *pc = lookup_page_cgroup(page);
721 
722 	spin_lock_irqsave(&zone->lru_lock, flags);
723 	/*
724 	 * Forget old LRU when this page_cgroup is *not* used. This Used bit
725 	 * is guarded by lock_page() because the page is SwapCache.
726 	 */
727 	if (!PageCgroupUsed(pc))
728 		mem_cgroup_del_lru_list(page, page_lru(page));
729 	spin_unlock_irqrestore(&zone->lru_lock, flags);
730 }
731 
732 static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
733 {
734 	unsigned long flags;
735 	struct zone *zone = page_zone(page);
736 	struct page_cgroup *pc = lookup_page_cgroup(page);
737 
738 	spin_lock_irqsave(&zone->lru_lock, flags);
739 	/* link when the page is linked to LRU but page_cgroup isn't */
740 	if (PageLRU(page) && !PageCgroupAcctLRU(pc))
741 		mem_cgroup_add_lru_list(page, page_lru(page));
742 	spin_unlock_irqrestore(&zone->lru_lock, flags);
743 }
744 
745 
746 void mem_cgroup_move_lists(struct page *page,
747 			   enum lru_list from, enum lru_list to)
748 {
749 	if (mem_cgroup_disabled())
750 		return;
751 	mem_cgroup_del_lru_list(page, from);
752 	mem_cgroup_add_lru_list(page, to);
753 }
754 
755 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
756 {
757 	int ret;
758 	struct mem_cgroup *curr = NULL;
759 
760 	task_lock(task);
761 	rcu_read_lock();
762 	curr = try_get_mem_cgroup_from_mm(task->mm);
763 	rcu_read_unlock();
764 	task_unlock(task);
765 	if (!curr)
766 		return 0;
767 	/*
768 	 * We should check use_hierarchy of "mem" not "curr". Because checking
769 	 * use_hierarchy of "curr" here make this function true if hierarchy is
770 	 * enabled in "curr" and "curr" is a child of "mem" in *cgroup*
771 	 * hierarchy(even if use_hierarchy is disabled in "mem").
772 	 */
773 	if (mem->use_hierarchy)
774 		ret = css_is_ancestor(&curr->css, &mem->css);
775 	else
776 		ret = (curr == mem);
777 	css_put(&curr->css);
778 	return ret;
779 }
780 
781 /*
782  * prev_priority control...this will be used in memory reclaim path.
783  */
784 int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
785 {
786 	int prev_priority;
787 
788 	spin_lock(&mem->reclaim_param_lock);
789 	prev_priority = mem->prev_priority;
790 	spin_unlock(&mem->reclaim_param_lock);
791 
792 	return prev_priority;
793 }
794 
795 void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
796 {
797 	spin_lock(&mem->reclaim_param_lock);
798 	if (priority < mem->prev_priority)
799 		mem->prev_priority = priority;
800 	spin_unlock(&mem->reclaim_param_lock);
801 }
802 
803 void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
804 {
805 	spin_lock(&mem->reclaim_param_lock);
806 	mem->prev_priority = priority;
807 	spin_unlock(&mem->reclaim_param_lock);
808 }
809 
810 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
811 {
812 	unsigned long active;
813 	unsigned long inactive;
814 	unsigned long gb;
815 	unsigned long inactive_ratio;
816 
817 	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
818 	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
819 
820 	gb = (inactive + active) >> (30 - PAGE_SHIFT);
821 	if (gb)
822 		inactive_ratio = int_sqrt(10 * gb);
823 	else
824 		inactive_ratio = 1;
825 
826 	if (present_pages) {
827 		present_pages[0] = inactive;
828 		present_pages[1] = active;
829 	}
830 
831 	return inactive_ratio;
832 }
833 
834 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
835 {
836 	unsigned long active;
837 	unsigned long inactive;
838 	unsigned long present_pages[2];
839 	unsigned long inactive_ratio;
840 
841 	inactive_ratio = calc_inactive_ratio(memcg, present_pages);
842 
843 	inactive = present_pages[0];
844 	active = present_pages[1];
845 
846 	if (inactive * inactive_ratio < active)
847 		return 1;
848 
849 	return 0;
850 }
851 
852 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
853 {
854 	unsigned long active;
855 	unsigned long inactive;
856 
857 	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
858 	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
859 
860 	return (active > inactive);
861 }
862 
863 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
864 				       struct zone *zone,
865 				       enum lru_list lru)
866 {
867 	int nid = zone->zone_pgdat->node_id;
868 	int zid = zone_idx(zone);
869 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
870 
871 	return MEM_CGROUP_ZSTAT(mz, lru);
872 }
873 
874 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
875 						      struct zone *zone)
876 {
877 	int nid = zone->zone_pgdat->node_id;
878 	int zid = zone_idx(zone);
879 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
880 
881 	return &mz->reclaim_stat;
882 }
883 
884 struct zone_reclaim_stat *
885 mem_cgroup_get_reclaim_stat_from_page(struct page *page)
886 {
887 	struct page_cgroup *pc;
888 	struct mem_cgroup_per_zone *mz;
889 
890 	if (mem_cgroup_disabled())
891 		return NULL;
892 
893 	pc = lookup_page_cgroup(page);
894 	/*
895 	 * Used bit is set without atomic ops but after smp_wmb().
896 	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
897 	 */
898 	smp_rmb();
899 	if (!PageCgroupUsed(pc))
900 		return NULL;
901 
902 	mz = page_cgroup_zoneinfo(pc);
903 	if (!mz)
904 		return NULL;
905 
906 	return &mz->reclaim_stat;
907 }
908 
909 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
910 					struct list_head *dst,
911 					unsigned long *scanned, int order,
912 					int mode, struct zone *z,
913 					struct mem_cgroup *mem_cont,
914 					int active, int file)
915 {
916 	unsigned long nr_taken = 0;
917 	struct page *page;
918 	unsigned long scan;
919 	LIST_HEAD(pc_list);
920 	struct list_head *src;
921 	struct page_cgroup *pc, *tmp;
922 	int nid = z->zone_pgdat->node_id;
923 	int zid = zone_idx(z);
924 	struct mem_cgroup_per_zone *mz;
925 	int lru = LRU_FILE * file + active;
926 	int ret;
927 
928 	BUG_ON(!mem_cont);
929 	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
930 	src = &mz->lists[lru];
931 
932 	scan = 0;
933 	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
934 		if (scan >= nr_to_scan)
935 			break;
936 
937 		page = pc->page;
938 		if (unlikely(!PageCgroupUsed(pc)))
939 			continue;
940 		if (unlikely(!PageLRU(page)))
941 			continue;
942 
943 		scan++;
944 		ret = __isolate_lru_page(page, mode, file);
945 		switch (ret) {
946 		case 0:
947 			list_move(&page->lru, dst);
948 			mem_cgroup_del_lru(page);
949 			nr_taken++;
950 			break;
951 		case -EBUSY:
952 			/* we don't affect global LRU but rotate in our LRU */
953 			mem_cgroup_rotate_lru_list(page, page_lru(page));
954 			break;
955 		default:
956 			break;
957 		}
958 	}
959 
960 	*scanned = scan;
961 	return nr_taken;
962 }
963 
964 #define mem_cgroup_from_res_counter(counter, member)	\
965 	container_of(counter, struct mem_cgroup, member)
966 
967 static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
968 {
969 	if (do_swap_account) {
970 		if (res_counter_check_under_limit(&mem->res) &&
971 			res_counter_check_under_limit(&mem->memsw))
972 			return true;
973 	} else
974 		if (res_counter_check_under_limit(&mem->res))
975 			return true;
976 	return false;
977 }
978 
979 static unsigned int get_swappiness(struct mem_cgroup *memcg)
980 {
981 	struct cgroup *cgrp = memcg->css.cgroup;
982 	unsigned int swappiness;
983 
984 	/* root ? */
985 	if (cgrp->parent == NULL)
986 		return vm_swappiness;
987 
988 	spin_lock(&memcg->reclaim_param_lock);
989 	swappiness = memcg->swappiness;
990 	spin_unlock(&memcg->reclaim_param_lock);
991 
992 	return swappiness;
993 }
994 
995 static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
996 {
997 	int *val = data;
998 	(*val)++;
999 	return 0;
1000 }
1001 
1002 /**
1003  * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode.
1004  * @memcg: The memory cgroup that went over limit
1005  * @p: Task that is going to be killed
1006  *
1007  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1008  * enabled
1009  */
1010 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1011 {
1012 	struct cgroup *task_cgrp;
1013 	struct cgroup *mem_cgrp;
1014 	/*
1015 	 * Need a buffer in BSS, can't rely on allocations. The code relies
1016 	 * on the assumption that OOM is serialized for memory controller.
1017 	 * If this assumption is broken, revisit this code.
1018 	 */
1019 	static char memcg_name[PATH_MAX];
1020 	int ret;
1021 
1022 	if (!memcg || !p)
1023 		return;
1024 
1025 
1026 	rcu_read_lock();
1027 
1028 	mem_cgrp = memcg->css.cgroup;
1029 	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1030 
1031 	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1032 	if (ret < 0) {
1033 		/*
1034 		 * Unfortunately, we are unable to convert to a useful name
1035 		 * But we'll still print out the usage information
1036 		 */
1037 		rcu_read_unlock();
1038 		goto done;
1039 	}
1040 	rcu_read_unlock();
1041 
1042 	printk(KERN_INFO "Task in %s killed", memcg_name);
1043 
1044 	rcu_read_lock();
1045 	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1046 	if (ret < 0) {
1047 		rcu_read_unlock();
1048 		goto done;
1049 	}
1050 	rcu_read_unlock();
1051 
1052 	/*
1053 	 * Continues from above, so we don't need an KERN_ level
1054 	 */
1055 	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1056 done:
1057 
1058 	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1059 		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1060 		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1061 		res_counter_read_u64(&memcg->res, RES_FAILCNT));
1062 	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1063 		"failcnt %llu\n",
1064 		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1065 		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1066 		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1067 }
1068 
1069 /*
1070  * This function returns the number of memcg under hierarchy tree. Returns
1071  * 1(self count) if no children.
1072  */
1073 static int mem_cgroup_count_children(struct mem_cgroup *mem)
1074 {
1075 	int num = 0;
1076  	mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
1077 	return num;
1078 }
1079 
1080 /*
1081  * Visit the first child (need not be the first child as per the ordering
1082  * of the cgroup list, since we track last_scanned_child) of @mem and use
1083  * that to reclaim free pages from.
1084  */
1085 static struct mem_cgroup *
1086 mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1087 {
1088 	struct mem_cgroup *ret = NULL;
1089 	struct cgroup_subsys_state *css;
1090 	int nextid, found;
1091 
1092 	if (!root_mem->use_hierarchy) {
1093 		css_get(&root_mem->css);
1094 		ret = root_mem;
1095 	}
1096 
1097 	while (!ret) {
1098 		rcu_read_lock();
1099 		nextid = root_mem->last_scanned_child + 1;
1100 		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
1101 				   &found);
1102 		if (css && css_tryget(css))
1103 			ret = container_of(css, struct mem_cgroup, css);
1104 
1105 		rcu_read_unlock();
1106 		/* Updates scanning parameter */
1107 		spin_lock(&root_mem->reclaim_param_lock);
1108 		if (!css) {
1109 			/* this means start scan from ID:1 */
1110 			root_mem->last_scanned_child = 0;
1111 		} else
1112 			root_mem->last_scanned_child = found;
1113 		spin_unlock(&root_mem->reclaim_param_lock);
1114 	}
1115 
1116 	return ret;
1117 }
1118 
1119 /*
1120  * Scan the hierarchy if needed to reclaim memory. We remember the last child
1121  * we reclaimed from, so that we don't end up penalizing one child extensively
1122  * based on its position in the children list.
1123  *
1124  * root_mem is the original ancestor that we've been reclaim from.
1125  *
1126  * We give up and return to the caller when we visit root_mem twice.
1127  * (other groups can be removed while we're walking....)
1128  *
1129  * If shrink==true, for avoiding to free too much, this returns immedieately.
1130  */
1131 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1132 						struct zone *zone,
1133 						gfp_t gfp_mask,
1134 						unsigned long reclaim_options)
1135 {
1136 	struct mem_cgroup *victim;
1137 	int ret, total = 0;
1138 	int loop = 0;
1139 	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1140 	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1141 	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1142 	unsigned long excess = mem_cgroup_get_excess(root_mem);
1143 
1144 	/* If memsw_is_minimum==1, swap-out is of-no-use. */
1145 	if (root_mem->memsw_is_minimum)
1146 		noswap = true;
1147 
1148 	while (1) {
1149 		victim = mem_cgroup_select_victim(root_mem);
1150 		if (victim == root_mem) {
1151 			loop++;
1152 			if (loop >= 1)
1153 				drain_all_stock_async();
1154 			if (loop >= 2) {
1155 				/*
1156 				 * If we have not been able to reclaim
1157 				 * anything, it might because there are
1158 				 * no reclaimable pages under this hierarchy
1159 				 */
1160 				if (!check_soft || !total) {
1161 					css_put(&victim->css);
1162 					break;
1163 				}
1164 				/*
1165 				 * We want to do more targetted reclaim.
1166 				 * excess >> 2 is not to excessive so as to
1167 				 * reclaim too much, nor too less that we keep
1168 				 * coming back to reclaim from this cgroup
1169 				 */
1170 				if (total >= (excess >> 2) ||
1171 					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1172 					css_put(&victim->css);
1173 					break;
1174 				}
1175 			}
1176 		}
1177 		if (!mem_cgroup_local_usage(&victim->stat)) {
1178 			/* this cgroup's local usage == 0 */
1179 			css_put(&victim->css);
1180 			continue;
1181 		}
1182 		/* we use swappiness of local cgroup */
1183 		if (check_soft)
1184 			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1185 				noswap, get_swappiness(victim), zone,
1186 				zone->zone_pgdat->node_id);
1187 		else
1188 			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1189 						noswap, get_swappiness(victim));
1190 		css_put(&victim->css);
1191 		/*
1192 		 * At shrinking usage, we can't check we should stop here or
1193 		 * reclaim more. It's depends on callers. last_scanned_child
1194 		 * will work enough for keeping fairness under tree.
1195 		 */
1196 		if (shrink)
1197 			return ret;
1198 		total += ret;
1199 		if (check_soft) {
1200 			if (res_counter_check_under_soft_limit(&root_mem->res))
1201 				return total;
1202 		} else if (mem_cgroup_check_under_limit(root_mem))
1203 			return 1 + total;
1204 	}
1205 	return total;
1206 }
1207 
1208 bool mem_cgroup_oom_called(struct task_struct *task)
1209 {
1210 	bool ret = false;
1211 	struct mem_cgroup *mem;
1212 	struct mm_struct *mm;
1213 
1214 	rcu_read_lock();
1215 	mm = task->mm;
1216 	if (!mm)
1217 		mm = &init_mm;
1218 	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
1219 	if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
1220 		ret = true;
1221 	rcu_read_unlock();
1222 	return ret;
1223 }
1224 
1225 static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
1226 {
1227 	mem->last_oom_jiffies = jiffies;
1228 	return 0;
1229 }
1230 
1231 static void record_last_oom(struct mem_cgroup *mem)
1232 {
1233 	mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
1234 }
1235 
1236 /*
1237  * Currently used to update mapped file statistics, but the routine can be
1238  * generalized to update other statistics as well.
1239  */
1240 void mem_cgroup_update_file_mapped(struct page *page, int val)
1241 {
1242 	struct mem_cgroup *mem;
1243 	struct mem_cgroup_stat *stat;
1244 	struct mem_cgroup_stat_cpu *cpustat;
1245 	int cpu;
1246 	struct page_cgroup *pc;
1247 
1248 	pc = lookup_page_cgroup(page);
1249 	if (unlikely(!pc))
1250 		return;
1251 
1252 	lock_page_cgroup(pc);
1253 	mem = pc->mem_cgroup;
1254 	if (!mem)
1255 		goto done;
1256 
1257 	if (!PageCgroupUsed(pc))
1258 		goto done;
1259 
1260 	/*
1261 	 * Preemption is already disabled, we don't need get_cpu()
1262 	 */
1263 	cpu = smp_processor_id();
1264 	stat = &mem->stat;
1265 	cpustat = &stat->cpustat[cpu];
1266 
1267 	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
1268 done:
1269 	unlock_page_cgroup(pc);
1270 }
1271 
1272 /*
1273  * size of first charge trial. "32" comes from vmscan.c's magic value.
1274  * TODO: maybe necessary to use big numbers in big irons.
1275  */
1276 #define CHARGE_SIZE	(32 * PAGE_SIZE)
1277 struct memcg_stock_pcp {
1278 	struct mem_cgroup *cached; /* this never be root cgroup */
1279 	int charge;
1280 	struct work_struct work;
1281 };
1282 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1283 static atomic_t memcg_drain_count;
1284 
1285 /*
1286  * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed
1287  * from local stock and true is returned. If the stock is 0 or charges from a
1288  * cgroup which is not current target, returns false. This stock will be
1289  * refilled.
1290  */
1291 static bool consume_stock(struct mem_cgroup *mem)
1292 {
1293 	struct memcg_stock_pcp *stock;
1294 	bool ret = true;
1295 
1296 	stock = &get_cpu_var(memcg_stock);
1297 	if (mem == stock->cached && stock->charge)
1298 		stock->charge -= PAGE_SIZE;
1299 	else /* need to call res_counter_charge */
1300 		ret = false;
1301 	put_cpu_var(memcg_stock);
1302 	return ret;
1303 }
1304 
1305 /*
1306  * Returns stocks cached in percpu to res_counter and reset cached information.
1307  */
1308 static void drain_stock(struct memcg_stock_pcp *stock)
1309 {
1310 	struct mem_cgroup *old = stock->cached;
1311 
1312 	if (stock->charge) {
1313 		res_counter_uncharge(&old->res, stock->charge);
1314 		if (do_swap_account)
1315 			res_counter_uncharge(&old->memsw, stock->charge);
1316 	}
1317 	stock->cached = NULL;
1318 	stock->charge = 0;
1319 }
1320 
1321 /*
1322  * This must be called under preempt disabled or must be called by
1323  * a thread which is pinned to local cpu.
1324  */
1325 static void drain_local_stock(struct work_struct *dummy)
1326 {
1327 	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
1328 	drain_stock(stock);
1329 }
1330 
1331 /*
1332  * Cache charges(val) which is from res_counter, to local per_cpu area.
1333  * This will be consumed by consumt_stock() function, later.
1334  */
1335 static void refill_stock(struct mem_cgroup *mem, int val)
1336 {
1337 	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
1338 
1339 	if (stock->cached != mem) { /* reset if necessary */
1340 		drain_stock(stock);
1341 		stock->cached = mem;
1342 	}
1343 	stock->charge += val;
1344 	put_cpu_var(memcg_stock);
1345 }
1346 
1347 /*
1348  * Tries to drain stocked charges in other cpus. This function is asynchronous
1349  * and just put a work per cpu for draining localy on each cpu. Caller can
1350  * expects some charges will be back to res_counter later but cannot wait for
1351  * it.
1352  */
1353 static void drain_all_stock_async(void)
1354 {
1355 	int cpu;
1356 	/* This function is for scheduling "drain" in asynchronous way.
1357 	 * The result of "drain" is not directly handled by callers. Then,
1358 	 * if someone is calling drain, we don't have to call drain more.
1359 	 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
1360 	 * there is a race. We just do loose check here.
1361 	 */
1362 	if (atomic_read(&memcg_drain_count))
1363 		return;
1364 	/* Notify other cpus that system-wide "drain" is running */
1365 	atomic_inc(&memcg_drain_count);
1366 	get_online_cpus();
1367 	for_each_online_cpu(cpu) {
1368 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1369 		schedule_work_on(cpu, &stock->work);
1370 	}
1371  	put_online_cpus();
1372 	atomic_dec(&memcg_drain_count);
1373 	/* We don't wait for flush_work */
1374 }
1375 
1376 /* This is a synchronous drain interface. */
1377 static void drain_all_stock_sync(void)
1378 {
1379 	/* called when force_empty is called */
1380 	atomic_inc(&memcg_drain_count);
1381 	schedule_on_each_cpu(drain_local_stock);
1382 	atomic_dec(&memcg_drain_count);
1383 }
1384 
1385 static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
1386 					unsigned long action,
1387 					void *hcpu)
1388 {
1389 	int cpu = (unsigned long)hcpu;
1390 	struct memcg_stock_pcp *stock;
1391 
1392 	if (action != CPU_DEAD)
1393 		return NOTIFY_OK;
1394 	stock = &per_cpu(memcg_stock, cpu);
1395 	drain_stock(stock);
1396 	return NOTIFY_OK;
1397 }
1398 
1399 /*
1400  * Unlike exported interface, "oom" parameter is added. if oom==true,
1401  * oom-killer can be invoked.
1402  */
1403 static int __mem_cgroup_try_charge(struct mm_struct *mm,
1404 			gfp_t gfp_mask, struct mem_cgroup **memcg,
1405 			bool oom, struct page *page)
1406 {
1407 	struct mem_cgroup *mem, *mem_over_limit;
1408 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1409 	struct res_counter *fail_res;
1410 	int csize = CHARGE_SIZE;
1411 
1412 	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
1413 		/* Don't account this! */
1414 		*memcg = NULL;
1415 		return 0;
1416 	}
1417 
1418 	/*
1419 	 * We always charge the cgroup the mm_struct belongs to.
1420 	 * The mm_struct's mem_cgroup changes on task migration if the
1421 	 * thread group leader migrates. It's possible that mm is not
1422 	 * set, if so charge the init_mm (happens for pagecache usage).
1423 	 */
1424 	mem = *memcg;
1425 	if (likely(!mem)) {
1426 		mem = try_get_mem_cgroup_from_mm(mm);
1427 		*memcg = mem;
1428 	} else {
1429 		css_get(&mem->css);
1430 	}
1431 	if (unlikely(!mem))
1432 		return 0;
1433 
1434 	VM_BUG_ON(css_is_removed(&mem->css));
1435 	if (mem_cgroup_is_root(mem))
1436 		goto done;
1437 
1438 	while (1) {
1439 		int ret = 0;
1440 		unsigned long flags = 0;
1441 
1442 		if (consume_stock(mem))
1443 			goto charged;
1444 
1445 		ret = res_counter_charge(&mem->res, csize, &fail_res);
1446 		if (likely(!ret)) {
1447 			if (!do_swap_account)
1448 				break;
1449 			ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1450 			if (likely(!ret))
1451 				break;
1452 			/* mem+swap counter fails */
1453 			res_counter_uncharge(&mem->res, csize);
1454 			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1455 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1456 									memsw);
1457 		} else
1458 			/* mem counter fails */
1459 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1460 									res);
1461 
1462 		/* reduce request size and retry */
1463 		if (csize > PAGE_SIZE) {
1464 			csize = PAGE_SIZE;
1465 			continue;
1466 		}
1467 		if (!(gfp_mask & __GFP_WAIT))
1468 			goto nomem;
1469 
1470 		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1471 						gfp_mask, flags);
1472 		if (ret)
1473 			continue;
1474 
1475 		/*
1476 		 * try_to_free_mem_cgroup_pages() might not give us a full
1477 		 * picture of reclaim. Some pages are reclaimed and might be
1478 		 * moved to swap cache or just unmapped from the cgroup.
1479 		 * Check the limit again to see if the reclaim reduced the
1480 		 * current usage of the cgroup before giving up
1481 		 *
1482 		 */
1483 		if (mem_cgroup_check_under_limit(mem_over_limit))
1484 			continue;
1485 
1486 		if (!nr_retries--) {
1487 			if (oom) {
1488 				mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
1489 				record_last_oom(mem_over_limit);
1490 			}
1491 			goto nomem;
1492 		}
1493 	}
1494 	if (csize > PAGE_SIZE)
1495 		refill_stock(mem, csize - PAGE_SIZE);
1496 charged:
1497 	/*
1498 	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
1499 	 * if they exceeds softlimit.
1500 	 */
1501 	if (mem_cgroup_soft_limit_check(mem))
1502 		mem_cgroup_update_tree(mem, page);
1503 done:
1504 	return 0;
1505 nomem:
1506 	css_put(&mem->css);
1507 	return -ENOMEM;
1508 }
1509 
1510 /*
1511  * Somemtimes we have to undo a charge we got by try_charge().
1512  * This function is for that and do uncharge, put css's refcnt.
1513  * gotten by try_charge().
1514  */
1515 static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
1516 {
1517 	if (!mem_cgroup_is_root(mem)) {
1518 		res_counter_uncharge(&mem->res, PAGE_SIZE);
1519 		if (do_swap_account)
1520 			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1521 	}
1522 	css_put(&mem->css);
1523 }
1524 
1525 /*
1526  * A helper function to get mem_cgroup from ID. must be called under
1527  * rcu_read_lock(). The caller must check css_is_removed() or some if
1528  * it's concern. (dropping refcnt from swap can be called against removed
1529  * memcg.)
1530  */
1531 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1532 {
1533 	struct cgroup_subsys_state *css;
1534 
1535 	/* ID 0 is unused ID */
1536 	if (!id)
1537 		return NULL;
1538 	css = css_lookup(&mem_cgroup_subsys, id);
1539 	if (!css)
1540 		return NULL;
1541 	return container_of(css, struct mem_cgroup, css);
1542 }
1543 
1544 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
1545 {
1546 	struct mem_cgroup *mem = NULL;
1547 	struct page_cgroup *pc;
1548 	unsigned short id;
1549 	swp_entry_t ent;
1550 
1551 	VM_BUG_ON(!PageLocked(page));
1552 
1553 	pc = lookup_page_cgroup(page);
1554 	lock_page_cgroup(pc);
1555 	if (PageCgroupUsed(pc)) {
1556 		mem = pc->mem_cgroup;
1557 		if (mem && !css_tryget(&mem->css))
1558 			mem = NULL;
1559 	} else if (PageSwapCache(page)) {
1560 		ent.val = page_private(page);
1561 		id = lookup_swap_cgroup(ent);
1562 		rcu_read_lock();
1563 		mem = mem_cgroup_lookup(id);
1564 		if (mem && !css_tryget(&mem->css))
1565 			mem = NULL;
1566 		rcu_read_unlock();
1567 	}
1568 	unlock_page_cgroup(pc);
1569 	return mem;
1570 }
1571 
1572 /*
1573  * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
1574  * USED state. If already USED, uncharge and return.
1575  */
1576 
1577 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1578 				     struct page_cgroup *pc,
1579 				     enum charge_type ctype)
1580 {
1581 	/* try_charge() can return NULL to *memcg, taking care of it. */
1582 	if (!mem)
1583 		return;
1584 
1585 	lock_page_cgroup(pc);
1586 	if (unlikely(PageCgroupUsed(pc))) {
1587 		unlock_page_cgroup(pc);
1588 		mem_cgroup_cancel_charge(mem);
1589 		return;
1590 	}
1591 
1592 	pc->mem_cgroup = mem;
1593 	/*
1594 	 * We access a page_cgroup asynchronously without lock_page_cgroup().
1595 	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
1596 	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
1597 	 * before USED bit, we need memory barrier here.
1598 	 * See mem_cgroup_add_lru_list(), etc.
1599  	 */
1600 	smp_wmb();
1601 	switch (ctype) {
1602 	case MEM_CGROUP_CHARGE_TYPE_CACHE:
1603 	case MEM_CGROUP_CHARGE_TYPE_SHMEM:
1604 		SetPageCgroupCache(pc);
1605 		SetPageCgroupUsed(pc);
1606 		break;
1607 	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1608 		ClearPageCgroupCache(pc);
1609 		SetPageCgroupUsed(pc);
1610 		break;
1611 	default:
1612 		break;
1613 	}
1614 
1615 	mem_cgroup_charge_statistics(mem, pc, true);
1616 
1617 	unlock_page_cgroup(pc);
1618 }
1619 
1620 /**
1621  * __mem_cgroup_move_account - move account of the page
1622  * @pc:	page_cgroup of the page.
1623  * @from: mem_cgroup which the page is moved from.
1624  * @to:	mem_cgroup which the page is moved to. @from != @to.
1625  *
1626  * The caller must confirm following.
1627  * - page is not on LRU (isolate_page() is useful.)
1628  * - the pc is locked, used, and ->mem_cgroup points to @from.
1629  *
1630  * This function does "uncharge" from old cgroup but doesn't do "charge" to
1631  * new cgroup. It should be done by a caller.
1632  */
1633 
1634 static void __mem_cgroup_move_account(struct page_cgroup *pc,
1635 	struct mem_cgroup *from, struct mem_cgroup *to)
1636 {
1637 	struct page *page;
1638 	int cpu;
1639 	struct mem_cgroup_stat *stat;
1640 	struct mem_cgroup_stat_cpu *cpustat;
1641 
1642 	VM_BUG_ON(from == to);
1643 	VM_BUG_ON(PageLRU(pc->page));
1644 	VM_BUG_ON(!PageCgroupLocked(pc));
1645 	VM_BUG_ON(!PageCgroupUsed(pc));
1646 	VM_BUG_ON(pc->mem_cgroup != from);
1647 
1648 	if (!mem_cgroup_is_root(from))
1649 		res_counter_uncharge(&from->res, PAGE_SIZE);
1650 	mem_cgroup_charge_statistics(from, pc, false);
1651 
1652 	page = pc->page;
1653 	if (page_mapped(page) && !PageAnon(page)) {
1654 		cpu = smp_processor_id();
1655 		/* Update mapped_file data for mem_cgroup "from" */
1656 		stat = &from->stat;
1657 		cpustat = &stat->cpustat[cpu];
1658 		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
1659 						-1);
1660 
1661 		/* Update mapped_file data for mem_cgroup "to" */
1662 		stat = &to->stat;
1663 		cpustat = &stat->cpustat[cpu];
1664 		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
1665 						1);
1666 	}
1667 
1668 	if (do_swap_account && !mem_cgroup_is_root(from))
1669 		res_counter_uncharge(&from->memsw, PAGE_SIZE);
1670 	css_put(&from->css);
1671 
1672 	css_get(&to->css);
1673 	pc->mem_cgroup = to;
1674 	mem_cgroup_charge_statistics(to, pc, true);
1675 	/*
1676 	 * We charges against "to" which may not have any tasks. Then, "to"
1677 	 * can be under rmdir(). But in current implementation, caller of
1678 	 * this function is just force_empty() and it's garanteed that
1679 	 * "to" is never removed. So, we don't check rmdir status here.
1680 	 */
1681 }
1682 
1683 /*
1684  * check whether the @pc is valid for moving account and call
1685  * __mem_cgroup_move_account()
1686  */
1687 static int mem_cgroup_move_account(struct page_cgroup *pc,
1688 				struct mem_cgroup *from, struct mem_cgroup *to)
1689 {
1690 	int ret = -EINVAL;
1691 	lock_page_cgroup(pc);
1692 	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
1693 		__mem_cgroup_move_account(pc, from, to);
1694 		ret = 0;
1695 	}
1696 	unlock_page_cgroup(pc);
1697 	return ret;
1698 }
1699 
1700 /*
1701  * move charges to its parent.
1702  */
1703 
1704 static int mem_cgroup_move_parent(struct page_cgroup *pc,
1705 				  struct mem_cgroup *child,
1706 				  gfp_t gfp_mask)
1707 {
1708 	struct page *page = pc->page;
1709 	struct cgroup *cg = child->css.cgroup;
1710 	struct cgroup *pcg = cg->parent;
1711 	struct mem_cgroup *parent;
1712 	int ret;
1713 
1714 	/* Is ROOT ? */
1715 	if (!pcg)
1716 		return -EINVAL;
1717 
1718 	ret = -EBUSY;
1719 	if (!get_page_unless_zero(page))
1720 		goto out;
1721 	if (isolate_lru_page(page))
1722 		goto put;
1723 
1724 	parent = mem_cgroup_from_cont(pcg);
1725 	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
1726 	if (ret || !parent)
1727 		goto put_back;
1728 
1729 	ret = mem_cgroup_move_account(pc, child, parent);
1730 	if (!ret)
1731 		css_put(&parent->css);	/* drop extra refcnt by try_charge() */
1732 	else
1733 		mem_cgroup_cancel_charge(parent);	/* does css_put */
1734 put_back:
1735 	putback_lru_page(page);
1736 put:
1737 	put_page(page);
1738 out:
1739 	return ret;
1740 }
1741 
1742 /*
1743  * Charge the memory controller for page usage.
1744  * Return
1745  * 0 if the charge was successful
1746  * < 0 if the cgroup is over its limit
1747  */
1748 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1749 				gfp_t gfp_mask, enum charge_type ctype,
1750 				struct mem_cgroup *memcg)
1751 {
1752 	struct mem_cgroup *mem;
1753 	struct page_cgroup *pc;
1754 	int ret;
1755 
1756 	pc = lookup_page_cgroup(page);
1757 	/* can happen at boot */
1758 	if (unlikely(!pc))
1759 		return 0;
1760 	prefetchw(pc);
1761 
1762 	mem = memcg;
1763 	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
1764 	if (ret || !mem)
1765 		return ret;
1766 
1767 	__mem_cgroup_commit_charge(mem, pc, ctype);
1768 	return 0;
1769 }
1770 
1771 int mem_cgroup_newpage_charge(struct page *page,
1772 			      struct mm_struct *mm, gfp_t gfp_mask)
1773 {
1774 	if (mem_cgroup_disabled())
1775 		return 0;
1776 	if (PageCompound(page))
1777 		return 0;
1778 	/*
1779 	 * If already mapped, we don't have to account.
1780 	 * If page cache, page->mapping has address_space.
1781 	 * But page->mapping may have out-of-use anon_vma pointer,
1782 	 * detecit it by PageAnon() check. newly-mapped-anon's page->mapping
1783 	 * is NULL.
1784   	 */
1785 	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
1786 		return 0;
1787 	if (unlikely(!mm))
1788 		mm = &init_mm;
1789 	return mem_cgroup_charge_common(page, mm, gfp_mask,
1790 				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
1791 }
1792 
1793 static void
1794 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1795 					enum charge_type ctype);
1796 
1797 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1798 				gfp_t gfp_mask)
1799 {
1800 	struct mem_cgroup *mem = NULL;
1801 	int ret;
1802 
1803 	if (mem_cgroup_disabled())
1804 		return 0;
1805 	if (PageCompound(page))
1806 		return 0;
1807 	/*
1808 	 * Corner case handling. This is called from add_to_page_cache()
1809 	 * in usual. But some FS (shmem) precharges this page before calling it
1810 	 * and call add_to_page_cache() with GFP_NOWAIT.
1811 	 *
1812 	 * For GFP_NOWAIT case, the page may be pre-charged before calling
1813 	 * add_to_page_cache(). (See shmem.c) check it here and avoid to call
1814 	 * charge twice. (It works but has to pay a bit larger cost.)
1815 	 * And when the page is SwapCache, it should take swap information
1816 	 * into account. This is under lock_page() now.
1817 	 */
1818 	if (!(gfp_mask & __GFP_WAIT)) {
1819 		struct page_cgroup *pc;
1820 
1821 
1822 		pc = lookup_page_cgroup(page);
1823 		if (!pc)
1824 			return 0;
1825 		lock_page_cgroup(pc);
1826 		if (PageCgroupUsed(pc)) {
1827 			unlock_page_cgroup(pc);
1828 			return 0;
1829 		}
1830 		unlock_page_cgroup(pc);
1831 	}
1832 
1833 	if (unlikely(!mm && !mem))
1834 		mm = &init_mm;
1835 
1836 	if (page_is_file_cache(page))
1837 		return mem_cgroup_charge_common(page, mm, gfp_mask,
1838 				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
1839 
1840 	/* shmem */
1841 	if (PageSwapCache(page)) {
1842 		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
1843 		if (!ret)
1844 			__mem_cgroup_commit_charge_swapin(page, mem,
1845 					MEM_CGROUP_CHARGE_TYPE_SHMEM);
1846 	} else
1847 		ret = mem_cgroup_charge_common(page, mm, gfp_mask,
1848 					MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
1849 
1850 	return ret;
1851 }
1852 
1853 /*
1854  * While swap-in, try_charge -> commit or cancel, the page is locked.
1855  * And when try_charge() successfully returns, one refcnt to memcg without
1856  * struct page_cgroup is acquired. This refcnt will be consumed by
1857  * "commit()" or removed by "cancel()"
1858  */
1859 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1860 				 struct page *page,
1861 				 gfp_t mask, struct mem_cgroup **ptr)
1862 {
1863 	struct mem_cgroup *mem;
1864 	int ret;
1865 
1866 	if (mem_cgroup_disabled())
1867 		return 0;
1868 
1869 	if (!do_swap_account)
1870 		goto charge_cur_mm;
1871 	/*
1872 	 * A racing thread's fault, or swapoff, may have already updated
1873 	 * the pte, and even removed page from swap cache: in those cases
1874 	 * do_swap_page()'s pte_same() test will fail; but there's also a
1875 	 * KSM case which does need to charge the page.
1876 	 */
1877 	if (!PageSwapCache(page))
1878 		goto charge_cur_mm;
1879 	mem = try_get_mem_cgroup_from_page(page);
1880 	if (!mem)
1881 		goto charge_cur_mm;
1882 	*ptr = mem;
1883 	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
1884 	/* drop extra refcnt from tryget */
1885 	css_put(&mem->css);
1886 	return ret;
1887 charge_cur_mm:
1888 	if (unlikely(!mm))
1889 		mm = &init_mm;
1890 	return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
1891 }
1892 
1893 static void
1894 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1895 					enum charge_type ctype)
1896 {
1897 	struct page_cgroup *pc;
1898 
1899 	if (mem_cgroup_disabled())
1900 		return;
1901 	if (!ptr)
1902 		return;
1903 	cgroup_exclude_rmdir(&ptr->css);
1904 	pc = lookup_page_cgroup(page);
1905 	mem_cgroup_lru_del_before_commit_swapcache(page);
1906 	__mem_cgroup_commit_charge(ptr, pc, ctype);
1907 	mem_cgroup_lru_add_after_commit_swapcache(page);
1908 	/*
1909 	 * Now swap is on-memory. This means this page may be
1910 	 * counted both as mem and swap....double count.
1911 	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
1912 	 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
1913 	 * may call delete_from_swap_cache() before reach here.
1914 	 */
1915 	if (do_swap_account && PageSwapCache(page)) {
1916 		swp_entry_t ent = {.val = page_private(page)};
1917 		unsigned short id;
1918 		struct mem_cgroup *memcg;
1919 
1920 		id = swap_cgroup_record(ent, 0);
1921 		rcu_read_lock();
1922 		memcg = mem_cgroup_lookup(id);
1923 		if (memcg) {
1924 			/*
1925 			 * This recorded memcg can be obsolete one. So, avoid
1926 			 * calling css_tryget
1927 			 */
1928 			if (!mem_cgroup_is_root(memcg))
1929 				res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1930 			mem_cgroup_swap_statistics(memcg, false);
1931 			mem_cgroup_put(memcg);
1932 		}
1933 		rcu_read_unlock();
1934 	}
1935 	/*
1936 	 * At swapin, we may charge account against cgroup which has no tasks.
1937 	 * So, rmdir()->pre_destroy() can be called while we do this charge.
1938 	 * In that case, we need to call pre_destroy() again. check it here.
1939 	 */
1940 	cgroup_release_and_wakeup_rmdir(&ptr->css);
1941 }
1942 
1943 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1944 {
1945 	__mem_cgroup_commit_charge_swapin(page, ptr,
1946 					MEM_CGROUP_CHARGE_TYPE_MAPPED);
1947 }
1948 
1949 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1950 {
1951 	if (mem_cgroup_disabled())
1952 		return;
1953 	if (!mem)
1954 		return;
1955 	mem_cgroup_cancel_charge(mem);
1956 }
1957 
1958 static void
1959 __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
1960 {
1961 	struct memcg_batch_info *batch = NULL;
1962 	bool uncharge_memsw = true;
1963 	/* If swapout, usage of swap doesn't decrease */
1964 	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1965 		uncharge_memsw = false;
1966 	/*
1967 	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
1968 	 * In those cases, all pages freed continously can be expected to be in
1969 	 * the same cgroup and we have chance to coalesce uncharges.
1970 	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
1971 	 * because we want to do uncharge as soon as possible.
1972 	 */
1973 	if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
1974 		goto direct_uncharge;
1975 
1976 	batch = &current->memcg_batch;
1977 	/*
1978 	 * In usual, we do css_get() when we remember memcg pointer.
1979 	 * But in this case, we keep res->usage until end of a series of
1980 	 * uncharges. Then, it's ok to ignore memcg's refcnt.
1981 	 */
1982 	if (!batch->memcg)
1983 		batch->memcg = mem;
1984 	/*
1985 	 * In typical case, batch->memcg == mem. This means we can
1986 	 * merge a series of uncharges to an uncharge of res_counter.
1987 	 * If not, we uncharge res_counter ony by one.
1988 	 */
1989 	if (batch->memcg != mem)
1990 		goto direct_uncharge;
1991 	/* remember freed charge and uncharge it later */
1992 	batch->bytes += PAGE_SIZE;
1993 	if (uncharge_memsw)
1994 		batch->memsw_bytes += PAGE_SIZE;
1995 	return;
1996 direct_uncharge:
1997 	res_counter_uncharge(&mem->res, PAGE_SIZE);
1998 	if (uncharge_memsw)
1999 		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
2000 	return;
2001 }
2002 
2003 /*
2004  * uncharge if !page_mapped(page)
2005  */
2006 static struct mem_cgroup *
2007 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2008 {
2009 	struct page_cgroup *pc;
2010 	struct mem_cgroup *mem = NULL;
2011 	struct mem_cgroup_per_zone *mz;
2012 
2013 	if (mem_cgroup_disabled())
2014 		return NULL;
2015 
2016 	if (PageSwapCache(page))
2017 		return NULL;
2018 
2019 	/*
2020 	 * Check if our page_cgroup is valid
2021 	 */
2022 	pc = lookup_page_cgroup(page);
2023 	if (unlikely(!pc || !PageCgroupUsed(pc)))
2024 		return NULL;
2025 
2026 	lock_page_cgroup(pc);
2027 
2028 	mem = pc->mem_cgroup;
2029 
2030 	if (!PageCgroupUsed(pc))
2031 		goto unlock_out;
2032 
2033 	switch (ctype) {
2034 	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2035 	case MEM_CGROUP_CHARGE_TYPE_DROP:
2036 		if (page_mapped(page))
2037 			goto unlock_out;
2038 		break;
2039 	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
2040 		if (!PageAnon(page)) {	/* Shared memory */
2041 			if (page->mapping && !page_is_file_cache(page))
2042 				goto unlock_out;
2043 		} else if (page_mapped(page)) /* Anon */
2044 				goto unlock_out;
2045 		break;
2046 	default:
2047 		break;
2048 	}
2049 
2050 	if (!mem_cgroup_is_root(mem))
2051 		__do_uncharge(mem, ctype);
2052 	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2053 		mem_cgroup_swap_statistics(mem, true);
2054 	mem_cgroup_charge_statistics(mem, pc, false);
2055 
2056 	ClearPageCgroupUsed(pc);
2057 	/*
2058 	 * pc->mem_cgroup is not cleared here. It will be accessed when it's
2059 	 * freed from LRU. This is safe because uncharged page is expected not
2060 	 * to be reused (freed soon). Exception is SwapCache, it's handled by
2061 	 * special functions.
2062 	 */
2063 
2064 	mz = page_cgroup_zoneinfo(pc);
2065 	unlock_page_cgroup(pc);
2066 
2067 	if (mem_cgroup_soft_limit_check(mem))
2068 		mem_cgroup_update_tree(mem, page);
2069 	/* at swapout, this memcg will be accessed to record to swap */
2070 	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2071 		css_put(&mem->css);
2072 
2073 	return mem;
2074 
2075 unlock_out:
2076 	unlock_page_cgroup(pc);
2077 	return NULL;
2078 }
2079 
2080 void mem_cgroup_uncharge_page(struct page *page)
2081 {
2082 	/* early check. */
2083 	if (page_mapped(page))
2084 		return;
2085 	if (page->mapping && !PageAnon(page))
2086 		return;
2087 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
2088 }
2089 
2090 void mem_cgroup_uncharge_cache_page(struct page *page)
2091 {
2092 	VM_BUG_ON(page_mapped(page));
2093 	VM_BUG_ON(page->mapping);
2094 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
2095 }
2096 
2097 /*
2098  * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
2099  * In that cases, pages are freed continuously and we can expect pages
2100  * are in the same memcg. All these calls itself limits the number of
2101  * pages freed at once, then uncharge_start/end() is called properly.
2102  * This may be called prural(2) times in a context,
2103  */
2104 
2105 void mem_cgroup_uncharge_start(void)
2106 {
2107 	current->memcg_batch.do_batch++;
2108 	/* We can do nest. */
2109 	if (current->memcg_batch.do_batch == 1) {
2110 		current->memcg_batch.memcg = NULL;
2111 		current->memcg_batch.bytes = 0;
2112 		current->memcg_batch.memsw_bytes = 0;
2113 	}
2114 }
2115 
2116 void mem_cgroup_uncharge_end(void)
2117 {
2118 	struct memcg_batch_info *batch = &current->memcg_batch;
2119 
2120 	if (!batch->do_batch)
2121 		return;
2122 
2123 	batch->do_batch--;
2124 	if (batch->do_batch) /* If stacked, do nothing. */
2125 		return;
2126 
2127 	if (!batch->memcg)
2128 		return;
2129 	/*
2130 	 * This "batch->memcg" is valid without any css_get/put etc...
2131 	 * bacause we hide charges behind us.
2132 	 */
2133 	if (batch->bytes)
2134 		res_counter_uncharge(&batch->memcg->res, batch->bytes);
2135 	if (batch->memsw_bytes)
2136 		res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
2137 	/* forget this pointer (for sanity check) */
2138 	batch->memcg = NULL;
2139 }
2140 
2141 #ifdef CONFIG_SWAP
2142 /*
2143  * called after __delete_from_swap_cache() and drop "page" account.
2144  * memcg information is recorded to swap_cgroup of "ent"
2145  */
2146 void
2147 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
2148 {
2149 	struct mem_cgroup *memcg;
2150 	int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
2151 
2152 	if (!swapout) /* this was a swap cache but the swap is unused ! */
2153 		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
2154 
2155 	memcg = __mem_cgroup_uncharge_common(page, ctype);
2156 
2157 	/* record memcg information */
2158 	if (do_swap_account && swapout && memcg) {
2159 		swap_cgroup_record(ent, css_id(&memcg->css));
2160 		mem_cgroup_get(memcg);
2161 	}
2162 	if (swapout && memcg)
2163 		css_put(&memcg->css);
2164 }
2165 #endif
2166 
2167 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2168 /*
2169  * called from swap_entry_free(). remove record in swap_cgroup and
2170  * uncharge "memsw" account.
2171  */
2172 void mem_cgroup_uncharge_swap(swp_entry_t ent)
2173 {
2174 	struct mem_cgroup *memcg;
2175 	unsigned short id;
2176 
2177 	if (!do_swap_account)
2178 		return;
2179 
2180 	id = swap_cgroup_record(ent, 0);
2181 	rcu_read_lock();
2182 	memcg = mem_cgroup_lookup(id);
2183 	if (memcg) {
2184 		/*
2185 		 * We uncharge this because swap is freed.
2186 		 * This memcg can be obsolete one. We avoid calling css_tryget
2187 		 */
2188 		if (!mem_cgroup_is_root(memcg))
2189 			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2190 		mem_cgroup_swap_statistics(memcg, false);
2191 		mem_cgroup_put(memcg);
2192 	}
2193 	rcu_read_unlock();
2194 }
2195 #endif
2196 
2197 /*
2198  * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
2199  * page belongs to.
2200  */
2201 int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
2202 {
2203 	struct page_cgroup *pc;
2204 	struct mem_cgroup *mem = NULL;
2205 	int ret = 0;
2206 
2207 	if (mem_cgroup_disabled())
2208 		return 0;
2209 
2210 	pc = lookup_page_cgroup(page);
2211 	lock_page_cgroup(pc);
2212 	if (PageCgroupUsed(pc)) {
2213 		mem = pc->mem_cgroup;
2214 		css_get(&mem->css);
2215 	}
2216 	unlock_page_cgroup(pc);
2217 
2218 	if (mem) {
2219 		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
2220 						page);
2221 		css_put(&mem->css);
2222 	}
2223 	*ptr = mem;
2224 	return ret;
2225 }
2226 
2227 /* remove redundant charge if migration failed*/
2228 void mem_cgroup_end_migration(struct mem_cgroup *mem,
2229 		struct page *oldpage, struct page *newpage)
2230 {
2231 	struct page *target, *unused;
2232 	struct page_cgroup *pc;
2233 	enum charge_type ctype;
2234 
2235 	if (!mem)
2236 		return;
2237 	cgroup_exclude_rmdir(&mem->css);
2238 	/* at migration success, oldpage->mapping is NULL. */
2239 	if (oldpage->mapping) {
2240 		target = oldpage;
2241 		unused = NULL;
2242 	} else {
2243 		target = newpage;
2244 		unused = oldpage;
2245 	}
2246 
2247 	if (PageAnon(target))
2248 		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
2249 	else if (page_is_file_cache(target))
2250 		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2251 	else
2252 		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2253 
2254 	/* unused page is not on radix-tree now. */
2255 	if (unused)
2256 		__mem_cgroup_uncharge_common(unused, ctype);
2257 
2258 	pc = lookup_page_cgroup(target);
2259 	/*
2260 	 * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup.
2261 	 * So, double-counting is effectively avoided.
2262 	 */
2263 	__mem_cgroup_commit_charge(mem, pc, ctype);
2264 
2265 	/*
2266 	 * Both of oldpage and newpage are still under lock_page().
2267 	 * Then, we don't have to care about race in radix-tree.
2268 	 * But we have to be careful that this page is unmapped or not.
2269 	 *
2270 	 * There is a case for !page_mapped(). At the start of
2271 	 * migration, oldpage was mapped. But now, it's zapped.
2272 	 * But we know *target* page is not freed/reused under us.
2273 	 * mem_cgroup_uncharge_page() does all necessary checks.
2274 	 */
2275 	if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
2276 		mem_cgroup_uncharge_page(target);
2277 	/*
2278 	 * At migration, we may charge account against cgroup which has no tasks
2279 	 * So, rmdir()->pre_destroy() can be called while we do this charge.
2280 	 * In that case, we need to call pre_destroy() again. check it here.
2281 	 */
2282 	cgroup_release_and_wakeup_rmdir(&mem->css);
2283 }
2284 
2285 /*
2286  * A call to try to shrink memory usage on charge failure at shmem's swapin.
2287  * Calling hierarchical_reclaim is not enough because we should update
2288  * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
2289  * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
2290  * not from the memcg which this page would be charged to.
2291  * try_charge_swapin does all of these works properly.
2292  */
2293 int mem_cgroup_shmem_charge_fallback(struct page *page,
2294 			    struct mm_struct *mm,
2295 			    gfp_t gfp_mask)
2296 {
2297 	struct mem_cgroup *mem = NULL;
2298 	int ret;
2299 
2300 	if (mem_cgroup_disabled())
2301 		return 0;
2302 
2303 	ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2304 	if (!ret)
2305 		mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
2306 
2307 	return ret;
2308 }
2309 
2310 static DEFINE_MUTEX(set_limit_mutex);
2311 
2312 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2313 				unsigned long long val)
2314 {
2315 	int retry_count;
2316 	u64 memswlimit;
2317 	int ret = 0;
2318 	int children = mem_cgroup_count_children(memcg);
2319 	u64 curusage, oldusage;
2320 
2321 	/*
2322 	 * For keeping hierarchical_reclaim simple, how long we should retry
2323 	 * is depends on callers. We set our retry-count to be function
2324 	 * of # of children which we should visit in this loop.
2325 	 */
2326 	retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
2327 
2328 	oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2329 
2330 	while (retry_count) {
2331 		if (signal_pending(current)) {
2332 			ret = -EINTR;
2333 			break;
2334 		}
2335 		/*
2336 		 * Rather than hide all in some function, I do this in
2337 		 * open coded manner. You see what this really does.
2338 		 * We have to guarantee mem->res.limit < mem->memsw.limit.
2339 		 */
2340 		mutex_lock(&set_limit_mutex);
2341 		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2342 		if (memswlimit < val) {
2343 			ret = -EINVAL;
2344 			mutex_unlock(&set_limit_mutex);
2345 			break;
2346 		}
2347 		ret = res_counter_set_limit(&memcg->res, val);
2348 		if (!ret) {
2349 			if (memswlimit == val)
2350 				memcg->memsw_is_minimum = true;
2351 			else
2352 				memcg->memsw_is_minimum = false;
2353 		}
2354 		mutex_unlock(&set_limit_mutex);
2355 
2356 		if (!ret)
2357 			break;
2358 
2359 		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2360 						MEM_CGROUP_RECLAIM_SHRINK);
2361 		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2362 		/* Usage is reduced ? */
2363   		if (curusage >= oldusage)
2364 			retry_count--;
2365 		else
2366 			oldusage = curusage;
2367 	}
2368 
2369 	return ret;
2370 }
2371 
2372 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2373 					unsigned long long val)
2374 {
2375 	int retry_count;
2376 	u64 memlimit, oldusage, curusage;
2377 	int children = mem_cgroup_count_children(memcg);
2378 	int ret = -EBUSY;
2379 
2380 	/* see mem_cgroup_resize_res_limit */
2381  	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
2382 	oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2383 	while (retry_count) {
2384 		if (signal_pending(current)) {
2385 			ret = -EINTR;
2386 			break;
2387 		}
2388 		/*
2389 		 * Rather than hide all in some function, I do this in
2390 		 * open coded manner. You see what this really does.
2391 		 * We have to guarantee mem->res.limit < mem->memsw.limit.
2392 		 */
2393 		mutex_lock(&set_limit_mutex);
2394 		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
2395 		if (memlimit > val) {
2396 			ret = -EINVAL;
2397 			mutex_unlock(&set_limit_mutex);
2398 			break;
2399 		}
2400 		ret = res_counter_set_limit(&memcg->memsw, val);
2401 		if (!ret) {
2402 			if (memlimit == val)
2403 				memcg->memsw_is_minimum = true;
2404 			else
2405 				memcg->memsw_is_minimum = false;
2406 		}
2407 		mutex_unlock(&set_limit_mutex);
2408 
2409 		if (!ret)
2410 			break;
2411 
2412 		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2413 						MEM_CGROUP_RECLAIM_NOSWAP |
2414 						MEM_CGROUP_RECLAIM_SHRINK);
2415 		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2416 		/* Usage is reduced ? */
2417 		if (curusage >= oldusage)
2418 			retry_count--;
2419 		else
2420 			oldusage = curusage;
2421 	}
2422 	return ret;
2423 }
2424 
2425 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2426 						gfp_t gfp_mask, int nid,
2427 						int zid)
2428 {
2429 	unsigned long nr_reclaimed = 0;
2430 	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
2431 	unsigned long reclaimed;
2432 	int loop = 0;
2433 	struct mem_cgroup_tree_per_zone *mctz;
2434 	unsigned long long excess;
2435 
2436 	if (order > 0)
2437 		return 0;
2438 
2439 	mctz = soft_limit_tree_node_zone(nid, zid);
2440 	/*
2441 	 * This loop can run a while, specially if mem_cgroup's continuously
2442 	 * keep exceeding their soft limit and putting the system under
2443 	 * pressure
2444 	 */
2445 	do {
2446 		if (next_mz)
2447 			mz = next_mz;
2448 		else
2449 			mz = mem_cgroup_largest_soft_limit_node(mctz);
2450 		if (!mz)
2451 			break;
2452 
2453 		reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
2454 						gfp_mask,
2455 						MEM_CGROUP_RECLAIM_SOFT);
2456 		nr_reclaimed += reclaimed;
2457 		spin_lock(&mctz->lock);
2458 
2459 		/*
2460 		 * If we failed to reclaim anything from this memory cgroup
2461 		 * it is time to move on to the next cgroup
2462 		 */
2463 		next_mz = NULL;
2464 		if (!reclaimed) {
2465 			do {
2466 				/*
2467 				 * Loop until we find yet another one.
2468 				 *
2469 				 * By the time we get the soft_limit lock
2470 				 * again, someone might have aded the
2471 				 * group back on the RB tree. Iterate to
2472 				 * make sure we get a different mem.
2473 				 * mem_cgroup_largest_soft_limit_node returns
2474 				 * NULL if no other cgroup is present on
2475 				 * the tree
2476 				 */
2477 				next_mz =
2478 				__mem_cgroup_largest_soft_limit_node(mctz);
2479 				if (next_mz == mz) {
2480 					css_put(&next_mz->mem->css);
2481 					next_mz = NULL;
2482 				} else /* next_mz == NULL or other memcg */
2483 					break;
2484 			} while (1);
2485 		}
2486 		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
2487 		excess = res_counter_soft_limit_excess(&mz->mem->res);
2488 		/*
2489 		 * One school of thought says that we should not add
2490 		 * back the node to the tree if reclaim returns 0.
2491 		 * But our reclaim could return 0, simply because due
2492 		 * to priority we are exposing a smaller subset of
2493 		 * memory to reclaim from. Consider this as a longer
2494 		 * term TODO.
2495 		 */
2496 		/* If excess == 0, no tree ops */
2497 		__mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
2498 		spin_unlock(&mctz->lock);
2499 		css_put(&mz->mem->css);
2500 		loop++;
2501 		/*
2502 		 * Could not reclaim anything and there are no more
2503 		 * mem cgroups to try or we seem to be looping without
2504 		 * reclaiming anything.
2505 		 */
2506 		if (!nr_reclaimed &&
2507 			(next_mz == NULL ||
2508 			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2509 			break;
2510 	} while (!nr_reclaimed);
2511 	if (next_mz)
2512 		css_put(&next_mz->mem->css);
2513 	return nr_reclaimed;
2514 }
2515 
2516 /*
2517  * This routine traverse page_cgroup in given list and drop them all.
2518  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
2519  */
2520 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
2521 				int node, int zid, enum lru_list lru)
2522 {
2523 	struct zone *zone;
2524 	struct mem_cgroup_per_zone *mz;
2525 	struct page_cgroup *pc, *busy;
2526 	unsigned long flags, loop;
2527 	struct list_head *list;
2528 	int ret = 0;
2529 
2530 	zone = &NODE_DATA(node)->node_zones[zid];
2531 	mz = mem_cgroup_zoneinfo(mem, node, zid);
2532 	list = &mz->lists[lru];
2533 
2534 	loop = MEM_CGROUP_ZSTAT(mz, lru);
2535 	/* give some margin against EBUSY etc...*/
2536 	loop += 256;
2537 	busy = NULL;
2538 	while (loop--) {
2539 		ret = 0;
2540 		spin_lock_irqsave(&zone->lru_lock, flags);
2541 		if (list_empty(list)) {
2542 			spin_unlock_irqrestore(&zone->lru_lock, flags);
2543 			break;
2544 		}
2545 		pc = list_entry(list->prev, struct page_cgroup, lru);
2546 		if (busy == pc) {
2547 			list_move(&pc->lru, list);
2548 			busy = 0;
2549 			spin_unlock_irqrestore(&zone->lru_lock, flags);
2550 			continue;
2551 		}
2552 		spin_unlock_irqrestore(&zone->lru_lock, flags);
2553 
2554 		ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
2555 		if (ret == -ENOMEM)
2556 			break;
2557 
2558 		if (ret == -EBUSY || ret == -EINVAL) {
2559 			/* found lock contention or "pc" is obsolete. */
2560 			busy = pc;
2561 			cond_resched();
2562 		} else
2563 			busy = NULL;
2564 	}
2565 
2566 	if (!ret && !list_empty(list))
2567 		return -EBUSY;
2568 	return ret;
2569 }
2570 
2571 /*
2572  * make mem_cgroup's charge to be 0 if there is no task.
2573  * This enables deleting this mem_cgroup.
2574  */
2575 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
2576 {
2577 	int ret;
2578 	int node, zid, shrink;
2579 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2580 	struct cgroup *cgrp = mem->css.cgroup;
2581 
2582 	css_get(&mem->css);
2583 
2584 	shrink = 0;
2585 	/* should free all ? */
2586 	if (free_all)
2587 		goto try_to_free;
2588 move_account:
2589 	do {
2590 		ret = -EBUSY;
2591 		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
2592 			goto out;
2593 		ret = -EINTR;
2594 		if (signal_pending(current))
2595 			goto out;
2596 		/* This is for making all *used* pages to be on LRU. */
2597 		lru_add_drain_all();
2598 		drain_all_stock_sync();
2599 		ret = 0;
2600 		for_each_node_state(node, N_HIGH_MEMORY) {
2601 			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
2602 				enum lru_list l;
2603 				for_each_lru(l) {
2604 					ret = mem_cgroup_force_empty_list(mem,
2605 							node, zid, l);
2606 					if (ret)
2607 						break;
2608 				}
2609 			}
2610 			if (ret)
2611 				break;
2612 		}
2613 		/* it seems parent cgroup doesn't have enough mem */
2614 		if (ret == -ENOMEM)
2615 			goto try_to_free;
2616 		cond_resched();
2617 	/* "ret" should also be checked to ensure all lists are empty. */
2618 	} while (mem->res.usage > 0 || ret);
2619 out:
2620 	css_put(&mem->css);
2621 	return ret;
2622 
2623 try_to_free:
2624 	/* returns EBUSY if there is a task or if we come here twice. */
2625 	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
2626 		ret = -EBUSY;
2627 		goto out;
2628 	}
2629 	/* we call try-to-free pages for make this cgroup empty */
2630 	lru_add_drain_all();
2631 	/* try to free all pages in this cgroup */
2632 	shrink = 1;
2633 	while (nr_retries && mem->res.usage > 0) {
2634 		int progress;
2635 
2636 		if (signal_pending(current)) {
2637 			ret = -EINTR;
2638 			goto out;
2639 		}
2640 		progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
2641 						false, get_swappiness(mem));
2642 		if (!progress) {
2643 			nr_retries--;
2644 			/* maybe some writeback is necessary */
2645 			congestion_wait(BLK_RW_ASYNC, HZ/10);
2646 		}
2647 
2648 	}
2649 	lru_add_drain();
2650 	/* try move_account...there may be some *locked* pages. */
2651 	goto move_account;
2652 }
2653 
2654 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
2655 {
2656 	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
2657 }
2658 
2659 
2660 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
2661 {
2662 	return mem_cgroup_from_cont(cont)->use_hierarchy;
2663 }
2664 
2665 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
2666 					u64 val)
2667 {
2668 	int retval = 0;
2669 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2670 	struct cgroup *parent = cont->parent;
2671 	struct mem_cgroup *parent_mem = NULL;
2672 
2673 	if (parent)
2674 		parent_mem = mem_cgroup_from_cont(parent);
2675 
2676 	cgroup_lock();
2677 	/*
2678 	 * If parent's use_hierarchy is set, we can't make any modifications
2679 	 * in the child subtrees. If it is unset, then the change can
2680 	 * occur, provided the current cgroup has no children.
2681 	 *
2682 	 * For the root cgroup, parent_mem is NULL, we allow value to be
2683 	 * set if there are no children.
2684 	 */
2685 	if ((!parent_mem || !parent_mem->use_hierarchy) &&
2686 				(val == 1 || val == 0)) {
2687 		if (list_empty(&cont->children))
2688 			mem->use_hierarchy = val;
2689 		else
2690 			retval = -EBUSY;
2691 	} else
2692 		retval = -EINVAL;
2693 	cgroup_unlock();
2694 
2695 	return retval;
2696 }
2697 
2698 struct mem_cgroup_idx_data {
2699 	s64 val;
2700 	enum mem_cgroup_stat_index idx;
2701 };
2702 
2703 static int
2704 mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
2705 {
2706 	struct mem_cgroup_idx_data *d = data;
2707 	d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
2708 	return 0;
2709 }
2710 
2711 static void
2712 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
2713 				enum mem_cgroup_stat_index idx, s64 *val)
2714 {
2715 	struct mem_cgroup_idx_data d;
2716 	d.idx = idx;
2717 	d.val = 0;
2718 	mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
2719 	*val = d.val;
2720 }
2721 
2722 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
2723 {
2724 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2725 	u64 idx_val, val;
2726 	int type, name;
2727 
2728 	type = MEMFILE_TYPE(cft->private);
2729 	name = MEMFILE_ATTR(cft->private);
2730 	switch (type) {
2731 	case _MEM:
2732 		if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2733 			mem_cgroup_get_recursive_idx_stat(mem,
2734 				MEM_CGROUP_STAT_CACHE, &idx_val);
2735 			val = idx_val;
2736 			mem_cgroup_get_recursive_idx_stat(mem,
2737 				MEM_CGROUP_STAT_RSS, &idx_val);
2738 			val += idx_val;
2739 			val <<= PAGE_SHIFT;
2740 		} else
2741 			val = res_counter_read_u64(&mem->res, name);
2742 		break;
2743 	case _MEMSWAP:
2744 		if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2745 			mem_cgroup_get_recursive_idx_stat(mem,
2746 				MEM_CGROUP_STAT_CACHE, &idx_val);
2747 			val = idx_val;
2748 			mem_cgroup_get_recursive_idx_stat(mem,
2749 				MEM_CGROUP_STAT_RSS, &idx_val);
2750 			val += idx_val;
2751 			mem_cgroup_get_recursive_idx_stat(mem,
2752 				MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2753 			val += idx_val;
2754 			val <<= PAGE_SHIFT;
2755 		} else
2756 			val = res_counter_read_u64(&mem->memsw, name);
2757 		break;
2758 	default:
2759 		BUG();
2760 		break;
2761 	}
2762 	return val;
2763 }
2764 /*
2765  * The user of this function is...
2766  * RES_LIMIT.
2767  */
2768 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
2769 			    const char *buffer)
2770 {
2771 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
2772 	int type, name;
2773 	unsigned long long val;
2774 	int ret;
2775 
2776 	type = MEMFILE_TYPE(cft->private);
2777 	name = MEMFILE_ATTR(cft->private);
2778 	switch (name) {
2779 	case RES_LIMIT:
2780 		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
2781 			ret = -EINVAL;
2782 			break;
2783 		}
2784 		/* This function does all necessary parse...reuse it */
2785 		ret = res_counter_memparse_write_strategy(buffer, &val);
2786 		if (ret)
2787 			break;
2788 		if (type == _MEM)
2789 			ret = mem_cgroup_resize_limit(memcg, val);
2790 		else
2791 			ret = mem_cgroup_resize_memsw_limit(memcg, val);
2792 		break;
2793 	case RES_SOFT_LIMIT:
2794 		ret = res_counter_memparse_write_strategy(buffer, &val);
2795 		if (ret)
2796 			break;
2797 		/*
2798 		 * For memsw, soft limits are hard to implement in terms
2799 		 * of semantics, for now, we support soft limits for
2800 		 * control without swap
2801 		 */
2802 		if (type == _MEM)
2803 			ret = res_counter_set_soft_limit(&memcg->res, val);
2804 		else
2805 			ret = -EINVAL;
2806 		break;
2807 	default:
2808 		ret = -EINVAL; /* should be BUG() ? */
2809 		break;
2810 	}
2811 	return ret;
2812 }
2813 
2814 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
2815 		unsigned long long *mem_limit, unsigned long long *memsw_limit)
2816 {
2817 	struct cgroup *cgroup;
2818 	unsigned long long min_limit, min_memsw_limit, tmp;
2819 
2820 	min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
2821 	min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2822 	cgroup = memcg->css.cgroup;
2823 	if (!memcg->use_hierarchy)
2824 		goto out;
2825 
2826 	while (cgroup->parent) {
2827 		cgroup = cgroup->parent;
2828 		memcg = mem_cgroup_from_cont(cgroup);
2829 		if (!memcg->use_hierarchy)
2830 			break;
2831 		tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
2832 		min_limit = min(min_limit, tmp);
2833 		tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2834 		min_memsw_limit = min(min_memsw_limit, tmp);
2835 	}
2836 out:
2837 	*mem_limit = min_limit;
2838 	*memsw_limit = min_memsw_limit;
2839 	return;
2840 }
2841 
2842 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
2843 {
2844 	struct mem_cgroup *mem;
2845 	int type, name;
2846 
2847 	mem = mem_cgroup_from_cont(cont);
2848 	type = MEMFILE_TYPE(event);
2849 	name = MEMFILE_ATTR(event);
2850 	switch (name) {
2851 	case RES_MAX_USAGE:
2852 		if (type == _MEM)
2853 			res_counter_reset_max(&mem->res);
2854 		else
2855 			res_counter_reset_max(&mem->memsw);
2856 		break;
2857 	case RES_FAILCNT:
2858 		if (type == _MEM)
2859 			res_counter_reset_failcnt(&mem->res);
2860 		else
2861 			res_counter_reset_failcnt(&mem->memsw);
2862 		break;
2863 	}
2864 
2865 	return 0;
2866 }
2867 
2868 
2869 /* For read statistics */
2870 enum {
2871 	MCS_CACHE,
2872 	MCS_RSS,
2873 	MCS_FILE_MAPPED,
2874 	MCS_PGPGIN,
2875 	MCS_PGPGOUT,
2876 	MCS_SWAP,
2877 	MCS_INACTIVE_ANON,
2878 	MCS_ACTIVE_ANON,
2879 	MCS_INACTIVE_FILE,
2880 	MCS_ACTIVE_FILE,
2881 	MCS_UNEVICTABLE,
2882 	NR_MCS_STAT,
2883 };
2884 
2885 struct mcs_total_stat {
2886 	s64 stat[NR_MCS_STAT];
2887 };
2888 
2889 struct {
2890 	char *local_name;
2891 	char *total_name;
2892 } memcg_stat_strings[NR_MCS_STAT] = {
2893 	{"cache", "total_cache"},
2894 	{"rss", "total_rss"},
2895 	{"mapped_file", "total_mapped_file"},
2896 	{"pgpgin", "total_pgpgin"},
2897 	{"pgpgout", "total_pgpgout"},
2898 	{"swap", "total_swap"},
2899 	{"inactive_anon", "total_inactive_anon"},
2900 	{"active_anon", "total_active_anon"},
2901 	{"inactive_file", "total_inactive_file"},
2902 	{"active_file", "total_active_file"},
2903 	{"unevictable", "total_unevictable"}
2904 };
2905 
2906 
2907 static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2908 {
2909 	struct mcs_total_stat *s = data;
2910 	s64 val;
2911 
2912 	/* per cpu stat */
2913 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
2914 	s->stat[MCS_CACHE] += val * PAGE_SIZE;
2915 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
2916 	s->stat[MCS_RSS] += val * PAGE_SIZE;
2917 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED);
2918 	s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
2919 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
2920 	s->stat[MCS_PGPGIN] += val;
2921 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
2922 	s->stat[MCS_PGPGOUT] += val;
2923 	if (do_swap_account) {
2924 		val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
2925 		s->stat[MCS_SWAP] += val * PAGE_SIZE;
2926 	}
2927 
2928 	/* per zone stat */
2929 	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
2930 	s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
2931 	val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
2932 	s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
2933 	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
2934 	s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
2935 	val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
2936 	s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
2937 	val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
2938 	s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
2939 	return 0;
2940 }
2941 
2942 static void
2943 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
2944 {
2945 	mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
2946 }
2947 
2948 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
2949 				 struct cgroup_map_cb *cb)
2950 {
2951 	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
2952 	struct mcs_total_stat mystat;
2953 	int i;
2954 
2955 	memset(&mystat, 0, sizeof(mystat));
2956 	mem_cgroup_get_local_stat(mem_cont, &mystat);
2957 
2958 	for (i = 0; i < NR_MCS_STAT; i++) {
2959 		if (i == MCS_SWAP && !do_swap_account)
2960 			continue;
2961 		cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
2962 	}
2963 
2964 	/* Hierarchical information */
2965 	{
2966 		unsigned long long limit, memsw_limit;
2967 		memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
2968 		cb->fill(cb, "hierarchical_memory_limit", limit);
2969 		if (do_swap_account)
2970 			cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
2971 	}
2972 
2973 	memset(&mystat, 0, sizeof(mystat));
2974 	mem_cgroup_get_total_stat(mem_cont, &mystat);
2975 	for (i = 0; i < NR_MCS_STAT; i++) {
2976 		if (i == MCS_SWAP && !do_swap_account)
2977 			continue;
2978 		cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
2979 	}
2980 
2981 #ifdef CONFIG_DEBUG_VM
2982 	cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
2983 
2984 	{
2985 		int nid, zid;
2986 		struct mem_cgroup_per_zone *mz;
2987 		unsigned long recent_rotated[2] = {0, 0};
2988 		unsigned long recent_scanned[2] = {0, 0};
2989 
2990 		for_each_online_node(nid)
2991 			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
2992 				mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
2993 
2994 				recent_rotated[0] +=
2995 					mz->reclaim_stat.recent_rotated[0];
2996 				recent_rotated[1] +=
2997 					mz->reclaim_stat.recent_rotated[1];
2998 				recent_scanned[0] +=
2999 					mz->reclaim_stat.recent_scanned[0];
3000 				recent_scanned[1] +=
3001 					mz->reclaim_stat.recent_scanned[1];
3002 			}
3003 		cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
3004 		cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
3005 		cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
3006 		cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
3007 	}
3008 #endif
3009 
3010 	return 0;
3011 }
3012 
3013 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
3014 {
3015 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3016 
3017 	return get_swappiness(memcg);
3018 }
3019 
3020 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3021 				       u64 val)
3022 {
3023 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3024 	struct mem_cgroup *parent;
3025 
3026 	if (val > 100)
3027 		return -EINVAL;
3028 
3029 	if (cgrp->parent == NULL)
3030 		return -EINVAL;
3031 
3032 	parent = mem_cgroup_from_cont(cgrp->parent);
3033 
3034 	cgroup_lock();
3035 
3036 	/* If under hierarchy, only empty-root can set this value */
3037 	if ((parent->use_hierarchy) ||
3038 	    (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
3039 		cgroup_unlock();
3040 		return -EINVAL;
3041 	}
3042 
3043 	spin_lock(&memcg->reclaim_param_lock);
3044 	memcg->swappiness = val;
3045 	spin_unlock(&memcg->reclaim_param_lock);
3046 
3047 	cgroup_unlock();
3048 
3049 	return 0;
3050 }
3051 
3052 
3053 static struct cftype mem_cgroup_files[] = {
3054 	{
3055 		.name = "usage_in_bytes",
3056 		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
3057 		.read_u64 = mem_cgroup_read,
3058 	},
3059 	{
3060 		.name = "max_usage_in_bytes",
3061 		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
3062 		.trigger = mem_cgroup_reset,
3063 		.read_u64 = mem_cgroup_read,
3064 	},
3065 	{
3066 		.name = "limit_in_bytes",
3067 		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
3068 		.write_string = mem_cgroup_write,
3069 		.read_u64 = mem_cgroup_read,
3070 	},
3071 	{
3072 		.name = "soft_limit_in_bytes",
3073 		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
3074 		.write_string = mem_cgroup_write,
3075 		.read_u64 = mem_cgroup_read,
3076 	},
3077 	{
3078 		.name = "failcnt",
3079 		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
3080 		.trigger = mem_cgroup_reset,
3081 		.read_u64 = mem_cgroup_read,
3082 	},
3083 	{
3084 		.name = "stat",
3085 		.read_map = mem_control_stat_show,
3086 	},
3087 	{
3088 		.name = "force_empty",
3089 		.trigger = mem_cgroup_force_empty_write,
3090 	},
3091 	{
3092 		.name = "use_hierarchy",
3093 		.write_u64 = mem_cgroup_hierarchy_write,
3094 		.read_u64 = mem_cgroup_hierarchy_read,
3095 	},
3096 	{
3097 		.name = "swappiness",
3098 		.read_u64 = mem_cgroup_swappiness_read,
3099 		.write_u64 = mem_cgroup_swappiness_write,
3100 	},
3101 };
3102 
3103 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3104 static struct cftype memsw_cgroup_files[] = {
3105 	{
3106 		.name = "memsw.usage_in_bytes",
3107 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
3108 		.read_u64 = mem_cgroup_read,
3109 	},
3110 	{
3111 		.name = "memsw.max_usage_in_bytes",
3112 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
3113 		.trigger = mem_cgroup_reset,
3114 		.read_u64 = mem_cgroup_read,
3115 	},
3116 	{
3117 		.name = "memsw.limit_in_bytes",
3118 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
3119 		.write_string = mem_cgroup_write,
3120 		.read_u64 = mem_cgroup_read,
3121 	},
3122 	{
3123 		.name = "memsw.failcnt",
3124 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
3125 		.trigger = mem_cgroup_reset,
3126 		.read_u64 = mem_cgroup_read,
3127 	},
3128 };
3129 
3130 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
3131 {
3132 	if (!do_swap_account)
3133 		return 0;
3134 	return cgroup_add_files(cont, ss, memsw_cgroup_files,
3135 				ARRAY_SIZE(memsw_cgroup_files));
3136 };
3137 #else
3138 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
3139 {
3140 	return 0;
3141 }
3142 #endif
3143 
3144 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
3145 {
3146 	struct mem_cgroup_per_node *pn;
3147 	struct mem_cgroup_per_zone *mz;
3148 	enum lru_list l;
3149 	int zone, tmp = node;
3150 	/*
3151 	 * This routine is called against possible nodes.
3152 	 * But it's BUG to call kmalloc() against offline node.
3153 	 *
3154 	 * TODO: this routine can waste much memory for nodes which will
3155 	 *       never be onlined. It's better to use memory hotplug callback
3156 	 *       function.
3157 	 */
3158 	if (!node_state(node, N_NORMAL_MEMORY))
3159 		tmp = -1;
3160 	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
3161 	if (!pn)
3162 		return 1;
3163 
3164 	mem->info.nodeinfo[node] = pn;
3165 	memset(pn, 0, sizeof(*pn));
3166 
3167 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3168 		mz = &pn->zoneinfo[zone];
3169 		for_each_lru(l)
3170 			INIT_LIST_HEAD(&mz->lists[l]);
3171 		mz->usage_in_excess = 0;
3172 		mz->on_tree = false;
3173 		mz->mem = mem;
3174 	}
3175 	return 0;
3176 }
3177 
3178 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
3179 {
3180 	kfree(mem->info.nodeinfo[node]);
3181 }
3182 
3183 static int mem_cgroup_size(void)
3184 {
3185 	int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
3186 	return sizeof(struct mem_cgroup) + cpustat_size;
3187 }
3188 
3189 static struct mem_cgroup *mem_cgroup_alloc(void)
3190 {
3191 	struct mem_cgroup *mem;
3192 	int size = mem_cgroup_size();
3193 
3194 	if (size < PAGE_SIZE)
3195 		mem = kmalloc(size, GFP_KERNEL);
3196 	else
3197 		mem = vmalloc(size);
3198 
3199 	if (mem)
3200 		memset(mem, 0, size);
3201 	return mem;
3202 }
3203 
3204 /*
3205  * At destroying mem_cgroup, references from swap_cgroup can remain.
3206  * (scanning all at force_empty is too costly...)
3207  *
3208  * Instead of clearing all references at force_empty, we remember
3209  * the number of reference from swap_cgroup and free mem_cgroup when
3210  * it goes down to 0.
3211  *
3212  * Removal of cgroup itself succeeds regardless of refs from swap.
3213  */
3214 
3215 static void __mem_cgroup_free(struct mem_cgroup *mem)
3216 {
3217 	int node;
3218 
3219 	mem_cgroup_remove_from_trees(mem);
3220 	free_css_id(&mem_cgroup_subsys, &mem->css);
3221 
3222 	for_each_node_state(node, N_POSSIBLE)
3223 		free_mem_cgroup_per_zone_info(mem, node);
3224 
3225 	if (mem_cgroup_size() < PAGE_SIZE)
3226 		kfree(mem);
3227 	else
3228 		vfree(mem);
3229 }
3230 
3231 static void mem_cgroup_get(struct mem_cgroup *mem)
3232 {
3233 	atomic_inc(&mem->refcnt);
3234 }
3235 
3236 static void mem_cgroup_put(struct mem_cgroup *mem)
3237 {
3238 	if (atomic_dec_and_test(&mem->refcnt)) {
3239 		struct mem_cgroup *parent = parent_mem_cgroup(mem);
3240 		__mem_cgroup_free(mem);
3241 		if (parent)
3242 			mem_cgroup_put(parent);
3243 	}
3244 }
3245 
3246 /*
3247  * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
3248  */
3249 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
3250 {
3251 	if (!mem->res.parent)
3252 		return NULL;
3253 	return mem_cgroup_from_res_counter(mem->res.parent, res);
3254 }
3255 
3256 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3257 static void __init enable_swap_cgroup(void)
3258 {
3259 	if (!mem_cgroup_disabled() && really_do_swap_account)
3260 		do_swap_account = 1;
3261 }
3262 #else
3263 static void __init enable_swap_cgroup(void)
3264 {
3265 }
3266 #endif
3267 
3268 static int mem_cgroup_soft_limit_tree_init(void)
3269 {
3270 	struct mem_cgroup_tree_per_node *rtpn;
3271 	struct mem_cgroup_tree_per_zone *rtpz;
3272 	int tmp, node, zone;
3273 
3274 	for_each_node_state(node, N_POSSIBLE) {
3275 		tmp = node;
3276 		if (!node_state(node, N_NORMAL_MEMORY))
3277 			tmp = -1;
3278 		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
3279 		if (!rtpn)
3280 			return 1;
3281 
3282 		soft_limit_tree.rb_tree_per_node[node] = rtpn;
3283 
3284 		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3285 			rtpz = &rtpn->rb_tree_per_zone[zone];
3286 			rtpz->rb_root = RB_ROOT;
3287 			spin_lock_init(&rtpz->lock);
3288 		}
3289 	}
3290 	return 0;
3291 }
3292 
3293 static struct cgroup_subsys_state * __ref
3294 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
3295 {
3296 	struct mem_cgroup *mem, *parent;
3297 	long error = -ENOMEM;
3298 	int node;
3299 
3300 	mem = mem_cgroup_alloc();
3301 	if (!mem)
3302 		return ERR_PTR(error);
3303 
3304 	for_each_node_state(node, N_POSSIBLE)
3305 		if (alloc_mem_cgroup_per_zone_info(mem, node))
3306 			goto free_out;
3307 
3308 	/* root ? */
3309 	if (cont->parent == NULL) {
3310 		int cpu;
3311 		enable_swap_cgroup();
3312 		parent = NULL;
3313 		root_mem_cgroup = mem;
3314 		if (mem_cgroup_soft_limit_tree_init())
3315 			goto free_out;
3316 		for_each_possible_cpu(cpu) {
3317 			struct memcg_stock_pcp *stock =
3318 						&per_cpu(memcg_stock, cpu);
3319 			INIT_WORK(&stock->work, drain_local_stock);
3320 		}
3321 		hotcpu_notifier(memcg_stock_cpu_callback, 0);
3322 
3323 	} else {
3324 		parent = mem_cgroup_from_cont(cont->parent);
3325 		mem->use_hierarchy = parent->use_hierarchy;
3326 	}
3327 
3328 	if (parent && parent->use_hierarchy) {
3329 		res_counter_init(&mem->res, &parent->res);
3330 		res_counter_init(&mem->memsw, &parent->memsw);
3331 		/*
3332 		 * We increment refcnt of the parent to ensure that we can
3333 		 * safely access it on res_counter_charge/uncharge.
3334 		 * This refcnt will be decremented when freeing this
3335 		 * mem_cgroup(see mem_cgroup_put).
3336 		 */
3337 		mem_cgroup_get(parent);
3338 	} else {
3339 		res_counter_init(&mem->res, NULL);
3340 		res_counter_init(&mem->memsw, NULL);
3341 	}
3342 	mem->last_scanned_child = 0;
3343 	spin_lock_init(&mem->reclaim_param_lock);
3344 
3345 	if (parent)
3346 		mem->swappiness = get_swappiness(parent);
3347 	atomic_set(&mem->refcnt, 1);
3348 	return &mem->css;
3349 free_out:
3350 	__mem_cgroup_free(mem);
3351 	root_mem_cgroup = NULL;
3352 	return ERR_PTR(error);
3353 }
3354 
3355 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
3356 					struct cgroup *cont)
3357 {
3358 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3359 
3360 	return mem_cgroup_force_empty(mem, false);
3361 }
3362 
3363 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
3364 				struct cgroup *cont)
3365 {
3366 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3367 
3368 	mem_cgroup_put(mem);
3369 }
3370 
3371 static int mem_cgroup_populate(struct cgroup_subsys *ss,
3372 				struct cgroup *cont)
3373 {
3374 	int ret;
3375 
3376 	ret = cgroup_add_files(cont, ss, mem_cgroup_files,
3377 				ARRAY_SIZE(mem_cgroup_files));
3378 
3379 	if (!ret)
3380 		ret = register_memsw_files(cont, ss);
3381 	return ret;
3382 }
3383 
3384 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
3385 				struct cgroup *cont,
3386 				struct cgroup *old_cont,
3387 				struct task_struct *p,
3388 				bool threadgroup)
3389 {
3390 	/*
3391 	 * FIXME: It's better to move charges of this process from old
3392 	 * memcg to new memcg. But it's just on TODO-List now.
3393 	 */
3394 }
3395 
3396 struct cgroup_subsys mem_cgroup_subsys = {
3397 	.name = "memory",
3398 	.subsys_id = mem_cgroup_subsys_id,
3399 	.create = mem_cgroup_create,
3400 	.pre_destroy = mem_cgroup_pre_destroy,
3401 	.destroy = mem_cgroup_destroy,
3402 	.populate = mem_cgroup_populate,
3403 	.attach = mem_cgroup_move_task,
3404 	.early_init = 0,
3405 	.use_id = 1,
3406 };
3407 
3408 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3409 
3410 static int __init disable_swap_account(char *s)
3411 {
3412 	really_do_swap_account = 0;
3413 	return 1;
3414 }
3415 __setup("noswapaccount", disable_swap_account);
3416 #endif
3417