xref: /openbmc/linux/mm/memcontrol.c (revision 5d4a2e29)
1 /* memcontrol.c - Memory Controller
2  *
3  * Copyright IBM Corporation, 2007
4  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5  *
6  * Copyright 2007 OpenVZ SWsoft Inc
7  * Author: Pavel Emelianov <xemul@openvz.org>
8  *
9  * Memory thresholds
10  * Copyright (C) 2009 Nokia Corporation
11  * Author: Kirill A. Shutemov
12  *
13  * This program is free software; you can redistribute it and/or modify
14  * it under the terms of the GNU General Public License as published by
15  * the Free Software Foundation; either version 2 of the License, or
16  * (at your option) any later version.
17  *
18  * This program is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21  * GNU General Public License for more details.
22  */
23 
24 #include <linux/res_counter.h>
25 #include <linux/memcontrol.h>
26 #include <linux/cgroup.h>
27 #include <linux/mm.h>
28 #include <linux/hugetlb.h>
29 #include <linux/pagemap.h>
30 #include <linux/smp.h>
31 #include <linux/page-flags.h>
32 #include <linux/backing-dev.h>
33 #include <linux/bit_spinlock.h>
34 #include <linux/rcupdate.h>
35 #include <linux/limits.h>
36 #include <linux/mutex.h>
37 #include <linux/rbtree.h>
38 #include <linux/slab.h>
39 #include <linux/swap.h>
40 #include <linux/swapops.h>
41 #include <linux/spinlock.h>
42 #include <linux/eventfd.h>
43 #include <linux/sort.h>
44 #include <linux/fs.h>
45 #include <linux/seq_file.h>
46 #include <linux/vmalloc.h>
47 #include <linux/mm_inline.h>
48 #include <linux/page_cgroup.h>
49 #include <linux/cpu.h>
50 #include "internal.h"
51 
52 #include <asm/uaccess.h>
53 
54 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
55 #define MEM_CGROUP_RECLAIM_RETRIES	5
56 struct mem_cgroup *root_mem_cgroup __read_mostly;
57 
58 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
59 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
60 int do_swap_account __read_mostly;
61 static int really_do_swap_account __initdata = 1; /* to remember the boot option */
62 #else
63 #define do_swap_account		(0)
64 #endif
65 
66 /*
67  * Per memcg event counter is incremented at every pagein/pageout. This counter
68  * is used to trigger some periodic events. This is straightforward and better
69  * than using jiffies etc. to handle periodic memcg events.
70  *
71  * These values will be used as !((event) & ((1 << (thresh)) - 1))
72  */
73 #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
74 #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
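/*
 * A worked example of the check above: with THRESHOLDS_EVENTS_THRESH == 7,
 * !((event) & ((1 << 7) - 1)) is true only when the low 7 bits of the event
 * counter are zero, i.e. once every 128 pagein/pageout events:
 *
 *	!(127 & 127) -> false, !(128 & 127) -> true, !(129 & 127) -> false
 *
 * Likewise, SOFTLIMIT_EVENTS_THRESH == 10 fires once every 1024 events.
 */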
75 
76 /*
77  * Statistics for memory cgroup.
78  */
79 enum mem_cgroup_stat_index {
80 	/*
81 	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
82 	 */
83 	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
84 	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
85 	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
86 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
87 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
88 	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
89 	MEM_CGROUP_EVENTS,	/* incremented at every pagein/pageout */
90 
91 	MEM_CGROUP_STAT_NSTATS,
92 };
93 
94 struct mem_cgroup_stat_cpu {
95 	s64 count[MEM_CGROUP_STAT_NSTATS];
96 };
97 
98 /*
99  * per-zone information in memory controller.
100  */
101 struct mem_cgroup_per_zone {
102 	/*
103 	 * spin_lock to protect the per cgroup LRU
104 	 */
105 	struct list_head	lists[NR_LRU_LISTS];
106 	unsigned long		count[NR_LRU_LISTS];
107 
108 	struct zone_reclaim_stat reclaim_stat;
109 	struct rb_node		tree_node;	/* RB tree node */
110 	unsigned long long	usage_in_excess;/* Set to the value by which */
111 						/* the soft limit is exceeded*/
112 	bool			on_tree;
113 	struct mem_cgroup	*mem;		/* Back pointer, we cannot */
114 						/* use container_of	   */
115 };
116 /* Macro for accessing counter */
117 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
118 
119 struct mem_cgroup_per_node {
120 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
121 };
122 
123 struct mem_cgroup_lru_info {
124 	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
125 };
126 
127 /*
128  * Cgroups above their limits are maintained in a RB-Tree, independent of
129  * their hierarchy representation
130  */
131 
132 struct mem_cgroup_tree_per_zone {
133 	struct rb_root rb_root;
134 	spinlock_t lock;
135 };
136 
137 struct mem_cgroup_tree_per_node {
138 	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
139 };
140 
141 struct mem_cgroup_tree {
142 	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
143 };
144 
145 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
146 
147 struct mem_cgroup_threshold {
148 	struct eventfd_ctx *eventfd;
149 	u64 threshold;
150 };
151 
152 /* For threshold */
153 struct mem_cgroup_threshold_ary {
154 	/* An array index points to threshold just below usage. */
155 	int current_threshold;
156 	/* Size of entries[] */
157 	unsigned int size;
158 	/* Array of thresholds */
159 	struct mem_cgroup_threshold entries[0];
160 };
161 
162 struct mem_cgroup_thresholds {
163 	/* Primary thresholds array */
164 	struct mem_cgroup_threshold_ary *primary;
165 	/*
166 	 * Spare threshold array.
167 	 * This is needed to make mem_cgroup_unregister_event() "never fail".
168 	 * It must be able to store at least primary->size - 1 entries.
169 	 */
170 	struct mem_cgroup_threshold_ary *spare;
171 };
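/*
 * A rough sizing sketch (the threshold registration code appears later in
 * this file; the exact call is not shown here): thanks to the entries[0]
 * flexible array member, an array holding "size" thresholds can be
 * allocated in one chunk, e.g.
 *
 *	new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
 *		      GFP_KERNEL);
 *
 * The spare array needs at most primary->size - 1 entries because
 * unregistering an event removes at least one threshold.
 */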
172 
173 /* for OOM */
174 struct mem_cgroup_eventfd_list {
175 	struct list_head list;
176 	struct eventfd_ctx *eventfd;
177 };
178 
179 static void mem_cgroup_threshold(struct mem_cgroup *mem);
180 static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
181 
182 /*
183  * The memory controller data structure. The memory controller controls both
184  * page cache and RSS per cgroup. We would eventually like to provide
185  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
186  * to help the administrator determine what knobs to tune.
187  *
188  * TODO: Add a water mark for the memory controller. Reclaim will begin when
189  * we hit the water mark. Maybe even add a low water mark, such that
190  * no reclaim occurs from a cgroup at its low water mark; this is
191  * a feature that will be implemented much later.
192  */
193 struct mem_cgroup {
194 	struct cgroup_subsys_state css;
195 	/*
196 	 * the counter to account for memory usage
197 	 */
198 	struct res_counter res;
199 	/*
200 	 * the counter to account for mem+swap usage.
201 	 */
202 	struct res_counter memsw;
203 	/*
204 	 * Per cgroup active and inactive list, similar to the
205 	 * per zone LRU lists.
206 	 */
207 	struct mem_cgroup_lru_info info;
208 
209 	/*
210 	 * Protects reclaim-related members.
211 	 */
212 	spinlock_t reclaim_param_lock;
213 
214 	int	prev_priority;	/* for recording reclaim priority */
215 
216 	/*
217 	 * While reclaiming in a hierarchy, we cache the last child we
218 	 * reclaimed from.
219 	 */
220 	int last_scanned_child;
221 	/*
222 	 * Should the accounting and control be hierarchical, per subtree?
223 	 */
224 	bool use_hierarchy;
225 	atomic_t	oom_lock;
226 	atomic_t	refcnt;
227 
228 	unsigned int	swappiness;
229 	/* OOM-Killer disable */
230 	int		oom_kill_disable;
231 
232 	/* set when res.limit == memsw.limit */
233 	bool		memsw_is_minimum;
234 
235 	/* protect arrays of thresholds */
236 	struct mutex thresholds_lock;
237 
238 	/* thresholds for memory usage. RCU-protected */
239 	struct mem_cgroup_thresholds thresholds;
240 
241 	/* thresholds for mem+swap usage. RCU-protected */
242 	struct mem_cgroup_thresholds memsw_thresholds;
243 
244 	/* For oom notifier event fd */
245 	struct list_head oom_notify;
246 
247 	/*
248 	 * Should we move charges of a task when a task is moved into this
249 	 * mem_cgroup? And what type of charges should we move?
250 	 */
251 	unsigned long 	move_charge_at_immigrate;
252 	/*
253 	 * percpu counter.
254 	 */
255 	struct mem_cgroup_stat_cpu *stat;
256 };
257 
258 /* Stuff for moving charges at task migration. */
259 /*
260  * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
261  * left-shifted bitmap of these types.
262  */
263 enum move_type {
264 	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
265 	MOVE_CHARGE_TYPE_FILE,	/* file page(including tmpfs) and swap of it */
266 	NR_MOVE_TYPE,
267 };
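/*
 * A small usage sketch: the cgroup file "memory.move_charge_at_immigrate"
 * stores this bitmap, so writing 1 enables moving of anonymous pages,
 * 2 enables file pages, and 3 enables both. A bit is then tested with e.g.
 *
 *	test_bit(MOVE_CHARGE_TYPE_ANON, &memcg->move_charge_at_immigrate);
 *
 * as the move_anon()/move_file() helpers below do against mc.to.
 */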
268 
269 /* "mc" and its members are protected by cgroup_mutex */
270 static struct move_charge_struct {
271 	struct mem_cgroup *from;
272 	struct mem_cgroup *to;
273 	unsigned long precharge;
274 	unsigned long moved_charge;
275 	unsigned long moved_swap;
276 	struct task_struct *moving_task;	/* a task moving charges */
277 	wait_queue_head_t waitq;		/* a waitq for other context */
278 } mc = {
279 	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
280 };
281 
282 static bool move_anon(void)
283 {
284 	return test_bit(MOVE_CHARGE_TYPE_ANON,
285 					&mc.to->move_charge_at_immigrate);
286 }
287 
288 static bool move_file(void)
289 {
290 	return test_bit(MOVE_CHARGE_TYPE_FILE,
291 					&mc.to->move_charge_at_immigrate);
292 }
293 
294 /*
295  * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
296  * limit reclaim to prevent infinite loops, if they ever occur.
297  */
298 #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
299 #define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)
300 
301 enum charge_type {
302 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
303 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
304 	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
305 	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
306 	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
307 	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
308 	NR_CHARGE_TYPE,
309 };
310 
311 /* only used here, for readability */
312 #define PCGF_CACHE	(1UL << PCG_CACHE)
313 #define PCGF_USED	(1UL << PCG_USED)
314 #define PCGF_LOCK	(1UL << PCG_LOCK)
315 /* Not used, but added here for completeness */
316 #define PCGF_ACCT	(1UL << PCG_ACCT)
317 
318 /* for encoding cft->private value on file */
319 #define _MEM			(0)
320 #define _MEMSWAP		(1)
321 #define _OOM_TYPE		(2)
322 #define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
323 #define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
324 #define MEMFILE_ATTR(val)	((val) & 0xffff)
325 /* Used for the OOM notifier */
326 #define OOM_CONTROL		(0)
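/*
 * Encoding example: a cftype whose ->private is MEMFILE_PRIVATE(_MEMSWAP,
 * RES_LIMIT) packs the counter type into the upper 16 bits and the
 * res_counter attribute (or OOM control id) into the lower 16 bits, so
 *
 *	MEMFILE_TYPE(cft->private) == _MEMSWAP
 *	MEMFILE_ATTR(cft->private) == RES_LIMIT
 */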
327 
328 /*
329  * Reclaim flags for mem_cgroup_hierarchical_reclaim
330  */
331 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
332 #define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
333 #define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
334 #define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
335 #define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
336 #define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
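/*
 * These flags are OR'ed into the reclaim_options argument of
 * mem_cgroup_hierarchical_reclaim(); an illustrative combination (the real
 * call sites come further down in this file) would be
 *
 *	mem_cgroup_hierarchical_reclaim(mem, NULL, GFP_KERNEL,
 *					MEM_CGROUP_RECLAIM_NOSWAP |
 *					MEM_CGROUP_RECLAIM_SHRINK);
 *
 * which asks for a swapless, limited shrink of the hierarchy.
 */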
337 
338 static void mem_cgroup_get(struct mem_cgroup *mem);
339 static void mem_cgroup_put(struct mem_cgroup *mem);
340 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
341 static void drain_all_stock_async(void);
342 
343 static struct mem_cgroup_per_zone *
344 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
345 {
346 	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
347 }
348 
349 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
350 {
351 	return &mem->css;
352 }
353 
354 static struct mem_cgroup_per_zone *
355 page_cgroup_zoneinfo(struct page_cgroup *pc)
356 {
357 	struct mem_cgroup *mem = pc->mem_cgroup;
358 	int nid = page_cgroup_nid(pc);
359 	int zid = page_cgroup_zid(pc);
360 
361 	if (!mem)
362 		return NULL;
363 
364 	return mem_cgroup_zoneinfo(mem, nid, zid);
365 }
366 
367 static struct mem_cgroup_tree_per_zone *
368 soft_limit_tree_node_zone(int nid, int zid)
369 {
370 	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
371 }
372 
373 static struct mem_cgroup_tree_per_zone *
374 soft_limit_tree_from_page(struct page *page)
375 {
376 	int nid = page_to_nid(page);
377 	int zid = page_zonenum(page);
378 
379 	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
380 }
381 
382 static void
383 __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
384 				struct mem_cgroup_per_zone *mz,
385 				struct mem_cgroup_tree_per_zone *mctz,
386 				unsigned long long new_usage_in_excess)
387 {
388 	struct rb_node **p = &mctz->rb_root.rb_node;
389 	struct rb_node *parent = NULL;
390 	struct mem_cgroup_per_zone *mz_node;
391 
392 	if (mz->on_tree)
393 		return;
394 
395 	mz->usage_in_excess = new_usage_in_excess;
396 	if (!mz->usage_in_excess)
397 		return;
398 	while (*p) {
399 		parent = *p;
400 		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
401 					tree_node);
402 		if (mz->usage_in_excess < mz_node->usage_in_excess)
403 			p = &(*p)->rb_left;
404 		/*
405 		 * Mem cgroups that are over their soft limit by the same
406 		 * amount can't be told apart here; ties go to the right.
407 		 */
408 		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
409 			p = &(*p)->rb_right;
410 	}
411 	rb_link_node(&mz->tree_node, parent, p);
412 	rb_insert_color(&mz->tree_node, &mctz->rb_root);
413 	mz->on_tree = true;
414 }
415 
416 static void
417 __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
418 				struct mem_cgroup_per_zone *mz,
419 				struct mem_cgroup_tree_per_zone *mctz)
420 {
421 	if (!mz->on_tree)
422 		return;
423 	rb_erase(&mz->tree_node, &mctz->rb_root);
424 	mz->on_tree = false;
425 }
426 
427 static void
428 mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
429 				struct mem_cgroup_per_zone *mz,
430 				struct mem_cgroup_tree_per_zone *mctz)
431 {
432 	spin_lock(&mctz->lock);
433 	__mem_cgroup_remove_exceeded(mem, mz, mctz);
434 	spin_unlock(&mctz->lock);
435 }
436 
437 
438 static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
439 {
440 	unsigned long long excess;
441 	struct mem_cgroup_per_zone *mz;
442 	struct mem_cgroup_tree_per_zone *mctz;
443 	int nid = page_to_nid(page);
444 	int zid = page_zonenum(page);
445 	mctz = soft_limit_tree_from_page(page);
446 
447 	/*
448 	 * Necessary to update all ancestors when hierarchy is used,
449 	 * because their event counters are not touched.
450 	 */
451 	for (; mem; mem = parent_mem_cgroup(mem)) {
452 		mz = mem_cgroup_zoneinfo(mem, nid, zid);
453 		excess = res_counter_soft_limit_excess(&mem->res);
454 		/*
455 		 * We have to update the tree if mz is on the RB-tree or
456 		 * mem is over its soft limit.
457 		 */
458 		if (excess || mz->on_tree) {
459 			spin_lock(&mctz->lock);
460 			/* if on-tree, remove it */
461 			if (mz->on_tree)
462 				__mem_cgroup_remove_exceeded(mem, mz, mctz);
463 			/*
464 			 * Insert again. mz->usage_in_excess will be updated.
465 			 * If excess is 0, no tree ops.
466 			 */
467 			__mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
468 			spin_unlock(&mctz->lock);
469 		}
470 	}
471 }
472 
473 static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
474 {
475 	int node, zone;
476 	struct mem_cgroup_per_zone *mz;
477 	struct mem_cgroup_tree_per_zone *mctz;
478 
479 	for_each_node_state(node, N_POSSIBLE) {
480 		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
481 			mz = mem_cgroup_zoneinfo(mem, node, zone);
482 			mctz = soft_limit_tree_node_zone(node, zone);
483 			mem_cgroup_remove_exceeded(mem, mz, mctz);
484 		}
485 	}
486 }
487 
488 static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
489 {
490 	return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
491 }
492 
493 static struct mem_cgroup_per_zone *
494 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
495 {
496 	struct rb_node *rightmost = NULL;
497 	struct mem_cgroup_per_zone *mz;
498 
499 retry:
500 	mz = NULL;
501 	rightmost = rb_last(&mctz->rb_root);
502 	if (!rightmost)
503 		goto done;		/* Nothing to reclaim from */
504 
505 	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
506 	/*
507 	 * Remove the node now, but someone else can add it back;
508 	 * we will add it back at the end of reclaim to its correct
509 	 * position in the tree.
510 	 */
511 	__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
512 	if (!res_counter_soft_limit_excess(&mz->mem->res) ||
513 		!css_tryget(&mz->mem->css))
514 		goto retry;
515 done:
516 	return mz;
517 }
518 
519 static struct mem_cgroup_per_zone *
520 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
521 {
522 	struct mem_cgroup_per_zone *mz;
523 
524 	spin_lock(&mctz->lock);
525 	mz = __mem_cgroup_largest_soft_limit_node(mctz);
526 	spin_unlock(&mctz->lock);
527 	return mz;
528 }
529 
530 static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
531 		enum mem_cgroup_stat_index idx)
532 {
533 	int cpu;
534 	s64 val = 0;
535 
536 	for_each_possible_cpu(cpu)
537 		val += per_cpu(mem->stat->count[idx], cpu);
538 	return val;
539 }
540 
541 static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
542 {
543 	s64 ret;
544 
545 	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
546 	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
547 	return ret;
548 }
549 
550 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
551 					 bool charge)
552 {
553 	int val = (charge) ? 1 : -1;
554 	this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
555 }
556 
557 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
558 					 struct page_cgroup *pc,
559 					 bool charge)
560 {
561 	int val = (charge) ? 1 : -1;
562 
563 	preempt_disable();
564 
565 	if (PageCgroupCache(pc))
566 		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
567 	else
568 		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
569 
570 	if (charge)
571 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
572 	else
573 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
574 	__this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
575 
576 	preempt_enable();
577 }
578 
579 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
580 					enum lru_list idx)
581 {
582 	int nid, zid;
583 	struct mem_cgroup_per_zone *mz;
584 	u64 total = 0;
585 
586 	for_each_online_node(nid)
587 		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
588 			mz = mem_cgroup_zoneinfo(mem, nid, zid);
589 			total += MEM_CGROUP_ZSTAT(mz, idx);
590 		}
591 	return total;
592 }
593 
594 static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
595 {
596 	s64 val;
597 
598 	val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);
599 
600 	return !(val & ((1 << event_mask_shift) - 1));
601 }
602 
603 /*
604  * Check events in order: the threshold check fires more often than the
605  * soft-limit tree update (see the *_EVENTS_THRESH values above).
606  */
607 static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
608 {
609 	/* threshold event is triggered in finer grain than soft limit */
610 	if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
611 		mem_cgroup_threshold(mem);
612 		if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
613 			mem_cgroup_update_tree(mem, page);
614 	}
615 }
616 
617 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
618 {
619 	return container_of(cgroup_subsys_state(cont,
620 				mem_cgroup_subsys_id), struct mem_cgroup,
621 				css);
622 }
623 
624 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
625 {
626 	/*
627 	 * mm_update_next_owner() may clear mm->owner to NULL
628 	 * if it races with swapoff, page migration, etc.
629 	 * So this can be called with p == NULL.
630 	 */
631 	if (unlikely(!p))
632 		return NULL;
633 
634 	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
635 				struct mem_cgroup, css);
636 }
637 
638 static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
639 {
640 	struct mem_cgroup *mem = NULL;
641 
642 	if (!mm)
643 		return NULL;
644 	/*
645 	 * Because we have no locks, mm->owner may be being moved to another
646 	 * cgroup. We use css_tryget() here even if this looks
647 	 * pessimistic (rather than adding locks here).
648 	 */
649 	rcu_read_lock();
650 	do {
651 		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
652 		if (unlikely(!mem))
653 			break;
654 	} while (!css_tryget(&mem->css));
655 	rcu_read_unlock();
656 	return mem;
657 }
658 
659 /*
660  * Call the callback function against all cgroups under the hierarchy tree.
661  */
662 static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
663 			  int (*func)(struct mem_cgroup *, void *))
664 {
665 	int found, ret, nextid;
666 	struct cgroup_subsys_state *css;
667 	struct mem_cgroup *mem;
668 
669 	if (!root->use_hierarchy)
670 		return (*func)(root, data);
671 
672 	nextid = 1;
673 	do {
674 		ret = 0;
675 		mem = NULL;
676 
677 		rcu_read_lock();
678 		css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
679 				   &found);
680 		if (css && css_tryget(css))
681 			mem = container_of(css, struct mem_cgroup, css);
682 		rcu_read_unlock();
683 
684 		if (mem) {
685 			ret = (*func)(mem, data);
686 			css_put(&mem->css);
687 		}
688 		nextid = found + 1;
689 	} while (!ret && css);
690 
691 	return ret;
692 }
693 
694 static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
695 {
696 	return (mem == root_mem_cgroup);
697 }
698 
699 /*
700  * The following LRU functions are allowed to be used without PCG_LOCK.
701  * Operations are called by the global LRU routines independently of memcg.
702  * What we have to take care of here is the validity of pc->mem_cgroup.
703  *
704  * Changes to pc->mem_cgroup happen when
705  * 1. charging
706  * 2. moving an account
707  * In the typical case, "charge" is done before add-to-LRU. The exception is
708  * SwapCache, which is added to the LRU before being charged.
709  * If the PCG_USED bit is not set, the page_cgroup is not added to this private LRU.
710  * When moving an account, the page is not on the LRU; it is isolated.
711  */
712 
713 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
714 {
715 	struct page_cgroup *pc;
716 	struct mem_cgroup_per_zone *mz;
717 
718 	if (mem_cgroup_disabled())
719 		return;
720 	pc = lookup_page_cgroup(page);
721 	/* can happen while we handle swapcache. */
722 	if (!TestClearPageCgroupAcctLRU(pc))
723 		return;
724 	VM_BUG_ON(!pc->mem_cgroup);
725 	/*
726 	 * We don't check PCG_USED bit. It's cleared when the "page" is finally
727 	 * removed from global LRU.
728 	 */
729 	mz = page_cgroup_zoneinfo(pc);
730 	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
731 	if (mem_cgroup_is_root(pc->mem_cgroup))
732 		return;
733 	VM_BUG_ON(list_empty(&pc->lru));
734 	list_del_init(&pc->lru);
735 	return;
736 }
737 
738 void mem_cgroup_del_lru(struct page *page)
739 {
740 	mem_cgroup_del_lru_list(page, page_lru(page));
741 }
742 
743 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
744 {
745 	struct mem_cgroup_per_zone *mz;
746 	struct page_cgroup *pc;
747 
748 	if (mem_cgroup_disabled())
749 		return;
750 
751 	pc = lookup_page_cgroup(page);
752 	/*
753 	 * The Used bit is set without atomic ops, but after smp_wmb().
754 	 * To make pc->mem_cgroup visible, insert smp_rmb() here.
755 	 */
756 	smp_rmb();
757 	/* unused or root page is not rotated. */
758 	if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
759 		return;
760 	mz = page_cgroup_zoneinfo(pc);
761 	list_move(&pc->lru, &mz->lists[lru]);
762 }
763 
764 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
765 {
766 	struct page_cgroup *pc;
767 	struct mem_cgroup_per_zone *mz;
768 
769 	if (mem_cgroup_disabled())
770 		return;
771 	pc = lookup_page_cgroup(page);
772 	VM_BUG_ON(PageCgroupAcctLRU(pc));
773 	/*
774 	 * The Used bit is set without atomic ops, but after smp_wmb().
775 	 * To make pc->mem_cgroup visible, insert smp_rmb() here.
776 	 */
777 	smp_rmb();
778 	if (!PageCgroupUsed(pc))
779 		return;
780 
781 	mz = page_cgroup_zoneinfo(pc);
782 	MEM_CGROUP_ZSTAT(mz, lru) += 1;
783 	SetPageCgroupAcctLRU(pc);
784 	if (mem_cgroup_is_root(pc->mem_cgroup))
785 		return;
786 	list_add(&pc->lru, &mz->lists[lru]);
787 }
788 
789 /*
790  * When handling SwapCache, pc->mem_cgroup may change while it's linked to the
791  * LRU because the page may be reused after it's fully uncharged (because of
792  * SwapCache behavior). To handle that, unlink the page_cgroup from the LRU when
793  * charging it again. This function is only used to charge SwapCache. It's done
794  * under lock_page() and expects that zone->lru_lock is never held.
795  */
796 static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
797 {
798 	unsigned long flags;
799 	struct zone *zone = page_zone(page);
800 	struct page_cgroup *pc = lookup_page_cgroup(page);
801 
802 	spin_lock_irqsave(&zone->lru_lock, flags);
803 	/*
804 	 * Forget old LRU when this page_cgroup is *not* used. This Used bit
805 	 * is guarded by lock_page() because the page is SwapCache.
806 	 */
807 	if (!PageCgroupUsed(pc))
808 		mem_cgroup_del_lru_list(page, page_lru(page));
809 	spin_unlock_irqrestore(&zone->lru_lock, flags);
810 }
811 
812 static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
813 {
814 	unsigned long flags;
815 	struct zone *zone = page_zone(page);
816 	struct page_cgroup *pc = lookup_page_cgroup(page);
817 
818 	spin_lock_irqsave(&zone->lru_lock, flags);
819 	/* link when the page is linked to LRU but page_cgroup isn't */
820 	if (PageLRU(page) && !PageCgroupAcctLRU(pc))
821 		mem_cgroup_add_lru_list(page, page_lru(page));
822 	spin_unlock_irqrestore(&zone->lru_lock, flags);
823 }
824 
825 
826 void mem_cgroup_move_lists(struct page *page,
827 			   enum lru_list from, enum lru_list to)
828 {
829 	if (mem_cgroup_disabled())
830 		return;
831 	mem_cgroup_del_lru_list(page, from);
832 	mem_cgroup_add_lru_list(page, to);
833 }
834 
835 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
836 {
837 	int ret;
838 	struct mem_cgroup *curr = NULL;
839 
840 	task_lock(task);
841 	rcu_read_lock();
842 	curr = try_get_mem_cgroup_from_mm(task->mm);
843 	rcu_read_unlock();
844 	task_unlock(task);
845 	if (!curr)
846 		return 0;
847 	/*
848 	 * We should check use_hierarchy of "mem", not "curr", because checking
849 	 * use_hierarchy of "curr" here would make this function return true if
850 	 * hierarchy is enabled in "curr" and "curr" is a child of "mem" in the
851 	 * *cgroup* hierarchy (even if use_hierarchy is disabled in "mem").
852 	 */
853 	if (mem->use_hierarchy)
854 		ret = css_is_ancestor(&curr->css, &mem->css);
855 	else
856 		ret = (curr == mem);
857 	css_put(&curr->css);
858 	return ret;
859 }
860 
861 /*
862  * prev_priority control... this will be used in the memory reclaim path.
863  */
864 int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
865 {
866 	int prev_priority;
867 
868 	spin_lock(&mem->reclaim_param_lock);
869 	prev_priority = mem->prev_priority;
870 	spin_unlock(&mem->reclaim_param_lock);
871 
872 	return prev_priority;
873 }
874 
875 void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
876 {
877 	spin_lock(&mem->reclaim_param_lock);
878 	if (priority < mem->prev_priority)
879 		mem->prev_priority = priority;
880 	spin_unlock(&mem->reclaim_param_lock);
881 }
882 
883 void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
884 {
885 	spin_lock(&mem->reclaim_param_lock);
886 	mem->prev_priority = priority;
887 	spin_unlock(&mem->reclaim_param_lock);
888 }
889 
890 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
891 {
892 	unsigned long active;
893 	unsigned long inactive;
894 	unsigned long gb;
895 	unsigned long inactive_ratio;
896 
897 	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
898 	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
899 
900 	gb = (inactive + active) >> (30 - PAGE_SHIFT);
901 	if (gb)
902 		inactive_ratio = int_sqrt(10 * gb);
903 	else
904 		inactive_ratio = 1;
905 
906 	if (present_pages) {
907 		present_pages[0] = inactive;
908 		present_pages[1] = active;
909 	}
910 
911 	return inactive_ratio;
912 }
913 
914 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
915 {
916 	unsigned long active;
917 	unsigned long inactive;
918 	unsigned long present_pages[2];
919 	unsigned long inactive_ratio;
920 
921 	inactive_ratio = calc_inactive_ratio(memcg, present_pages);
922 
923 	inactive = present_pages[0];
924 	active = present_pages[1];
925 
926 	if (inactive * inactive_ratio < active)
927 		return 1;
928 
929 	return 0;
930 }
931 
932 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
933 {
934 	unsigned long active;
935 	unsigned long inactive;
936 
937 	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
938 	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
939 
940 	return (active > inactive);
941 }
942 
943 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
944 				       struct zone *zone,
945 				       enum lru_list lru)
946 {
947 	int nid = zone->zone_pgdat->node_id;
948 	int zid = zone_idx(zone);
949 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
950 
951 	return MEM_CGROUP_ZSTAT(mz, lru);
952 }
953 
954 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
955 						      struct zone *zone)
956 {
957 	int nid = zone->zone_pgdat->node_id;
958 	int zid = zone_idx(zone);
959 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
960 
961 	return &mz->reclaim_stat;
962 }
963 
964 struct zone_reclaim_stat *
965 mem_cgroup_get_reclaim_stat_from_page(struct page *page)
966 {
967 	struct page_cgroup *pc;
968 	struct mem_cgroup_per_zone *mz;
969 
970 	if (mem_cgroup_disabled())
971 		return NULL;
972 
973 	pc = lookup_page_cgroup(page);
974 	/*
975 	 * The Used bit is set without atomic ops, but after smp_wmb().
976 	 * To make pc->mem_cgroup visible, insert smp_rmb() here.
977 	 */
978 	smp_rmb();
979 	if (!PageCgroupUsed(pc))
980 		return NULL;
981 
982 	mz = page_cgroup_zoneinfo(pc);
983 	if (!mz)
984 		return NULL;
985 
986 	return &mz->reclaim_stat;
987 }
988 
989 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
990 					struct list_head *dst,
991 					unsigned long *scanned, int order,
992 					int mode, struct zone *z,
993 					struct mem_cgroup *mem_cont,
994 					int active, int file)
995 {
996 	unsigned long nr_taken = 0;
997 	struct page *page;
998 	unsigned long scan;
999 	LIST_HEAD(pc_list);
1000 	struct list_head *src;
1001 	struct page_cgroup *pc, *tmp;
1002 	int nid = z->zone_pgdat->node_id;
1003 	int zid = zone_idx(z);
1004 	struct mem_cgroup_per_zone *mz;
1005 	int lru = LRU_FILE * file + active;
1006 	int ret;
1007 
1008 	BUG_ON(!mem_cont);
1009 	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
1010 	src = &mz->lists[lru];
1011 
1012 	scan = 0;
1013 	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
1014 		if (scan >= nr_to_scan)
1015 			break;
1016 
1017 		page = pc->page;
1018 		if (unlikely(!PageCgroupUsed(pc)))
1019 			continue;
1020 		if (unlikely(!PageLRU(page)))
1021 			continue;
1022 
1023 		scan++;
1024 		ret = __isolate_lru_page(page, mode, file);
1025 		switch (ret) {
1026 		case 0:
1027 			list_move(&page->lru, dst);
1028 			mem_cgroup_del_lru(page);
1029 			nr_taken++;
1030 			break;
1031 		case -EBUSY:
1032 			/* we don't affect global LRU but rotate in our LRU */
1033 			mem_cgroup_rotate_lru_list(page, page_lru(page));
1034 			break;
1035 		default:
1036 			break;
1037 		}
1038 	}
1039 
1040 	*scanned = scan;
1041 	return nr_taken;
1042 }
1043 
1044 #define mem_cgroup_from_res_counter(counter, member)	\
1045 	container_of(counter, struct mem_cgroup, member)
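/*
 * Usage example: when res_counter_charge() reports the counter that failed,
 * the owning memcg is recovered with container_of() via this macro, e.g.
 *
 *	mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
 *
 * as done in __mem_cgroup_try_charge() below.
 */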
1046 
1047 static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
1048 {
1049 	if (do_swap_account) {
1050 		if (res_counter_check_under_limit(&mem->res) &&
1051 			res_counter_check_under_limit(&mem->memsw))
1052 			return true;
1053 	} else
1054 		if (res_counter_check_under_limit(&mem->res))
1055 			return true;
1056 	return false;
1057 }
1058 
1059 static unsigned int get_swappiness(struct mem_cgroup *memcg)
1060 {
1061 	struct cgroup *cgrp = memcg->css.cgroup;
1062 	unsigned int swappiness;
1063 
1064 	/* root ? */
1065 	if (cgrp->parent == NULL)
1066 		return vm_swappiness;
1067 
1068 	spin_lock(&memcg->reclaim_param_lock);
1069 	swappiness = memcg->swappiness;
1070 	spin_unlock(&memcg->reclaim_param_lock);
1071 
1072 	return swappiness;
1073 }
1074 
1075 static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
1076 {
1077 	int *val = data;
1078 	(*val)++;
1079 	return 0;
1080 }
1081 
1082 /**
1083  * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1084  * @memcg: The memory cgroup that went over limit
1085  * @p: Task that is going to be killed
1086  *
1087  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1088  * enabled
1089  */
1090 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1091 {
1092 	struct cgroup *task_cgrp;
1093 	struct cgroup *mem_cgrp;
1094 	/*
1095 	 * Need a buffer in BSS, can't rely on allocations. The code relies
1096 	 * on the assumption that OOM is serialized for the memory controller.
1097 	 * If this assumption is broken, revisit this code.
1098 	 */
1099 	static char memcg_name[PATH_MAX];
1100 	int ret;
1101 
1102 	if (!memcg || !p)
1103 		return;
1104 
1105 
1106 	rcu_read_lock();
1107 
1108 	mem_cgrp = memcg->css.cgroup;
1109 	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1110 
1111 	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1112 	if (ret < 0) {
1113 		/*
1114 		 * Unfortunately, we are unable to convert to a useful name,
1115 		 * but we'll still print out the usage information.
1116 		 */
1117 		rcu_read_unlock();
1118 		goto done;
1119 	}
1120 	rcu_read_unlock();
1121 
1122 	printk(KERN_INFO "Task in %s killed", memcg_name);
1123 
1124 	rcu_read_lock();
1125 	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1126 	if (ret < 0) {
1127 		rcu_read_unlock();
1128 		goto done;
1129 	}
1130 	rcu_read_unlock();
1131 
1132 	/*
1133 	 * Continues from above, so we don't need a KERN_ level
1134 	 */
1135 	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1136 done:
1137 
1138 	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1139 		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1140 		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1141 		res_counter_read_u64(&memcg->res, RES_FAILCNT));
1142 	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1143 		"failcnt %llu\n",
1144 		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1145 		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1146 		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1147 }
1148 
1149 /*
1150  * This function returns the number of memcgs under the hierarchy tree.
1151  * Returns 1 (self count) if there are no children.
1152  */
1153 static int mem_cgroup_count_children(struct mem_cgroup *mem)
1154 {
1155 	int num = 0;
1156  	mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
1157 	return num;
1158 }
1159 
1160 /*
1161  * Visit the first child (need not be the first child as per the ordering
1162  * of the cgroup list, since we track last_scanned_child) of @mem and use
1163  * that to reclaim free pages from.
1164  */
1165 static struct mem_cgroup *
1166 mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1167 {
1168 	struct mem_cgroup *ret = NULL;
1169 	struct cgroup_subsys_state *css;
1170 	int nextid, found;
1171 
1172 	if (!root_mem->use_hierarchy) {
1173 		css_get(&root_mem->css);
1174 		ret = root_mem;
1175 	}
1176 
1177 	while (!ret) {
1178 		rcu_read_lock();
1179 		nextid = root_mem->last_scanned_child + 1;
1180 		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
1181 				   &found);
1182 		if (css && css_tryget(css))
1183 			ret = container_of(css, struct mem_cgroup, css);
1184 
1185 		rcu_read_unlock();
1186 		/* Updates scanning parameter */
1187 		spin_lock(&root_mem->reclaim_param_lock);
1188 		if (!css) {
1189 			/* this means start scan from ID:1 */
1190 			root_mem->last_scanned_child = 0;
1191 		} else
1192 			root_mem->last_scanned_child = found;
1193 		spin_unlock(&root_mem->reclaim_param_lock);
1194 	}
1195 
1196 	return ret;
1197 }
1198 
1199 /*
1200  * Scan the hierarchy if needed to reclaim memory. We remember the last child
1201  * we reclaimed from, so that we don't end up penalizing one child extensively
1202  * based on its position in the children list.
1203  *
1204  * root_mem is the original ancestor that we've been reclaiming from.
1205  *
1206  * We give up and return to the caller when we visit root_mem twice.
1207  * (other groups can be removed while we're walking....)
1208  *
1209  * If shrink==true, this returns immediately to avoid freeing too much.
1210  */
1211 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1212 						struct zone *zone,
1213 						gfp_t gfp_mask,
1214 						unsigned long reclaim_options)
1215 {
1216 	struct mem_cgroup *victim;
1217 	int ret, total = 0;
1218 	int loop = 0;
1219 	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1220 	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1221 	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1222 	unsigned long excess = mem_cgroup_get_excess(root_mem);
1223 
1224 	/* If memsw_is_minimum==1, swap-out is of-no-use. */
1225 	if (root_mem->memsw_is_minimum)
1226 		noswap = true;
1227 
1228 	while (1) {
1229 		victim = mem_cgroup_select_victim(root_mem);
1230 		if (victim == root_mem) {
1231 			loop++;
1232 			if (loop >= 1)
1233 				drain_all_stock_async();
1234 			if (loop >= 2) {
1235 				/*
1236 				 * If we have not been able to reclaim
1237 				 * anything, it might be because there are
1238 				 * no reclaimable pages under this hierarchy.
1239 				 */
1240 				if (!check_soft || !total) {
1241 					css_put(&victim->css);
1242 					break;
1243 				}
1244 				/*
1245 				 * We want to do more targeted reclaim.
1246 				 * excess >> 2 is not too large, so we won't
1247 				 * reclaim too much, nor too small, so we won't
1248 				 * keep coming back to reclaim from this cgroup.
1249 				 */
1250 				if (total >= (excess >> 2) ||
1251 					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1252 					css_put(&victim->css);
1253 					break;
1254 				}
1255 			}
1256 		}
1257 		if (!mem_cgroup_local_usage(victim)) {
1258 			/* this cgroup's local usage == 0 */
1259 			css_put(&victim->css);
1260 			continue;
1261 		}
1262 		/* we use swappiness of local cgroup */
1263 		if (check_soft)
1264 			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1265 				noswap, get_swappiness(victim), zone,
1266 				zone->zone_pgdat->node_id);
1267 		else
1268 			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1269 						noswap, get_swappiness(victim));
1270 		css_put(&victim->css);
1271 		/*
1272 		 * When shrinking usage, we can't check whether we should stop
1273 		 * here or reclaim more; that depends on the callers.
1274 		 * last_scanned_child works well enough to keep fairness under the tree.
1275 		 */
1276 		if (shrink)
1277 			return ret;
1278 		total += ret;
1279 		if (check_soft) {
1280 			if (res_counter_check_under_soft_limit(&root_mem->res))
1281 				return total;
1282 		} else if (mem_cgroup_check_under_limit(root_mem))
1283 			return 1 + total;
1284 	}
1285 	return total;
1286 }
1287 
1288 static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
1289 {
1290 	int *val = (int *)data;
1291 	int x;
1292 	/*
1293 	 * Logically, we can stop scanning immediately when we find
1294 	 * that a memcg is already locked. But considering unlock ops and
1295 	 * creation/removal of memcgs, scanning them all is the simpler operation.
1296 	 */
1297 	x = atomic_inc_return(&mem->oom_lock);
1298 	*val = max(x, *val);
1299 	return 0;
1300 }
1301 /*
1302  * Check whether the OOM killer is already running under our hierarchy.
1303  * If someone is running it, return false.
1304  */
1305 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1306 {
1307 	int lock_count = 0;
1308 
1309 	mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
1310 
1311 	if (lock_count == 1)
1312 		return true;
1313 	return false;
1314 }
1315 
1316 static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
1317 {
1318 	/*
1319 	 * When a new child is created while the hierarchy is under oom,
1320 	 * mem_cgroup_oom_lock() may not be called. We have to use
1321 	 * atomic_add_unless() here.
1322 	 */
1323 	atomic_add_unless(&mem->oom_lock, -1, 0);
1324 	return 0;
1325 }
1326 
1327 static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1328 {
1329 	mem_cgroup_walk_tree(mem, NULL,	mem_cgroup_oom_unlock_cb);
1330 }
1331 
1332 static DEFINE_MUTEX(memcg_oom_mutex);
1333 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1334 
1335 struct oom_wait_info {
1336 	struct mem_cgroup *mem;
1337 	wait_queue_t	wait;
1338 };
1339 
1340 static int memcg_oom_wake_function(wait_queue_t *wait,
1341 	unsigned mode, int sync, void *arg)
1342 {
1343 	struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
1344 	struct oom_wait_info *oom_wait_info;
1345 
1346 	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1347 
1348 	if (oom_wait_info->mem == wake_mem)
1349 		goto wakeup;
1350 	/* if no hierarchy, no match */
1351 	if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
1352 		return 0;
1353 	/*
1354 	 * Both oom_wait_info->mem and wake_mem are stable under us,
1355 	 * so we can use css_is_ancestor() without worrying about RCU.
1356 	 */
1357 	if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
1358 	    !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
1359 		return 0;
1360 
1361 wakeup:
1362 	return autoremove_wake_function(wait, mode, sync, arg);
1363 }
1364 
1365 static void memcg_wakeup_oom(struct mem_cgroup *mem)
1366 {
1367 	/* for filtering, pass "mem" as argument. */
1368 	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
1369 }
1370 
1371 static void memcg_oom_recover(struct mem_cgroup *mem)
1372 {
1373 	if (mem->oom_kill_disable && atomic_read(&mem->oom_lock))
1374 		memcg_wakeup_oom(mem);
1375 }
1376 
1377 /*
1378  * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop.
1379  */
1380 bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1381 {
1382 	struct oom_wait_info owait;
1383 	bool locked, need_to_kill;
1384 
1385 	owait.mem = mem;
1386 	owait.wait.flags = 0;
1387 	owait.wait.func = memcg_oom_wake_function;
1388 	owait.wait.private = current;
1389 	INIT_LIST_HEAD(&owait.wait.task_list);
1390 	need_to_kill = true;
1391 	/* At first, try to OOM lock hierarchy under mem.*/
1392 	mutex_lock(&memcg_oom_mutex);
1393 	locked = mem_cgroup_oom_lock(mem);
1394 	/*
1395 	 * Even if signal_pending(), we can't quit charge() loop without
1396 	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
1397 	 * under OOM is always welcome, so use TASK_KILLABLE here.
1398 	 */
1399 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1400 	if (!locked || mem->oom_kill_disable)
1401 		need_to_kill = false;
1402 	if (locked)
1403 		mem_cgroup_oom_notify(mem);
1404 	mutex_unlock(&memcg_oom_mutex);
1405 
1406 	if (need_to_kill) {
1407 		finish_wait(&memcg_oom_waitq, &owait.wait);
1408 		mem_cgroup_out_of_memory(mem, mask);
1409 	} else {
1410 		schedule();
1411 		finish_wait(&memcg_oom_waitq, &owait.wait);
1412 	}
1413 	mutex_lock(&memcg_oom_mutex);
1414 	mem_cgroup_oom_unlock(mem);
1415 	memcg_wakeup_oom(mem);
1416 	mutex_unlock(&memcg_oom_mutex);
1417 
1418 	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1419 		return false;
1420 	/* Give chance to dying process */
1421 	schedule_timeout(1);
1422 	return true;
1423 }
1424 
1425 /*
1426  * Currently used to update mapped file statistics, but the routine can be
1427  * generalized to update other statistics as well.
1428  */
1429 void mem_cgroup_update_file_mapped(struct page *page, int val)
1430 {
1431 	struct mem_cgroup *mem;
1432 	struct page_cgroup *pc;
1433 
1434 	pc = lookup_page_cgroup(page);
1435 	if (unlikely(!pc))
1436 		return;
1437 
1438 	lock_page_cgroup(pc);
1439 	mem = pc->mem_cgroup;
1440 	if (!mem || !PageCgroupUsed(pc))
1441 		goto done;
1442 
1443 	/*
1444 	 * Preemption is already disabled. We can use __this_cpu_xxx
1445 	 */
1446 	if (val > 0) {
1447 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1448 		SetPageCgroupFileMapped(pc);
1449 	} else {
1450 		__this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1451 		ClearPageCgroupFileMapped(pc);
1452 	}
1453 
1454 done:
1455 	unlock_page_cgroup(pc);
1456 }
1457 
1458 /*
1459  * Size of the first charge trial. "32" comes from vmscan.c's magic value.
1460  * TODO: it may be necessary to use bigger numbers on big iron.
1461  */
1462 #define CHARGE_SIZE	(32 * PAGE_SIZE)
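/*
 * For example, with a 4KB PAGE_SIZE this is 128KB, i.e. one successful
 * res_counter charge covers up to 32 subsequent page-sized charges from
 * the per-cpu stock below.
 */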
1463 struct memcg_stock_pcp {
1464 	struct mem_cgroup *cached; /* this is never the root cgroup */
1465 	int charge;
1466 	struct work_struct work;
1467 };
1468 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1469 static atomic_t memcg_drain_count;
1470 
1471 /*
1472  * Try to consume stocked charge on this cpu. If successful, PAGE_SIZE is
1473  * consumed from the local stock and true is returned. If the stock is 0 or
1474  * holds charges from a cgroup which is not the current target, false is
1475  * returned. The stock will be refilled later.
1476  */
1477 static bool consume_stock(struct mem_cgroup *mem)
1478 {
1479 	struct memcg_stock_pcp *stock;
1480 	bool ret = true;
1481 
1482 	stock = &get_cpu_var(memcg_stock);
1483 	if (mem == stock->cached && stock->charge)
1484 		stock->charge -= PAGE_SIZE;
1485 	else /* need to call res_counter_charge */
1486 		ret = false;
1487 	put_cpu_var(memcg_stock);
1488 	return ret;
1489 }
1490 
1491 /*
1492  * Return charges cached in the percpu stock to the res_counter and reset the cached information.
1493  */
1494 static void drain_stock(struct memcg_stock_pcp *stock)
1495 {
1496 	struct mem_cgroup *old = stock->cached;
1497 
1498 	if (stock->charge) {
1499 		res_counter_uncharge(&old->res, stock->charge);
1500 		if (do_swap_account)
1501 			res_counter_uncharge(&old->memsw, stock->charge);
1502 	}
1503 	stock->cached = NULL;
1504 	stock->charge = 0;
1505 }
1506 
1507 /*
1508  * This must be called with preemption disabled or by a thread
1509  * which is pinned to the local cpu.
1510  */
1511 static void drain_local_stock(struct work_struct *dummy)
1512 {
1513 	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
1514 	drain_stock(stock);
1515 }
1516 
1517 /*
1518  * Cache charges (val), taken from the res_counter, in the local per-cpu area.
1519  * They will be consumed by consume_stock() later.
1520  */
1521 static void refill_stock(struct mem_cgroup *mem, int val)
1522 {
1523 	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
1524 
1525 	if (stock->cached != mem) { /* reset if necessary */
1526 		drain_stock(stock);
1527 		stock->cached = mem;
1528 	}
1529 	stock->charge += val;
1530 	put_cpu_var(memcg_stock);
1531 }
1532 
1533 /*
1534  * Tries to drain stocked charges on other cpus. This function is asynchronous
1535  * and just queues a work item per cpu to drain locally on each cpu. The caller
1536  * can expect some charges to be returned to the res_counter later, but cannot
1537  * wait for it.
1538  */
1539 static void drain_all_stock_async(void)
1540 {
1541 	int cpu;
1542 	/* This function schedules "drain" asynchronously.
1543 	 * The result of "drain" is not directly handled by callers, so if
1544 	 * someone is already calling drain, we don't have to call it again.
1545 	 * Anyway, the WORK_STRUCT_PENDING check in queue_work_on() will catch
1546 	 * any race; we just do a loose check here.
1547 	 */
1548 	if (atomic_read(&memcg_drain_count))
1549 		return;
1550 	/* Notify other cpus that system-wide "drain" is running */
1551 	atomic_inc(&memcg_drain_count);
1552 	get_online_cpus();
1553 	for_each_online_cpu(cpu) {
1554 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1555 		schedule_work_on(cpu, &stock->work);
1556 	}
1557  	put_online_cpus();
1558 	atomic_dec(&memcg_drain_count);
1559 	/* We don't wait for flush_work */
1560 }
1561 
1562 /* This is a synchronous drain interface. */
1563 static void drain_all_stock_sync(void)
1564 {
1565 	/* called when force_empty is called */
1566 	atomic_inc(&memcg_drain_count);
1567 	schedule_on_each_cpu(drain_local_stock);
1568 	atomic_dec(&memcg_drain_count);
1569 }
1570 
1571 static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
1572 					unsigned long action,
1573 					void *hcpu)
1574 {
1575 	int cpu = (unsigned long)hcpu;
1576 	struct memcg_stock_pcp *stock;
1577 
1578 	if (action != CPU_DEAD)
1579 		return NOTIFY_OK;
1580 	stock = &per_cpu(memcg_stock, cpu);
1581 	drain_stock(stock);
1582 	return NOTIFY_OK;
1583 }
1584 
1585 /*
1586  * Unlike the exported interface, an "oom" parameter is added. If oom==true,
1587  * the OOM killer can be invoked.
1588  */
1589 static int __mem_cgroup_try_charge(struct mm_struct *mm,
1590 			gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
1591 {
1592 	struct mem_cgroup *mem, *mem_over_limit;
1593 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1594 	struct res_counter *fail_res;
1595 	int csize = CHARGE_SIZE;
1596 
1597 	/*
1598 	 * Unlike the global VM's OOM kill, we're not in a system-level
1599 	 * memory shortage. So, allow a dying process to proceed, in addition
1600 	 * to a MEMDIE process.
1601 	 */
1602 	if (unlikely(test_thread_flag(TIF_MEMDIE)
1603 		     || fatal_signal_pending(current)))
1604 		goto bypass;
1605 
1606 	/*
1607 	 * We always charge the cgroup the mm_struct belongs to.
1608 	 * The mm_struct's mem_cgroup changes on task migration if the
1609 	 * thread group leader migrates. It's possible that mm is not
1610 	 * set, if so charge the init_mm (happens for pagecache usage).
1611 	 */
1612 	mem = *memcg;
1613 	if (likely(!mem)) {
1614 		mem = try_get_mem_cgroup_from_mm(mm);
1615 		*memcg = mem;
1616 	} else {
1617 		css_get(&mem->css);
1618 	}
1619 	if (unlikely(!mem))
1620 		return 0;
1621 
1622 	VM_BUG_ON(css_is_removed(&mem->css));
1623 	if (mem_cgroup_is_root(mem))
1624 		goto done;
1625 
1626 	while (1) {
1627 		int ret = 0;
1628 		unsigned long flags = 0;
1629 
1630 		if (consume_stock(mem))
1631 			goto done;
1632 
1633 		ret = res_counter_charge(&mem->res, csize, &fail_res);
1634 		if (likely(!ret)) {
1635 			if (!do_swap_account)
1636 				break;
1637 			ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1638 			if (likely(!ret))
1639 				break;
1640 			/* mem+swap counter fails */
1641 			res_counter_uncharge(&mem->res, csize);
1642 			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1643 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1644 									memsw);
1645 		} else
1646 			/* mem counter fails */
1647 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1648 									res);
1649 
1650 		/* reduce request size and retry */
1651 		if (csize > PAGE_SIZE) {
1652 			csize = PAGE_SIZE;
1653 			continue;
1654 		}
1655 		if (!(gfp_mask & __GFP_WAIT))
1656 			goto nomem;
1657 
1658 		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1659 						gfp_mask, flags);
1660 		if (ret)
1661 			continue;
1662 
1663 		/*
1664 		 * try_to_free_mem_cgroup_pages() might not give us a full
1665 		 * picture of reclaim. Some pages are reclaimed and might be
1666 		 * moved to swap cache or just unmapped from the cgroup.
1667 		 * Check the limit again to see if the reclaim reduced the
1668 		 * current usage of the cgroup before giving up
1669 		 *
1670 		 */
1671 		if (mem_cgroup_check_under_limit(mem_over_limit))
1672 			continue;
1673 
1674 		/* try to avoid oom while someone is moving charge */
1675 		if (mc.moving_task && current != mc.moving_task) {
1676 			struct mem_cgroup *from, *to;
1677 			bool do_continue = false;
1678 			/*
1679 			 * There is a small race that "from" or "to" can be
1680 			 * freed by rmdir, so we use css_tryget().
1681 			 */
1682 			from = mc.from;
1683 			to = mc.to;
1684 			if (from && css_tryget(&from->css)) {
1685 				if (mem_over_limit->use_hierarchy)
1686 					do_continue = css_is_ancestor(
1687 							&from->css,
1688 							&mem_over_limit->css);
1689 				else
1690 					do_continue = (from == mem_over_limit);
1691 				css_put(&from->css);
1692 			}
1693 			if (!do_continue && to && css_tryget(&to->css)) {
1694 				if (mem_over_limit->use_hierarchy)
1695 					do_continue = css_is_ancestor(
1696 							&to->css,
1697 							&mem_over_limit->css);
1698 				else
1699 					do_continue = (to == mem_over_limit);
1700 				css_put(&to->css);
1701 			}
1702 			if (do_continue) {
1703 				DEFINE_WAIT(wait);
1704 				prepare_to_wait(&mc.waitq, &wait,
1705 							TASK_INTERRUPTIBLE);
1706 				/* moving charge context might have finished. */
1707 				if (mc.moving_task)
1708 					schedule();
1709 				finish_wait(&mc.waitq, &wait);
1710 				continue;
1711 			}
1712 		}
1713 
1714 		if (!nr_retries--) {
1715 			if (!oom)
1716 				goto nomem;
1717 			if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
1718 				nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1719 				continue;
1720 			}
1721 			/* When we reach here, the current task is dying. */
1722 			css_put(&mem->css);
1723 			goto bypass;
1724 		}
1725 	}
1726 	if (csize > PAGE_SIZE)
1727 		refill_stock(mem, csize - PAGE_SIZE);
1728 done:
1729 	return 0;
1730 nomem:
1731 	css_put(&mem->css);
1732 	return -ENOMEM;
1733 bypass:
1734 	*memcg = NULL;
1735 	return 0;
1736 }
1737 
1738 /*
1739  * Sometimes we have to undo a charge we got from try_charge().
1740  * This function does the uncharge and puts the css refcount
1741  * taken by try_charge().
1742  */
1743 static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
1744 							unsigned long count)
1745 {
1746 	if (!mem_cgroup_is_root(mem)) {
1747 		res_counter_uncharge(&mem->res, PAGE_SIZE * count);
1748 		if (do_swap_account)
1749 			res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
1750 		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
1751 		WARN_ON_ONCE(count > INT_MAX);
1752 		__css_put(&mem->css, (int)count);
1753 	}
1754 	/* we don't need css_put for root */
1755 }
1756 
1757 static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
1758 {
1759 	__mem_cgroup_cancel_charge(mem, 1);
1760 }
1761 
1762 /*
1763  * A helper function to get a mem_cgroup from its ID. Must be called under
1764  * rcu_read_lock(). The caller must check css_is_removed() or similar if that
1765  * is a concern (dropping a refcnt from swap can be called against a removed
1766  * memcg).
1767  */
1768 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1769 {
1770 	struct cgroup_subsys_state *css;
1771 
1772 	/* ID 0 is unused ID */
1773 	if (!id)
1774 		return NULL;
1775 	css = css_lookup(&mem_cgroup_subsys, id);
1776 	if (!css)
1777 		return NULL;
1778 	return container_of(css, struct mem_cgroup, css);
1779 }
1780 
1781 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
1782 {
1783 	struct mem_cgroup *mem = NULL;
1784 	struct page_cgroup *pc;
1785 	unsigned short id;
1786 	swp_entry_t ent;
1787 
1788 	VM_BUG_ON(!PageLocked(page));
1789 
1790 	pc = lookup_page_cgroup(page);
1791 	lock_page_cgroup(pc);
1792 	if (PageCgroupUsed(pc)) {
1793 		mem = pc->mem_cgroup;
1794 		if (mem && !css_tryget(&mem->css))
1795 			mem = NULL;
1796 	} else if (PageSwapCache(page)) {
1797 		ent.val = page_private(page);
1798 		id = lookup_swap_cgroup(ent);
1799 		rcu_read_lock();
1800 		mem = mem_cgroup_lookup(id);
1801 		if (mem && !css_tryget(&mem->css))
1802 			mem = NULL;
1803 		rcu_read_unlock();
1804 	}
1805 	unlock_page_cgroup(pc);
1806 	return mem;
1807 }
1808 
1809 /*
1810  * Commit a charge got by __mem_cgroup_try_charge() and set the page_cgroup
1811  * to the USED state. If it is already USED, uncharge and return.
1812  */
1813 
1814 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1815 				     struct page_cgroup *pc,
1816 				     enum charge_type ctype)
1817 {
1818 	/* try_charge() can set *memcg to NULL; that case is handled here. */
1819 	if (!mem)
1820 		return;
1821 
1822 	lock_page_cgroup(pc);
1823 	if (unlikely(PageCgroupUsed(pc))) {
1824 		unlock_page_cgroup(pc);
1825 		mem_cgroup_cancel_charge(mem);
1826 		return;
1827 	}
1828 
1829 	pc->mem_cgroup = mem;
1830 	/*
1831 	 * We access a page_cgroup asynchronously without lock_page_cgroup().
1832 	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
1833 	 * is accessed after testing the USED bit. To make pc->mem_cgroup visible
1834 	 * before the USED bit, we need a memory barrier here.
1835 	 * See mem_cgroup_add_lru_list(), etc.
1836 	 */
1837 	smp_wmb();
1838 	switch (ctype) {
1839 	case MEM_CGROUP_CHARGE_TYPE_CACHE:
1840 	case MEM_CGROUP_CHARGE_TYPE_SHMEM:
1841 		SetPageCgroupCache(pc);
1842 		SetPageCgroupUsed(pc);
1843 		break;
1844 	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1845 		ClearPageCgroupCache(pc);
1846 		SetPageCgroupUsed(pc);
1847 		break;
1848 	default:
1849 		break;
1850 	}
1851 
1852 	mem_cgroup_charge_statistics(mem, pc, true);
1853 
1854 	unlock_page_cgroup(pc);
1855 	/*
1856 	 * "charge_statistics" updated the event counter, so check it here and
1857 	 * insert the ancestors (and their ancestors) into the soft-limit RB-tree
1858 	 * if they exceed their soft limit.
1859 	 */
1860 	memcg_check_events(mem, pc->page);
1861 }
1862 
1863 /**
1864  * __mem_cgroup_move_account - move account of the page
1865  * @pc:	page_cgroup of the page.
1866  * @from: mem_cgroup which the page is moved from.
1867  * @to:	mem_cgroup which the page is moved to. @from != @to.
1868  * @uncharge: whether we should call uncharge and css_put against @from.
1869  *
1870  * The caller must confirm following.
1871  * - page is not on LRU (isolate_page() is useful.)
1872  * - the pc is locked, used, and ->mem_cgroup points to @from.
1873  *
1874  * This function does neither "charge" nor css_get against the new cgroup;
1875  * that should be done by the caller (__mem_cgroup_try_charge would be useful).
1876  * If @uncharge is true, this function does "uncharge" from the old cgroup;
1877  * if @uncharge is false it does not, so the caller should do the "uncharge".
1878  */
1879 
1880 static void __mem_cgroup_move_account(struct page_cgroup *pc,
1881 	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1882 {
1883 	VM_BUG_ON(from == to);
1884 	VM_BUG_ON(PageLRU(pc->page));
1885 	VM_BUG_ON(!PageCgroupLocked(pc));
1886 	VM_BUG_ON(!PageCgroupUsed(pc));
1887 	VM_BUG_ON(pc->mem_cgroup != from);
1888 
1889 	if (PageCgroupFileMapped(pc)) {
1890 		/* Update mapped_file data for mem_cgroup */
1891 		preempt_disable();
1892 		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1893 		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1894 		preempt_enable();
1895 	}
1896 	mem_cgroup_charge_statistics(from, pc, false);
1897 	if (uncharge)
1898 		/* This is not "cancel", but cancel_charge does all we need. */
1899 		mem_cgroup_cancel_charge(from);
1900 
1901 	/* caller should have done css_get */
1902 	pc->mem_cgroup = to;
1903 	mem_cgroup_charge_statistics(to, pc, true);
1904 	/*
1905 	 * We charge against "to", which may not have any tasks, so "to"
1906 	 * can be under rmdir(). But in the current implementation, the callers
1907 	 * of this function are just force_empty() and move charge, so it's
1908 	 * guaranteed that "to" is never removed. So, we don't check rmdir
1909 	 * status here.
1910 	 */
1911 }
1912 
1913 /*
1914  * check whether the @pc is valid for moving account and call
1915  * __mem_cgroup_move_account()
1916  */
1917 static int mem_cgroup_move_account(struct page_cgroup *pc,
1918 		struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1919 {
1920 	int ret = -EINVAL;
1921 	lock_page_cgroup(pc);
1922 	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
1923 		__mem_cgroup_move_account(pc, from, to, uncharge);
1924 		ret = 0;
1925 	}
1926 	unlock_page_cgroup(pc);
1927 	/*
1928 	 * check events
1929 	 */
1930 	memcg_check_events(to, pc->page);
1931 	memcg_check_events(from, pc->page);
1932 	return ret;
1933 }
1934 
1935 /*
1936  * move charges to its parent.
1937  */
1938 
1939 static int mem_cgroup_move_parent(struct page_cgroup *pc,
1940 				  struct mem_cgroup *child,
1941 				  gfp_t gfp_mask)
1942 {
1943 	struct page *page = pc->page;
1944 	struct cgroup *cg = child->css.cgroup;
1945 	struct cgroup *pcg = cg->parent;
1946 	struct mem_cgroup *parent;
1947 	int ret;
1948 
1949 	/* Is ROOT ? */
1950 	if (!pcg)
1951 		return -EINVAL;
1952 
1953 	ret = -EBUSY;
1954 	if (!get_page_unless_zero(page))
1955 		goto out;
1956 	if (isolate_lru_page(page))
1957 		goto put;
1958 
1959 	parent = mem_cgroup_from_cont(pcg);
1960 	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
1961 	if (ret || !parent)
1962 		goto put_back;
1963 
1964 	ret = mem_cgroup_move_account(pc, child, parent, true);
1965 	if (ret)
1966 		mem_cgroup_cancel_charge(parent);
1967 put_back:
1968 	putback_lru_page(page);
1969 put:
1970 	put_page(page);
1971 out:
1972 	return ret;
1973 }
1974 
1975 /*
1976  * Charge the memory controller for page usage.
1977  * Return
1978  * 0 if the charge was successful
1979  * < 0 if the cgroup is over its limit
1980  */
1981 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1982 				gfp_t gfp_mask, enum charge_type ctype,
1983 				struct mem_cgroup *memcg)
1984 {
1985 	struct mem_cgroup *mem;
1986 	struct page_cgroup *pc;
1987 	int ret;
1988 
1989 	pc = lookup_page_cgroup(page);
1990 	/* can happen at boot */
1991 	if (unlikely(!pc))
1992 		return 0;
1993 	prefetchw(pc);
1994 
1995 	mem = memcg;
1996 	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1997 	if (ret || !mem)
1998 		return ret;
1999 
2000 	__mem_cgroup_commit_charge(mem, pc, ctype);
2001 	return 0;
2002 }
2003 
2004 int mem_cgroup_newpage_charge(struct page *page,
2005 			      struct mm_struct *mm, gfp_t gfp_mask)
2006 {
2007 	if (mem_cgroup_disabled())
2008 		return 0;
2009 	if (PageCompound(page))
2010 		return 0;
2011 	/*
2012 	 * If already mapped, we don't have to account.
2013 	 * If page cache, page->mapping has an address_space.
2014 	 * But page->mapping may hold a stale anon_vma pointer; detect that
2015 	 * with the PageAnon() check. A newly-mapped anonymous page's
2016 	 * page->mapping is NULL.
2017 	 */
2018 	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
2019 		return 0;
2020 	if (unlikely(!mm))
2021 		mm = &init_mm;
2022 	return mem_cgroup_charge_common(page, mm, gfp_mask,
2023 				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
2024 }
2025 
2026 static void
2027 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2028 					enum charge_type ctype);
2029 
2030 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2031 				gfp_t gfp_mask)
2032 {
2033 	struct mem_cgroup *mem = NULL;
2034 	int ret;
2035 
2036 	if (mem_cgroup_disabled())
2037 		return 0;
2038 	if (PageCompound(page))
2039 		return 0;
2040 	/*
2041 	 * Corner case handling. This is usually called from add_to_page_cache().
2042 	 * But some filesystems (shmem) precharge this page before calling it
2043 	 * and call add_to_page_cache() with GFP_NOWAIT.
2044 	 *
2045 	 * In the GFP_NOWAIT case, the page may be pre-charged before calling
2046 	 * add_to_page_cache(). (See shmem.c.) Check it here and avoid charging
2047 	 * twice. (It works but has to pay a slightly larger cost.)
2048 	 * And when the page is a SwapCache page, swap information should be
2049 	 * taken into account. This is under lock_page() now.
2050 	 */
2051 	if (!(gfp_mask & __GFP_WAIT)) {
2052 		struct page_cgroup *pc;
2053 
2054 
2055 		pc = lookup_page_cgroup(page);
2056 		if (!pc)
2057 			return 0;
2058 		lock_page_cgroup(pc);
2059 		if (PageCgroupUsed(pc)) {
2060 			unlock_page_cgroup(pc);
2061 			return 0;
2062 		}
2063 		unlock_page_cgroup(pc);
2064 	}
2065 
2066 	if (unlikely(!mm && !mem))
2067 		mm = &init_mm;
2068 
2069 	if (page_is_file_cache(page))
2070 		return mem_cgroup_charge_common(page, mm, gfp_mask,
2071 				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
2072 
2073 	/* shmem */
2074 	if (PageSwapCache(page)) {
2075 		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2076 		if (!ret)
2077 			__mem_cgroup_commit_charge_swapin(page, mem,
2078 					MEM_CGROUP_CHARGE_TYPE_SHMEM);
2079 	} else
2080 		ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2081 					MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
2082 
2083 	return ret;
2084 }
2085 
2086 /*
2087  * During swap-in (try_charge -> commit or cancel) the page is locked.
2088  * When try_charge() returns successfully, one refcnt on the memcg, not
2089  * backed by a struct page_cgroup, is acquired. This refcnt is consumed by
2090  * "commit()" or dropped by "cancel()".
2091  */
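/*
 * A hedged usage sketch of the try/commit/cancel protocol above. It is not
 * copied from any real caller; the "mapping_ok" flag is purely illustrative
 * and error handling/locking details are omitted:
 *
 *	struct mem_cgroup *ptr = NULL;
 *
 *	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr))
 *		return -ENOMEM;		(charge failed, bail out)
 *	... set up the pte / rmap for the page ...
 *	if (mapping_ok)
 *		mem_cgroup_commit_charge_swapin(page, ptr);
 *	else
 *		mem_cgroup_cancel_charge_swapin(ptr);
 *
 * Both commit and cancel tolerate a NULL memcg pointer, so the "bypass"
 * case of try_charge needs no special handling by the caller.
 */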
2092 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2093 				 struct page *page,
2094 				 gfp_t mask, struct mem_cgroup **ptr)
2095 {
2096 	struct mem_cgroup *mem;
2097 	int ret;
2098 
2099 	if (mem_cgroup_disabled())
2100 		return 0;
2101 
2102 	if (!do_swap_account)
2103 		goto charge_cur_mm;
2104 	/*
2105 	 * A racing thread's fault, or swapoff, may have already updated
2106 	 * the pte, and even removed page from swap cache: in those cases
2107 	 * do_swap_page()'s pte_same() test will fail; but there's also a
2108 	 * KSM case which does need to charge the page.
2109 	 */
2110 	if (!PageSwapCache(page))
2111 		goto charge_cur_mm;
2112 	mem = try_get_mem_cgroup_from_page(page);
2113 	if (!mem)
2114 		goto charge_cur_mm;
2115 	*ptr = mem;
2116 	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
2117 	/* drop extra refcnt from tryget */
2118 	css_put(&mem->css);
2119 	return ret;
2120 charge_cur_mm:
2121 	if (unlikely(!mm))
2122 		mm = &init_mm;
2123 	return __mem_cgroup_try_charge(mm, mask, ptr, true);
2124 }
2125 
2126 static void
2127 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2128 					enum charge_type ctype)
2129 {
2130 	struct page_cgroup *pc;
2131 
2132 	if (mem_cgroup_disabled())
2133 		return;
2134 	if (!ptr)
2135 		return;
2136 	cgroup_exclude_rmdir(&ptr->css);
2137 	pc = lookup_page_cgroup(page);
2138 	mem_cgroup_lru_del_before_commit_swapcache(page);
2139 	__mem_cgroup_commit_charge(ptr, pc, ctype);
2140 	mem_cgroup_lru_add_after_commit_swapcache(page);
2141 	/*
2142 	 * Now the swap is in memory. This means the page may be
2143 	 * counted both as mem and swap -- a double count.
2144 	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
2145 	 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
2146 	 * may call delete_from_swap_cache() before we reach here.
2147 	 */
2148 	if (do_swap_account && PageSwapCache(page)) {
2149 		swp_entry_t ent = {.val = page_private(page)};
2150 		unsigned short id;
2151 		struct mem_cgroup *memcg;
2152 
2153 		id = swap_cgroup_record(ent, 0);
2154 		rcu_read_lock();
2155 		memcg = mem_cgroup_lookup(id);
2156 		if (memcg) {
2157 			/*
2158 			 * This recorded memcg can be an obsolete one. So, avoid
2159 			 * calling css_tryget().
2160 			 */
2161 			if (!mem_cgroup_is_root(memcg))
2162 				res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2163 			mem_cgroup_swap_statistics(memcg, false);
2164 			mem_cgroup_put(memcg);
2165 		}
2166 		rcu_read_unlock();
2167 	}
2168 	/*
2169 	 * At swapin, we may charge against a cgroup which has no tasks.
2170 	 * So, rmdir()->pre_destroy() can be called while we do this charge.
2171 	 * In that case, we need to call pre_destroy() again. Check it here.
2172 	 */
2173 	cgroup_release_and_wakeup_rmdir(&ptr->css);
2174 }
2175 
2176 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
2177 {
2178 	__mem_cgroup_commit_charge_swapin(page, ptr,
2179 					MEM_CGROUP_CHARGE_TYPE_MAPPED);
2180 }
2181 
2182 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
2183 {
2184 	if (mem_cgroup_disabled())
2185 		return;
2186 	if (!mem)
2187 		return;
2188 	mem_cgroup_cancel_charge(mem);
2189 }
2190 
2191 static void
2192 __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2193 {
2194 	struct memcg_batch_info *batch = NULL;
2195 	bool uncharge_memsw = true;
2196 	/* If swapout, usage of swap doesn't decrease */
2197 	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2198 		uncharge_memsw = false;
2199 
2200 	batch = &current->memcg_batch;
2201 	/*
2202 	 * Usually, we do css_get() when we remember a memcg pointer.
2203 	 * But in this case, we keep res->usage until the end of a series of
2204 	 * uncharges, so it's ok to ignore the memcg's refcnt.
2205 	 */
2206 	if (!batch->memcg)
2207 		batch->memcg = mem;
2208 	/*
2209 	 * do_batch > 0 when unmapping pages or doing inode invalidate/truncate.
2210 	 * In those cases, all pages freed continuously can be expected to be in
2211 	 * the same cgroup and we have a chance to coalesce uncharges.
2212 	 * But we uncharge one by one if this task is being killed by OOM
2213 	 * (TIF_MEMDIE) because we want to uncharge as soon as possible.
2214 	 */
2215 
2216 	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2217 		goto direct_uncharge;
2218 
2219 	/*
2220 	 * In the typical case, batch->memcg == mem. This means we can
2221 	 * merge a series of uncharges into one uncharge of the res_counter.
2222 	 * If not, we uncharge the res_counter one by one.
2223 	 */
2224 	if (batch->memcg != mem)
2225 		goto direct_uncharge;
2226 	/* remember freed charge and uncharge it later */
2227 	batch->bytes += PAGE_SIZE;
2228 	if (uncharge_memsw)
2229 		batch->memsw_bytes += PAGE_SIZE;
2230 	return;
2231 direct_uncharge:
2232 	res_counter_uncharge(&mem->res, PAGE_SIZE);
2233 	if (uncharge_memsw)
2234 		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
2235 	if (unlikely(batch->memcg != mem))
2236 		memcg_oom_recover(mem);
2237 	return;
2238 }
2239 
2240 /*
2241  * uncharge if !page_mapped(page)
2242  */
2243 static struct mem_cgroup *
2244 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2245 {
2246 	struct page_cgroup *pc;
2247 	struct mem_cgroup *mem = NULL;
2248 	struct mem_cgroup_per_zone *mz;
2249 
2250 	if (mem_cgroup_disabled())
2251 		return NULL;
2252 
2253 	if (PageSwapCache(page))
2254 		return NULL;
2255 
2256 	/*
2257 	 * Check if our page_cgroup is valid
2258 	 */
2259 	pc = lookup_page_cgroup(page);
2260 	if (unlikely(!pc || !PageCgroupUsed(pc)))
2261 		return NULL;
2262 
2263 	lock_page_cgroup(pc);
2264 
2265 	mem = pc->mem_cgroup;
2266 
2267 	if (!PageCgroupUsed(pc))
2268 		goto unlock_out;
2269 
2270 	switch (ctype) {
2271 	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2272 	case MEM_CGROUP_CHARGE_TYPE_DROP:
2273 		/* See mem_cgroup_prepare_migration() */
2274 		if (page_mapped(page) || PageCgroupMigration(pc))
2275 			goto unlock_out;
2276 		break;
2277 	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
2278 		if (!PageAnon(page)) {	/* Shared memory */
2279 			if (page->mapping && !page_is_file_cache(page))
2280 				goto unlock_out;
2281 		} else if (page_mapped(page)) /* Anon */
2282 				goto unlock_out;
2283 		break;
2284 	default:
2285 		break;
2286 	}
2287 
2288 	if (!mem_cgroup_is_root(mem))
2289 		__do_uncharge(mem, ctype);
2290 	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2291 		mem_cgroup_swap_statistics(mem, true);
2292 	mem_cgroup_charge_statistics(mem, pc, false);
2293 
2294 	ClearPageCgroupUsed(pc);
2295 	/*
2296 	 * pc->mem_cgroup is not cleared here. It will be accessed when the page
2297 	 * is freed from the LRU. This is safe because an uncharged page is not
2298 	 * expected to be reused (it is freed soon). The exception is SwapCache,
2299 	 * which is handled by special functions.
2300 	 */
2301 
2302 	mz = page_cgroup_zoneinfo(pc);
2303 	unlock_page_cgroup(pc);
2304 
2305 	memcg_check_events(mem, page);
2306 	/* at swapout, this memcg will be accessed to record to swap */
2307 	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2308 		css_put(&mem->css);
2309 
2310 	return mem;
2311 
2312 unlock_out:
2313 	unlock_page_cgroup(pc);
2314 	return NULL;
2315 }
2316 
2317 void mem_cgroup_uncharge_page(struct page *page)
2318 {
2319 	/* early check. */
2320 	if (page_mapped(page))
2321 		return;
2322 	if (page->mapping && !PageAnon(page))
2323 		return;
2324 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
2325 }
2326 
2327 void mem_cgroup_uncharge_cache_page(struct page *page)
2328 {
2329 	VM_BUG_ON(page_mapped(page));
2330 	VM_BUG_ON(page->mapping);
2331 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
2332 }
2333 
2334 /*
2335  * Batch start/end is called in unmap_page_range/invalidate/truncate.
2336  * In those cases, pages are freed continuously and we can expect them to be
2337  * in the same memcg. Each of those callers itself limits the number of
2338  * pages freed at once, so uncharge_start/end() is called properly.
2339  * This may be called multiple (e.g., 2) times in a context.
2340  */
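/*
 * A hedged sketch of the intended bracket pattern (illustrative only, not
 * copied from a real caller):
 *
 *	mem_cgroup_uncharge_start();
 *	for each page being freed:
 *		mem_cgroup_uncharge_page(page);   (or the cache variant)
 *	mem_cgroup_uncharge_end();
 *
 * This lets __do_uncharge() accumulate bytes in current->memcg_batch so the
 * res_counter is touched once per batch instead of once per page.
 */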
2341 
2342 void mem_cgroup_uncharge_start(void)
2343 {
2344 	current->memcg_batch.do_batch++;
2345 	/* We can do nest. */
2346 	if (current->memcg_batch.do_batch == 1) {
2347 		current->memcg_batch.memcg = NULL;
2348 		current->memcg_batch.bytes = 0;
2349 		current->memcg_batch.memsw_bytes = 0;
2350 	}
2351 }
2352 
2353 void mem_cgroup_uncharge_end(void)
2354 {
2355 	struct memcg_batch_info *batch = &current->memcg_batch;
2356 
2357 	if (!batch->do_batch)
2358 		return;
2359 
2360 	batch->do_batch--;
2361 	if (batch->do_batch) /* If stacked, do nothing. */
2362 		return;
2363 
2364 	if (!batch->memcg)
2365 		return;
2366 	/*
2367 	 * This "batch->memcg" is valid without any css_get/put etc.
2368 	 * because we hide charges behind us.
2369 	 */
2370 	if (batch->bytes)
2371 		res_counter_uncharge(&batch->memcg->res, batch->bytes);
2372 	if (batch->memsw_bytes)
2373 		res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
2374 	memcg_oom_recover(batch->memcg);
2375 	/* forget this pointer (for sanity check) */
2376 	batch->memcg = NULL;
2377 }
2378 
2379 #ifdef CONFIG_SWAP
2380 /*
2381  * Called after __delete_from_swap_cache(); drops the "page" accounting.
2382  * The memcg information is recorded in the swap_cgroup of "ent".
2383  */
2384 void
2385 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
2386 {
2387 	struct mem_cgroup *memcg;
2388 	int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
2389 
2390 	if (!swapout) /* this was a swap cache but the swap is unused ! */
2391 		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
2392 
2393 	memcg = __mem_cgroup_uncharge_common(page, ctype);
2394 
2395 	/* record memcg information */
2396 	if (do_swap_account && swapout && memcg) {
2397 		swap_cgroup_record(ent, css_id(&memcg->css));
2398 		mem_cgroup_get(memcg);
2399 	}
2400 	if (swapout && memcg)
2401 		css_put(&memcg->css);
2402 }
2403 #endif
2404 
2405 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2406 /*
2407  * Called from swap_entry_free(). Removes the record in swap_cgroup and
2408  * uncharges the "memsw" account.
2409  */
2410 void mem_cgroup_uncharge_swap(swp_entry_t ent)
2411 {
2412 	struct mem_cgroup *memcg;
2413 	unsigned short id;
2414 
2415 	if (!do_swap_account)
2416 		return;
2417 
2418 	id = swap_cgroup_record(ent, 0);
2419 	rcu_read_lock();
2420 	memcg = mem_cgroup_lookup(id);
2421 	if (memcg) {
2422 		/*
2423 		 * We uncharge this because the swap is freed.
2424 		 * This memcg can be an obsolete one. We avoid calling css_tryget().
2425 		 */
2426 		if (!mem_cgroup_is_root(memcg))
2427 			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2428 		mem_cgroup_swap_statistics(memcg, false);
2429 		mem_cgroup_put(memcg);
2430 	}
2431 	rcu_read_unlock();
2432 }
2433 
2434 /**
2435  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2436  * @entry: swap entry to be moved
2437  * @from:  mem_cgroup which the entry is moved from
2438  * @to:  mem_cgroup which the entry is moved to
2439  * @need_fixup: whether we should fixup res_counters and refcounts.
2440  *
2441  * It succeeds only when the swap_cgroup's record for this entry is the same
2442  * as the mem_cgroup's id of @from.
2443  *
2444  * Returns 0 on success, -EINVAL on failure.
2445  *
2446  * The caller must have charged to @to, IOW, called res_counter_charge() on
2447  * both res and memsw, and called css_get().
2448  */
2449 static int mem_cgroup_move_swap_account(swp_entry_t entry,
2450 		struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2451 {
2452 	unsigned short old_id, new_id;
2453 
2454 	old_id = css_id(&from->css);
2455 	new_id = css_id(&to->css);
2456 
2457 	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2458 		mem_cgroup_swap_statistics(from, false);
2459 		mem_cgroup_swap_statistics(to, true);
2460 		/*
2461 		 * This function is only called from task migration context now.
2462 		 * It postpones res_counter and refcount handling till the end
2463 		 * of task migration(mem_cgroup_clear_mc()) for performance
2464 		 * improvement. But we cannot postpone mem_cgroup_get(to)
2465 		 * because if the process that has been moved to @to does
2466 		 * swap-in, the refcount of @to might be decreased to 0.
2467 		 */
2468 		mem_cgroup_get(to);
2469 		if (need_fixup) {
2470 			if (!mem_cgroup_is_root(from))
2471 				res_counter_uncharge(&from->memsw, PAGE_SIZE);
2472 			mem_cgroup_put(from);
2473 			/*
2474 			 * we charged both to->res and to->memsw, so we should
2475 			 * uncharge to->res.
2476 			 */
2477 			if (!mem_cgroup_is_root(to))
2478 				res_counter_uncharge(&to->res, PAGE_SIZE);
2479 			css_put(&to->css);
2480 		}
2481 		return 0;
2482 	}
2483 	return -EINVAL;
2484 }
2485 #else
2486 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2487 		struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2488 {
2489 	return -EINVAL;
2490 }
2491 #endif
2492 
2493 /*
2494  * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
2495  * page belongs to.
2496  */
2497 int mem_cgroup_prepare_migration(struct page *page,
2498 	struct page *newpage, struct mem_cgroup **ptr)
2499 {
2500 	struct page_cgroup *pc;
2501 	struct mem_cgroup *mem = NULL;
2502 	enum charge_type ctype;
2503 	int ret = 0;
2504 
2505 	if (mem_cgroup_disabled())
2506 		return 0;
2507 
2508 	pc = lookup_page_cgroup(page);
2509 	lock_page_cgroup(pc);
2510 	if (PageCgroupUsed(pc)) {
2511 		mem = pc->mem_cgroup;
2512 		css_get(&mem->css);
2513 		/*
2514 		 * When migrating an anonymous page, its mapcount goes down
2515 		 * to 0 and uncharge() will be called. But, even if it's fully
2516 		 * unmapped, migration may fail and the page would have to be
2517 		 * charged again. We set the MIGRATION flag here and delay the
2518 		 * uncharge until end_migration() is called.
2519 		 *
2520 		 * Corner Case Thinking
2521 		 * A)
2522 		 * The old page was mapped as Anon and is unmapped-and-freed
2523 		 * while migration is ongoing.
2524 		 * If unmap finds the old page, its uncharge() will be delayed
2525 		 * until end_migration(). If unmap finds the new page, it's
2526 		 * uncharged when its mapcount drops 1->0. If the unmap code
2527 		 * finds a swap migration entry, the new page will not be mapped
2528 		 * and end_migration() will find it (mapcount==0).
2529 		 *
2530 		 * B)
2531 		 * The old page was mapped but migration fails, so the kernel
2532 		 * remaps it. A charge for it is kept by the MIGRATION flag even
2533 		 * if the mapcount goes down to 0, so the remap succeeds
2534 		 * without charging it again.
2535 		 *
2536 		 * C)
2537 		 * The "old" page is under lock_page() until the end of
2538 		 * migration, so the old page itself will not be swapped out.
2539 		 * If the new page is swapped out before end_migration(), our
2540 		 * hook into the usual swap-out path will catch the event.
2541 		 */
2542 		if (PageAnon(page))
2543 			SetPageCgroupMigration(pc);
2544 	}
2545 	unlock_page_cgroup(pc);
2546 	/*
2547 	 * If the page is not charged at this point,
2548 	 * we return here.
2549 	 */
2550 	if (!mem)
2551 		return 0;
2552 
2553 	*ptr = mem;
2554 	ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
2555 	css_put(&mem->css);/* drop extra refcnt */
2556 	if (ret || *ptr == NULL) {
2557 		if (PageAnon(page)) {
2558 			lock_page_cgroup(pc);
2559 			ClearPageCgroupMigration(pc);
2560 			unlock_page_cgroup(pc);
2561 			/*
2562 			 * The old page may be fully unmapped while we kept it.
2563 			 */
2564 			mem_cgroup_uncharge_page(page);
2565 		}
2566 		return -ENOMEM;
2567 	}
2568 	/*
2569 	 * We charge new page before it's used/mapped. So, even if unlock_page()
2570 	 * is called before end_migration, we can catch all events on this new
2571 	 * page. In the case new page is migrated but not remapped, new page's
2572 	 * mapcount will be finally 0 and we call uncharge in end_migration().
2573 	 */
2574 	pc = lookup_page_cgroup(newpage);
2575 	if (PageAnon(page))
2576 		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
2577 	else if (page_is_file_cache(page))
2578 		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2579 	else
2580 		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2581 	__mem_cgroup_commit_charge(mem, pc, ctype);
2582 	return ret;
2583 }
2584 
2585 /* remove redundant charge if migration failed*/
2586 void mem_cgroup_end_migration(struct mem_cgroup *mem,
2587 	struct page *oldpage, struct page *newpage)
2588 {
2589 	struct page *used, *unused;
2590 	struct page_cgroup *pc;
2591 
2592 	if (!mem)
2593 		return;
2594 	/* blocks rmdir() */
2595 	cgroup_exclude_rmdir(&mem->css);
2596 	/* at migration success, oldpage->mapping is NULL. */
2597 	if (oldpage->mapping) {
2598 		used = oldpage;
2599 		unused = newpage;
2600 	} else {
2601 		used = newpage;
2602 		unused = oldpage;
2603 	}
2604 	/*
2605 	 * We disallowed uncharging pages under migration because the mapcount
2606 	 * of the page goes down to zero, temporarily.
2607 	 * Clear the flag and check whether the page should be charged.
2608 	 */
2609 	pc = lookup_page_cgroup(oldpage);
2610 	lock_page_cgroup(pc);
2611 	ClearPageCgroupMigration(pc);
2612 	unlock_page_cgroup(pc);
2613 
2614 	if (unused != oldpage)
2615 		pc = lookup_page_cgroup(unused);
2616 	__mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
2617 
2618 	pc = lookup_page_cgroup(used);
2619 	/*
2620 	 * If the page is file cache, the radix-tree replacement is atomic
2621 	 * and we can skip this check. When it is an Anon page, its mapcount
2622 	 * goes down to 0. But because we added the MIGRATION flag, it's not
2623 	 * uncharged yet. There are several cases, but the page->mapcount check
2624 	 * and the USED bit check in mem_cgroup_uncharge_page() do enough
2625 	 * checking. (See prepare_charge() also.)
2626 	 */
2627 	if (PageAnon(used))
2628 		mem_cgroup_uncharge_page(used);
2629 	/*
2630 	 * At migration, we may charge account against cgroup which has no
2631 	 * tasks.
2632 	 * So, rmdir()->pre_destroy() can be called while we do this charge.
2633 	 * In that case, we need to call pre_destroy() again. check it here.
2634 	 */
2635 	cgroup_release_and_wakeup_rmdir(&mem->css);
2636 }
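
/*
 * A hedged sketch (illustrative only, not copied from the migration code) of
 * how the two hooks above are meant to pair up in a page migration path:
 *
 *	struct mem_cgroup *mem = NULL;
 *
 *	ret = mem_cgroup_prepare_migration(page, newpage, &mem);
 *	if (ret)
 *		... give up migrating this page ...
 *	... try to move the contents of "page" to "newpage" ...
 *	if (mem)
 *		mem_cgroup_end_migration(mem, page, newpage);
 *
 * end_migration() then uses oldpage->mapping to tell whether the old or the
 * new page ended up being used and uncharges the redundant one.
 */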
2637 
2638 /*
2639  * A call to try to shrink memory usage on charge failure at shmem's swapin.
2640  * Calling hierarchical_reclaim is not enough because we should update
2641  * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
2642  * Moreover, considering the hierarchy, we should reclaim from the mem_over_limit,
2643  * not from the memcg which this page would be charged to.
2644  * try_charge_swapin does all of this properly.
2645  */
2646 int mem_cgroup_shmem_charge_fallback(struct page *page,
2647 			    struct mm_struct *mm,
2648 			    gfp_t gfp_mask)
2649 {
2650 	struct mem_cgroup *mem = NULL;
2651 	int ret;
2652 
2653 	if (mem_cgroup_disabled())
2654 		return 0;
2655 
2656 	ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2657 	if (!ret)
2658 		mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
2659 
2660 	return ret;
2661 }
2662 
2663 static DEFINE_MUTEX(set_limit_mutex);
2664 
2665 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2666 				unsigned long long val)
2667 {
2668 	int retry_count;
2669 	u64 memswlimit, memlimit;
2670 	int ret = 0;
2671 	int children = mem_cgroup_count_children(memcg);
2672 	u64 curusage, oldusage;
2673 	int enlarge;
2674 
2675 	/*
2676 	 * To keep hierarchical_reclaim simple, how long we should retry
2677 	 * depends on the callers. We set our retry count to be a function
2678 	 * of the number of children which we should visit in this loop.
2679 	 */
2680 	retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
2681 
2682 	oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2683 
2684 	enlarge = 0;
2685 	while (retry_count) {
2686 		if (signal_pending(current)) {
2687 			ret = -EINTR;
2688 			break;
2689 		}
2690 		/*
2691 		 * Rather than hiding it all in some function, do this open
2692 		 * coded so you can see what it really does.
2693 		 * We have to guarantee mem->res.limit < mem->memsw.limit.
2694 		 */
2695 		mutex_lock(&set_limit_mutex);
2696 		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2697 		if (memswlimit < val) {
2698 			ret = -EINVAL;
2699 			mutex_unlock(&set_limit_mutex);
2700 			break;
2701 		}
2702 
2703 		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
2704 		if (memlimit < val)
2705 			enlarge = 1;
2706 
2707 		ret = res_counter_set_limit(&memcg->res, val);
2708 		if (!ret) {
2709 			if (memswlimit == val)
2710 				memcg->memsw_is_minimum = true;
2711 			else
2712 				memcg->memsw_is_minimum = false;
2713 		}
2714 		mutex_unlock(&set_limit_mutex);
2715 
2716 		if (!ret)
2717 			break;
2718 
2719 		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2720 						MEM_CGROUP_RECLAIM_SHRINK);
2721 		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2722 		/* Usage is reduced ? */
2723 		if (curusage >= oldusage)
2724 			retry_count--;
2725 		else
2726 			oldusage = curusage;
2727 	}
2728 	if (!ret && enlarge)
2729 		memcg_oom_recover(memcg);
2730 
2731 	return ret;
2732 }
2733 
2734 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2735 					unsigned long long val)
2736 {
2737 	int retry_count;
2738 	u64 memlimit, memswlimit, oldusage, curusage;
2739 	int children = mem_cgroup_count_children(memcg);
2740 	int ret = -EBUSY;
2741 	int enlarge = 0;
2742 
2743 	/* see mem_cgroup_resize_limit() */
2744 	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
2745 	oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2746 	while (retry_count) {
2747 		if (signal_pending(current)) {
2748 			ret = -EINTR;
2749 			break;
2750 		}
2751 		/*
2752 		 * Rather than hiding it all in some function, do this open
2753 		 * coded so you can see what it really does.
2754 		 * We have to guarantee mem->res.limit < mem->memsw.limit.
2755 		 */
2756 		mutex_lock(&set_limit_mutex);
2757 		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
2758 		if (memlimit > val) {
2759 			ret = -EINVAL;
2760 			mutex_unlock(&set_limit_mutex);
2761 			break;
2762 		}
2763 		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2764 		if (memswlimit < val)
2765 			enlarge = 1;
2766 		ret = res_counter_set_limit(&memcg->memsw, val);
2767 		if (!ret) {
2768 			if (memlimit == val)
2769 				memcg->memsw_is_minimum = true;
2770 			else
2771 				memcg->memsw_is_minimum = false;
2772 		}
2773 		mutex_unlock(&set_limit_mutex);
2774 
2775 		if (!ret)
2776 			break;
2777 
2778 		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2779 						MEM_CGROUP_RECLAIM_NOSWAP |
2780 						MEM_CGROUP_RECLAIM_SHRINK);
2781 		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2782 		/* Usage is reduced ? */
2783 		if (curusage >= oldusage)
2784 			retry_count--;
2785 		else
2786 			oldusage = curusage;
2787 	}
2788 	if (!ret && enlarge)
2789 		memcg_oom_recover(memcg);
2790 	return ret;
2791 }
2792 
2793 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2794 						gfp_t gfp_mask, int nid,
2795 						int zid)
2796 {
2797 	unsigned long nr_reclaimed = 0;
2798 	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
2799 	unsigned long reclaimed;
2800 	int loop = 0;
2801 	struct mem_cgroup_tree_per_zone *mctz;
2802 	unsigned long long excess;
2803 
2804 	if (order > 0)
2805 		return 0;
2806 
2807 	mctz = soft_limit_tree_node_zone(nid, zid);
2808 	/*
2809 	 * This loop can run for a while, especially if mem_cgroups continuously
2810 	 * keep exceeding their soft limit and putting the system under
2811 	 * pressure.
2812 	 */
2813 	do {
2814 		if (next_mz)
2815 			mz = next_mz;
2816 		else
2817 			mz = mem_cgroup_largest_soft_limit_node(mctz);
2818 		if (!mz)
2819 			break;
2820 
2821 		reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
2822 						gfp_mask,
2823 						MEM_CGROUP_RECLAIM_SOFT);
2824 		nr_reclaimed += reclaimed;
2825 		spin_lock(&mctz->lock);
2826 
2827 		/*
2828 		 * If we failed to reclaim anything from this memory cgroup
2829 		 * it is time to move on to the next cgroup
2830 		 */
2831 		next_mz = NULL;
2832 		if (!reclaimed) {
2833 			do {
2834 				/*
2835 				 * Loop until we find yet another one.
2836 				 *
2837 				 * By the time we get the soft_limit lock
2838 				 * again, someone might have added the
2839 				 * group back on the RB tree. Iterate to
2840 				 * make sure we get a different mem.
2841 				 * mem_cgroup_largest_soft_limit_node returns
2842 				 * NULL if no other cgroup is present on
2843 				 * the tree
2844 				 */
2845 				next_mz =
2846 				__mem_cgroup_largest_soft_limit_node(mctz);
2847 				if (next_mz == mz) {
2848 					css_put(&next_mz->mem->css);
2849 					next_mz = NULL;
2850 				} else /* next_mz == NULL or other memcg */
2851 					break;
2852 			} while (1);
2853 		}
2854 		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
2855 		excess = res_counter_soft_limit_excess(&mz->mem->res);
2856 		/*
2857 		 * One school of thought says that we should not add
2858 		 * back the node to the tree if reclaim returns 0.
2859 		 * But our reclaim could return 0, simply because due
2860 		 * to priority we are exposing a smaller subset of
2861 		 * memory to reclaim from. Consider this as a longer
2862 		 * term TODO.
2863 		 */
2864 		/* If excess == 0, no tree ops */
2865 		__mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
2866 		spin_unlock(&mctz->lock);
2867 		css_put(&mz->mem->css);
2868 		loop++;
2869 		/*
2870 		 * Could not reclaim anything and there are no more
2871 		 * mem cgroups to try or we seem to be looping without
2872 		 * reclaiming anything.
2873 		 */
2874 		if (!nr_reclaimed &&
2875 			(next_mz == NULL ||
2876 			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2877 			break;
2878 	} while (!nr_reclaimed);
2879 	if (next_mz)
2880 		css_put(&next_mz->mem->css);
2881 	return nr_reclaimed;
2882 }
2883 
2884 /*
2885  * This routine traverses the page_cgroups in the given list and drops them all.
2886  * *And* this routine doesn't reclaim the pages themselves, it just removes the page_cgroups.
2887  */
2888 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
2889 				int node, int zid, enum lru_list lru)
2890 {
2891 	struct zone *zone;
2892 	struct mem_cgroup_per_zone *mz;
2893 	struct page_cgroup *pc, *busy;
2894 	unsigned long flags, loop;
2895 	struct list_head *list;
2896 	int ret = 0;
2897 
2898 	zone = &NODE_DATA(node)->node_zones[zid];
2899 	mz = mem_cgroup_zoneinfo(mem, node, zid);
2900 	list = &mz->lists[lru];
2901 
2902 	loop = MEM_CGROUP_ZSTAT(mz, lru);
2903 	/* give some margin against EBUSY etc...*/
2904 	loop += 256;
2905 	busy = NULL;
2906 	while (loop--) {
2907 		ret = 0;
2908 		spin_lock_irqsave(&zone->lru_lock, flags);
2909 		if (list_empty(list)) {
2910 			spin_unlock_irqrestore(&zone->lru_lock, flags);
2911 			break;
2912 		}
2913 		pc = list_entry(list->prev, struct page_cgroup, lru);
2914 		if (busy == pc) {
2915 			list_move(&pc->lru, list);
2916 			busy = NULL;
2917 			spin_unlock_irqrestore(&zone->lru_lock, flags);
2918 			continue;
2919 		}
2920 		spin_unlock_irqrestore(&zone->lru_lock, flags);
2921 
2922 		ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
2923 		if (ret == -ENOMEM)
2924 			break;
2925 
2926 		if (ret == -EBUSY || ret == -EINVAL) {
2927 			/* found lock contention or "pc" is obsolete. */
2928 			busy = pc;
2929 			cond_resched();
2930 		} else
2931 			busy = NULL;
2932 	}
2933 
2934 	if (!ret && !list_empty(list))
2935 		return -EBUSY;
2936 	return ret;
2937 }
2938 
2939 /*
2940  * Make the mem_cgroup's charge 0 if there is no task.
2941  * This enables deleting this mem_cgroup.
2942  */
2943 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
2944 {
2945 	int ret;
2946 	int node, zid, shrink;
2947 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2948 	struct cgroup *cgrp = mem->css.cgroup;
2949 
2950 	css_get(&mem->css);
2951 
2952 	shrink = 0;
2953 	/* should free all ? */
2954 	if (free_all)
2955 		goto try_to_free;
2956 move_account:
2957 	do {
2958 		ret = -EBUSY;
2959 		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
2960 			goto out;
2961 		ret = -EINTR;
2962 		if (signal_pending(current))
2963 			goto out;
2964 		/* This is for making all *used* pages be on the LRU. */
2965 		lru_add_drain_all();
2966 		drain_all_stock_sync();
2967 		ret = 0;
2968 		for_each_node_state(node, N_HIGH_MEMORY) {
2969 			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
2970 				enum lru_list l;
2971 				for_each_lru(l) {
2972 					ret = mem_cgroup_force_empty_list(mem,
2973 							node, zid, l);
2974 					if (ret)
2975 						break;
2976 				}
2977 			}
2978 			if (ret)
2979 				break;
2980 		}
2981 		memcg_oom_recover(mem);
2982 		/* it seems parent cgroup doesn't have enough mem */
2983 		if (ret == -ENOMEM)
2984 			goto try_to_free;
2985 		cond_resched();
2986 	/* "ret" should also be checked to ensure all lists are empty. */
2987 	} while (mem->res.usage > 0 || ret);
2988 out:
2989 	css_put(&mem->css);
2990 	return ret;
2991 
2992 try_to_free:
2993 	/* returns EBUSY if there is a task or if we come here twice. */
2994 	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
2995 		ret = -EBUSY;
2996 		goto out;
2997 	}
2998 	/* we call try-to-free pages to make this cgroup empty */
2999 	lru_add_drain_all();
3000 	/* try to free all pages in this cgroup */
3001 	shrink = 1;
3002 	while (nr_retries && mem->res.usage > 0) {
3003 		int progress;
3004 
3005 		if (signal_pending(current)) {
3006 			ret = -EINTR;
3007 			goto out;
3008 		}
3009 		progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
3010 						false, get_swappiness(mem));
3011 		if (!progress) {
3012 			nr_retries--;
3013 			/* maybe some writeback is necessary */
3014 			congestion_wait(BLK_RW_ASYNC, HZ/10);
3015 		}
3016 
3017 	}
3018 	lru_add_drain();
3019 	/* try move_account...there may be some *locked* pages. */
3020 	goto move_account;
3021 }
3022 
3023 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3024 {
3025 	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3026 }
3027 
3028 
3029 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
3030 {
3031 	return mem_cgroup_from_cont(cont)->use_hierarchy;
3032 }
3033 
3034 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3035 					u64 val)
3036 {
3037 	int retval = 0;
3038 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3039 	struct cgroup *parent = cont->parent;
3040 	struct mem_cgroup *parent_mem = NULL;
3041 
3042 	if (parent)
3043 		parent_mem = mem_cgroup_from_cont(parent);
3044 
3045 	cgroup_lock();
3046 	/*
3047 	 * If parent's use_hierarchy is set, we can't make any modifications
3048 	 * in the child subtrees. If it is unset, then the change can
3049 	 * occur, provided the current cgroup has no children.
3050 	 *
3051 	 * For the root cgroup, parent_mem is NULL; we allow the value to be
3052 	 * set if there are no children.
3053 	 */
3054 	if ((!parent_mem || !parent_mem->use_hierarchy) &&
3055 				(val == 1 || val == 0)) {
3056 		if (list_empty(&cont->children))
3057 			mem->use_hierarchy = val;
3058 		else
3059 			retval = -EBUSY;
3060 	} else
3061 		retval = -EINVAL;
3062 	cgroup_unlock();
3063 
3064 	return retval;
3065 }
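
/*
 * From userspace, use_hierarchy is toggled through the "memory.use_hierarchy"
 * control file; a hedged example (the cgroup mount point is an assumption):
 *
 *	echo 1 > /cgroup/memory/parent/memory.use_hierarchy
 *
 * Per the checks above, this fails with -EBUSY once the group has children
 * and with -EINVAL if the parent already has use_hierarchy set.
 */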
3066 
3067 struct mem_cgroup_idx_data {
3068 	s64 val;
3069 	enum mem_cgroup_stat_index idx;
3070 };
3071 
3072 static int
3073 mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
3074 {
3075 	struct mem_cgroup_idx_data *d = data;
3076 	d->val += mem_cgroup_read_stat(mem, d->idx);
3077 	return 0;
3078 }
3079 
3080 static void
3081 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
3082 				enum mem_cgroup_stat_index idx, s64 *val)
3083 {
3084 	struct mem_cgroup_idx_data d;
3085 	d.idx = idx;
3086 	d.val = 0;
3087 	mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
3088 	*val = d.val;
3089 }
3090 
3091 static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3092 {
3093 	u64 idx_val, val;
3094 
3095 	if (!mem_cgroup_is_root(mem)) {
3096 		if (!swap)
3097 			return res_counter_read_u64(&mem->res, RES_USAGE);
3098 		else
3099 			return res_counter_read_u64(&mem->memsw, RES_USAGE);
3100 	}
3101 
3102 	mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val);
3103 	val = idx_val;
3104 	mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
3105 	val += idx_val;
3106 
3107 	if (swap) {
3108 		mem_cgroup_get_recursive_idx_stat(mem,
3109 				MEM_CGROUP_STAT_SWAPOUT, &idx_val);
3110 		val += idx_val;
3111 	}
3112 
3113 	return val << PAGE_SHIFT;
3114 }
3115 
3116 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3117 {
3118 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3119 	u64 val;
3120 	int type, name;
3121 
3122 	type = MEMFILE_TYPE(cft->private);
3123 	name = MEMFILE_ATTR(cft->private);
3124 	switch (type) {
3125 	case _MEM:
3126 		if (name == RES_USAGE)
3127 			val = mem_cgroup_usage(mem, false);
3128 		else
3129 			val = res_counter_read_u64(&mem->res, name);
3130 		break;
3131 	case _MEMSWAP:
3132 		if (name == RES_USAGE)
3133 			val = mem_cgroup_usage(mem, true);
3134 		else
3135 			val = res_counter_read_u64(&mem->memsw, name);
3136 		break;
3137 	default:
3138 		BUG();
3139 		break;
3140 	}
3141 	return val;
3142 }
3143 /*
3144  * The users of this function are the RES_LIMIT and RES_SOFT_LIMIT
3145  * control files.
3146  */
3147 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3148 			    const char *buffer)
3149 {
3150 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3151 	int type, name;
3152 	unsigned long long val;
3153 	int ret;
3154 
3155 	type = MEMFILE_TYPE(cft->private);
3156 	name = MEMFILE_ATTR(cft->private);
3157 	switch (name) {
3158 	case RES_LIMIT:
3159 		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
3160 			ret = -EINVAL;
3161 			break;
3162 		}
3163 		/* This function does all necessary parse...reuse it */
3164 		ret = res_counter_memparse_write_strategy(buffer, &val);
3165 		if (ret)
3166 			break;
3167 		if (type == _MEM)
3168 			ret = mem_cgroup_resize_limit(memcg, val);
3169 		else
3170 			ret = mem_cgroup_resize_memsw_limit(memcg, val);
3171 		break;
3172 	case RES_SOFT_LIMIT:
3173 		ret = res_counter_memparse_write_strategy(buffer, &val);
3174 		if (ret)
3175 			break;
3176 		/*
3177 		 * For memsw, soft limits are hard to implement in terms
3178 		 * of semantics; for now, we support soft limits only for
3179 		 * memory control without swap.
3180 		 */
3181 		if (type == _MEM)
3182 			ret = res_counter_set_soft_limit(&memcg->res, val);
3183 		else
3184 			ret = -EINVAL;
3185 		break;
3186 	default:
3187 		ret = -EINVAL; /* should be BUG() ? */
3188 		break;
3189 	}
3190 	return ret;
3191 }
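
/*
 * A hedged userspace view of the write handler above (the mount point is an
 * assumption): hard, memory+swap and soft limits are set by writing to the
 * corresponding control files, e.g.
 *
 *	echo 512M > /cgroup/memory/group0/memory.limit_in_bytes
 *	echo 1G   > /cgroup/memory/group0/memory.memsw.limit_in_bytes
 *	echo 256M > /cgroup/memory/group0/memory.soft_limit_in_bytes
 *
 * res_counter_memparse_write_strategy() is what accepts the K/M/G suffixes.
 */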
3192 
3193 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3194 		unsigned long long *mem_limit, unsigned long long *memsw_limit)
3195 {
3196 	struct cgroup *cgroup;
3197 	unsigned long long min_limit, min_memsw_limit, tmp;
3198 
3199 	min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3200 	min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3201 	cgroup = memcg->css.cgroup;
3202 	if (!memcg->use_hierarchy)
3203 		goto out;
3204 
3205 	while (cgroup->parent) {
3206 		cgroup = cgroup->parent;
3207 		memcg = mem_cgroup_from_cont(cgroup);
3208 		if (!memcg->use_hierarchy)
3209 			break;
3210 		tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
3211 		min_limit = min(min_limit, tmp);
3212 		tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3213 		min_memsw_limit = min(min_memsw_limit, tmp);
3214 	}
3215 out:
3216 	*mem_limit = min_limit;
3217 	*memsw_limit = min_memsw_limit;
3218 	return;
3219 }
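
/*
 * A worked example of the walk above (numbers are illustrative): if a group
 * with use_hierarchy set has limit 1G while one of its ancestors (also using
 * hierarchy) has limit 512M, the reported hierarchical_memory_limit is the
 * minimum, 512M. The walk stops at the first ancestor that does not use
 * hierarchy, whose limit is not taken into account.
 */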
3220 
3221 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
3222 {
3223 	struct mem_cgroup *mem;
3224 	int type, name;
3225 
3226 	mem = mem_cgroup_from_cont(cont);
3227 	type = MEMFILE_TYPE(event);
3228 	name = MEMFILE_ATTR(event);
3229 	switch (name) {
3230 	case RES_MAX_USAGE:
3231 		if (type == _MEM)
3232 			res_counter_reset_max(&mem->res);
3233 		else
3234 			res_counter_reset_max(&mem->memsw);
3235 		break;
3236 	case RES_FAILCNT:
3237 		if (type == _MEM)
3238 			res_counter_reset_failcnt(&mem->res);
3239 		else
3240 			res_counter_reset_failcnt(&mem->memsw);
3241 		break;
3242 	}
3243 
3244 	return 0;
3245 }
3246 
3247 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
3248 					struct cftype *cft)
3249 {
3250 	return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
3251 }
3252 
3253 #ifdef CONFIG_MMU
3254 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3255 					struct cftype *cft, u64 val)
3256 {
3257 	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3258 
3259 	if (val >= (1 << NR_MOVE_TYPE))
3260 		return -EINVAL;
3261 	/*
3262 	 * We check this value several times both in can_attach() and
3263 	 * attach(), so we need the cgroup lock to prevent this value from being
3264 	 * inconsistent.
3265 	 */
3266 	cgroup_lock();
3267 	mem->move_charge_at_immigrate = val;
3268 	cgroup_unlock();
3269 
3270 	return 0;
3271 }
3272 #else
3273 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3274 					struct cftype *cft, u64 val)
3275 {
3276 	return -ENOSYS;
3277 }
3278 #endif
3279 
3280 
3281 /* For read statistics */
3282 enum {
3283 	MCS_CACHE,
3284 	MCS_RSS,
3285 	MCS_FILE_MAPPED,
3286 	MCS_PGPGIN,
3287 	MCS_PGPGOUT,
3288 	MCS_SWAP,
3289 	MCS_INACTIVE_ANON,
3290 	MCS_ACTIVE_ANON,
3291 	MCS_INACTIVE_FILE,
3292 	MCS_ACTIVE_FILE,
3293 	MCS_UNEVICTABLE,
3294 	NR_MCS_STAT,
3295 };
3296 
3297 struct mcs_total_stat {
3298 	s64 stat[NR_MCS_STAT];
3299 };
3300 
3301 struct {
3302 	char *local_name;
3303 	char *total_name;
3304 } memcg_stat_strings[NR_MCS_STAT] = {
3305 	{"cache", "total_cache"},
3306 	{"rss", "total_rss"},
3307 	{"mapped_file", "total_mapped_file"},
3308 	{"pgpgin", "total_pgpgin"},
3309 	{"pgpgout", "total_pgpgout"},
3310 	{"swap", "total_swap"},
3311 	{"inactive_anon", "total_inactive_anon"},
3312 	{"active_anon", "total_active_anon"},
3313 	{"inactive_file", "total_inactive_file"},
3314 	{"active_file", "total_active_file"},
3315 	{"unevictable", "total_unevictable"}
3316 };
3317 
3318 
3319 static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
3320 {
3321 	struct mcs_total_stat *s = data;
3322 	s64 val;
3323 
3324 	/* per cpu stat */
3325 	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
3326 	s->stat[MCS_CACHE] += val * PAGE_SIZE;
3327 	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
3328 	s->stat[MCS_RSS] += val * PAGE_SIZE;
3329 	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
3330 	s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
3331 	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
3332 	s->stat[MCS_PGPGIN] += val;
3333 	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
3334 	s->stat[MCS_PGPGOUT] += val;
3335 	if (do_swap_account) {
3336 		val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3337 		s->stat[MCS_SWAP] += val * PAGE_SIZE;
3338 	}
3339 
3340 	/* per zone stat */
3341 	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
3342 	s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
3343 	val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
3344 	s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
3345 	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
3346 	s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
3347 	val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
3348 	s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
3349 	val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
3350 	s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
3351 	return 0;
3352 }
3353 
3354 static void
3355 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3356 {
3357 	mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
3358 }
3359 
3360 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
3361 				 struct cgroup_map_cb *cb)
3362 {
3363 	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
3364 	struct mcs_total_stat mystat;
3365 	int i;
3366 
3367 	memset(&mystat, 0, sizeof(mystat));
3368 	mem_cgroup_get_local_stat(mem_cont, &mystat);
3369 
3370 	for (i = 0; i < NR_MCS_STAT; i++) {
3371 		if (i == MCS_SWAP && !do_swap_account)
3372 			continue;
3373 		cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
3374 	}
3375 
3376 	/* Hierarchical information */
3377 	{
3378 		unsigned long long limit, memsw_limit;
3379 		memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
3380 		cb->fill(cb, "hierarchical_memory_limit", limit);
3381 		if (do_swap_account)
3382 			cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
3383 	}
3384 
3385 	memset(&mystat, 0, sizeof(mystat));
3386 	mem_cgroup_get_total_stat(mem_cont, &mystat);
3387 	for (i = 0; i < NR_MCS_STAT; i++) {
3388 		if (i == MCS_SWAP && !do_swap_account)
3389 			continue;
3390 		cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
3391 	}
3392 
3393 #ifdef CONFIG_DEBUG_VM
3394 	cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
3395 
3396 	{
3397 		int nid, zid;
3398 		struct mem_cgroup_per_zone *mz;
3399 		unsigned long recent_rotated[2] = {0, 0};
3400 		unsigned long recent_scanned[2] = {0, 0};
3401 
3402 		for_each_online_node(nid)
3403 			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3404 				mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
3405 
3406 				recent_rotated[0] +=
3407 					mz->reclaim_stat.recent_rotated[0];
3408 				recent_rotated[1] +=
3409 					mz->reclaim_stat.recent_rotated[1];
3410 				recent_scanned[0] +=
3411 					mz->reclaim_stat.recent_scanned[0];
3412 				recent_scanned[1] +=
3413 					mz->reclaim_stat.recent_scanned[1];
3414 			}
3415 		cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
3416 		cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
3417 		cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
3418 		cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
3419 	}
3420 #endif
3421 
3422 	return 0;
3423 }
3424 
3425 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
3426 {
3427 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3428 
3429 	return get_swappiness(memcg);
3430 }
3431 
3432 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3433 				       u64 val)
3434 {
3435 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3436 	struct mem_cgroup *parent;
3437 
3438 	if (val > 100)
3439 		return -EINVAL;
3440 
3441 	if (cgrp->parent == NULL)
3442 		return -EINVAL;
3443 
3444 	parent = mem_cgroup_from_cont(cgrp->parent);
3445 
3446 	cgroup_lock();
3447 
3448 	/* If under hierarchy, only empty-root can set this value */
3449 	if ((parent->use_hierarchy) ||
3450 	    (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
3451 		cgroup_unlock();
3452 		return -EINVAL;
3453 	}
3454 
3455 	spin_lock(&memcg->reclaim_param_lock);
3456 	memcg->swappiness = val;
3457 	spin_unlock(&memcg->reclaim_param_lock);
3458 
3459 	cgroup_unlock();
3460 
3461 	return 0;
3462 }
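
/*
 * From userspace this maps to the "memory.swappiness" file; a hedged example
 * (the path is an assumption):
 *
 *	echo 30 > /cgroup/memory/group0/memory.swappiness
 *
 * Per the checks above, values over 100 are rejected, the root cgroup cannot
 * be changed, and a group that is part of a hierarchy (parent with
 * use_hierarchy, or its own use_hierarchy with children) is rejected as well.
 */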
3463 
3464 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3465 {
3466 	struct mem_cgroup_threshold_ary *t;
3467 	u64 usage;
3468 	int i;
3469 
3470 	rcu_read_lock();
3471 	if (!swap)
3472 		t = rcu_dereference(memcg->thresholds.primary);
3473 	else
3474 		t = rcu_dereference(memcg->memsw_thresholds.primary);
3475 
3476 	if (!t)
3477 		goto unlock;
3478 
3479 	usage = mem_cgroup_usage(memcg, swap);
3480 
3481 	/*
3482 	 * current_threshold points to the threshold just below usage.
3483 	 * If that is not the case, a threshold was crossed after the last
3484 	 * call of __mem_cgroup_threshold().
3485 	 */
3486 	i = t->current_threshold;
3487 
3488 	/*
3489 	 * Iterate backward over array of thresholds starting from
3490 	 * current_threshold and check if a threshold is crossed.
3491 	 * If none of thresholds below usage is crossed, we read
3492 	 * only one element of the array here.
3493 	 */
3494 	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3495 		eventfd_signal(t->entries[i].eventfd, 1);
3496 
3497 	/* i = current_threshold + 1 */
3498 	i++;
3499 
3500 	/*
3501 	 * Iterate forward over array of thresholds starting from
3502 	 * current_threshold+1 and check if a threshold is crossed.
3503 	 * If none of thresholds above usage is crossed, we read
3504 	 * only one element of the array here.
3505 	 */
3506 	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3507 		eventfd_signal(t->entries[i].eventfd, 1);
3508 
3509 	/* Update current_threshold */
3510 	t->current_threshold = i - 1;
3511 unlock:
3512 	rcu_read_unlock();
3513 }
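
/*
 * A worked example of the scan above (values are illustrative): with sorted
 * thresholds {4M, 8M, 16M} and current_threshold at 8M, a usage of 20M makes
 * the forward loop signal the 16M eventfd and leaves current_threshold at
 * 16M; if usage later drops to 6M, the backward loop signals the 16M and 8M
 * eventfds and current_threshold moves back to 4M.
 */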
3514 
3515 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3516 {
3517 	__mem_cgroup_threshold(memcg, false);
3518 	if (do_swap_account)
3519 		__mem_cgroup_threshold(memcg, true);
3520 }
3521 
3522 static int compare_thresholds(const void *a, const void *b)
3523 {
3524 	const struct mem_cgroup_threshold *_a = a;
3525 	const struct mem_cgroup_threshold *_b = b;
3526 
3527 	return _a->threshold - _b->threshold;
3528 }
3529 
3530 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
3531 {
3532 	struct mem_cgroup_eventfd_list *ev;
3533 
3534 	list_for_each_entry(ev, &mem->oom_notify, list)
3535 		eventfd_signal(ev->eventfd, 1);
3536 	return 0;
3537 }
3538 
3539 static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
3540 {
3541 	mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb);
3542 }
3543 
3544 static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
3545 	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
3546 {
3547 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3548 	struct mem_cgroup_thresholds *thresholds;
3549 	struct mem_cgroup_threshold_ary *new;
3550 	int type = MEMFILE_TYPE(cft->private);
3551 	u64 threshold, usage;
3552 	int i, size, ret;
3553 
3554 	ret = res_counter_memparse_write_strategy(args, &threshold);
3555 	if (ret)
3556 		return ret;
3557 
3558 	mutex_lock(&memcg->thresholds_lock);
3559 
3560 	if (type == _MEM)
3561 		thresholds = &memcg->thresholds;
3562 	else if (type == _MEMSWAP)
3563 		thresholds = &memcg->memsw_thresholds;
3564 	else
3565 		BUG();
3566 
3567 	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3568 
3569 	/* Check if a threshold crossed before adding a new one */
3570 	if (thresholds->primary)
3571 		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
3572 
3573 	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
3574 
3575 	/* Allocate memory for new array of thresholds */
3576 	new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3577 			GFP_KERNEL);
3578 	if (!new) {
3579 		ret = -ENOMEM;
3580 		goto unlock;
3581 	}
3582 	new->size = size;
3583 
3584 	/* Copy thresholds (if any) to new array */
3585 	if (thresholds->primary) {
3586 		memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3587 				sizeof(struct mem_cgroup_threshold));
3588 	}
3589 
3590 	/* Add new threshold */
3591 	new->entries[size - 1].eventfd = eventfd;
3592 	new->entries[size - 1].threshold = threshold;
3593 
3594 	/* Sort thresholds. Registering of new threshold isn't time-critical */
3595 	sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
3596 			compare_thresholds, NULL);
3597 
3598 	/* Find current threshold */
3599 	new->current_threshold = -1;
3600 	for (i = 0; i < size; i++) {
3601 		if (new->entries[i].threshold < usage) {
3602 			/*
3603 			 * new->current_threshold will not be used until
3604 			 * rcu_assign_pointer(), so it's safe to increment
3605 			 * it here.
3606 			 */
3607 			++new->current_threshold;
3608 		}
3609 	}
3610 
3611 	/* Free old spare buffer and save old primary buffer as spare */
3612 	kfree(thresholds->spare);
3613 	thresholds->spare = thresholds->primary;
3614 
3615 	rcu_assign_pointer(thresholds->primary, new);
3616 
3617 	/* To be sure that nobody uses thresholds */
3618 	synchronize_rcu();
3619 
3620 unlock:
3621 	mutex_unlock(&memcg->thresholds_lock);
3622 
3623 	return ret;
3624 }
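
/*
 * A hedged userspace example of arming a threshold with the handler above
 * (file descriptor handling is only sketched): the application creates an
 * eventfd, opens memory.usage_in_bytes (or memory.memsw.usage_in_bytes), and
 * writes "<event_fd> <usage_fd> <threshold in bytes>" to cgroup.event_control
 * of the same group. The eventfd is then signalled whenever usage crosses the
 * registered threshold in either direction.
 */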
3625 
3626 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
3627 	struct cftype *cft, struct eventfd_ctx *eventfd)
3628 {
3629 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3630 	struct mem_cgroup_thresholds *thresholds;
3631 	struct mem_cgroup_threshold_ary *new;
3632 	int type = MEMFILE_TYPE(cft->private);
3633 	u64 usage;
3634 	int i, j, size;
3635 
3636 	mutex_lock(&memcg->thresholds_lock);
3637 	if (type == _MEM)
3638 		thresholds = &memcg->thresholds;
3639 	else if (type == _MEMSWAP)
3640 		thresholds = &memcg->memsw_thresholds;
3641 	else
3642 		BUG();
3643 
3644 	/*
3645 	 * Something went wrong if we are trying to unregister a threshold
3646 	 * when we don't have any thresholds.
3647 	 */
3648 	BUG_ON(!thresholds);
3649 
3650 	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3651 
3652 	/* Check if a threshold was crossed before removing */
3653 	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
3654 
3655 	/* Calculate the new number of thresholds */
3656 	size = 0;
3657 	for (i = 0; i < thresholds->primary->size; i++) {
3658 		if (thresholds->primary->entries[i].eventfd != eventfd)
3659 			size++;
3660 	}
3661 
3662 	new = thresholds->spare;
3663 
3664 	/* Set the thresholds array to NULL if no thresholds remain */
3665 	if (!size) {
3666 		kfree(new);
3667 		new = NULL;
3668 		goto swap_buffers;
3669 	}
3670 
3671 	new->size = size;
3672 
3673 	/* Copy thresholds and find current threshold */
3674 	new->current_threshold = -1;
3675 	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
3676 		if (thresholds->primary->entries[i].eventfd == eventfd)
3677 			continue;
3678 
3679 		new->entries[j] = thresholds->primary->entries[i];
3680 		if (new->entries[j].threshold < usage) {
3681 			/*
3682 			 * new->current_threshold will not be used
3683 			 * until rcu_assign_pointer(), so it's safe to increment
3684 			 * it here.
3685 			 */
3686 			++new->current_threshold;
3687 		}
3688 		j++;
3689 	}
3690 
3691 swap_buffers:
3692 	/* Swap primary and spare array */
3693 	thresholds->spare = thresholds->primary;
3694 	rcu_assign_pointer(thresholds->primary, new);
3695 
3696 	/* Make sure that nobody still uses the old thresholds array */
3697 	synchronize_rcu();
3698 
3699 	mutex_unlock(&memcg->thresholds_lock);
3700 }
3701 
3702 static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
3703 	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
3704 {
3705 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3706 	struct mem_cgroup_eventfd_list *event;
3707 	int type = MEMFILE_TYPE(cft->private);
3708 
3709 	BUG_ON(type != _OOM_TYPE);
3710 	event = kmalloc(sizeof(*event),	GFP_KERNEL);
3711 	if (!event)
3712 		return -ENOMEM;
3713 
3714 	mutex_lock(&memcg_oom_mutex);
3715 
3716 	event->eventfd = eventfd;
3717 	list_add(&event->list, &memcg->oom_notify);
3718 
3719 	/* already under OOM? notify immediately */
3720 	if (atomic_read(&memcg->oom_lock))
3721 		eventfd_signal(eventfd, 1);
3722 	mutex_unlock(&memcg_oom_mutex);
3723 
3724 	return 0;
3725 }
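
/*
 * OOM notifications below use the same cgroup.event_control mechanism as
 * the usage thresholds above, except that the fd passed along with the
 * eventfd refers to memory.oom_control and no threshold argument is given.
 * The eventfd is signalled on every OOM event, and immediately if the group
 * is already under OOM when registering (sketch only, reusing cfd/cnt from
 * the example above; paths are assumptions):
 *
 *	int efd = eventfd(0, 0);
 *	int ofd = open("/cgroup/memory/A/memory.oom_control", O_RDONLY);
 *	dprintf(cfd, "%d %d", efd, ofd);
 *	read(efd, &cnt, sizeof(cnt));	// returns once per OOM notification
 */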
3726 
3727 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
3728 	struct cftype *cft, struct eventfd_ctx *eventfd)
3729 {
3730 	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3731 	struct mem_cgroup_eventfd_list *ev, *tmp;
3732 	int type = MEMFILE_TYPE(cft->private);
3733 
3734 	BUG_ON(type != _OOM_TYPE);
3735 
3736 	mutex_lock(&memcg_oom_mutex);
3737 
3738 	list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
3739 		if (ev->eventfd == eventfd) {
3740 			list_del(&ev->list);
3741 			kfree(ev);
3742 		}
3743 	}
3744 
3745 	mutex_unlock(&memcg_oom_mutex);
3746 }
3747 
3748 static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
3749 	struct cftype *cft,  struct cgroup_map_cb *cb)
3750 {
3751 	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3752 
3753 	cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
3754 
3755 	if (atomic_read(&mem->oom_lock))
3756 		cb->fill(cb, "under_oom", 1);
3757 	else
3758 		cb->fill(cb, "under_oom", 0);
3759 	return 0;
3760 }
3761 
3762 /* Enable or disable the OOM killer for this memcg (write handler behind
3763  * memory.oom_control); only the values 0 and 1 are accepted. */
3764 static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
3765 	struct cftype *cft, u64 val)
3766 {
3767 	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3768 	struct mem_cgroup *parent;
3769 
3770 	/* cannot be set on the root cgroup, and only 0 and 1 are allowed */
3771 	if (!cgrp->parent || !((val == 0) || (val == 1)))
3772 		return -EINVAL;
3773 
3774 	parent = mem_cgroup_from_cont(cgrp->parent);
3775 
3776 	cgroup_lock();
3777 	/* oom_kill_disable is a flag for the whole sub-hierarchy. */
3778 	if ((parent->use_hierarchy) ||
3779 	    (mem->use_hierarchy && !list_empty(&cgrp->children))) {
3780 		cgroup_unlock();
3781 		return -EINVAL;
3782 	}
3783 	mem->oom_kill_disable = val;
3784 	cgroup_unlock();
3785 	return 0;
3786 }
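
/*
 * Shell-level view of the two memory.oom_control handlers above (the
 * /cgroup/memory mount point is an assumption):
 *
 *	# echo 1 > /cgroup/memory/A/memory.oom_control	# disable OOM killing
 *	# cat /cgroup/memory/A/memory.oom_control
 *	oom_kill_disable 1
 *	under_oom 0
 *
 * The write fails with -EINVAL on the root cgroup, when the parent has
 * use_hierarchy set, or when this cgroup uses hierarchy and already has
 * children.
 */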
3787 
3788 static struct cftype mem_cgroup_files[] = {
3789 	{
3790 		.name = "usage_in_bytes",
3791 		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
3792 		.read_u64 = mem_cgroup_read,
3793 		.register_event = mem_cgroup_usage_register_event,
3794 		.unregister_event = mem_cgroup_usage_unregister_event,
3795 	},
3796 	{
3797 		.name = "max_usage_in_bytes",
3798 		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
3799 		.trigger = mem_cgroup_reset,
3800 		.read_u64 = mem_cgroup_read,
3801 	},
3802 	{
3803 		.name = "limit_in_bytes",
3804 		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
3805 		.write_string = mem_cgroup_write,
3806 		.read_u64 = mem_cgroup_read,
3807 	},
3808 	{
3809 		.name = "soft_limit_in_bytes",
3810 		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
3811 		.write_string = mem_cgroup_write,
3812 		.read_u64 = mem_cgroup_read,
3813 	},
3814 	{
3815 		.name = "failcnt",
3816 		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
3817 		.trigger = mem_cgroup_reset,
3818 		.read_u64 = mem_cgroup_read,
3819 	},
3820 	{
3821 		.name = "stat",
3822 		.read_map = mem_control_stat_show,
3823 	},
3824 	{
3825 		.name = "force_empty",
3826 		.trigger = mem_cgroup_force_empty_write,
3827 	},
3828 	{
3829 		.name = "use_hierarchy",
3830 		.write_u64 = mem_cgroup_hierarchy_write,
3831 		.read_u64 = mem_cgroup_hierarchy_read,
3832 	},
3833 	{
3834 		.name = "swappiness",
3835 		.read_u64 = mem_cgroup_swappiness_read,
3836 		.write_u64 = mem_cgroup_swappiness_write,
3837 	},
3838 	{
3839 		.name = "move_charge_at_immigrate",
3840 		.read_u64 = mem_cgroup_move_charge_read,
3841 		.write_u64 = mem_cgroup_move_charge_write,
3842 	},
3843 	{
3844 		.name = "oom_control",
3845 		.read_map = mem_cgroup_oom_control_read,
3846 		.write_u64 = mem_cgroup_oom_control_write,
3847 		.register_event = mem_cgroup_oom_register_event,
3848 		.unregister_event = mem_cgroup_oom_unregister_event,
3849 		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
3850 	},
3851 };
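
/*
 * cgroup_add_file() prefixes each entry with the subsystem name (unless the
 * hierarchy is mounted with the "noprefix" option), so the table above shows
 * up in every memory cgroup directory as memory.usage_in_bytes,
 * memory.limit_in_bytes, memory.oom_control and so on.  For example (mount
 * point assumed, value illustrative):
 *
 *	# echo 512M > /cgroup/memory/A/memory.limit_in_bytes
 *	# cat /cgroup/memory/A/memory.usage_in_bytes
 */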
3852 
3853 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3854 static struct cftype memsw_cgroup_files[] = {
3855 	{
3856 		.name = "memsw.usage_in_bytes",
3857 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
3858 		.read_u64 = mem_cgroup_read,
3859 		.register_event = mem_cgroup_usage_register_event,
3860 		.unregister_event = mem_cgroup_usage_unregister_event,
3861 	},
3862 	{
3863 		.name = "memsw.max_usage_in_bytes",
3864 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
3865 		.trigger = mem_cgroup_reset,
3866 		.read_u64 = mem_cgroup_read,
3867 	},
3868 	{
3869 		.name = "memsw.limit_in_bytes",
3870 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
3871 		.write_string = mem_cgroup_write,
3872 		.read_u64 = mem_cgroup_read,
3873 	},
3874 	{
3875 		.name = "memsw.failcnt",
3876 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
3877 		.trigger = mem_cgroup_reset,
3878 		.read_u64 = mem_cgroup_read,
3879 	},
3880 };
3881 
3882 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
3883 {
3884 	if (!do_swap_account)
3885 		return 0;
3886 	return cgroup_add_files(cont, ss, memsw_cgroup_files,
3887 				ARRAY_SIZE(memsw_cgroup_files));
3888 };
3889 #else
3890 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
3891 {
3892 	return 0;
3893 }
3894 #endif
3895 
3896 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
3897 {
3898 	struct mem_cgroup_per_node *pn;
3899 	struct mem_cgroup_per_zone *mz;
3900 	enum lru_list l;
3901 	int zone, tmp = node;
3902 	/*
3903 	 * This routine is called for every possible node, but it is a bug
3904 	 * to call kmalloc() against an offline node.
3905 	 *
3906 	 * TODO: this routine can waste a lot of memory for nodes which will
3907 	 *       never be onlined. It would be better to use a memory hotplug
3908 	 *       callback function.
3909 	 */
3910 	if (!node_state(node, N_NORMAL_MEMORY))
3911 		tmp = -1;
3912 	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
3913 	if (!pn)
3914 		return 1;
3915 
3916 	mem->info.nodeinfo[node] = pn;
3917 	memset(pn, 0, sizeof(*pn));
3918 
3919 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3920 		mz = &pn->zoneinfo[zone];
3921 		for_each_lru(l)
3922 			INIT_LIST_HEAD(&mz->lists[l]);
3923 		mz->usage_in_excess = 0;
3924 		mz->on_tree = false;
3925 		mz->mem = mem;
3926 	}
3927 	return 0;
3928 }
3929 
3930 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
3931 {
3932 	kfree(mem->info.nodeinfo[node]);
3933 }
3934 
3935 static struct mem_cgroup *mem_cgroup_alloc(void)
3936 {
3937 	struct mem_cgroup *mem;
3938 	int size = sizeof(struct mem_cgroup);
3939 
3940 	/* Can be very big if MAX_NUMNODES is very big */
3941 	if (size < PAGE_SIZE)
3942 		mem = kmalloc(size, GFP_KERNEL);
3943 	else
3944 		mem = vmalloc(size);
3945 
3946 	if (!mem)
3947 		return NULL;
3948 
3949 	memset(mem, 0, size);
3950 	mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
3951 	if (!mem->stat) {
3952 		if (size < PAGE_SIZE)
3953 			kfree(mem);
3954 		else
3955 			vfree(mem);
3956 		mem = NULL;
3957 	}
3958 	return mem;
3959 }
3960 
3961 /*
3962  * When a mem_cgroup is destroyed, references from swap_cgroup can still
3963  * remain (scanning them all at force_empty would be too costly).
3964  *
3965  * Instead of clearing all references at force_empty, we remember the
3966  * number of references from swap_cgroup and free the mem_cgroup only
3967  * when that count drops to 0.
3968  *
3969  * Removal of the cgroup itself succeeds regardless of refs from swap.
3970  */
3971 
3972 static void __mem_cgroup_free(struct mem_cgroup *mem)
3973 {
3974 	int node;
3975 
3976 	mem_cgroup_remove_from_trees(mem);
3977 	free_css_id(&mem_cgroup_subsys, &mem->css);
3978 
3979 	for_each_node_state(node, N_POSSIBLE)
3980 		free_mem_cgroup_per_zone_info(mem, node);
3981 
3982 	free_percpu(mem->stat);
3983 	if (sizeof(struct mem_cgroup) < PAGE_SIZE)
3984 		kfree(mem);
3985 	else
3986 		vfree(mem);
3987 }
3988 
3989 static void mem_cgroup_get(struct mem_cgroup *mem)
3990 {
3991 	atomic_inc(&mem->refcnt);
3992 }
3993 
3994 static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
3995 {
3996 	if (atomic_sub_and_test(count, &mem->refcnt)) {
3997 		struct mem_cgroup *parent = parent_mem_cgroup(mem);
3998 		__mem_cgroup_free(mem);
3999 		if (parent)
4000 			mem_cgroup_put(parent);
4001 	}
4002 }
4003 
4004 static void mem_cgroup_put(struct mem_cgroup *mem)
4005 {
4006 	__mem_cgroup_put(mem, 1);
4007 }
4008 
4009 /*
4010  * Returns the parent mem_cgroup of @mem when use_hierarchy is enabled.
4011  */
4012 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
4013 {
4014 	if (!mem->res.parent)
4015 		return NULL;
4016 	return mem_cgroup_from_res_counter(mem->res.parent, res);
4017 }
4018 
4019 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4020 static void __init enable_swap_cgroup(void)
4021 {
4022 	if (!mem_cgroup_disabled() && really_do_swap_account)
4023 		do_swap_account = 1;
4024 }
4025 #else
4026 static void __init enable_swap_cgroup(void)
4027 {
4028 }
4029 #endif
4030 
4031 static int mem_cgroup_soft_limit_tree_init(void)
4032 {
4033 	struct mem_cgroup_tree_per_node *rtpn;
4034 	struct mem_cgroup_tree_per_zone *rtpz;
4035 	int tmp, node, zone;
4036 
4037 	for_each_node_state(node, N_POSSIBLE) {
4038 		tmp = node;
4039 		if (!node_state(node, N_NORMAL_MEMORY))
4040 			tmp = -1;
4041 		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4042 		if (!rtpn)
4043 			return 1;
4044 
4045 		soft_limit_tree.rb_tree_per_node[node] = rtpn;
4046 
4047 		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4048 			rtpz = &rtpn->rb_tree_per_zone[zone];
4049 			rtpz->rb_root = RB_ROOT;
4050 			spin_lock_init(&rtpz->lock);
4051 		}
4052 	}
4053 	return 0;
4054 }
4055 
4056 static struct cgroup_subsys_state * __ref
4057 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4058 {
4059 	struct mem_cgroup *mem, *parent;
4060 	long error = -ENOMEM;
4061 	int node;
4062 
4063 	mem = mem_cgroup_alloc();
4064 	if (!mem)
4065 		return ERR_PTR(error);
4066 
4067 	for_each_node_state(node, N_POSSIBLE)
4068 		if (alloc_mem_cgroup_per_zone_info(mem, node))
4069 			goto free_out;
4070 
4071 	/* root cgroup? */
4072 	if (cont->parent == NULL) {
4073 		int cpu;
4074 		enable_swap_cgroup();
4075 		parent = NULL;
4076 		root_mem_cgroup = mem;
4077 		if (mem_cgroup_soft_limit_tree_init())
4078 			goto free_out;
4079 		for_each_possible_cpu(cpu) {
4080 			struct memcg_stock_pcp *stock =
4081 						&per_cpu(memcg_stock, cpu);
4082 			INIT_WORK(&stock->work, drain_local_stock);
4083 		}
4084 		hotcpu_notifier(memcg_stock_cpu_callback, 0);
4085 	} else {
4086 		parent = mem_cgroup_from_cont(cont->parent);
4087 		mem->use_hierarchy = parent->use_hierarchy;
4088 		mem->oom_kill_disable = parent->oom_kill_disable;
4089 	}
4090 
4091 	if (parent && parent->use_hierarchy) {
4092 		res_counter_init(&mem->res, &parent->res);
4093 		res_counter_init(&mem->memsw, &parent->memsw);
4094 		/*
4095 		 * We increment the refcnt of the parent to ensure that we can
4096 		 * safely access it on res_counter_charge/uncharge.
4097 		 * This refcnt will be decremented when freeing this
4098 		 * mem_cgroup (see mem_cgroup_put()).
4099 		 */
4100 		mem_cgroup_get(parent);
4101 	} else {
4102 		res_counter_init(&mem->res, NULL);
4103 		res_counter_init(&mem->memsw, NULL);
4104 	}
4105 	mem->last_scanned_child = 0;
4106 	spin_lock_init(&mem->reclaim_param_lock);
4107 	INIT_LIST_HEAD(&mem->oom_notify);
4108 
4109 	if (parent)
4110 		mem->swappiness = get_swappiness(parent);
4111 	atomic_set(&mem->refcnt, 1);
4112 	mem->move_charge_at_immigrate = 0;
4113 	mutex_init(&mem->thresholds_lock);
4114 	return &mem->css;
4115 free_out:
4116 	__mem_cgroup_free(mem);
4117 	root_mem_cgroup = NULL;
4118 	return ERR_PTR(error);
4119 }
4120 
4121 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
4122 					struct cgroup *cont)
4123 {
4124 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4125 
4126 	return mem_cgroup_force_empty(mem, false);
4127 }
4128 
4129 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
4130 				struct cgroup *cont)
4131 {
4132 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4133 
4134 	mem_cgroup_put(mem);
4135 }
4136 
4137 static int mem_cgroup_populate(struct cgroup_subsys *ss,
4138 				struct cgroup *cont)
4139 {
4140 	int ret;
4141 
4142 	ret = cgroup_add_files(cont, ss, mem_cgroup_files,
4143 				ARRAY_SIZE(mem_cgroup_files));
4144 
4145 	if (!ret)
4146 		ret = register_memsw_files(cont, ss);
4147 	return ret;
4148 }
4149 
4150 #ifdef CONFIG_MMU
4151 /* Handlers for move charge at task migration. */
4152 #define PRECHARGE_COUNT_AT_ONCE	256
4153 static int mem_cgroup_do_precharge(unsigned long count)
4154 {
4155 	int ret = 0;
4156 	int batch_count = PRECHARGE_COUNT_AT_ONCE;
4157 	struct mem_cgroup *mem = mc.to;
4158 
4159 	if (mem_cgroup_is_root(mem)) {
4160 		mc.precharge += count;
4161 		/* we don't need css_get for root */
4162 		return ret;
4163 	}
4164 	/* try to charge at once */
4165 	if (count > 1) {
4166 		struct res_counter *dummy;
4167 		/*
4168 		 * "mem" cannot be under rmdir(): cgroup_lock_live_cgroup() has
4169 		 * already checked that it has not been removed, and we are
4170 		 * still under the same cgroup_mutex. So we can postpone
4171 		 * css_get().
4172 		 */
4173 		if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
4174 			goto one_by_one;
4175 		if (do_swap_account && res_counter_charge(&mem->memsw,
4176 						PAGE_SIZE * count, &dummy)) {
4177 			res_counter_uncharge(&mem->res, PAGE_SIZE * count);
4178 			goto one_by_one;
4179 		}
4180 		mc.precharge += count;
4181 		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
4182 		WARN_ON_ONCE(count > INT_MAX);
4183 		__css_get(&mem->css, (int)count);
4184 		return ret;
4185 	}
4186 one_by_one:
4187 	/* fall back to charging one page at a time */
4188 	while (count--) {
4189 		if (signal_pending(current)) {
4190 			ret = -EINTR;
4191 			break;
4192 		}
4193 		if (!batch_count--) {
4194 			batch_count = PRECHARGE_COUNT_AT_ONCE;
4195 			cond_resched();
4196 		}
4197 		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
4198 		if (ret || !mem)
4199 			/* mem_cgroup_clear_mc() will do uncharge later */
4200 			return -ENOMEM;
4201 		mc.precharge++;
4202 	}
4203 	return ret;
4204 }
4205 
4206 /**
4207  * is_target_pte_for_mc - check whether a pte is a valid target for move charge
4208  * @vma: the vma to which the pte belongs
4209  * @addr: the address corresponding to the pte to be checked
4210  * @ptent: the pte to be checked
4211  * @target: pointer where the target page or swap entry is stored (can be NULL)
4212  *
4213  * Returns
4214  *   0 (MC_TARGET_NONE): the pte is not a target for move charge.
4215  *   1 (MC_TARGET_PAGE): the page corresponding to this pte is a target for
4216  *     move charge. If @target is not NULL, the page is stored in target->page
4217  *     with an extra refcount taken (callers should handle it).
4218  *   2 (MC_TARGET_SWAP): the swap entry corresponding to this pte is a target
4219  *     for charge migration. If @target is not NULL, the entry is stored in
4220  *     target->ent.
4221  *
4222  * Called with the pte lock held.
4223  */
4224 union mc_target {
4225 	struct page	*page;
4226 	swp_entry_t	ent;
4227 };
4228 
4229 enum mc_target_type {
4230 	MC_TARGET_NONE,	/* not used */
4231 	MC_TARGET_PAGE,
4232 	MC_TARGET_SWAP,
4233 };
4234 
4235 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4236 						unsigned long addr, pte_t ptent)
4237 {
4238 	struct page *page = vm_normal_page(vma, addr, ptent);
4239 
4240 	if (!page || !page_mapped(page))
4241 		return NULL;
4242 	if (PageAnon(page)) {
4243 		/* we don't move shared anon */
4244 		if (!move_anon() || page_mapcount(page) > 2)
4245 			return NULL;
4246 	} else if (!move_file())
4247 		/* we ignore mapcount for file pages */
4248 		return NULL;
4249 	if (!get_page_unless_zero(page))
4250 		return NULL;
4251 
4252 	return page;
4253 }
4254 
4255 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4256 			unsigned long addr, pte_t ptent, swp_entry_t *entry)
4257 {
4258 	int usage_count;
4259 	struct page *page = NULL;
4260 	swp_entry_t ent = pte_to_swp_entry(ptent);
4261 
4262 	if (!move_anon() || non_swap_entry(ent))
4263 		return NULL;
4264 	usage_count = mem_cgroup_count_swap_user(ent, &page);
4265 	if (usage_count > 1) { /* we don't move shared anon */
4266 		if (page)
4267 			put_page(page);
4268 		return NULL;
4269 	}
4270 	if (do_swap_account)
4271 		entry->val = ent.val;
4272 
4273 	return page;
4274 }
4275 
4276 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4277 			unsigned long addr, pte_t ptent, swp_entry_t *entry)
4278 {
4279 	struct page *page = NULL;
4280 	struct inode *inode;
4281 	struct address_space *mapping;
4282 	pgoff_t pgoff;
4283 
4284 	if (!vma->vm_file) /* anonymous vma */
4285 		return NULL;
4286 	if (!move_file())
4287 		return NULL;
4288 
4289 	inode = vma->vm_file->f_path.dentry->d_inode;
4290 	mapping = vma->vm_file->f_mapping;
4291 	if (pte_none(ptent))
4292 		pgoff = linear_page_index(vma, addr);
4293 	else /* pte_file(ptent) is true */
4294 		pgoff = pte_to_pgoff(ptent);
4295 
4296 	/* The page is moved even if it's not this task's RSS (not yet faulted in). */
4297 	if (!mapping_cap_swap_backed(mapping)) { /* normal file */
4298 		page = find_get_page(mapping, pgoff);
4299 	} else { /* shmem/tmpfs file. we should take account of swap too. */
4300 		swp_entry_t ent;
4301 		mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
4302 		if (do_swap_account)
4303 			entry->val = ent.val;
4304 	}
4305 
4306 	return page;
4307 }
4308 
4309 static int is_target_pte_for_mc(struct vm_area_struct *vma,
4310 		unsigned long addr, pte_t ptent, union mc_target *target)
4311 {
4312 	struct page *page = NULL;
4313 	struct page_cgroup *pc;
4314 	int ret = 0;
4315 	swp_entry_t ent = { .val = 0 };
4316 
4317 	if (pte_present(ptent))
4318 		page = mc_handle_present_pte(vma, addr, ptent);
4319 	else if (is_swap_pte(ptent))
4320 		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
4321 	else if (pte_none(ptent) || pte_file(ptent))
4322 		page = mc_handle_file_pte(vma, addr, ptent, &ent);
4323 
4324 	if (!page && !ent.val)
4325 		return 0;
4326 	if (page) {
4327 		pc = lookup_page_cgroup(page);
4328 		/*
4329 		 * Do only loose check w/o page_cgroup lock.
4330 		 * mem_cgroup_move_account() checks the pc is valid or not under
4331 		 * the lock.
4332 		 */
4333 		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
4334 			ret = MC_TARGET_PAGE;
4335 			if (target)
4336 				target->page = page;
4337 		}
4338 		if (!ret || !target)
4339 			put_page(page);
4340 	}
4341 	/* There is a swap entry and the page doesn't exist or isn't charged */
4342 	if (ent.val && !ret &&
4343 			css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
4344 		ret = MC_TARGET_SWAP;
4345 		if (target)
4346 			target->ent = ent;
4347 	}
4348 	return ret;
4349 }
4350 
4351 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4352 					unsigned long addr, unsigned long end,
4353 					struct mm_walk *walk)
4354 {
4355 	struct vm_area_struct *vma = walk->private;
4356 	pte_t *pte;
4357 	spinlock_t *ptl;
4358 
4359 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4360 	for (; addr != end; pte++, addr += PAGE_SIZE)
4361 		if (is_target_pte_for_mc(vma, addr, *pte, NULL))
4362 			mc.precharge++;	/* increment precharge temporarily */
4363 	pte_unmap_unlock(pte - 1, ptl);
4364 	cond_resched();
4365 
4366 	return 0;
4367 }
4368 
4369 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4370 {
4371 	unsigned long precharge;
4372 	struct vm_area_struct *vma;
4373 
4374 	down_read(&mm->mmap_sem);
4375 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
4376 		struct mm_walk mem_cgroup_count_precharge_walk = {
4377 			.pmd_entry = mem_cgroup_count_precharge_pte_range,
4378 			.mm = mm,
4379 			.private = vma,
4380 		};
4381 		if (is_vm_hugetlb_page(vma))
4382 			continue;
4383 		walk_page_range(vma->vm_start, vma->vm_end,
4384 					&mem_cgroup_count_precharge_walk);
4385 	}
4386 	up_read(&mm->mmap_sem);
4387 
4388 	precharge = mc.precharge;
4389 	mc.precharge = 0;
4390 
4391 	return precharge;
4392 }
4393 
4394 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4395 {
4396 	return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
4397 }
4398 
4399 static void mem_cgroup_clear_mc(void)
4400 {
4401 	/* we must uncharge all the leftover precharges from mc.to */
4402 	if (mc.precharge) {
4403 		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
4404 		mc.precharge = 0;
4405 		memcg_oom_recover(mc.to);
4406 	}
4407 	/*
4408 	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
4409 	 * we must uncharge here.
4410 	 */
4411 	if (mc.moved_charge) {
4412 		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
4413 		mc.moved_charge = 0;
4414 		memcg_oom_recover(mc.from);
4415 	}
4416 	/* we must fixup refcnts and charges */
4417 	if (mc.moved_swap) {
4418 		WARN_ON_ONCE(mc.moved_swap > INT_MAX);
4419 		/* uncharge swap account from the old cgroup */
4420 		if (!mem_cgroup_is_root(mc.from))
4421 			res_counter_uncharge(&mc.from->memsw,
4422 						PAGE_SIZE * mc.moved_swap);
4423 		__mem_cgroup_put(mc.from, mc.moved_swap);
4424 
4425 		if (!mem_cgroup_is_root(mc.to)) {
4426 			/*
4427 			 * we charged both to->res and to->memsw, so we should
4428 			 * uncharge to->res.
4429 			 */
4430 			res_counter_uncharge(&mc.to->res,
4431 						PAGE_SIZE * mc.moved_swap);
4432 			VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
4433 			__css_put(&mc.to->css, mc.moved_swap);
4434 		}
4435 		/* we've already done mem_cgroup_get(mc.to) */
4436 
4437 		mc.moved_swap = 0;
4438 	}
4439 	mc.from = NULL;
4440 	mc.to = NULL;
4441 	mc.moving_task = NULL;
4442 	wake_up_all(&mc.waitq);
4443 }
4444 
4445 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4446 				struct cgroup *cgroup,
4447 				struct task_struct *p,
4448 				bool threadgroup)
4449 {
4450 	int ret = 0;
4451 	struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
4452 
4453 	if (mem->move_charge_at_immigrate) {
4454 		struct mm_struct *mm;
4455 		struct mem_cgroup *from = mem_cgroup_from_task(p);
4456 
4457 		VM_BUG_ON(from == mem);
4458 
4459 		mm = get_task_mm(p);
4460 		if (!mm)
4461 			return 0;
4462 		/* We move charges only when we move the owner of the mm */
4463 		if (mm->owner == p) {
4464 			VM_BUG_ON(mc.from);
4465 			VM_BUG_ON(mc.to);
4466 			VM_BUG_ON(mc.precharge);
4467 			VM_BUG_ON(mc.moved_charge);
4468 			VM_BUG_ON(mc.moved_swap);
4469 			VM_BUG_ON(mc.moving_task);
4470 			mc.from = from;
4471 			mc.to = mem;
4472 			mc.precharge = 0;
4473 			mc.moved_charge = 0;
4474 			mc.moved_swap = 0;
4475 			mc.moving_task = current;
4476 
4477 			ret = mem_cgroup_precharge_mc(mm);
4478 			if (ret)
4479 				mem_cgroup_clear_mc();
4480 		}
4481 		mmput(mm);
4482 	}
4483 	return ret;
4484 }
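
/*
 * The charge-moving machinery in this block is opted into from userspace
 * via memory.move_charge_at_immigrate before a task is moved: bit 0 enables
 * moving anonymous pages and bit 1 file pages, as tested by move_anon() and
 * move_file().  A minimal shell sketch (mount point and pid are
 * assumptions):
 *
 *	# echo 3 > /cgroup/memory/B/memory.move_charge_at_immigrate
 *	# echo $PID > /cgroup/memory/B/tasks
 *
 * Charges are moved only when $PID is the owner of its mm, as checked in
 * mem_cgroup_can_attach() above.
 */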
4485 
4486 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4487 				struct cgroup *cgroup,
4488 				struct task_struct *p,
4489 				bool threadgroup)
4490 {
4491 	mem_cgroup_clear_mc();
4492 }
4493 
4494 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4495 				unsigned long addr, unsigned long end,
4496 				struct mm_walk *walk)
4497 {
4498 	int ret = 0;
4499 	struct vm_area_struct *vma = walk->private;
4500 	pte_t *pte;
4501 	spinlock_t *ptl;
4502 
4503 retry:
4504 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4505 	for (; addr != end; addr += PAGE_SIZE) {
4506 		pte_t ptent = *(pte++);
4507 		union mc_target target;
4508 		int type;
4509 		struct page *page;
4510 		struct page_cgroup *pc;
4511 		swp_entry_t ent;
4512 
4513 		if (!mc.precharge)
4514 			break;
4515 
4516 		type = is_target_pte_for_mc(vma, addr, ptent, &target);
4517 		switch (type) {
4518 		case MC_TARGET_PAGE:
4519 			page = target.page;
4520 			if (isolate_lru_page(page))
4521 				goto put;
4522 			pc = lookup_page_cgroup(page);
4523 			if (!mem_cgroup_move_account(pc,
4524 						mc.from, mc.to, false)) {
4525 				mc.precharge--;
4526 				/* we uncharge from mc.from later. */
4527 				mc.moved_charge++;
4528 			}
4529 			putback_lru_page(page);
4530 put:			/* is_target_pte_for_mc() took a reference on the page */
4531 			put_page(page);
4532 			break;
4533 		case MC_TARGET_SWAP:
4534 			ent = target.ent;
4535 			if (!mem_cgroup_move_swap_account(ent,
4536 						mc.from, mc.to, false)) {
4537 				mc.precharge--;
4538 				/* we fixup refcnts and charges later. */
4539 				mc.moved_swap++;
4540 			}
4541 			break;
4542 		default:
4543 			break;
4544 		}
4545 	}
4546 	pte_unmap_unlock(pte - 1, ptl);
4547 	cond_resched();
4548 
4549 	if (addr != end) {
4550 		/*
4551 		 * We have consumed all the precharges we got in can_attach().
4552 		 * We try to charge one by one, but don't do any additional
4553 		 * charges to mc.to if we have already failed to charge once
4554 		 * during the attach() phase.
4555 		 */
4556 		ret = mem_cgroup_do_precharge(1);
4557 		if (!ret)
4558 			goto retry;
4559 	}
4560 
4561 	return ret;
4562 }
4563 
4564 static void mem_cgroup_move_charge(struct mm_struct *mm)
4565 {
4566 	struct vm_area_struct *vma;
4567 
4568 	lru_add_drain_all();
4569 	down_read(&mm->mmap_sem);
4570 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
4571 		int ret;
4572 		struct mm_walk mem_cgroup_move_charge_walk = {
4573 			.pmd_entry = mem_cgroup_move_charge_pte_range,
4574 			.mm = mm,
4575 			.private = vma,
4576 		};
4577 		if (is_vm_hugetlb_page(vma))
4578 			continue;
4579 		ret = walk_page_range(vma->vm_start, vma->vm_end,
4580 						&mem_cgroup_move_charge_walk);
4581 		if (ret)
4582 			/*
4583 			 * A non-zero return means we consumed all precharges
4584 			 * and failed an additional charge. Just give up here.
4585 			 */
4586 			break;
4587 	}
4588 	up_read(&mm->mmap_sem);
4589 }
4590 
4591 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4592 				struct cgroup *cont,
4593 				struct cgroup *old_cont,
4594 				struct task_struct *p,
4595 				bool threadgroup)
4596 {
4597 	struct mm_struct *mm;
4598 
4599 	if (!mc.to)
4600 		/* no need to move charge */
4601 		return;
4602 
4603 	mm = get_task_mm(p);
4604 	if (mm) {
4605 		mem_cgroup_move_charge(mm);
4606 		mmput(mm);
4607 	}
4608 	mem_cgroup_clear_mc();
4609 }
4610 #else	/* !CONFIG_MMU */
4611 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4612 				struct cgroup *cgroup,
4613 				struct task_struct *p,
4614 				bool threadgroup)
4615 {
4616 	return 0;
4617 }
4618 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4619 				struct cgroup *cgroup,
4620 				struct task_struct *p,
4621 				bool threadgroup)
4622 {
4623 }
4624 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4625 				struct cgroup *cont,
4626 				struct cgroup *old_cont,
4627 				struct task_struct *p,
4628 				bool threadgroup)
4629 {
4630 }
4631 #endif
4632 
4633 struct cgroup_subsys mem_cgroup_subsys = {
4634 	.name = "memory",
4635 	.subsys_id = mem_cgroup_subsys_id,
4636 	.create = mem_cgroup_create,
4637 	.pre_destroy = mem_cgroup_pre_destroy,
4638 	.destroy = mem_cgroup_destroy,
4639 	.populate = mem_cgroup_populate,
4640 	.can_attach = mem_cgroup_can_attach,
4641 	.cancel_attach = mem_cgroup_cancel_attach,
4642 	.attach = mem_cgroup_move_task,
4643 	.early_init = 0,
4644 	.use_id = 1,
4645 };
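
/*
 * The subsystem declared above becomes usable once a cgroup hierarchy is
 * mounted with the "memory" option; creating a directory in it ends up in
 * mem_cgroup_create().  Illustrative commands (mount point arbitrary):
 *
 *	# mkdir -p /cgroup/memory
 *	# mount -t cgroup -o memory none /cgroup/memory
 *	# mkdir /cgroup/memory/A	# creates a child mem_cgroup
 */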
4646 
4647 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4648 
4649 static int __init disable_swap_account(char *s)
4650 {
4651 	really_do_swap_account = 0;
4652 	return 1;
4653 }
4654 __setup("noswapaccount", disable_swap_account);
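
/*
 * Swap accounting can therefore be turned off at boot by appending
 * "noswapaccount" to the kernel command line, for example (boot entry
 * purely illustrative):
 *
 *	linux /boot/vmlinuz root=/dev/sda1 noswapaccount
 */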
4655 #endif
4656