xref: /openbmc/linux/mm/memcontrol.c (revision f42b3800)
1 /* memcontrol.c - Memory Controller
2  *
3  * Copyright IBM Corporation, 2007
4  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5  *
6  * Copyright 2007 OpenVZ SWsoft Inc
7  * Author: Pavel Emelianov <xemul@openvz.org>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  */
19 
20 #include <linux/res_counter.h>
21 #include <linux/memcontrol.h>
22 #include <linux/cgroup.h>
23 #include <linux/mm.h>
24 #include <linux/smp.h>
25 #include <linux/page-flags.h>
26 #include <linux/backing-dev.h>
27 #include <linux/bit_spinlock.h>
28 #include <linux/rcupdate.h>
29 #include <linux/swap.h>
30 #include <linux/spinlock.h>
31 #include <linux/fs.h>
32 #include <linux/seq_file.h>
33 
34 #include <asm/uaccess.h>
35 
36 struct cgroup_subsys mem_cgroup_subsys;
37 static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
38 
39 /*
40  * Statistics for memory cgroup.
41  */
42 enum mem_cgroup_stat_index {
43 	/*
44 	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
45 	 */
46 	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
47 	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as rss */
48 
49 	MEM_CGROUP_STAT_NSTATS,
50 };
51 
52 struct mem_cgroup_stat_cpu {
53 	s64 count[MEM_CGROUP_STAT_NSTATS];
54 } ____cacheline_aligned_in_smp;
55 
56 struct mem_cgroup_stat {
57 	struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
58 };
59 
60 /*
61  * For accounting under irq disable, no need for increment preempt count.
62  */
63 static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat,
64 		enum mem_cgroup_stat_index idx, int val)
65 {
66 	int cpu = smp_processor_id();
67 	stat->cpustat[cpu].count[idx] += val;
68 }
69 
70 static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
71 		enum mem_cgroup_stat_index idx)
72 {
73 	int cpu;
74 	s64 ret = 0;
75 	for_each_possible_cpu(cpu)
76 		ret += stat->cpustat[cpu].count[idx];
77 	return ret;
78 }
79 
80 /*
81  * per-zone information in memory controller.
82  */
83 
84 enum mem_cgroup_zstat_index {
85 	MEM_CGROUP_ZSTAT_ACTIVE,
86 	MEM_CGROUP_ZSTAT_INACTIVE,
87 
88 	NR_MEM_CGROUP_ZSTAT,
89 };
90 
91 struct mem_cgroup_per_zone {
92 	/*
93 	 * spin_lock to protect the per cgroup LRU
94 	 */
95 	spinlock_t		lru_lock;
96 	struct list_head	active_list;
97 	struct list_head	inactive_list;
98 	unsigned long count[NR_MEM_CGROUP_ZSTAT];
99 };
100 /* Macro for accessing counter */
101 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
102 
103 struct mem_cgroup_per_node {
104 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
105 };
106 
107 struct mem_cgroup_lru_info {
108 	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
109 };
110 
111 /*
112  * The memory controller data structure. The memory controller controls both
113  * page cache and RSS per cgroup. We would eventually like to provide
114  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
115  * to help the administrator determine what knobs to tune.
116  *
117  * TODO: Add a water mark for the memory controller. Reclaim will begin when
118  * we hit the water mark. May be even add a low water mark, such that
119  * no reclaim occurs from a cgroup at it's low water mark, this is
120  * a feature that will be implemented much later in the future.
121  */
122 struct mem_cgroup {
123 	struct cgroup_subsys_state css;
124 	/*
125 	 * the counter to account for memory usage
126 	 */
127 	struct res_counter res;
128 	/*
129 	 * Per cgroup active and inactive list, similar to the
130 	 * per zone LRU lists.
131 	 */
132 	struct mem_cgroup_lru_info info;
133 
134 	int	prev_priority;	/* for recording reclaim priority */
135 	/*
136 	 * statistics.
137 	 */
138 	struct mem_cgroup_stat stat;
139 };
140 static struct mem_cgroup init_mem_cgroup;
141 
142 /*
143  * We use the lower bit of the page->page_cgroup pointer as a bit spin
144  * lock.  We need to ensure that page->page_cgroup is at least two
145  * byte aligned (based on comments from Nick Piggin).  But since
146  * bit_spin_lock doesn't actually set that lock bit in a non-debug
147  * uniprocessor kernel, we should avoid setting it here too.
148  */
149 #define PAGE_CGROUP_LOCK_BIT 	0x0
150 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
151 #define PAGE_CGROUP_LOCK 	(1 << PAGE_CGROUP_LOCK_BIT)
152 #else
153 #define PAGE_CGROUP_LOCK	0x0
154 #endif
155 
156 /*
157  * A page_cgroup page is associated with every page descriptor. The
158  * page_cgroup helps us identify information about the cgroup
159  */
160 struct page_cgroup {
161 	struct list_head lru;		/* per cgroup LRU list */
162 	struct page *page;
163 	struct mem_cgroup *mem_cgroup;
164 	int ref_cnt;			/* cached, mapped, migrating */
165 	int flags;
166 };
167 #define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
168 #define PAGE_CGROUP_FLAG_ACTIVE (0x2)	/* page is active in this cgroup */
169 
170 static int page_cgroup_nid(struct page_cgroup *pc)
171 {
172 	return page_to_nid(pc->page);
173 }
174 
175 static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
176 {
177 	return page_zonenum(pc->page);
178 }
179 
180 enum charge_type {
181 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
182 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
183 };
184 
185 /*
186  * Always modified under lru lock. Then, not necessary to preempt_disable()
187  */
188 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
189 					bool charge)
190 {
191 	int val = (charge)? 1 : -1;
192 	struct mem_cgroup_stat *stat = &mem->stat;
193 
194 	VM_BUG_ON(!irqs_disabled());
195 	if (flags & PAGE_CGROUP_FLAG_CACHE)
196 		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val);
197 	else
198 		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
199 }
200 
201 static struct mem_cgroup_per_zone *
202 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
203 {
204 	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
205 }
206 
207 static struct mem_cgroup_per_zone *
208 page_cgroup_zoneinfo(struct page_cgroup *pc)
209 {
210 	struct mem_cgroup *mem = pc->mem_cgroup;
211 	int nid = page_cgroup_nid(pc);
212 	int zid = page_cgroup_zid(pc);
213 
214 	return mem_cgroup_zoneinfo(mem, nid, zid);
215 }
216 
217 static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
218 					enum mem_cgroup_zstat_index idx)
219 {
220 	int nid, zid;
221 	struct mem_cgroup_per_zone *mz;
222 	u64 total = 0;
223 
224 	for_each_online_node(nid)
225 		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
226 			mz = mem_cgroup_zoneinfo(mem, nid, zid);
227 			total += MEM_CGROUP_ZSTAT(mz, idx);
228 		}
229 	return total;
230 }
231 
232 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
233 {
234 	return container_of(cgroup_subsys_state(cont,
235 				mem_cgroup_subsys_id), struct mem_cgroup,
236 				css);
237 }
238 
239 static struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
240 {
241 	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
242 				struct mem_cgroup, css);
243 }
244 
245 void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
246 {
247 	struct mem_cgroup *mem;
248 
249 	mem = mem_cgroup_from_task(p);
250 	css_get(&mem->css);
251 	mm->mem_cgroup = mem;
252 }
253 
254 void mm_free_cgroup(struct mm_struct *mm)
255 {
256 	css_put(&mm->mem_cgroup->css);
257 }
258 
259 static inline int page_cgroup_locked(struct page *page)
260 {
261 	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
262 }
263 
264 static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
265 {
266 	VM_BUG_ON(!page_cgroup_locked(page));
267 	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
268 }
269 
270 struct page_cgroup *page_get_page_cgroup(struct page *page)
271 {
272 	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
273 }
274 
275 static void lock_page_cgroup(struct page *page)
276 {
277 	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
278 }
279 
280 static int try_lock_page_cgroup(struct page *page)
281 {
282 	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
283 }
284 
285 static void unlock_page_cgroup(struct page *page)
286 {
287 	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
288 }
289 
290 static void __mem_cgroup_remove_list(struct page_cgroup *pc)
291 {
292 	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
293 	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
294 
295 	if (from)
296 		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
297 	else
298 		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
299 
300 	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
301 	list_del_init(&pc->lru);
302 }
303 
304 static void __mem_cgroup_add_list(struct page_cgroup *pc)
305 {
306 	int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
307 	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
308 
309 	if (!to) {
310 		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
311 		list_add(&pc->lru, &mz->inactive_list);
312 	} else {
313 		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
314 		list_add(&pc->lru, &mz->active_list);
315 	}
316 	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
317 }
318 
319 static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
320 {
321 	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
322 	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
323 
324 	if (from)
325 		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
326 	else
327 		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
328 
329 	if (active) {
330 		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
331 		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
332 		list_move(&pc->lru, &mz->active_list);
333 	} else {
334 		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
335 		pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
336 		list_move(&pc->lru, &mz->inactive_list);
337 	}
338 }
339 
340 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
341 {
342 	int ret;
343 
344 	task_lock(task);
345 	ret = task->mm && mm_match_cgroup(task->mm, mem);
346 	task_unlock(task);
347 	return ret;
348 }
349 
350 /*
351  * This routine assumes that the appropriate zone's lru lock is already held
352  */
353 void mem_cgroup_move_lists(struct page *page, bool active)
354 {
355 	struct page_cgroup *pc;
356 	struct mem_cgroup_per_zone *mz;
357 	unsigned long flags;
358 
359 	/*
360 	 * We cannot lock_page_cgroup while holding zone's lru_lock,
361 	 * because other holders of lock_page_cgroup can be interrupted
362 	 * with an attempt to rotate_reclaimable_page.  But we cannot
363 	 * safely get to page_cgroup without it, so just try_lock it:
364 	 * mem_cgroup_isolate_pages allows for page left on wrong list.
365 	 */
366 	if (!try_lock_page_cgroup(page))
367 		return;
368 
369 	pc = page_get_page_cgroup(page);
370 	if (pc) {
371 		mz = page_cgroup_zoneinfo(pc);
372 		spin_lock_irqsave(&mz->lru_lock, flags);
373 		__mem_cgroup_move_lists(pc, active);
374 		spin_unlock_irqrestore(&mz->lru_lock, flags);
375 	}
376 	unlock_page_cgroup(page);
377 }
378 
379 /*
380  * Calculate mapped_ratio under memory controller. This will be used in
381  * vmscan.c for deteremining we have to reclaim mapped pages.
382  */
383 int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
384 {
385 	long total, rss;
386 
387 	/*
388 	 * usage is recorded in bytes. But, here, we assume the number of
389 	 * physical pages can be represented by "long" on any arch.
390 	 */
391 	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
392 	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
393 	return (int)((rss * 100L) / total);
394 }
395 
396 /*
397  * This function is called from vmscan.c. In page reclaiming loop. balance
398  * between active and inactive list is calculated. For memory controller
399  * page reclaiming, we should use using mem_cgroup's imbalance rather than
400  * zone's global lru imbalance.
401  */
402 long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
403 {
404 	unsigned long active, inactive;
405 	/* active and inactive are the number of pages. 'long' is ok.*/
406 	active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
407 	inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE);
408 	return (long) (active / (inactive + 1));
409 }
410 
411 /*
412  * prev_priority control...this will be used in memory reclaim path.
413  */
414 int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
415 {
416 	return mem->prev_priority;
417 }
418 
419 void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
420 {
421 	if (priority < mem->prev_priority)
422 		mem->prev_priority = priority;
423 }
424 
425 void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
426 {
427 	mem->prev_priority = priority;
428 }
429 
430 /*
431  * Calculate # of pages to be scanned in this priority/zone.
432  * See also vmscan.c
433  *
434  * priority starts from "DEF_PRIORITY" and decremented in each loop.
435  * (see include/linux/mmzone.h)
436  */
437 
438 long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
439 				   struct zone *zone, int priority)
440 {
441 	long nr_active;
442 	int nid = zone->zone_pgdat->node_id;
443 	int zid = zone_idx(zone);
444 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
445 
446 	nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE);
447 	return (nr_active >> priority);
448 }
449 
450 long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
451 					struct zone *zone, int priority)
452 {
453 	long nr_inactive;
454 	int nid = zone->zone_pgdat->node_id;
455 	int zid = zone_idx(zone);
456 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
457 
458 	nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
459 	return (nr_inactive >> priority);
460 }
461 
462 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
463 					struct list_head *dst,
464 					unsigned long *scanned, int order,
465 					int mode, struct zone *z,
466 					struct mem_cgroup *mem_cont,
467 					int active)
468 {
469 	unsigned long nr_taken = 0;
470 	struct page *page;
471 	unsigned long scan;
472 	LIST_HEAD(pc_list);
473 	struct list_head *src;
474 	struct page_cgroup *pc, *tmp;
475 	int nid = z->zone_pgdat->node_id;
476 	int zid = zone_idx(z);
477 	struct mem_cgroup_per_zone *mz;
478 
479 	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
480 	if (active)
481 		src = &mz->active_list;
482 	else
483 		src = &mz->inactive_list;
484 
485 
486 	spin_lock(&mz->lru_lock);
487 	scan = 0;
488 	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
489 		if (scan >= nr_to_scan)
490 			break;
491 		page = pc->page;
492 
493 		if (unlikely(!PageLRU(page)))
494 			continue;
495 
496 		if (PageActive(page) && !active) {
497 			__mem_cgroup_move_lists(pc, true);
498 			continue;
499 		}
500 		if (!PageActive(page) && active) {
501 			__mem_cgroup_move_lists(pc, false);
502 			continue;
503 		}
504 
505 		scan++;
506 		list_move(&pc->lru, &pc_list);
507 
508 		if (__isolate_lru_page(page, mode) == 0) {
509 			list_move(&page->lru, dst);
510 			nr_taken++;
511 		}
512 	}
513 
514 	list_splice(&pc_list, src);
515 	spin_unlock(&mz->lru_lock);
516 
517 	*scanned = scan;
518 	return nr_taken;
519 }
520 
521 /*
522  * Charge the memory controller for page usage.
523  * Return
524  * 0 if the charge was successful
525  * < 0 if the cgroup is over its limit
526  */
527 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
528 				gfp_t gfp_mask, enum charge_type ctype)
529 {
530 	struct mem_cgroup *mem;
531 	struct page_cgroup *pc;
532 	unsigned long flags;
533 	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
534 	struct mem_cgroup_per_zone *mz;
535 
536 	if (mem_cgroup_subsys.disabled)
537 		return 0;
538 
539 	/*
540 	 * Should page_cgroup's go to their own slab?
541 	 * One could optimize the performance of the charging routine
542 	 * by saving a bit in the page_flags and using it as a lock
543 	 * to see if the cgroup page already has a page_cgroup associated
544 	 * with it
545 	 */
546 retry:
547 	lock_page_cgroup(page);
548 	pc = page_get_page_cgroup(page);
549 	/*
550 	 * The page_cgroup exists and
551 	 * the page has already been accounted.
552 	 */
553 	if (pc) {
554 		VM_BUG_ON(pc->page != page);
555 		VM_BUG_ON(pc->ref_cnt <= 0);
556 
557 		pc->ref_cnt++;
558 		unlock_page_cgroup(page);
559 		goto done;
560 	}
561 	unlock_page_cgroup(page);
562 
563 	pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
564 	if (pc == NULL)
565 		goto err;
566 
567 	/*
568 	 * We always charge the cgroup the mm_struct belongs to.
569 	 * The mm_struct's mem_cgroup changes on task migration if the
570 	 * thread group leader migrates. It's possible that mm is not
571 	 * set, if so charge the init_mm (happens for pagecache usage).
572 	 */
573 	if (!mm)
574 		mm = &init_mm;
575 
576 	rcu_read_lock();
577 	mem = rcu_dereference(mm->mem_cgroup);
578 	/*
579 	 * For every charge from the cgroup, increment reference count
580 	 */
581 	css_get(&mem->css);
582 	rcu_read_unlock();
583 
584 	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
585 		if (!(gfp_mask & __GFP_WAIT))
586 			goto out;
587 
588 		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
589 			continue;
590 
591 		/*
592 		 * try_to_free_mem_cgroup_pages() might not give us a full
593 		 * picture of reclaim. Some pages are reclaimed and might be
594 		 * moved to swap cache or just unmapped from the cgroup.
595 		 * Check the limit again to see if the reclaim reduced the
596 		 * current usage of the cgroup before giving up
597 		 */
598 		if (res_counter_check_under_limit(&mem->res))
599 			continue;
600 
601 		if (!nr_retries--) {
602 			mem_cgroup_out_of_memory(mem, gfp_mask);
603 			goto out;
604 		}
605 		congestion_wait(WRITE, HZ/10);
606 	}
607 
608 	pc->ref_cnt = 1;
609 	pc->mem_cgroup = mem;
610 	pc->page = page;
611 	pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
612 	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
613 		pc->flags |= PAGE_CGROUP_FLAG_CACHE;
614 
615 	lock_page_cgroup(page);
616 	if (page_get_page_cgroup(page)) {
617 		unlock_page_cgroup(page);
618 		/*
619 		 * Another charge has been added to this page already.
620 		 * We take lock_page_cgroup(page) again and read
621 		 * page->cgroup, increment refcnt.... just retry is OK.
622 		 */
623 		res_counter_uncharge(&mem->res, PAGE_SIZE);
624 		css_put(&mem->css);
625 		kfree(pc);
626 		goto retry;
627 	}
628 	page_assign_page_cgroup(page, pc);
629 
630 	mz = page_cgroup_zoneinfo(pc);
631 	spin_lock_irqsave(&mz->lru_lock, flags);
632 	__mem_cgroup_add_list(pc);
633 	spin_unlock_irqrestore(&mz->lru_lock, flags);
634 
635 	unlock_page_cgroup(page);
636 done:
637 	return 0;
638 out:
639 	css_put(&mem->css);
640 	kfree(pc);
641 err:
642 	return -ENOMEM;
643 }
644 
645 int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
646 {
647 	return mem_cgroup_charge_common(page, mm, gfp_mask,
648 				MEM_CGROUP_CHARGE_TYPE_MAPPED);
649 }
650 
651 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
652 				gfp_t gfp_mask)
653 {
654 	if (!mm)
655 		mm = &init_mm;
656 	return mem_cgroup_charge_common(page, mm, gfp_mask,
657 				MEM_CGROUP_CHARGE_TYPE_CACHE);
658 }
659 
660 /*
661  * Uncharging is always a welcome operation, we never complain, simply
662  * uncharge.
663  */
664 void mem_cgroup_uncharge_page(struct page *page)
665 {
666 	struct page_cgroup *pc;
667 	struct mem_cgroup *mem;
668 	struct mem_cgroup_per_zone *mz;
669 	unsigned long flags;
670 
671 	if (mem_cgroup_subsys.disabled)
672 		return;
673 
674 	/*
675 	 * Check if our page_cgroup is valid
676 	 */
677 	lock_page_cgroup(page);
678 	pc = page_get_page_cgroup(page);
679 	if (!pc)
680 		goto unlock;
681 
682 	VM_BUG_ON(pc->page != page);
683 	VM_BUG_ON(pc->ref_cnt <= 0);
684 
685 	if (--(pc->ref_cnt) == 0) {
686 		mz = page_cgroup_zoneinfo(pc);
687 		spin_lock_irqsave(&mz->lru_lock, flags);
688 		__mem_cgroup_remove_list(pc);
689 		spin_unlock_irqrestore(&mz->lru_lock, flags);
690 
691 		page_assign_page_cgroup(page, NULL);
692 		unlock_page_cgroup(page);
693 
694 		mem = pc->mem_cgroup;
695 		res_counter_uncharge(&mem->res, PAGE_SIZE);
696 		css_put(&mem->css);
697 
698 		kfree(pc);
699 		return;
700 	}
701 
702 unlock:
703 	unlock_page_cgroup(page);
704 }
705 
706 /*
707  * Returns non-zero if a page (under migration) has valid page_cgroup member.
708  * Refcnt of page_cgroup is incremented.
709  */
710 int mem_cgroup_prepare_migration(struct page *page)
711 {
712 	struct page_cgroup *pc;
713 
714 	if (mem_cgroup_subsys.disabled)
715 		return 0;
716 
717 	lock_page_cgroup(page);
718 	pc = page_get_page_cgroup(page);
719 	if (pc)
720 		pc->ref_cnt++;
721 	unlock_page_cgroup(page);
722 	return pc != NULL;
723 }
724 
725 void mem_cgroup_end_migration(struct page *page)
726 {
727 	mem_cgroup_uncharge_page(page);
728 }
729 
730 /*
731  * We know both *page* and *newpage* are now not-on-LRU and PG_locked.
732  * And no race with uncharge() routines because page_cgroup for *page*
733  * has extra one reference by mem_cgroup_prepare_migration.
734  */
735 void mem_cgroup_page_migration(struct page *page, struct page *newpage)
736 {
737 	struct page_cgroup *pc;
738 	struct mem_cgroup_per_zone *mz;
739 	unsigned long flags;
740 
741 	lock_page_cgroup(page);
742 	pc = page_get_page_cgroup(page);
743 	if (!pc) {
744 		unlock_page_cgroup(page);
745 		return;
746 	}
747 
748 	mz = page_cgroup_zoneinfo(pc);
749 	spin_lock_irqsave(&mz->lru_lock, flags);
750 	__mem_cgroup_remove_list(pc);
751 	spin_unlock_irqrestore(&mz->lru_lock, flags);
752 
753 	page_assign_page_cgroup(page, NULL);
754 	unlock_page_cgroup(page);
755 
756 	pc->page = newpage;
757 	lock_page_cgroup(newpage);
758 	page_assign_page_cgroup(newpage, pc);
759 
760 	mz = page_cgroup_zoneinfo(pc);
761 	spin_lock_irqsave(&mz->lru_lock, flags);
762 	__mem_cgroup_add_list(pc);
763 	spin_unlock_irqrestore(&mz->lru_lock, flags);
764 
765 	unlock_page_cgroup(newpage);
766 }
767 
768 /*
769  * This routine traverse page_cgroup in given list and drop them all.
770  * This routine ignores page_cgroup->ref_cnt.
771  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
772  */
773 #define FORCE_UNCHARGE_BATCH	(128)
774 static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
775 			    struct mem_cgroup_per_zone *mz,
776 			    int active)
777 {
778 	struct page_cgroup *pc;
779 	struct page *page;
780 	int count = FORCE_UNCHARGE_BATCH;
781 	unsigned long flags;
782 	struct list_head *list;
783 
784 	if (active)
785 		list = &mz->active_list;
786 	else
787 		list = &mz->inactive_list;
788 
789 	spin_lock_irqsave(&mz->lru_lock, flags);
790 	while (!list_empty(list)) {
791 		pc = list_entry(list->prev, struct page_cgroup, lru);
792 		page = pc->page;
793 		get_page(page);
794 		spin_unlock_irqrestore(&mz->lru_lock, flags);
795 		mem_cgroup_uncharge_page(page);
796 		put_page(page);
797 		if (--count <= 0) {
798 			count = FORCE_UNCHARGE_BATCH;
799 			cond_resched();
800 		}
801 		spin_lock_irqsave(&mz->lru_lock, flags);
802 	}
803 	spin_unlock_irqrestore(&mz->lru_lock, flags);
804 }
805 
806 /*
807  * make mem_cgroup's charge to be 0 if there is no task.
808  * This enables deleting this mem_cgroup.
809  */
810 static int mem_cgroup_force_empty(struct mem_cgroup *mem)
811 {
812 	int ret = -EBUSY;
813 	int node, zid;
814 
815 	if (mem_cgroup_subsys.disabled)
816 		return 0;
817 
818 	css_get(&mem->css);
819 	/*
820 	 * page reclaim code (kswapd etc..) will move pages between
821 	 * active_list <-> inactive_list while we don't take a lock.
822 	 * So, we have to do loop here until all lists are empty.
823 	 */
824 	while (mem->res.usage > 0) {
825 		if (atomic_read(&mem->css.cgroup->count) > 0)
826 			goto out;
827 		for_each_node_state(node, N_POSSIBLE)
828 			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
829 				struct mem_cgroup_per_zone *mz;
830 				mz = mem_cgroup_zoneinfo(mem, node, zid);
831 				/* drop all page_cgroup in active_list */
832 				mem_cgroup_force_empty_list(mem, mz, 1);
833 				/* drop all page_cgroup in inactive_list */
834 				mem_cgroup_force_empty_list(mem, mz, 0);
835 			}
836 	}
837 	ret = 0;
838 out:
839 	css_put(&mem->css);
840 	return ret;
841 }
842 
843 static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
844 {
845 	*tmp = memparse(buf, &buf);
846 	if (*buf != '\0')
847 		return -EINVAL;
848 
849 	/*
850 	 * Round up the value to the closest page size
851 	 */
852 	*tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
853 	return 0;
854 }
855 
856 static ssize_t mem_cgroup_read(struct cgroup *cont,
857 			struct cftype *cft, struct file *file,
858 			char __user *userbuf, size_t nbytes, loff_t *ppos)
859 {
860 	return res_counter_read(&mem_cgroup_from_cont(cont)->res,
861 				cft->private, userbuf, nbytes, ppos,
862 				NULL);
863 }
864 
865 static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
866 				struct file *file, const char __user *userbuf,
867 				size_t nbytes, loff_t *ppos)
868 {
869 	return res_counter_write(&mem_cgroup_from_cont(cont)->res,
870 				cft->private, userbuf, nbytes, ppos,
871 				mem_cgroup_write_strategy);
872 }
873 
874 static ssize_t mem_force_empty_write(struct cgroup *cont,
875 				struct cftype *cft, struct file *file,
876 				const char __user *userbuf,
877 				size_t nbytes, loff_t *ppos)
878 {
879 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
880 	int ret = mem_cgroup_force_empty(mem);
881 	if (!ret)
882 		ret = nbytes;
883 	return ret;
884 }
885 
886 /*
887  * Note: This should be removed if cgroup supports write-only file.
888  */
889 static ssize_t mem_force_empty_read(struct cgroup *cont,
890 				struct cftype *cft,
891 				struct file *file, char __user *userbuf,
892 				size_t nbytes, loff_t *ppos)
893 {
894 	return -EINVAL;
895 }
896 
897 static const struct mem_cgroup_stat_desc {
898 	const char *msg;
899 	u64 unit;
900 } mem_cgroup_stat_desc[] = {
901 	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
902 	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
903 };
904 
905 static int mem_control_stat_show(struct seq_file *m, void *arg)
906 {
907 	struct cgroup *cont = m->private;
908 	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
909 	struct mem_cgroup_stat *stat = &mem_cont->stat;
910 	int i;
911 
912 	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
913 		s64 val;
914 
915 		val = mem_cgroup_read_stat(stat, i);
916 		val *= mem_cgroup_stat_desc[i].unit;
917 		seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg,
918 				(long long)val);
919 	}
920 	/* showing # of active pages */
921 	{
922 		unsigned long active, inactive;
923 
924 		inactive = mem_cgroup_get_all_zonestat(mem_cont,
925 						MEM_CGROUP_ZSTAT_INACTIVE);
926 		active = mem_cgroup_get_all_zonestat(mem_cont,
927 						MEM_CGROUP_ZSTAT_ACTIVE);
928 		seq_printf(m, "active %ld\n", (active) * PAGE_SIZE);
929 		seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE);
930 	}
931 	return 0;
932 }
933 
934 static const struct file_operations mem_control_stat_file_operations = {
935 	.read = seq_read,
936 	.llseek = seq_lseek,
937 	.release = single_release,
938 };
939 
940 static int mem_control_stat_open(struct inode *unused, struct file *file)
941 {
942 	/* XXX __d_cont */
943 	struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
944 
945 	file->f_op = &mem_control_stat_file_operations;
946 	return single_open(file, mem_control_stat_show, cont);
947 }
948 
949 static struct cftype mem_cgroup_files[] = {
950 	{
951 		.name = "usage_in_bytes",
952 		.private = RES_USAGE,
953 		.read = mem_cgroup_read,
954 	},
955 	{
956 		.name = "limit_in_bytes",
957 		.private = RES_LIMIT,
958 		.write = mem_cgroup_write,
959 		.read = mem_cgroup_read,
960 	},
961 	{
962 		.name = "failcnt",
963 		.private = RES_FAILCNT,
964 		.read = mem_cgroup_read,
965 	},
966 	{
967 		.name = "force_empty",
968 		.write = mem_force_empty_write,
969 		.read = mem_force_empty_read,
970 	},
971 	{
972 		.name = "stat",
973 		.open = mem_control_stat_open,
974 	},
975 };
976 
977 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
978 {
979 	struct mem_cgroup_per_node *pn;
980 	struct mem_cgroup_per_zone *mz;
981 	int zone, tmp = node;
982 	/*
983 	 * This routine is called against possible nodes.
984 	 * But it's BUG to call kmalloc() against offline node.
985 	 *
986 	 * TODO: this routine can waste much memory for nodes which will
987 	 *       never be onlined. It's better to use memory hotplug callback
988 	 *       function.
989 	 */
990 	if (!node_state(node, N_NORMAL_MEMORY))
991 		tmp = -1;
992 	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
993 	if (!pn)
994 		return 1;
995 
996 	mem->info.nodeinfo[node] = pn;
997 	memset(pn, 0, sizeof(*pn));
998 
999 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
1000 		mz = &pn->zoneinfo[zone];
1001 		INIT_LIST_HEAD(&mz->active_list);
1002 		INIT_LIST_HEAD(&mz->inactive_list);
1003 		spin_lock_init(&mz->lru_lock);
1004 	}
1005 	return 0;
1006 }
1007 
1008 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1009 {
1010 	kfree(mem->info.nodeinfo[node]);
1011 }
1012 
1013 static struct cgroup_subsys_state *
1014 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
1015 {
1016 	struct mem_cgroup *mem;
1017 	int node;
1018 
1019 	if (unlikely((cont->parent) == NULL)) {
1020 		mem = &init_mem_cgroup;
1021 		init_mm.mem_cgroup = mem;
1022 	} else
1023 		mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
1024 
1025 	if (mem == NULL)
1026 		return ERR_PTR(-ENOMEM);
1027 
1028 	res_counter_init(&mem->res);
1029 
1030 	memset(&mem->info, 0, sizeof(mem->info));
1031 
1032 	for_each_node_state(node, N_POSSIBLE)
1033 		if (alloc_mem_cgroup_per_zone_info(mem, node))
1034 			goto free_out;
1035 
1036 	return &mem->css;
1037 free_out:
1038 	for_each_node_state(node, N_POSSIBLE)
1039 		free_mem_cgroup_per_zone_info(mem, node);
1040 	if (cont->parent != NULL)
1041 		kfree(mem);
1042 	return ERR_PTR(-ENOMEM);
1043 }
1044 
1045 static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
1046 					struct cgroup *cont)
1047 {
1048 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1049 	mem_cgroup_force_empty(mem);
1050 }
1051 
1052 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
1053 				struct cgroup *cont)
1054 {
1055 	int node;
1056 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1057 
1058 	for_each_node_state(node, N_POSSIBLE)
1059 		free_mem_cgroup_per_zone_info(mem, node);
1060 
1061 	kfree(mem_cgroup_from_cont(cont));
1062 }
1063 
1064 static int mem_cgroup_populate(struct cgroup_subsys *ss,
1065 				struct cgroup *cont)
1066 {
1067 	if (mem_cgroup_subsys.disabled)
1068 		return 0;
1069 	return cgroup_add_files(cont, ss, mem_cgroup_files,
1070 					ARRAY_SIZE(mem_cgroup_files));
1071 }
1072 
1073 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1074 				struct cgroup *cont,
1075 				struct cgroup *old_cont,
1076 				struct task_struct *p)
1077 {
1078 	struct mm_struct *mm;
1079 	struct mem_cgroup *mem, *old_mem;
1080 
1081 	if (mem_cgroup_subsys.disabled)
1082 		return;
1083 
1084 	mm = get_task_mm(p);
1085 	if (mm == NULL)
1086 		return;
1087 
1088 	mem = mem_cgroup_from_cont(cont);
1089 	old_mem = mem_cgroup_from_cont(old_cont);
1090 
1091 	if (mem == old_mem)
1092 		goto out;
1093 
1094 	/*
1095 	 * Only thread group leaders are allowed to migrate, the mm_struct is
1096 	 * in effect owned by the leader
1097 	 */
1098 	if (!thread_group_leader(p))
1099 		goto out;
1100 
1101 	css_get(&mem->css);
1102 	rcu_assign_pointer(mm->mem_cgroup, mem);
1103 	css_put(&old_mem->css);
1104 
1105 out:
1106 	mmput(mm);
1107 }
1108 
1109 struct cgroup_subsys mem_cgroup_subsys = {
1110 	.name = "memory",
1111 	.subsys_id = mem_cgroup_subsys_id,
1112 	.create = mem_cgroup_create,
1113 	.pre_destroy = mem_cgroup_pre_destroy,
1114 	.destroy = mem_cgroup_destroy,
1115 	.populate = mem_cgroup_populate,
1116 	.attach = mem_cgroup_move_task,
1117 	.early_init = 0,
1118 };
1119