xref: /openbmc/linux/block/blk-cgroup.c (revision 1fa6ac37)
1 /*
2  * Common Block IO controller cgroup interface
3  *
4  * Based on ideas and code from CFQ, CFS and BFQ:
5  * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6  *
7  * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8  *		      Paolo Valente <paolo.valente@unimore.it>
9  *
10  * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
11  *		      Nauman Rafique <nauman@google.com>
12  */
13 #include <linux/ioprio.h>
14 #include <linux/seq_file.h>
15 #include <linux/kdev_t.h>
16 #include <linux/module.h>
17 #include <linux/err.h>
18 #include <linux/blkdev.h>
19 #include <linux/slab.h>
20 #include "blk-cgroup.h"
21 #include <linux/genhd.h>
22 
23 #define MAX_KEY_LEN 100
24 
25 static DEFINE_SPINLOCK(blkio_list_lock);
26 static LIST_HEAD(blkio_list);
27 
28 struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
29 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
30 
31 static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
32 						  struct cgroup *);
33 static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
34 			      struct task_struct *, bool);
35 static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
36 			   struct cgroup *, struct task_struct *, bool);
37 static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
38 static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
39 
40 struct cgroup_subsys blkio_subsys = {
41 	.name = "blkio",
42 	.create = blkiocg_create,
43 	.can_attach = blkiocg_can_attach,
44 	.attach = blkiocg_attach,
45 	.destroy = blkiocg_destroy,
46 	.populate = blkiocg_populate,
47 #ifdef CONFIG_BLK_CGROUP
48 	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
49 	.subsys_id = blkio_subsys_id,
50 #endif
51 	.use_id = 1,
52 	.module = THIS_MODULE,
53 };
54 EXPORT_SYMBOL_GPL(blkio_subsys);
55 
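/* Must be called with blkcg->lock held */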
56 static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
57 					    struct blkio_policy_node *pn)
58 {
59 	list_add(&pn->node, &blkcg->policy_list);
60 }
61 
62 /* Must be called with blkcg->lock held */
63 static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
64 {
65 	list_del(&pn->node);
66 }
67 
68 /* Must be called with blkcg->lock held */
69 static struct blkio_policy_node *
70 blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
71 {
72 	struct blkio_policy_node *pn;
73 
74 	list_for_each_entry(pn, &blkcg->policy_list, node) {
75 		if (pn->dev == dev)
76 			return pn;
77 	}
78 
79 	return NULL;
80 }
81 
82 struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
83 {
84 	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
85 			    struct blkio_cgroup, css);
86 }
87 EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
88 
89 /*
90  * Add to the appropriate stat variable depending on the request type.
91  * This should be called with the blkg->stats_lock held.
92  */
93 static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
94 				bool sync)
95 {
96 	if (direction)
97 		stat[BLKIO_STAT_WRITE] += add;
98 	else
99 		stat[BLKIO_STAT_READ] += add;
100 	if (sync)
101 		stat[BLKIO_STAT_SYNC] += add;
102 	else
103 		stat[BLKIO_STAT_ASYNC] += add;
104 }
105 
106 /*
107  * Decrements the appropriate stat variable if non-zero depending on the
108  * request type. BUGs if the counter is already zero.
109  * This should be called with the blkg->stats_lock held.
110  */
111 static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
112 {
113 	if (direction) {
114 		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
115 		stat[BLKIO_STAT_WRITE]--;
116 	} else {
117 		BUG_ON(stat[BLKIO_STAT_READ] == 0);
118 		stat[BLKIO_STAT_READ]--;
119 	}
120 	if (sync) {
121 		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
122 		stat[BLKIO_STAT_SYNC]--;
123 	} else {
124 		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
125 		stat[BLKIO_STAT_ASYNC]--;
126 	}
127 }
128 
129 #ifdef CONFIG_DEBUG_BLK_CGROUP
130 /* This should be called with the blkg->stats_lock held. */
131 static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
132 						struct blkio_group *curr_blkg)
133 {
134 	if (blkio_blkg_waiting(&blkg->stats))
135 		return;
136 	if (blkg == curr_blkg)
137 		return;
138 	blkg->stats.start_group_wait_time = sched_clock();
139 	blkio_mark_blkg_waiting(&blkg->stats);
140 }
141 
142 /* This should be called with the blkg->stats_lock held. */
143 static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
144 {
145 	unsigned long long now;
146 
147 	if (!blkio_blkg_waiting(stats))
148 		return;
149 
150 	now = sched_clock();
151 	if (time_after64(now, stats->start_group_wait_time))
152 		stats->group_wait_time += now - stats->start_group_wait_time;
153 	blkio_clear_blkg_waiting(stats);
154 }
155 
156 /* This should be called with the blkg->stats_lock held. */
157 static void blkio_end_empty_time(struct blkio_group_stats *stats)
158 {
159 	unsigned long long now;
160 
161 	if (!blkio_blkg_empty(stats))
162 		return;
163 
164 	now = sched_clock();
165 	if (time_after64(now, stats->start_empty_time))
166 		stats->empty_time += now - stats->start_empty_time;
167 	blkio_clear_blkg_empty(stats);
168 }
169 
170 void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
171 {
172 	unsigned long flags;
173 
174 	spin_lock_irqsave(&blkg->stats_lock, flags);
175 	BUG_ON(blkio_blkg_idling(&blkg->stats));
176 	blkg->stats.start_idle_time = sched_clock();
177 	blkio_mark_blkg_idling(&blkg->stats);
178 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
179 }
180 EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
181 
182 void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
183 {
184 	unsigned long flags;
185 	unsigned long long now;
186 	struct blkio_group_stats *stats;
187 
188 	spin_lock_irqsave(&blkg->stats_lock, flags);
189 	stats = &blkg->stats;
190 	if (blkio_blkg_idling(stats)) {
191 		now = sched_clock();
192 		if (time_after64(now, stats->start_idle_time))
193 			stats->idle_time += now - stats->start_idle_time;
194 		blkio_clear_blkg_idling(stats);
195 	}
196 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
197 }
198 EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
199 
200 void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
201 {
202 	unsigned long flags;
203 	struct blkio_group_stats *stats;
204 
205 	spin_lock_irqsave(&blkg->stats_lock, flags);
206 	stats = &blkg->stats;
207 	stats->avg_queue_size_sum +=
208 			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
209 			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
210 	stats->avg_queue_size_samples++;
211 	blkio_update_group_wait_time(stats);
212 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
213 }
214 EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
215 
216 void blkiocg_set_start_empty_time(struct blkio_group *blkg)
217 {
218 	unsigned long flags;
219 	struct blkio_group_stats *stats;
220 
221 	spin_lock_irqsave(&blkg->stats_lock, flags);
222 	stats = &blkg->stats;
223 
224 	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
225 			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
226 		spin_unlock_irqrestore(&blkg->stats_lock, flags);
227 		return;
228 	}
229 
230 	/*
231 	 * The group is already marked empty. This can happen if a cfqq got a
232 	 * new request in the parent group and moved to this group while being
233 	 * added to the service tree. Just ignore the event and move on.
234 	 */
235 	if (blkio_blkg_empty(stats)) {
236 		spin_unlock_irqrestore(&blkg->stats_lock, flags);
237 		return;
238 	}
239 
240 	stats->start_empty_time = sched_clock();
241 	blkio_mark_blkg_empty(stats);
242 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
243 }
244 EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
245 
246 void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
247 			unsigned long dequeue)
248 {
249 	blkg->stats.dequeue += dequeue;
250 }
251 EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
252 #else
253 static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
254 					struct blkio_group *curr_blkg) {}
255 static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
256 #endif
257 
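/*
 * Account a request being queued to blkg: bump the QUEUED counters and,
 * when CONFIG_DEBUG_BLK_CGROUP is set, end any empty period and start the
 * group wait clock if blkg is not the currently active group.
 */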
258 void blkiocg_update_io_add_stats(struct blkio_group *blkg,
259 			struct blkio_group *curr_blkg, bool direction,
260 			bool sync)
261 {
262 	unsigned long flags;
263 
264 	spin_lock_irqsave(&blkg->stats_lock, flags);
265 	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
266 			sync);
267 	blkio_end_empty_time(&blkg->stats);
268 	blkio_set_start_group_wait_time(blkg, curr_blkg);
269 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
270 }
271 EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
272 
273 void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
274 						bool direction, bool sync)
275 {
276 	unsigned long flags;
277 
278 	spin_lock_irqsave(&blkg->stats_lock, flags);
279 	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
280 					direction, sync);
281 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
282 }
283 EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
284 
285 void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
286 {
287 	unsigned long flags;
288 
289 	spin_lock_irqsave(&blkg->stats_lock, flags);
290 	blkg->stats.time += time;
291 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
292 }
293 EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
294 
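/*
 * Account a request being dispatched from blkg: add the dispatched bytes
 * to the sector count (in 512-byte units) and update the SERVICED and
 * SERVICE_BYTES counters for the given direction and sync flags.
 */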
295 void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
296 				uint64_t bytes, bool direction, bool sync)
297 {
298 	struct blkio_group_stats *stats;
299 	unsigned long flags;
300 
301 	spin_lock_irqsave(&blkg->stats_lock, flags);
302 	stats = &blkg->stats;
303 	stats->sectors += bytes >> 9;
304 	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
305 			sync);
306 	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
307 			direction, sync);
308 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
309 }
310 EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
311 
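/*
 * Account a request completion: service time is measured from dispatch
 * (io_start_time) to now, wait time from submission (start_time) to
 * dispatch.
 */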
312 void blkiocg_update_completion_stats(struct blkio_group *blkg,
313 	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
314 {
315 	struct blkio_group_stats *stats;
316 	unsigned long flags;
317 	unsigned long long now = sched_clock();
318 
319 	spin_lock_irqsave(&blkg->stats_lock, flags);
320 	stats = &blkg->stats;
321 	if (time_after64(now, io_start_time))
322 		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
323 				now - io_start_time, direction, sync);
324 	if (time_after64(io_start_time, start_time))
325 		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
326 				io_start_time - start_time, direction, sync);
327 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
328 }
329 EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
330 
331 void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
332 					bool sync)
333 {
334 	unsigned long flags;
335 
336 	spin_lock_irqsave(&blkg->stats_lock, flags);
337 	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
338 			sync);
339 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
340 }
341 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
342 
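/*
 * Link blkg into blkcg's group list under blkcg->lock. The opaque key is
 * what blkiocg_lookup_group() later matches on; the cgroup path and dev
 * are recorded for stat reporting.
 */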
343 void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
344 			struct blkio_group *blkg, void *key, dev_t dev)
345 {
346 	unsigned long flags;
347 
348 	spin_lock_irqsave(&blkcg->lock, flags);
349 	spin_lock_init(&blkg->stats_lock);
350 	rcu_assign_pointer(blkg->key, key);
351 	blkg->blkcg_id = css_id(&blkcg->css);
352 	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
353 	spin_unlock_irqrestore(&blkcg->lock, flags);
354 	/* Need to take a css reference? */
355 	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
356 	blkg->dev = dev;
357 }
358 EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
359 
360 static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
361 {
362 	hlist_del_init_rcu(&blkg->blkcg_node);
363 	blkg->blkcg_id = 0;
364 }
365 
366 /*
367  * Returns 0 if the blkio_group was still on the cgroup list. Returns 1 if
368  * the blkio_group had already been unhashed by the time we got to it.
369  */
370 int blkiocg_del_blkio_group(struct blkio_group *blkg)
371 {
372 	struct blkio_cgroup *blkcg;
373 	unsigned long flags;
374 	struct cgroup_subsys_state *css;
375 	int ret = 1;
376 
377 	rcu_read_lock();
378 	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
379 	if (css) {
380 		blkcg = container_of(css, struct blkio_cgroup, css);
381 		spin_lock_irqsave(&blkcg->lock, flags);
382 		if (!hlist_unhashed(&blkg->blkcg_node)) {
383 			__blkiocg_del_blkio_group(blkg);
384 			ret = 0;
385 		}
386 		spin_unlock_irqrestore(&blkcg->lock, flags);
387 	}
388 
389 	rcu_read_unlock();
390 	return ret;
391 }
392 EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
393 
394 /* called under rcu_read_lock(). */
395 struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
396 {
397 	struct blkio_group *blkg;
398 	struct hlist_node *n;
399 	void *__key;
400 
401 	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
402 		__key = blkg->key;
403 		if (__key == key)
404 			return blkg;
405 	}
406 
407 	return NULL;
408 }
409 EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
410 
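/*
 * Generate a read_u64 handler that returns the blkio_cgroup field named
 * by __VAR.
 */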
411 #define SHOW_FUNCTION(__VAR)						\
412 static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
413 				       struct cftype *cftype)		\
414 {									\
415 	struct blkio_cgroup *blkcg;					\
416 									\
417 	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
418 	return (u64)blkcg->__VAR;					\
419 }
420 
421 SHOW_FUNCTION(weight);
422 #undef SHOW_FUNCTION
423 
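/*
 * Update the cgroup's default weight and propagate it to every group that
 * does not have a per-device weight override.
 */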
424 static int
425 blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
426 {
427 	struct blkio_cgroup *blkcg;
428 	struct blkio_group *blkg;
429 	struct hlist_node *n;
430 	struct blkio_policy_type *blkiop;
431 	struct blkio_policy_node *pn;
432 
433 	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
434 		return -EINVAL;
435 
436 	blkcg = cgroup_to_blkio_cgroup(cgroup);
437 	spin_lock(&blkio_list_lock);
438 	spin_lock_irq(&blkcg->lock);
439 	blkcg->weight = (unsigned int)val;
440 
441 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
442 		pn = blkio_policy_search_node(blkcg, blkg->dev);
443 
444 		if (pn)
445 			continue;
446 
447 		list_for_each_entry(blkiop, &blkio_list, list)
448 			blkiop->ops.blkio_update_group_weight_fn(blkg,
449 					blkcg->weight);
450 	}
451 	spin_unlock_irq(&blkcg->lock);
452 	spin_unlock(&blkio_list_lock);
453 	return 0;
454 }
455 
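/*
 * Clear all per-group stats for this cgroup. The QUEUED counters (and, in
 * debug builds, the idling/waiting/empty state with fresh start times) are
 * preserved across the reset because requests are still queued.
 */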
456 static int
457 blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
458 {
459 	struct blkio_cgroup *blkcg;
460 	struct blkio_group *blkg;
461 	struct blkio_group_stats *stats;
462 	struct hlist_node *n;
463 	uint64_t queued[BLKIO_STAT_TOTAL];
464 	int i;
465 #ifdef CONFIG_DEBUG_BLK_CGROUP
466 	bool idling, waiting, empty;
467 	unsigned long long now = sched_clock();
468 #endif
469 
470 	blkcg = cgroup_to_blkio_cgroup(cgroup);
471 	spin_lock_irq(&blkcg->lock);
472 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
473 		spin_lock(&blkg->stats_lock);
474 		stats = &blkg->stats;
475 #ifdef CONFIG_DEBUG_BLK_CGROUP
476 		idling = blkio_blkg_idling(stats);
477 		waiting = blkio_blkg_waiting(stats);
478 		empty = blkio_blkg_empty(stats);
479 #endif
480 		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
481 			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
482 		memset(stats, 0, sizeof(struct blkio_group_stats));
483 		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
484 			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
485 #ifdef CONFIG_DEBUG_BLK_CGROUP
486 		if (idling) {
487 			blkio_mark_blkg_idling(stats);
488 			stats->start_idle_time = now;
489 		}
490 		if (waiting) {
491 			blkio_mark_blkg_waiting(stats);
492 			stats->start_group_wait_time = now;
493 		}
494 		if (empty) {
495 			blkio_mark_blkg_empty(stats);
496 			stats->start_empty_time = now;
497 		}
498 #endif
499 		spin_unlock(&blkg->stats_lock);
500 	}
501 	spin_unlock_irq(&blkcg->lock);
502 	return 0;
503 }
504 
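/*
 * Build the "major:minor <Read|Write|Sync|Async|Total>" key string used
 * when filling the cgroup map; with diskname_only set, only the device
 * number is emitted.
 */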
505 static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
506 				int chars_left, bool diskname_only)
507 {
508 	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
509 	chars_left -= strlen(str);
510 	if (chars_left <= 0) {
511 		printk(KERN_WARNING
512 			"Possibly incorrect cgroup stat display format\n");
513 		return;
514 	}
515 	if (diskname_only)
516 		return;
517 	switch (type) {
518 	case BLKIO_STAT_READ:
519 		strlcat(str, " Read", chars_left);
520 		break;
521 	case BLKIO_STAT_WRITE:
522 		strlcat(str, " Write", chars_left);
523 		break;
524 	case BLKIO_STAT_SYNC:
525 		strlcat(str, " Sync", chars_left);
526 		break;
527 	case BLKIO_STAT_ASYNC:
528 		strlcat(str, " Async", chars_left);
529 		break;
530 	case BLKIO_STAT_TOTAL:
531 		strlcat(str, " Total", chars_left);
532 		break;
533 	default:
534 		strlcat(str, " Invalid", chars_left);
535 	}
536 }
537 
538 static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
539 				struct cgroup_map_cb *cb, dev_t dev)
540 {
541 	blkio_get_key_name(0, dev, str, chars_left, true);
542 	cb->fill(cb, str, val);
543 	return val;
544 }
545 
546 /* This should be called with blkg->stats_lock held */
547 static uint64_t blkio_get_stat(struct blkio_group *blkg,
548 		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
549 {
550 	uint64_t disk_total;
551 	char key_str[MAX_KEY_LEN];
552 	enum stat_sub_type sub_type;
553 
554 	if (type == BLKIO_STAT_TIME)
555 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
556 					blkg->stats.time, cb, dev);
557 	if (type == BLKIO_STAT_SECTORS)
558 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
559 					blkg->stats.sectors, cb, dev);
560 #ifdef CONFIG_DEBUG_BLK_CGROUP
561 	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
562 		uint64_t sum = blkg->stats.avg_queue_size_sum;
563 		uint64_t samples = blkg->stats.avg_queue_size_samples;
564 		if (samples)
565 			do_div(sum, samples);
566 		else
567 			sum = 0;
568 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
569 	}
570 	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
571 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
572 					blkg->stats.group_wait_time, cb, dev);
573 	if (type == BLKIO_STAT_IDLE_TIME)
574 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
575 					blkg->stats.idle_time, cb, dev);
576 	if (type == BLKIO_STAT_EMPTY_TIME)
577 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
578 					blkg->stats.empty_time, cb, dev);
579 	if (type == BLKIO_STAT_DEQUEUE)
580 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
581 					blkg->stats.dequeue, cb, dev);
582 #endif
583 
584 	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
585 			sub_type++) {
586 		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
587 		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
588 	}
589 	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
590 			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
591 	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
592 	cb->fill(cb, key_str, disk_total);
593 	return disk_total;
594 }
595 
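/*
 * Generate a read_map handler that dumps the given stat type for every
 * device (blkio_group) in the cgroup, optionally followed by a Total line.
 */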
596 #define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total)		\
597 static int blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
598 		struct cftype *cftype, struct cgroup_map_cb *cb)	\
599 {									\
600 	struct blkio_cgroup *blkcg;					\
601 	struct blkio_group *blkg;					\
602 	struct hlist_node *n;						\
603 	uint64_t cgroup_total = 0;					\
604 									\
605 	if (!cgroup_lock_live_group(cgroup))				\
606 		return -ENODEV;						\
607 									\
608 	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
609 	rcu_read_lock();						\
610 	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
611 		if (blkg->dev) {					\
612 			spin_lock_irq(&blkg->stats_lock);		\
613 			cgroup_total += blkio_get_stat(blkg, cb,	\
614 						blkg->dev, type);	\
615 			spin_unlock_irq(&blkg->stats_lock);		\
616 		}							\
617 	}								\
618 	if (show_total)							\
619 		cb->fill(cb, "Total", cgroup_total);			\
620 	rcu_read_unlock();						\
621 	cgroup_unlock();						\
622 	return 0;							\
623 }
624 
625 SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
626 SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
627 SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
628 SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
629 SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
630 SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
631 SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
632 SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
633 #ifdef CONFIG_DEBUG_BLK_CGROUP
634 SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
635 SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
636 SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
637 SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
638 SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
639 #endif
640 #undef SHOW_FUNCTION_PER_GROUP
641 
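/* Return 0 if dev refers to a whole registered disk, -ENODEV otherwise. */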
642 static int blkio_check_dev_num(dev_t dev)
643 {
644 	int part = 0;
645 	struct gendisk *disk;
646 
647 	disk = get_gendisk(dev, &part);
648 	if (!disk || part)
649 		return -ENODEV;
650 
651 	return 0;
652 }
653 
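/*
 * Parse a "major:minor weight" rule into newpn. A weight of 0 is accepted
 * and is later interpreted as deleting the per-device rule.
 */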
654 static int blkio_policy_parse_and_set(char *buf,
655 				      struct blkio_policy_node *newpn)
656 {
657 	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
658 	int ret;
659 	unsigned long major, minor, temp;
660 	int i = 0;
661 	dev_t dev;
662 
663 	memset(s, 0, sizeof(s));
664 
665 	while ((p = strsep(&buf, " ")) != NULL) {
666 		if (!*p)
667 			continue;
668 
669 		s[i++] = p;
670 
671 		/* Prevent too many fields from being input */
672 		if (i == 3)
673 			break;
674 	}
675 
676 	if (i != 2)
677 		return -EINVAL;
678 
679 	p = strsep(&s[0], ":");
680 	if (p != NULL)
681 		major_s = p;
682 	else
683 		return -EINVAL;
684 
685 	minor_s = s[0];
686 	if (!minor_s)
687 		return -EINVAL;
688 
689 	ret = strict_strtoul(major_s, 10, &major);
690 	if (ret)
691 		return -EINVAL;
692 
693 	ret = strict_strtoul(minor_s, 10, &minor);
694 	if (ret)
695 		return -EINVAL;
696 
697 	dev = MKDEV(major, minor);
698 
699 	ret = blkio_check_dev_num(dev);
700 	if (ret)
701 		return ret;
702 
703 	newpn->dev = dev;
704 
705 	if (s[1] == NULL)
706 		return -EINVAL;
707 
708 	ret = strict_strtoul(s[1], 10, &temp);
709 	if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
710 	    temp > BLKIO_WEIGHT_MAX)
711 		return -EINVAL;
712 
713 	newpn->weight = temp;
714 
715 	return 0;
716 }
717 
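/* Return the per-device weight for dev if set, else the cgroup default. */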
718 unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
719 			      dev_t dev)
720 {
721 	struct blkio_policy_node *pn;
722 
723 	pn = blkio_policy_search_node(blkcg, dev);
724 	if (pn)
725 		return pn->weight;
726 	else
727 		return blkcg->weight;
728 }
729 EXPORT_SYMBOL_GPL(blkcg_get_weight);
730 
731 
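/*
 * Handle writes to blkio.weight_device: add, update or (for weight 0)
 * remove a per-device rule, then push the effective weight to the matching
 * blkio_group of every registered policy.
 */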
732 static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
733 				       const char *buffer)
734 {
735 	int ret = 0;
736 	char *buf;
737 	struct blkio_policy_node *newpn, *pn;
738 	struct blkio_cgroup *blkcg;
739 	struct blkio_group *blkg;
740 	int keep_newpn = 0;
741 	struct hlist_node *n;
742 	struct blkio_policy_type *blkiop;
743 
744 	buf = kstrdup(buffer, GFP_KERNEL);
745 	if (!buf)
746 		return -ENOMEM;
747 
748 	newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
749 	if (!newpn) {
750 		ret = -ENOMEM;
751 		goto free_buf;
752 	}
753 
754 	ret = blkio_policy_parse_and_set(buf, newpn);
755 	if (ret)
756 		goto free_newpn;
757 
758 	blkcg = cgroup_to_blkio_cgroup(cgrp);
759 
760 	spin_lock_irq(&blkcg->lock);
761 
762 	pn = blkio_policy_search_node(blkcg, newpn->dev);
763 	if (!pn) {
764 		if (newpn->weight != 0) {
765 			blkio_policy_insert_node(blkcg, newpn);
766 			keep_newpn = 1;
767 		}
768 		spin_unlock_irq(&blkcg->lock);
769 		goto update_io_group;
770 	}
771 
772 	if (newpn->weight == 0) {
773 		/* weight == 0 means deleting the per-device weight rule */
774 		blkio_policy_delete_node(pn);
775 		spin_unlock_irq(&blkcg->lock);
776 		goto update_io_group;
777 	}
778 	spin_unlock_irq(&blkcg->lock);
779 
780 	pn->weight = newpn->weight;
781 
782 update_io_group:
783 	/* update weight for each cfqg */
784 	spin_lock(&blkio_list_lock);
785 	spin_lock_irq(&blkcg->lock);
786 
787 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
788 		if (newpn->dev == blkg->dev) {
789 			list_for_each_entry(blkiop, &blkio_list, list)
790 				blkiop->ops.blkio_update_group_weight_fn(blkg,
791 							 newpn->weight ?
792 							 newpn->weight :
793 							 blkcg->weight);
794 		}
795 	}
796 
797 	spin_unlock_irq(&blkcg->lock);
798 	spin_unlock(&blkio_list_lock);
799 
800 free_newpn:
801 	if (!keep_newpn)
802 		kfree(newpn);
803 free_buf:
804 	kfree(buf);
805 	return ret;
806 }
807 
808 static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
809 				      struct seq_file *m)
810 {
811 	struct blkio_cgroup *blkcg;
812 	struct blkio_policy_node *pn;
813 
814 	seq_printf(m, "dev\tweight\n");
815 
816 	blkcg = cgroup_to_blkio_cgroup(cgrp);
817 	if (!list_empty(&blkcg->policy_list)) {
818 		spin_lock_irq(&blkcg->lock);
819 		list_for_each_entry(pn, &blkcg->policy_list, node) {
820 			seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
821 				   MINOR(pn->dev), pn->weight);
822 		}
823 		spin_unlock_irq(&blkcg->lock);
824 	}
825 
826 	return 0;
827 }
828 
829 struct cftype blkio_files[] = {
830 	{
831 		.name = "weight_device",
832 		.read_seq_string = blkiocg_weight_device_read,
833 		.write_string = blkiocg_weight_device_write,
834 		.max_write_len = 256,
835 	},
836 	{
837 		.name = "weight",
838 		.read_u64 = blkiocg_weight_read,
839 		.write_u64 = blkiocg_weight_write,
840 	},
841 	{
842 		.name = "time",
843 		.read_map = blkiocg_time_read,
844 	},
845 	{
846 		.name = "sectors",
847 		.read_map = blkiocg_sectors_read,
848 	},
849 	{
850 		.name = "io_service_bytes",
851 		.read_map = blkiocg_io_service_bytes_read,
852 	},
853 	{
854 		.name = "io_serviced",
855 		.read_map = blkiocg_io_serviced_read,
856 	},
857 	{
858 		.name = "io_service_time",
859 		.read_map = blkiocg_io_service_time_read,
860 	},
861 	{
862 		.name = "io_wait_time",
863 		.read_map = blkiocg_io_wait_time_read,
864 	},
865 	{
866 		.name = "io_merged",
867 		.read_map = blkiocg_io_merged_read,
868 	},
869 	{
870 		.name = "io_queued",
871 		.read_map = blkiocg_io_queued_read,
872 	},
873 	{
874 		.name = "reset_stats",
875 		.write_u64 = blkiocg_reset_stats,
876 	},
877 #ifdef CONFIG_DEBUG_BLK_CGROUP
878 	{
879 		.name = "avg_queue_size",
880 		.read_map = blkiocg_avg_queue_size_read,
881 	},
882 	{
883 		.name = "group_wait_time",
884 		.read_map = blkiocg_group_wait_time_read,
885 	},
886 	{
887 		.name = "idle_time",
888 		.read_map = blkiocg_idle_time_read,
889 	},
890 	{
891 		.name = "empty_time",
892 		.read_map = blkiocg_empty_time_read,
893 	},
894 	{
895 		.name = "dequeue",
896 		.read_map = blkiocg_dequeue_read,
897 	},
898 #endif
899 };
900 
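/* Attach the blkio control files defined above to a new cgroup directory. */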
901 static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
902 {
903 	return cgroup_add_files(cgroup, subsys, blkio_files,
904 				ARRAY_SIZE(blkio_files));
905 }
906 
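/*
 * Tear down the cgroup: unlink every blkio_group, let each registered
 * policy drop it, then free the per-device weight rules and the css id.
 */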
907 static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
908 {
909 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
910 	unsigned long flags;
911 	struct blkio_group *blkg;
912 	void *key;
913 	struct blkio_policy_type *blkiop;
914 	struct blkio_policy_node *pn, *pntmp;
915 
916 	rcu_read_lock();
917 	do {
918 		spin_lock_irqsave(&blkcg->lock, flags);
919 
920 		if (hlist_empty(&blkcg->blkg_list)) {
921 			spin_unlock_irqrestore(&blkcg->lock, flags);
922 			break;
923 		}
924 
925 		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
926 					blkcg_node);
927 		key = rcu_dereference(blkg->key);
928 		__blkiocg_del_blkio_group(blkg);
929 
930 		spin_unlock_irqrestore(&blkcg->lock, flags);
931 
932 		/*
933 		 * This blkio_group is being unlinked as the associated cgroup is
934 		 * going away. Let all the IO controlling policies know about
935 		 * this event. Currently this is a static call to one IO
936 		 * controlling policy. Once we have more policies in place, we
937 		 * will need dynamic registration of callback functions.
938 		 */
939 		spin_lock(&blkio_list_lock);
940 		list_for_each_entry(blkiop, &blkio_list, list)
941 			blkiop->ops.blkio_unlink_group_fn(key, blkg);
942 		spin_unlock(&blkio_list_lock);
943 	} while (1);
944 
945 	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
946 		blkio_policy_delete_node(pn);
947 		kfree(pn);
948 	}
949 
950 	free_css_id(&blkio_subsys, &blkcg->css);
951 	rcu_read_unlock();
952 	if (blkcg != &blkio_root_cgroup)
953 		kfree(blkcg);
954 }
955 
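/*
 * Allocate a blkio_cgroup for the new cgroup (the root cgroup uses the
 * static blkio_root_cgroup) and initialise its lock, group list and
 * policy list.
 */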
956 static struct cgroup_subsys_state *
957 blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
958 {
959 	struct blkio_cgroup *blkcg;
960 	struct cgroup *parent = cgroup->parent;
961 
962 	if (!parent) {
963 		blkcg = &blkio_root_cgroup;
964 		goto done;
965 	}
966 
967 	/* Currently we do not support a hierarchy deeper than two levels (0,1) */
968 	if (parent != cgroup->top_cgroup)
969 		return ERR_PTR(-EINVAL);
970 
971 	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
972 	if (!blkcg)
973 		return ERR_PTR(-ENOMEM);
974 
975 	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
976 done:
977 	spin_lock_init(&blkcg->lock);
978 	INIT_HLIST_HEAD(&blkcg->blkg_list);
979 
980 	INIT_LIST_HEAD(&blkcg->policy_list);
981 	return &blkcg->css;
982 }
983 
984 /*
985  * We cannot support shared io contexts, as we have no means to support
986  * two tasks with the same ioc in two different groups without major rework
987  * of the main cic data structures.  For now we allow a task to change
988  * its cgroup only if it's the only owner of its ioc.
989  */
990 static int blkiocg_can_attach(struct cgroup_subsys *subsys,
991 				struct cgroup *cgroup, struct task_struct *tsk,
992 				bool threadgroup)
993 {
994 	struct io_context *ioc;
995 	int ret = 0;
996 
997 	/* task_lock() is needed to avoid races with exit_io_context() */
998 	task_lock(tsk);
999 	ioc = tsk->io_context;
1000 	if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1001 		ret = -EINVAL;
1002 	task_unlock(tsk);
1003 
1004 	return ret;
1005 }
1006 
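/* Mark the task's io_context as having changed cgroups. */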
1007 static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
1008 				struct cgroup *prev, struct task_struct *tsk,
1009 				bool threadgroup)
1010 {
1011 	struct io_context *ioc;
1012 
1013 	task_lock(tsk);
1014 	ioc = tsk->io_context;
1015 	if (ioc)
1016 		ioc->cgroup_changed = 1;
1017 	task_unlock(tsk);
1018 }
1019 
1020 void blkio_policy_register(struct blkio_policy_type *blkiop)
1021 {
1022 	spin_lock(&blkio_list_lock);
1023 	list_add_tail(&blkiop->list, &blkio_list);
1024 	spin_unlock(&blkio_list_lock);
1025 }
1026 EXPORT_SYMBOL_GPL(blkio_policy_register);
1027 
1028 void blkio_policy_unregister(struct blkio_policy_type *blkiop)
1029 {
1030 	spin_lock(&blkio_list_lock);
1031 	list_del_init(&blkiop->list);
1032 	spin_unlock(&blkio_list_lock);
1033 }
1034 EXPORT_SYMBOL_GPL(blkio_policy_unregister);
1035 
1036 static int __init init_cgroup_blkio(void)
1037 {
1038 	return cgroup_load_subsys(&blkio_subsys);
1039 }
1040 
1041 static void __exit exit_cgroup_blkio(void)
1042 {
1043 	cgroup_unload_subsys(&blkio_subsys);
1044 }
1045 
1046 module_init(init_cgroup_blkio);
1047 module_exit(exit_cgroup_blkio);
1048 MODULE_LICENSE("GPL");
1049