xref: /openbmc/linux/block/blk-cgroup.c (revision 4800cd83)
1 /*
2  * Common Block IO controller cgroup interface
3  *
4  * Based on ideas and code from CFQ, CFS and BFQ:
5  * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6  *
7  * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8  *		      Paolo Valente <paolo.valente@unimore.it>
9  *
10  * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
11  * 	              Nauman Rafique <nauman@google.com>
12  */
13 #include <linux/ioprio.h>
14 #include <linux/seq_file.h>
15 #include <linux/kdev_t.h>
16 #include <linux/module.h>
17 #include <linux/err.h>
18 #include <linux/blkdev.h>
19 #include <linux/slab.h>
20 #include "blk-cgroup.h"
21 #include <linux/genhd.h>
22 
23 #define MAX_KEY_LEN 100
24 
25 static DEFINE_SPINLOCK(blkio_list_lock);
26 static LIST_HEAD(blkio_list);
27 
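/* The root cgroup is allocated statically and gets twice the default weight */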
28 struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
29 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
30 
31 static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
32 						  struct cgroup *);
33 static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
34 			      struct task_struct *, bool);
35 static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
36 			   struct cgroup *, struct task_struct *, bool);
37 static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
38 static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
39 
40 /* For encoding a policy id and an attribute into the cft->private value of a file */
41 #define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
42 /* What policy owns the file, proportional or throttle */
43 #define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
44 #define BLKIOFILE_ATTR(val)		((val) & 0xffff)
45 
46 struct cgroup_subsys blkio_subsys = {
47 	.name = "blkio",
48 	.create = blkiocg_create,
49 	.can_attach = blkiocg_can_attach,
50 	.attach = blkiocg_attach,
51 	.destroy = blkiocg_destroy,
52 	.populate = blkiocg_populate,
53 #ifdef CONFIG_BLK_CGROUP
54 	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
55 	.subsys_id = blkio_subsys_id,
56 #endif
57 	.use_id = 1,
58 	.module = THIS_MODULE,
59 };
60 EXPORT_SYMBOL_GPL(blkio_subsys);
61 
62 static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
63 					    struct blkio_policy_node *pn)
64 {
65 	list_add(&pn->node, &blkcg->policy_list);
66 }
67 
68 static inline bool cftype_blkg_same_policy(struct cftype *cft,
69 			struct blkio_group *blkg)
70 {
71 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
72 
73 	if (blkg->plid == plid)
74 		return 1;
75 
76 	return 0;
77 }
78 
79 /* Determines if policy node matches cgroup file being accessed */
80 static inline bool pn_matches_cftype(struct cftype *cft,
81 			struct blkio_policy_node *pn)
82 {
83 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
84 	int fileid = BLKIOFILE_ATTR(cft->private);
85 
86 	return (plid == pn->plid && fileid == pn->fileid);
87 }
88 
89 /* Must be called with blkcg->lock held */
90 static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
91 {
92 	list_del(&pn->node);
93 }
94 
95 /* Must be called with blkcg->lock held */
96 static struct blkio_policy_node *
97 blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
98 		enum blkio_policy_id plid, int fileid)
99 {
100 	struct blkio_policy_node *pn;
101 
102 	list_for_each_entry(pn, &blkcg->policy_list, node) {
103 		if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
104 			return pn;
105 	}
106 
107 	return NULL;
108 }
109 
110 struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
111 {
112 	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
113 			    struct blkio_cgroup, css);
114 }
115 EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
116 
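/*
 * Notify the policy that owns this blkio_group that the group's weight
 * has changed.
 */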
117 static inline void
118 blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
119 {
120 	struct blkio_policy_type *blkiop;
121 
122 	list_for_each_entry(blkiop, &blkio_list, list) {
123 		/* If this policy does not own the blkg, do not send updates */
124 		if (blkiop->plid != blkg->plid)
125 			continue;
126 		if (blkiop->ops.blkio_update_group_weight_fn)
127 			blkiop->ops.blkio_update_group_weight_fn(blkg->key,
128 							blkg, weight);
129 	}
130 }
131 
132 static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
133 				int fileid)
134 {
135 	struct blkio_policy_type *blkiop;
136 
137 	list_for_each_entry(blkiop, &blkio_list, list) {
138 
139 		/* If this policy does not own the blkg, do not send updates */
140 		if (blkiop->plid != blkg->plid)
141 			continue;
142 
143 		if (fileid == BLKIO_THROTL_read_bps_device
144 		    && blkiop->ops.blkio_update_group_read_bps_fn)
145 			blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
146 								blkg, bps);
147 
148 		if (fileid == BLKIO_THROTL_write_bps_device
149 		    && blkiop->ops.blkio_update_group_write_bps_fn)
150 			blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
151 								blkg, bps);
152 	}
153 }
154 
155 static inline void blkio_update_group_iops(struct blkio_group *blkg,
156 			unsigned int iops, int fileid)
157 {
158 	struct blkio_policy_type *blkiop;
159 
160 	list_for_each_entry(blkiop, &blkio_list, list) {
161 
162 		/* If this policy does not own the blkg, do not send updates */
163 		if (blkiop->plid != blkg->plid)
164 			continue;
165 
166 		if (fileid == BLKIO_THROTL_read_iops_device
167 		    && blkiop->ops.blkio_update_group_read_iops_fn)
168 			blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
169 								blkg, iops);
170 
171 		if (fileid == BLKIO_THROTL_write_iops_device
172 		    && blkiop->ops.blkio_update_group_write_iops_fn)
173 			blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
174 								blkg, iops);
175 	}
176 }
177 
178 /*
179  * Add to the appropriate stat variable depending on the request type.
180  * This should be called with the blkg->stats_lock held.
181  */
182 static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
183 				bool sync)
184 {
185 	if (direction)
186 		stat[BLKIO_STAT_WRITE] += add;
187 	else
188 		stat[BLKIO_STAT_READ] += add;
189 	if (sync)
190 		stat[BLKIO_STAT_SYNC] += add;
191 	else
192 		stat[BLKIO_STAT_ASYNC] += add;
193 }
194 
195 /*
196  * Decrements the appropriate stat variable, depending on the request
197  * type. Hits a BUG() if the value is already zero.
198  * This should be called with the blkg->stats_lock held.
199  */
200 static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
201 {
202 	if (direction) {
203 		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
204 		stat[BLKIO_STAT_WRITE]--;
205 	} else {
206 		BUG_ON(stat[BLKIO_STAT_READ] == 0);
207 		stat[BLKIO_STAT_READ]--;
208 	}
209 	if (sync) {
210 		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
211 		stat[BLKIO_STAT_SYNC]--;
212 	} else {
213 		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
214 		stat[BLKIO_STAT_ASYNC]--;
215 	}
216 }
217 
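/* Debug-only statistics: group wait, idle and empty times, average queue size and dequeues */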
218 #ifdef CONFIG_DEBUG_BLK_CGROUP
219 /* This should be called with the blkg->stats_lock held. */
220 static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
221 						struct blkio_group *curr_blkg)
222 {
223 	if (blkio_blkg_waiting(&blkg->stats))
224 		return;
225 	if (blkg == curr_blkg)
226 		return;
227 	blkg->stats.start_group_wait_time = sched_clock();
228 	blkio_mark_blkg_waiting(&blkg->stats);
229 }
230 
231 /* This should be called with the blkg->stats_lock held. */
232 static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
233 {
234 	unsigned long long now;
235 
236 	if (!blkio_blkg_waiting(stats))
237 		return;
238 
239 	now = sched_clock();
240 	if (time_after64(now, stats->start_group_wait_time))
241 		stats->group_wait_time += now - stats->start_group_wait_time;
242 	blkio_clear_blkg_waiting(stats);
243 }
244 
245 /* This should be called with the blkg->stats_lock held. */
246 static void blkio_end_empty_time(struct blkio_group_stats *stats)
247 {
248 	unsigned long long now;
249 
250 	if (!blkio_blkg_empty(stats))
251 		return;
252 
253 	now = sched_clock();
254 	if (time_after64(now, stats->start_empty_time))
255 		stats->empty_time += now - stats->start_empty_time;
256 	blkio_clear_blkg_empty(stats);
257 }
258 
259 void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
260 {
261 	unsigned long flags;
262 
263 	spin_lock_irqsave(&blkg->stats_lock, flags);
264 	BUG_ON(blkio_blkg_idling(&blkg->stats));
265 	blkg->stats.start_idle_time = sched_clock();
266 	blkio_mark_blkg_idling(&blkg->stats);
267 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
268 }
269 EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
270 
271 void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
272 {
273 	unsigned long flags;
274 	unsigned long long now;
275 	struct blkio_group_stats *stats;
276 
277 	spin_lock_irqsave(&blkg->stats_lock, flags);
278 	stats = &blkg->stats;
279 	if (blkio_blkg_idling(stats)) {
280 		now = sched_clock();
281 		if (time_after64(now, stats->start_idle_time))
282 			stats->idle_time += now - stats->start_idle_time;
283 		blkio_clear_blkg_idling(stats);
284 	}
285 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
286 }
287 EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
288 
289 void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
290 {
291 	unsigned long flags;
292 	struct blkio_group_stats *stats;
293 
294 	spin_lock_irqsave(&blkg->stats_lock, flags);
295 	stats = &blkg->stats;
296 	stats->avg_queue_size_sum +=
297 			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
298 			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
299 	stats->avg_queue_size_samples++;
300 	blkio_update_group_wait_time(stats);
301 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
302 }
303 EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
304 
305 void blkiocg_set_start_empty_time(struct blkio_group *blkg)
306 {
307 	unsigned long flags;
308 	struct blkio_group_stats *stats;
309 
310 	spin_lock_irqsave(&blkg->stats_lock, flags);
311 	stats = &blkg->stats;
312 
313 	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
314 			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
315 		spin_unlock_irqrestore(&blkg->stats_lock, flags);
316 		return;
317 	}
318 
319 	/*
320 	 * The group is already marked empty. This can happen if the cfqq got a
321 	 * new request in the parent group and moved to this group while being
322 	 * added to the service tree. Just ignore the event and move on.
323 	 */
324 	if (blkio_blkg_empty(stats)) {
325 		spin_unlock_irqrestore(&blkg->stats_lock, flags);
326 		return;
327 	}
328 
329 	stats->start_empty_time = sched_clock();
330 	blkio_mark_blkg_empty(stats);
331 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
332 }
333 EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
334 
335 void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
336 			unsigned long dequeue)
337 {
338 	blkg->stats.dequeue += dequeue;
339 }
340 EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
341 #else
342 static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
343 					struct blkio_group *curr_blkg) {}
344 static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
345 #endif
346 
347 void blkiocg_update_io_add_stats(struct blkio_group *blkg,
348 			struct blkio_group *curr_blkg, bool direction,
349 			bool sync)
350 {
351 	unsigned long flags;
352 
353 	spin_lock_irqsave(&blkg->stats_lock, flags);
354 	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
355 			sync);
356 	blkio_end_empty_time(&blkg->stats);
357 	blkio_set_start_group_wait_time(blkg, curr_blkg);
358 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
359 }
360 EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
361 
362 void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
363 						bool direction, bool sync)
364 {
365 	unsigned long flags;
366 
367 	spin_lock_irqsave(&blkg->stats_lock, flags);
368 	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
369 					direction, sync);
370 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
371 }
372 EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
373 
374 void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
375 {
376 	unsigned long flags;
377 
378 	spin_lock_irqsave(&blkg->stats_lock, flags);
379 	blkg->stats.time += time;
380 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
381 }
382 EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
383 
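/* Account a dispatched request: sector count, serviced count and service bytes */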
384 void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
385 				uint64_t bytes, bool direction, bool sync)
386 {
387 	struct blkio_group_stats *stats;
388 	unsigned long flags;
389 
390 	spin_lock_irqsave(&blkg->stats_lock, flags);
391 	stats = &blkg->stats;
392 	stats->sectors += bytes >> 9;
393 	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
394 			sync);
395 	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
396 			direction, sync);
397 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
398 }
399 EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
400 
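/*
 * Account a completed request: service time (dispatch to completion) and
 * wait time (arrival to dispatch).
 */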
401 void blkiocg_update_completion_stats(struct blkio_group *blkg,
402 	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
403 {
404 	struct blkio_group_stats *stats;
405 	unsigned long flags;
406 	unsigned long long now = sched_clock();
407 
408 	spin_lock_irqsave(&blkg->stats_lock, flags);
409 	stats = &blkg->stats;
410 	if (time_after64(now, io_start_time))
411 		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
412 				now - io_start_time, direction, sync);
413 	if (time_after64(io_start_time, start_time))
414 		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
415 				io_start_time - start_time, direction, sync);
416 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
417 }
418 EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
419 
420 void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
421 					bool sync)
422 {
423 	unsigned long flags;
424 
425 	spin_lock_irqsave(&blkg->stats_lock, flags);
426 	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
427 			sync);
428 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
429 }
430 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
431 
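/*
 * Link a blkio_group into its cgroup: initialise the stats lock, publish
 * the key for RCU lookups, record the owning policy id, cgroup path and
 * device, and add the group to the cgroup's list.
 */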
432 void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
433 		struct blkio_group *blkg, void *key, dev_t dev,
434 		enum blkio_policy_id plid)
435 {
436 	unsigned long flags;
437 
438 	spin_lock_irqsave(&blkcg->lock, flags);
439 	spin_lock_init(&blkg->stats_lock);
440 	rcu_assign_pointer(blkg->key, key);
441 	blkg->blkcg_id = css_id(&blkcg->css);
442 	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
443 	blkg->plid = plid;
444 	spin_unlock_irqrestore(&blkcg->lock, flags);
445 	/* Need to take a css reference? */
446 	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
447 	blkg->dev = dev;
448 }
449 EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
450 
451 static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
452 {
453 	hlist_del_init_rcu(&blkg->blkcg_node);
454 	blkg->blkcg_id = 0;
455 }
456 
457 /*
458  * Returns 0 if the blkio_group was still on the cgroup list. Otherwise returns
459  * 1, indicating that the blkio_group was unhashed by the time we got to it.
460  */
461 int blkiocg_del_blkio_group(struct blkio_group *blkg)
462 {
463 	struct blkio_cgroup *blkcg;
464 	unsigned long flags;
465 	struct cgroup_subsys_state *css;
466 	int ret = 1;
467 
468 	rcu_read_lock();
469 	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
470 	if (css) {
471 		blkcg = container_of(css, struct blkio_cgroup, css);
472 		spin_lock_irqsave(&blkcg->lock, flags);
473 		if (!hlist_unhashed(&blkg->blkcg_node)) {
474 			__blkiocg_del_blkio_group(blkg);
475 			ret = 0;
476 		}
477 		spin_unlock_irqrestore(&blkcg->lock, flags);
478 	}
479 
480 	rcu_read_unlock();
481 	return ret;
482 }
483 EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
484 
485 /* called under rcu_read_lock(). */
486 struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
487 {
488 	struct blkio_group *blkg;
489 	struct hlist_node *n;
490 	void *__key;
491 
492 	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
493 		__key = blkg->key;
494 		if (__key == key)
495 			return blkg;
496 	}
497 
498 	return NULL;
499 }
500 EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
501 
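/*
 * Reset all stats for every group in this cgroup. The counts of currently
 * queued requests and the idling/waiting/empty state (with their start
 * times reset to "now") are preserved so in-flight accounting stays
 * consistent.
 */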
502 static int
503 blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
504 {
505 	struct blkio_cgroup *blkcg;
506 	struct blkio_group *blkg;
507 	struct blkio_group_stats *stats;
508 	struct hlist_node *n;
509 	uint64_t queued[BLKIO_STAT_TOTAL];
510 	int i;
511 #ifdef CONFIG_DEBUG_BLK_CGROUP
512 	bool idling, waiting, empty;
513 	unsigned long long now = sched_clock();
514 #endif
515 
516 	blkcg = cgroup_to_blkio_cgroup(cgroup);
517 	spin_lock_irq(&blkcg->lock);
518 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
519 		spin_lock(&blkg->stats_lock);
520 		stats = &blkg->stats;
521 #ifdef CONFIG_DEBUG_BLK_CGROUP
522 		idling = blkio_blkg_idling(stats);
523 		waiting = blkio_blkg_waiting(stats);
524 		empty = blkio_blkg_empty(stats);
525 #endif
526 		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
527 			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
528 		memset(stats, 0, sizeof(struct blkio_group_stats));
529 		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
530 			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
531 #ifdef CONFIG_DEBUG_BLK_CGROUP
532 		if (idling) {
533 			blkio_mark_blkg_idling(stats);
534 			stats->start_idle_time = now;
535 		}
536 		if (waiting) {
537 			blkio_mark_blkg_waiting(stats);
538 			stats->start_group_wait_time = now;
539 		}
540 		if (empty) {
541 			blkio_mark_blkg_empty(stats);
542 			stats->start_empty_time = now;
543 		}
544 #endif
545 		spin_unlock(&blkg->stats_lock);
546 	}
547 	spin_unlock_irq(&blkcg->lock);
548 	return 0;
549 }
550 
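/*
 * Build a stat key of the form "MAJOR:MINOR <type>", e.g. "8:16 Read",
 * or just "MAJOR:MINOR" when diskname_only is set.
 */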
551 static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
552 				int chars_left, bool diskname_only)
553 {
554 	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
555 	chars_left -= strlen(str);
556 	if (chars_left <= 0) {
557 		printk(KERN_WARNING
558 			"Possibly incorrect cgroup stat display format\n");
559 		return;
560 	}
561 	if (diskname_only)
562 		return;
563 	switch (type) {
564 	case BLKIO_STAT_READ:
565 		strlcat(str, " Read", chars_left);
566 		break;
567 	case BLKIO_STAT_WRITE:
568 		strlcat(str, " Write", chars_left);
569 		break;
570 	case BLKIO_STAT_SYNC:
571 		strlcat(str, " Sync", chars_left);
572 		break;
573 	case BLKIO_STAT_ASYNC:
574 		strlcat(str, " Async", chars_left);
575 		break;
576 	case BLKIO_STAT_TOTAL:
577 		strlcat(str, " Total", chars_left);
578 		break;
579 	default:
580 		strlcat(str, " Invalid", chars_left);
581 	}
582 }
583 
584 static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
585 				struct cgroup_map_cb *cb, dev_t dev)
586 {
587 	blkio_get_key_name(0, dev, str, chars_left, true);
588 	cb->fill(cb, str, val);
589 	return val;
590 }
591 
592 /* This should be called with blkg->stats_lock held */
593 static uint64_t blkio_get_stat(struct blkio_group *blkg,
594 		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
595 {
596 	uint64_t disk_total;
597 	char key_str[MAX_KEY_LEN];
598 	enum stat_sub_type sub_type;
599 
600 	if (type == BLKIO_STAT_TIME)
601 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
602 					blkg->stats.time, cb, dev);
603 	if (type == BLKIO_STAT_SECTORS)
604 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
605 					blkg->stats.sectors, cb, dev);
606 #ifdef CONFIG_DEBUG_BLK_CGROUP
607 	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
608 		uint64_t sum = blkg->stats.avg_queue_size_sum;
609 		uint64_t samples = blkg->stats.avg_queue_size_samples;
610 		if (samples)
611 			do_div(sum, samples);
612 		else
613 			sum = 0;
614 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
615 	}
616 	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
617 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
618 					blkg->stats.group_wait_time, cb, dev);
619 	if (type == BLKIO_STAT_IDLE_TIME)
620 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
621 					blkg->stats.idle_time, cb, dev);
622 	if (type == BLKIO_STAT_EMPTY_TIME)
623 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
624 					blkg->stats.empty_time, cb, dev);
625 	if (type == BLKIO_STAT_DEQUEUE)
626 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
627 					blkg->stats.dequeue, cb, dev);
628 #endif
629 
630 	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
631 			sub_type++) {
632 		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
633 		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
634 	}
635 	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
636 			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
637 	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
638 	cb->fill(cb, key_str, disk_total);
639 	return disk_total;
640 }
641 
642 static int blkio_check_dev_num(dev_t dev)
643 {
644 	int part = 0;
645 	struct gendisk *disk;
646 
647 	disk = get_gendisk(dev, &part);
648 	if (!disk || part)
649 		return -ENODEV;
650 
651 	return 0;
652 }
653 
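/*
 * Parse a per-device rule of the form "MAJOR:MINOR VALUE" (e.g. a write of
 * "8:16 1048576" to a throttle file) and fill in the policy node. A VALUE
 * of 0 is later treated as a request to delete the rule.
 */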
654 static int blkio_policy_parse_and_set(char *buf,
655 	struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
656 {
657 	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
658 	int ret;
659 	unsigned long major, minor, temp;
660 	int i = 0;
661 	dev_t dev;
662 	u64 bps, iops;
663 
664 	memset(s, 0, sizeof(s));
665 
666 	while ((p = strsep(&buf, " ")) != NULL) {
667 		if (!*p)
668 			continue;
669 
670 		s[i++] = p;
671 
672 		/* Prevent the user from inputting too many fields */
673 		if (i == 3)
674 			break;
675 	}
676 
677 	if (i != 2)
678 		return -EINVAL;
679 
680 	p = strsep(&s[0], ":");
681 	if (p != NULL)
682 		major_s = p;
683 	else
684 		return -EINVAL;
685 
686 	minor_s = s[0];
687 	if (!minor_s)
688 		return -EINVAL;
689 
690 	ret = strict_strtoul(major_s, 10, &major);
691 	if (ret)
692 		return -EINVAL;
693 
694 	ret = strict_strtoul(minor_s, 10, &minor);
695 	if (ret)
696 		return -EINVAL;
697 
698 	dev = MKDEV(major, minor);
699 
700 	ret = blkio_check_dev_num(dev);
701 	if (ret)
702 		return ret;
703 
704 	newpn->dev = dev;
705 
706 	if (s[1] == NULL)
707 		return -EINVAL;
708 
709 	switch (plid) {
710 	case BLKIO_POLICY_PROP:
711 		ret = strict_strtoul(s[1], 10, &temp);
712 		if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
713 			temp > BLKIO_WEIGHT_MAX)
714 			return -EINVAL;
715 
716 		newpn->plid = plid;
717 		newpn->fileid = fileid;
718 		newpn->val.weight = temp;
719 		break;
720 	case BLKIO_POLICY_THROTL:
721 		switch(fileid) {
722 		case BLKIO_THROTL_read_bps_device:
723 		case BLKIO_THROTL_write_bps_device:
724 			ret = strict_strtoull(s[1], 10, &bps);
725 			if (ret)
726 				return -EINVAL;
727 
728 			newpn->plid = plid;
729 			newpn->fileid = fileid;
730 			newpn->val.bps = bps;
731 			break;
732 		case BLKIO_THROTL_read_iops_device:
733 		case BLKIO_THROTL_write_iops_device:
734 			ret = strict_strtoull(s[1], 10, &iops);
735 			if (ret)
736 				return -EINVAL;
737 
738 			if (iops > THROTL_IOPS_MAX)
739 				return -EINVAL;
740 
741 			newpn->plid = plid;
742 			newpn->fileid = fileid;
743 			newpn->val.iops = (unsigned int)iops;
744 			break;
745 		}
746 		break;
747 	default:
748 		BUG();
749 	}
750 
751 	return 0;
752 }
753 
754 unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
755 			      dev_t dev)
756 {
757 	struct blkio_policy_node *pn;
758 
759 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
760 				BLKIO_PROP_weight_device);
761 	if (pn)
762 		return pn->val.weight;
763 	else
764 		return blkcg->weight;
765 }
766 EXPORT_SYMBOL_GPL(blkcg_get_weight);
767 
768 uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
769 {
770 	struct blkio_policy_node *pn;
771 
772 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
773 				BLKIO_THROTL_read_bps_device);
774 	if (pn)
775 		return pn->val.bps;
776 	else
777 		return -1;
778 }
779 
780 uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
781 {
782 	struct blkio_policy_node *pn;
783 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
784 				BLKIO_THROTL_write_bps_device);
785 	if (pn)
786 		return pn->val.bps;
787 	else
788 		return -1;
789 }
790 
791 unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
792 {
793 	struct blkio_policy_node *pn;
794 
795 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
796 				BLKIO_THROTL_read_iops_device);
797 	if (pn)
798 		return pn->val.iops;
799 	else
800 		return -1;
801 }
802 
803 unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
804 {
805 	struct blkio_policy_node *pn;
806 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
807 				BLKIO_THROTL_write_iops_device);
808 	if (pn)
809 		return pn->val.iops;
810 	else
811 		return -1;
812 }
813 
814 /* Checks whether the user asked to delete a policy rule */
815 static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
816 {
817 	switch(pn->plid) {
818 	case BLKIO_POLICY_PROP:
819 		if (pn->val.weight == 0)
820 			return 1;
821 		break;
822 	case BLKIO_POLICY_THROTL:
823 		switch(pn->fileid) {
824 		case BLKIO_THROTL_read_bps_device:
825 		case BLKIO_THROTL_write_bps_device:
826 			if (pn->val.bps == 0)
827 				return 1;
828 			break;
829 		case BLKIO_THROTL_read_iops_device:
830 		case BLKIO_THROTL_write_iops_device:
831 			if (pn->val.iops == 0)
832 				return 1;
833 		}
834 		break;
835 	default:
836 		BUG();
837 	}
838 
839 	return 0;
840 }
841 
842 static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
843 					struct blkio_policy_node *newpn)
844 {
845 	switch(oldpn->plid) {
846 	case BLKIO_POLICY_PROP:
847 		oldpn->val.weight = newpn->val.weight;
848 		break;
849 	case BLKIO_POLICY_THROTL:
850 		switch(newpn->fileid) {
851 		case BLKIO_THROTL_read_bps_device:
852 		case BLKIO_THROTL_write_bps_device:
853 			oldpn->val.bps = newpn->val.bps;
854 			break;
855 		case BLKIO_THROTL_read_iops_device:
856 		case BLKIO_THROTL_write_iops_device:
857 			oldpn->val.iops = newpn->val.iops;
858 		}
859 		break;
860 	default:
861 		BUG();
862 	}
863 }
864 
865 /*
866  * Some rules/values in blkg have changed. Propagate those to respective
867  * policies.
868  */
869 static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
870 		struct blkio_group *blkg, struct blkio_policy_node *pn)
871 {
872 	unsigned int weight, iops;
873 	u64 bps;
874 
875 	switch(pn->plid) {
876 	case BLKIO_POLICY_PROP:
877 		weight = pn->val.weight ? pn->val.weight :
878 				blkcg->weight;
879 		blkio_update_group_weight(blkg, weight);
880 		break;
881 	case BLKIO_POLICY_THROTL:
882 		switch(pn->fileid) {
883 		case BLKIO_THROTL_read_bps_device:
884 		case BLKIO_THROTL_write_bps_device:
885 			bps = pn->val.bps ? pn->val.bps : (-1);
886 			blkio_update_group_bps(blkg, bps, pn->fileid);
887 			break;
888 		case BLKIO_THROTL_read_iops_device:
889 		case BLKIO_THROTL_write_iops_device:
890 			iops = pn->val.iops ? pn->val.iops : (-1);
891 			blkio_update_group_iops(blkg, iops, pn->fileid);
892 			break;
893 		}
894 		break;
895 	default:
896 		BUG();
897 	}
898 }
899 
900 /*
901  * A policy node rule has been updated. Propagate this update to all the
902  * block groups which might be affected by it.
903  */
904 static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
905 				struct blkio_policy_node *pn)
906 {
907 	struct blkio_group *blkg;
908 	struct hlist_node *n;
909 
910 	spin_lock(&blkio_list_lock);
911 	spin_lock_irq(&blkcg->lock);
912 
913 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
914 		if (pn->dev != blkg->dev || pn->plid != blkg->plid)
915 			continue;
916 		blkio_update_blkg_policy(blkcg, blkg, pn);
917 	}
918 
919 	spin_unlock_irq(&blkcg->lock);
920 	spin_unlock(&blkio_list_lock);
921 }
922 
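/*
 * Common write handler for the per-device rule files (weight_device and
 * the throttle.*_device files). Parses the rule, inserts, updates or
 * deletes the matching policy node and then propagates the change to the
 * affected blkio_groups.
 */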
923 static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
924  				       const char *buffer)
925 {
926 	int ret = 0;
927 	char *buf;
928 	struct blkio_policy_node *newpn, *pn;
929 	struct blkio_cgroup *blkcg;
930 	int keep_newpn = 0;
931 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
932 	int fileid = BLKIOFILE_ATTR(cft->private);
933 
934 	buf = kstrdup(buffer, GFP_KERNEL);
935 	if (!buf)
936 		return -ENOMEM;
937 
938 	newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
939 	if (!newpn) {
940 		ret = -ENOMEM;
941 		goto free_buf;
942 	}
943 
944 	ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
945 	if (ret)
946 		goto free_newpn;
947 
948 	blkcg = cgroup_to_blkio_cgroup(cgrp);
949 
950 	spin_lock_irq(&blkcg->lock);
951 
952 	pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
953 	if (!pn) {
954 		if (!blkio_delete_rule_command(newpn)) {
955 			blkio_policy_insert_node(blkcg, newpn);
956 			keep_newpn = 1;
957 		}
958 		spin_unlock_irq(&blkcg->lock);
959 		goto update_io_group;
960 	}
961 
962 	if (blkio_delete_rule_command(newpn)) {
963 		blkio_policy_delete_node(pn);
964 		spin_unlock_irq(&blkcg->lock);
965 		goto update_io_group;
966 	}
967 	spin_unlock_irq(&blkcg->lock);
968 
969 	blkio_update_policy_rule(pn, newpn);
970 
971 update_io_group:
972 	blkio_update_policy_node_blkg(blkcg, newpn);
973 
974 free_newpn:
975 	if (!keep_newpn)
976 		kfree(newpn);
977 free_buf:
978 	kfree(buf);
979 	return ret;
980 }
981 
982 static void
983 blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
984 {
985 	switch(pn->plid) {
986 		case BLKIO_POLICY_PROP:
987 			if (pn->fileid == BLKIO_PROP_weight_device)
988 				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
989 					MINOR(pn->dev), pn->val.weight);
990 			break;
991 		case BLKIO_POLICY_THROTL:
992 			switch(pn->fileid) {
993 			case BLKIO_THROTL_read_bps_device:
994 			case BLKIO_THROTL_write_bps_device:
995 				seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
996 					MINOR(pn->dev), pn->val.bps);
997 				break;
998 			case BLKIO_THROTL_read_iops_device:
999 			case BLKIO_THROTL_write_iops_device:
1000 				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1001 					MINOR(pn->dev), pn->val.iops);
1002 				break;
1003 			}
1004 			break;
1005 		default:
1006 			BUG();
1007 	}
1008 }
1009 
1010 /* cgroup files which read their data from policy nodes end up here */
1011 static void blkio_read_policy_node_files(struct cftype *cft,
1012 			struct blkio_cgroup *blkcg, struct seq_file *m)
1013 {
1014 	struct blkio_policy_node *pn;
1015 
1016 	if (!list_empty(&blkcg->policy_list)) {
1017 		spin_lock_irq(&blkcg->lock);
1018 		list_for_each_entry(pn, &blkcg->policy_list, node) {
1019 			if (!pn_matches_cftype(cft, pn))
1020 				continue;
1021 			blkio_print_policy_node(m, pn);
1022 		}
1023 		spin_unlock_irq(&blkcg->lock);
1024 	}
1025 }
1026 
1027 static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
1028 				struct seq_file *m)
1029 {
1030 	struct blkio_cgroup *blkcg;
1031 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1032 	int name = BLKIOFILE_ATTR(cft->private);
1033 
1034 	blkcg = cgroup_to_blkio_cgroup(cgrp);
1035 
1036 	switch(plid) {
1037 	case BLKIO_POLICY_PROP:
1038 		switch(name) {
1039 		case BLKIO_PROP_weight_device:
1040 			blkio_read_policy_node_files(cft, blkcg, m);
1041 			return 0;
1042 		default:
1043 			BUG();
1044 		}
1045 		break;
1046 	case BLKIO_POLICY_THROTL:
1047 		switch(name){
1048 		case BLKIO_THROTL_read_bps_device:
1049 		case BLKIO_THROTL_write_bps_device:
1050 		case BLKIO_THROTL_read_iops_device:
1051 		case BLKIO_THROTL_write_iops_device:
1052 			blkio_read_policy_node_files(cft, blkcg, m);
1053 			return 0;
1054 		default:
1055 			BUG();
1056 		}
1057 		break;
1058 	default:
1059 		BUG();
1060 	}
1061 
1062 	return 0;
1063 }
1064 
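/*
 * Walk the cgroup's blkio_groups under RCU and emit the requested stat for
 * every group that belongs to the file's policy, optionally followed by a
 * "Total" line summing all devices.
 */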
1065 static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
1066 		struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type,
1067 		bool show_total)
1068 {
1069 	struct blkio_group *blkg;
1070 	struct hlist_node *n;
1071 	uint64_t cgroup_total = 0;
1072 
1073 	rcu_read_lock();
1074 	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
1075 		if (blkg->dev) {
1076 			if (!cftype_blkg_same_policy(cft, blkg))
1077 				continue;
1078 			spin_lock_irq(&blkg->stats_lock);
1079 			cgroup_total += blkio_get_stat(blkg, cb, blkg->dev,
1080 						type);
1081 			spin_unlock_irq(&blkg->stats_lock);
1082 		}
1083 	}
1084 	if (show_total)
1085 		cb->fill(cb, "Total", cgroup_total);
1086 	rcu_read_unlock();
1087 	return 0;
1088 }
1089 
1090 /* All map-type cgroup files are serviced by this function */
1091 static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1092 				struct cgroup_map_cb *cb)
1093 {
1094 	struct blkio_cgroup *blkcg;
1095 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1096 	int name = BLKIOFILE_ATTR(cft->private);
1097 
1098 	blkcg = cgroup_to_blkio_cgroup(cgrp);
1099 
1100 	switch(plid) {
1101 	case BLKIO_POLICY_PROP:
1102 		switch(name) {
1103 		case BLKIO_PROP_time:
1104 			return blkio_read_blkg_stats(blkcg, cft, cb,
1105 						BLKIO_STAT_TIME, 0);
1106 		case BLKIO_PROP_sectors:
1107 			return blkio_read_blkg_stats(blkcg, cft, cb,
1108 						BLKIO_STAT_SECTORS, 0);
1109 		case BLKIO_PROP_io_service_bytes:
1110 			return blkio_read_blkg_stats(blkcg, cft, cb,
1111 						BLKIO_STAT_SERVICE_BYTES, 1);
1112 		case BLKIO_PROP_io_serviced:
1113 			return blkio_read_blkg_stats(blkcg, cft, cb,
1114 						BLKIO_STAT_SERVICED, 1);
1115 		case BLKIO_PROP_io_service_time:
1116 			return blkio_read_blkg_stats(blkcg, cft, cb,
1117 						BLKIO_STAT_SERVICE_TIME, 1);
1118 		case BLKIO_PROP_io_wait_time:
1119 			return blkio_read_blkg_stats(blkcg, cft, cb,
1120 						BLKIO_STAT_WAIT_TIME, 1);
1121 		case BLKIO_PROP_io_merged:
1122 			return blkio_read_blkg_stats(blkcg, cft, cb,
1123 						BLKIO_STAT_MERGED, 1);
1124 		case BLKIO_PROP_io_queued:
1125 			return blkio_read_blkg_stats(blkcg, cft, cb,
1126 						BLKIO_STAT_QUEUED, 1);
1127 #ifdef CONFIG_DEBUG_BLK_CGROUP
1128 		case BLKIO_PROP_dequeue:
1129 			return blkio_read_blkg_stats(blkcg, cft, cb,
1130 						BLKIO_STAT_DEQUEUE, 0);
1131 		case BLKIO_PROP_avg_queue_size:
1132 			return blkio_read_blkg_stats(blkcg, cft, cb,
1133 						BLKIO_STAT_AVG_QUEUE_SIZE, 0);
1134 		case BLKIO_PROP_group_wait_time:
1135 			return blkio_read_blkg_stats(blkcg, cft, cb,
1136 						BLKIO_STAT_GROUP_WAIT_TIME, 0);
1137 		case BLKIO_PROP_idle_time:
1138 			return blkio_read_blkg_stats(blkcg, cft, cb,
1139 						BLKIO_STAT_IDLE_TIME, 0);
1140 		case BLKIO_PROP_empty_time:
1141 			return blkio_read_blkg_stats(blkcg, cft, cb,
1142 						BLKIO_STAT_EMPTY_TIME, 0);
1143 #endif
1144 		default:
1145 			BUG();
1146 		}
1147 		break;
1148 	case BLKIO_POLICY_THROTL:
1149 		switch(name){
1150 		case BLKIO_THROTL_io_service_bytes:
1151 			return blkio_read_blkg_stats(blkcg, cft, cb,
1152 						BLKIO_STAT_SERVICE_BYTES, 1);
1153 		case BLKIO_THROTL_io_serviced:
1154 			return blkio_read_blkg_stats(blkcg, cft, cb,
1155 						BLKIO_STAT_SERVICED, 1);
1156 		default:
1157 			BUG();
1158 		}
1159 		break;
1160 	default:
1161 		BUG();
1162 	}
1163 
1164 	return 0;
1165 }
1166 
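/*
 * Update the cgroup's default weight and push it to every group that does
 * not have a per-device weight_device override.
 */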
1167 static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
1168 {
1169 	struct blkio_group *blkg;
1170 	struct hlist_node *n;
1171 	struct blkio_policy_node *pn;
1172 
1173 	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
1174 		return -EINVAL;
1175 
1176 	spin_lock(&blkio_list_lock);
1177 	spin_lock_irq(&blkcg->lock);
1178 	blkcg->weight = (unsigned int)val;
1179 
1180 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1181 		pn = blkio_policy_search_node(blkcg, blkg->dev,
1182 				BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
1183 		if (pn)
1184 			continue;
1185 
1186 		blkio_update_group_weight(blkg, blkcg->weight);
1187 	}
1188 	spin_unlock_irq(&blkcg->lock);
1189 	spin_unlock(&blkio_list_lock);
1190 	return 0;
1191 }
1192 
1193 static u64 blkiocg_file_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
1194 	struct blkio_cgroup *blkcg;
1195 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1196 	int name = BLKIOFILE_ATTR(cft->private);
1197 
1198 	blkcg = cgroup_to_blkio_cgroup(cgrp);
1199 
1200 	switch(plid) {
1201 	case BLKIO_POLICY_PROP:
1202 		switch(name) {
1203 		case BLKIO_PROP_weight:
1204 			return (u64)blkcg->weight;
1205 		}
1206 		break;
1207 	default:
1208 		BUG();
1209 	}
1210 	return 0;
1211 }
1212 
1213 static int
1214 blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1215 {
1216 	struct blkio_cgroup *blkcg;
1217 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1218 	int name = BLKIOFILE_ATTR(cft->private);
1219 
1220 	blkcg = cgroup_to_blkio_cgroup(cgrp);
1221 
1222 	switch(plid) {
1223 	case BLKIO_POLICY_PROP:
1224 		switch(name) {
1225 		case BLKIO_PROP_weight:
1226 			return blkio_weight_write(blkcg, val);
1227 		}
1228 		break;
1229 	default:
1230 		BUG();
1231 	}
1232 
1233 	return 0;
1234 }
1235 
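/* The cgroup control files exported by the blkio controller */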
1236 struct cftype blkio_files[] = {
1237 	{
1238 		.name = "weight_device",
1239 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1240 				BLKIO_PROP_weight_device),
1241 		.read_seq_string = blkiocg_file_read,
1242 		.write_string = blkiocg_file_write,
1243 		.max_write_len = 256,
1244 	},
1245 	{
1246 		.name = "weight",
1247 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1248 				BLKIO_PROP_weight),
1249 		.read_u64 = blkiocg_file_read_u64,
1250 		.write_u64 = blkiocg_file_write_u64,
1251 	},
1252 	{
1253 		.name = "time",
1254 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1255 				BLKIO_PROP_time),
1256 		.read_map = blkiocg_file_read_map,
1257 	},
1258 	{
1259 		.name = "sectors",
1260 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1261 				BLKIO_PROP_sectors),
1262 		.read_map = blkiocg_file_read_map,
1263 	},
1264 	{
1265 		.name = "io_service_bytes",
1266 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1267 				BLKIO_PROP_io_service_bytes),
1268 		.read_map = blkiocg_file_read_map,
1269 	},
1270 	{
1271 		.name = "io_serviced",
1272 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1273 				BLKIO_PROP_io_serviced),
1274 		.read_map = blkiocg_file_read_map,
1275 	},
1276 	{
1277 		.name = "io_service_time",
1278 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1279 				BLKIO_PROP_io_service_time),
1280 		.read_map = blkiocg_file_read_map,
1281 	},
1282 	{
1283 		.name = "io_wait_time",
1284 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1285 				BLKIO_PROP_io_wait_time),
1286 		.read_map = blkiocg_file_read_map,
1287 	},
1288 	{
1289 		.name = "io_merged",
1290 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1291 				BLKIO_PROP_io_merged),
1292 		.read_map = blkiocg_file_read_map,
1293 	},
1294 	{
1295 		.name = "io_queued",
1296 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1297 				BLKIO_PROP_io_queued),
1298 		.read_map = blkiocg_file_read_map,
1299 	},
1300 	{
1301 		.name = "reset_stats",
1302 		.write_u64 = blkiocg_reset_stats,
1303 	},
1304 #ifdef CONFIG_BLK_DEV_THROTTLING
1305 	{
1306 		.name = "throttle.read_bps_device",
1307 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1308 				BLKIO_THROTL_read_bps_device),
1309 		.read_seq_string = blkiocg_file_read,
1310 		.write_string = blkiocg_file_write,
1311 		.max_write_len = 256,
1312 	},
1313 
1314 	{
1315 		.name = "throttle.write_bps_device",
1316 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1317 				BLKIO_THROTL_write_bps_device),
1318 		.read_seq_string = blkiocg_file_read,
1319 		.write_string = blkiocg_file_write,
1320 		.max_write_len = 256,
1321 	},
1322 
1323 	{
1324 		.name = "throttle.read_iops_device",
1325 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1326 				BLKIO_THROTL_read_iops_device),
1327 		.read_seq_string = blkiocg_file_read,
1328 		.write_string = blkiocg_file_write,
1329 		.max_write_len = 256,
1330 	},
1331 
1332 	{
1333 		.name = "throttle.write_iops_device",
1334 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1335 				BLKIO_THROTL_write_iops_device),
1336 		.read_seq_string = blkiocg_file_read,
1337 		.write_string = blkiocg_file_write,
1338 		.max_write_len = 256,
1339 	},
1340 	{
1341 		.name = "throttle.io_service_bytes",
1342 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1343 				BLKIO_THROTL_io_service_bytes),
1344 		.read_map = blkiocg_file_read_map,
1345 	},
1346 	{
1347 		.name = "throttle.io_serviced",
1348 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1349 				BLKIO_THROTL_io_serviced),
1350 		.read_map = blkiocg_file_read_map,
1351 	},
1352 #endif /* CONFIG_BLK_DEV_THROTTLING */
1353 
1354 #ifdef CONFIG_DEBUG_BLK_CGROUP
1355 	{
1356 		.name = "avg_queue_size",
1357 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1358 				BLKIO_PROP_avg_queue_size),
1359 		.read_map = blkiocg_file_read_map,
1360 	},
1361 	{
1362 		.name = "group_wait_time",
1363 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1364 				BLKIO_PROP_group_wait_time),
1365 		.read_map = blkiocg_file_read_map,
1366 	},
1367 	{
1368 		.name = "idle_time",
1369 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1370 				BLKIO_PROP_idle_time),
1371 		.read_map = blkiocg_file_read_map,
1372 	},
1373 	{
1374 		.name = "empty_time",
1375 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1376 				BLKIO_PROP_empty_time),
1377 		.read_map = blkiocg_file_read_map,
1378 	},
1379 	{
1380 		.name = "dequeue",
1381 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1382 				BLKIO_PROP_dequeue),
1383 		.read_map = blkiocg_file_read_map,
1384 	},
1385 #endif
1386 };
1387 
1388 static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1389 {
1390 	return cgroup_add_files(cgroup, subsys, blkio_files,
1391 				ARRAY_SIZE(blkio_files));
1392 }
1393 
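/*
 * Cgroup teardown: unlink every remaining blkio_group, tell the owning
 * policy about each removal, free all policy nodes and finally the
 * blkio_cgroup itself (unless it is the statically allocated root).
 */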
1394 static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1395 {
1396 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
1397 	unsigned long flags;
1398 	struct blkio_group *blkg;
1399 	void *key;
1400 	struct blkio_policy_type *blkiop;
1401 	struct blkio_policy_node *pn, *pntmp;
1402 
1403 	rcu_read_lock();
1404 	do {
1405 		spin_lock_irqsave(&blkcg->lock, flags);
1406 
1407 		if (hlist_empty(&blkcg->blkg_list)) {
1408 			spin_unlock_irqrestore(&blkcg->lock, flags);
1409 			break;
1410 		}
1411 
1412 		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
1413 					blkcg_node);
1414 		key = rcu_dereference(blkg->key);
1415 		__blkiocg_del_blkio_group(blkg);
1416 
1417 		spin_unlock_irqrestore(&blkcg->lock, flags);
1418 
1419 		/*
1420 		 * This blkio_group is being unlinked as the associated cgroup is
1421 		 * going away. Let all the IO controlling policies know about
1422 		 * this event.
1423 		 */
1424 		spin_lock(&blkio_list_lock);
1425 		list_for_each_entry(blkiop, &blkio_list, list) {
1426 			if (blkiop->plid != blkg->plid)
1427 				continue;
1428 			blkiop->ops.blkio_unlink_group_fn(key, blkg);
1429 		}
1430 		spin_unlock(&blkio_list_lock);
1431 	} while (1);
1432 
1433 	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
1434 		blkio_policy_delete_node(pn);
1435 		kfree(pn);
1436 	}
1437 
1438 	free_css_id(&blkio_subsys, &blkcg->css);
1439 	rcu_read_unlock();
1440 	if (blkcg != &blkio_root_cgroup)
1441 		kfree(blkcg);
1442 }
1443 
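/*
 * Allocate and initialise a blkio_cgroup; the root cgroup uses the
 * statically allocated blkio_root_cgroup instead.
 */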
1444 static struct cgroup_subsys_state *
1445 blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1446 {
1447 	struct blkio_cgroup *blkcg;
1448 	struct cgroup *parent = cgroup->parent;
1449 
1450 	if (!parent) {
1451 		blkcg = &blkio_root_cgroup;
1452 		goto done;
1453 	}
1454 
1455 	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
1456 	if (!blkcg)
1457 		return ERR_PTR(-ENOMEM);
1458 
1459 	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
1460 done:
1461 	spin_lock_init(&blkcg->lock);
1462 	INIT_HLIST_HEAD(&blkcg->blkg_list);
1463 
1464 	INIT_LIST_HEAD(&blkcg->policy_list);
1465 	return &blkcg->css;
1466 }
1467 
1468 /*
1469  * We cannot support shared io contexts, as we have no means to support
1470  * two tasks with the same ioc in two different groups without major rework
1471  * of the main cic data structures.  For now we allow a task to change
1472  * its cgroup only if it's the only owner of its ioc.
1473  */
1474 static int blkiocg_can_attach(struct cgroup_subsys *subsys,
1475 				struct cgroup *cgroup, struct task_struct *tsk,
1476 				bool threadgroup)
1477 {
1478 	struct io_context *ioc;
1479 	int ret = 0;
1480 
1481 	/* task_lock() is needed to avoid races with exit_io_context() */
1482 	task_lock(tsk);
1483 	ioc = tsk->io_context;
1484 	if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1485 		ret = -EINVAL;
1486 	task_unlock(tsk);
1487 
1488 	return ret;
1489 }
1490 
1491 static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
1492 				struct cgroup *prev, struct task_struct *tsk,
1493 				bool threadgroup)
1494 {
1495 	struct io_context *ioc;
1496 
1497 	task_lock(tsk);
1498 	ioc = tsk->io_context;
1499 	if (ioc)
1500 		ioc->cgroup_changed = 1;
1501 	task_unlock(tsk);
1502 }
1503 
1504 void blkio_policy_register(struct blkio_policy_type *blkiop)
1505 {
1506 	spin_lock(&blkio_list_lock);
1507 	list_add_tail(&blkiop->list, &blkio_list);
1508 	spin_unlock(&blkio_list_lock);
1509 }
1510 EXPORT_SYMBOL_GPL(blkio_policy_register);
1511 
1512 void blkio_policy_unregister(struct blkio_policy_type *blkiop)
1513 {
1514 	spin_lock(&blkio_list_lock);
1515 	list_del_init(&blkiop->list);
1516 	spin_unlock(&blkio_list_lock);
1517 }
1518 EXPORT_SYMBOL_GPL(blkio_policy_unregister);
1519 
1520 static int __init init_cgroup_blkio(void)
1521 {
1522 	return cgroup_load_subsys(&blkio_subsys);
1523 }
1524 
1525 static void __exit exit_cgroup_blkio(void)
1526 {
1527 	cgroup_unload_subsys(&blkio_subsys);
1528 }
1529 
1530 module_init(init_cgroup_blkio);
1531 module_exit(exit_cgroup_blkio);
1532 MODULE_LICENSE("GPL");
1533