xref: /openbmc/linux/block/blk-cgroup.c (revision 1ab142d4)
1 /*
2  * Common Block IO controller cgroup interface
3  *
4  * Based on ideas and code from CFQ, CFS and BFQ:
5  * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6  *
7  * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8  *		      Paolo Valente <paolo.valente@unimore.it>
9  *
10  * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
11  * 	              Nauman Rafique <nauman@google.com>
12  */
13 #include <linux/ioprio.h>
14 #include <linux/seq_file.h>
15 #include <linux/kdev_t.h>
16 #include <linux/module.h>
17 #include <linux/err.h>
18 #include <linux/blkdev.h>
19 #include <linux/slab.h>
20 #include "blk-cgroup.h"
21 #include <linux/genhd.h>
22 
23 #define MAX_KEY_LEN 100
24 
25 static DEFINE_SPINLOCK(blkio_list_lock);
26 static LIST_HEAD(blkio_list);
27 
28 struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
29 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
30 
31 static struct cgroup_subsys_state *blkiocg_create(struct cgroup *);
32 static int blkiocg_can_attach(struct cgroup *, struct cgroup_taskset *);
33 static void blkiocg_attach(struct cgroup *, struct cgroup_taskset *);
34 static void blkiocg_destroy(struct cgroup *);
35 static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
36 
37 /* Helpers to encode/decode the cft->private value of a cgroup file */
38 #define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
39 /* What policy owns the file, proportional or throttle */
40 #define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
41 #define BLKIOFILE_ATTR(val)		((val) & 0xffff)
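/*
 * Editor's note (illustrative sketch, not part of the original source): how
 * the helpers above pack and unpack cft->private.  The enum values come from
 * blk-cgroup.h; the variable names are made up for the example.
 *
 *	unsigned int priv = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
 *					      BLKIO_THROTL_read_bps_device);
 *	BLKIOFILE_POLICY(priv) recovers BLKIO_POLICY_THROTL (upper 16 bits);
 *	BLKIOFILE_ATTR(priv) recovers BLKIO_THROTL_read_bps_device (low 16 bits).
 */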
42 
43 struct cgroup_subsys blkio_subsys = {
44 	.name = "blkio",
45 	.create = blkiocg_create,
46 	.can_attach = blkiocg_can_attach,
47 	.attach = blkiocg_attach,
48 	.destroy = blkiocg_destroy,
49 	.populate = blkiocg_populate,
50 #ifdef CONFIG_BLK_CGROUP
51 	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
52 	.subsys_id = blkio_subsys_id,
53 #endif
54 	.use_id = 1,
55 	.module = THIS_MODULE,
56 };
57 EXPORT_SYMBOL_GPL(blkio_subsys);
58 
59 static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
60 					    struct blkio_policy_node *pn)
61 {
62 	list_add(&pn->node, &blkcg->policy_list);
63 }
64 
65 static inline bool cftype_blkg_same_policy(struct cftype *cft,
66 			struct blkio_group *blkg)
67 {
68 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
69 
70 	if (blkg->plid == plid)
71 		return 1;
72 
73 	return 0;
74 }
75 
76 /* Determines if policy node matches cgroup file being accessed */
77 static inline bool pn_matches_cftype(struct cftype *cft,
78 			struct blkio_policy_node *pn)
79 {
80 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
81 	int fileid = BLKIOFILE_ATTR(cft->private);
82 
83 	return (plid == pn->plid && fileid == pn->fileid);
84 }
85 
86 /* Must be called with blkcg->lock held */
87 static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
88 {
89 	list_del(&pn->node);
90 }
91 
92 /* Must be called with blkcg->lock held */
93 static struct blkio_policy_node *
94 blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
95 		enum blkio_policy_id plid, int fileid)
96 {
97 	struct blkio_policy_node *pn;
98 
99 	list_for_each_entry(pn, &blkcg->policy_list, node) {
100 		if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
101 			return pn;
102 	}
103 
104 	return NULL;
105 }
106 
107 struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
108 {
109 	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
110 			    struct blkio_cgroup, css);
111 }
112 EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
113 
114 struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
115 {
116 	return container_of(task_subsys_state(tsk, blkio_subsys_id),
117 			    struct blkio_cgroup, css);
118 }
119 EXPORT_SYMBOL_GPL(task_blkio_cgroup);
120 
121 static inline void
122 blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
123 {
124 	struct blkio_policy_type *blkiop;
125 
126 	list_for_each_entry(blkiop, &blkio_list, list) {
127 		/* If this policy does not own the blkg, do not send updates */
128 		if (blkiop->plid != blkg->plid)
129 			continue;
130 		if (blkiop->ops.blkio_update_group_weight_fn)
131 			blkiop->ops.blkio_update_group_weight_fn(blkg->key,
132 							blkg, weight);
133 	}
134 }
135 
136 static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
137 				int fileid)
138 {
139 	struct blkio_policy_type *blkiop;
140 
141 	list_for_each_entry(blkiop, &blkio_list, list) {
142 
143 		/* If this policy does not own the blkg, do not send updates */
144 		if (blkiop->plid != blkg->plid)
145 			continue;
146 
147 		if (fileid == BLKIO_THROTL_read_bps_device
148 		    && blkiop->ops.blkio_update_group_read_bps_fn)
149 			blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
150 								blkg, bps);
151 
152 		if (fileid == BLKIO_THROTL_write_bps_device
153 		    && blkiop->ops.blkio_update_group_write_bps_fn)
154 			blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
155 								blkg, bps);
156 	}
157 }
158 
159 static inline void blkio_update_group_iops(struct blkio_group *blkg,
160 			unsigned int iops, int fileid)
161 {
162 	struct blkio_policy_type *blkiop;
163 
164 	list_for_each_entry(blkiop, &blkio_list, list) {
165 
166 		/* If this policy does not own the blkg, do not send updates */
167 		if (blkiop->plid != blkg->plid)
168 			continue;
169 
170 		if (fileid == BLKIO_THROTL_read_iops_device
171 		    && blkiop->ops.blkio_update_group_read_iops_fn)
172 			blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
173 								blkg, iops);
174 
175 		if (fileid == BLKIO_THROTL_write_iops_device
176 		    && blkiop->ops.blkio_update_group_write_iops_fn)
177 			blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
178 								blkg, iops);
179 	}
180 }
181 
182 /*
183  * Add to the appropriate stat variable depending on the request type.
184  * This should be called with the blkg->stats_lock held.
185  */
186 static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
187 				bool sync)
188 {
189 	if (direction)
190 		stat[BLKIO_STAT_WRITE] += add;
191 	else
192 		stat[BLKIO_STAT_READ] += add;
193 	if (sync)
194 		stat[BLKIO_STAT_SYNC] += add;
195 	else
196 		stat[BLKIO_STAT_ASYNC] += add;
197 }
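/*
 * Editor's note: for example, blkio_add_stat(stat, 1, true, true) for one
 * synchronous write bumps both stat[BLKIO_STAT_WRITE] and
 * stat[BLKIO_STAT_SYNC]; the "Total" reported to userspace is computed later
 * as READ + WRITE (see blkio_get_stat()).
 */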
198 
199 /*
200  * Decrements the appropriate stat variable depending on the request type.
201  * Panics (BUG) if the value is already zero.
202  * This should be called with the blkg->stats_lock held.
203  */
204 static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
205 {
206 	if (direction) {
207 		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
208 		stat[BLKIO_STAT_WRITE]--;
209 	} else {
210 		BUG_ON(stat[BLKIO_STAT_READ] == 0);
211 		stat[BLKIO_STAT_READ]--;
212 	}
213 	if (sync) {
214 		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
215 		stat[BLKIO_STAT_SYNC]--;
216 	} else {
217 		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
218 		stat[BLKIO_STAT_ASYNC]--;
219 	}
220 }
221 
222 #ifdef CONFIG_DEBUG_BLK_CGROUP
223 /* This should be called with the blkg->stats_lock held. */
224 static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
225 						struct blkio_group *curr_blkg)
226 {
227 	if (blkio_blkg_waiting(&blkg->stats))
228 		return;
229 	if (blkg == curr_blkg)
230 		return;
231 	blkg->stats.start_group_wait_time = sched_clock();
232 	blkio_mark_blkg_waiting(&blkg->stats);
233 }
234 
235 /* This should be called with the blkg->stats_lock held. */
236 static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
237 {
238 	unsigned long long now;
239 
240 	if (!blkio_blkg_waiting(stats))
241 		return;
242 
243 	now = sched_clock();
244 	if (time_after64(now, stats->start_group_wait_time))
245 		stats->group_wait_time += now - stats->start_group_wait_time;
246 	blkio_clear_blkg_waiting(stats);
247 }
248 
249 /* This should be called with the blkg->stats_lock held. */
250 static void blkio_end_empty_time(struct blkio_group_stats *stats)
251 {
252 	unsigned long long now;
253 
254 	if (!blkio_blkg_empty(stats))
255 		return;
256 
257 	now = sched_clock();
258 	if (time_after64(now, stats->start_empty_time))
259 		stats->empty_time += now - stats->start_empty_time;
260 	blkio_clear_blkg_empty(stats);
261 }
262 
263 void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
264 {
265 	unsigned long flags;
266 
267 	spin_lock_irqsave(&blkg->stats_lock, flags);
268 	BUG_ON(blkio_blkg_idling(&blkg->stats));
269 	blkg->stats.start_idle_time = sched_clock();
270 	blkio_mark_blkg_idling(&blkg->stats);
271 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
272 }
273 EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
274 
275 void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
276 {
277 	unsigned long flags;
278 	unsigned long long now;
279 	struct blkio_group_stats *stats;
280 
281 	spin_lock_irqsave(&blkg->stats_lock, flags);
282 	stats = &blkg->stats;
283 	if (blkio_blkg_idling(stats)) {
284 		now = sched_clock();
285 		if (time_after64(now, stats->start_idle_time))
286 			stats->idle_time += now - stats->start_idle_time;
287 		blkio_clear_blkg_idling(stats);
288 	}
289 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
290 }
291 EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
292 
293 void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
294 {
295 	unsigned long flags;
296 	struct blkio_group_stats *stats;
297 
298 	spin_lock_irqsave(&blkg->stats_lock, flags);
299 	stats = &blkg->stats;
300 	stats->avg_queue_size_sum +=
301 			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
302 			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
303 	stats->avg_queue_size_samples++;
304 	blkio_update_group_wait_time(stats);
305 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
306 }
307 EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
308 
309 void blkiocg_set_start_empty_time(struct blkio_group *blkg)
310 {
311 	unsigned long flags;
312 	struct blkio_group_stats *stats;
313 
314 	spin_lock_irqsave(&blkg->stats_lock, flags);
315 	stats = &blkg->stats;
316 
317 	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
318 			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
319 		spin_unlock_irqrestore(&blkg->stats_lock, flags);
320 		return;
321 	}
322 
323 	/*
324 	 * The group is already marked empty. This can happen if a cfqq got a
325 	 * new request in the parent group and moved to this group while being
326 	 * added to the service tree. Just ignore the event and move on.
327 	 */
328 	if (blkio_blkg_empty(stats)) {
329 		spin_unlock_irqrestore(&blkg->stats_lock, flags);
330 		return;
331 	}
332 
333 	stats->start_empty_time = sched_clock();
334 	blkio_mark_blkg_empty(stats);
335 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
336 }
337 EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
338 
339 void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
340 			unsigned long dequeue)
341 {
342 	blkg->stats.dequeue += dequeue;
343 }
344 EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
345 #else
346 static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
347 					struct blkio_group *curr_blkg) {}
348 static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
349 #endif
350 
351 void blkiocg_update_io_add_stats(struct blkio_group *blkg,
352 			struct blkio_group *curr_blkg, bool direction,
353 			bool sync)
354 {
355 	unsigned long flags;
356 
357 	spin_lock_irqsave(&blkg->stats_lock, flags);
358 	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
359 			sync);
360 	blkio_end_empty_time(&blkg->stats);
361 	blkio_set_start_group_wait_time(blkg, curr_blkg);
362 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
363 }
364 EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
365 
366 void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
367 						bool direction, bool sync)
368 {
369 	unsigned long flags;
370 
371 	spin_lock_irqsave(&blkg->stats_lock, flags);
372 	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
373 					direction, sync);
374 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
375 }
376 EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
377 
378 void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
379 				unsigned long unaccounted_time)
380 {
381 	unsigned long flags;
382 
383 	spin_lock_irqsave(&blkg->stats_lock, flags);
384 	blkg->stats.time += time;
385 #ifdef CONFIG_DEBUG_BLK_CGROUP
386 	blkg->stats.unaccounted_time += unaccounted_time;
387 #endif
388 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
389 }
390 EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
391 
392 /*
393  * Should be called under the rcu read lock or the queue lock to make sure
394  * the blkg pointer is valid.
395  */
396 void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
397 				uint64_t bytes, bool direction, bool sync)
398 {
399 	struct blkio_group_stats_cpu *stats_cpu;
400 	unsigned long flags;
401 
402 	/*
403 	 * Disable interrupts to provide mutual exclusion between two writes
404 	 * on the same CPU.  This is probably not needed on 64-bit; that case
405 	 * is not optimized yet.
406 	 */
407 	local_irq_save(flags);
408 
409 	stats_cpu = this_cpu_ptr(blkg->stats_cpu);
410 
411 	u64_stats_update_begin(&stats_cpu->syncp);
412 	stats_cpu->sectors += bytes >> 9;
413 	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
414 			1, direction, sync);
415 	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
416 			bytes, direction, sync);
417 	u64_stats_update_end(&stats_cpu->syncp);
418 	local_irq_restore(flags);
419 }
420 EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
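/*
 * Editor's note: the u64_stats_update_begin/end pair above is the writer side
 * of the per-cpu seqcount protocol; readers retry with
 * u64_stats_fetch_begin()/u64_stats_fetch_retry(), as blkio_read_stat_cpu()
 * below does:
 *
 *	do {
 *		start = u64_stats_fetch_begin(&stats_cpu->syncp);
 *		tval = stats_cpu->stat_arr_cpu[type][sub_type];
 *	} while (u64_stats_fetch_retry(&stats_cpu->syncp, start));
 */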
421 
422 void blkiocg_update_completion_stats(struct blkio_group *blkg,
423 	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
424 {
425 	struct blkio_group_stats *stats;
426 	unsigned long flags;
427 	unsigned long long now = sched_clock();
428 
429 	spin_lock_irqsave(&blkg->stats_lock, flags);
430 	stats = &blkg->stats;
431 	if (time_after64(now, io_start_time))
432 		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
433 				now - io_start_time, direction, sync);
434 	if (time_after64(io_start_time, start_time))
435 		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
436 				io_start_time - start_time, direction, sync);
437 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
438 }
439 EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
440 
441 /*  Merged stats are per cpu.  */
442 void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
443 					bool sync)
444 {
445 	struct blkio_group_stats_cpu *stats_cpu;
446 	unsigned long flags;
447 
448 	/*
449 	 * Disable interrupts to provide mutual exclusion between two writes
450 	 * on the same CPU.  This is probably not needed on 64-bit; that case
451 	 * is not optimized yet.
452 	 */
453 	local_irq_save(flags);
454 
455 	stats_cpu = this_cpu_ptr(blkg->stats_cpu);
456 
457 	u64_stats_update_begin(&stats_cpu->syncp);
458 	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
459 				direction, sync);
460 	u64_stats_update_end(&stats_cpu->syncp);
461 	local_irq_restore(flags);
462 }
463 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
464 
465 /*
466  * This function allocates the per cpu stats for blkio_group. Should be
467  * called from a sleepable context, as alloc_percpu() requires that.
468  */
469 int blkio_alloc_blkg_stats(struct blkio_group *blkg)
470 {
471 	/* Allocate memory for per cpu stats */
472 	blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
473 	if (!blkg->stats_cpu)
474 		return -ENOMEM;
475 	return 0;
476 }
477 EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
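/*
 * Editor's note, a rough usage sketch (the my_group type and the error
 * handling are hypothetical, but the call order mirrors what policies such as
 * blk-throttle do): embed a struct blkio_group in the policy's own per-group
 * object, allocate the per cpu stats, then register the group.
 *
 *	struct my_group *g = kzalloc(sizeof(*g), GFP_KERNEL);
 *	if (!g || blkio_alloc_blkg_stats(&g->blkg)) {
 *		kfree(g);
 *		return NULL;
 *	}
 *	blkiocg_add_blkio_group(blkcg, &g->blkg, q, dev, BLKIO_POLICY_THROTL);
 */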
478 
479 void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
480 		struct blkio_group *blkg, void *key, dev_t dev,
481 		enum blkio_policy_id plid)
482 {
483 	unsigned long flags;
484 
485 	spin_lock_irqsave(&blkcg->lock, flags);
486 	spin_lock_init(&blkg->stats_lock);
487 	rcu_assign_pointer(blkg->key, key);
488 	blkg->blkcg_id = css_id(&blkcg->css);
489 	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
490 	blkg->plid = plid;
491 	spin_unlock_irqrestore(&blkcg->lock, flags);
492 	/* Need to take css reference? */
493 	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
494 	blkg->dev = dev;
495 }
496 EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
497 
498 static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
499 {
500 	hlist_del_init_rcu(&blkg->blkcg_node);
501 	blkg->blkcg_id = 0;
502 }
503 
504 /*
505  * Returns 0 if the blkio_group was still on the cgroup list. Otherwise returns
506  * 1, indicating that the blkio_group was unhashed by the time we got to it.
507  */
508 int blkiocg_del_blkio_group(struct blkio_group *blkg)
509 {
510 	struct blkio_cgroup *blkcg;
511 	unsigned long flags;
512 	struct cgroup_subsys_state *css;
513 	int ret = 1;
514 
515 	rcu_read_lock();
516 	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
517 	if (css) {
518 		blkcg = container_of(css, struct blkio_cgroup, css);
519 		spin_lock_irqsave(&blkcg->lock, flags);
520 		if (!hlist_unhashed(&blkg->blkcg_node)) {
521 			__blkiocg_del_blkio_group(blkg);
522 			ret = 0;
523 		}
524 		spin_unlock_irqrestore(&blkcg->lock, flags);
525 	}
526 
527 	rcu_read_unlock();
528 	return ret;
529 }
530 EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
531 
532 /* called under rcu_read_lock(). */
533 struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
534 {
535 	struct blkio_group *blkg;
536 	struct hlist_node *n;
537 	void *__key;
538 
539 	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
540 		__key = blkg->key;
541 		if (__key == key)
542 			return blkg;
543 	}
544 
545 	return NULL;
546 }
547 EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
548 
549 static void blkio_reset_stats_cpu(struct blkio_group *blkg)
550 {
551 	struct blkio_group_stats_cpu *stats_cpu;
552 	int i, j, k;
553 	/*
554 	 * Note: on a 64-bit arch this is not an issue.  On 32-bit this may
555 	 * return an inconsistent value, because a 64-bit update is not
556 	 * atomic there.  Handling that corner case would complicate the
557 	 * code considerably (sending IPIs to cpus, accounting for stats of
558 	 * offline cpus, etc.).
559 	 *
560 	 * Resetting stats is more of a debug feature anyway and this is a
561 	 * corner case, so the code is not being complicated until this
562 	 * becomes a real issue.
563 	 */
564 	for_each_possible_cpu(i) {
565 		stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
566 		stats_cpu->sectors = 0;
567 		for (j = 0; j < BLKIO_STAT_CPU_NR; j++)
568 			for (k = 0; k < BLKIO_STAT_TOTAL; k++)
569 				stats_cpu->stat_arr_cpu[j][k] = 0;
570 	}
571 }
572 
573 static int
574 blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
575 {
576 	struct blkio_cgroup *blkcg;
577 	struct blkio_group *blkg;
578 	struct blkio_group_stats *stats;
579 	struct hlist_node *n;
580 	uint64_t queued[BLKIO_STAT_TOTAL];
581 	int i;
582 #ifdef CONFIG_DEBUG_BLK_CGROUP
583 	bool idling, waiting, empty;
584 	unsigned long long now = sched_clock();
585 #endif
586 
587 	blkcg = cgroup_to_blkio_cgroup(cgroup);
588 	spin_lock_irq(&blkcg->lock);
589 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
590 		spin_lock(&blkg->stats_lock);
591 		stats = &blkg->stats;
592 #ifdef CONFIG_DEBUG_BLK_CGROUP
593 		idling = blkio_blkg_idling(stats);
594 		waiting = blkio_blkg_waiting(stats);
595 		empty = blkio_blkg_empty(stats);
596 #endif
597 		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
598 			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
599 		memset(stats, 0, sizeof(struct blkio_group_stats));
600 		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
601 			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
602 #ifdef CONFIG_DEBUG_BLK_CGROUP
603 		if (idling) {
604 			blkio_mark_blkg_idling(stats);
605 			stats->start_idle_time = now;
606 		}
607 		if (waiting) {
608 			blkio_mark_blkg_waiting(stats);
609 			stats->start_group_wait_time = now;
610 		}
611 		if (empty) {
612 			blkio_mark_blkg_empty(stats);
613 			stats->start_empty_time = now;
614 		}
615 #endif
616 		spin_unlock(&blkg->stats_lock);
617 
618 		/* Reset Per cpu stats which don't take blkg->stats_lock */
619 		blkio_reset_stats_cpu(blkg);
620 	}
621 
622 	spin_unlock_irq(&blkcg->lock);
623 	return 0;
624 }
625 
626 static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
627 				int chars_left, bool diskname_only)
628 {
629 	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
630 	chars_left -= strlen(str);
631 	if (chars_left <= 0) {
632 		printk(KERN_WARNING
633 			"Possibly incorrect cgroup stat display format\n");
634 		return;
635 	}
636 	if (diskname_only)
637 		return;
638 	switch (type) {
639 	case BLKIO_STAT_READ:
640 		strlcat(str, " Read", chars_left);
641 		break;
642 	case BLKIO_STAT_WRITE:
643 		strlcat(str, " Write", chars_left);
644 		break;
645 	case BLKIO_STAT_SYNC:
646 		strlcat(str, " Sync", chars_left);
647 		break;
648 	case BLKIO_STAT_ASYNC:
649 		strlcat(str, " Async", chars_left);
650 		break;
651 	case BLKIO_STAT_TOTAL:
652 		strlcat(str, " Total", chars_left);
653 		break;
654 	default:
655 		strlcat(str, " Invalid", chars_left);
656 	}
657 }
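/*
 * Editor's note: examples of the keys produced above for dev_t 8:16 --
 * "8:16 Read", "8:16 Write", "8:16 Sync", "8:16 Async", "8:16 Total",
 * or just "8:16" when diskname_only is true (used for single-value stats).
 */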
658 
659 static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
660 				struct cgroup_map_cb *cb, dev_t dev)
661 {
662 	blkio_get_key_name(0, dev, str, chars_left, true);
663 	cb->fill(cb, str, val);
664 	return val;
665 }
666 
667 
668 static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
669 			enum stat_type_cpu type, enum stat_sub_type sub_type)
670 {
671 	int cpu;
672 	struct blkio_group_stats_cpu *stats_cpu;
673 	u64 val = 0, tval;
674 
675 	for_each_possible_cpu(cpu) {
676 		unsigned int start;
677 		stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu);
678 
679 		do {
680 			start = u64_stats_fetch_begin(&stats_cpu->syncp);
681 			if (type == BLKIO_STAT_CPU_SECTORS)
682 				tval = stats_cpu->sectors;
683 			else
684 				tval = stats_cpu->stat_arr_cpu[type][sub_type];
685 		} while (u64_stats_fetch_retry(&stats_cpu->syncp, start));
686 
687 		val += tval;
688 	}
689 
690 	return val;
691 }
692 
693 static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
694 		struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type)
695 {
696 	uint64_t disk_total, val;
697 	char key_str[MAX_KEY_LEN];
698 	enum stat_sub_type sub_type;
699 
700 	if (type == BLKIO_STAT_CPU_SECTORS) {
701 		val = blkio_read_stat_cpu(blkg, type, 0);
702 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev);
703 	}
704 
705 	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
706 			sub_type++) {
707 		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
708 		val = blkio_read_stat_cpu(blkg, type, sub_type);
709 		cb->fill(cb, key_str, val);
710 	}
711 
712 	disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
713 			blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);
714 
715 	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
716 	cb->fill(cb, key_str, disk_total);
717 	return disk_total;
718 }
719 
720 /* This should be called with blkg->stats_lock held */
721 static uint64_t blkio_get_stat(struct blkio_group *blkg,
722 		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
723 {
724 	uint64_t disk_total;
725 	char key_str[MAX_KEY_LEN];
726 	enum stat_sub_type sub_type;
727 
728 	if (type == BLKIO_STAT_TIME)
729 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
730 					blkg->stats.time, cb, dev);
731 #ifdef CONFIG_DEBUG_BLK_CGROUP
732 	if (type == BLKIO_STAT_UNACCOUNTED_TIME)
733 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
734 					blkg->stats.unaccounted_time, cb, dev);
735 	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
736 		uint64_t sum = blkg->stats.avg_queue_size_sum;
737 		uint64_t samples = blkg->stats.avg_queue_size_samples;
738 		if (samples)
739 			do_div(sum, samples);
740 		else
741 			sum = 0;
742 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
743 	}
744 	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
745 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
746 					blkg->stats.group_wait_time, cb, dev);
747 	if (type == BLKIO_STAT_IDLE_TIME)
748 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
749 					blkg->stats.idle_time, cb, dev);
750 	if (type == BLKIO_STAT_EMPTY_TIME)
751 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
752 					blkg->stats.empty_time, cb, dev);
753 	if (type == BLKIO_STAT_DEQUEUE)
754 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
755 					blkg->stats.dequeue, cb, dev);
756 #endif
757 
758 	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
759 			sub_type++) {
760 		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
761 		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
762 	}
763 	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
764 			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
765 	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
766 	cb->fill(cb, key_str, disk_total);
767 	return disk_total;
768 }
769 
770 static int blkio_policy_parse_and_set(char *buf,
771 	struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
772 {
773 	struct gendisk *disk = NULL;
774 	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
775 	unsigned long major, minor;
776 	int i = 0, ret = -EINVAL;
777 	int part;
778 	dev_t dev;
779 	u64 temp;
780 
781 	memset(s, 0, sizeof(s));
782 
783 	while ((p = strsep(&buf, " ")) != NULL) {
784 		if (!*p)
785 			continue;
786 
787 		s[i++] = p;
788 
789 		/* Prevent the user from inputting too many fields */
790 		if (i == 3)
791 			break;
792 	}
793 
794 	if (i != 2)
795 		goto out;
796 
797 	p = strsep(&s[0], ":");
798 	if (p != NULL)
799 		major_s = p;
800 	else
801 		goto out;
802 
803 	minor_s = s[0];
804 	if (!minor_s)
805 		goto out;
806 
807 	if (strict_strtoul(major_s, 10, &major))
808 		goto out;
809 
810 	if (strict_strtoul(minor_s, 10, &minor))
811 		goto out;
812 
813 	dev = MKDEV(major, minor);
814 
815 	if (strict_strtoull(s[1], 10, &temp))
816 		goto out;
817 
818 	/* For rule removal, do not check for device presence. */
819 	if (temp) {
820 		disk = get_gendisk(dev, &part);
821 		if (!disk || part) {
822 			ret = -ENODEV;
823 			goto out;
824 		}
825 	}
826 
827 	newpn->dev = dev;
828 
829 	switch (plid) {
830 	case BLKIO_POLICY_PROP:
831 		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
832 		     temp > BLKIO_WEIGHT_MAX)
833 			goto out;
834 
835 		newpn->plid = plid;
836 		newpn->fileid = fileid;
837 		newpn->val.weight = temp;
838 		break;
839 	case BLKIO_POLICY_THROTL:
840 		switch (fileid) {
841 		case BLKIO_THROTL_read_bps_device:
842 		case BLKIO_THROTL_write_bps_device:
843 			newpn->plid = plid;
844 			newpn->fileid = fileid;
845 			newpn->val.bps = temp;
846 			break;
847 		case BLKIO_THROTL_read_iops_device:
848 		case BLKIO_THROTL_write_iops_device:
849 			if (temp > THROTL_IOPS_MAX)
850 				goto out;
851 
852 			newpn->plid = plid;
853 			newpn->fileid = fileid;
854 			newpn->val.iops = (unsigned int)temp;
855 			break;
856 		}
857 		break;
858 	default:
859 		BUG();
860 	}
861 	ret = 0;
862 out:
863 	put_disk(disk);
864 	return ret;
865 }
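/*
 * Editor's note: examples of strings accepted above (device numbers are
 * illustrative).  "8:16 500" sets a per-device weight for BLKIO_POLICY_PROP,
 * "8:16 1048576" sets a throttling limit, and "8:16 0" removes an existing
 * rule -- note that for removal the device does not have to be present.
 */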
866 
867 unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
868 			      dev_t dev)
869 {
870 	struct blkio_policy_node *pn;
871 	unsigned long flags;
872 	unsigned int weight;
873 
874 	spin_lock_irqsave(&blkcg->lock, flags);
875 
876 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
877 				BLKIO_PROP_weight_device);
878 	if (pn)
879 		weight = pn->val.weight;
880 	else
881 		weight = blkcg->weight;
882 
883 	spin_unlock_irqrestore(&blkcg->lock, flags);
884 
885 	return weight;
886 }
887 EXPORT_SYMBOL_GPL(blkcg_get_weight);
888 
889 uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
890 {
891 	struct blkio_policy_node *pn;
892 	unsigned long flags;
893 	uint64_t bps = -1;
894 
895 	spin_lock_irqsave(&blkcg->lock, flags);
896 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
897 				BLKIO_THROTL_read_bps_device);
898 	if (pn)
899 		bps = pn->val.bps;
900 	spin_unlock_irqrestore(&blkcg->lock, flags);
901 
902 	return bps;
903 }
904 
905 uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
906 {
907 	struct blkio_policy_node *pn;
908 	unsigned long flags;
909 	uint64_t bps = -1;
910 
911 	spin_lock_irqsave(&blkcg->lock, flags);
912 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
913 				BLKIO_THROTL_write_bps_device);
914 	if (pn)
915 		bps = pn->val.bps;
916 	spin_unlock_irqrestore(&blkcg->lock, flags);
917 
918 	return bps;
919 }
920 
921 unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
922 {
923 	struct blkio_policy_node *pn;
924 	unsigned long flags;
925 	unsigned int iops = -1;
926 
927 	spin_lock_irqsave(&blkcg->lock, flags);
928 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
929 				BLKIO_THROTL_read_iops_device);
930 	if (pn)
931 		iops = pn->val.iops;
932 	spin_unlock_irqrestore(&blkcg->lock, flags);
933 
934 	return iops;
935 }
936 
937 unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
938 {
939 	struct blkio_policy_node *pn;
940 	unsigned long flags;
941 	unsigned int iops = -1;
942 
943 	spin_lock_irqsave(&blkcg->lock, flags);
944 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
945 				BLKIO_THROTL_write_iops_device);
946 	if (pn)
947 		iops = pn->val.iops;
948 	spin_unlock_irqrestore(&blkcg->lock, flags);
949 
950 	return iops;
951 }
952 
953 /* Checks whether the user asked to delete a policy rule */
954 static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
955 {
956 	switch (pn->plid) {
957 	case BLKIO_POLICY_PROP:
958 		if (pn->val.weight == 0)
959 			return 1;
960 		break;
961 	case BLKIO_POLICY_THROTL:
962 		switch (pn->fileid) {
963 		case BLKIO_THROTL_read_bps_device:
964 		case BLKIO_THROTL_write_bps_device:
965 			if (pn->val.bps == 0)
966 				return 1;
967 			break;
968 		case BLKIO_THROTL_read_iops_device:
969 		case BLKIO_THROTL_write_iops_device:
970 			if (pn->val.iops == 0)
971 				return 1;
972 		}
973 		break;
974 	default:
975 		BUG();
976 	}
977 
978 	return 0;
979 }
980 
981 static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
982 					struct blkio_policy_node *newpn)
983 {
984 	switch (oldpn->plid) {
985 	case BLKIO_POLICY_PROP:
986 		oldpn->val.weight = newpn->val.weight;
987 		break;
988 	case BLKIO_POLICY_THROTL:
989 		switch (newpn->fileid) {
990 		case BLKIO_THROTL_read_bps_device:
991 		case BLKIO_THROTL_write_bps_device:
992 			oldpn->val.bps = newpn->val.bps;
993 			break;
994 		case BLKIO_THROTL_read_iops_device:
995 		case BLKIO_THROTL_write_iops_device:
996 			oldpn->val.iops = newpn->val.iops;
997 		}
998 		break;
999 	default:
1000 		BUG();
1001 	}
1002 }
1003 
1004 /*
1005  * Some rules/values in blkg have changed. Propagate those to respective
1006  * policies.
1007  */
1008 static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
1009 		struct blkio_group *blkg, struct blkio_policy_node *pn)
1010 {
1011 	unsigned int weight, iops;
1012 	u64 bps;
1013 
1014 	switch (pn->plid) {
1015 	case BLKIO_POLICY_PROP:
1016 		weight = pn->val.weight ? pn->val.weight :
1017 				blkcg->weight;
1018 		blkio_update_group_weight(blkg, weight);
1019 		break;
1020 	case BLKIO_POLICY_THROTL:
1021 		switch (pn->fileid) {
1022 		case BLKIO_THROTL_read_bps_device:
1023 		case BLKIO_THROTL_write_bps_device:
1024 			bps = pn->val.bps ? pn->val.bps : (-1);
1025 			blkio_update_group_bps(blkg, bps, pn->fileid);
1026 			break;
1027 		case BLKIO_THROTL_read_iops_device:
1028 		case BLKIO_THROTL_write_iops_device:
1029 			iops = pn->val.iops ? pn->val.iops : (-1);
1030 			blkio_update_group_iops(blkg, iops, pn->fileid);
1031 			break;
1032 		}
1033 		break;
1034 	default:
1035 		BUG();
1036 	}
1037 }
1038 
1039 /*
1040  * A policy node rule has been updated. Propagate this update to all the
1041  * block groups that might be affected.
1042  */
1043 static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
1044 				struct blkio_policy_node *pn)
1045 {
1046 	struct blkio_group *blkg;
1047 	struct hlist_node *n;
1048 
1049 	spin_lock(&blkio_list_lock);
1050 	spin_lock_irq(&blkcg->lock);
1051 
1052 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1053 		if (pn->dev != blkg->dev || pn->plid != blkg->plid)
1054 			continue;
1055 		blkio_update_blkg_policy(blkcg, blkg, pn);
1056 	}
1057 
1058 	spin_unlock_irq(&blkcg->lock);
1059 	spin_unlock(&blkio_list_lock);
1060 }
1061 
1062 static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
1063  				       const char *buffer)
1064 {
1065 	int ret = 0;
1066 	char *buf;
1067 	struct blkio_policy_node *newpn, *pn;
1068 	struct blkio_cgroup *blkcg;
1069 	int keep_newpn = 0;
1070 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1071 	int fileid = BLKIOFILE_ATTR(cft->private);
1072 
1073 	buf = kstrdup(buffer, GFP_KERNEL);
1074 	if (!buf)
1075 		return -ENOMEM;
1076 
1077 	newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
1078 	if (!newpn) {
1079 		ret = -ENOMEM;
1080 		goto free_buf;
1081 	}
1082 
1083 	ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
1084 	if (ret)
1085 		goto free_newpn;
1086 
1087 	blkcg = cgroup_to_blkio_cgroup(cgrp);
1088 
1089 	spin_lock_irq(&blkcg->lock);
1090 
1091 	pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
1092 	if (!pn) {
1093 		if (!blkio_delete_rule_command(newpn)) {
1094 			blkio_policy_insert_node(blkcg, newpn);
1095 			keep_newpn = 1;
1096 		}
1097 		spin_unlock_irq(&blkcg->lock);
1098 		goto update_io_group;
1099 	}
1100 
1101 	if (blkio_delete_rule_command(newpn)) {
1102 		blkio_policy_delete_node(pn);
1103 		kfree(pn);
1104 		spin_unlock_irq(&blkcg->lock);
1105 		goto update_io_group;
1106 	}
1107 	spin_unlock_irq(&blkcg->lock);
1108 
1109 	blkio_update_policy_rule(pn, newpn);
1110 
1111 update_io_group:
1112 	blkio_update_policy_node_blkg(blkcg, newpn);
1113 
1114 free_newpn:
1115 	if (!keep_newpn)
1116 		kfree(newpn);
1117 free_buf:
1118 	kfree(buf);
1119 	return ret;
1120 }
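/*
 * Editor's note on the write path above: if no matching rule exists and the
 * value is non-zero, newpn is inserted and kept; if a rule exists and the
 * value is zero, the old rule is deleted; otherwise the existing rule is
 * updated in place.  In every case the result is then propagated to the
 * affected blkio_groups via blkio_update_policy_node_blkg().
 */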
1121 
1122 static void
1123 blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
1124 {
1125 	switch (pn->plid) {
1126 		case BLKIO_POLICY_PROP:
1127 			if (pn->fileid == BLKIO_PROP_weight_device)
1128 				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1129 					MINOR(pn->dev), pn->val.weight);
1130 			break;
1131 		case BLKIO_POLICY_THROTL:
1132 			switch (pn->fileid) {
1133 			case BLKIO_THROTL_read_bps_device:
1134 			case BLKIO_THROTL_write_bps_device:
1135 				seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
1136 					MINOR(pn->dev), pn->val.bps);
1137 				break;
1138 			case BLKIO_THROTL_read_iops_device:
1139 			case BLKIO_THROTL_write_iops_device:
1140 				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1141 					MINOR(pn->dev), pn->val.iops);
1142 				break;
1143 			}
1144 			break;
1145 		default:
1146 			BUG();
1147 	}
1148 }
1149 
1150 /* cgroup files which read their data from policy nodes end up here */
1151 static void blkio_read_policy_node_files(struct cftype *cft,
1152 			struct blkio_cgroup *blkcg, struct seq_file *m)
1153 {
1154 	struct blkio_policy_node *pn;
1155 
1156 	if (!list_empty(&blkcg->policy_list)) {
1157 		spin_lock_irq(&blkcg->lock);
1158 		list_for_each_entry(pn, &blkcg->policy_list, node) {
1159 			if (!pn_matches_cftype(cft, pn))
1160 				continue;
1161 			blkio_print_policy_node(m, pn);
1162 		}
1163 		spin_unlock_irq(&blkcg->lock);
1164 	}
1165 }
1166 
1167 static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
1168 				struct seq_file *m)
1169 {
1170 	struct blkio_cgroup *blkcg;
1171 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1172 	int name = BLKIOFILE_ATTR(cft->private);
1173 
1174 	blkcg = cgroup_to_blkio_cgroup(cgrp);
1175 
1176 	switch (plid) {
1177 	case BLKIO_POLICY_PROP:
1178 		switch (name) {
1179 		case BLKIO_PROP_weight_device:
1180 			blkio_read_policy_node_files(cft, blkcg, m);
1181 			return 0;
1182 		default:
1183 			BUG();
1184 		}
1185 		break;
1186 	case BLKIO_POLICY_THROTL:
1187 		switch (name) {
1188 		case BLKIO_THROTL_read_bps_device:
1189 		case BLKIO_THROTL_write_bps_device:
1190 		case BLKIO_THROTL_read_iops_device:
1191 		case BLKIO_THROTL_write_iops_device:
1192 			blkio_read_policy_node_files(cft, blkcg, m);
1193 			return 0;
1194 		default:
1195 			BUG();
1196 		}
1197 		break;
1198 	default:
1199 		BUG();
1200 	}
1201 
1202 	return 0;
1203 }
1204 
1205 static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
1206 		struct cftype *cft, struct cgroup_map_cb *cb,
1207 		enum stat_type type, bool show_total, bool pcpu)
1208 {
1209 	struct blkio_group *blkg;
1210 	struct hlist_node *n;
1211 	uint64_t cgroup_total = 0;
1212 
1213 	rcu_read_lock();
1214 	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
1215 		if (blkg->dev) {
1216 			if (!cftype_blkg_same_policy(cft, blkg))
1217 				continue;
1218 			if (pcpu)
1219 				cgroup_total += blkio_get_stat_cpu(blkg, cb,
1220 						blkg->dev, type);
1221 			else {
1222 				spin_lock_irq(&blkg->stats_lock);
1223 				cgroup_total += blkio_get_stat(blkg, cb,
1224 						blkg->dev, type);
1225 				spin_unlock_irq(&blkg->stats_lock);
1226 			}
1227 		}
1228 	}
1229 	if (show_total)
1230 		cb->fill(cb, "Total", cgroup_total);
1231 	rcu_read_unlock();
1232 	return 0;
1233 }
1234 
1235 /* All map-type cgroup files are serviced by this function */
1236 static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1237 				struct cgroup_map_cb *cb)
1238 {
1239 	struct blkio_cgroup *blkcg;
1240 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1241 	int name = BLKIOFILE_ATTR(cft->private);
1242 
1243 	blkcg = cgroup_to_blkio_cgroup(cgrp);
1244 
1245 	switch (plid) {
1246 	case BLKIO_POLICY_PROP:
1247 		switch (name) {
1248 		case BLKIO_PROP_time:
1249 			return blkio_read_blkg_stats(blkcg, cft, cb,
1250 						BLKIO_STAT_TIME, 0, 0);
1251 		case BLKIO_PROP_sectors:
1252 			return blkio_read_blkg_stats(blkcg, cft, cb,
1253 						BLKIO_STAT_CPU_SECTORS, 0, 1);
1254 		case BLKIO_PROP_io_service_bytes:
1255 			return blkio_read_blkg_stats(blkcg, cft, cb,
1256 					BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1257 		case BLKIO_PROP_io_serviced:
1258 			return blkio_read_blkg_stats(blkcg, cft, cb,
1259 						BLKIO_STAT_CPU_SERVICED, 1, 1);
1260 		case BLKIO_PROP_io_service_time:
1261 			return blkio_read_blkg_stats(blkcg, cft, cb,
1262 						BLKIO_STAT_SERVICE_TIME, 1, 0);
1263 		case BLKIO_PROP_io_wait_time:
1264 			return blkio_read_blkg_stats(blkcg, cft, cb,
1265 						BLKIO_STAT_WAIT_TIME, 1, 0);
1266 		case BLKIO_PROP_io_merged:
1267 			return blkio_read_blkg_stats(blkcg, cft, cb,
1268 						BLKIO_STAT_CPU_MERGED, 1, 1);
1269 		case BLKIO_PROP_io_queued:
1270 			return blkio_read_blkg_stats(blkcg, cft, cb,
1271 						BLKIO_STAT_QUEUED, 1, 0);
1272 #ifdef CONFIG_DEBUG_BLK_CGROUP
1273 		case BLKIO_PROP_unaccounted_time:
1274 			return blkio_read_blkg_stats(blkcg, cft, cb,
1275 					BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
1276 		case BLKIO_PROP_dequeue:
1277 			return blkio_read_blkg_stats(blkcg, cft, cb,
1278 						BLKIO_STAT_DEQUEUE, 0, 0);
1279 		case BLKIO_PROP_avg_queue_size:
1280 			return blkio_read_blkg_stats(blkcg, cft, cb,
1281 					BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
1282 		case BLKIO_PROP_group_wait_time:
1283 			return blkio_read_blkg_stats(blkcg, cft, cb,
1284 					BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
1285 		case BLKIO_PROP_idle_time:
1286 			return blkio_read_blkg_stats(blkcg, cft, cb,
1287 						BLKIO_STAT_IDLE_TIME, 0, 0);
1288 		case BLKIO_PROP_empty_time:
1289 			return blkio_read_blkg_stats(blkcg, cft, cb,
1290 						BLKIO_STAT_EMPTY_TIME, 0, 0);
1291 #endif
1292 		default:
1293 			BUG();
1294 		}
1295 		break;
1296 	case BLKIO_POLICY_THROTL:
1297 		switch (name) {
1298 		case BLKIO_THROTL_io_service_bytes:
1299 			return blkio_read_blkg_stats(blkcg, cft, cb,
1300 						BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1301 		case BLKIO_THROTL_io_serviced:
1302 			return blkio_read_blkg_stats(blkcg, cft, cb,
1303 						BLKIO_STAT_CPU_SERVICED, 1, 1);
1304 		default:
1305 			BUG();
1306 		}
1307 		break;
1308 	default:
1309 		BUG();
1310 	}
1311 
1312 	return 0;
1313 }
1314 
1315 static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
1316 {
1317 	struct blkio_group *blkg;
1318 	struct hlist_node *n;
1319 	struct blkio_policy_node *pn;
1320 
1321 	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
1322 		return -EINVAL;
1323 
1324 	spin_lock(&blkio_list_lock);
1325 	spin_lock_irq(&blkcg->lock);
1326 	blkcg->weight = (unsigned int)val;
1327 
1328 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1329 		pn = blkio_policy_search_node(blkcg, blkg->dev,
1330 				BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
1331 		if (pn)
1332 			continue;
1333 
1334 		blkio_update_group_weight(blkg, blkcg->weight);
1335 	}
1336 	spin_unlock_irq(&blkcg->lock);
1337 	spin_unlock(&blkio_list_lock);
1338 	return 0;
1339 }
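/*
 * Editor's note: e.g. writing 500 to blkio.weight updates blkcg->weight and
 * pushes the new value to every group in the cgroup, except devices that have
 * an explicit blkio.weight_device rule, which keep their override.
 */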
1340 
1341 static u64 blkiocg_file_read_u64(struct cgroup *cgrp, struct cftype *cft) {
1342 	struct blkio_cgroup *blkcg;
1343 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1344 	int name = BLKIOFILE_ATTR(cft->private);
1345 
1346 	blkcg = cgroup_to_blkio_cgroup(cgrp);
1347 
1348 	switch (plid) {
1349 	case BLKIO_POLICY_PROP:
1350 		switch (name) {
1351 		case BLKIO_PROP_weight:
1352 			return (u64)blkcg->weight;
1353 		}
1354 		break;
1355 	default:
1356 		BUG();
1357 	}
1358 	return 0;
1359 }
1360 
1361 static int
1362 blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1363 {
1364 	struct blkio_cgroup *blkcg;
1365 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1366 	int name = BLKIOFILE_ATTR(cft->private);
1367 
1368 	blkcg = cgroup_to_blkio_cgroup(cgrp);
1369 
1370 	switch (plid) {
1371 	case BLKIO_POLICY_PROP:
1372 		switch (name) {
1373 		case BLKIO_PROP_weight:
1374 			return blkio_weight_write(blkcg, val);
1375 		}
1376 		break;
1377 	default:
1378 		BUG();
1379 	}
1380 
1381 	return 0;
1382 }
1383 
1384 struct cftype blkio_files[] = {
1385 	{
1386 		.name = "weight_device",
1387 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1388 				BLKIO_PROP_weight_device),
1389 		.read_seq_string = blkiocg_file_read,
1390 		.write_string = blkiocg_file_write,
1391 		.max_write_len = 256,
1392 	},
1393 	{
1394 		.name = "weight",
1395 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1396 				BLKIO_PROP_weight),
1397 		.read_u64 = blkiocg_file_read_u64,
1398 		.write_u64 = blkiocg_file_write_u64,
1399 	},
1400 	{
1401 		.name = "time",
1402 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1403 				BLKIO_PROP_time),
1404 		.read_map = blkiocg_file_read_map,
1405 	},
1406 	{
1407 		.name = "sectors",
1408 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1409 				BLKIO_PROP_sectors),
1410 		.read_map = blkiocg_file_read_map,
1411 	},
1412 	{
1413 		.name = "io_service_bytes",
1414 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1415 				BLKIO_PROP_io_service_bytes),
1416 		.read_map = blkiocg_file_read_map,
1417 	},
1418 	{
1419 		.name = "io_serviced",
1420 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1421 				BLKIO_PROP_io_serviced),
1422 		.read_map = blkiocg_file_read_map,
1423 	},
1424 	{
1425 		.name = "io_service_time",
1426 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1427 				BLKIO_PROP_io_service_time),
1428 		.read_map = blkiocg_file_read_map,
1429 	},
1430 	{
1431 		.name = "io_wait_time",
1432 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1433 				BLKIO_PROP_io_wait_time),
1434 		.read_map = blkiocg_file_read_map,
1435 	},
1436 	{
1437 		.name = "io_merged",
1438 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1439 				BLKIO_PROP_io_merged),
1440 		.read_map = blkiocg_file_read_map,
1441 	},
1442 	{
1443 		.name = "io_queued",
1444 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1445 				BLKIO_PROP_io_queued),
1446 		.read_map = blkiocg_file_read_map,
1447 	},
1448 	{
1449 		.name = "reset_stats",
1450 		.write_u64 = blkiocg_reset_stats,
1451 	},
1452 #ifdef CONFIG_BLK_DEV_THROTTLING
1453 	{
1454 		.name = "throttle.read_bps_device",
1455 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1456 				BLKIO_THROTL_read_bps_device),
1457 		.read_seq_string = blkiocg_file_read,
1458 		.write_string = blkiocg_file_write,
1459 		.max_write_len = 256,
1460 	},
1461 
1462 	{
1463 		.name = "throttle.write_bps_device",
1464 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1465 				BLKIO_THROTL_write_bps_device),
1466 		.read_seq_string = blkiocg_file_read,
1467 		.write_string = blkiocg_file_write,
1468 		.max_write_len = 256,
1469 	},
1470 
1471 	{
1472 		.name = "throttle.read_iops_device",
1473 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1474 				BLKIO_THROTL_read_iops_device),
1475 		.read_seq_string = blkiocg_file_read,
1476 		.write_string = blkiocg_file_write,
1477 		.max_write_len = 256,
1478 	},
1479 
1480 	{
1481 		.name = "throttle.write_iops_device",
1482 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1483 				BLKIO_THROTL_write_iops_device),
1484 		.read_seq_string = blkiocg_file_read,
1485 		.write_string = blkiocg_file_write,
1486 		.max_write_len = 256,
1487 	},
1488 	{
1489 		.name = "throttle.io_service_bytes",
1490 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1491 				BLKIO_THROTL_io_service_bytes),
1492 		.read_map = blkiocg_file_read_map,
1493 	},
1494 	{
1495 		.name = "throttle.io_serviced",
1496 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1497 				BLKIO_THROTL_io_serviced),
1498 		.read_map = blkiocg_file_read_map,
1499 	},
1500 #endif /* CONFIG_BLK_DEV_THROTTLING */
1501 
1502 #ifdef CONFIG_DEBUG_BLK_CGROUP
1503 	{
1504 		.name = "avg_queue_size",
1505 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1506 				BLKIO_PROP_avg_queue_size),
1507 		.read_map = blkiocg_file_read_map,
1508 	},
1509 	{
1510 		.name = "group_wait_time",
1511 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1512 				BLKIO_PROP_group_wait_time),
1513 		.read_map = blkiocg_file_read_map,
1514 	},
1515 	{
1516 		.name = "idle_time",
1517 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1518 				BLKIO_PROP_idle_time),
1519 		.read_map = blkiocg_file_read_map,
1520 	},
1521 	{
1522 		.name = "empty_time",
1523 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1524 				BLKIO_PROP_empty_time),
1525 		.read_map = blkiocg_file_read_map,
1526 	},
1527 	{
1528 		.name = "dequeue",
1529 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1530 				BLKIO_PROP_dequeue),
1531 		.read_map = blkiocg_file_read_map,
1532 	},
1533 	{
1534 		.name = "unaccounted_time",
1535 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1536 				BLKIO_PROP_unaccounted_time),
1537 		.read_map = blkiocg_file_read_map,
1538 	},
1539 #endif
1540 };
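/*
 * Editor's note: each entry above appears as "blkio.<name>" once the
 * controller is mounted.  An illustrative session (the mount point and group
 * name are only examples):
 *
 *	# mount -t cgroup -o blkio none /cgroup/blkio
 *	# mkdir /cgroup/blkio/grp1
 *	# echo 500 > /cgroup/blkio/grp1/blkio.weight
 *	# echo "8:16 1048576" > /cgroup/blkio/grp1/blkio.throttle.read_bps_device
 *	# cat /cgroup/blkio/grp1/blkio.io_serviced
 */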
1541 
1542 static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1543 {
1544 	return cgroup_add_files(cgroup, subsys, blkio_files,
1545 				ARRAY_SIZE(blkio_files));
1546 }
1547 
1548 static void blkiocg_destroy(struct cgroup *cgroup)
1549 {
1550 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
1551 	unsigned long flags;
1552 	struct blkio_group *blkg;
1553 	void *key;
1554 	struct blkio_policy_type *blkiop;
1555 	struct blkio_policy_node *pn, *pntmp;
1556 
1557 	rcu_read_lock();
1558 	do {
1559 		spin_lock_irqsave(&blkcg->lock, flags);
1560 
1561 		if (hlist_empty(&blkcg->blkg_list)) {
1562 			spin_unlock_irqrestore(&blkcg->lock, flags);
1563 			break;
1564 		}
1565 
1566 		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
1567 					blkcg_node);
1568 		key = rcu_dereference(blkg->key);
1569 		__blkiocg_del_blkio_group(blkg);
1570 
1571 		spin_unlock_irqrestore(&blkcg->lock, flags);
1572 
1573 		/*
1574 		 * This blkio_group is being unlinked as the associated cgroup is
1575 		 * going away. Let all the IO controlling policies know about
1576 		 * this event.
1577 		 */
1578 		spin_lock(&blkio_list_lock);
1579 		list_for_each_entry(blkiop, &blkio_list, list) {
1580 			if (blkiop->plid != blkg->plid)
1581 				continue;
1582 			blkiop->ops.blkio_unlink_group_fn(key, blkg);
1583 		}
1584 		spin_unlock(&blkio_list_lock);
1585 	} while (1);
1586 
1587 	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
1588 		blkio_policy_delete_node(pn);
1589 		kfree(pn);
1590 	}
1591 
1592 	free_css_id(&blkio_subsys, &blkcg->css);
1593 	rcu_read_unlock();
1594 	if (blkcg != &blkio_root_cgroup)
1595 		kfree(blkcg);
1596 }
1597 
1598 static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
1599 {
1600 	struct blkio_cgroup *blkcg;
1601 	struct cgroup *parent = cgroup->parent;
1602 
1603 	if (!parent) {
1604 		blkcg = &blkio_root_cgroup;
1605 		goto done;
1606 	}
1607 
1608 	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
1609 	if (!blkcg)
1610 		return ERR_PTR(-ENOMEM);
1611 
1612 	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
1613 done:
1614 	spin_lock_init(&blkcg->lock);
1615 	INIT_HLIST_HEAD(&blkcg->blkg_list);
1616 
1617 	INIT_LIST_HEAD(&blkcg->policy_list);
1618 	return &blkcg->css;
1619 }
1620 
1621 /*
1622  * We cannot support shared io contexts, as we have no means to support
1623  * two tasks with the same ioc in two different groups without major rework
1624  * of the main cic data structures.  For now we allow a task to change
1625  * its cgroup only if it's the only owner of its ioc.
1626  */
1627 static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1628 {
1629 	struct task_struct *task;
1630 	struct io_context *ioc;
1631 	int ret = 0;
1632 
1633 	/* task_lock() is needed to avoid races with exit_io_context() */
1634 	cgroup_taskset_for_each(task, cgrp, tset) {
1635 		task_lock(task);
1636 		ioc = task->io_context;
1637 		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1638 			ret = -EINVAL;
1639 		task_unlock(task);
1640 		if (ret)
1641 			break;
1642 	}
1643 	return ret;
1644 }
1645 
1646 static void blkiocg_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1647 {
1648 	struct task_struct *task;
1649 	struct io_context *ioc;
1650 
1651 	cgroup_taskset_for_each(task, cgrp, tset) {
1652 		/* we don't lose anything even if ioc allocation fails */
1653 		ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1654 		if (ioc) {
1655 			ioc_cgroup_changed(ioc);
1656 			put_io_context(ioc);
1657 		}
1658 	}
1659 }
1660 
1661 void blkio_policy_register(struct blkio_policy_type *blkiop)
1662 {
1663 	spin_lock(&blkio_list_lock);
1664 	list_add_tail(&blkiop->list, &blkio_list);
1665 	spin_unlock(&blkio_list_lock);
1666 }
1667 EXPORT_SYMBOL_GPL(blkio_policy_register);
1668 
1669 void blkio_policy_unregister(struct blkio_policy_type *blkiop)
1670 {
1671 	spin_lock(&blkio_list_lock);
1672 	list_del_init(&blkiop->list);
1673 	spin_unlock(&blkio_list_lock);
1674 }
1675 EXPORT_SYMBOL_GPL(blkio_policy_unregister);
1676 
1677 static int __init init_cgroup_blkio(void)
1678 {
1679 	return cgroup_load_subsys(&blkio_subsys);
1680 }
1681 
1682 static void __exit exit_cgroup_blkio(void)
1683 {
1684 	cgroup_unload_subsys(&blkio_subsys);
1685 }
1686 
1687 module_init(init_cgroup_blkio);
1688 module_exit(exit_cgroup_blkio);
1689 MODULE_LICENSE("GPL");
1690