xref: /openbmc/linux/block/blk-cgroup.c (revision c4c14c3b)
1 /*
2  * Common Block IO controller cgroup interface
3  *
4  * Based on ideas and code from CFQ, CFS and BFQ:
5  * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6  *
7  * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8  *		      Paolo Valente <paolo.valente@unimore.it>
9  *
10  * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
11  * 	              Nauman Rafique <nauman@google.com>
12  *
13  * For policy-specific per-blkcg data:
14  * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
15  *                    Arianna Avanzini <avanzini.arianna@gmail.com>
16  */
17 #include <linux/ioprio.h>
18 #include <linux/kdev_t.h>
19 #include <linux/module.h>
20 #include <linux/sched/signal.h>
21 #include <linux/err.h>
22 #include <linux/blkdev.h>
23 #include <linux/backing-dev.h>
24 #include <linux/slab.h>
25 #include <linux/genhd.h>
26 #include <linux/delay.h>
27 #include <linux/atomic.h>
28 #include <linux/ctype.h>
29 #include <linux/blk-cgroup.h>
30 #include <linux/tracehook.h>
31 #include "blk.h"
32 
/* NOTE(review): no user of MAX_KEY_LEN is visible in this part of the file. */
#define MAX_KEY_LEN 100

/*
 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
 * policy [un]register operations including cgroup file additions /
 * removals.  Putting cgroup file registration outside blkcg_pol_mutex
 * allows grabbing it from cgroup callbacks.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

/*
 * The root block cgroup.  blkgs belonging to it are special-cased, e.g.
 * blkg_alloc() does not set up a private request_list for them.
 */
struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

/* constant pointer to the css embedded in blkcg_root */
struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;

/*
 * Policy table indexed by policy id; the index is what gets stored in
 * blkg_policy_data->plid.  Slots may be NULL, see blkcg_policy_enabled().
 */
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */

/* when set, blkcg_print_stat() also emits delay and per-policy stats */
static bool blkcg_debug_stats = false;
55 
56 static bool blkcg_policy_enabled(struct request_queue *q,
57 				 const struct blkcg_policy *pol)
58 {
59 	return pol && test_bit(pol->plid, q->blkcg_pols);
60 }
61 
62 /**
63  * blkg_free - free a blkg
64  * @blkg: blkg to free
65  *
66  * Free @blkg which may be partially allocated.
67  */
68 static void blkg_free(struct blkcg_gq *blkg)
69 {
70 	int i;
71 
72 	if (!blkg)
73 		return;
74 
75 	for (i = 0; i < BLKCG_MAX_POLS; i++)
76 		if (blkg->pd[i])
77 			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
78 
79 	if (blkg->blkcg != &blkcg_root)
80 		blk_exit_rl(blkg->q, &blkg->rl);
81 
82 	blkg_rwstat_exit(&blkg->stat_ios);
83 	blkg_rwstat_exit(&blkg->stat_bytes);
84 	kfree(blkg);
85 }
86 
87 static void __blkg_release(struct rcu_head *rcu)
88 {
89 	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
90 
91 	percpu_ref_exit(&blkg->refcnt);
92 
93 	/* release the blkcg and parent blkg refs this blkg has been holding */
94 	css_put(&blkg->blkcg->css);
95 	if (blkg->parent)
96 		blkg_put(blkg->parent);
97 
98 	wb_congested_put(blkg->wb_congested);
99 
100 	blkg_free(blkg);
101 }
102 
103 /*
104  * A group is RCU protected, but having an rcu lock does not mean that one
105  * can access all the fields of blkg and assume these are valid.  For
106  * example, don't try to follow throtl_data and request queue links.
107  *
108  * Having a reference to blkg under an rcu allows accesses to only values
109  * local to groups like group stats and group rate limits.
110  */
111 static void blkg_release(struct percpu_ref *ref)
112 {
113 	struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);
114 
115 	call_rcu(&blkg->rcu_head, __blkg_release);
116 }
117 
118 /**
119  * blkg_alloc - allocate a blkg
120  * @blkcg: block cgroup the new blkg is associated with
121  * @q: request_queue the new blkg is associated with
122  * @gfp_mask: allocation mask to use
123  *
124  * Allocate a new blkg assocating @blkcg and @q.
125  */
126 static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
127 				   gfp_t gfp_mask)
128 {
129 	struct blkcg_gq *blkg;
130 	int i;
131 
132 	/* alloc and init base part */
133 	blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
134 	if (!blkg)
135 		return NULL;
136 
137 	if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
138 	    blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
139 		goto err_free;
140 
141 	blkg->q = q;
142 	INIT_LIST_HEAD(&blkg->q_node);
143 	blkg->blkcg = blkcg;
144 
145 	/* root blkg uses @q->root_rl, init rl only for !root blkgs */
146 	if (blkcg != &blkcg_root) {
147 		if (blk_init_rl(&blkg->rl, q, gfp_mask))
148 			goto err_free;
149 		blkg->rl.blkg = blkg;
150 	}
151 
152 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
153 		struct blkcg_policy *pol = blkcg_policy[i];
154 		struct blkg_policy_data *pd;
155 
156 		if (!blkcg_policy_enabled(q, pol))
157 			continue;
158 
159 		/* alloc per-policy data and attach it to blkg */
160 		pd = pol->pd_alloc_fn(gfp_mask, q->node);
161 		if (!pd)
162 			goto err_free;
163 
164 		blkg->pd[i] = pd;
165 		pd->blkg = blkg;
166 		pd->plid = i;
167 	}
168 
169 	return blkg;
170 
171 err_free:
172 	blkg_free(blkg);
173 	return NULL;
174 }
175 
176 struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
177 				      struct request_queue *q, bool update_hint)
178 {
179 	struct blkcg_gq *blkg;
180 
181 	/*
182 	 * Hint didn't match.  Look up from the radix tree.  Note that the
183 	 * hint can only be updated under queue_lock as otherwise @blkg
184 	 * could have already been removed from blkg_tree.  The caller is
185 	 * responsible for grabbing queue_lock if @update_hint.
186 	 */
187 	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
188 	if (blkg && blkg->q == q) {
189 		if (update_hint) {
190 			lockdep_assert_held(q->queue_lock);
191 			rcu_assign_pointer(blkcg->blkg_hint, blkg);
192 		}
193 		return blkg;
194 	}
195 
196 	return NULL;
197 }
198 EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
199 
200 /*
201  * If @new_blkg is %NULL, this function tries to allocate a new one as
202  * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
203  */
204 static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
205 				    struct request_queue *q,
206 				    struct blkcg_gq *new_blkg)
207 {
208 	struct blkcg_gq *blkg;
209 	struct bdi_writeback_congested *wb_congested;
210 	int i, ret;
211 
212 	WARN_ON_ONCE(!rcu_read_lock_held());
213 	lockdep_assert_held(q->queue_lock);
214 
215 	/* blkg holds a reference to blkcg */
216 	if (!css_tryget_online(&blkcg->css)) {
217 		ret = -ENODEV;
218 		goto err_free_blkg;
219 	}
220 
221 	wb_congested = wb_congested_get_create(q->backing_dev_info,
222 					       blkcg->css.id,
223 					       GFP_NOWAIT | __GFP_NOWARN);
224 	if (!wb_congested) {
225 		ret = -ENOMEM;
226 		goto err_put_css;
227 	}
228 
229 	/* allocate */
230 	if (!new_blkg) {
231 		new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
232 		if (unlikely(!new_blkg)) {
233 			ret = -ENOMEM;
234 			goto err_put_congested;
235 		}
236 	}
237 	blkg = new_blkg;
238 	blkg->wb_congested = wb_congested;
239 
240 	/* link parent */
241 	if (blkcg_parent(blkcg)) {
242 		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
243 		if (WARN_ON_ONCE(!blkg->parent)) {
244 			ret = -ENODEV;
245 			goto err_put_congested;
246 		}
247 		blkg_get(blkg->parent);
248 	}
249 
250 	ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0,
251 			      GFP_NOWAIT | __GFP_NOWARN);
252 	if (ret)
253 		goto err_cancel_ref;
254 
255 	/* invoke per-policy init */
256 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
257 		struct blkcg_policy *pol = blkcg_policy[i];
258 
259 		if (blkg->pd[i] && pol->pd_init_fn)
260 			pol->pd_init_fn(blkg->pd[i]);
261 	}
262 
263 	/* insert */
264 	spin_lock(&blkcg->lock);
265 	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
266 	if (likely(!ret)) {
267 		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
268 		list_add(&blkg->q_node, &q->blkg_list);
269 
270 		for (i = 0; i < BLKCG_MAX_POLS; i++) {
271 			struct blkcg_policy *pol = blkcg_policy[i];
272 
273 			if (blkg->pd[i] && pol->pd_online_fn)
274 				pol->pd_online_fn(blkg->pd[i]);
275 		}
276 	}
277 	blkg->online = true;
278 	spin_unlock(&blkcg->lock);
279 
280 	if (!ret)
281 		return blkg;
282 
283 	/* @blkg failed fully initialized, use the usual release path */
284 	blkg_put(blkg);
285 	return ERR_PTR(ret);
286 
287 err_cancel_ref:
288 	percpu_ref_exit(&blkg->refcnt);
289 err_put_congested:
290 	wb_congested_put(wb_congested);
291 err_put_css:
292 	css_put(&blkcg->css);
293 err_free_blkg:
294 	blkg_free(new_blkg);
295 	return ERR_PTR(ret);
296 }
297 
298 /**
299  * __blkg_lookup_create - lookup blkg, try to create one if not there
300  * @blkcg: blkcg of interest
301  * @q: request_queue of interest
302  *
303  * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
304  * create one.  blkg creation is performed recursively from blkcg_root such
305  * that all non-root blkg's have access to the parent blkg.  This function
306  * should be called under RCU read lock and @q->queue_lock.
307  *
308  * Returns the blkg or the closest blkg if blkg_create fails as it walks
309  * down from root.
310  */
311 struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
312 				      struct request_queue *q)
313 {
314 	struct blkcg_gq *blkg;
315 
316 	WARN_ON_ONCE(!rcu_read_lock_held());
317 	lockdep_assert_held(q->queue_lock);
318 
319 	/*
320 	 * This could be the first entry point of blkcg implementation and
321 	 * we shouldn't allow anything to go through for a bypassing queue.
322 	 */
323 	if (unlikely(blk_queue_bypass(q)))
324 		return q->root_blkg;
325 
326 	blkg = __blkg_lookup(blkcg, q, true);
327 	if (blkg)
328 		return blkg;
329 
330 	/*
331 	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
332 	 * non-root blkgs have access to their parents.  Returns the closest
333 	 * blkg to the intended blkg should blkg_create() fail.
334 	 */
335 	while (true) {
336 		struct blkcg *pos = blkcg;
337 		struct blkcg *parent = blkcg_parent(blkcg);
338 		struct blkcg_gq *ret_blkg = q->root_blkg;
339 
340 		while (parent) {
341 			blkg = __blkg_lookup(parent, q, false);
342 			if (blkg) {
343 				/* remember closest blkg */
344 				ret_blkg = blkg;
345 				break;
346 			}
347 			pos = parent;
348 			parent = blkcg_parent(parent);
349 		}
350 
351 		blkg = blkg_create(pos, q, NULL);
352 		if (IS_ERR(blkg))
353 			return ret_blkg;
354 		if (pos == blkcg)
355 			return blkg;
356 	}
357 }
358 
359 /**
360  * blkg_lookup_create - find or create a blkg
361  * @blkcg: target block cgroup
362  * @q: target request_queue
363  *
364  * This looks up or creates the blkg representing the unique pair
365  * of the blkcg and the request_queue.
366  */
367 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
368 				    struct request_queue *q)
369 {
370 	struct blkcg_gq *blkg = blkg_lookup(blkcg, q);
371 	unsigned long flags;
372 
373 	if (unlikely(!blkg)) {
374 		spin_lock_irqsave(q->queue_lock, flags);
375 
376 		blkg = __blkg_lookup_create(blkcg, q);
377 
378 		spin_unlock_irqrestore(q->queue_lock, flags);
379 	}
380 
381 	return blkg;
382 }
383 
/*
 * Take @blkg offline and unlink it from its blkcg and its request_queue.
 *
 * Must be called with both @blkg->q->queue_lock and @blkg->blkcg->lock
 * held.  Per-policy offline callbacks are run, @blkg is removed from the
 * blkcg's radix tree and list and from the queue's list, the blkcg lookup
 * hint is cleared if it points at @blkg, and finally blkg->refcnt is
 * killed.
 */
static void blkg_destroy(struct blkcg_gq *blkg)
{
	struct blkcg *blkcg = blkg->blkcg;
	struct blkcg_gq *parent = blkg->parent;
	int i;

	lockdep_assert_held(blkg->q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));

	/* let every policy with data attached to @blkg know it goes offline */
	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_offline_fn)
			pol->pd_offline_fn(blkg->pd[i]);
	}

	/*
	 * Hand @blkg's byte and io counters to the parent through
	 * blkg_rwstat_add_aux() - presumably so they stay accounted in the
	 * hierarchy after @blkg is gone; confirm against blk-cgroup.h.
	 */
	if (parent) {
		blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
		blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
	}

	blkg->online = false;

	/* unlink from the blkcg's radix tree and both lists */
	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	/*
	 * Both setting lookup hint to and clearing it from @blkg are done
	 * under queue_lock.  If it's not pointing to @blkg now, it never
	 * will.  Hint assignment itself can race safely.
	 */
	if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
		rcu_assign_pointer(blkcg->blkg_hint, NULL);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	percpu_ref_kill(&blkg->refcnt);
}
429 
430 /**
431  * blkg_destroy_all - destroy all blkgs associated with a request_queue
432  * @q: request_queue of interest
433  *
434  * Destroy all blkgs associated with @q.
435  */
436 static void blkg_destroy_all(struct request_queue *q)
437 {
438 	struct blkcg_gq *blkg, *n;
439 
440 	lockdep_assert_held(q->queue_lock);
441 
442 	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
443 		struct blkcg *blkcg = blkg->blkcg;
444 
445 		spin_lock(&blkcg->lock);
446 		blkg_destroy(blkg);
447 		spin_unlock(&blkcg->lock);
448 	}
449 
450 	q->root_blkg = NULL;
451 	q->root_rl.blkg = NULL;
452 }
453 
454 /*
455  * The next function used by blk_queue_for_each_rl().  It's a bit tricky
456  * because the root blkg uses @q->root_rl instead of its own rl.
457  */
458 struct request_list *__blk_queue_next_rl(struct request_list *rl,
459 					 struct request_queue *q)
460 {
461 	struct list_head *ent;
462 	struct blkcg_gq *blkg;
463 
464 	/*
465 	 * Determine the current blkg list_head.  The first entry is
466 	 * root_rl which is off @q->blkg_list and mapped to the head.
467 	 */
468 	if (rl == &q->root_rl) {
469 		ent = &q->blkg_list;
470 		/* There are no more block groups, hence no request lists */
471 		if (list_empty(ent))
472 			return NULL;
473 	} else {
474 		blkg = container_of(rl, struct blkcg_gq, rl);
475 		ent = &blkg->q_node;
476 	}
477 
478 	/* walk to the next list_head, skip root blkcg */
479 	ent = ent->next;
480 	if (ent == &q->root_blkg->q_node)
481 		ent = ent->next;
482 	if (ent == &q->blkg_list)
483 		return NULL;
484 
485 	blkg = container_of(ent, struct blkcg_gq, q_node);
486 	return &blkg->rl;
487 }
488 
489 static int blkcg_reset_stats(struct cgroup_subsys_state *css,
490 			     struct cftype *cftype, u64 val)
491 {
492 	struct blkcg *blkcg = css_to_blkcg(css);
493 	struct blkcg_gq *blkg;
494 	int i;
495 
496 	mutex_lock(&blkcg_pol_mutex);
497 	spin_lock_irq(&blkcg->lock);
498 
499 	/*
500 	 * Note that stat reset is racy - it doesn't synchronize against
501 	 * stat updates.  This is a debug feature which shouldn't exist
502 	 * anyway.  If you get hit by a race, retry.
503 	 */
504 	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
505 		blkg_rwstat_reset(&blkg->stat_bytes);
506 		blkg_rwstat_reset(&blkg->stat_ios);
507 
508 		for (i = 0; i < BLKCG_MAX_POLS; i++) {
509 			struct blkcg_policy *pol = blkcg_policy[i];
510 
511 			if (blkg->pd[i] && pol->pd_reset_stats_fn)
512 				pol->pd_reset_stats_fn(blkg->pd[i]);
513 		}
514 	}
515 
516 	spin_unlock_irq(&blkcg->lock);
517 	mutex_unlock(&blkcg_pol_mutex);
518 	return 0;
519 }
520 
521 const char *blkg_dev_name(struct blkcg_gq *blkg)
522 {
523 	/* some drivers (floppy) instantiate a queue w/o disk registered */
524 	if (blkg->q->backing_dev_info->dev)
525 		return dev_name(blkg->q->backing_dev_info->dev);
526 	return NULL;
527 }
528 EXPORT_SYMBOL_GPL(blkg_dev_name);
529 
530 /**
531  * blkcg_print_blkgs - helper for printing per-blkg data
532  * @sf: seq_file to print to
533  * @blkcg: blkcg of interest
534  * @prfill: fill function to print out a blkg
535  * @pol: policy in question
536  * @data: data to be passed to @prfill
537  * @show_total: to print out sum of prfill return values or not
538  *
539  * This function invokes @prfill on each blkg of @blkcg if pd for the
540  * policy specified by @pol exists.  @prfill is invoked with @sf, the
541  * policy data and @data and the matching queue lock held.  If @show_total
542  * is %true, the sum of the return values from @prfill is printed with
543  * "Total" label at the end.
544  *
545  * This is to be used to construct print functions for
546  * cftype->read_seq_string method.
547  */
548 void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
549 		       u64 (*prfill)(struct seq_file *,
550 				     struct blkg_policy_data *, int),
551 		       const struct blkcg_policy *pol, int data,
552 		       bool show_total)
553 {
554 	struct blkcg_gq *blkg;
555 	u64 total = 0;
556 
557 	rcu_read_lock();
558 	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
559 		spin_lock_irq(blkg->q->queue_lock);
560 		if (blkcg_policy_enabled(blkg->q, pol))
561 			total += prfill(sf, blkg->pd[pol->plid], data);
562 		spin_unlock_irq(blkg->q->queue_lock);
563 	}
564 	rcu_read_unlock();
565 
566 	if (show_total)
567 		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
568 }
569 EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
570 
571 /**
572  * __blkg_prfill_u64 - prfill helper for a single u64 value
573  * @sf: seq_file to print to
574  * @pd: policy private data of interest
575  * @v: value to print
576  *
577  * Print @v to @sf for the device assocaited with @pd.
578  */
579 u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
580 {
581 	const char *dname = blkg_dev_name(pd->blkg);
582 
583 	if (!dname)
584 		return 0;
585 
586 	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
587 	return v;
588 }
589 EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
590 
591 /**
592  * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
593  * @sf: seq_file to print to
594  * @pd: policy private data of interest
595  * @rwstat: rwstat to print
596  *
597  * Print @rwstat to @sf for the device assocaited with @pd.
598  */
599 u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
600 			 const struct blkg_rwstat *rwstat)
601 {
602 	static const char *rwstr[] = {
603 		[BLKG_RWSTAT_READ]	= "Read",
604 		[BLKG_RWSTAT_WRITE]	= "Write",
605 		[BLKG_RWSTAT_SYNC]	= "Sync",
606 		[BLKG_RWSTAT_ASYNC]	= "Async",
607 		[BLKG_RWSTAT_DISCARD]	= "Discard",
608 	};
609 	const char *dname = blkg_dev_name(pd->blkg);
610 	u64 v;
611 	int i;
612 
613 	if (!dname)
614 		return 0;
615 
616 	for (i = 0; i < BLKG_RWSTAT_NR; i++)
617 		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
618 			   (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
619 
620 	v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
621 		atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) +
622 		atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]);
623 	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
624 	return v;
625 }
626 EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
627 
628 /**
629  * blkg_prfill_stat - prfill callback for blkg_stat
630  * @sf: seq_file to print to
631  * @pd: policy private data of interest
632  * @off: offset to the blkg_stat in @pd
633  *
634  * prfill callback for printing a blkg_stat.
635  */
636 u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
637 {
638 	return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
639 }
640 EXPORT_SYMBOL_GPL(blkg_prfill_stat);
641 
642 /**
643  * blkg_prfill_rwstat - prfill callback for blkg_rwstat
644  * @sf: seq_file to print to
645  * @pd: policy private data of interest
646  * @off: offset to the blkg_rwstat in @pd
647  *
648  * prfill callback for printing a blkg_rwstat.
649  */
650 u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
651 		       int off)
652 {
653 	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);
654 
655 	return __blkg_prfill_rwstat(sf, pd, &rwstat);
656 }
657 EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
658 
659 static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
660 				    struct blkg_policy_data *pd, int off)
661 {
662 	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);
663 
664 	return __blkg_prfill_rwstat(sf, pd, &rwstat);
665 }
666 
667 /**
668  * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
669  * @sf: seq_file to print to
670  * @v: unused
671  *
672  * To be used as cftype->seq_show to print blkg->stat_bytes.
673  * cftype->private must be set to the blkcg_policy.
674  */
675 int blkg_print_stat_bytes(struct seq_file *sf, void *v)
676 {
677 	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
678 			  blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
679 			  offsetof(struct blkcg_gq, stat_bytes), true);
680 	return 0;
681 }
682 EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);
683 
684 /**
685  * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios
686  * @sf: seq_file to print to
687  * @v: unused
688  *
689  * To be used as cftype->seq_show to print blkg->stat_ios.  cftype->private
690  * must be set to the blkcg_policy.
691  */
692 int blkg_print_stat_ios(struct seq_file *sf, void *v)
693 {
694 	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
695 			  blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
696 			  offsetof(struct blkcg_gq, stat_ios), true);
697 	return 0;
698 }
699 EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
700 
701 static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
702 					      struct blkg_policy_data *pd,
703 					      int off)
704 {
705 	struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
706 							      NULL, off);
707 	return __blkg_prfill_rwstat(sf, pd, &rwstat);
708 }
709 
710 /**
711  * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
712  * @sf: seq_file to print to
713  * @v: unused
714  */
715 int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
716 {
717 	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
718 			  blkg_prfill_rwstat_field_recursive,
719 			  (void *)seq_cft(sf)->private,
720 			  offsetof(struct blkcg_gq, stat_bytes), true);
721 	return 0;
722 }
723 EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);
724 
725 /**
726  * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
727  * @sf: seq_file to print to
728  * @v: unused
729  */
730 int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
731 {
732 	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
733 			  blkg_prfill_rwstat_field_recursive,
734 			  (void *)seq_cft(sf)->private,
735 			  offsetof(struct blkcg_gq, stat_ios), true);
736 	return 0;
737 }
738 EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
739 
740 /**
741  * blkg_stat_recursive_sum - collect hierarchical blkg_stat
742  * @blkg: blkg of interest
743  * @pol: blkcg_policy which contains the blkg_stat
744  * @off: offset to the blkg_stat in blkg_policy_data or @blkg
745  *
746  * Collect the blkg_stat specified by @blkg, @pol and @off and all its
747  * online descendants and their aux counts.  The caller must be holding the
748  * queue lock for online tests.
749  *
750  * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
751  * at @off bytes into @blkg's blkg_policy_data of the policy.
752  */
753 u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
754 			    struct blkcg_policy *pol, int off)
755 {
756 	struct blkcg_gq *pos_blkg;
757 	struct cgroup_subsys_state *pos_css;
758 	u64 sum = 0;
759 
760 	lockdep_assert_held(blkg->q->queue_lock);
761 
762 	rcu_read_lock();
763 	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
764 		struct blkg_stat *stat;
765 
766 		if (!pos_blkg->online)
767 			continue;
768 
769 		if (pol)
770 			stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
771 		else
772 			stat = (void *)blkg + off;
773 
774 		sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
775 	}
776 	rcu_read_unlock();
777 
778 	return sum;
779 }
780 EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
781 
782 /**
783  * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
784  * @blkg: blkg of interest
785  * @pol: blkcg_policy which contains the blkg_rwstat
786  * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
787  *
788  * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
789  * online descendants and their aux counts.  The caller must be holding the
790  * queue lock for online tests.
791  *
792  * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
793  * is at @off bytes into @blkg's blkg_policy_data of the policy.
794  */
795 struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
796 					     struct blkcg_policy *pol, int off)
797 {
798 	struct blkcg_gq *pos_blkg;
799 	struct cgroup_subsys_state *pos_css;
800 	struct blkg_rwstat sum = { };
801 	int i;
802 
803 	lockdep_assert_held(blkg->q->queue_lock);
804 
805 	rcu_read_lock();
806 	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
807 		struct blkg_rwstat *rwstat;
808 
809 		if (!pos_blkg->online)
810 			continue;
811 
812 		if (pol)
813 			rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
814 		else
815 			rwstat = (void *)pos_blkg + off;
816 
817 		for (i = 0; i < BLKG_RWSTAT_NR; i++)
818 			atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
819 				percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
820 				&sum.aux_cnt[i]);
821 	}
822 	rcu_read_unlock();
823 
824 	return sum;
825 }
826 EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
827 
828 /* Performs queue bypass and policy enabled checks then looks up blkg. */
829 static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
830 					  const struct blkcg_policy *pol,
831 					  struct request_queue *q)
832 {
833 	WARN_ON_ONCE(!rcu_read_lock_held());
834 	lockdep_assert_held(q->queue_lock);
835 
836 	if (!blkcg_policy_enabled(q, pol))
837 		return ERR_PTR(-EOPNOTSUPP);
838 
839 	/*
840 	 * This could be the first entry point of blkcg implementation and
841 	 * we shouldn't allow anything to go through for a bypassing queue.
842 	 */
843 	if (unlikely(blk_queue_bypass(q)))
844 		return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
845 
846 	return __blkg_lookup(blkcg, q, true /* update_hint */);
847 }
848 
849 /**
850  * blkg_conf_prep - parse and prepare for per-blkg config update
851  * @blkcg: target block cgroup
852  * @pol: target policy
853  * @input: input string
854  * @ctx: blkg_conf_ctx to be filled
855  *
856  * Parse per-blkg config update from @input and initialize @ctx with the
857  * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
858  * part of @input following MAJ:MIN.  This function returns with RCU read
859  * lock and queue lock held and must be paired with blkg_conf_finish().
860  */
861 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
862 		   char *input, struct blkg_conf_ctx *ctx)
863 	__acquires(rcu) __acquires(disk->queue->queue_lock)
864 {
865 	struct gendisk *disk;
866 	struct request_queue *q;
867 	struct blkcg_gq *blkg;
868 	unsigned int major, minor;
869 	int key_len, part, ret;
870 	char *body;
871 
872 	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
873 		return -EINVAL;
874 
875 	body = input + key_len;
876 	if (!isspace(*body))
877 		return -EINVAL;
878 	body = skip_spaces(body);
879 
880 	disk = get_gendisk(MKDEV(major, minor), &part);
881 	if (!disk)
882 		return -ENODEV;
883 	if (part) {
884 		ret = -ENODEV;
885 		goto fail;
886 	}
887 
888 	q = disk->queue;
889 
890 	rcu_read_lock();
891 	spin_lock_irq(q->queue_lock);
892 
893 	blkg = blkg_lookup_check(blkcg, pol, q);
894 	if (IS_ERR(blkg)) {
895 		ret = PTR_ERR(blkg);
896 		goto fail_unlock;
897 	}
898 
899 	if (blkg)
900 		goto success;
901 
902 	/*
903 	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
904 	 * non-root blkgs have access to their parents.
905 	 */
906 	while (true) {
907 		struct blkcg *pos = blkcg;
908 		struct blkcg *parent;
909 		struct blkcg_gq *new_blkg;
910 
911 		parent = blkcg_parent(blkcg);
912 		while (parent && !__blkg_lookup(parent, q, false)) {
913 			pos = parent;
914 			parent = blkcg_parent(parent);
915 		}
916 
917 		/* Drop locks to do new blkg allocation with GFP_KERNEL. */
918 		spin_unlock_irq(q->queue_lock);
919 		rcu_read_unlock();
920 
921 		new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
922 		if (unlikely(!new_blkg)) {
923 			ret = -ENOMEM;
924 			goto fail;
925 		}
926 
927 		rcu_read_lock();
928 		spin_lock_irq(q->queue_lock);
929 
930 		blkg = blkg_lookup_check(pos, pol, q);
931 		if (IS_ERR(blkg)) {
932 			ret = PTR_ERR(blkg);
933 			goto fail_unlock;
934 		}
935 
936 		if (blkg) {
937 			blkg_free(new_blkg);
938 		} else {
939 			blkg = blkg_create(pos, q, new_blkg);
940 			if (unlikely(IS_ERR(blkg))) {
941 				ret = PTR_ERR(blkg);
942 				goto fail_unlock;
943 			}
944 		}
945 
946 		if (pos == blkcg)
947 			goto success;
948 	}
949 success:
950 	ctx->disk = disk;
951 	ctx->blkg = blkg;
952 	ctx->body = body;
953 	return 0;
954 
955 fail_unlock:
956 	spin_unlock_irq(q->queue_lock);
957 	rcu_read_unlock();
958 fail:
959 	put_disk_and_module(disk);
960 	/*
961 	 * If queue was bypassing, we should retry.  Do so after a
962 	 * short msleep().  It isn't strictly necessary but queue
963 	 * can be bypassing for some time and it's always nice to
964 	 * avoid busy looping.
965 	 */
966 	if (ret == -EBUSY) {
967 		msleep(10);
968 		ret = restart_syscall();
969 	}
970 	return ret;
971 }
972 EXPORT_SYMBOL_GPL(blkg_conf_prep);
973 
974 /**
975  * blkg_conf_finish - finish up per-blkg config update
976  * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep()
977  *
978  * Finish up after per-blkg config update.  This function must be paired
979  * with blkg_conf_prep().
980  */
981 void blkg_conf_finish(struct blkg_conf_ctx *ctx)
982 	__releases(ctx->disk->queue->queue_lock) __releases(rcu)
983 {
984 	spin_unlock_irq(ctx->disk->queue->queue_lock);
985 	rcu_read_unlock();
986 	put_disk_and_module(ctx->disk);
987 }
988 EXPORT_SYMBOL_GPL(blkg_conf_finish);
989 
/*
 * seq_show handler of the "stat" file (see blkcg_files[]).
 *
 * Emits at most one line per blkg of the blkcg behind @sf: the device name
 * followed by the recursive byte and io counts and, if blkcg_debug_stats
 * is set, delay information and whatever the policies' pd_stat_fn
 * callbacks add.  blkgs without a device name or without anything to
 * report produce no output.  @v is unused.  Always returns 0.
 */
static int blkcg_print_stat(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct blkcg_gq *blkg;

	rcu_read_lock();

	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		const char *dname;
		char *buf;
		struct blkg_rwstat rwstat;
		u64 rbytes, wbytes, rios, wios, dbytes, dios;
		/*
		 * The line is formatted straight into the seq_file buffer:
		 * @buf/@size describe the space handed out by seq_get_buf()
		 * and @off is how much of it this iteration has used.
		 */
		size_t size = seq_get_buf(sf, &buf), off = 0;
		int i;
		bool has_stats = false;

		dname = blkg_dev_name(blkg);
		if (!dname)
			continue;

		/*
		 * Hooray string manipulation, count is the size written NOT
		 * INCLUDING THE \0, so size is now count+1 less than what we
		 * had before, but we want to start writing the next bit from
		 * the \0 so we only add count to buf.
		 */
		off += scnprintf(buf+off, size-off, "%s ", dname);

		/* blkg_rwstat_recursive_sum() asserts that the queue lock is held */
		spin_lock_irq(blkg->q->queue_lock);

		rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
					offsetof(struct blkcg_gq, stat_bytes));
		rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
		wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
		dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);

		rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
					offsetof(struct blkcg_gq, stat_ios));
		rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
		wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
		dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);

		spin_unlock_irq(blkg->q->queue_lock);

		/*
		 * Only read/write activity triggers the basic stats; the
		 * discard counts are printed along but don't do so on their
		 * own.
		 */
		if (rbytes || wbytes || rios || wios) {
			has_stats = true;
			off += scnprintf(buf+off, size-off,
					 "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
					 rbytes, wbytes, rios, wios,
					 dbytes, dios);
		}

		/* everything up to the "next" label is debug-only output */
		if (!blkcg_debug_stats)
			goto next;

		if (atomic_read(&blkg->use_delay)) {
			has_stats = true;
			off += scnprintf(buf+off, size-off,
					 " use_delay=%d delay_nsec=%llu",
					 atomic_read(&blkg->use_delay),
					(unsigned long long)atomic64_read(&blkg->delay_nsec));
		}

		/* let each policy with data on this blkg append its own stats */
		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];
			size_t written;

			if (!blkg->pd[i] || !pol->pd_stat_fn)
				continue;

			written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
			if (written)
				has_stats = true;
			off += written;
		}
next:
		/*
		 * Commit the line only if something was reported.  Without
		 * seq_commit() the device name written above is not
		 * committed to the seq_file.
		 */
		if (has_stats) {
			off += scnprintf(buf+off, size-off, "\n");
			seq_commit(sf, off);
		}
	}

	rcu_read_unlock();
	return 0;
}
1075 
/*
 * cftypes installed as io_cgrp_subsys.dfl_cftypes.  Contains a single
 * "stat" file, flagged CFTYPE_NOT_ON_ROOT, whose contents are generated by
 * blkcg_print_stat().
 */
static struct cftype blkcg_files[] = {
	{
		.name = "stat",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = blkcg_print_stat,
	},
	{ }	/* terminate */
};
1084 
/*
 * cftypes installed as io_cgrp_subsys.legacy_cftypes.  Contains a single
 * "reset_stats" file whose u64 writes are handled by blkcg_reset_stats().
 */
static struct cftype blkcg_legacy_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkcg_reset_stats,
	},
	{ }	/* terminate */
};
1092 
1093 /*
1094  * blkcg destruction is a three-stage process.
1095  *
1096  * 1. Destruction starts.  The blkcg_css_offline() callback is invoked
1097  *    which offlines writeback.  Here we tie the next stage of blkg destruction
1098  *    to the completion of writeback associated with the blkcg.  This lets us
1099  *    avoid punting potentially large amounts of outstanding writeback to root
1100  *    while maintaining any ongoing policies.  The next stage is triggered when
1101  *    the nr_cgwbs count goes to zero.
1102  *
1103  * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called
1104  *    and handles the destruction of blkgs.  Here the css reference held by
1105  *    the blkg is put back eventually allowing blkcg_css_free() to be called.
1106  *    This work may occur in cgwb_release_workfn() on the cgwb_release
1107  *    workqueue.  Any submitted ios that fail to get the blkg ref will be
1108  *    punted to the root_blkg.
1109  *
1110  * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called.
1111  *    This finally frees the blkcg.
1112  */
1113 
/**
 * blkcg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * Invoked when @css is about to go away.  This is step 1 of blkcg
 * destruction: the cgwbs are offlined and the base cgwb reference is
 * dropped, so that step 2 (see above) only starts once the writeback
 * associated with the blkcg has finished.
 */
static void blkcg_css_offline(struct cgroup_subsys_state *css)
{
	struct blkcg *dying = css_to_blkcg(css);

	/* from here on nobody can attach or migrate to this blkcg */
	wb_blkcg_offline(dying);

	/* drop the base cgwb reference so that step 2 can be triggered */
	blkcg_cgwb_put(dying);
}
1132 
1133 /**
1134  * blkcg_destroy_blkgs - responsible for shooting down blkgs
1135  * @blkcg: blkcg of interest
1136  *
1137  * blkgs should be removed while holding both q and blkcg locks.  As blkcg lock
1138  * is nested inside q lock, this function performs reverse double lock dancing.
1139  * Destroying the blkgs releases the reference held on the blkcg's css allowing
1140  * blkcg_css_free to eventually be called.
1141  *
1142  * This is the blkcg counterpart of ioc_release_fn().
1143  */
1144 void blkcg_destroy_blkgs(struct blkcg *blkcg)
1145 {
1146 	spin_lock_irq(&blkcg->lock);
1147 
1148 	while (!hlist_empty(&blkcg->blkg_list)) {
1149 		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
1150 						struct blkcg_gq, blkcg_node);
1151 		struct request_queue *q = blkg->q;
1152 
1153 		if (spin_trylock(q->queue_lock)) {
1154 			blkg_destroy(blkg);
1155 			spin_unlock(q->queue_lock);
1156 		} else {
1157 			spin_unlock_irq(&blkcg->lock);
1158 			cpu_relax();
1159 			spin_lock_irq(&blkcg->lock);
1160 		}
1161 	}
1162 
1163 	spin_unlock_irq(&blkcg->lock);
1164 }
1165 
1166 static void blkcg_css_free(struct cgroup_subsys_state *css)
1167 {
1168 	struct blkcg *blkcg = css_to_blkcg(css);
1169 	int i;
1170 
1171 	mutex_lock(&blkcg_pol_mutex);
1172 
1173 	list_del(&blkcg->all_blkcgs_node);
1174 
1175 	for (i = 0; i < BLKCG_MAX_POLS; i++)
1176 		if (blkcg->cpd[i])
1177 			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
1178 
1179 	mutex_unlock(&blkcg_pol_mutex);
1180 
1181 	kfree(blkcg);
1182 }
1183 
1184 static struct cgroup_subsys_state *
1185 blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
1186 {
1187 	struct blkcg *blkcg;
1188 	struct cgroup_subsys_state *ret;
1189 	int i;
1190 
1191 	mutex_lock(&blkcg_pol_mutex);
1192 
1193 	if (!parent_css) {
1194 		blkcg = &blkcg_root;
1195 	} else {
1196 		blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
1197 		if (!blkcg) {
1198 			ret = ERR_PTR(-ENOMEM);
1199 			goto unlock;
1200 		}
1201 	}
1202 
1203 	for (i = 0; i < BLKCG_MAX_POLS ; i++) {
1204 		struct blkcg_policy *pol = blkcg_policy[i];
1205 		struct blkcg_policy_data *cpd;
1206 
1207 		/*
1208 		 * If the policy hasn't been attached yet, wait for it
1209 		 * to be attached before doing anything else. Otherwise,
1210 		 * check if the policy requires any specific per-cgroup
1211 		 * data: if it does, allocate and initialize it.
1212 		 */
1213 		if (!pol || !pol->cpd_alloc_fn)
1214 			continue;
1215 
1216 		cpd = pol->cpd_alloc_fn(GFP_KERNEL);
1217 		if (!cpd) {
1218 			ret = ERR_PTR(-ENOMEM);
1219 			goto free_pd_blkcg;
1220 		}
1221 		blkcg->cpd[i] = cpd;
1222 		cpd->blkcg = blkcg;
1223 		cpd->plid = i;
1224 		if (pol->cpd_init_fn)
1225 			pol->cpd_init_fn(cpd);
1226 	}
1227 
1228 	spin_lock_init(&blkcg->lock);
1229 	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
1230 	INIT_HLIST_HEAD(&blkcg->blkg_list);
1231 #ifdef CONFIG_CGROUP_WRITEBACK
1232 	INIT_LIST_HEAD(&blkcg->cgwb_list);
1233 	refcount_set(&blkcg->cgwb_refcnt, 1);
1234 #endif
1235 	list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
1236 
1237 	mutex_unlock(&blkcg_pol_mutex);
1238 	return &blkcg->css;
1239 
1240 free_pd_blkcg:
1241 	for (i--; i >= 0; i--)
1242 		if (blkcg->cpd[i])
1243 			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
1244 
1245 	if (blkcg != &blkcg_root)
1246 		kfree(blkcg);
1247 unlock:
1248 	mutex_unlock(&blkcg_pol_mutex);
1249 	return ret;
1250 }
1251 
1252 /**
1253  * blkcg_init_queue - initialize blkcg part of request queue
1254  * @q: request_queue to initialize
1255  *
1256  * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
1257  * part of new request_queue @q.
1258  *
1259  * RETURNS:
1260  * 0 on success, -errno on failure.
1261  */
1262 int blkcg_init_queue(struct request_queue *q)
1263 {
1264 	struct blkcg_gq *new_blkg, *blkg;
1265 	bool preloaded;
1266 	int ret;
1267 
1268 	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
1269 	if (!new_blkg)
1270 		return -ENOMEM;
1271 
1272 	preloaded = !radix_tree_preload(GFP_KERNEL);
1273 
1274 	/* Make sure the root blkg exists. */
1275 	rcu_read_lock();
1276 	spin_lock_irq(q->queue_lock);
1277 	blkg = blkg_create(&blkcg_root, q, new_blkg);
1278 	if (IS_ERR(blkg))
1279 		goto err_unlock;
1280 	q->root_blkg = blkg;
1281 	q->root_rl.blkg = blkg;
1282 	spin_unlock_irq(q->queue_lock);
1283 	rcu_read_unlock();
1284 
1285 	if (preloaded)
1286 		radix_tree_preload_end();
1287 
1288 	ret = blk_iolatency_init(q);
1289 	if (ret) {
1290 		spin_lock_irq(q->queue_lock);
1291 		blkg_destroy_all(q);
1292 		spin_unlock_irq(q->queue_lock);
1293 		return ret;
1294 	}
1295 
1296 	ret = blk_throtl_init(q);
1297 	if (ret) {
1298 		spin_lock_irq(q->queue_lock);
1299 		blkg_destroy_all(q);
1300 		spin_unlock_irq(q->queue_lock);
1301 	}
1302 	return ret;
1303 
1304 err_unlock:
1305 	spin_unlock_irq(q->queue_lock);
1306 	rcu_read_unlock();
1307 	if (preloaded)
1308 		radix_tree_preload_end();
1309 	return PTR_ERR(blkg);
1310 }
1311 
1312 /**
1313  * blkcg_drain_queue - drain blkcg part of request_queue
1314  * @q: request_queue to drain
1315  *
1316  * Called from blk_drain_queue().  Responsible for draining blkcg part.
1317  */
1318 void blkcg_drain_queue(struct request_queue *q)
1319 {
1320 	lockdep_assert_held(q->queue_lock);
1321 
1322 	/*
1323 	 * @q could be exiting and already have destroyed all blkgs as
1324 	 * indicated by NULL root_blkg.  If so, don't confuse policies.
1325 	 */
1326 	if (!q->root_blkg)
1327 		return;
1328 
1329 	blk_throtl_drain(q);
1330 }
1331 
/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting blkcg part:
 * destroys every blkg of @q and then tears down throttling for @q.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	/* blkg_destroy_all() is invoked with the queue lock held */
	spin_lock_irq(q->queue_lock);
	blkg_destroy_all(q);
	spin_unlock_irq(q->queue_lock);

	/* throttling is shut down only after the lock has been dropped */
	blk_throtl_exit(q);
}
1346 
1347 /*
1348  * We cannot support shared io contexts, as we have no mean to support
1349  * two tasks with the same ioc in two different groups without major rework
1350  * of the main cic data structures.  For now we allow a task to change
1351  * its cgroup only if it's the only owner of its ioc.
1352  */
1353 static int blkcg_can_attach(struct cgroup_taskset *tset)
1354 {
1355 	struct task_struct *task;
1356 	struct cgroup_subsys_state *dst_css;
1357 	struct io_context *ioc;
1358 	int ret = 0;
1359 
1360 	/* task_lock() is needed to avoid races with exit_io_context() */
1361 	cgroup_taskset_for_each(task, dst_css, tset) {
1362 		task_lock(task);
1363 		ioc = task->io_context;
1364 		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1365 			ret = -EINVAL;
1366 		task_unlock(task);
1367 		if (ret)
1368 			break;
1369 	}
1370 	return ret;
1371 }
1372 
1373 static void blkcg_bind(struct cgroup_subsys_state *root_css)
1374 {
1375 	int i;
1376 
1377 	mutex_lock(&blkcg_pol_mutex);
1378 
1379 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
1380 		struct blkcg_policy *pol = blkcg_policy[i];
1381 		struct blkcg *blkcg;
1382 
1383 		if (!pol || !pol->cpd_bind_fn)
1384 			continue;
1385 
1386 		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
1387 			if (blkcg->cpd[pol->plid])
1388 				pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
1389 	}
1390 	mutex_unlock(&blkcg_pol_mutex);
1391 }
1392 
1393 static void blkcg_exit(struct task_struct *tsk)
1394 {
1395 	if (tsk->throttle_queue)
1396 		blk_put_queue(tsk->throttle_queue);
1397 	tsk->throttle_queue = NULL;
1398 }
1399 
/*
 * cgroup subsystem descriptor for the block IO controller.  Wires the css
 * lifecycle callbacks (alloc/offline/free), the attach check, the bind and
 * task exit hooks, and the default and legacy interface file tables defined
 * in this file.  The legacy hierarchy name is "blkio".
 */
struct cgroup_subsys io_cgrp_subsys = {
	.css_alloc = blkcg_css_alloc,
	.css_offline = blkcg_css_offline,
	.css_free = blkcg_css_free,
	.can_attach = blkcg_can_attach,
	.bind = blkcg_bind,
	.dfl_cftypes = blkcg_files,
	.legacy_cftypes = blkcg_legacy_files,
	.legacy_name = "blkio",
	.exit = blkcg_exit,
#ifdef CONFIG_MEMCG
	/*
	 * This ensures that, if available, memcg is automatically enabled
	 * together on the default hierarchy so that the owner cgroup can
	 * be retrieved from writeback pages.
	 */
	.depends_on = 1 << memory_cgrp_id,
#endif
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);
1420 
1421 /**
1422  * blkcg_activate_policy - activate a blkcg policy on a request_queue
1423  * @q: request_queue of interest
1424  * @pol: blkcg policy to activate
1425  *
1426  * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
1427  * bypass mode to populate its blkgs with policy_data for @pol.
1428  *
1429  * Activation happens with @q bypassed, so nobody would be accessing blkgs
1430  * from IO path.  Update of each blkg is protected by both queue and blkcg
1431  * locks so that holding either lock and testing blkcg_policy_enabled() is
1432  * always enough for dereferencing policy data.
1433  *
1434  * The caller is responsible for synchronizing [de]activations and policy
1435  * [un]registerations.  Returns 0 on success, -errno on failure.
1436  */
1437 int blkcg_activate_policy(struct request_queue *q,
1438 			  const struct blkcg_policy *pol)
1439 {
1440 	struct blkg_policy_data *pd_prealloc = NULL;
1441 	struct blkcg_gq *blkg;
1442 	int ret;
1443 
1444 	if (blkcg_policy_enabled(q, pol))
1445 		return 0;
1446 
1447 	if (q->mq_ops)
1448 		blk_mq_freeze_queue(q);
1449 	else
1450 		blk_queue_bypass_start(q);
1451 pd_prealloc:
1452 	if (!pd_prealloc) {
1453 		pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
1454 		if (!pd_prealloc) {
1455 			ret = -ENOMEM;
1456 			goto out_bypass_end;
1457 		}
1458 	}
1459 
1460 	spin_lock_irq(q->queue_lock);
1461 
1462 	list_for_each_entry(blkg, &q->blkg_list, q_node) {
1463 		struct blkg_policy_data *pd;
1464 
1465 		if (blkg->pd[pol->plid])
1466 			continue;
1467 
1468 		pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node);
1469 		if (!pd)
1470 			swap(pd, pd_prealloc);
1471 		if (!pd) {
1472 			spin_unlock_irq(q->queue_lock);
1473 			goto pd_prealloc;
1474 		}
1475 
1476 		blkg->pd[pol->plid] = pd;
1477 		pd->blkg = blkg;
1478 		pd->plid = pol->plid;
1479 		if (pol->pd_init_fn)
1480 			pol->pd_init_fn(pd);
1481 	}
1482 
1483 	__set_bit(pol->plid, q->blkcg_pols);
1484 	ret = 0;
1485 
1486 	spin_unlock_irq(q->queue_lock);
1487 out_bypass_end:
1488 	if (q->mq_ops)
1489 		blk_mq_unfreeze_queue(q);
1490 	else
1491 		blk_queue_bypass_end(q);
1492 	if (pd_prealloc)
1493 		pol->pd_free_fn(pd_prealloc);
1494 	return ret;
1495 }
1496 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
1497 
1498 /**
1499  * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
1500  * @q: request_queue of interest
1501  * @pol: blkcg policy to deactivate
1502  *
1503  * Deactivate @pol on @q.  Follows the same synchronization rules as
1504  * blkcg_activate_policy().
1505  */
1506 void blkcg_deactivate_policy(struct request_queue *q,
1507 			     const struct blkcg_policy *pol)
1508 {
1509 	struct blkcg_gq *blkg;
1510 
1511 	if (!blkcg_policy_enabled(q, pol))
1512 		return;
1513 
1514 	if (q->mq_ops)
1515 		blk_mq_freeze_queue(q);
1516 	else
1517 		blk_queue_bypass_start(q);
1518 
1519 	spin_lock_irq(q->queue_lock);
1520 
1521 	__clear_bit(pol->plid, q->blkcg_pols);
1522 
1523 	list_for_each_entry(blkg, &q->blkg_list, q_node) {
1524 		if (blkg->pd[pol->plid]) {
1525 			if (pol->pd_offline_fn)
1526 				pol->pd_offline_fn(blkg->pd[pol->plid]);
1527 			pol->pd_free_fn(blkg->pd[pol->plid]);
1528 			blkg->pd[pol->plid] = NULL;
1529 		}
1530 	}
1531 
1532 	spin_unlock_irq(q->queue_lock);
1533 
1534 	if (q->mq_ops)
1535 		blk_mq_unfreeze_queue(q);
1536 	else
1537 		blk_queue_bypass_end(q);
1538 }
1539 EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
1540 
1541 /**
1542  * blkcg_policy_register - register a blkcg policy
1543  * @pol: blkcg policy to register
1544  *
1545  * Register @pol with blkcg core.  Might sleep and @pol may be modified on
1546  * successful registration.  Returns 0 on success and -errno on failure.
1547  */
1548 int blkcg_policy_register(struct blkcg_policy *pol)
1549 {
1550 	struct blkcg *blkcg;
1551 	int i, ret;
1552 
1553 	mutex_lock(&blkcg_pol_register_mutex);
1554 	mutex_lock(&blkcg_pol_mutex);
1555 
1556 	/* find an empty slot */
1557 	ret = -ENOSPC;
1558 	for (i = 0; i < BLKCG_MAX_POLS; i++)
1559 		if (!blkcg_policy[i])
1560 			break;
1561 	if (i >= BLKCG_MAX_POLS) {
1562 		pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
1563 		goto err_unlock;
1564 	}
1565 
1566 	/* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */
1567 	if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
1568 		(!pol->pd_alloc_fn ^ !pol->pd_free_fn))
1569 		goto err_unlock;
1570 
1571 	/* register @pol */
1572 	pol->plid = i;
1573 	blkcg_policy[pol->plid] = pol;
1574 
1575 	/* allocate and install cpd's */
1576 	if (pol->cpd_alloc_fn) {
1577 		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1578 			struct blkcg_policy_data *cpd;
1579 
1580 			cpd = pol->cpd_alloc_fn(GFP_KERNEL);
1581 			if (!cpd)
1582 				goto err_free_cpds;
1583 
1584 			blkcg->cpd[pol->plid] = cpd;
1585 			cpd->blkcg = blkcg;
1586 			cpd->plid = pol->plid;
1587 			pol->cpd_init_fn(cpd);
1588 		}
1589 	}
1590 
1591 	mutex_unlock(&blkcg_pol_mutex);
1592 
1593 	/* everything is in place, add intf files for the new policy */
1594 	if (pol->dfl_cftypes)
1595 		WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
1596 					       pol->dfl_cftypes));
1597 	if (pol->legacy_cftypes)
1598 		WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
1599 						  pol->legacy_cftypes));
1600 	mutex_unlock(&blkcg_pol_register_mutex);
1601 	return 0;
1602 
1603 err_free_cpds:
1604 	if (pol->cpd_free_fn) {
1605 		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1606 			if (blkcg->cpd[pol->plid]) {
1607 				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
1608 				blkcg->cpd[pol->plid] = NULL;
1609 			}
1610 		}
1611 	}
1612 	blkcg_policy[pol->plid] = NULL;
1613 err_unlock:
1614 	mutex_unlock(&blkcg_pol_mutex);
1615 	mutex_unlock(&blkcg_pol_register_mutex);
1616 	return ret;
1617 }
1618 EXPORT_SYMBOL_GPL(blkcg_policy_register);
1619 
1620 /**
1621  * blkcg_policy_unregister - unregister a blkcg policy
1622  * @pol: blkcg policy to unregister
1623  *
1624  * Undo blkcg_policy_register(@pol).  Might sleep.
1625  */
1626 void blkcg_policy_unregister(struct blkcg_policy *pol)
1627 {
1628 	struct blkcg *blkcg;
1629 
1630 	mutex_lock(&blkcg_pol_register_mutex);
1631 
1632 	if (WARN_ON(blkcg_policy[pol->plid] != pol))
1633 		goto out_unlock;
1634 
1635 	/* kill the intf files first */
1636 	if (pol->dfl_cftypes)
1637 		cgroup_rm_cftypes(pol->dfl_cftypes);
1638 	if (pol->legacy_cftypes)
1639 		cgroup_rm_cftypes(pol->legacy_cftypes);
1640 
1641 	/* remove cpds and unregister */
1642 	mutex_lock(&blkcg_pol_mutex);
1643 
1644 	if (pol->cpd_free_fn) {
1645 		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1646 			if (blkcg->cpd[pol->plid]) {
1647 				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
1648 				blkcg->cpd[pol->plid] = NULL;
1649 			}
1650 		}
1651 	}
1652 	blkcg_policy[pol->plid] = NULL;
1653 
1654 	mutex_unlock(&blkcg_pol_mutex);
1655 out_unlock:
1656 	mutex_unlock(&blkcg_pol_register_mutex);
1657 }
1658 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
1659 
1660 /*
1661  * Scale the accumulated delay based on how long it has been since we updated
1662  * the delay.  We only call this when we are adding delay, in case it's been a
1663  * while since we added delay, and when we are checking to see if we need to
1664  * delay a task, to account for any delays that may have occurred.
1665  */
1666 static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
1667 {
1668 	u64 old = atomic64_read(&blkg->delay_start);
1669 
1670 	/*
1671 	 * We only want to scale down every second.  The idea here is that we
1672 	 * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
1673 	 * time window.  We only want to throttle tasks for recent delay that
1674 	 * has occurred, in 1 second time windows since that's the maximum
1675 	 * things can be throttled.  We save the current delay window in
1676 	 * blkg->last_delay so we know what amount is still left to be charged
1677 	 * to the blkg from this point onward.  blkg->last_use keeps track of
1678 	 * the use_delay counter.  The idea is if we're unthrottling the blkg we
1679 	 * are ok with whatever is happening now, and we can take away more of
1680 	 * the accumulated delay as we've already throttled enough that
1681 	 * everybody is happy with their IO latencies.
1682 	 */
1683 	if (time_before64(old + NSEC_PER_SEC, now) &&
1684 	    atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
1685 		u64 cur = atomic64_read(&blkg->delay_nsec);
1686 		u64 sub = min_t(u64, blkg->last_delay, now - old);
1687 		int cur_use = atomic_read(&blkg->use_delay);
1688 
1689 		/*
1690 		 * We've been unthrottled, subtract a larger chunk of our
1691 		 * accumulated delay.
1692 		 */
1693 		if (cur_use < blkg->last_use)
1694 			sub = max_t(u64, sub, blkg->last_delay >> 1);
1695 
1696 		/*
1697 		 * This shouldn't happen, but handle it anyway.  Our delay_nsec
1698 		 * should only ever be growing except here where we subtract out
1699 		 * min(last_delay, 1 second), but lord knows bugs happen and I'd
1700 		 * rather not end up with negative numbers.
1701 		 */
1702 		if (unlikely(cur < sub)) {
1703 			atomic64_set(&blkg->delay_nsec, 0);
1704 			blkg->last_delay = 0;
1705 		} else {
1706 			atomic64_sub(sub, &blkg->delay_nsec);
1707 			blkg->last_delay = cur - sub;
1708 		}
1709 		blkg->last_use = cur_use;
1710 	}
1711 }
1712 
1713 /*
1714  * This is called when we want to actually walk up the hierarchy and check to
1715  * see if we need to throttle, and then actually throttle if there is some
1716  * accumulated delay.  This should only be called upon return to user space so
1717  * we're not holding some lock that would induce a priority inversion.
1718  */
1719 static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
1720 {
1721 	u64 now = ktime_to_ns(ktime_get());
1722 	u64 exp;
1723 	u64 delay_nsec = 0;
1724 	int tok;
1725 
1726 	while (blkg->parent) {
1727 		if (atomic_read(&blkg->use_delay)) {
1728 			blkcg_scale_delay(blkg, now);
1729 			delay_nsec = max_t(u64, delay_nsec,
1730 					   atomic64_read(&blkg->delay_nsec));
1731 		}
1732 		blkg = blkg->parent;
1733 	}
1734 
1735 	if (!delay_nsec)
1736 		return;
1737 
1738 	/*
1739 	 * Let's not sleep for all eternity if we've amassed a huge delay.
1740 	 * Swapping or metadata IO can accumulate 10's of seconds worth of
1741 	 * delay, and we want userspace to be able to do _something_ so cap the
1742 	 * delays at 1 second.  If there's 10's of seconds worth of delay then
1743 	 * the tasks will be delayed for 1 second for every syscall.
1744 	 */
1745 	delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
1746 
1747 	/*
1748 	 * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
1749 	 * that hasn't landed upstream yet.  Once that stuff is in place we need
1750 	 * to do a psi_memstall_enter/leave if memdelay is set.
1751 	 */
1752 
1753 	exp = ktime_add_ns(now, delay_nsec);
1754 	tok = io_schedule_prepare();
1755 	do {
1756 		__set_current_state(TASK_KILLABLE);
1757 		if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
1758 			break;
1759 	} while (!fatal_signal_pending(current));
1760 	io_schedule_finish(tok);
1761 }
1762 
1763 /**
1764  * blkcg_maybe_throttle_current - throttle the current task if it has been marked
1765  *
1766  * This is only called if we've been marked with set_notify_resume().  Obviously
1767  * we can be set_notify_resume() for reasons other than blkcg throttling, so we
1768  * check to see if current->throttle_queue is set and if not this doesn't do
1769  * anything.  This should only ever be called by the resume code, it's not meant
1770  * to be called by people willy-nilly as it will actually do the work to
1771  * throttle the task if it is setup for throttling.
1772  */
1773 void blkcg_maybe_throttle_current(void)
1774 {
1775 	struct request_queue *q = current->throttle_queue;
1776 	struct cgroup_subsys_state *css;
1777 	struct blkcg *blkcg;
1778 	struct blkcg_gq *blkg;
1779 	bool use_memdelay = current->use_memdelay;
1780 
1781 	if (!q)
1782 		return;
1783 
1784 	current->throttle_queue = NULL;
1785 	current->use_memdelay = false;
1786 
1787 	rcu_read_lock();
1788 	css = kthread_blkcg();
1789 	if (css)
1790 		blkcg = css_to_blkcg(css);
1791 	else
1792 		blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
1793 
1794 	if (!blkcg)
1795 		goto out;
1796 	blkg = blkg_lookup(blkcg, q);
1797 	if (!blkg)
1798 		goto out;
1799 	if (!blkg_tryget(blkg))
1800 		goto out;
1801 	rcu_read_unlock();
1802 
1803 	blkcg_maybe_throttle_blkg(blkg, use_memdelay);
1804 	blkg_put(blkg);
1805 	blk_put_queue(q);
1806 	return;
1807 out:
1808 	rcu_read_unlock();
1809 	blk_put_queue(q);
1810 }
1811 EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
1812 
1813 /**
1814  * blkcg_schedule_throttle - this task needs to check for throttling
1815  * @q - the request queue IO was submitted on
1816  * @use_memdelay - do we charge this to memory delay for PSI
1817  *
1818  * This is called by the IO controller when we know there's delay accumulated
1819  * for the blkg for this task.  We do not pass the blkg because there are places
1820  * we call this that may not have that information, the swapping code for
1821  * instance will only have a request_queue at that point.  This set's the
1822  * notify_resume for the task to check and see if it requires throttling before
1823  * returning to user space.
1824  *
1825  * We will only schedule once per syscall.  You can call this over and over
1826  * again and it will only do the check once upon return to user space, and only
1827  * throttle once.  If the task needs to be throttled again it'll need to be
1828  * re-set at the next time we see the task.
1829  */
1830 void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
1831 {
1832 	if (unlikely(current->flags & PF_KTHREAD))
1833 		return;
1834 
1835 	if (!blk_get_queue(q))
1836 		return;
1837 
1838 	if (current->throttle_queue)
1839 		blk_put_queue(current->throttle_queue);
1840 	current->throttle_queue = q;
1841 	if (use_memdelay)
1842 		current->use_memdelay = use_memdelay;
1843 	set_notify_resume(current);
1844 }
1845 EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
1846 
/**
 * blkcg_add_delay - add delay to this blkg
 * @blkg: the blkg to charge the delay to
 * @now: the current time in nanoseconds
 * @delta: how many nanoseconds of delay to add
 *
 * Charge @delta to the blkg's current delay accumulation.  This is used to
 * throttle tasks if an IO controller thinks we need more throttling.
 */
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
{
	/* age the already accumulated delay before adding the new charge */
	blkcg_scale_delay(blkg, now);
	atomic64_add(delta, &blkg->delay_nsec);
}
EXPORT_SYMBOL_GPL(blkcg_add_delay);
1861 
/*
 * Expose blkcg_debug_stats as a boolean module parameter (mode 0644).  When
 * it is set, blkcg_print_stat() also emits use_delay/delay_nsec and the
 * output of each policy's pd_stat_fn.
 */
module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
1864