/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *		      Nauman Rafique <nauman@google.com>
 *
 * For policy-specific per-blkcg data:
 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
 *		      Arianna Avanzini <avanzini.arianna@gmail.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
#include "blk.h"

#define MAX_KEY_LEN 100

/*
 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
 * policy [un]register operations including cgroup file additions /
 * removals.  Putting cgroup file registration outside blkcg_pol_mutex
 * allows grabbing it from cgroup callbacks.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;

static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */

static bool blkcg_debug_stats = false;

static bool blkcg_policy_enabled(struct request_queue *q,
				 const struct blkcg_policy *pol)
{
	return pol && test_bit(pol->plid, q->blkcg_pols);
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkcg_gq *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkg->pd[i])
			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);

	if (blkg->blkcg != &blkcg_root)
		blk_exit_rl(blkg->q, &blkg->rl);

	blkg_rwstat_exit(&blkg->stat_ios);
	blkg_rwstat_exit(&blkg->stat_bytes);
	kfree(blkg);
}

static void __blkg_release(struct rcu_head *rcu)
{
	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);

	percpu_ref_exit(&blkg->refcnt);

	/* release the blkcg and parent blkg refs this blkg has been holding */
	css_put(&blkg->blkcg->css);
	if (blkg->parent)
		blkg_put(blkg->parent);

	wb_congested_put(blkg->wb_congested);

	blkg_free(blkg);
}

/*
 * A group is RCU protected, but having an rcu lock does not mean that one
 * can access all the fields of blkg and assume these are valid.  For
 * example, don't try to follow throtl_data and request queue links.
 *
 * Having a reference to blkg under an rcu allows accesses to only values
 * local to groups like group stats and group rate limits.
 */
static void blkg_release(struct percpu_ref *ref)
{
	struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);

	call_rcu(&blkg->rcu_head, __blkg_release);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 * @gfp_mask: allocation mask to use
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
				   gfp_t gfp_mask)
{
	struct blkcg_gq *blkg;
	int i;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
	if (!blkg)
		return NULL;

	if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
	    blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
		goto err_free;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	blkg->blkcg = blkcg;

	/* root blkg uses @q->root_rl, init rl only for !root blkgs */
	if (blkcg != &blkcg_root) {
		if (blk_init_rl(&blkg->rl, q, gfp_mask))
			goto err_free;
		blkg->rl.blkg = blkg;
	}

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkg_policy_data *pd;

		if (!blkcg_policy_enabled(q, pol))
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = pol->pd_alloc_fn(gfp_mask, q->node);
		if (!pd)
			goto err_free;

		blkg->pd[i] = pd;
		pd->blkg = blkg;
		pd->plid = i;
	}

	return blkg;

err_free:
	blkg_free(blkg);
	return NULL;
}

struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
				      struct request_queue *q, bool update_hint)
{
	struct blkcg_gq *blkg;

	/*
	 * Hint didn't match.  Look up from the radix tree.  Note that the
	 * hint can only be updated under queue_lock as otherwise @blkg
	 * could have already been removed from blkg_tree.  The caller is
	 * responsible for grabbing queue_lock if @update_hint.
	 */
	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
	if (blkg && blkg->q == q) {
		if (update_hint) {
			lockdep_assert_held(q->queue_lock);
			rcu_assign_pointer(blkcg->blkg_hint, blkg);
		}
		return blkg;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);

/*
 * If @new_blkg is %NULL, this function tries to allocate a new one as
 * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
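 * The caller must be holding the RCU read lock and @q->queue_lock; both
 * are asserted at the top of the function.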
 */
static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
				    struct request_queue *q,
				    struct blkcg_gq *new_blkg)
{
	struct blkcg_gq *blkg;
	struct bdi_writeback_congested *wb_congested;
	int i, ret;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/* blkg holds a reference to blkcg */
	if (!css_tryget_online(&blkcg->css)) {
		ret = -ENODEV;
		goto err_free_blkg;
	}

	wb_congested = wb_congested_get_create(q->backing_dev_info,
					       blkcg->css.id,
					       GFP_NOWAIT | __GFP_NOWARN);
	if (!wb_congested) {
		ret = -ENOMEM;
		goto err_put_css;
	}

	/* allocate */
	if (!new_blkg) {
		new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto err_put_congested;
		}
	}
	blkg = new_blkg;
	blkg->wb_congested = wb_congested;

	/* link parent */
	if (blkcg_parent(blkcg)) {
		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
		if (WARN_ON_ONCE(!blkg->parent)) {
			ret = -ENODEV;
			goto err_put_congested;
		}
		blkg_get(blkg->parent);
	}

	ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0,
			      GFP_NOWAIT | __GFP_NOWARN);
	if (ret)
		goto err_cancel_ref;

	/* invoke per-policy init */
	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_init_fn)
			pol->pd_init_fn(blkg->pd[i]);
	}

	/* insert */
	spin_lock(&blkcg->lock);
	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
	if (likely(!ret)) {
		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
		list_add(&blkg->q_node, &q->blkg_list);

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_online_fn)
				pol->pd_online_fn(blkg->pd[i]);
		}
	}
	blkg->online = true;
	spin_unlock(&blkcg->lock);

	if (!ret)
		return blkg;

	/* @blkg failed to be fully initialized, use the usual release path */
	blkg_put(blkg);
	return ERR_PTR(ret);

err_cancel_ref:
	percpu_ref_exit(&blkg->refcnt);
err_put_congested:
	wb_congested_put(wb_congested);
err_put_css:
	css_put(&blkcg->css);
err_free_blkg:
	blkg_free(new_blkg);
	return ERR_PTR(ret);
}

/**
 * __blkg_lookup_create - lookup blkg, try to create one if not there
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
 * create one.  blkg creation is performed recursively from blkcg_root such
 * that all non-root blkg's have access to the parent blkg.  This function
 * should be called under RCU read lock and @q->queue_lock.
 *
 * Returns the blkg or the closest blkg if blkg_create() fails as it walks
 * down from root.
 */
struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
				      struct request_queue *q)
{
	struct blkcg_gq *blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)))
		return q->root_blkg;

	blkg = __blkg_lookup(blkcg, q, true);
	if (blkg)
		return blkg;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.  Returns the closest
	 * blkg to the intended blkg should blkg_create() fail.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent = blkcg_parent(blkcg);
		struct blkcg_gq *ret_blkg = q->root_blkg;

		while (parent) {
			blkg = __blkg_lookup(parent, q, false);
			if (blkg) {
				/* remember closest blkg */
				ret_blkg = blkg;
				break;
			}
			pos = parent;
			parent = blkcg_parent(parent);
		}

		blkg = blkg_create(pos, q, NULL);
		if (IS_ERR(blkg))
			return ret_blkg;
		if (pos == blkcg)
			return blkg;
	}
}

/**
 * blkg_lookup_create - find or create a blkg
 * @blkcg: target block cgroup
 * @q: target request_queue
 *
 * This looks up or creates the blkg representing the unique pair
 * of the blkcg and the request_queue.
 */
struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
				    struct request_queue *q)
{
	struct blkcg_gq *blkg = blkg_lookup(blkcg, q);
	unsigned long flags;

	if (unlikely(!blkg)) {
		spin_lock_irqsave(q->queue_lock, flags);

		blkg = __blkg_lookup_create(blkcg, q);

		spin_unlock_irqrestore(q->queue_lock, flags);
	}

	return blkg;
}

static void blkg_destroy(struct blkcg_gq *blkg)
{
	struct blkcg *blkcg = blkg->blkcg;
	struct blkcg_gq *parent = blkg->parent;
	int i;

	lockdep_assert_held(blkg->q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_offline_fn)
			pol->pd_offline_fn(blkg->pd[i]);
	}

	if (parent) {
		blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
		blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
	}

	blkg->online = false;

	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	/*
	 * Both setting lookup hint to and clearing it from @blkg are done
	 * under queue_lock.  If it's not pointing to @blkg now, it never
	 * will.  Hint assignment itself can race safely.
	 */
	if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
		rcu_assign_pointer(blkcg->blkg_hint, NULL);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	percpu_ref_kill(&blkg->refcnt);
}

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 *
 * Destroy all blkgs associated with @q.
 */
static void blkg_destroy_all(struct request_queue *q)
{
	struct blkcg_gq *blkg, *n;

	lockdep_assert_held(q->queue_lock);

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	q->root_blkg = NULL;
	q->root_rl.blkg = NULL;
}

/*
 * __blk_queue_next_rl() is used by blk_queue_for_each_rl().
 * It's a bit tricky because the root blkg uses @q->root_rl instead of its
 * own rl.
 */
struct request_list *__blk_queue_next_rl(struct request_list *rl,
					 struct request_queue *q)
{
	struct list_head *ent;
	struct blkcg_gq *blkg;

	/*
	 * Determine the current blkg list_head.  The first entry is
	 * root_rl which is off @q->blkg_list and mapped to the head.
	 */
	if (rl == &q->root_rl) {
		ent = &q->blkg_list;
		/* There are no more block groups, hence no request lists */
		if (list_empty(ent))
			return NULL;
	} else {
		blkg = container_of(rl, struct blkcg_gq, rl);
		ent = &blkg->q_node;
	}

	/* walk to the next list_head, skip root blkcg */
	ent = ent->next;
	if (ent == &q->root_blkg->q_node)
		ent = ent->next;
	if (ent == &q->blkg_list)
		return NULL;

	blkg = container_of(ent, struct blkcg_gq, q_node);
	return &blkg->rl;
}

static int blkcg_reset_stats(struct cgroup_subsys_state *css,
			     struct cftype *cftype, u64 val)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;
	int i;

	mutex_lock(&blkcg_pol_mutex);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		blkg_rwstat_reset(&blkg->stat_bytes);
		blkg_rwstat_reset(&blkg->stat_ios);

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_reset_stats_fn)
				pol->pd_reset_stats_fn(blkg->pd[i]);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	mutex_unlock(&blkcg_pol_mutex);
	return 0;
}

const char *blkg_dev_name(struct blkcg_gq *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info->dev)
		return dev_name(blkg->q->backing_dev_info->dev);
	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_dev_name);

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data and the matching queue lock held.  If @show_total
 * is %true, the sum of the return values from @prfill is printed with
 * "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->seq_show method.
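 * (blkg_print_stat_bytes() below is one example of such a print function.)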
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
		       u64 (*prfill)(struct seq_file *,
				     struct blkg_policy_data *, int),
		       const struct blkcg_policy *pol, int data,
		       bool show_total)
{
	struct blkcg_gq *blkg;
	u64 total = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		spin_lock_irq(blkg->q->queue_lock);
		if (blkcg_policy_enabled(blkg->q, pol))
			total += prfill(sf, blkg->pd[pol->plid], data);
		spin_unlock_irq(blkg->q->queue_lock);
	}
	rcu_read_unlock();

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
			 const struct blkg_rwstat *rwstat)
{
	static const char *rwstr[] = {
		[BLKG_RWSTAT_READ]	= "Read",
		[BLKG_RWSTAT_WRITE]	= "Write",
		[BLKG_RWSTAT_SYNC]	= "Sync",
		[BLKG_RWSTAT_ASYNC]	= "Async",
		[BLKG_RWSTAT_DISCARD]	= "Discard",
	};
	const char *dname = blkg_dev_name(pd->blkg);
	u64 v;
	int i;

	if (!dname)
		return 0;

	for (i = 0; i < BLKG_RWSTAT_NR; i++)
		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
			   (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));

	v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
		atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) +
		atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]);
	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);

/**
 * blkg_prfill_stat - prfill callback for blkg_stat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @off: offset to the blkg_stat in @pd
 *
 * prfill callback for printing a blkg_stat.
 */
u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
{
	return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
}
EXPORT_SYMBOL_GPL(blkg_prfill_stat);

/**
 * blkg_prfill_rwstat - prfill callback for blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @off: offset to the blkg_rwstat in @pd
 *
 * prfill callback for printing a blkg_rwstat.
 */
u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
		       int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}
EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);

static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
				    struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/**
 * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
 * @sf: seq_file to print to
 * @v: unused
 *
 * To be used as cftype->seq_show to print blkg->stat_bytes.
 * cftype->private must be set to the blkcg_policy.
 */
int blkg_print_stat_bytes(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
			  offsetof(struct blkcg_gq, stat_bytes), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);

/**
 * blkg_print_stat_ios - seq_show callback for blkg->stat_ios
 * @sf: seq_file to print to
 * @v: unused
 *
 * To be used as cftype->seq_show to print blkg->stat_ios.  cftype->private
 * must be set to the blkcg_policy.
 */
int blkg_print_stat_ios(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
			  offsetof(struct blkcg_gq, stat_ios), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_ios);

static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
					      struct blkg_policy_data *pd,
					      int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
							      NULL, off);
	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/**
 * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
 * @sf: seq_file to print to
 * @v: unused
 */
int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  blkg_prfill_rwstat_field_recursive,
			  (void *)seq_cft(sf)->private,
			  offsetof(struct blkcg_gq, stat_bytes), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);

/**
 * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
 * @sf: seq_file to print to
 * @v: unused
 */
int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  blkg_prfill_rwstat_field_recursive,
			  (void *)seq_cft(sf)->private,
			  offsetof(struct blkcg_gq, stat_ios), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);

/**
 * blkg_stat_recursive_sum - collect hierarchical blkg_stat
 * @blkg: blkg of interest
 * @pol: blkcg_policy which contains the blkg_stat
 * @off: offset to the blkg_stat in blkg_policy_data or @blkg
 *
 * Collect the blkg_stat specified by @blkg, @pol and @off and all its
 * online descendants and their aux counts.  The caller must be holding the
 * queue lock for online tests.
 *
 * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
 * at @off bytes into @blkg's blkg_policy_data of the policy.
 */
u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
			    struct blkcg_policy *pol, int off)
{
	struct blkcg_gq *pos_blkg;
	struct cgroup_subsys_state *pos_css;
	u64 sum = 0;

	lockdep_assert_held(blkg->q->queue_lock);

	rcu_read_lock();
	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
		struct blkg_stat *stat;

		if (!pos_blkg->online)
			continue;

		if (pol)
			stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
		else
			stat = (void *)blkg + off;

		sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
	}
	rcu_read_unlock();

	return sum;
}
EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);

/**
 * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
 * @blkg: blkg of interest
 * @pol: blkcg_policy which contains the blkg_rwstat
 * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
 *
 * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
 * online descendants and their aux counts.  The caller must be holding the
 * queue lock for online tests.
 *
 * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
 * is at @off bytes into @blkg's blkg_policy_data of the policy.
 */
struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
					     struct blkcg_policy *pol, int off)
{
	struct blkcg_gq *pos_blkg;
	struct cgroup_subsys_state *pos_css;
	struct blkg_rwstat sum = { };
	int i;

	lockdep_assert_held(blkg->q->queue_lock);

	rcu_read_lock();
	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
		struct blkg_rwstat *rwstat;

		if (!pos_blkg->online)
			continue;

		if (pol)
			rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
		else
			rwstat = (void *)pos_blkg + off;

		for (i = 0; i < BLKG_RWSTAT_NR; i++)
			atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
				percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
				&sum.aux_cnt[i]);
	}
	rcu_read_unlock();

	return sum;
}
EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);

/* Performs queue bypass and policy enabled checks then looks up blkg. */
static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
					  const struct blkcg_policy *pol,
					  struct request_queue *q)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	if (!blkcg_policy_enabled(q, pol))
		return ERR_PTR(-EOPNOTSUPP);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)))
		return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);

	return __blkg_lookup(blkcg, q, true /* update_hint */);
}

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
 * part of @input following MAJ:MIN.  This function returns with RCU read
 * lock and queue lock held and must be paired with blkg_conf_finish().
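 *
 * As an illustration (not taken from this file; "blkcg_policy_foo" and the
 * surrounding handler are hypothetical), a policy's config write handler
 * would typically pair the two like this:
 *
 *	ret = blkg_conf_prep(blkcg, &blkcg_policy_foo, buf, &ctx);
 *	if (ret)
 *		return ret;
 *	... parse ctx->body and update ctx->blkg's policy data ...
 *	blkg_conf_finish(&ctx);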
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
		   char *input, struct blkg_conf_ctx *ctx)
	__acquires(rcu) __acquires(disk->queue->queue_lock)
{
	struct gendisk *disk;
	struct request_queue *q;
	struct blkcg_gq *blkg;
	unsigned int major, minor;
	int key_len, part, ret;
	char *body;

	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
		return -EINVAL;

	body = input + key_len;
	if (!isspace(*body))
		return -EINVAL;
	body = skip_spaces(body);

	disk = get_gendisk(MKDEV(major, minor), &part);
	if (!disk)
		return -ENODEV;
	if (part) {
		ret = -ENODEV;
		goto fail;
	}

	q = disk->queue;

	rcu_read_lock();
	spin_lock_irq(q->queue_lock);

	blkg = blkg_lookup_check(blkcg, pol, q);
	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		goto fail_unlock;
	}

	if (blkg)
		goto success;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent;
		struct blkcg_gq *new_blkg;

		parent = blkcg_parent(blkcg);
		while (parent && !__blkg_lookup(parent, q, false)) {
			pos = parent;
			parent = blkcg_parent(parent);
		}

		/* Drop locks to do new blkg allocation with GFP_KERNEL. */
		spin_unlock_irq(q->queue_lock);
		rcu_read_unlock();

		new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto fail;
		}

		rcu_read_lock();
		spin_lock_irq(q->queue_lock);

		blkg = blkg_lookup_check(pos, pol, q);
		if (IS_ERR(blkg)) {
			ret = PTR_ERR(blkg);
			goto fail_unlock;
		}

		if (blkg) {
			blkg_free(new_blkg);
		} else {
			blkg = blkg_create(pos, q, new_blkg);
			if (unlikely(IS_ERR(blkg))) {
				ret = PTR_ERR(blkg);
				goto fail_unlock;
			}
		}

		if (pos == blkcg)
			goto success;
	}
success:
	ctx->disk = disk;
	ctx->blkg = blkg;
	ctx->body = body;
	return 0;

fail_unlock:
	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();
fail:
	put_disk_and_module(disk);
	/*
	 * If queue was bypassing, we should retry.  Do so after a
	 * short msleep().  It isn't strictly necessary but queue
	 * can be bypassing for some time and it's always nice to
	 * avoid busy looping.
	 */
	if (ret == -EBUSY) {
		msleep(10);
		ret = restart_syscall();
	}
	return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
	__releases(ctx->disk->queue->queue_lock) __releases(rcu)
{
	spin_unlock_irq(ctx->disk->queue->queue_lock);
	rcu_read_unlock();
	put_disk_and_module(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);

static int blkcg_print_stat(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct blkcg_gq *blkg;

	rcu_read_lock();

	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		const char *dname;
		char *buf;
		struct blkg_rwstat rwstat;
		u64 rbytes, wbytes, rios, wios, dbytes, dios;
		size_t size = seq_get_buf(sf, &buf), off = 0;
		int i;
		bool has_stats = false;

		dname = blkg_dev_name(blkg);
		if (!dname)
			continue;

		/*
		 * Hooray string manipulation, count is the size written NOT
		 * INCLUDING THE \0, so size is now count+1 less than what we
		 * had before, but we want to start writing the next bit from
		 * the \0 so we only add count to buf.
		 */
		off += scnprintf(buf+off, size-off, "%s ", dname);

		spin_lock_irq(blkg->q->queue_lock);

		rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
					offsetof(struct blkcg_gq, stat_bytes));
		rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
		wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
		dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);

		rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
					offsetof(struct blkcg_gq, stat_ios));
		rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
		wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
		dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);

		spin_unlock_irq(blkg->q->queue_lock);

		if (rbytes || wbytes || rios || wios) {
			has_stats = true;
			off += scnprintf(buf+off, size-off,
					 "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
					 rbytes, wbytes, rios, wios,
					 dbytes, dios);
		}

		if (!blkcg_debug_stats)
			goto next;

		if (atomic_read(&blkg->use_delay)) {
			has_stats = true;
			off += scnprintf(buf+off, size-off,
					 " use_delay=%d delay_nsec=%llu",
					 atomic_read(&blkg->use_delay),
					 (unsigned long long)atomic64_read(&blkg->delay_nsec));
		}

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];
			size_t written;

			if (!blkg->pd[i] || !pol->pd_stat_fn)
				continue;

			written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
			if (written)
				has_stats = true;
			off += written;
		}
next:
		if (has_stats) {
			off += scnprintf(buf+off, size-off, "\n");
			seq_commit(sf, off);
		}
	}

	rcu_read_unlock();
	return 0;
}

static struct cftype blkcg_files[] = {
	{
		.name = "stat",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = blkcg_print_stat,
	},
	{ }	/* terminate */
};

static struct cftype blkcg_legacy_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkcg_reset_stats,
	},
	{ }	/* terminate */
};

/*
 * blkcg destruction is a three-stage process.
 *
 * 1. Destruction starts.  The blkcg_css_offline() callback is invoked
 *    which offlines writeback.  Here we tie the next stage of blkg destruction
 *    to the completion of writeback associated with the blkcg.
 *    This lets us avoid punting potentially large amounts of outstanding
 *    writeback to root while maintaining any ongoing policies.  The next
 *    stage is triggered when the nr_cgwbs count goes to zero.
 *
 * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called
 *    and handles the destruction of blkgs.  Here the css reference held by
 *    the blkg is put back eventually allowing blkcg_css_free() to be called.
 *    This work may occur in cgwb_release_workfn() on the cgwb_release
 *    workqueue.  Any submitted ios that fail to get the blkg ref will be
 *    punted to the root_blkg.
 *
 * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called.
 *    This finally frees the blkcg.
 */

/**
 * blkcg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * This function is called when @css is about to go away.  Here the cgwbs are
 * offlined first and only once writeback associated with the blkcg has
 * finished do we start step 2 (see above).
 */
static void blkcg_css_offline(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);

	/* this prevents anyone from attaching or migrating to this blkcg */
	wb_blkcg_offline(blkcg);

	/* put the base cgwb reference allowing step 2 to be triggered */
	blkcg_cgwb_put(blkcg);
}

/**
 * blkcg_destroy_blkgs - responsible for shooting down blkgs
 * @blkcg: blkcg of interest
 *
 * blkgs should be removed while holding both q and blkcg locks.  As blkcg lock
 * is nested inside q lock, this function performs reverse double lock dancing.
 * Destroying the blkgs releases the reference held on the blkcg's css allowing
 * blkcg_css_free to eventually be called.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
void blkcg_destroy_blkgs(struct blkcg *blkcg)
{
	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
						    struct blkcg_gq, blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);
}

static void blkcg_css_free(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	int i;

	mutex_lock(&blkcg_pol_mutex);

	list_del(&blkcg->all_blkcgs_node);

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	mutex_unlock(&blkcg_pol_mutex);

	kfree(blkcg);
}

static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct blkcg *blkcg;
	struct cgroup_subsys_state *ret;
	int i;

	mutex_lock(&blkcg_pol_mutex);

	if (!parent_css) {
		blkcg = &blkcg_root;
	} else {
		blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
		if (!blkcg) {
			ret = ERR_PTR(-ENOMEM);
			goto unlock;
		}
	}

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg_policy_data *cpd;

		/*
		 * If the policy hasn't been attached yet, wait for it
		 * to be attached before doing anything else.  Otherwise,
		 * check if the policy requires any specific per-cgroup
		 * data: if it does, allocate and initialize it.
		 */
		if (!pol || !pol->cpd_alloc_fn)
			continue;

		cpd = pol->cpd_alloc_fn(GFP_KERNEL);
		if (!cpd) {
			ret = ERR_PTR(-ENOMEM);
			goto free_pd_blkcg;
		}
		blkcg->cpd[i] = cpd;
		cpd->blkcg = blkcg;
		cpd->plid = i;
		if (pol->cpd_init_fn)
			pol->cpd_init_fn(cpd);
	}

	spin_lock_init(&blkcg->lock);
	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
	INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
	INIT_LIST_HEAD(&blkcg->cgwb_list);
	refcount_set(&blkcg->cgwb_refcnt, 1);
#endif
	list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);

	mutex_unlock(&blkcg_pol_mutex);
	return &blkcg->css;

free_pd_blkcg:
	for (i--; i >= 0; i--)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	if (blkcg != &blkcg_root)
		kfree(blkcg);
unlock:
	mutex_unlock(&blkcg_pol_mutex);
	return ret;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node().  Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	struct blkcg_gq *new_blkg, *blkg;
	bool preloaded;
	int ret;

	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
	if (!new_blkg)
		return -ENOMEM;

	preloaded = !radix_tree_preload(GFP_KERNEL);

	/* Make sure the root blkg exists. */
	rcu_read_lock();
	spin_lock_irq(q->queue_lock);
	blkg = blkg_create(&blkcg_root, q, new_blkg);
	if (IS_ERR(blkg))
		goto err_unlock;
	q->root_blkg = blkg;
	q->root_rl.blkg = blkg;
	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();

	if (preloaded)
		radix_tree_preload_end();

	ret = blk_iolatency_init(q);
	if (ret) {
		spin_lock_irq(q->queue_lock);
		blkg_destroy_all(q);
		spin_unlock_irq(q->queue_lock);
		return ret;
	}

	ret = blk_throtl_init(q);
	if (ret) {
		spin_lock_irq(q->queue_lock);
		blkg_destroy_all(q);
		spin_unlock_irq(q->queue_lock);
	}
	return ret;

err_unlock:
	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();
	if (preloaded)
		radix_tree_preload_end();
	return PTR_ERR(blkg);
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue().  Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);

	/*
	 * @q could be exiting and already have destroyed all blkgs as
	 * indicated by NULL root_blkg.  If so, don't confuse policies.
	 */
	if (!q->root_blkg)
		return;

	blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	spin_lock_irq(q->queue_lock);
	blkg_destroy_all(q);
	spin_unlock_irq(q->queue_lock);

	blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkcg_can_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *dst_css;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, dst_css, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

static void blkcg_bind(struct cgroup_subsys_state *root_css)
{
	int i;

	mutex_lock(&blkcg_pol_mutex);

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg *blkcg;

		if (!pol || !pol->cpd_bind_fn)
			continue;

		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
			if (blkcg->cpd[pol->plid])
				pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
	}
	mutex_unlock(&blkcg_pol_mutex);
}

static void blkcg_exit(struct task_struct *tsk)
{
	if (tsk->throttle_queue)
		blk_put_queue(tsk->throttle_queue);
	tsk->throttle_queue = NULL;
}

struct cgroup_subsys io_cgrp_subsys = {
	.css_alloc = blkcg_css_alloc,
	.css_offline = blkcg_css_offline,
	.css_free = blkcg_css_free,
	.can_attach = blkcg_can_attach,
	.bind = blkcg_bind,
	.dfl_cftypes = blkcg_files,
	.legacy_cftypes = blkcg_legacy_files,
	.legacy_name = "blkio",
	.exit = blkcg_exit,
#ifdef CONFIG_MEMCG
	/*
	 * This ensures that, if available, memcg is automatically enabled
	 * together on the default hierarchy so that the owner cgroup can
	 * be retrieved from writeback pages.
	 */
	.depends_on = 1 << memory_cgrp_id,
#endif
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
 * bypass mode to populate its blkgs with policy_data for @pol.
 *
 * Activation happens with @q bypassed, so nobody would be accessing blkgs
 * from IO path.  Update of each blkg is protected by both queue and blkcg
 * locks so that holding either lock and testing blkcg_policy_enabled() is
 * always enough for dereferencing policy data.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registrations.  Returns 0 on success, -errno on failure.
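 *
 * (Policies activate themselves from their own init paths; for example,
 * blk_throtl_init() activates the throttling policy on its queue this way.)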
 */
int blkcg_activate_policy(struct request_queue *q,
			  const struct blkcg_policy *pol)
{
	struct blkg_policy_data *pd_prealloc = NULL;
	struct blkcg_gq *blkg;
	int ret;

	if (blkcg_policy_enabled(q, pol))
		return 0;

	if (q->mq_ops)
		blk_mq_freeze_queue(q);
	else
		blk_queue_bypass_start(q);
pd_prealloc:
	if (!pd_prealloc) {
		pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
		if (!pd_prealloc) {
			ret = -ENOMEM;
			goto out_bypass_end;
		}
	}

	spin_lock_irq(q->queue_lock);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		struct blkg_policy_data *pd;

		if (blkg->pd[pol->plid])
			continue;

		pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node);
		if (!pd)
			swap(pd, pd_prealloc);
		if (!pd) {
			spin_unlock_irq(q->queue_lock);
			goto pd_prealloc;
		}

		blkg->pd[pol->plid] = pd;
		pd->blkg = blkg;
		pd->plid = pol->plid;
		if (pol->pd_init_fn)
			pol->pd_init_fn(pd);
	}

	__set_bit(pol->plid, q->blkcg_pols);
	ret = 0;

	spin_unlock_irq(q->queue_lock);
out_bypass_end:
	if (q->mq_ops)
		blk_mq_unfreeze_queue(q);
	else
		blk_queue_bypass_end(q);
	if (pd_prealloc)
		pol->pd_free_fn(pd_prealloc);
	return ret;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @q.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct request_queue *q,
			     const struct blkcg_policy *pol)
{
	struct blkcg_gq *blkg;

	if (!blkcg_policy_enabled(q, pol))
		return;

	if (q->mq_ops)
		blk_mq_freeze_queue(q);
	else
		blk_queue_bypass_start(q);

	spin_lock_irq(q->queue_lock);

	__clear_bit(pol->plid, q->blkcg_pols);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		if (blkg->pd[pol->plid]) {
			if (pol->pd_offline_fn)
				pol->pd_offline_fn(blkg->pd[pol->plid]);
			pol->pd_free_fn(blkg->pd[pol->plid]);
			blkg->pd[pol->plid] = NULL;
		}
	}

	spin_unlock_irq(q->queue_lock);

	if (q->mq_ops)
		blk_mq_unfreeze_queue(q);
	else
		blk_queue_bypass_end(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);

/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure.
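 *
 * A minimal sketch of a registration (illustrative only; the "foo" names
 * are placeholders): a policy provides paired alloc/free hooks in a static
 * struct blkcg_policy and registers it from its module init:
 *
 *	static struct blkcg_policy blkcg_policy_foo = {
 *		.pd_alloc_fn	= foo_pd_alloc,
 *		.pd_free_fn	= foo_pd_free,
 *	};
 *
 *	return blkcg_policy_register(&blkcg_policy_foo);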
 */
int blkcg_policy_register(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;
	int i, ret;

	mutex_lock(&blkcg_pol_register_mutex);
	mutex_lock(&blkcg_pol_mutex);

	/* find an empty slot */
	ret = -ENOSPC;
	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (!blkcg_policy[i])
			break;
	if (i >= BLKCG_MAX_POLS) {
		pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
		goto err_unlock;
	}

	/* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn come in pairs */
	if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
	    (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
		goto err_unlock;

	/* register @pol */
	pol->plid = i;
	blkcg_policy[pol->plid] = pol;

	/* allocate and install cpd's */
	if (pol->cpd_alloc_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			struct blkcg_policy_data *cpd;

			cpd = pol->cpd_alloc_fn(GFP_KERNEL);
			if (!cpd)
				goto err_free_cpds;

			blkcg->cpd[pol->plid] = cpd;
			cpd->blkcg = blkcg;
			cpd->plid = pol->plid;
			pol->cpd_init_fn(cpd);
		}
	}

	mutex_unlock(&blkcg_pol_mutex);

	/* everything is in place, add intf files for the new policy */
	if (pol->dfl_cftypes)
		WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
					       pol->dfl_cftypes));
	if (pol->legacy_cftypes)
		WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
						  pol->legacy_cftypes));
	mutex_unlock(&blkcg_pol_register_mutex);
	return 0;

err_free_cpds:
	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;
err_unlock:
	mutex_unlock(&blkcg_pol_mutex);
	mutex_unlock(&blkcg_pol_register_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(blkcg_policy_register);

/**
 * blkcg_policy_unregister - unregister a blkcg policy
 * @pol: blkcg policy to unregister
 *
 * Undo blkcg_policy_register(@pol).  Might sleep.
 */
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;

	mutex_lock(&blkcg_pol_register_mutex);

	if (WARN_ON(blkcg_policy[pol->plid] != pol))
		goto out_unlock;

	/* kill the intf files first */
	if (pol->dfl_cftypes)
		cgroup_rm_cftypes(pol->dfl_cftypes);
	if (pol->legacy_cftypes)
		cgroup_rm_cftypes(pol->legacy_cftypes);

	/* remove cpds and unregister */
	mutex_lock(&blkcg_pol_mutex);

	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;

	mutex_unlock(&blkcg_pol_mutex);
out_unlock:
	mutex_unlock(&blkcg_pol_register_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);

/*
 * Scale the accumulated delay based on how long it has been since we updated
 * the delay.  We only call this when we are adding delay, in case it's been a
 * while since we added delay, and when we are checking to see if we need to
 * delay a task, to account for any delays that may have occurred.
 */
static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
{
	u64 old = atomic64_read(&blkg->delay_start);

	/*
	 * We only want to scale down every second.  The idea here is that we
	 * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
	 * time window.  We only want to throttle tasks for recent delay that
	 * has occurred, in 1 second time windows since that's the maximum
	 * things can be throttled.  We save the current delay window in
	 * blkg->last_delay so we know what amount is still left to be charged
	 * to the blkg from this point onward.  blkg->last_use keeps track of
	 * the use_delay counter.  The idea is if we're unthrottling the blkg we
	 * are ok with whatever is happening now, and we can take away more of
	 * the accumulated delay as we've already throttled enough that
	 * everybody is happy with their IO latencies.
	 */
	if (time_before64(old + NSEC_PER_SEC, now) &&
	    atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
		u64 cur = atomic64_read(&blkg->delay_nsec);
		u64 sub = min_t(u64, blkg->last_delay, now - old);
		int cur_use = atomic_read(&blkg->use_delay);

		/*
		 * We've been unthrottled, subtract a larger chunk of our
		 * accumulated delay.
		 */
		if (cur_use < blkg->last_use)
			sub = max_t(u64, sub, blkg->last_delay >> 1);

		/*
		 * This shouldn't happen, but handle it anyway.  Our delay_nsec
		 * should only ever be growing except here where we subtract out
		 * min(last_delay, 1 second), but lord knows bugs happen and I'd
		 * rather not end up with negative numbers.
		 */
		if (unlikely(cur < sub)) {
			atomic64_set(&blkg->delay_nsec, 0);
			blkg->last_delay = 0;
		} else {
			atomic64_sub(sub, &blkg->delay_nsec);
			blkg->last_delay = cur - sub;
		}
		blkg->last_use = cur_use;
	}
}

/*
 * This is called when we want to actually walk up the hierarchy and check to
 * see if we need to throttle, and then actually throttle if there is some
 * accumulated delay.  This should only be called upon return to user space so
 * we're not holding some lock that would induce a priority inversion.
 */
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
	u64 now = ktime_to_ns(ktime_get());
	u64 exp;
	u64 delay_nsec = 0;
	int tok;

	while (blkg->parent) {
		if (atomic_read(&blkg->use_delay)) {
			blkcg_scale_delay(blkg, now);
			delay_nsec = max_t(u64, delay_nsec,
					   atomic64_read(&blkg->delay_nsec));
		}
		blkg = blkg->parent;
	}

	if (!delay_nsec)
		return;

	/*
	 * Let's not sleep for all eternity if we've amassed a huge delay.
	 * Swapping or metadata IO can accumulate 10's of seconds worth of
	 * delay, and we want userspace to be able to do _something_ so cap
	 * the delays at 0.25s.  If there's 10's of seconds worth of delay
	 * then the tasks will be delayed for 0.25 seconds for every syscall.
	 */
	delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);

	/*
	 * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
	 * that hasn't landed upstream yet.  Once that stuff is in place we need
	 * to do a psi_memstall_enter/leave if memdelay is set.
	 */

	exp = ktime_add_ns(now, delay_nsec);
	tok = io_schedule_prepare();
	do {
		__set_current_state(TASK_KILLABLE);
		if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
			break;
	} while (!fatal_signal_pending(current));
	io_schedule_finish(tok);
}

/**
 * blkcg_maybe_throttle_current - throttle the current task if it has been marked
 *
 * This is only called if we've been marked with set_notify_resume().  Obviously
 * we can be set_notify_resume() for reasons other than blkcg throttling, so we
 * check to see if current->throttle_queue is set and if not this doesn't do
 * anything.  This should only ever be called by the resume code, it's not meant
 * to be called by people willy-nilly as it will actually do the work to
 * throttle the task if it is setup for throttling.
 */
void blkcg_maybe_throttle_current(void)
{
	struct request_queue *q = current->throttle_queue;
	struct cgroup_subsys_state *css;
	struct blkcg *blkcg;
	struct blkcg_gq *blkg;
	bool use_memdelay = current->use_memdelay;

	if (!q)
		return;

	current->throttle_queue = NULL;
	current->use_memdelay = false;

	rcu_read_lock();
	css = kthread_blkcg();
	if (css)
		blkcg = css_to_blkcg(css);
	else
		blkcg = css_to_blkcg(task_css(current, io_cgrp_id));

	if (!blkcg)
		goto out;
	blkg = blkg_lookup(blkcg, q);
	if (!blkg)
		goto out;
	if (!blkg_tryget(blkg))
		goto out;
	rcu_read_unlock();

	blkcg_maybe_throttle_blkg(blkg, use_memdelay);
	blkg_put(blkg);
	blk_put_queue(q);
	return;
out:
	rcu_read_unlock();
	blk_put_queue(q);
}
EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);

/**
 * blkcg_schedule_throttle - this task needs to check for throttling
 * @q: the request queue IO was submitted on
 * @use_memdelay: do we charge this to memory delay for PSI
 *
 * This is called by the IO controller when we know there's delay accumulated
 * for the blkg for this task.  We do not pass the blkg because there are places
 * we call this that may not have that information, the swapping code for
 * instance will only have a request_queue at that point.  This sets the
 * notify_resume for the task to check and see if it requires throttling before
 * returning to user space.
 *
 * We will only schedule once per syscall.  You can call this over and over
 * again and it will only do the check once upon return to user space, and only
 * throttle once.  If the task needs to be throttled again it'll need to be
 * re-set at the next time we see the task.
 */
void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
{
	if (unlikely(current->flags & PF_KTHREAD))
		return;

	if (!blk_get_queue(q))
		return;

	if (current->throttle_queue)
		blk_put_queue(current->throttle_queue);
	current->throttle_queue = q;
	if (use_memdelay)
		current->use_memdelay = use_memdelay;
	set_notify_resume(current);
}
EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);

/**
 * blkcg_add_delay - add delay to this blkg
 * @now: the current time in nanoseconds
 * @delta: how many nanoseconds of delay to add
 *
 * Charge @delta to the blkg's current delay accumulation.  This is used to
 * throttle tasks if an IO controller thinks we need more throttling.
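 *
 * Callers typically pair this with blkcg_schedule_throttle() so the task
 * checks the accumulated delay on its next return to user space.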
 */
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
{
	blkcg_scale_delay(blkg, now);
	atomic64_add(delta, &blkg->delay_nsec);
}
EXPORT_SYMBOL_GPL(blkcg_add_delay);

module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");