/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *		      Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include "blk-cgroup.h"
#include <linux/genhd.h>

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
						  struct cgroup *);
static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
			      struct task_struct *, bool);
static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
			   struct cgroup *, struct task_struct *, bool);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.attach = blkiocg_attach,
	.destroy = blkiocg_destroy,
	.populate = blkiocg_populate,
#ifdef CONFIG_BLK_CGROUP
	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
	.subsys_id = blkio_subsys_id,
#endif
	.use_id = 1,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
					    struct blkio_policy_node *pn)
{
	list_add(&pn->node, &blkcg->policy_list);
}

/* Must be called with blkcg->lock held */
static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
{
	list_del(&pn->node);
}

/* Must be called with blkcg->lock held */
static struct blkio_policy_node *
blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
{
	struct blkio_policy_node *pn;

	list_for_each_entry(pn, &blkcg->policy_list, node) {
		if (pn->dev == dev)
			return pn;
	}

	return NULL;
}

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

/*
 * Add to the appropriate stat variable depending on the request type.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
			   bool sync)
{
	if (direction)
		stat[BLKIO_STAT_WRITE] += add;
	else
		stat[BLKIO_STAT_READ] += add;
	if (sync)
		stat[BLKIO_STAT_SYNC] += add;
	else
		stat[BLKIO_STAT_ASYNC] += add;
}

/*
 * Decrements the appropriate stat variable if non-zero depending on the
 * request type. Panics on value being zero.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
{
	if (direction) {
		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
		stat[BLKIO_STAT_WRITE]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_READ] == 0);
		stat[BLKIO_STAT_READ]--;
	}
	if (sync) {
		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
		stat[BLKIO_STAT_SYNC]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
		stat[BLKIO_STAT_ASYNC]--;
	}
}

#ifdef CONFIG_DEBUG_BLK_CGROUP
/* This should be called with the blkg->stats_lock held. */
static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					    struct blkio_group *curr_blkg)
{
	if (blkio_blkg_waiting(&blkg->stats))
		return;
	if (blkg == curr_blkg)
		return;
	blkg->stats.start_group_wait_time = sched_clock();
	blkio_mark_blkg_waiting(&blkg->stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		stats->group_wait_time += now - stats->start_group_wait_time;
	blkio_clear_blkg_waiting(stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_end_empty_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		stats->empty_time += now - stats->start_empty_time;
	blkio_clear_blkg_empty(stats);
}

void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	BUG_ON(blkio_blkg_idling(&blkg->stats));
	blkg->stats.start_idle_time = sched_clock();
	blkio_mark_blkg_idling(&blkg->stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);

void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	unsigned long long now;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (blkio_blkg_idling(stats)) {
		now = sched_clock();
		if (time_after64(now, stats->start_idle_time))
			stats->idle_time += now - stats->start_idle_time;
		blkio_clear_blkg_idling(stats);
	}
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);

void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	stats->avg_queue_size_sum +=
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
	stats->avg_queue_size_samples++;
	blkio_update_group_wait_time(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);

void blkiocg_set_start_empty_time(struct blkio_group *blkg)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;

	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	/*
	 * Group is already marked empty. This can happen if the cfqq got a
	 * new request in the parent group and moved to this group while
	 * being added to the service tree. Just ignore the event and move on.
	 */
	if (blkio_blkg_empty(stats)) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	stats->start_empty_time = sched_clock();
	blkio_mark_blkg_empty(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);

void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
				  unsigned long dequeue)
{
	blkg->stats.dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#else
static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					struct blkio_group *curr_blkg) {}
static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
#endif

void blkiocg_update_io_add_stats(struct blkio_group *blkg,
				 struct blkio_group *curr_blkg, bool direction,
				 bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
			sync);
	blkio_end_empty_time(&blkg->stats);
	blkio_set_start_group_wait_time(blkg, curr_blkg);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);

void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
				    bool direction, bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
					direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);

void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkg->stats.time += time;
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);

void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
				   uint64_t bytes, bool direction, bool sync)
{
	struct blkio_group_stats *stats;
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	stats->sectors += bytes >> 9;
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
			sync);
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
			direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);

void blkiocg_update_completion_stats(struct blkio_group *blkg,
		uint64_t start_time, uint64_t io_start_time, bool direction,
		bool sync)
{
	struct blkio_group_stats *stats;
	unsigned long flags;
	unsigned long long now = sched_clock();

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (time_after64(now, io_start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
				now - io_start_time, direction, sync);
	if (time_after64(io_start_time, start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
				io_start_time - start_time, direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);

void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
				    bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
			sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);

void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
			     struct blkio_group *blkg, void *key, dev_t dev)
{
	unsigned long flags;

	spin_lock_irqsave(&blkcg->lock, flags);
	spin_lock_init(&blkg->stats_lock);
	rcu_assign_pointer(blkg->key, key);
	blkg->blkcg_id = css_id(&blkcg->css);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	spin_unlock_irqrestore(&blkcg->lock, flags);
	/* Need to take css reference ? */
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
	blkg->dev = dev;
}
EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);

static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	hlist_del_init_rcu(&blkg->blkcg_node);
	blkg->blkcg_id = 0;
}

/*
 * Returns 0 if the blkio_group was still on the cgroup list. Otherwise
 * returns 1, indicating that the blkio_group was unhashed by the time we
 * got to it.
 */
int blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	struct blkio_cgroup *blkcg;
	unsigned long flags;
	struct cgroup_subsys_state *css;
	int ret = 1;

	rcu_read_lock();
	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
	if (css) {
		blkcg = container_of(css, struct blkio_cgroup, css);
		spin_lock_irqsave(&blkcg->lock, flags);
		if (!hlist_unhashed(&blkg->blkcg_node)) {
			__blkiocg_del_blkio_group(blkg);
			ret = 0;
		}
		spin_unlock_irqrestore(&blkcg->lock, flags);
	}

	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);

/* called under rcu_read_lock(). */
struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	void *__key;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
		__key = blkg->key;
		if (__key == key)
			return blkg;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(blkiocg_lookup_group);

#define SHOW_FUNCTION(__VAR)						\
static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
				  struct cftype *cftype)		\
{									\
	struct blkio_cgroup *blkcg;					\
									\
	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
	return (u64)blkcg->__VAR;					\
}

SHOW_FUNCTION(weight);
#undef SHOW_FUNCTION

static int
blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	struct hlist_node *n;
	struct blkio_policy_type *blkiop;
	struct blkio_policy_node *pn;

	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	blkcg = cgroup_to_blkio_cgroup(cgroup);
	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);
	blkcg->weight = (unsigned int)val;

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		pn = blkio_policy_search_node(blkcg, blkg->dev);

		if (pn)
			continue;

		list_for_each_entry(blkiop, &blkio_list, list)
			blkiop->ops.blkio_update_group_weight_fn(blkg,
					blkcg->weight);
	}
	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	struct blkio_group_stats *stats;
	struct hlist_node *n;
	uint64_t queued[BLKIO_STAT_TOTAL];
	int i;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	bool idling, waiting, empty;
	unsigned long long now = sched_clock();
#endif

	blkcg = cgroup_to_blkio_cgroup(cgroup);
	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		spin_lock(&blkg->stats_lock);
		stats = &blkg->stats;
#ifdef CONFIG_DEBUG_BLK_CGROUP
		idling = blkio_blkg_idling(stats);
		waiting = blkio_blkg_waiting(stats);
		empty = blkio_blkg_empty(stats);
#endif
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
		memset(stats, 0, sizeof(struct blkio_group_stats));
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
#ifdef CONFIG_DEBUG_BLK_CGROUP
		if (idling) {
			blkio_mark_blkg_idling(stats);
			stats->start_idle_time = now;
		}
		if (waiting) {
			blkio_mark_blkg_waiting(stats);
			stats->start_group_wait_time = now;
		}
		if (empty) {
			blkio_mark_blkg_empty(stats);
			stats->start_empty_time = now;
		}
#endif
		spin_unlock(&blkg->stats_lock);
	}
	spin_unlock_irq(&blkcg->lock);
	return 0;
}

static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
			       int chars_left, bool diskname_only)
{
	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
	chars_left -= strlen(str);
	if (chars_left <= 0) {
		printk(KERN_WARNING
			"Possibly incorrect cgroup stat display format");
		return;
	}
	if (diskname_only)
		return;
	switch (type) {
	case BLKIO_STAT_READ:
		strlcat(str, " Read", chars_left);
		break;
	case BLKIO_STAT_WRITE:
		strlcat(str, " Write", chars_left);
" Write", chars_left); 523 break; 524 case BLKIO_STAT_SYNC: 525 strlcat(str, " Sync", chars_left); 526 break; 527 case BLKIO_STAT_ASYNC: 528 strlcat(str, " Async", chars_left); 529 break; 530 case BLKIO_STAT_TOTAL: 531 strlcat(str, " Total", chars_left); 532 break; 533 default: 534 strlcat(str, " Invalid", chars_left); 535 } 536 } 537 538 static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, 539 struct cgroup_map_cb *cb, dev_t dev) 540 { 541 blkio_get_key_name(0, dev, str, chars_left, true); 542 cb->fill(cb, str, val); 543 return val; 544 } 545 546 /* This should be called with blkg->stats_lock held */ 547 static uint64_t blkio_get_stat(struct blkio_group *blkg, 548 struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) 549 { 550 uint64_t disk_total; 551 char key_str[MAX_KEY_LEN]; 552 enum stat_sub_type sub_type; 553 554 if (type == BLKIO_STAT_TIME) 555 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 556 blkg->stats.time, cb, dev); 557 if (type == BLKIO_STAT_SECTORS) 558 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 559 blkg->stats.sectors, cb, dev); 560 #ifdef CONFIG_DEBUG_BLK_CGROUP 561 if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { 562 uint64_t sum = blkg->stats.avg_queue_size_sum; 563 uint64_t samples = blkg->stats.avg_queue_size_samples; 564 if (samples) 565 do_div(sum, samples); 566 else 567 sum = 0; 568 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev); 569 } 570 if (type == BLKIO_STAT_GROUP_WAIT_TIME) 571 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 572 blkg->stats.group_wait_time, cb, dev); 573 if (type == BLKIO_STAT_IDLE_TIME) 574 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 575 blkg->stats.idle_time, cb, dev); 576 if (type == BLKIO_STAT_EMPTY_TIME) 577 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 578 blkg->stats.empty_time, cb, dev); 579 if (type == BLKIO_STAT_DEQUEUE) 580 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 581 blkg->stats.dequeue, cb, dev); 582 #endif 583 584 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; 585 sub_type++) { 586 blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); 587 cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]); 588 } 589 disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] + 590 blkg->stats.stat_arr[type][BLKIO_STAT_WRITE]; 591 blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); 592 cb->fill(cb, key_str, disk_total); 593 return disk_total; 594 } 595 596 #define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \ 597 static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ 598 struct cftype *cftype, struct cgroup_map_cb *cb) \ 599 { \ 600 struct blkio_cgroup *blkcg; \ 601 struct blkio_group *blkg; \ 602 struct hlist_node *n; \ 603 uint64_t cgroup_total = 0; \ 604 \ 605 if (!cgroup_lock_live_group(cgroup)) \ 606 return -ENODEV; \ 607 \ 608 blkcg = cgroup_to_blkio_cgroup(cgroup); \ 609 rcu_read_lock(); \ 610 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ 611 if (blkg->dev) { \ 612 spin_lock_irq(&blkg->stats_lock); \ 613 cgroup_total += blkio_get_stat(blkg, cb, \ 614 blkg->dev, type); \ 615 spin_unlock_irq(&blkg->stats_lock); \ 616 } \ 617 } \ 618 if (show_total) \ 619 cb->fill(cb, "Total", cgroup_total); \ 620 rcu_read_unlock(); \ 621 cgroup_unlock(); \ 622 return 0; \ 623 } 624 625 SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0); 626 SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0); 627 SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1); 628 SHOW_FUNCTION_PER_GROUP(io_serviced, 
SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
#ifdef CONFIG_DEBUG_BLK_CGROUP
SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
#endif
#undef SHOW_FUNCTION_PER_GROUP

static int blkio_check_dev_num(dev_t dev)
{
	int part = 0;
	struct gendisk *disk;

	disk = get_gendisk(dev, &part);
	if (!disk || part)
		return -ENODEV;

	return 0;
}

static int blkio_policy_parse_and_set(char *buf,
				      struct blkio_policy_node *newpn)
{
	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
	int ret;
	unsigned long major, minor, temp;
	int i = 0;
	dev_t dev;

	memset(s, 0, sizeof(s));

	while ((p = strsep(&buf, " ")) != NULL) {
		if (!*p)
			continue;

		s[i++] = p;

		/* Prevent inputting too many things */
		if (i == 3)
			break;
	}

	if (i != 2)
		return -EINVAL;

	p = strsep(&s[0], ":");
	if (p != NULL)
		major_s = p;
	else
		return -EINVAL;

	minor_s = s[0];
	if (!minor_s)
		return -EINVAL;

	ret = strict_strtoul(major_s, 10, &major);
	if (ret)
		return -EINVAL;

	ret = strict_strtoul(minor_s, 10, &minor);
	if (ret)
		return -EINVAL;

	dev = MKDEV(major, minor);

	ret = blkio_check_dev_num(dev);
	if (ret)
		return ret;

	newpn->dev = dev;

	if (s[1] == NULL)
		return -EINVAL;

	ret = strict_strtoul(s[1], 10, &temp);
	if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
	    temp > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	newpn->weight = temp;

	return 0;
}

unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
			      dev_t dev)
{
	struct blkio_policy_node *pn;

	pn = blkio_policy_search_node(blkcg, dev);
	if (pn)
		return pn->weight;
	else
		return blkcg->weight;
}
EXPORT_SYMBOL_GPL(blkcg_get_weight);


static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
				       const char *buffer)
{
	int ret = 0;
	char *buf;
	struct blkio_policy_node *newpn, *pn;
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	int keep_newpn = 0;
	struct hlist_node *n;
	struct blkio_policy_type *blkiop;

	buf = kstrdup(buffer, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
	if (!newpn) {
		ret = -ENOMEM;
		goto free_buf;
	}

	ret = blkio_policy_parse_and_set(buf, newpn);
	if (ret)
		goto free_newpn;

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	spin_lock_irq(&blkcg->lock);

	pn = blkio_policy_search_node(blkcg, newpn->dev);
	if (!pn) {
		if (newpn->weight != 0) {
			blkio_policy_insert_node(blkcg, newpn);
			keep_newpn = 1;
		}
		spin_unlock_irq(&blkcg->lock);
		goto update_io_group;
	}

	if (newpn->weight == 0) {
		/* weight == 0 means deleting a specific weight */
		blkio_policy_delete_node(pn);
		spin_unlock_irq(&blkcg->lock);
		goto update_io_group;
	}
	spin_unlock_irq(&blkcg->lock);

	pn->weight = newpn->weight;

update_io_group:
	/* update weight for each cfqg */
	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		if (newpn->dev == blkg->dev) {
			list_for_each_entry(blkiop, &blkio_list, list)
				blkiop->ops.blkio_update_group_weight_fn(blkg,
							 newpn->weight ?
							 newpn->weight :
							 blkcg->weight);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);

free_newpn:
	if (!keep_newpn)
		kfree(newpn);
free_buf:
	kfree(buf);
	return ret;
}

static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
				      struct seq_file *m)
{
	struct blkio_cgroup *blkcg;
	struct blkio_policy_node *pn;

	seq_printf(m, "dev\tweight\n");

	blkcg = cgroup_to_blkio_cgroup(cgrp);
	if (!list_empty(&blkcg->policy_list)) {
		spin_lock_irq(&blkcg->lock);
		list_for_each_entry(pn, &blkcg->policy_list, node) {
			seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
				   MINOR(pn->dev), pn->weight);
		}
		spin_unlock_irq(&blkcg->lock);
	}

	return 0;
}

struct cftype blkio_files[] = {
	{
		.name = "weight_device",
		.read_seq_string = blkiocg_weight_device_read,
		.write_string = blkiocg_weight_device_write,
		.max_write_len = 256,
	},
	{
		.name = "weight",
		.read_u64 = blkiocg_weight_read,
		.write_u64 = blkiocg_weight_write,
	},
	{
		.name = "time",
		.read_map = blkiocg_time_read,
	},
	{
		.name = "sectors",
		.read_map = blkiocg_sectors_read,
	},
	{
		.name = "io_service_bytes",
		.read_map = blkiocg_io_service_bytes_read,
	},
	{
		.name = "io_serviced",
		.read_map = blkiocg_io_serviced_read,
	},
	{
		.name = "io_service_time",
		.read_map = blkiocg_io_service_time_read,
	},
	{
		.name = "io_wait_time",
		.read_map = blkiocg_io_wait_time_read,
	},
	{
		.name = "io_merged",
		.read_map = blkiocg_io_merged_read,
	},
	{
		.name = "io_queued",
		.read_map = blkiocg_io_queued_read,
	},
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
#ifdef CONFIG_DEBUG_BLK_CGROUP
	{
		.name = "avg_queue_size",
		.read_map = blkiocg_avg_queue_size_read,
	},
	{
		.name = "group_wait_time",
		.read_map = blkiocg_group_wait_time_read,
	},
	{
		.name = "idle_time",
		.read_map = blkiocg_idle_time_read,
	},
	{
		.name = "empty_time",
		.read_map = blkiocg_empty_time_read,
	},
	{
		.name = "dequeue",
		.read_map = blkiocg_dequeue_read,
	},
#endif
};

static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	return cgroup_add_files(cgroup, subsys, blkio_files,
				ARRAY_SIZE(blkio_files));
}

static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	unsigned long flags;
	struct blkio_group *blkg;
	void *key;
	struct blkio_policy_type *blkiop;
	struct blkio_policy_node *pn, *pntmp;

	rcu_read_lock();
	do {
		spin_lock_irqsave(&blkcg->lock, flags);

		if (hlist_empty(&blkcg->blkg_list)) {
			spin_unlock_irqrestore(&blkcg->lock, flags);
			break;
		}

		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
				   blkcg_node);
		key = rcu_dereference(blkg->key);
		__blkiocg_del_blkio_group(blkg);

		spin_unlock_irqrestore(&blkcg->lock, flags);

		/*
		 * This blkio_group is being unlinked as the associated cgroup
		 * is going away. Let all the IO controlling policies know
		 * about this event. Currently this is a static call to one IO
		 * controlling policy. Once we have more policies in place, we
		 * need some dynamic registration of callback function.
		 */
		spin_lock(&blkio_list_lock);
		list_for_each_entry(blkiop, &blkio_list, list)
			blkiop->ops.blkio_unlink_group_fn(key, blkg);
		spin_unlock(&blkio_list_lock);
	} while (1);

	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
		blkio_policy_delete_node(pn);
		kfree(pn);
	}

	free_css_id(&blkio_subsys, &blkcg->css);
	rcu_read_unlock();
	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}

static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	/* Currently we do not support a hierarchy deeper than two levels (0,1) */
	if (parent != cgroup->top_cgroup)
		return ERR_PTR(-EINVAL);

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	INIT_LIST_HEAD(&blkcg->policy_list);
	return &blkcg->css;
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures. For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup_subsys *subsys,
			      struct cgroup *cgroup, struct task_struct *tsk,
			      bool threadgroup)
{
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	task_lock(tsk);
	ioc = tsk->io_context;
	if (ioc && atomic_read(&ioc->nr_tasks) > 1)
		ret = -EINVAL;
	task_unlock(tsk);

	return ret;
}

static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
			   struct cgroup *prev, struct task_struct *tsk,
			   bool threadgroup)
{
	struct io_context *ioc;

	task_lock(tsk);
	ioc = tsk->io_context;
	if (ioc)
		ioc->cgroup_changed = 1;
	task_unlock(tsk);
}

void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);
	list_add_tail(&blkiop->list, &blkio_list);
	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);
	list_del_init(&blkiop->list);
	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);

static int __init init_cgroup_blkio(void)
{
	return cgroup_load_subsys(&blkio_subsys);
}

static void __exit exit_cgroup_blkio(void)
{
	cgroup_unload_subsys(&blkio_subsys);
}

module_init(init_cgroup_blkio);
module_exit(exit_cgroup_blkio);
MODULE_LICENSE("GPL");