1 #include <linux/errno.h> 2 #include <linux/numa.h> 3 #include <linux/slab.h> 4 #include <linux/rculist.h> 5 #include <linux/threads.h> 6 #include <linux/preempt.h> 7 #include <linux/irqflags.h> 8 #include <linux/vmalloc.h> 9 #include <linux/mm.h> 10 #include <linux/module.h> 11 #include <linux/device-mapper.h> 12 13 #include "dm.h" 14 #include "dm-stats.h" 15 16 #define DM_MSG_PREFIX "stats" 17 18 static int dm_stat_need_rcu_barrier; 19 20 /* 21 * Using 64-bit values to avoid overflow (which is a 22 * problem that block/genhd.c's IO accounting has). 23 */ 24 struct dm_stat_percpu { 25 unsigned long long sectors[2]; 26 unsigned long long ios[2]; 27 unsigned long long merges[2]; 28 unsigned long long ticks[2]; 29 unsigned long long io_ticks[2]; 30 unsigned long long io_ticks_total; 31 unsigned long long time_in_queue; 32 unsigned long long *histogram; 33 }; 34 35 struct dm_stat_shared { 36 atomic_t in_flight[2]; 37 unsigned long long stamp; 38 struct dm_stat_percpu tmp; 39 }; 40 41 struct dm_stat { 42 struct list_head list_entry; 43 int id; 44 unsigned stat_flags; 45 size_t n_entries; 46 sector_t start; 47 sector_t end; 48 sector_t step; 49 unsigned n_histogram_entries; 50 unsigned long long *histogram_boundaries; 51 const char *program_id; 52 const char *aux_data; 53 struct rcu_head rcu_head; 54 size_t shared_alloc_size; 55 size_t percpu_alloc_size; 56 size_t histogram_alloc_size; 57 struct dm_stat_percpu *stat_percpu[NR_CPUS]; 58 struct dm_stat_shared stat_shared[0]; 59 }; 60 61 #define STAT_PRECISE_TIMESTAMPS 1 62 63 struct dm_stats_last_position { 64 sector_t last_sector; 65 unsigned last_rw; 66 }; 67 68 /* 69 * A typo on the command line could possibly make the kernel run out of memory 70 * and crash. To prevent the crash we account all used memory. We fail if we 71 * exhaust 1/4 of all memory or 1/2 of vmalloc space. 72 */ 73 #define DM_STATS_MEMORY_FACTOR 4 74 #define DM_STATS_VMALLOC_FACTOR 2 75 76 static DEFINE_SPINLOCK(shared_memory_lock); 77 78 static unsigned long shared_memory_amount; 79 80 static bool __check_shared_memory(size_t alloc_size) 81 { 82 size_t a; 83 84 a = shared_memory_amount + alloc_size; 85 if (a < shared_memory_amount) 86 return false; 87 if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR) 88 return false; 89 #ifdef CONFIG_MMU 90 if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR) 91 return false; 92 #endif 93 return true; 94 } 95 96 static bool check_shared_memory(size_t alloc_size) 97 { 98 bool ret; 99 100 spin_lock_irq(&shared_memory_lock); 101 102 ret = __check_shared_memory(alloc_size); 103 104 spin_unlock_irq(&shared_memory_lock); 105 106 return ret; 107 } 108 109 static bool claim_shared_memory(size_t alloc_size) 110 { 111 spin_lock_irq(&shared_memory_lock); 112 113 if (!__check_shared_memory(alloc_size)) { 114 spin_unlock_irq(&shared_memory_lock); 115 return false; 116 } 117 118 shared_memory_amount += alloc_size; 119 120 spin_unlock_irq(&shared_memory_lock); 121 122 return true; 123 } 124 125 static void free_shared_memory(size_t alloc_size) 126 { 127 unsigned long flags; 128 129 spin_lock_irqsave(&shared_memory_lock, flags); 130 131 if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) { 132 spin_unlock_irqrestore(&shared_memory_lock, flags); 133 DMCRIT("Memory usage accounting bug."); 134 return; 135 } 136 137 shared_memory_amount -= alloc_size; 138 139 spin_unlock_irqrestore(&shared_memory_lock, flags); 140 } 141 142 static void *dm_kvzalloc(size_t alloc_size, int node) 143 { 144 void *p; 145 146 if (!claim_shared_memory(alloc_size)) 147 return NULL; 148 149 if (alloc_size <= KMALLOC_MAX_SIZE) { 150 p = kzalloc_node(alloc_size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node); 151 if (p) 152 return p; 153 } 154 p = vzalloc_node(alloc_size, node); 155 if (p) 156 return p; 157 158 free_shared_memory(alloc_size); 159 160 return NULL; 161 } 162 163 static void dm_kvfree(void *ptr, size_t alloc_size) 164 { 165 if (!ptr) 166 return; 167 168 free_shared_memory(alloc_size); 169 170 kvfree(ptr); 171 } 172 173 static void dm_stat_free(struct rcu_head *head) 174 { 175 int cpu; 176 struct dm_stat *s = container_of(head, struct dm_stat, rcu_head); 177 178 kfree(s->program_id); 179 kfree(s->aux_data); 180 for_each_possible_cpu(cpu) { 181 dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size); 182 dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size); 183 } 184 dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size); 185 dm_kvfree(s, s->shared_alloc_size); 186 } 187 188 static int dm_stat_in_flight(struct dm_stat_shared *shared) 189 { 190 return atomic_read(&shared->in_flight[READ]) + 191 atomic_read(&shared->in_flight[WRITE]); 192 } 193 194 void dm_stats_init(struct dm_stats *stats) 195 { 196 int cpu; 197 struct dm_stats_last_position *last; 198 199 mutex_init(&stats->mutex); 200 INIT_LIST_HEAD(&stats->list); 201 stats->last = alloc_percpu(struct dm_stats_last_position); 202 for_each_possible_cpu(cpu) { 203 last = per_cpu_ptr(stats->last, cpu); 204 last->last_sector = (sector_t)ULLONG_MAX; 205 last->last_rw = UINT_MAX; 206 } 207 } 208 209 void dm_stats_cleanup(struct dm_stats *stats) 210 { 211 size_t ni; 212 struct dm_stat *s; 213 struct dm_stat_shared *shared; 214 215 while (!list_empty(&stats->list)) { 216 s = container_of(stats->list.next, struct dm_stat, list_entry); 217 list_del(&s->list_entry); 218 for (ni = 0; ni < s->n_entries; ni++) { 219 shared = &s->stat_shared[ni]; 220 if (WARN_ON(dm_stat_in_flight(shared))) { 221 DMCRIT("leaked in-flight counter at index %lu " 222 "(start %llu, end %llu, step %llu): reads %d, writes %d", 223 (unsigned long)ni, 224 (unsigned long long)s->start, 225 (unsigned long long)s->end, 226 (unsigned long long)s->step, 227 atomic_read(&shared->in_flight[READ]), 228 atomic_read(&shared->in_flight[WRITE])); 229 } 230 } 231 dm_stat_free(&s->rcu_head); 232 } 233 free_percpu(stats->last); 234 } 235 236 static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, 237 sector_t step, unsigned stat_flags, 238 unsigned n_histogram_entries, 239 unsigned long long *histogram_boundaries, 240 const char *program_id, const char *aux_data, 241 void (*suspend_callback)(struct mapped_device *), 242 void (*resume_callback)(struct mapped_device *), 243 struct mapped_device *md) 244 { 245 struct list_head *l; 246 struct dm_stat *s, *tmp_s; 247 sector_t n_entries; 248 size_t ni; 249 size_t shared_alloc_size; 250 size_t percpu_alloc_size; 251 size_t histogram_alloc_size; 252 struct dm_stat_percpu *p; 253 int cpu; 254 int ret_id; 255 int r; 256 257 if (end < start || !step) 258 return -EINVAL; 259 260 n_entries = end - start; 261 if (dm_sector_div64(n_entries, step)) 262 n_entries++; 263 264 if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1)) 265 return -EOVERFLOW; 266 267 shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared); 268 if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries) 269 return -EOVERFLOW; 270 271 percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu); 272 if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries) 273 return -EOVERFLOW; 274 275 histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long); 276 if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long)) 277 return -EOVERFLOW; 278 279 if (!check_shared_memory(shared_alloc_size + histogram_alloc_size + 280 num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size))) 281 return -ENOMEM; 282 283 s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE); 284 if (!s) 285 return -ENOMEM; 286 287 s->stat_flags = stat_flags; 288 s->n_entries = n_entries; 289 s->start = start; 290 s->end = end; 291 s->step = step; 292 s->shared_alloc_size = shared_alloc_size; 293 s->percpu_alloc_size = percpu_alloc_size; 294 s->histogram_alloc_size = histogram_alloc_size; 295 296 s->n_histogram_entries = n_histogram_entries; 297 s->histogram_boundaries = kmemdup(histogram_boundaries, 298 s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL); 299 if (!s->histogram_boundaries) { 300 r = -ENOMEM; 301 goto out; 302 } 303 304 s->program_id = kstrdup(program_id, GFP_KERNEL); 305 if (!s->program_id) { 306 r = -ENOMEM; 307 goto out; 308 } 309 s->aux_data = kstrdup(aux_data, GFP_KERNEL); 310 if (!s->aux_data) { 311 r = -ENOMEM; 312 goto out; 313 } 314 315 for (ni = 0; ni < n_entries; ni++) { 316 atomic_set(&s->stat_shared[ni].in_flight[READ], 0); 317 atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0); 318 } 319 320 if (s->n_histogram_entries) { 321 unsigned long long *hi; 322 hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE); 323 if (!hi) { 324 r = -ENOMEM; 325 goto out; 326 } 327 for (ni = 0; ni < n_entries; ni++) { 328 s->stat_shared[ni].tmp.histogram = hi; 329 hi += s->n_histogram_entries + 1; 330 } 331 } 332 333 for_each_possible_cpu(cpu) { 334 p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu)); 335 if (!p) { 336 r = -ENOMEM; 337 goto out; 338 } 339 s->stat_percpu[cpu] = p; 340 if (s->n_histogram_entries) { 341 unsigned long long *hi; 342 hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu)); 343 if (!hi) { 344 r = -ENOMEM; 345 goto out; 346 } 347 for (ni = 0; ni < n_entries; ni++) { 348 p[ni].histogram = hi; 349 hi += s->n_histogram_entries + 1; 350 } 351 } 352 } 353 354 /* 355 * Suspend/resume to make sure there is no i/o in flight, 356 * so that newly created statistics will be exact. 357 * 358 * (note: we couldn't suspend earlier because we must not 359 * allocate memory while suspended) 360 */ 361 suspend_callback(md); 362 363 mutex_lock(&stats->mutex); 364 s->id = 0; 365 list_for_each(l, &stats->list) { 366 tmp_s = container_of(l, struct dm_stat, list_entry); 367 if (WARN_ON(tmp_s->id < s->id)) { 368 r = -EINVAL; 369 goto out_unlock_resume; 370 } 371 if (tmp_s->id > s->id) 372 break; 373 if (unlikely(s->id == INT_MAX)) { 374 r = -ENFILE; 375 goto out_unlock_resume; 376 } 377 s->id++; 378 } 379 ret_id = s->id; 380 list_add_tail_rcu(&s->list_entry, l); 381 mutex_unlock(&stats->mutex); 382 383 resume_callback(md); 384 385 return ret_id; 386 387 out_unlock_resume: 388 mutex_unlock(&stats->mutex); 389 resume_callback(md); 390 out: 391 dm_stat_free(&s->rcu_head); 392 return r; 393 } 394 395 static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id) 396 { 397 struct dm_stat *s; 398 399 list_for_each_entry(s, &stats->list, list_entry) { 400 if (s->id > id) 401 break; 402 if (s->id == id) 403 return s; 404 } 405 406 return NULL; 407 } 408 409 static int dm_stats_delete(struct dm_stats *stats, int id) 410 { 411 struct dm_stat *s; 412 int cpu; 413 414 mutex_lock(&stats->mutex); 415 416 s = __dm_stats_find(stats, id); 417 if (!s) { 418 mutex_unlock(&stats->mutex); 419 return -ENOENT; 420 } 421 422 list_del_rcu(&s->list_entry); 423 mutex_unlock(&stats->mutex); 424 425 /* 426 * vfree can't be called from RCU callback 427 */ 428 for_each_possible_cpu(cpu) 429 if (is_vmalloc_addr(s->stat_percpu) || 430 is_vmalloc_addr(s->stat_percpu[cpu][0].histogram)) 431 goto do_sync_free; 432 if (is_vmalloc_addr(s) || 433 is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) { 434 do_sync_free: 435 synchronize_rcu_expedited(); 436 dm_stat_free(&s->rcu_head); 437 } else { 438 ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1; 439 call_rcu(&s->rcu_head, dm_stat_free); 440 } 441 return 0; 442 } 443 444 static int dm_stats_list(struct dm_stats *stats, const char *program, 445 char *result, unsigned maxlen) 446 { 447 struct dm_stat *s; 448 sector_t len; 449 unsigned sz = 0; 450 451 /* 452 * Output format: 453 * <region_id>: <start_sector>+<length> <step> <program_id> <aux_data> 454 */ 455 456 mutex_lock(&stats->mutex); 457 list_for_each_entry(s, &stats->list, list_entry) { 458 if (!program || !strcmp(program, s->program_id)) { 459 len = s->end - s->start; 460 DMEMIT("%d: %llu+%llu %llu %s %s", s->id, 461 (unsigned long long)s->start, 462 (unsigned long long)len, 463 (unsigned long long)s->step, 464 s->program_id, 465 s->aux_data); 466 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS) 467 DMEMIT(" precise_timestamps"); 468 if (s->n_histogram_entries) { 469 unsigned i; 470 DMEMIT(" histogram:"); 471 for (i = 0; i < s->n_histogram_entries; i++) { 472 if (i) 473 DMEMIT(","); 474 DMEMIT("%llu", s->histogram_boundaries[i]); 475 } 476 } 477 DMEMIT("\n"); 478 } 479 } 480 mutex_unlock(&stats->mutex); 481 482 return 1; 483 } 484 485 static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared, 486 struct dm_stat_percpu *p) 487 { 488 /* 489 * This is racy, but so is part_round_stats_single. 490 */ 491 unsigned long long now, difference; 492 unsigned in_flight_read, in_flight_write; 493 494 if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS))) 495 now = jiffies; 496 else 497 now = ktime_to_ns(ktime_get()); 498 499 difference = now - shared->stamp; 500 if (!difference) 501 return; 502 503 in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]); 504 in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]); 505 if (in_flight_read) 506 p->io_ticks[READ] += difference; 507 if (in_flight_write) 508 p->io_ticks[WRITE] += difference; 509 if (in_flight_read + in_flight_write) { 510 p->io_ticks_total += difference; 511 p->time_in_queue += (in_flight_read + in_flight_write) * difference; 512 } 513 shared->stamp = now; 514 } 515 516 static void dm_stat_for_entry(struct dm_stat *s, size_t entry, 517 unsigned long bi_rw, sector_t len, 518 struct dm_stats_aux *stats_aux, bool end, 519 unsigned long duration_jiffies) 520 { 521 unsigned long idx = bi_rw & REQ_WRITE; 522 struct dm_stat_shared *shared = &s->stat_shared[entry]; 523 struct dm_stat_percpu *p; 524 525 /* 526 * For strict correctness we should use local_irq_save/restore 527 * instead of preempt_disable/enable. 528 * 529 * preempt_disable/enable is racy if the driver finishes bios 530 * from non-interrupt context as well as from interrupt context 531 * or from more different interrupts. 532 * 533 * On 64-bit architectures the race only results in not counting some 534 * events, so it is acceptable. On 32-bit architectures the race could 535 * cause the counter going off by 2^32, so we need to do proper locking 536 * there. 537 * 538 * part_stat_lock()/part_stat_unlock() have this race too. 539 */ 540 #if BITS_PER_LONG == 32 541 unsigned long flags; 542 local_irq_save(flags); 543 #else 544 preempt_disable(); 545 #endif 546 p = &s->stat_percpu[smp_processor_id()][entry]; 547 548 if (!end) { 549 dm_stat_round(s, shared, p); 550 atomic_inc(&shared->in_flight[idx]); 551 } else { 552 unsigned long long duration; 553 dm_stat_round(s, shared, p); 554 atomic_dec(&shared->in_flight[idx]); 555 p->sectors[idx] += len; 556 p->ios[idx] += 1; 557 p->merges[idx] += stats_aux->merged; 558 if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) { 559 p->ticks[idx] += duration_jiffies; 560 duration = jiffies_to_msecs(duration_jiffies); 561 } else { 562 p->ticks[idx] += stats_aux->duration_ns; 563 duration = stats_aux->duration_ns; 564 } 565 if (s->n_histogram_entries) { 566 unsigned lo = 0, hi = s->n_histogram_entries + 1; 567 while (lo + 1 < hi) { 568 unsigned mid = (lo + hi) / 2; 569 if (s->histogram_boundaries[mid - 1] > duration) { 570 hi = mid; 571 } else { 572 lo = mid; 573 } 574 575 } 576 p->histogram[lo]++; 577 } 578 } 579 580 #if BITS_PER_LONG == 32 581 local_irq_restore(flags); 582 #else 583 preempt_enable(); 584 #endif 585 } 586 587 static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw, 588 sector_t bi_sector, sector_t end_sector, 589 bool end, unsigned long duration_jiffies, 590 struct dm_stats_aux *stats_aux) 591 { 592 sector_t rel_sector, offset, todo, fragment_len; 593 size_t entry; 594 595 if (end_sector <= s->start || bi_sector >= s->end) 596 return; 597 if (unlikely(bi_sector < s->start)) { 598 rel_sector = 0; 599 todo = end_sector - s->start; 600 } else { 601 rel_sector = bi_sector - s->start; 602 todo = end_sector - bi_sector; 603 } 604 if (unlikely(end_sector > s->end)) 605 todo -= (end_sector - s->end); 606 607 offset = dm_sector_div64(rel_sector, s->step); 608 entry = rel_sector; 609 do { 610 if (WARN_ON_ONCE(entry >= s->n_entries)) { 611 DMCRIT("Invalid area access in region id %d", s->id); 612 return; 613 } 614 fragment_len = todo; 615 if (fragment_len > s->step - offset) 616 fragment_len = s->step - offset; 617 dm_stat_for_entry(s, entry, bi_rw, fragment_len, 618 stats_aux, end, duration_jiffies); 619 todo -= fragment_len; 620 entry++; 621 offset = 0; 622 } while (unlikely(todo != 0)); 623 } 624 625 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, 626 sector_t bi_sector, unsigned bi_sectors, bool end, 627 unsigned long duration_jiffies, 628 struct dm_stats_aux *stats_aux) 629 { 630 struct dm_stat *s; 631 sector_t end_sector; 632 struct dm_stats_last_position *last; 633 bool got_precise_time; 634 635 if (unlikely(!bi_sectors)) 636 return; 637 638 end_sector = bi_sector + bi_sectors; 639 640 if (!end) { 641 /* 642 * A race condition can at worst result in the merged flag being 643 * misrepresented, so we don't have to disable preemption here. 644 */ 645 last = raw_cpu_ptr(stats->last); 646 stats_aux->merged = 647 (bi_sector == (ACCESS_ONCE(last->last_sector) && 648 ((bi_rw & (REQ_WRITE | REQ_DISCARD)) == 649 (ACCESS_ONCE(last->last_rw) & (REQ_WRITE | REQ_DISCARD))) 650 )); 651 ACCESS_ONCE(last->last_sector) = end_sector; 652 ACCESS_ONCE(last->last_rw) = bi_rw; 653 } 654 655 rcu_read_lock(); 656 657 got_precise_time = false; 658 list_for_each_entry_rcu(s, &stats->list, list_entry) { 659 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) { 660 if (!end) 661 stats_aux->duration_ns = ktime_to_ns(ktime_get()); 662 else 663 stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns; 664 got_precise_time = true; 665 } 666 __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux); 667 } 668 669 rcu_read_unlock(); 670 } 671 672 static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared, 673 struct dm_stat *s, size_t x) 674 { 675 int cpu; 676 struct dm_stat_percpu *p; 677 678 local_irq_disable(); 679 p = &s->stat_percpu[smp_processor_id()][x]; 680 dm_stat_round(s, shared, p); 681 local_irq_enable(); 682 683 shared->tmp.sectors[READ] = 0; 684 shared->tmp.sectors[WRITE] = 0; 685 shared->tmp.ios[READ] = 0; 686 shared->tmp.ios[WRITE] = 0; 687 shared->tmp.merges[READ] = 0; 688 shared->tmp.merges[WRITE] = 0; 689 shared->tmp.ticks[READ] = 0; 690 shared->tmp.ticks[WRITE] = 0; 691 shared->tmp.io_ticks[READ] = 0; 692 shared->tmp.io_ticks[WRITE] = 0; 693 shared->tmp.io_ticks_total = 0; 694 shared->tmp.time_in_queue = 0; 695 696 if (s->n_histogram_entries) 697 memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long)); 698 699 for_each_possible_cpu(cpu) { 700 p = &s->stat_percpu[cpu][x]; 701 shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]); 702 shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]); 703 shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]); 704 shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]); 705 shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]); 706 shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]); 707 shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]); 708 shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]); 709 shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]); 710 shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]); 711 shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total); 712 shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue); 713 if (s->n_histogram_entries) { 714 unsigned i; 715 for (i = 0; i < s->n_histogram_entries + 1; i++) 716 shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]); 717 } 718 } 719 } 720 721 static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end, 722 bool init_tmp_percpu_totals) 723 { 724 size_t x; 725 struct dm_stat_shared *shared; 726 struct dm_stat_percpu *p; 727 728 for (x = idx_start; x < idx_end; x++) { 729 shared = &s->stat_shared[x]; 730 if (init_tmp_percpu_totals) 731 __dm_stat_init_temporary_percpu_totals(shared, s, x); 732 local_irq_disable(); 733 p = &s->stat_percpu[smp_processor_id()][x]; 734 p->sectors[READ] -= shared->tmp.sectors[READ]; 735 p->sectors[WRITE] -= shared->tmp.sectors[WRITE]; 736 p->ios[READ] -= shared->tmp.ios[READ]; 737 p->ios[WRITE] -= shared->tmp.ios[WRITE]; 738 p->merges[READ] -= shared->tmp.merges[READ]; 739 p->merges[WRITE] -= shared->tmp.merges[WRITE]; 740 p->ticks[READ] -= shared->tmp.ticks[READ]; 741 p->ticks[WRITE] -= shared->tmp.ticks[WRITE]; 742 p->io_ticks[READ] -= shared->tmp.io_ticks[READ]; 743 p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE]; 744 p->io_ticks_total -= shared->tmp.io_ticks_total; 745 p->time_in_queue -= shared->tmp.time_in_queue; 746 local_irq_enable(); 747 if (s->n_histogram_entries) { 748 unsigned i; 749 for (i = 0; i < s->n_histogram_entries + 1; i++) { 750 local_irq_disable(); 751 p = &s->stat_percpu[smp_processor_id()][x]; 752 p->histogram[i] -= shared->tmp.histogram[i]; 753 local_irq_enable(); 754 } 755 } 756 } 757 } 758 759 static int dm_stats_clear(struct dm_stats *stats, int id) 760 { 761 struct dm_stat *s; 762 763 mutex_lock(&stats->mutex); 764 765 s = __dm_stats_find(stats, id); 766 if (!s) { 767 mutex_unlock(&stats->mutex); 768 return -ENOENT; 769 } 770 771 __dm_stat_clear(s, 0, s->n_entries, true); 772 773 mutex_unlock(&stats->mutex); 774 775 return 1; 776 } 777 778 /* 779 * This is like jiffies_to_msec, but works for 64-bit values. 780 */ 781 static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j) 782 { 783 unsigned long long result; 784 unsigned mult; 785 786 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS) 787 return j; 788 789 result = 0; 790 if (j) 791 result = jiffies_to_msecs(j & 0x3fffff); 792 if (j >= 1 << 22) { 793 mult = jiffies_to_msecs(1 << 22); 794 result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff); 795 } 796 if (j >= 1ULL << 44) 797 result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44); 798 799 return result; 800 } 801 802 static int dm_stats_print(struct dm_stats *stats, int id, 803 size_t idx_start, size_t idx_len, 804 bool clear, char *result, unsigned maxlen) 805 { 806 unsigned sz = 0; 807 struct dm_stat *s; 808 size_t x; 809 sector_t start, end, step; 810 size_t idx_end; 811 struct dm_stat_shared *shared; 812 813 /* 814 * Output format: 815 * <start_sector>+<length> counters 816 */ 817 818 mutex_lock(&stats->mutex); 819 820 s = __dm_stats_find(stats, id); 821 if (!s) { 822 mutex_unlock(&stats->mutex); 823 return -ENOENT; 824 } 825 826 idx_end = idx_start + idx_len; 827 if (idx_end < idx_start || 828 idx_end > s->n_entries) 829 idx_end = s->n_entries; 830 831 if (idx_start > idx_end) 832 idx_start = idx_end; 833 834 step = s->step; 835 start = s->start + (step * idx_start); 836 837 for (x = idx_start; x < idx_end; x++, start = end) { 838 shared = &s->stat_shared[x]; 839 end = start + step; 840 if (unlikely(end > s->end)) 841 end = s->end; 842 843 __dm_stat_init_temporary_percpu_totals(shared, s, x); 844 845 DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu", 846 (unsigned long long)start, 847 (unsigned long long)step, 848 shared->tmp.ios[READ], 849 shared->tmp.merges[READ], 850 shared->tmp.sectors[READ], 851 dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]), 852 shared->tmp.ios[WRITE], 853 shared->tmp.merges[WRITE], 854 shared->tmp.sectors[WRITE], 855 dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]), 856 dm_stat_in_flight(shared), 857 dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total), 858 dm_jiffies_to_msec64(s, shared->tmp.time_in_queue), 859 dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]), 860 dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE])); 861 if (s->n_histogram_entries) { 862 unsigned i; 863 for (i = 0; i < s->n_histogram_entries + 1; i++) { 864 DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]); 865 } 866 } 867 DMEMIT("\n"); 868 869 if (unlikely(sz + 1 >= maxlen)) 870 goto buffer_overflow; 871 } 872 873 if (clear) 874 __dm_stat_clear(s, idx_start, idx_end, false); 875 876 buffer_overflow: 877 mutex_unlock(&stats->mutex); 878 879 return 1; 880 } 881 882 static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data) 883 { 884 struct dm_stat *s; 885 const char *new_aux_data; 886 887 mutex_lock(&stats->mutex); 888 889 s = __dm_stats_find(stats, id); 890 if (!s) { 891 mutex_unlock(&stats->mutex); 892 return -ENOENT; 893 } 894 895 new_aux_data = kstrdup(aux_data, GFP_KERNEL); 896 if (!new_aux_data) { 897 mutex_unlock(&stats->mutex); 898 return -ENOMEM; 899 } 900 901 kfree(s->aux_data); 902 s->aux_data = new_aux_data; 903 904 mutex_unlock(&stats->mutex); 905 906 return 0; 907 } 908 909 static int parse_histogram(const char *h, unsigned *n_histogram_entries, 910 unsigned long long **histogram_boundaries) 911 { 912 const char *q; 913 unsigned n; 914 unsigned long long last; 915 916 *n_histogram_entries = 1; 917 for (q = h; *q; q++) 918 if (*q == ',') 919 (*n_histogram_entries)++; 920 921 *histogram_boundaries = kmalloc(*n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL); 922 if (!*histogram_boundaries) 923 return -ENOMEM; 924 925 n = 0; 926 last = 0; 927 while (1) { 928 unsigned long long hi; 929 int s; 930 char ch; 931 s = sscanf(h, "%llu%c", &hi, &ch); 932 if (!s || (s == 2 && ch != ',')) 933 return -EINVAL; 934 if (hi <= last) 935 return -EINVAL; 936 last = hi; 937 (*histogram_boundaries)[n] = hi; 938 if (s == 1) 939 return 0; 940 h = strchr(h, ',') + 1; 941 n++; 942 } 943 } 944 945 static int message_stats_create(struct mapped_device *md, 946 unsigned argc, char **argv, 947 char *result, unsigned maxlen) 948 { 949 int r; 950 int id; 951 char dummy; 952 unsigned long long start, end, len, step; 953 unsigned divisor; 954 const char *program_id, *aux_data; 955 unsigned stat_flags = 0; 956 957 unsigned n_histogram_entries = 0; 958 unsigned long long *histogram_boundaries = NULL; 959 960 struct dm_arg_set as, as_backup; 961 const char *a; 962 unsigned feature_args; 963 964 /* 965 * Input format: 966 * <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]] 967 */ 968 969 if (argc < 3) 970 goto ret_einval; 971 972 as.argc = argc; 973 as.argv = argv; 974 dm_consume_args(&as, 1); 975 976 a = dm_shift_arg(&as); 977 if (!strcmp(a, "-")) { 978 start = 0; 979 len = dm_get_size(md); 980 if (!len) 981 len = 1; 982 } else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 || 983 start != (sector_t)start || len != (sector_t)len) 984 goto ret_einval; 985 986 end = start + len; 987 if (start >= end) 988 goto ret_einval; 989 990 a = dm_shift_arg(&as); 991 if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) { 992 if (!divisor) 993 return -EINVAL; 994 step = end - start; 995 if (do_div(step, divisor)) 996 step++; 997 if (!step) 998 step = 1; 999 } else if (sscanf(a, "%llu%c", &step, &dummy) != 1 || 1000 step != (sector_t)step || !step) 1001 goto ret_einval; 1002 1003 as_backup = as; 1004 a = dm_shift_arg(&as); 1005 if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) { 1006 while (feature_args--) { 1007 a = dm_shift_arg(&as); 1008 if (!a) 1009 goto ret_einval; 1010 if (!strcasecmp(a, "precise_timestamps")) 1011 stat_flags |= STAT_PRECISE_TIMESTAMPS; 1012 else if (!strncasecmp(a, "histogram:", 10)) { 1013 if (n_histogram_entries) 1014 goto ret_einval; 1015 if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries))) 1016 goto ret; 1017 } else 1018 goto ret_einval; 1019 } 1020 } else { 1021 as = as_backup; 1022 } 1023 1024 program_id = "-"; 1025 aux_data = "-"; 1026 1027 a = dm_shift_arg(&as); 1028 if (a) 1029 program_id = a; 1030 1031 a = dm_shift_arg(&as); 1032 if (a) 1033 aux_data = a; 1034 1035 if (as.argc) 1036 goto ret_einval; 1037 1038 /* 1039 * If a buffer overflow happens after we created the region, 1040 * it's too late (the userspace would retry with a larger 1041 * buffer, but the region id that caused the overflow is already 1042 * leaked). So we must detect buffer overflow in advance. 1043 */ 1044 snprintf(result, maxlen, "%d", INT_MAX); 1045 if (dm_message_test_buffer_overflow(result, maxlen)) { 1046 r = 1; 1047 goto ret; 1048 } 1049 1050 id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags, 1051 n_histogram_entries, histogram_boundaries, program_id, aux_data, 1052 dm_internal_suspend_fast, dm_internal_resume_fast, md); 1053 if (id < 0) { 1054 r = id; 1055 goto ret; 1056 } 1057 1058 snprintf(result, maxlen, "%d", id); 1059 1060 r = 1; 1061 goto ret; 1062 1063 ret_einval: 1064 r = -EINVAL; 1065 ret: 1066 kfree(histogram_boundaries); 1067 return r; 1068 } 1069 1070 static int message_stats_delete(struct mapped_device *md, 1071 unsigned argc, char **argv) 1072 { 1073 int id; 1074 char dummy; 1075 1076 if (argc != 2) 1077 return -EINVAL; 1078 1079 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1080 return -EINVAL; 1081 1082 return dm_stats_delete(dm_get_stats(md), id); 1083 } 1084 1085 static int message_stats_clear(struct mapped_device *md, 1086 unsigned argc, char **argv) 1087 { 1088 int id; 1089 char dummy; 1090 1091 if (argc != 2) 1092 return -EINVAL; 1093 1094 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1095 return -EINVAL; 1096 1097 return dm_stats_clear(dm_get_stats(md), id); 1098 } 1099 1100 static int message_stats_list(struct mapped_device *md, 1101 unsigned argc, char **argv, 1102 char *result, unsigned maxlen) 1103 { 1104 int r; 1105 const char *program = NULL; 1106 1107 if (argc < 1 || argc > 2) 1108 return -EINVAL; 1109 1110 if (argc > 1) { 1111 program = kstrdup(argv[1], GFP_KERNEL); 1112 if (!program) 1113 return -ENOMEM; 1114 } 1115 1116 r = dm_stats_list(dm_get_stats(md), program, result, maxlen); 1117 1118 kfree(program); 1119 1120 return r; 1121 } 1122 1123 static int message_stats_print(struct mapped_device *md, 1124 unsigned argc, char **argv, bool clear, 1125 char *result, unsigned maxlen) 1126 { 1127 int id; 1128 char dummy; 1129 unsigned long idx_start = 0, idx_len = ULONG_MAX; 1130 1131 if (argc != 2 && argc != 4) 1132 return -EINVAL; 1133 1134 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1135 return -EINVAL; 1136 1137 if (argc > 3) { 1138 if (strcmp(argv[2], "-") && 1139 sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1) 1140 return -EINVAL; 1141 if (strcmp(argv[3], "-") && 1142 sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1) 1143 return -EINVAL; 1144 } 1145 1146 return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear, 1147 result, maxlen); 1148 } 1149 1150 static int message_stats_set_aux(struct mapped_device *md, 1151 unsigned argc, char **argv) 1152 { 1153 int id; 1154 char dummy; 1155 1156 if (argc != 3) 1157 return -EINVAL; 1158 1159 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1160 return -EINVAL; 1161 1162 return dm_stats_set_aux(dm_get_stats(md), id, argv[2]); 1163 } 1164 1165 int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv, 1166 char *result, unsigned maxlen) 1167 { 1168 int r; 1169 1170 /* All messages here must start with '@' */ 1171 if (!strcasecmp(argv[0], "@stats_create")) 1172 r = message_stats_create(md, argc, argv, result, maxlen); 1173 else if (!strcasecmp(argv[0], "@stats_delete")) 1174 r = message_stats_delete(md, argc, argv); 1175 else if (!strcasecmp(argv[0], "@stats_clear")) 1176 r = message_stats_clear(md, argc, argv); 1177 else if (!strcasecmp(argv[0], "@stats_list")) 1178 r = message_stats_list(md, argc, argv, result, maxlen); 1179 else if (!strcasecmp(argv[0], "@stats_print")) 1180 r = message_stats_print(md, argc, argv, false, result, maxlen); 1181 else if (!strcasecmp(argv[0], "@stats_print_clear")) 1182 r = message_stats_print(md, argc, argv, true, result, maxlen); 1183 else if (!strcasecmp(argv[0], "@stats_set_aux")) 1184 r = message_stats_set_aux(md, argc, argv); 1185 else 1186 return 2; /* this wasn't a stats message */ 1187 1188 if (r == -EINVAL) 1189 DMWARN("Invalid parameters for message %s", argv[0]); 1190 1191 return r; 1192 } 1193 1194 int __init dm_statistics_init(void) 1195 { 1196 shared_memory_amount = 0; 1197 dm_stat_need_rcu_barrier = 0; 1198 return 0; 1199 } 1200 1201 void dm_statistics_exit(void) 1202 { 1203 if (dm_stat_need_rcu_barrier) 1204 rcu_barrier(); 1205 if (WARN_ON(shared_memory_amount)) 1206 DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount); 1207 } 1208 1209 module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, S_IRUGO); 1210 MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics"); 1211