1 #include <linux/errno.h> 2 #include <linux/numa.h> 3 #include <linux/slab.h> 4 #include <linux/rculist.h> 5 #include <linux/threads.h> 6 #include <linux/preempt.h> 7 #include <linux/irqflags.h> 8 #include <linux/vmalloc.h> 9 #include <linux/mm.h> 10 #include <linux/module.h> 11 #include <linux/device-mapper.h> 12 13 #include "dm.h" 14 #include "dm-stats.h" 15 16 #define DM_MSG_PREFIX "stats" 17 18 static int dm_stat_need_rcu_barrier; 19 20 /* 21 * Using 64-bit values to avoid overflow (which is a 22 * problem that block/genhd.c's IO accounting has). 23 */ 24 struct dm_stat_percpu { 25 unsigned long long sectors[2]; 26 unsigned long long ios[2]; 27 unsigned long long merges[2]; 28 unsigned long long ticks[2]; 29 unsigned long long io_ticks[2]; 30 unsigned long long io_ticks_total; 31 unsigned long long time_in_queue; 32 unsigned long long *histogram; 33 }; 34 35 struct dm_stat_shared { 36 atomic_t in_flight[2]; 37 unsigned long long stamp; 38 struct dm_stat_percpu tmp; 39 }; 40 41 struct dm_stat { 42 struct list_head list_entry; 43 int id; 44 unsigned stat_flags; 45 size_t n_entries; 46 sector_t start; 47 sector_t end; 48 sector_t step; 49 unsigned n_histogram_entries; 50 unsigned long long *histogram_boundaries; 51 const char *program_id; 52 const char *aux_data; 53 struct rcu_head rcu_head; 54 size_t shared_alloc_size; 55 size_t percpu_alloc_size; 56 size_t histogram_alloc_size; 57 struct dm_stat_percpu *stat_percpu[NR_CPUS]; 58 struct dm_stat_shared stat_shared[0]; 59 }; 60 61 #define STAT_PRECISE_TIMESTAMPS 1 62 63 struct dm_stats_last_position { 64 sector_t last_sector; 65 unsigned last_rw; 66 }; 67 68 /* 69 * A typo on the command line could possibly make the kernel run out of memory 70 * and crash. To prevent the crash we account all used memory. We fail if we 71 * exhaust 1/4 of all memory or 1/2 of vmalloc space. 72 */ 73 #define DM_STATS_MEMORY_FACTOR 4 74 #define DM_STATS_VMALLOC_FACTOR 2 75 76 static DEFINE_SPINLOCK(shared_memory_lock); 77 78 static unsigned long shared_memory_amount; 79 80 static bool __check_shared_memory(size_t alloc_size) 81 { 82 size_t a; 83 84 a = shared_memory_amount + alloc_size; 85 if (a < shared_memory_amount) 86 return false; 87 if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR) 88 return false; 89 #ifdef CONFIG_MMU 90 if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR) 91 return false; 92 #endif 93 return true; 94 } 95 96 static bool check_shared_memory(size_t alloc_size) 97 { 98 bool ret; 99 100 spin_lock_irq(&shared_memory_lock); 101 102 ret = __check_shared_memory(alloc_size); 103 104 spin_unlock_irq(&shared_memory_lock); 105 106 return ret; 107 } 108 109 static bool claim_shared_memory(size_t alloc_size) 110 { 111 spin_lock_irq(&shared_memory_lock); 112 113 if (!__check_shared_memory(alloc_size)) { 114 spin_unlock_irq(&shared_memory_lock); 115 return false; 116 } 117 118 shared_memory_amount += alloc_size; 119 120 spin_unlock_irq(&shared_memory_lock); 121 122 return true; 123 } 124 125 static void free_shared_memory(size_t alloc_size) 126 { 127 unsigned long flags; 128 129 spin_lock_irqsave(&shared_memory_lock, flags); 130 131 if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) { 132 spin_unlock_irqrestore(&shared_memory_lock, flags); 133 DMCRIT("Memory usage accounting bug."); 134 return; 135 } 136 137 shared_memory_amount -= alloc_size; 138 139 spin_unlock_irqrestore(&shared_memory_lock, flags); 140 } 141 142 static void *dm_kvzalloc(size_t alloc_size, int node) 143 { 144 void *p; 145 146 if (!claim_shared_memory(alloc_size)) 147 return NULL; 148 149 if (alloc_size <= KMALLOC_MAX_SIZE) { 150 p = kzalloc_node(alloc_size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node); 151 if (p) 152 return p; 153 } 154 p = vzalloc_node(alloc_size, node); 155 if (p) 156 return p; 157 158 free_shared_memory(alloc_size); 159 160 return NULL; 161 } 162 163 static void dm_kvfree(void *ptr, size_t alloc_size) 164 { 165 if (!ptr) 166 return; 167 168 free_shared_memory(alloc_size); 169 170 kvfree(ptr); 171 } 172 173 static void dm_stat_free(struct rcu_head *head) 174 { 175 int cpu; 176 struct dm_stat *s = container_of(head, struct dm_stat, rcu_head); 177 178 kfree(s->program_id); 179 kfree(s->aux_data); 180 for_each_possible_cpu(cpu) { 181 dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size); 182 dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size); 183 } 184 dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size); 185 dm_kvfree(s, s->shared_alloc_size); 186 } 187 188 static int dm_stat_in_flight(struct dm_stat_shared *shared) 189 { 190 return atomic_read(&shared->in_flight[READ]) + 191 atomic_read(&shared->in_flight[WRITE]); 192 } 193 194 void dm_stats_init(struct dm_stats *stats) 195 { 196 int cpu; 197 struct dm_stats_last_position *last; 198 199 mutex_init(&stats->mutex); 200 INIT_LIST_HEAD(&stats->list); 201 stats->last = alloc_percpu(struct dm_stats_last_position); 202 for_each_possible_cpu(cpu) { 203 last = per_cpu_ptr(stats->last, cpu); 204 last->last_sector = (sector_t)ULLONG_MAX; 205 last->last_rw = UINT_MAX; 206 } 207 } 208 209 void dm_stats_cleanup(struct dm_stats *stats) 210 { 211 size_t ni; 212 struct dm_stat *s; 213 struct dm_stat_shared *shared; 214 215 while (!list_empty(&stats->list)) { 216 s = container_of(stats->list.next, struct dm_stat, list_entry); 217 list_del(&s->list_entry); 218 for (ni = 0; ni < s->n_entries; ni++) { 219 shared = &s->stat_shared[ni]; 220 if (WARN_ON(dm_stat_in_flight(shared))) { 221 DMCRIT("leaked in-flight counter at index %lu " 222 "(start %llu, end %llu, step %llu): reads %d, writes %d", 223 (unsigned long)ni, 224 (unsigned long long)s->start, 225 (unsigned long long)s->end, 226 (unsigned long long)s->step, 227 atomic_read(&shared->in_flight[READ]), 228 atomic_read(&shared->in_flight[WRITE])); 229 } 230 } 231 dm_stat_free(&s->rcu_head); 232 } 233 free_percpu(stats->last); 234 } 235 236 static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, 237 sector_t step, unsigned stat_flags, 238 unsigned n_histogram_entries, 239 unsigned long long *histogram_boundaries, 240 const char *program_id, const char *aux_data, 241 void (*suspend_callback)(struct mapped_device *), 242 void (*resume_callback)(struct mapped_device *), 243 struct mapped_device *md) 244 { 245 struct list_head *l; 246 struct dm_stat *s, *tmp_s; 247 sector_t n_entries; 248 size_t ni; 249 size_t shared_alloc_size; 250 size_t percpu_alloc_size; 251 size_t histogram_alloc_size; 252 struct dm_stat_percpu *p; 253 int cpu; 254 int ret_id; 255 int r; 256 257 if (end < start || !step) 258 return -EINVAL; 259 260 n_entries = end - start; 261 if (dm_sector_div64(n_entries, step)) 262 n_entries++; 263 264 if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1)) 265 return -EOVERFLOW; 266 267 shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared); 268 if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries) 269 return -EOVERFLOW; 270 271 percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu); 272 if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries) 273 return -EOVERFLOW; 274 275 histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long); 276 if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long)) 277 return -EOVERFLOW; 278 279 if (!check_shared_memory(shared_alloc_size + histogram_alloc_size + 280 num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size))) 281 return -ENOMEM; 282 283 s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE); 284 if (!s) 285 return -ENOMEM; 286 287 s->stat_flags = stat_flags; 288 s->n_entries = n_entries; 289 s->start = start; 290 s->end = end; 291 s->step = step; 292 s->shared_alloc_size = shared_alloc_size; 293 s->percpu_alloc_size = percpu_alloc_size; 294 s->histogram_alloc_size = histogram_alloc_size; 295 296 s->n_histogram_entries = n_histogram_entries; 297 s->histogram_boundaries = kmemdup(histogram_boundaries, 298 s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL); 299 if (!s->histogram_boundaries) { 300 r = -ENOMEM; 301 goto out; 302 } 303 304 s->program_id = kstrdup(program_id, GFP_KERNEL); 305 if (!s->program_id) { 306 r = -ENOMEM; 307 goto out; 308 } 309 s->aux_data = kstrdup(aux_data, GFP_KERNEL); 310 if (!s->aux_data) { 311 r = -ENOMEM; 312 goto out; 313 } 314 315 for (ni = 0; ni < n_entries; ni++) { 316 atomic_set(&s->stat_shared[ni].in_flight[READ], 0); 317 atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0); 318 } 319 320 if (s->n_histogram_entries) { 321 unsigned long long *hi; 322 hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE); 323 if (!hi) { 324 r = -ENOMEM; 325 goto out; 326 } 327 for (ni = 0; ni < n_entries; ni++) { 328 s->stat_shared[ni].tmp.histogram = hi; 329 hi += s->n_histogram_entries + 1; 330 } 331 } 332 333 for_each_possible_cpu(cpu) { 334 p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu)); 335 if (!p) { 336 r = -ENOMEM; 337 goto out; 338 } 339 s->stat_percpu[cpu] = p; 340 if (s->n_histogram_entries) { 341 unsigned long long *hi; 342 hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu)); 343 if (!hi) { 344 r = -ENOMEM; 345 goto out; 346 } 347 for (ni = 0; ni < n_entries; ni++) { 348 p[ni].histogram = hi; 349 hi += s->n_histogram_entries + 1; 350 } 351 } 352 } 353 354 /* 355 * Suspend/resume to make sure there is no i/o in flight, 356 * so that newly created statistics will be exact. 357 * 358 * (note: we couldn't suspend earlier because we must not 359 * allocate memory while suspended) 360 */ 361 suspend_callback(md); 362 363 mutex_lock(&stats->mutex); 364 s->id = 0; 365 list_for_each(l, &stats->list) { 366 tmp_s = container_of(l, struct dm_stat, list_entry); 367 if (WARN_ON(tmp_s->id < s->id)) { 368 r = -EINVAL; 369 goto out_unlock_resume; 370 } 371 if (tmp_s->id > s->id) 372 break; 373 if (unlikely(s->id == INT_MAX)) { 374 r = -ENFILE; 375 goto out_unlock_resume; 376 } 377 s->id++; 378 } 379 ret_id = s->id; 380 list_add_tail_rcu(&s->list_entry, l); 381 mutex_unlock(&stats->mutex); 382 383 resume_callback(md); 384 385 return ret_id; 386 387 out_unlock_resume: 388 mutex_unlock(&stats->mutex); 389 resume_callback(md); 390 out: 391 dm_stat_free(&s->rcu_head); 392 return r; 393 } 394 395 static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id) 396 { 397 struct dm_stat *s; 398 399 list_for_each_entry(s, &stats->list, list_entry) { 400 if (s->id > id) 401 break; 402 if (s->id == id) 403 return s; 404 } 405 406 return NULL; 407 } 408 409 static int dm_stats_delete(struct dm_stats *stats, int id) 410 { 411 struct dm_stat *s; 412 int cpu; 413 414 mutex_lock(&stats->mutex); 415 416 s = __dm_stats_find(stats, id); 417 if (!s) { 418 mutex_unlock(&stats->mutex); 419 return -ENOENT; 420 } 421 422 list_del_rcu(&s->list_entry); 423 mutex_unlock(&stats->mutex); 424 425 /* 426 * vfree can't be called from RCU callback 427 */ 428 for_each_possible_cpu(cpu) 429 if (is_vmalloc_addr(s->stat_percpu) || 430 is_vmalloc_addr(s->stat_percpu[cpu][0].histogram)) 431 goto do_sync_free; 432 if (is_vmalloc_addr(s) || 433 is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) { 434 do_sync_free: 435 synchronize_rcu_expedited(); 436 dm_stat_free(&s->rcu_head); 437 } else { 438 ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1; 439 call_rcu(&s->rcu_head, dm_stat_free); 440 } 441 return 0; 442 } 443 444 static int dm_stats_list(struct dm_stats *stats, const char *program, 445 char *result, unsigned maxlen) 446 { 447 struct dm_stat *s; 448 sector_t len; 449 unsigned sz = 0; 450 451 /* 452 * Output format: 453 * <region_id>: <start_sector>+<length> <step> <program_id> <aux_data> 454 */ 455 456 mutex_lock(&stats->mutex); 457 list_for_each_entry(s, &stats->list, list_entry) { 458 if (!program || !strcmp(program, s->program_id)) { 459 len = s->end - s->start; 460 DMEMIT("%d: %llu+%llu %llu %s %s\n", s->id, 461 (unsigned long long)s->start, 462 (unsigned long long)len, 463 (unsigned long long)s->step, 464 s->program_id, 465 s->aux_data); 466 } 467 } 468 mutex_unlock(&stats->mutex); 469 470 return 1; 471 } 472 473 static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared, 474 struct dm_stat_percpu *p) 475 { 476 /* 477 * This is racy, but so is part_round_stats_single. 478 */ 479 unsigned long long now, difference; 480 unsigned in_flight_read, in_flight_write; 481 482 if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS))) 483 now = jiffies; 484 else 485 now = ktime_to_ns(ktime_get()); 486 487 difference = now - shared->stamp; 488 if (!difference) 489 return; 490 491 in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]); 492 in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]); 493 if (in_flight_read) 494 p->io_ticks[READ] += difference; 495 if (in_flight_write) 496 p->io_ticks[WRITE] += difference; 497 if (in_flight_read + in_flight_write) { 498 p->io_ticks_total += difference; 499 p->time_in_queue += (in_flight_read + in_flight_write) * difference; 500 } 501 shared->stamp = now; 502 } 503 504 static void dm_stat_for_entry(struct dm_stat *s, size_t entry, 505 unsigned long bi_rw, sector_t len, 506 struct dm_stats_aux *stats_aux, bool end, 507 unsigned long duration_jiffies) 508 { 509 unsigned long idx = bi_rw & REQ_WRITE; 510 struct dm_stat_shared *shared = &s->stat_shared[entry]; 511 struct dm_stat_percpu *p; 512 513 /* 514 * For strict correctness we should use local_irq_save/restore 515 * instead of preempt_disable/enable. 516 * 517 * preempt_disable/enable is racy if the driver finishes bios 518 * from non-interrupt context as well as from interrupt context 519 * or from more different interrupts. 520 * 521 * On 64-bit architectures the race only results in not counting some 522 * events, so it is acceptable. On 32-bit architectures the race could 523 * cause the counter going off by 2^32, so we need to do proper locking 524 * there. 525 * 526 * part_stat_lock()/part_stat_unlock() have this race too. 527 */ 528 #if BITS_PER_LONG == 32 529 unsigned long flags; 530 local_irq_save(flags); 531 #else 532 preempt_disable(); 533 #endif 534 p = &s->stat_percpu[smp_processor_id()][entry]; 535 536 if (!end) { 537 dm_stat_round(s, shared, p); 538 atomic_inc(&shared->in_flight[idx]); 539 } else { 540 unsigned long long duration; 541 dm_stat_round(s, shared, p); 542 atomic_dec(&shared->in_flight[idx]); 543 p->sectors[idx] += len; 544 p->ios[idx] += 1; 545 p->merges[idx] += stats_aux->merged; 546 if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) { 547 p->ticks[idx] += duration_jiffies; 548 duration = jiffies_to_msecs(duration_jiffies); 549 } else { 550 p->ticks[idx] += stats_aux->duration_ns; 551 duration = stats_aux->duration_ns; 552 } 553 if (s->n_histogram_entries) { 554 unsigned lo = 0, hi = s->n_histogram_entries + 1; 555 while (lo + 1 < hi) { 556 unsigned mid = (lo + hi) / 2; 557 if (s->histogram_boundaries[mid - 1] > duration) { 558 hi = mid; 559 } else { 560 lo = mid; 561 } 562 563 } 564 p->histogram[lo]++; 565 } 566 } 567 568 #if BITS_PER_LONG == 32 569 local_irq_restore(flags); 570 #else 571 preempt_enable(); 572 #endif 573 } 574 575 static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw, 576 sector_t bi_sector, sector_t end_sector, 577 bool end, unsigned long duration_jiffies, 578 struct dm_stats_aux *stats_aux) 579 { 580 sector_t rel_sector, offset, todo, fragment_len; 581 size_t entry; 582 583 if (end_sector <= s->start || bi_sector >= s->end) 584 return; 585 if (unlikely(bi_sector < s->start)) { 586 rel_sector = 0; 587 todo = end_sector - s->start; 588 } else { 589 rel_sector = bi_sector - s->start; 590 todo = end_sector - bi_sector; 591 } 592 if (unlikely(end_sector > s->end)) 593 todo -= (end_sector - s->end); 594 595 offset = dm_sector_div64(rel_sector, s->step); 596 entry = rel_sector; 597 do { 598 if (WARN_ON_ONCE(entry >= s->n_entries)) { 599 DMCRIT("Invalid area access in region id %d", s->id); 600 return; 601 } 602 fragment_len = todo; 603 if (fragment_len > s->step - offset) 604 fragment_len = s->step - offset; 605 dm_stat_for_entry(s, entry, bi_rw, fragment_len, 606 stats_aux, end, duration_jiffies); 607 todo -= fragment_len; 608 entry++; 609 offset = 0; 610 } while (unlikely(todo != 0)); 611 } 612 613 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, 614 sector_t bi_sector, unsigned bi_sectors, bool end, 615 unsigned long duration_jiffies, 616 struct dm_stats_aux *stats_aux) 617 { 618 struct dm_stat *s; 619 sector_t end_sector; 620 struct dm_stats_last_position *last; 621 bool got_precise_time; 622 623 if (unlikely(!bi_sectors)) 624 return; 625 626 end_sector = bi_sector + bi_sectors; 627 628 if (!end) { 629 /* 630 * A race condition can at worst result in the merged flag being 631 * misrepresented, so we don't have to disable preemption here. 632 */ 633 last = raw_cpu_ptr(stats->last); 634 stats_aux->merged = 635 (bi_sector == (ACCESS_ONCE(last->last_sector) && 636 ((bi_rw & (REQ_WRITE | REQ_DISCARD)) == 637 (ACCESS_ONCE(last->last_rw) & (REQ_WRITE | REQ_DISCARD))) 638 )); 639 ACCESS_ONCE(last->last_sector) = end_sector; 640 ACCESS_ONCE(last->last_rw) = bi_rw; 641 } 642 643 rcu_read_lock(); 644 645 got_precise_time = false; 646 list_for_each_entry_rcu(s, &stats->list, list_entry) { 647 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) { 648 if (!end) 649 stats_aux->duration_ns = ktime_to_ns(ktime_get()); 650 else 651 stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns; 652 got_precise_time = true; 653 } 654 __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux); 655 } 656 657 rcu_read_unlock(); 658 } 659 660 static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared, 661 struct dm_stat *s, size_t x) 662 { 663 int cpu; 664 struct dm_stat_percpu *p; 665 666 local_irq_disable(); 667 p = &s->stat_percpu[smp_processor_id()][x]; 668 dm_stat_round(s, shared, p); 669 local_irq_enable(); 670 671 shared->tmp.sectors[READ] = 0; 672 shared->tmp.sectors[WRITE] = 0; 673 shared->tmp.ios[READ] = 0; 674 shared->tmp.ios[WRITE] = 0; 675 shared->tmp.merges[READ] = 0; 676 shared->tmp.merges[WRITE] = 0; 677 shared->tmp.ticks[READ] = 0; 678 shared->tmp.ticks[WRITE] = 0; 679 shared->tmp.io_ticks[READ] = 0; 680 shared->tmp.io_ticks[WRITE] = 0; 681 shared->tmp.io_ticks_total = 0; 682 shared->tmp.time_in_queue = 0; 683 684 if (s->n_histogram_entries) 685 memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long)); 686 687 for_each_possible_cpu(cpu) { 688 p = &s->stat_percpu[cpu][x]; 689 shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]); 690 shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]); 691 shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]); 692 shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]); 693 shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]); 694 shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]); 695 shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]); 696 shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]); 697 shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]); 698 shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]); 699 shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total); 700 shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue); 701 if (s->n_histogram_entries) { 702 unsigned i; 703 for (i = 0; i < s->n_histogram_entries + 1; i++) 704 shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]); 705 } 706 } 707 } 708 709 static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end, 710 bool init_tmp_percpu_totals) 711 { 712 size_t x; 713 struct dm_stat_shared *shared; 714 struct dm_stat_percpu *p; 715 716 for (x = idx_start; x < idx_end; x++) { 717 shared = &s->stat_shared[x]; 718 if (init_tmp_percpu_totals) 719 __dm_stat_init_temporary_percpu_totals(shared, s, x); 720 local_irq_disable(); 721 p = &s->stat_percpu[smp_processor_id()][x]; 722 p->sectors[READ] -= shared->tmp.sectors[READ]; 723 p->sectors[WRITE] -= shared->tmp.sectors[WRITE]; 724 p->ios[READ] -= shared->tmp.ios[READ]; 725 p->ios[WRITE] -= shared->tmp.ios[WRITE]; 726 p->merges[READ] -= shared->tmp.merges[READ]; 727 p->merges[WRITE] -= shared->tmp.merges[WRITE]; 728 p->ticks[READ] -= shared->tmp.ticks[READ]; 729 p->ticks[WRITE] -= shared->tmp.ticks[WRITE]; 730 p->io_ticks[READ] -= shared->tmp.io_ticks[READ]; 731 p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE]; 732 p->io_ticks_total -= shared->tmp.io_ticks_total; 733 p->time_in_queue -= shared->tmp.time_in_queue; 734 local_irq_enable(); 735 if (s->n_histogram_entries) { 736 unsigned i; 737 for (i = 0; i < s->n_histogram_entries + 1; i++) { 738 local_irq_disable(); 739 p = &s->stat_percpu[smp_processor_id()][x]; 740 p->histogram[i] -= shared->tmp.histogram[i]; 741 local_irq_enable(); 742 } 743 } 744 } 745 } 746 747 static int dm_stats_clear(struct dm_stats *stats, int id) 748 { 749 struct dm_stat *s; 750 751 mutex_lock(&stats->mutex); 752 753 s = __dm_stats_find(stats, id); 754 if (!s) { 755 mutex_unlock(&stats->mutex); 756 return -ENOENT; 757 } 758 759 __dm_stat_clear(s, 0, s->n_entries, true); 760 761 mutex_unlock(&stats->mutex); 762 763 return 1; 764 } 765 766 /* 767 * This is like jiffies_to_msec, but works for 64-bit values. 768 */ 769 static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j) 770 { 771 unsigned long long result; 772 unsigned mult; 773 774 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS) 775 return j; 776 777 result = 0; 778 if (j) 779 result = jiffies_to_msecs(j & 0x3fffff); 780 if (j >= 1 << 22) { 781 mult = jiffies_to_msecs(1 << 22); 782 result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff); 783 } 784 if (j >= 1ULL << 44) 785 result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44); 786 787 return result; 788 } 789 790 static int dm_stats_print(struct dm_stats *stats, int id, 791 size_t idx_start, size_t idx_len, 792 bool clear, char *result, unsigned maxlen) 793 { 794 unsigned sz = 0; 795 struct dm_stat *s; 796 size_t x; 797 sector_t start, end, step; 798 size_t idx_end; 799 struct dm_stat_shared *shared; 800 801 /* 802 * Output format: 803 * <start_sector>+<length> counters 804 */ 805 806 mutex_lock(&stats->mutex); 807 808 s = __dm_stats_find(stats, id); 809 if (!s) { 810 mutex_unlock(&stats->mutex); 811 return -ENOENT; 812 } 813 814 idx_end = idx_start + idx_len; 815 if (idx_end < idx_start || 816 idx_end > s->n_entries) 817 idx_end = s->n_entries; 818 819 if (idx_start > idx_end) 820 idx_start = idx_end; 821 822 step = s->step; 823 start = s->start + (step * idx_start); 824 825 for (x = idx_start; x < idx_end; x++, start = end) { 826 shared = &s->stat_shared[x]; 827 end = start + step; 828 if (unlikely(end > s->end)) 829 end = s->end; 830 831 __dm_stat_init_temporary_percpu_totals(shared, s, x); 832 833 DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu", 834 (unsigned long long)start, 835 (unsigned long long)step, 836 shared->tmp.ios[READ], 837 shared->tmp.merges[READ], 838 shared->tmp.sectors[READ], 839 dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]), 840 shared->tmp.ios[WRITE], 841 shared->tmp.merges[WRITE], 842 shared->tmp.sectors[WRITE], 843 dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]), 844 dm_stat_in_flight(shared), 845 dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total), 846 dm_jiffies_to_msec64(s, shared->tmp.time_in_queue), 847 dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]), 848 dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE])); 849 if (s->n_histogram_entries) { 850 unsigned i; 851 for (i = 0; i < s->n_histogram_entries + 1; i++) { 852 DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]); 853 } 854 } 855 DMEMIT("\n"); 856 857 if (unlikely(sz + 1 >= maxlen)) 858 goto buffer_overflow; 859 } 860 861 if (clear) 862 __dm_stat_clear(s, idx_start, idx_end, false); 863 864 buffer_overflow: 865 mutex_unlock(&stats->mutex); 866 867 return 1; 868 } 869 870 static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data) 871 { 872 struct dm_stat *s; 873 const char *new_aux_data; 874 875 mutex_lock(&stats->mutex); 876 877 s = __dm_stats_find(stats, id); 878 if (!s) { 879 mutex_unlock(&stats->mutex); 880 return -ENOENT; 881 } 882 883 new_aux_data = kstrdup(aux_data, GFP_KERNEL); 884 if (!new_aux_data) { 885 mutex_unlock(&stats->mutex); 886 return -ENOMEM; 887 } 888 889 kfree(s->aux_data); 890 s->aux_data = new_aux_data; 891 892 mutex_unlock(&stats->mutex); 893 894 return 0; 895 } 896 897 static int parse_histogram(const char *h, unsigned *n_histogram_entries, 898 unsigned long long **histogram_boundaries) 899 { 900 const char *q; 901 unsigned n; 902 unsigned long long last; 903 904 *n_histogram_entries = 1; 905 for (q = h; *q; q++) 906 if (*q == ',') 907 (*n_histogram_entries)++; 908 909 *histogram_boundaries = kmalloc(*n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL); 910 if (!*histogram_boundaries) 911 return -ENOMEM; 912 913 n = 0; 914 last = 0; 915 while (1) { 916 unsigned long long hi; 917 int s; 918 char ch; 919 s = sscanf(h, "%llu%c", &hi, &ch); 920 if (!s || (s == 2 && ch != ',')) 921 return -EINVAL; 922 if (hi <= last) 923 return -EINVAL; 924 last = hi; 925 (*histogram_boundaries)[n] = hi; 926 if (s == 1) 927 return 0; 928 h = strchr(h, ',') + 1; 929 n++; 930 } 931 } 932 933 static int message_stats_create(struct mapped_device *md, 934 unsigned argc, char **argv, 935 char *result, unsigned maxlen) 936 { 937 int r; 938 int id; 939 char dummy; 940 unsigned long long start, end, len, step; 941 unsigned divisor; 942 const char *program_id, *aux_data; 943 unsigned stat_flags = 0; 944 945 unsigned n_histogram_entries = 0; 946 unsigned long long *histogram_boundaries = NULL; 947 948 struct dm_arg_set as, as_backup; 949 const char *a; 950 unsigned feature_args; 951 952 /* 953 * Input format: 954 * <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]] 955 */ 956 957 if (argc < 3) 958 goto ret_einval; 959 960 as.argc = argc; 961 as.argv = argv; 962 dm_consume_args(&as, 1); 963 964 a = dm_shift_arg(&as); 965 if (!strcmp(a, "-")) { 966 start = 0; 967 len = dm_get_size(md); 968 if (!len) 969 len = 1; 970 } else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 || 971 start != (sector_t)start || len != (sector_t)len) 972 goto ret_einval; 973 974 end = start + len; 975 if (start >= end) 976 goto ret_einval; 977 978 a = dm_shift_arg(&as); 979 if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) { 980 if (!divisor) 981 return -EINVAL; 982 step = end - start; 983 if (do_div(step, divisor)) 984 step++; 985 if (!step) 986 step = 1; 987 } else if (sscanf(a, "%llu%c", &step, &dummy) != 1 || 988 step != (sector_t)step || !step) 989 goto ret_einval; 990 991 as_backup = as; 992 a = dm_shift_arg(&as); 993 if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) { 994 while (feature_args--) { 995 a = dm_shift_arg(&as); 996 if (!a) 997 goto ret_einval; 998 if (!strcasecmp(a, "precise_timestamps")) 999 stat_flags |= STAT_PRECISE_TIMESTAMPS; 1000 else if (!strncasecmp(a, "histogram:", 10)) { 1001 if (n_histogram_entries) 1002 goto ret_einval; 1003 if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries))) 1004 goto ret; 1005 } else 1006 goto ret_einval; 1007 } 1008 } else { 1009 as = as_backup; 1010 } 1011 1012 program_id = "-"; 1013 aux_data = "-"; 1014 1015 a = dm_shift_arg(&as); 1016 if (a) 1017 program_id = a; 1018 1019 a = dm_shift_arg(&as); 1020 if (a) 1021 aux_data = a; 1022 1023 if (as.argc) 1024 goto ret_einval; 1025 1026 /* 1027 * If a buffer overflow happens after we created the region, 1028 * it's too late (the userspace would retry with a larger 1029 * buffer, but the region id that caused the overflow is already 1030 * leaked). So we must detect buffer overflow in advance. 1031 */ 1032 snprintf(result, maxlen, "%d", INT_MAX); 1033 if (dm_message_test_buffer_overflow(result, maxlen)) { 1034 r = 1; 1035 goto ret; 1036 } 1037 1038 id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags, 1039 n_histogram_entries, histogram_boundaries, program_id, aux_data, 1040 dm_internal_suspend_fast, dm_internal_resume_fast, md); 1041 if (id < 0) { 1042 r = id; 1043 goto ret; 1044 } 1045 1046 snprintf(result, maxlen, "%d", id); 1047 1048 r = 1; 1049 goto ret; 1050 1051 ret_einval: 1052 r = -EINVAL; 1053 ret: 1054 kfree(histogram_boundaries); 1055 return r; 1056 } 1057 1058 static int message_stats_delete(struct mapped_device *md, 1059 unsigned argc, char **argv) 1060 { 1061 int id; 1062 char dummy; 1063 1064 if (argc != 2) 1065 return -EINVAL; 1066 1067 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1068 return -EINVAL; 1069 1070 return dm_stats_delete(dm_get_stats(md), id); 1071 } 1072 1073 static int message_stats_clear(struct mapped_device *md, 1074 unsigned argc, char **argv) 1075 { 1076 int id; 1077 char dummy; 1078 1079 if (argc != 2) 1080 return -EINVAL; 1081 1082 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1083 return -EINVAL; 1084 1085 return dm_stats_clear(dm_get_stats(md), id); 1086 } 1087 1088 static int message_stats_list(struct mapped_device *md, 1089 unsigned argc, char **argv, 1090 char *result, unsigned maxlen) 1091 { 1092 int r; 1093 const char *program = NULL; 1094 1095 if (argc < 1 || argc > 2) 1096 return -EINVAL; 1097 1098 if (argc > 1) { 1099 program = kstrdup(argv[1], GFP_KERNEL); 1100 if (!program) 1101 return -ENOMEM; 1102 } 1103 1104 r = dm_stats_list(dm_get_stats(md), program, result, maxlen); 1105 1106 kfree(program); 1107 1108 return r; 1109 } 1110 1111 static int message_stats_print(struct mapped_device *md, 1112 unsigned argc, char **argv, bool clear, 1113 char *result, unsigned maxlen) 1114 { 1115 int id; 1116 char dummy; 1117 unsigned long idx_start = 0, idx_len = ULONG_MAX; 1118 1119 if (argc != 2 && argc != 4) 1120 return -EINVAL; 1121 1122 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1123 return -EINVAL; 1124 1125 if (argc > 3) { 1126 if (strcmp(argv[2], "-") && 1127 sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1) 1128 return -EINVAL; 1129 if (strcmp(argv[3], "-") && 1130 sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1) 1131 return -EINVAL; 1132 } 1133 1134 return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear, 1135 result, maxlen); 1136 } 1137 1138 static int message_stats_set_aux(struct mapped_device *md, 1139 unsigned argc, char **argv) 1140 { 1141 int id; 1142 char dummy; 1143 1144 if (argc != 3) 1145 return -EINVAL; 1146 1147 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1148 return -EINVAL; 1149 1150 return dm_stats_set_aux(dm_get_stats(md), id, argv[2]); 1151 } 1152 1153 int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv, 1154 char *result, unsigned maxlen) 1155 { 1156 int r; 1157 1158 /* All messages here must start with '@' */ 1159 if (!strcasecmp(argv[0], "@stats_create")) 1160 r = message_stats_create(md, argc, argv, result, maxlen); 1161 else if (!strcasecmp(argv[0], "@stats_delete")) 1162 r = message_stats_delete(md, argc, argv); 1163 else if (!strcasecmp(argv[0], "@stats_clear")) 1164 r = message_stats_clear(md, argc, argv); 1165 else if (!strcasecmp(argv[0], "@stats_list")) 1166 r = message_stats_list(md, argc, argv, result, maxlen); 1167 else if (!strcasecmp(argv[0], "@stats_print")) 1168 r = message_stats_print(md, argc, argv, false, result, maxlen); 1169 else if (!strcasecmp(argv[0], "@stats_print_clear")) 1170 r = message_stats_print(md, argc, argv, true, result, maxlen); 1171 else if (!strcasecmp(argv[0], "@stats_set_aux")) 1172 r = message_stats_set_aux(md, argc, argv); 1173 else 1174 return 2; /* this wasn't a stats message */ 1175 1176 if (r == -EINVAL) 1177 DMWARN("Invalid parameters for message %s", argv[0]); 1178 1179 return r; 1180 } 1181 1182 int __init dm_statistics_init(void) 1183 { 1184 shared_memory_amount = 0; 1185 dm_stat_need_rcu_barrier = 0; 1186 return 0; 1187 } 1188 1189 void dm_statistics_exit(void) 1190 { 1191 if (dm_stat_need_rcu_barrier) 1192 rcu_barrier(); 1193 if (WARN_ON(shared_memory_amount)) 1194 DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount); 1195 } 1196 1197 module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, S_IRUGO); 1198 MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics"); 1199