1 /* 2 * qsp.c - QEMU Synchronization Profiler 3 * 4 * Copyright (C) 2018, Emilio G. Cota <cota@braap.org> 5 * 6 * License: GNU GPL, version 2 or later. 7 * See the COPYING file in the top-level directory. 8 * 9 * QSP profiles the time spent in synchronization primitives, which can 10 * help diagnose performance problems, e.g. scalability issues when 11 * contention is high. 12 * 13 * The primitives currently supported are mutexes, recursive mutexes and 14 * condition variables. Note that not all related functions are intercepted; 15 * instead we profile only those functions that can have a performance impact, 16 * either due to blocking (e.g. cond_wait, mutex_lock) or cache line 17 * contention (e.g. mutex_lock, mutex_trylock). 18 * 19 * QSP's design focuses on speed and scalability. This is achieved 20 * by having threads do their profiling entirely on thread-local data. 21 * The appropriate thread-local data is found via a QHT, i.e. a concurrent hash 22 * table. To aggregate data in order to generate a report, we iterate over 23 * all entries in the hash table. Depending on the number of threads and 24 * synchronization objects this might be expensive, but note that it is 25 * very rarely called -- reports are generated only when requested by users. 26 * 27 * Reports are generated as a table where each row represents a call site. A 28 * call site is the triplet formed by the __file__ and __LINE__ of the caller 29 * as well as the address of the "object" (i.e. mutex, rec. mutex or condvar) 30 * being operated on. Optionally, call sites that operate on different objects 31 * of the same type can be coalesced, which can be particularly useful when 32 * profiling dynamically-allocated objects. 33 * 34 * Alternative designs considered: 35 * 36 * - Use an off-the-shelf profiler such as mutrace. This is not a viable option 37 * for us because QEMU has __malloc_hook set (by one of the libraries it 38 * uses); leaving this hook unset is required to avoid deadlock in mutrace. 39 * 40 * - Use a glib HT for each thread, protecting each HT with its own lock. 41 * This isn't simpler than the current design, and is 10% slower in the 42 * atomic_add-bench microbenchmark (-m option). 43 * 44 * - For reports, just use a binary tree as we aggregate data, instead of having 45 * an intermediate hash table. This would simplify the code only slightly, but 46 * would perform badly if there were many threads and objects to track. 47 * 48 * - Wrap operations on qsp entries with RCU read-side critical sections, so 49 * that qsp_reset() can delete entries. Unfortunately, the overhead of calling 50 * rcu_read_lock/unlock slows down atomic_add-bench -m by 24%. Having 51 * a snapshot that is updated on qsp_reset() avoids this overhead. 52 * 53 * Related Work: 54 * - Lennart Poettering's mutrace: http://0pointer.de/blog/projects/mutrace.html 55 * - Lozi, David, Thomas, Lawall and Muller. "Remote Core Locking: Migrating 56 * Critical-Section Execution to Improve the Performance of Multithreaded 57 * Applications", USENIX ATC'12. 58 */ 59 #include "qemu/osdep.h" 60 #include "qemu/thread.h" 61 #include "qemu/timer.h" 62 #include "qemu/qht.h" 63 #include "qemu/rcu.h" 64 #include "exec/tb-hash-xx.h" 65 66 enum QSPType { 67 QSP_MUTEX, 68 QSP_BQL_MUTEX, 69 QSP_REC_MUTEX, 70 QSP_CONDVAR, 71 }; 72 73 struct QSPCallSite { 74 const void *obj; 75 const char *file; /* i.e. __FILE__; shortened later */ 76 int line; 77 enum QSPType type; 78 }; 79 typedef struct QSPCallSite QSPCallSite; 80 81 struct QSPEntry { 82 void *thread_ptr; 83 const QSPCallSite *callsite; 84 uint64_t n_acqs; 85 uint64_t ns; 86 unsigned int n_objs; /* count of coalesced objs; only used for reporting */ 87 #ifndef CONFIG_ATOMIC64 88 /* 89 * If we cannot update the counts atomically, then use a seqlock. 90 * We don't need an associated lock because the updates are thread-local. 91 */ 92 QemuSeqLock sequence; 93 #endif 94 }; 95 typedef struct QSPEntry QSPEntry; 96 97 struct QSPSnapshot { 98 struct rcu_head rcu; 99 struct qht ht; 100 }; 101 typedef struct QSPSnapshot QSPSnapshot; 102 103 /* initial sizing for hash tables */ 104 #define QSP_INITIAL_SIZE 64 105 106 /* If this file is moved, QSP_REL_PATH should be updated accordingly */ 107 #define QSP_REL_PATH "util/qsp.c" 108 109 /* this file's full path. Used to present all call sites with relative paths */ 110 static size_t qsp_qemu_path_len; 111 112 /* the address of qsp_thread gives us a unique 'thread ID' */ 113 static __thread int qsp_thread; 114 115 /* 116 * Call sites are the same for all threads, so we track them in a separate hash 117 * table to save memory. 118 */ 119 static struct qht qsp_callsite_ht; 120 121 static struct qht qsp_ht; 122 static QSPSnapshot *qsp_snapshot; 123 static bool qsp_initialized, qsp_initializing; 124 125 static const char * const qsp_typenames[] = { 126 [QSP_MUTEX] = "mutex", 127 [QSP_BQL_MUTEX] = "BQL mutex", 128 [QSP_REC_MUTEX] = "rec_mutex", 129 [QSP_CONDVAR] = "condvar", 130 }; 131 132 QemuMutexLockFunc qemu_bql_mutex_lock_func = qemu_mutex_lock_impl; 133 QemuMutexLockFunc qemu_mutex_lock_func = qemu_mutex_lock_impl; 134 QemuMutexTrylockFunc qemu_mutex_trylock_func = qemu_mutex_trylock_impl; 135 QemuRecMutexLockFunc qemu_rec_mutex_lock_func = qemu_rec_mutex_lock_impl; 136 QemuRecMutexTrylockFunc qemu_rec_mutex_trylock_func = 137 qemu_rec_mutex_trylock_impl; 138 QemuCondWaitFunc qemu_cond_wait_func = qemu_cond_wait_impl; 139 140 /* 141 * It pays off to _not_ hash callsite->file; hashing a string is slow, and 142 * without it we still get a pretty unique hash. 143 */ 144 static inline 145 uint32_t do_qsp_callsite_hash(const QSPCallSite *callsite, uint64_t a) 146 { 147 uint64_t b = (uint64_t)(uintptr_t)callsite->obj; 148 uint32_t e = callsite->line; 149 uint32_t f = callsite->type; 150 151 return tb_hash_func7(a, b, e, f, 0); 152 } 153 154 static inline 155 uint32_t qsp_callsite_hash(const QSPCallSite *callsite) 156 { 157 return do_qsp_callsite_hash(callsite, 0); 158 } 159 160 static inline uint32_t do_qsp_entry_hash(const QSPEntry *entry, uint64_t a) 161 { 162 return do_qsp_callsite_hash(entry->callsite, a); 163 } 164 165 static uint32_t qsp_entry_hash(const QSPEntry *entry) 166 { 167 return do_qsp_entry_hash(entry, (uint64_t)(uintptr_t)entry->thread_ptr); 168 } 169 170 static uint32_t qsp_entry_no_thread_hash(const QSPEntry *entry) 171 { 172 return do_qsp_entry_hash(entry, 0); 173 } 174 175 /* without the objects we need to hash the file name to get a decent hash */ 176 static uint32_t qsp_entry_no_thread_obj_hash(const QSPEntry *entry) 177 { 178 const QSPCallSite *callsite = entry->callsite; 179 uint64_t a = g_str_hash(callsite->file); 180 uint64_t b = callsite->line; 181 uint32_t e = callsite->type; 182 183 return tb_hash_func7(a, b, e, 0, 0); 184 } 185 186 static bool qsp_callsite_cmp(const void *ap, const void *bp) 187 { 188 const QSPCallSite *a = ap; 189 const QSPCallSite *b = bp; 190 191 return a == b || 192 (a->obj == b->obj && 193 a->line == b->line && 194 a->type == b->type && 195 (a->file == b->file || !strcmp(a->file, b->file))); 196 } 197 198 static bool qsp_callsite_no_obj_cmp(const void *ap, const void *bp) 199 { 200 const QSPCallSite *a = ap; 201 const QSPCallSite *b = bp; 202 203 return a == b || 204 (a->line == b->line && 205 a->type == b->type && 206 (a->file == b->file || !strcmp(a->file, b->file))); 207 } 208 209 static bool qsp_entry_no_thread_cmp(const void *ap, const void *bp) 210 { 211 const QSPEntry *a = ap; 212 const QSPEntry *b = bp; 213 214 return qsp_callsite_cmp(a->callsite, b->callsite); 215 } 216 217 static bool qsp_entry_no_thread_obj_cmp(const void *ap, const void *bp) 218 { 219 const QSPEntry *a = ap; 220 const QSPEntry *b = bp; 221 222 return qsp_callsite_no_obj_cmp(a->callsite, b->callsite); 223 } 224 225 static bool qsp_entry_cmp(const void *ap, const void *bp) 226 { 227 const QSPEntry *a = ap; 228 const QSPEntry *b = bp; 229 230 return a->thread_ptr == b->thread_ptr && 231 qsp_callsite_cmp(a->callsite, b->callsite); 232 } 233 234 /* 235 * Normally we'd call this from a constructor function, but we want it to work 236 * via libutil as well. 237 */ 238 static void qsp_do_init(void) 239 { 240 /* make sure this file's path in the tree is up to date with QSP_REL_PATH */ 241 g_assert(strstr(__FILE__, QSP_REL_PATH)); 242 qsp_qemu_path_len = strlen(__FILE__) - strlen(QSP_REL_PATH); 243 244 qht_init(&qsp_ht, qsp_entry_cmp, QSP_INITIAL_SIZE, 245 QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES); 246 qht_init(&qsp_callsite_ht, qsp_callsite_cmp, QSP_INITIAL_SIZE, 247 QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES); 248 } 249 250 static __attribute__((noinline)) void qsp_init__slowpath(void) 251 { 252 if (atomic_cmpxchg(&qsp_initializing, false, true) == false) { 253 qsp_do_init(); 254 atomic_set(&qsp_initialized, true); 255 } else { 256 while (!atomic_read(&qsp_initialized)) { 257 cpu_relax(); 258 } 259 } 260 } 261 262 /* qsp_init() must be called from _all_ exported functions */ 263 static inline void qsp_init(void) 264 { 265 if (likely(atomic_read(&qsp_initialized))) { 266 return; 267 } 268 qsp_init__slowpath(); 269 } 270 271 static QSPCallSite *qsp_callsite_find(const QSPCallSite *orig) 272 { 273 QSPCallSite *callsite; 274 uint32_t hash; 275 276 hash = qsp_callsite_hash(orig); 277 callsite = qht_lookup(&qsp_callsite_ht, orig, hash); 278 if (callsite == NULL) { 279 void *existing = NULL; 280 281 callsite = g_new(QSPCallSite, 1); 282 memcpy(callsite, orig, sizeof(*callsite)); 283 qht_insert(&qsp_callsite_ht, callsite, hash, &existing); 284 if (unlikely(existing)) { 285 g_free(callsite); 286 callsite = existing; 287 } 288 } 289 return callsite; 290 } 291 292 static QSPEntry * 293 qsp_entry_create(struct qht *ht, const QSPEntry *entry, uint32_t hash) 294 { 295 QSPEntry *e; 296 void *existing = NULL; 297 298 e = g_new0(QSPEntry, 1); 299 e->thread_ptr = entry->thread_ptr; 300 e->callsite = qsp_callsite_find(entry->callsite); 301 302 qht_insert(ht, e, hash, &existing); 303 if (unlikely(existing)) { 304 g_free(e); 305 e = existing; 306 } 307 return e; 308 } 309 310 static QSPEntry * 311 qsp_entry_find(struct qht *ht, const QSPEntry *entry, uint32_t hash) 312 { 313 QSPEntry *e; 314 315 e = qht_lookup(ht, entry, hash); 316 if (e == NULL) { 317 e = qsp_entry_create(ht, entry, hash); 318 } 319 return e; 320 } 321 322 /* 323 * Note: Entries are never removed, so callers do not have to be in an RCU 324 * read-side critical section. 325 */ 326 static QSPEntry *qsp_entry_get(const void *obj, const char *file, int line, 327 enum QSPType type) 328 { 329 QSPCallSite callsite = { 330 .obj = obj, 331 .file = file, 332 .line = line, 333 .type = type, 334 }; 335 QSPEntry orig; 336 uint32_t hash; 337 338 qsp_init(); 339 340 orig.thread_ptr = &qsp_thread; 341 orig.callsite = &callsite; 342 343 hash = qsp_entry_hash(&orig); 344 return qsp_entry_find(&qsp_ht, &orig, hash); 345 } 346 347 /* 348 * @from is in the global hash table; read it atomically if the host 349 * supports it, otherwise use the seqlock. 350 */ 351 static void qsp_entry_aggregate(QSPEntry *to, const QSPEntry *from) 352 { 353 #ifdef CONFIG_ATOMIC64 354 to->ns += atomic_read__nocheck(&from->ns); 355 to->n_acqs += atomic_read__nocheck(&from->n_acqs); 356 #else 357 unsigned int version; 358 uint64_t ns, n_acqs; 359 360 do { 361 version = seqlock_read_begin(&from->sequence); 362 ns = atomic_read__nocheck(&from->ns); 363 n_acqs = atomic_read__nocheck(&from->n_acqs); 364 } while (seqlock_read_retry(&from->sequence, version)); 365 366 to->ns += ns; 367 to->n_acqs += n_acqs; 368 #endif 369 } 370 371 /* 372 * @e is in the global hash table; it is only written to by the current thread, 373 * so we write to it atomically (as in "write once") to prevent torn reads. 374 * If the host doesn't support u64 atomics, use the seqlock. 375 */ 376 static inline void do_qsp_entry_record(QSPEntry *e, int64_t delta, bool acq) 377 { 378 #ifndef CONFIG_ATOMIC64 379 seqlock_write_begin(&e->sequence); 380 #endif 381 atomic_set__nocheck(&e->ns, e->ns + delta); 382 if (acq) { 383 atomic_set__nocheck(&e->n_acqs, e->n_acqs + 1); 384 } 385 #ifndef CONFIG_ATOMIC64 386 seqlock_write_end(&e->sequence); 387 #endif 388 } 389 390 static inline void qsp_entry_record(QSPEntry *e, int64_t delta) 391 { 392 do_qsp_entry_record(e, delta, true); 393 } 394 395 #define QSP_GEN_VOID(type_, qsp_t_, func_, impl_) \ 396 static void func_(type_ *obj, const char *file, int line) \ 397 { \ 398 QSPEntry *e; \ 399 int64_t t0, t1; \ 400 \ 401 t0 = get_clock(); \ 402 impl_(obj, file, line); \ 403 t1 = get_clock(); \ 404 \ 405 e = qsp_entry_get(obj, file, line, qsp_t_); \ 406 qsp_entry_record(e, t1 - t0); \ 407 } 408 409 #define QSP_GEN_RET1(type_, qsp_t_, func_, impl_) \ 410 static int func_(type_ *obj, const char *file, int line) \ 411 { \ 412 QSPEntry *e; \ 413 int64_t t0, t1; \ 414 int err; \ 415 \ 416 t0 = get_clock(); \ 417 err = impl_(obj, file, line); \ 418 t1 = get_clock(); \ 419 \ 420 e = qsp_entry_get(obj, file, line, qsp_t_); \ 421 do_qsp_entry_record(e, t1 - t0, !err); \ 422 return err; \ 423 } 424 425 QSP_GEN_VOID(QemuMutex, QSP_BQL_MUTEX, qsp_bql_mutex_lock, qemu_mutex_lock_impl) 426 QSP_GEN_VOID(QemuMutex, QSP_MUTEX, qsp_mutex_lock, qemu_mutex_lock_impl) 427 QSP_GEN_RET1(QemuMutex, QSP_MUTEX, qsp_mutex_trylock, qemu_mutex_trylock_impl) 428 429 QSP_GEN_VOID(QemuRecMutex, QSP_REC_MUTEX, qsp_rec_mutex_lock, 430 qemu_rec_mutex_lock_impl) 431 QSP_GEN_RET1(QemuRecMutex, QSP_REC_MUTEX, qsp_rec_mutex_trylock, 432 qemu_rec_mutex_trylock_impl) 433 434 #undef QSP_GEN_RET1 435 #undef QSP_GEN_VOID 436 437 static void 438 qsp_cond_wait(QemuCond *cond, QemuMutex *mutex, const char *file, int line) 439 { 440 QSPEntry *e; 441 int64_t t0, t1; 442 443 t0 = get_clock(); 444 qemu_cond_wait_impl(cond, mutex, file, line); 445 t1 = get_clock(); 446 447 e = qsp_entry_get(cond, file, line, QSP_CONDVAR); 448 qsp_entry_record(e, t1 - t0); 449 } 450 451 bool qsp_is_enabled(void) 452 { 453 return atomic_read(&qemu_mutex_lock_func) == qsp_mutex_lock; 454 } 455 456 void qsp_enable(void) 457 { 458 atomic_set(&qemu_mutex_lock_func, qsp_mutex_lock); 459 atomic_set(&qemu_mutex_trylock_func, qsp_mutex_trylock); 460 atomic_set(&qemu_bql_mutex_lock_func, qsp_bql_mutex_lock); 461 atomic_set(&qemu_rec_mutex_lock_func, qsp_rec_mutex_lock); 462 atomic_set(&qemu_rec_mutex_trylock_func, qsp_rec_mutex_trylock); 463 atomic_set(&qemu_cond_wait_func, qsp_cond_wait); 464 } 465 466 void qsp_disable(void) 467 { 468 atomic_set(&qemu_mutex_lock_func, qemu_mutex_lock_impl); 469 atomic_set(&qemu_mutex_trylock_func, qemu_mutex_trylock_impl); 470 atomic_set(&qemu_bql_mutex_lock_func, qemu_mutex_lock_impl); 471 atomic_set(&qemu_rec_mutex_lock_func, qemu_rec_mutex_lock_impl); 472 atomic_set(&qemu_rec_mutex_trylock_func, qemu_rec_mutex_trylock_impl); 473 atomic_set(&qemu_cond_wait_func, qemu_cond_wait_impl); 474 } 475 476 static gint qsp_tree_cmp(gconstpointer ap, gconstpointer bp, gpointer up) 477 { 478 const QSPEntry *a = ap; 479 const QSPEntry *b = bp; 480 enum QSPSortBy sort_by = *(enum QSPSortBy *)up; 481 const QSPCallSite *ca; 482 const QSPCallSite *cb; 483 484 switch (sort_by) { 485 case QSP_SORT_BY_TOTAL_WAIT_TIME: 486 if (a->ns > b->ns) { 487 return -1; 488 } else if (a->ns < b->ns) { 489 return 1; 490 } 491 break; 492 case QSP_SORT_BY_AVG_WAIT_TIME: 493 { 494 double avg_a = a->n_acqs ? a->ns / a->n_acqs : 0; 495 double avg_b = b->n_acqs ? b->ns / b->n_acqs : 0; 496 497 if (avg_a > avg_b) { 498 return -1; 499 } else if (avg_a < avg_b) { 500 return 1; 501 } 502 break; 503 } 504 default: 505 g_assert_not_reached(); 506 } 507 508 ca = a->callsite; 509 cb = b->callsite; 510 /* Break the tie with the object's address */ 511 if (ca->obj < cb->obj) { 512 return -1; 513 } else if (ca->obj > cb->obj) { 514 return 1; 515 } else { 516 int cmp; 517 518 /* same obj. Break the tie with the callsite's file */ 519 cmp = strcmp(ca->file, cb->file); 520 if (cmp) { 521 return cmp; 522 } 523 /* same callsite file. Break the tie with the callsite's line */ 524 g_assert(ca->line != cb->line); 525 if (ca->line < cb->line) { 526 return -1; 527 } else if (ca->line > cb->line) { 528 return 1; 529 } else { 530 /* break the tie with the callsite's type */ 531 return cb->type - ca->type; 532 } 533 } 534 } 535 536 static void qsp_sort(struct qht *ht, void *p, uint32_t h, void *userp) 537 { 538 QSPEntry *e = p; 539 GTree *tree = userp; 540 541 g_tree_insert(tree, e, NULL); 542 } 543 544 static void qsp_aggregate(struct qht *global_ht, void *p, uint32_t h, void *up) 545 { 546 struct qht *ht = up; 547 const QSPEntry *e = p; 548 QSPEntry *agg; 549 uint32_t hash; 550 551 hash = qsp_entry_no_thread_hash(e); 552 agg = qsp_entry_find(ht, e, hash); 553 qsp_entry_aggregate(agg, e); 554 } 555 556 static void qsp_iter_diff(struct qht *orig, void *p, uint32_t hash, void *htp) 557 { 558 struct qht *ht = htp; 559 QSPEntry *old = p; 560 QSPEntry *new; 561 562 new = qht_lookup(ht, old, hash); 563 /* entries are never deleted, so we must have this one */ 564 g_assert(new != NULL); 565 /* our reading of the stats happened after the snapshot was taken */ 566 g_assert(new->n_acqs >= old->n_acqs); 567 g_assert(new->ns >= old->ns); 568 569 new->n_acqs -= old->n_acqs; 570 new->ns -= old->ns; 571 572 /* No point in reporting an empty entry */ 573 if (new->n_acqs == 0 && new->ns == 0) { 574 bool removed = qht_remove(ht, new, hash); 575 576 g_assert(removed); 577 g_free(new); 578 } 579 } 580 581 static void qsp_diff(struct qht *orig, struct qht *new) 582 { 583 qht_iter(orig, qsp_iter_diff, new); 584 } 585 586 static void 587 qsp_iter_callsite_coalesce(struct qht *orig, void *p, uint32_t h, void *htp) 588 { 589 struct qht *ht = htp; 590 QSPEntry *old = p; 591 QSPEntry *e; 592 uint32_t hash; 593 594 hash = qsp_entry_no_thread_obj_hash(old); 595 e = qht_lookup(ht, old, hash); 596 if (e == NULL) { 597 e = qsp_entry_create(ht, old, hash); 598 e->n_objs = 1; 599 } else if (e->callsite->obj != old->callsite->obj) { 600 e->n_objs++; 601 } 602 e->ns += old->ns; 603 e->n_acqs += old->n_acqs; 604 } 605 606 static void qsp_ht_delete(struct qht *ht, void *p, uint32_t h, void *htp) 607 { 608 g_free(p); 609 } 610 611 static void qsp_mktree(GTree *tree, bool callsite_coalesce) 612 { 613 QSPSnapshot *snap; 614 struct qht ht, coalesce_ht; 615 struct qht *htp; 616 617 /* 618 * First, see if there's a prior snapshot, so that we read the global hash 619 * table _after_ the snapshot has been created, which guarantees that 620 * the entries we'll read will be a superset of the snapshot's entries. 621 * 622 * We must remain in an RCU read-side critical section until we're done 623 * with the snapshot. 624 */ 625 rcu_read_lock(); 626 snap = atomic_rcu_read(&qsp_snapshot); 627 628 /* Aggregate all results from the global hash table into a local one */ 629 qht_init(&ht, qsp_entry_no_thread_cmp, QSP_INITIAL_SIZE, 630 QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES); 631 qht_iter(&qsp_ht, qsp_aggregate, &ht); 632 633 /* compute the difference wrt the snapshot, if any */ 634 if (snap) { 635 qsp_diff(&snap->ht, &ht); 636 } 637 /* done with the snapshot; RCU can reclaim it */ 638 rcu_read_unlock(); 639 640 htp = &ht; 641 if (callsite_coalesce) { 642 qht_init(&coalesce_ht, qsp_entry_no_thread_obj_cmp, QSP_INITIAL_SIZE, 643 QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES); 644 qht_iter(&ht, qsp_iter_callsite_coalesce, &coalesce_ht); 645 646 /* free the previous hash table, and point htp to coalesce_ht */ 647 qht_iter(&ht, qsp_ht_delete, NULL); 648 qht_destroy(&ht); 649 htp = &coalesce_ht; 650 } 651 652 /* sort the hash table elements by using a tree */ 653 qht_iter(htp, qsp_sort, tree); 654 655 /* free the hash table, but keep the elements (those are in the tree now) */ 656 qht_destroy(htp); 657 } 658 659 /* free string with g_free */ 660 static char *qsp_at(const QSPCallSite *callsite) 661 { 662 GString *s = g_string_new(NULL); 663 const char *shortened; 664 665 /* remove the absolute path to qemu */ 666 if (unlikely(strlen(callsite->file) < qsp_qemu_path_len)) { 667 shortened = callsite->file; 668 } else { 669 shortened = callsite->file + qsp_qemu_path_len; 670 } 671 g_string_append_printf(s, "%s:%u", shortened, callsite->line); 672 return g_string_free(s, FALSE); 673 } 674 675 struct QSPReportEntry { 676 const void *obj; 677 char *callsite_at; 678 const char *typename; 679 double time_s; 680 double ns_avg; 681 uint64_t n_acqs; 682 unsigned int n_objs; 683 }; 684 typedef struct QSPReportEntry QSPReportEntry; 685 686 struct QSPReport { 687 QSPReportEntry *entries; 688 size_t n_entries; 689 size_t max_n_entries; 690 }; 691 typedef struct QSPReport QSPReport; 692 693 static gboolean qsp_tree_report(gpointer key, gpointer value, gpointer udata) 694 { 695 const QSPEntry *e = key; 696 QSPReport *report = udata; 697 QSPReportEntry *entry; 698 699 if (report->n_entries == report->max_n_entries) { 700 return TRUE; 701 } 702 entry = &report->entries[report->n_entries]; 703 report->n_entries++; 704 705 entry->obj = e->callsite->obj; 706 entry->n_objs = e->n_objs; 707 entry->callsite_at = qsp_at(e->callsite); 708 entry->typename = qsp_typenames[e->callsite->type]; 709 entry->time_s = e->ns * 1e-9; 710 entry->n_acqs = e->n_acqs; 711 entry->ns_avg = e->n_acqs ? e->ns / e->n_acqs : 0; 712 return FALSE; 713 } 714 715 static void 716 pr_report(const QSPReport *rep, FILE *f, fprintf_function pr) 717 { 718 char *dashes; 719 size_t max_len = 0; 720 int callsite_len = 0; 721 int callsite_rspace; 722 int n_dashes; 723 size_t i; 724 725 /* find out the maximum length of all 'callsite' fields */ 726 for (i = 0; i < rep->n_entries; i++) { 727 const QSPReportEntry *e = &rep->entries[i]; 728 size_t len = strlen(e->callsite_at); 729 730 if (len > max_len) { 731 max_len = len; 732 } 733 } 734 735 callsite_len = MAX(max_len, strlen("Call site")); 736 /* white space to leave to the right of "Call site" */ 737 callsite_rspace = callsite_len - strlen("Call site"); 738 739 pr(f, "Type Object Call site%*s Wait Time (s) " 740 " Count Average (us)\n", callsite_rspace, ""); 741 742 /* build a horizontal rule with dashes */ 743 n_dashes = 79 + callsite_rspace; 744 dashes = g_malloc(n_dashes + 1); 745 memset(dashes, '-', n_dashes); 746 dashes[n_dashes] = '\0'; 747 pr(f, "%s\n", dashes); 748 749 for (i = 0; i < rep->n_entries; i++) { 750 const QSPReportEntry *e = &rep->entries[i]; 751 GString *s = g_string_new(NULL); 752 753 g_string_append_printf(s, "%-9s ", e->typename); 754 if (e->n_objs > 1) { 755 g_string_append_printf(s, "[%12u]", e->n_objs); 756 } else { 757 g_string_append_printf(s, "%14p", e->obj); 758 } 759 g_string_append_printf(s, " %s%*s %13.5f %12" PRIu64 " %12.2f\n", 760 e->callsite_at, 761 callsite_len - (int)strlen(e->callsite_at), "", 762 e->time_s, e->n_acqs, e->ns_avg * 1e-3); 763 pr(f, "%s", s->str); 764 g_string_free(s, TRUE); 765 } 766 767 pr(f, "%s\n", dashes); 768 g_free(dashes); 769 } 770 771 static void report_destroy(QSPReport *rep) 772 { 773 size_t i; 774 775 for (i = 0; i < rep->n_entries; i++) { 776 QSPReportEntry *e = &rep->entries[i]; 777 778 g_free(e->callsite_at); 779 } 780 g_free(rep->entries); 781 } 782 783 void qsp_report(FILE *f, fprintf_function cpu_fprintf, size_t max, 784 enum QSPSortBy sort_by, bool callsite_coalesce) 785 { 786 GTree *tree = g_tree_new_full(qsp_tree_cmp, &sort_by, g_free, NULL); 787 QSPReport rep; 788 789 qsp_init(); 790 791 rep.entries = g_new0(QSPReportEntry, max); 792 rep.n_entries = 0; 793 rep.max_n_entries = max; 794 795 qsp_mktree(tree, callsite_coalesce); 796 g_tree_foreach(tree, qsp_tree_report, &rep); 797 g_tree_destroy(tree); 798 799 pr_report(&rep, f, cpu_fprintf); 800 report_destroy(&rep); 801 } 802 803 static void qsp_snapshot_destroy(QSPSnapshot *snap) 804 { 805 qht_iter(&snap->ht, qsp_ht_delete, NULL); 806 qht_destroy(&snap->ht); 807 g_free(snap); 808 } 809 810 void qsp_reset(void) 811 { 812 QSPSnapshot *new = g_new(QSPSnapshot, 1); 813 QSPSnapshot *old; 814 815 qsp_init(); 816 817 qht_init(&new->ht, qsp_entry_cmp, QSP_INITIAL_SIZE, 818 QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES); 819 820 /* take a snapshot of the current state */ 821 qht_iter(&qsp_ht, qsp_aggregate, &new->ht); 822 823 /* replace the previous snapshot, if any */ 824 old = atomic_xchg(&qsp_snapshot, new); 825 if (old) { 826 call_rcu(old, qsp_snapshot_destroy, rcu); 827 } 828 } 829