/*
 * qsp.c - QEMU Synchronization Profiler
 *
 * Copyright (C) 2018, Emilio G. Cota <cota@braap.org>
 *
 * License: GNU GPL, version 2 or later.
 *   See the COPYING file in the top-level directory.
 *
 * QSP profiles the time spent in synchronization primitives, which can
 * help diagnose performance problems, e.g. scalability issues when
 * contention is high.
 *
 * The primitives currently supported are mutexes, recursive mutexes and
 * condition variables. Note that not all related functions are intercepted;
 * instead we profile only those functions that can have a performance impact,
 * either due to blocking (e.g. cond_wait, mutex_lock) or cache line
 * contention (e.g. mutex_lock, mutex_trylock).
 *
 * QSP's design focuses on speed and scalability. This is achieved
 * by having threads do their profiling entirely on thread-local data.
 * The appropriate thread-local data is found via a QHT, i.e. a concurrent hash
 * table. To aggregate data in order to generate a report, we iterate over
 * all entries in the hash table. Depending on the number of threads and
 * synchronization objects this might be expensive, but note that it is
 * very rarely called -- reports are generated only when requested by users.
 *
 * Reports are generated as a table where each row represents a call site. A
 * call site is the triplet formed by the __FILE__ and __LINE__ of the caller
 * as well as the address of the "object" (i.e. mutex, rec. mutex or condvar)
 * being operated on. Optionally, call sites that operate on different objects
 * of the same type can be coalesced, which can be particularly useful when
 * profiling dynamically-allocated objects.
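 *
 * For illustration, a report looks like this (the layout matches pr_report()
 * below; the call site and values in this row are made up):
 *
 *  Type    Object          Call site       Wait Time (s)  Count  Average (us)
 *  ---------------------------------------------------------------------------
 *  mutex   0x55d04aa01c00  util/foo.c:123        1.61423  98765         16.35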
 *
 * Alternative designs considered:
 *
 * - Use an off-the-shelf profiler such as mutrace. This is not a viable option
 *   for us because QEMU has __malloc_hook set (by one of the libraries it
 *   uses); leaving this hook unset is required to avoid deadlock in mutrace.
 *
 * - Use a glib HT for each thread, protecting each HT with its own lock.
 *   This isn't simpler than the current design, and is 10% slower in the
 *   atomic_add-bench microbenchmark (-m option).
 *
 * - For reports, just use a binary tree as we aggregate data, instead of having
 *   an intermediate hash table. This would simplify the code only slightly, but
 *   would perform badly if there were many threads and objects to track.
 *
 * - Wrap operations on qsp entries with RCU read-side critical sections, so
 *   that qsp_reset() can delete entries. Unfortunately, the overhead of calling
 *   rcu_read_lock/unlock slows down atomic_add-bench -m by 24%. Having
 *   a snapshot that is updated on qsp_reset() avoids this overhead.
 *
 * Related Work:
 * - Lennart Poettering's mutrace: http://0pointer.de/blog/projects/mutrace.html
 * - Lozi, David, Thomas, Lawall and Muller. "Remote Core Locking: Migrating
 *   Critical-Section Execution to Improve the Performance of Multithreaded
 *   Applications", USENIX ATC'12.
 */
#include "qemu/osdep.h"
#include "qemu/thread.h"
#include "qemu/timer.h"
#include "qemu/qht.h"
#include "qemu/rcu.h"
#include "exec/tb-hash-xx.h"
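
/*
 * A sketch of typical usage (the call sequence is illustrative; the entry
 * points are the ones defined at the bottom of this file):
 *
 *     qsp_enable();
 *     ... run the workload ...
 *     qsp_report(stderr, fprintf, 10, QSP_SORT_BY_TOTAL_WAIT_TIME, false);
 *     qsp_reset();
 *     ... after a reset, reports cover only subsequently-recorded activity ...
 *     qsp_disable();
 */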

enum QSPType {
    QSP_MUTEX,
    QSP_BQL_MUTEX,
    QSP_REC_MUTEX,
    QSP_CONDVAR,
};

struct QSPCallSite {
    const void *obj;
    const char *file; /* i.e. __FILE__; shortened later */
    int line;
    enum QSPType type;
};
typedef struct QSPCallSite QSPCallSite;

struct QSPEntry {
    void *thread_ptr;
    const QSPCallSite *callsite;
    uint64_t n_acqs;
    uint64_t ns;
    unsigned int n_objs; /* count of coalesced objs; only used for reporting */
#ifndef CONFIG_ATOMIC64
    /*
     * If we cannot update the counts atomically, then use a seqlock.
     * We don't need an associated lock because the updates are thread-local.
     */
    QemuSeqLock sequence;
#endif
};
typedef struct QSPEntry QSPEntry;

struct QSPSnapshot {
    struct rcu_head rcu;
    struct qht ht;
};
typedef struct QSPSnapshot QSPSnapshot;

/* initial sizing for hash tables */
#define QSP_INITIAL_SIZE 64

/* If this file is moved, QSP_REL_PATH should be updated accordingly */
#define QSP_REL_PATH "util/qsp.c"

/* length of __FILE__'s prefix; used to present call sites with relative paths */
static size_t qsp_qemu_path_len;

/* the address of qsp_thread gives us a unique 'thread ID' */
static __thread int qsp_thread;

/*
 * Call sites are the same for all threads, so we track them in a separate hash
 * table to save memory.
 */
static struct qht qsp_callsite_ht;

static struct qht qsp_ht;
static QSPSnapshot *qsp_snapshot;
static bool qsp_initialized, qsp_initializing;

static const char * const qsp_typenames[] = {
    [QSP_MUTEX]     = "mutex",
    [QSP_BQL_MUTEX] = "BQL mutex",
    [QSP_REC_MUTEX] = "rec_mutex",
    [QSP_CONDVAR]   = "condvar",
};

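/*
 * Enabling the profiler amounts to atomically flipping these pointers from
 * the uninstrumented *_impl defaults to the qsp_* wrappers defined below;
 * see qsp_enable() and qsp_disable().
 */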
QemuMutexLockFunc qemu_bql_mutex_lock_func = qemu_mutex_lock_impl;
QemuMutexLockFunc qemu_mutex_lock_func = qemu_mutex_lock_impl;
QemuMutexTrylockFunc qemu_mutex_trylock_func = qemu_mutex_trylock_impl;
QemuRecMutexLockFunc qemu_rec_mutex_lock_func = qemu_rec_mutex_lock_impl;
QemuRecMutexTrylockFunc qemu_rec_mutex_trylock_func =
    qemu_rec_mutex_trylock_impl;
QemuCondWaitFunc qemu_cond_wait_func = qemu_cond_wait_impl;

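/*
 * Entry hashes come in three flavors, matching the three aggregation levels
 * used below: per-thread (the global table), per-callsite across threads,
 * and per-callsite ignoring the object (for coalesced reports).
 */
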
/*
 * It pays off to _not_ hash callsite->file; hashing a string is slow, and
 * without it we still get a pretty unique hash.
 */
static inline
uint32_t do_qsp_callsite_hash(const QSPCallSite *callsite, uint64_t a)
{
    uint64_t b = (uint64_t)(uintptr_t)callsite->obj;
    uint32_t e = callsite->line;
    uint32_t f = callsite->type;

    return tb_hash_func7(a, b, e, f, 0);
}

static inline
uint32_t qsp_callsite_hash(const QSPCallSite *callsite)
{
    return do_qsp_callsite_hash(callsite, 0);
}

static inline uint32_t do_qsp_entry_hash(const QSPEntry *entry, uint64_t a)
{
    return do_qsp_callsite_hash(entry->callsite, a);
}

static uint32_t qsp_entry_hash(const QSPEntry *entry)
{
    return do_qsp_entry_hash(entry, (uint64_t)(uintptr_t)entry->thread_ptr);
}

static uint32_t qsp_entry_no_thread_hash(const QSPEntry *entry)
{
    return do_qsp_entry_hash(entry, 0);
}

/* without the object, we need to hash the file name to get a decent hash */
static uint32_t qsp_entry_no_thread_obj_hash(const QSPEntry *entry)
{
    const QSPCallSite *callsite = entry->callsite;
    uint64_t a = g_str_hash(callsite->file);
    uint64_t b = callsite->line;
    uint32_t e = callsite->type;

    return tb_hash_func7(a, b, e, 0, 0);
}

static bool qsp_callsite_cmp(const void *ap, const void *bp)
{
    const QSPCallSite *a = ap;
    const QSPCallSite *b = bp;

    return a == b ||
        (a->obj == b->obj &&
         a->line == b->line &&
         a->type == b->type &&
         (a->file == b->file || !strcmp(a->file, b->file)));
}

static bool qsp_callsite_no_obj_cmp(const void *ap, const void *bp)
{
    const QSPCallSite *a = ap;
    const QSPCallSite *b = bp;

    return a == b ||
        (a->line == b->line &&
         a->type == b->type &&
         (a->file == b->file || !strcmp(a->file, b->file)));
}

static bool qsp_entry_no_thread_cmp(const void *ap, const void *bp)
{
    const QSPEntry *a = ap;
    const QSPEntry *b = bp;

    return qsp_callsite_cmp(a->callsite, b->callsite);
}

static bool qsp_entry_no_thread_obj_cmp(const void *ap, const void *bp)
{
    const QSPEntry *a = ap;
    const QSPEntry *b = bp;

    return qsp_callsite_no_obj_cmp(a->callsite, b->callsite);
}

static bool qsp_entry_cmp(const void *ap, const void *bp)
{
    const QSPEntry *a = ap;
    const QSPEntry *b = bp;

    return a->thread_ptr == b->thread_ptr &&
        qsp_callsite_cmp(a->callsite, b->callsite);
}

/*
 * Normally we'd call this from a constructor function, but we want it to work
 * via libutil as well.
 */
static void qsp_do_init(void)
{
    /* make sure this file's path in the tree is up to date with QSP_REL_PATH */
    g_assert(strstr(__FILE__, QSP_REL_PATH));
    qsp_qemu_path_len = strlen(__FILE__) - strlen(QSP_REL_PATH);

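    /*
     * QHT_MODE_RAW_MUTEXES keeps the tables' internal locks out of the
     * profiler; without it, the profiler's own tables would recurse into it.
     */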
    qht_init(&qsp_ht, qsp_entry_cmp, QSP_INITIAL_SIZE,
             QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
    qht_init(&qsp_callsite_ht, qsp_callsite_cmp, QSP_INITIAL_SIZE,
             QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
}

static __attribute__((noinline)) void qsp_init__slowpath(void)
{
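    /*
     * Exactly one thread wins the cmpxchg and performs the initialization;
     * concurrent losers spin until the winner flips qsp_initialized.
     */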
    if (atomic_cmpxchg(&qsp_initializing, false, true) == false) {
        qsp_do_init();
        atomic_set(&qsp_initialized, true);
    } else {
        while (!atomic_read(&qsp_initialized)) {
            cpu_relax();
        }
    }
}

/* qsp_init() must be called from _all_ exported functions */
static inline void qsp_init(void)
{
    if (likely(atomic_read(&qsp_initialized))) {
        return;
    }
    qsp_init__slowpath();
}

static QSPCallSite *qsp_callsite_find(const QSPCallSite *orig)
{
    QSPCallSite *callsite;
    uint32_t hash;

    hash = qsp_callsite_hash(orig);
    callsite = qht_lookup(&qsp_callsite_ht, orig, hash);
    if (callsite == NULL) {
        void *existing = NULL;

        callsite = g_new(QSPCallSite, 1);
        memcpy(callsite, orig, sizeof(*callsite));
        qht_insert(&qsp_callsite_ht, callsite, hash, &existing);
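        /* a concurrent thread may have beaten us to inserting this callsite */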
        if (unlikely(existing)) {
            g_free(callsite);
            callsite = existing;
        }
    }
    return callsite;
}

static QSPEntry *
qsp_entry_create(struct qht *ht, const QSPEntry *entry, uint32_t hash)
{
    QSPEntry *e;
    void *existing = NULL;

    e = g_new0(QSPEntry, 1);
    e->thread_ptr = entry->thread_ptr;
    e->callsite = qsp_callsite_find(entry->callsite);

    qht_insert(ht, e, hash, &existing);
    if (unlikely(existing)) {
        g_free(e);
        e = existing;
    }
    return e;
}

static QSPEntry *
qsp_entry_find(struct qht *ht, const QSPEntry *entry, uint32_t hash)
{
    QSPEntry *e;

    e = qht_lookup(ht, entry, hash);
    if (e == NULL) {
        e = qsp_entry_create(ht, entry, hash);
    }
    return e;
}

/*
 * Note: Entries are never removed, so callers do not have to be in an RCU
 * read-side critical section.
 */
static QSPEntry *qsp_entry_get(const void *obj, const char *file, int line,
                               enum QSPType type)
{
    QSPCallSite callsite = {
        .obj = obj,
        .file = file,
        .line = line,
        .type = type,
    };
    QSPEntry orig;
    uint32_t hash;

    qsp_init();

    orig.thread_ptr = &qsp_thread;
    orig.callsite = &callsite;

    hash = qsp_entry_hash(&orig);
    return qsp_entry_find(&qsp_ht, &orig, hash);
}

/*
 * @from is in the global hash table; read it atomically if the host
 * supports it, otherwise use the seqlock.
 */
static void qsp_entry_aggregate(QSPEntry *to, const QSPEntry *from)
{
#ifdef CONFIG_ATOMIC64
    to->ns += atomic_read__nocheck(&from->ns);
    to->n_acqs += atomic_read__nocheck(&from->n_acqs);
#else
    unsigned int version;
    uint64_t ns, n_acqs;

    do {
        version = seqlock_read_begin(&from->sequence);
        ns = atomic_read__nocheck(&from->ns);
        n_acqs = atomic_read__nocheck(&from->n_acqs);
    } while (seqlock_read_retry(&from->sequence, version));

    to->ns += ns;
    to->n_acqs += n_acqs;
#endif
}

/*
 * @e is in the global hash table; it is only written to by the current thread,
 * so we write to it atomically (as in "write once") to prevent torn reads.
 * If the host doesn't support u64 atomics, use the seqlock.
 */
static inline void do_qsp_entry_record(QSPEntry *e, int64_t delta, bool acq)
{
#ifndef CONFIG_ATOMIC64
    seqlock_write_begin(&e->sequence);
#endif
    atomic_set__nocheck(&e->ns, e->ns + delta);
    if (acq) {
        atomic_set__nocheck(&e->n_acqs, e->n_acqs + 1);
    }
#ifndef CONFIG_ATOMIC64
    seqlock_write_end(&e->sequence);
#endif
}

static inline void qsp_entry_record(QSPEntry *e, int64_t delta)
{
    do_qsp_entry_record(e, delta, true);
}

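/*
 * Generate the instrumented wrappers: time the underlying *_impl call and
 * charge the elapsed time to the caller's thread-local entry. The RET1
 * variant counts an acquisition only when the call succeeds, i.e. when the
 * trylock returns 0.
 */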
#define QSP_GEN_VOID(type_, qsp_t_, func_, impl_)                       \
    static void func_(type_ *obj, const char *file, int line)           \
    {                                                                   \
        QSPEntry *e;                                                    \
        int64_t t0, t1;                                                 \
                                                                        \
        t0 = get_clock();                                               \
        impl_(obj, file, line);                                         \
        t1 = get_clock();                                               \
                                                                        \
        e = qsp_entry_get(obj, file, line, qsp_t_);                     \
        qsp_entry_record(e, t1 - t0);                                   \
    }

#define QSP_GEN_RET1(type_, qsp_t_, func_, impl_)                       \
    static int func_(type_ *obj, const char *file, int line)            \
    {                                                                   \
        QSPEntry *e;                                                    \
        int64_t t0, t1;                                                 \
        int err;                                                        \
                                                                        \
        t0 = get_clock();                                               \
        err = impl_(obj, file, line);                                   \
        t1 = get_clock();                                               \
                                                                        \
        e = qsp_entry_get(obj, file, line, qsp_t_);                     \
        do_qsp_entry_record(e, t1 - t0, !err);                          \
        return err;                                                     \
    }

QSP_GEN_VOID(QemuMutex, QSP_BQL_MUTEX, qsp_bql_mutex_lock, qemu_mutex_lock_impl)
QSP_GEN_VOID(QemuMutex, QSP_MUTEX, qsp_mutex_lock, qemu_mutex_lock_impl)
QSP_GEN_RET1(QemuMutex, QSP_MUTEX, qsp_mutex_trylock, qemu_mutex_trylock_impl)

QSP_GEN_VOID(QemuRecMutex, QSP_REC_MUTEX, qsp_rec_mutex_lock,
             qemu_rec_mutex_lock_impl)
QSP_GEN_RET1(QemuRecMutex, QSP_REC_MUTEX, qsp_rec_mutex_trylock,
             qemu_rec_mutex_trylock_impl)

#undef QSP_GEN_RET1
#undef QSP_GEN_VOID

static void
qsp_cond_wait(QemuCond *cond, QemuMutex *mutex, const char *file, int line)
{
    QSPEntry *e;
    int64_t t0, t1;

    t0 = get_clock();
    qemu_cond_wait_impl(cond, mutex, file, line);
    t1 = get_clock();

    e = qsp_entry_get(cond, file, line, QSP_CONDVAR);
    qsp_entry_record(e, t1 - t0);
}

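/*
 * qsp_enable() and qsp_disable() flip all six pointers together, so checking
 * a single one is enough to tell whether profiling is enabled.
 */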
bool qsp_is_enabled(void)
{
    return atomic_read(&qemu_mutex_lock_func) == qsp_mutex_lock;
}

void qsp_enable(void)
{
    atomic_set(&qemu_mutex_lock_func, qsp_mutex_lock);
    atomic_set(&qemu_mutex_trylock_func, qsp_mutex_trylock);
    atomic_set(&qemu_bql_mutex_lock_func, qsp_bql_mutex_lock);
    atomic_set(&qemu_rec_mutex_lock_func, qsp_rec_mutex_lock);
    atomic_set(&qemu_rec_mutex_trylock_func, qsp_rec_mutex_trylock);
    atomic_set(&qemu_cond_wait_func, qsp_cond_wait);
}

void qsp_disable(void)
{
    atomic_set(&qemu_mutex_lock_func, qemu_mutex_lock_impl);
    atomic_set(&qemu_mutex_trylock_func, qemu_mutex_trylock_impl);
    atomic_set(&qemu_bql_mutex_lock_func, qemu_mutex_lock_impl);
    atomic_set(&qemu_rec_mutex_lock_func, qemu_rec_mutex_lock_impl);
    atomic_set(&qemu_rec_mutex_trylock_func, qemu_rec_mutex_trylock_impl);
    atomic_set(&qemu_cond_wait_func, qemu_cond_wait_impl);
}

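/*
 * GTree requires a total order over keys: after comparing the sort metric,
 * ties are broken by object, file, line and type so that no two distinct
 * entries ever compare equal; g_tree_insert() would otherwise overwrite one
 * of them.
 */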
static gint qsp_tree_cmp(gconstpointer ap, gconstpointer bp, gpointer up)
{
    const QSPEntry *a = ap;
    const QSPEntry *b = bp;
    enum QSPSortBy sort_by = *(enum QSPSortBy *)up;
    const QSPCallSite *ca;
    const QSPCallSite *cb;

    switch (sort_by) {
    case QSP_SORT_BY_TOTAL_WAIT_TIME:
        if (a->ns > b->ns) {
            return -1;
        } else if (a->ns < b->ns) {
            return 1;
        }
        break;
    case QSP_SORT_BY_AVG_WAIT_TIME:
    {
        double avg_a = a->n_acqs ? (double)a->ns / a->n_acqs : 0;
        double avg_b = b->n_acqs ? (double)b->ns / b->n_acqs : 0;

        if (avg_a > avg_b) {
            return -1;
        } else if (avg_a < avg_b) {
            return 1;
        }
        break;
    }
    default:
        g_assert_not_reached();
    }

    ca = a->callsite;
    cb = b->callsite;
    /* Break the tie with the object's address */
    if (ca->obj < cb->obj) {
        return -1;
    } else if (ca->obj > cb->obj) {
        return 1;
    } else {
        int cmp;

        /* same obj. Break the tie with the callsite's file */
        cmp = strcmp(ca->file, cb->file);
        if (cmp) {
            return cmp;
        }
        /* same callsite file. Break the tie with the callsite's line */
        if (ca->line < cb->line) {
            return -1;
        } else if (ca->line > cb->line) {
            return 1;
        } else {
            /* same line. Break the tie with the callsite's type */
            return cb->type - ca->type;
        }
    }
}

static void qsp_sort(struct qht *ht, void *p, uint32_t h, void *userp)
{
    QSPEntry *e = p;
    GTree *tree = userp;

    g_tree_insert(tree, e, NULL);
}

static void qsp_aggregate(struct qht *global_ht, void *p, uint32_t h, void *up)
{
    struct qht *ht = up;
    const QSPEntry *e = p;
    QSPEntry *agg;
    uint32_t hash;

    hash = qsp_entry_no_thread_hash(e);
    agg = qsp_entry_find(ht, e, hash);
    qsp_entry_aggregate(agg, e);
}

static void qsp_iter_diff(struct qht *orig, void *p, uint32_t hash, void *htp)
{
    struct qht *ht = htp;
    QSPEntry *old = p;
    QSPEntry *new;

    new = qht_lookup(ht, old, hash);
    /* entries are never deleted, so we must have this one */
    g_assert(new != NULL);
    /* our reading of the stats happened after the snapshot was taken */
    g_assert(new->n_acqs >= old->n_acqs);
    g_assert(new->ns >= old->ns);

    new->n_acqs -= old->n_acqs;
    new->ns -= old->ns;

    /* No point in reporting an empty entry */
    if (new->n_acqs == 0 && new->ns == 0) {
        bool removed = qht_remove(ht, new, hash);

        g_assert(removed);
        g_free(new);
    }
}

static void qsp_diff(struct qht *orig, struct qht *new)
{
    qht_iter(orig, qsp_iter_diff, new);
}

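/*
 * Merge entries that differ only in the object operated on. The input table
 * has one entry per (obj, file, line, type) tuple, so every entry whose obj
 * differs from the representative's is a distinct object, which makes the
 * n_objs count exact.
 */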
static void
qsp_iter_callsite_coalesce(struct qht *orig, void *p, uint32_t h, void *htp)
{
    struct qht *ht = htp;
    QSPEntry *old = p;
    QSPEntry *e;
    uint32_t hash;

    hash = qsp_entry_no_thread_obj_hash(old);
    e = qht_lookup(ht, old, hash);
    if (e == NULL) {
        e = qsp_entry_create(ht, old, hash);
        e->n_objs = 1;
    } else if (e->callsite->obj != old->callsite->obj) {
        e->n_objs++;
    }
    e->ns += old->ns;
    e->n_acqs += old->n_acqs;
}

static void qsp_ht_delete(struct qht *ht, void *p, uint32_t h, void *htp)
{
    g_free(p);
}

static void qsp_mktree(GTree *tree, bool callsite_coalesce)
{
    QSPSnapshot *snap;
    struct qht ht, coalesce_ht;
    struct qht *htp;

    /*
     * First, see if there's a prior snapshot, so that we read the global hash
     * table _after_ the snapshot has been created, which guarantees that
     * the entries we'll read will be a superset of the snapshot's entries.
     *
     * We must remain in an RCU read-side critical section until we're done
     * with the snapshot.
     */
    rcu_read_lock();
    snap = atomic_rcu_read(&qsp_snapshot);

    /* Aggregate all results from the global hash table into a local one */
    qht_init(&ht, qsp_entry_no_thread_cmp, QSP_INITIAL_SIZE,
             QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
    qht_iter(&qsp_ht, qsp_aggregate, &ht);

    /* compute the difference wrt the snapshot, if any */
    if (snap) {
        qsp_diff(&snap->ht, &ht);
    }
    /* done with the snapshot; RCU can reclaim it */
    rcu_read_unlock();

    htp = &ht;
    if (callsite_coalesce) {
        qht_init(&coalesce_ht, qsp_entry_no_thread_obj_cmp, QSP_INITIAL_SIZE,
                 QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
        qht_iter(&ht, qsp_iter_callsite_coalesce, &coalesce_ht);

        /* free the previous hash table, and point htp to coalesce_ht */
        qht_iter(&ht, qsp_ht_delete, NULL);
        qht_destroy(&ht);
        htp = &coalesce_ht;
    }

    /* sort the hash table elements by using a tree */
    qht_iter(htp, qsp_sort, tree);

    /* free the hash table, but keep the elements (those are in the tree now) */
    qht_destroy(htp);
}

/* the caller must free the returned string with g_free */
static char *qsp_at(const QSPCallSite *callsite)
{
    GString *s = g_string_new(NULL);
    const char *shortened;

    /* remove the absolute path to qemu */
    if (unlikely(strlen(callsite->file) < qsp_qemu_path_len)) {
        shortened = callsite->file;
    } else {
        shortened = callsite->file + qsp_qemu_path_len;
    }
    g_string_append_printf(s, "%s:%u", shortened, callsite->line);
    return g_string_free(s, FALSE);
}

struct QSPReportEntry {
    const void *obj;
    char *callsite_at;
    const char *typename;
    double time_s;
    double ns_avg;
    uint64_t n_acqs;
    unsigned int n_objs;
};
typedef struct QSPReportEntry QSPReportEntry;

struct QSPReport {
    QSPReportEntry *entries;
    size_t n_entries;
    size_t max_n_entries;
};
typedef struct QSPReport QSPReport;

static gboolean qsp_tree_report(gpointer key, gpointer value, gpointer udata)
{
    const QSPEntry *e = key;
    QSPReport *report = udata;
    QSPReportEntry *entry;

    if (report->n_entries == report->max_n_entries) {
        return TRUE;
    }
    entry = &report->entries[report->n_entries];
    report->n_entries++;

    entry->obj = e->callsite->obj;
    entry->n_objs = e->n_objs;
    entry->callsite_at = qsp_at(e->callsite);
    entry->typename = qsp_typenames[e->callsite->type];
    entry->time_s = e->ns * 1e-9;
    entry->n_acqs = e->n_acqs;
    entry->ns_avg = e->n_acqs ? (double)e->ns / e->n_acqs : 0;
    return FALSE;
}

static void
pr_report(const QSPReport *rep, FILE *f, fprintf_function pr)
{
    char *dashes;
    size_t max_len = 0;
    int callsite_len = 0;
    int callsite_rspace;
    int n_dashes;
    size_t i;

    /* find out the maximum length of all 'callsite' fields */
    for (i = 0; i < rep->n_entries; i++) {
        const QSPReportEntry *e = &rep->entries[i];
        size_t len = strlen(e->callsite_at);

        if (len > max_len) {
            max_len = len;
        }
    }

    callsite_len = MAX(max_len, strlen("Call site"));
    /* white space to leave to the right of "Call site" */
    callsite_rspace = callsite_len - strlen("Call site");

    pr(f, "Type               Object  Call site%*s  Wait Time (s)  "
       "       Count  Average (us)\n", callsite_rspace, "");

    /* build a horizontal rule with dashes */
    n_dashes = 79 + callsite_rspace;
    dashes = g_malloc(n_dashes + 1);
    memset(dashes, '-', n_dashes);
    dashes[n_dashes] = '\0';
    pr(f, "%s\n", dashes);

    for (i = 0; i < rep->n_entries; i++) {
        const QSPReportEntry *e = &rep->entries[i];
        GString *s = g_string_new(NULL);

        g_string_append_printf(s, "%-9s  ", e->typename);
        if (e->n_objs > 1) {
            g_string_append_printf(s, "[%12u]", e->n_objs);
        } else {
            g_string_append_printf(s, "%14p", e->obj);
        }
        g_string_append_printf(s, "  %s%*s  %13.5f  %12" PRIu64 "  %12.2f\n",
                               e->callsite_at,
                               callsite_len - (int)strlen(e->callsite_at), "",
                               e->time_s, e->n_acqs, e->ns_avg * 1e-3);
        pr(f, "%s", s->str);
        g_string_free(s, TRUE);
    }

    pr(f, "%s\n", dashes);
    g_free(dashes);
}

static void report_destroy(QSPReport *rep)
{
    size_t i;

    for (i = 0; i < rep->n_entries; i++) {
        QSPReportEntry *e = &rep->entries[i];

        g_free(e->callsite_at);
    }
    g_free(rep->entries);
}

void qsp_report(FILE *f, fprintf_function cpu_fprintf, size_t max,
                enum QSPSortBy sort_by, bool callsite_coalesce)
{
    GTree *tree = g_tree_new_full(qsp_tree_cmp, &sort_by, g_free, NULL);
    QSPReport rep;

    qsp_init();

    rep.entries = g_new0(QSPReportEntry, max);
    rep.n_entries = 0;
    rep.max_n_entries = max;

    qsp_mktree(tree, callsite_coalesce);
    g_tree_foreach(tree, qsp_tree_report, &rep);
    g_tree_destroy(tree);

    pr_report(&rep, f, cpu_fprintf);
    report_destroy(&rep);
}

static void qsp_snapshot_destroy(QSPSnapshot *snap)
{
    qht_iter(&snap->ht, qsp_ht_delete, NULL);
    qht_destroy(&snap->ht);
    g_free(snap);
}

void qsp_reset(void)
{
    QSPSnapshot *new = g_new(QSPSnapshot, 1);
    QSPSnapshot *old;

    qsp_init();

    qht_init(&new->ht, qsp_entry_cmp, QSP_INITIAL_SIZE,
             QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);

    /* take a snapshot of the current state */
    qht_iter(&qsp_ht, qsp_aggregate, &new->ht);

    /* replace the previous snapshot, if any */
    old = atomic_xchg(&qsp_snapshot, new);
    if (old) {
        call_rcu(old, qsp_snapshot_destroy, rcu);
    }
}
829