1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2023 Red Hat Inc, Daniel Bristot de Oliveira <bristot@kernel.org>
4  */
5 
6 #include <stdlib.h>
7 #include <errno.h>
8 #include "utils.h"
9 #include "osnoise.h"
10 #include "timerlat.h"
11 #include <unistd.h>
12 
13 enum timelat_state {
14 	TIMERLAT_INIT = 0,
15 	TIMERLAT_WAITING_IRQ,
16 	TIMERLAT_WAITING_THREAD,
17 };
18 
19 /* Used to fill spaces in the output */
20 static const char *spaces  = "                                                         ";
21 
22 #define MAX_COMM		24
23 
24 /*
25  * Per-cpu data statistics and data.
26  */
27 struct timerlat_aa_data {
28 	/* Current CPU state */
29 	int			curr_state;
30 
31 	/* timerlat IRQ latency */
32 	unsigned long long	tlat_irq_seqnum;
33 	unsigned long long	tlat_irq_latency;
34 	unsigned long long	tlat_irq_timstamp;
35 
36 	/* timerlat Thread latency */
37 	unsigned long long	tlat_thread_seqnum;
38 	unsigned long long	tlat_thread_latency;
39 	unsigned long long	tlat_thread_timstamp;
40 
41 	/*
42 	 * Information about the thread running when the IRQ
43 	 * arrived.
44 	 *
45 	 * This can be blocking or interference, depending on the
46 	 * priority of the thread. Assuming timerlat is the highest
47 	 * prio, it is blocking. If timerlat has a lower prio, it is
48 	 * interference.
49 	 * note: "unsigned long long" because they are fetch using tep_get_field_val();
50 	 */
51 	unsigned long long	run_thread_pid;
52 	char			run_thread_comm[MAX_COMM];
53 	unsigned long long	thread_blocking_duration;
54 	unsigned long long	max_exit_idle_latency;
55 
56 	/* Information about the timerlat timer irq */
57 	unsigned long long	timer_irq_start_time;
58 	unsigned long long	timer_irq_start_delay;
59 	unsigned long long	timer_irq_duration;
60 	unsigned long long	timer_exit_from_idle;
61 
62 	/*
63 	 * Information about the last IRQ before the timerlat irq
64 	 * arrived.
65 	 *
66 	 * If now - timestamp is <= latency, it might have influenced
67 	 * in the timerlat irq latency. Otherwise, ignore it.
68 	 */
69 	unsigned long long	prev_irq_duration;
70 	unsigned long long	prev_irq_timstamp;
71 
72 	/*
73 	 * Interference sum.
74 	 */
75 	unsigned long long	thread_nmi_sum;
76 	unsigned long long	thread_irq_sum;
77 	unsigned long long	thread_softirq_sum;
78 	unsigned long long	thread_thread_sum;
79 
80 	/*
81 	 * Interference task information.
82 	 */
83 	struct trace_seq	*prev_irqs_seq;
84 	struct trace_seq	*nmi_seq;
85 	struct trace_seq	*irqs_seq;
86 	struct trace_seq	*softirqs_seq;
87 	struct trace_seq	*threads_seq;
88 	struct trace_seq	*stack_seq;
89 
90 	/*
91 	 * Current thread.
92 	 */
93 	char			current_comm[MAX_COMM];
94 	unsigned long long	current_pid;
95 
96 	/*
97 	 * Is the system running a kworker?
98 	 */
99 	unsigned long long	kworker;
100 	unsigned long long	kworker_func;
101 };
102 
103 /*
104  * The analysis context and system wide view
105  */
106 struct timerlat_aa_context {
107 	int nr_cpus;
108 	int dump_tasks;
109 
110 	/* per CPU data */
111 	struct timerlat_aa_data *taa_data;
112 
113 	/*
114 	 * required to translate function names and register
115 	 * events.
116 	 */
117 	struct osnoise_tool *tool;
118 };
119 
120 /*
121  * The data is stored as a local variable, but accessed via a helper function.
122  *
123  * It could be stored inside the trace context. But every access would
124  * require container_of() + a series of pointers. Do we need it? Not sure.
125  *
126  * For now keep it simple. If needed, store it in the tool, add the *context
127  * as a parameter in timerlat_aa_get_ctx() and do the magic there.
128  */
129 static struct timerlat_aa_context *__timerlat_aa_ctx;
130 
timerlat_aa_get_ctx(void)131 static struct timerlat_aa_context *timerlat_aa_get_ctx(void)
132 {
133 	return __timerlat_aa_ctx;
134 }
135 
136 /*
137  * timerlat_aa_get_data - Get the per-cpu data from the timerlat context
138  */
139 static struct timerlat_aa_data
timerlat_aa_get_data(struct timerlat_aa_context * taa_ctx,int cpu)140 *timerlat_aa_get_data(struct timerlat_aa_context *taa_ctx, int cpu)
141 {
142 	return &taa_ctx->taa_data[cpu];
143 }
144 
145 /*
146  * timerlat_aa_irq_latency - Handles timerlat IRQ event
147  */
timerlat_aa_irq_latency(struct timerlat_aa_data * taa_data,struct trace_seq * s,struct tep_record * record,struct tep_event * event)148 static int timerlat_aa_irq_latency(struct timerlat_aa_data *taa_data,
149 				   struct trace_seq *s, struct tep_record *record,
150 				   struct tep_event *event)
151 {
152 	/*
153 	 * For interference, we start now looking for things that can delay
154 	 * the thread.
155 	 */
156 	taa_data->curr_state = TIMERLAT_WAITING_THREAD;
157 	taa_data->tlat_irq_timstamp = record->ts;
158 
159 	/*
160 	 * Zero values.
161 	 */
162 	taa_data->thread_nmi_sum = 0;
163 	taa_data->thread_irq_sum = 0;
164 	taa_data->thread_softirq_sum = 0;
165 	taa_data->thread_thread_sum = 0;
166 	taa_data->thread_blocking_duration = 0;
167 	taa_data->timer_irq_start_time = 0;
168 	taa_data->timer_irq_duration = 0;
169 	taa_data->timer_exit_from_idle = 0;
170 
171 	/*
172 	 * Zero interference tasks.
173 	 */
174 	trace_seq_reset(taa_data->nmi_seq);
175 	trace_seq_reset(taa_data->irqs_seq);
176 	trace_seq_reset(taa_data->softirqs_seq);
177 	trace_seq_reset(taa_data->threads_seq);
178 
179 	/* IRQ latency values */
180 	tep_get_field_val(s, event, "timer_latency", record, &taa_data->tlat_irq_latency, 1);
181 	tep_get_field_val(s, event, "seqnum", record, &taa_data->tlat_irq_seqnum, 1);
182 
183 	/* The thread that can cause blocking */
184 	tep_get_common_field_val(s, event, "common_pid", record, &taa_data->run_thread_pid, 1);
185 
186 	/*
187 	 * Get exit from idle case.
188 	 *
189 	 * If it is not idle thread:
190 	 */
191 	if (taa_data->run_thread_pid)
192 		return 0;
193 
194 	/*
195 	 * if the latency is shorter than the known exit from idle:
196 	 */
197 	if (taa_data->tlat_irq_latency < taa_data->max_exit_idle_latency)
198 		return 0;
199 
200 	/*
201 	 * To be safe, ignore the cases in which an IRQ/NMI could have
202 	 * interfered with the timerlat IRQ.
203 	 */
204 	if (taa_data->tlat_irq_timstamp - taa_data->tlat_irq_latency
205 	    < taa_data->prev_irq_timstamp + taa_data->prev_irq_duration)
206 		return 0;
207 
208 	taa_data->max_exit_idle_latency = taa_data->tlat_irq_latency;
209 
210 	return 0;
211 }
212 
213 /*
214  * timerlat_aa_thread_latency - Handles timerlat thread event
215  */
timerlat_aa_thread_latency(struct timerlat_aa_data * taa_data,struct trace_seq * s,struct tep_record * record,struct tep_event * event)216 static int timerlat_aa_thread_latency(struct timerlat_aa_data *taa_data,
217 				      struct trace_seq *s, struct tep_record *record,
218 				      struct tep_event *event)
219 {
220 	/*
221 	 * For interference, we start now looking for things that can delay
222 	 * the IRQ of the next cycle.
223 	 */
224 	taa_data->curr_state = TIMERLAT_WAITING_IRQ;
225 	taa_data->tlat_thread_timstamp = record->ts;
226 
227 	/* Thread latency values */
228 	tep_get_field_val(s, event, "timer_latency", record, &taa_data->tlat_thread_latency, 1);
229 	tep_get_field_val(s, event, "seqnum", record, &taa_data->tlat_thread_seqnum, 1);
230 
231 	return 0;
232 }
233 
234 /*
235  * timerlat_aa_handler - Handle timerlat events
236  *
237  * This function is called to handle timerlat events recording statistics.
238  *
239  * Returns 0 on success, -1 otherwise.
240  */
timerlat_aa_handler(struct trace_seq * s,struct tep_record * record,struct tep_event * event,void * context)241 static int timerlat_aa_handler(struct trace_seq *s, struct tep_record *record,
242 			struct tep_event *event, void *context)
243 {
244 	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
245 	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);
246 	unsigned long long thread;
247 
248 	if (!taa_data)
249 		return -1;
250 
251 	tep_get_field_val(s, event, "context", record, &thread, 1);
252 	if (!thread)
253 		return timerlat_aa_irq_latency(taa_data, s, record, event);
254 	else
255 		return timerlat_aa_thread_latency(taa_data, s, record, event);
256 }
257 
258 /*
259  * timerlat_aa_nmi_handler - Handles NMI noise
260  *
261  * It is used to collect information about interferences from NMI. It is
262  * hooked to the osnoise:nmi_noise event.
263  */
timerlat_aa_nmi_handler(struct trace_seq * s,struct tep_record * record,struct tep_event * event,void * context)264 static int timerlat_aa_nmi_handler(struct trace_seq *s, struct tep_record *record,
265 				   struct tep_event *event, void *context)
266 {
267 	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
268 	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);
269 	unsigned long long duration;
270 	unsigned long long start;
271 
272 	tep_get_field_val(s, event, "duration", record, &duration, 1);
273 	tep_get_field_val(s, event, "start", record, &start, 1);
274 
275 	if (taa_data->curr_state == TIMERLAT_WAITING_IRQ) {
276 		taa_data->prev_irq_duration = duration;
277 		taa_data->prev_irq_timstamp = start;
278 
279 		trace_seq_reset(taa_data->prev_irqs_seq);
280 		trace_seq_printf(taa_data->prev_irqs_seq, "  %24s %.*s %9.2f us\n",
281 				 "nmi",
282 				 24, spaces,
283 				 ns_to_usf(duration));
284 		return 0;
285 	}
286 
287 	taa_data->thread_nmi_sum += duration;
288 	trace_seq_printf(taa_data->nmi_seq, "  %24s %.*s %9.2f us\n",
289 			 "nmi",
290 			 24, spaces, ns_to_usf(duration));
291 
292 	return 0;
293 }
294 
295 /*
296  * timerlat_aa_irq_handler - Handles IRQ noise
297  *
298  * It is used to collect information about interferences from IRQ. It is
299  * hooked to the osnoise:irq_noise event.
300  *
301  * It is a little bit more complex than the other because it measures:
302  *	- The IRQs that can delay the timer IRQ before it happened.
303  *	- The Timerlat IRQ handler
304  *	- The IRQs that happened between the timerlat IRQ and the timerlat thread
305  *	  (IRQ interference).
306  */
timerlat_aa_irq_handler(struct trace_seq * s,struct tep_record * record,struct tep_event * event,void * context)307 static int timerlat_aa_irq_handler(struct trace_seq *s, struct tep_record *record,
308 				   struct tep_event *event, void *context)
309 {
310 	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
311 	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);
312 	unsigned long long expected_start;
313 	unsigned long long duration;
314 	unsigned long long vector;
315 	unsigned long long start;
316 	char *desc;
317 	int val;
318 
319 	tep_get_field_val(s, event, "duration", record, &duration, 1);
320 	tep_get_field_val(s, event, "start", record, &start, 1);
321 	tep_get_field_val(s, event, "vector", record, &vector, 1);
322 	desc = tep_get_field_raw(s, event, "desc", record, &val, 1);
323 
324 	/*
325 	 * Before the timerlat IRQ.
326 	 */
327 	if (taa_data->curr_state == TIMERLAT_WAITING_IRQ) {
328 		taa_data->prev_irq_duration = duration;
329 		taa_data->prev_irq_timstamp = start;
330 
331 		trace_seq_reset(taa_data->prev_irqs_seq);
332 		trace_seq_printf(taa_data->prev_irqs_seq, "  %24s:%-3llu %.*s %9.2f us\n",
333 				 desc, vector,
334 				 15, spaces,
335 				 ns_to_usf(duration));
336 		return 0;
337 	}
338 
339 	/*
340 	 * The timerlat IRQ: taa_data->timer_irq_start_time is zeroed at
341 	 * the timerlat irq handler.
342 	 */
343 	if (!taa_data->timer_irq_start_time) {
344 		expected_start = taa_data->tlat_irq_timstamp - taa_data->tlat_irq_latency;
345 
346 		taa_data->timer_irq_start_time = start;
347 		taa_data->timer_irq_duration = duration;
348 
349 		/*
350 		 * We are dealing with two different clock sources: the
351 		 * external clock source that timerlat uses as a reference
352 		 * and the clock used by the tracer. There are also two
353 		 * moments: the time reading the clock and the timer in
354 		 * which the event is placed in the buffer (the trace
355 		 * event timestamp). If the processor is slow or there
356 		 * is some hardware noise, the difference between the
357 		 * timestamp and the external clock read can be longer
358 		 * than the IRQ handler delay, resulting in a negative
359 		 * time. If so, set IRQ start delay as 0. In the end,
360 		 * it is less relevant than the noise.
361 		 */
362 		if (expected_start < taa_data->timer_irq_start_time)
363 			taa_data->timer_irq_start_delay = taa_data->timer_irq_start_time - expected_start;
364 		else
365 			taa_data->timer_irq_start_delay = 0;
366 
367 		/*
368 		 * not exit from idle.
369 		 */
370 		if (taa_data->run_thread_pid)
371 			return 0;
372 
373 		if (expected_start > taa_data->prev_irq_timstamp + taa_data->prev_irq_duration)
374 			taa_data->timer_exit_from_idle = taa_data->timer_irq_start_delay;
375 
376 		return 0;
377 	}
378 
379 	/*
380 	 * IRQ interference.
381 	 */
382 	taa_data->thread_irq_sum += duration;
383 	trace_seq_printf(taa_data->irqs_seq, "  %24s:%-3llu %.*s %9.2f us\n",
384 			 desc, vector,
385 			 24, spaces,
386 			 ns_to_usf(duration));
387 
388 	return 0;
389 }
390 
391 static char *softirq_name[] = { "HI", "TIMER",	"NET_TX", "NET_RX", "BLOCK",
392 				"IRQ_POLL", "TASKLET", "SCHED", "HRTIMER", "RCU" };
393 
394 
395 /*
396  * timerlat_aa_softirq_handler - Handles Softirq noise
397  *
398  * It is used to collect information about interferences from Softirq. It is
399  * hooked to the osnoise:softirq_noise event.
400  *
401  * It is only printed in the non-rt kernel, as softirqs become thread on RT.
402  */
timerlat_aa_softirq_handler(struct trace_seq * s,struct tep_record * record,struct tep_event * event,void * context)403 static int timerlat_aa_softirq_handler(struct trace_seq *s, struct tep_record *record,
404 				       struct tep_event *event, void *context)
405 {
406 	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
407 	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);
408 	unsigned long long duration;
409 	unsigned long long vector;
410 	unsigned long long start;
411 
412 	if (taa_data->curr_state == TIMERLAT_WAITING_IRQ)
413 		return 0;
414 
415 	tep_get_field_val(s, event, "duration", record, &duration, 1);
416 	tep_get_field_val(s, event, "start", record, &start, 1);
417 	tep_get_field_val(s, event, "vector", record, &vector, 1);
418 
419 	taa_data->thread_softirq_sum += duration;
420 
421 	trace_seq_printf(taa_data->softirqs_seq, "  %24s:%-3llu %.*s %9.2f us\n",
422 			 softirq_name[vector], vector,
423 			 24, spaces,
424 			 ns_to_usf(duration));
425 	return 0;
426 }
427 
428 /*
429  * timerlat_aa_softirq_handler - Handles thread noise
430  *
431  * It is used to collect information about interferences from threads. It is
432  * hooked to the osnoise:thread_noise event.
433  *
434  * Note: if you see thread noise, your timerlat thread was not the highest prio one.
435  */
timerlat_aa_thread_handler(struct trace_seq * s,struct tep_record * record,struct tep_event * event,void * context)436 static int timerlat_aa_thread_handler(struct trace_seq *s, struct tep_record *record,
437 				      struct tep_event *event, void *context)
438 {
439 	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
440 	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);
441 	unsigned long long duration;
442 	unsigned long long start;
443 	unsigned long long pid;
444 	const char *comm;
445 	int val;
446 
447 	if (taa_data->curr_state == TIMERLAT_WAITING_IRQ)
448 		return 0;
449 
450 	tep_get_field_val(s, event, "duration", record, &duration, 1);
451 	tep_get_field_val(s, event, "start", record, &start, 1);
452 
453 	tep_get_common_field_val(s, event, "common_pid", record, &pid, 1);
454 	comm = tep_get_field_raw(s, event, "comm", record, &val, 1);
455 
456 	if (pid == taa_data->run_thread_pid && !taa_data->thread_blocking_duration) {
457 		taa_data->thread_blocking_duration = duration;
458 
459 		if (comm)
460 			strncpy(taa_data->run_thread_comm, comm, MAX_COMM);
461 		else
462 			sprintf(taa_data->run_thread_comm, "<...>");
463 
464 	} else {
465 		taa_data->thread_thread_sum += duration;
466 
467 		trace_seq_printf(taa_data->threads_seq, "  %24s:%-12llu %.*s %9.2f us\n",
468 				 comm, pid,
469 				 15, spaces,
470 				 ns_to_usf(duration));
471 	}
472 
473 	return 0;
474 }
475 
476 /*
477  * timerlat_aa_stack_handler - Handles timerlat IRQ stack trace
478  *
479  * Saves and parse the stack trace generated by the timerlat IRQ.
480  */
timerlat_aa_stack_handler(struct trace_seq * s,struct tep_record * record,struct tep_event * event,void * context)481 static int timerlat_aa_stack_handler(struct trace_seq *s, struct tep_record *record,
482 			      struct tep_event *event, void *context)
483 {
484 	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
485 	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);
486 	unsigned long *caller;
487 	const char *function;
488 	int val, i;
489 
490 	trace_seq_reset(taa_data->stack_seq);
491 
492 	trace_seq_printf(taa_data->stack_seq, "    Blocking thread stack trace\n");
493 	caller = tep_get_field_raw(s, event, "caller", record, &val, 1);
494 	if (caller) {
495 		for (i = 0; ; i++) {
496 			function = tep_find_function(taa_ctx->tool->trace.tep, caller[i]);
497 			if (!function)
498 				break;
499 			trace_seq_printf(taa_data->stack_seq, " %.*s -> %s\n",
500 					 14, spaces, function);
501 		}
502 	}
503 	return 0;
504 }
505 
506 /*
507  * timerlat_aa_sched_switch_handler - Tracks the current thread running on the CPU
508  *
509  * Handles the sched:sched_switch event to trace the current thread running on the
510  * CPU. It is used to display the threads running on the other CPUs when the trace
511  * stops.
512  */
timerlat_aa_sched_switch_handler(struct trace_seq * s,struct tep_record * record,struct tep_event * event,void * context)513 static int timerlat_aa_sched_switch_handler(struct trace_seq *s, struct tep_record *record,
514 					    struct tep_event *event, void *context)
515 {
516 	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
517 	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);
518 	const char *comm;
519 	int val;
520 
521 	tep_get_field_val(s, event, "next_pid", record, &taa_data->current_pid, 1);
522 	comm = tep_get_field_raw(s, event, "next_comm", record, &val, 1);
523 
524 	strncpy(taa_data->current_comm, comm, MAX_COMM);
525 
526 	/*
527 	 * If this was a kworker, clean the last kworkers that ran.
528 	 */
529 	taa_data->kworker = 0;
530 	taa_data->kworker_func = 0;
531 
532 	return 0;
533 }
534 
535 /*
536  * timerlat_aa_kworker_start_handler - Tracks a kworker running on the CPU
537  *
538  * Handles workqueue:workqueue_execute_start event, keeping track of
539  * the job that a kworker could be doing in the CPU.
540  *
541  * We already catch problems of hardware related latencies caused by work queues
542  * running driver code that causes hardware stall. For example, with DRM drivers.
543  */
timerlat_aa_kworker_start_handler(struct trace_seq * s,struct tep_record * record,struct tep_event * event,void * context)544 static int timerlat_aa_kworker_start_handler(struct trace_seq *s, struct tep_record *record,
545 					     struct tep_event *event, void *context)
546 {
547 	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
548 	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);
549 
550 	tep_get_field_val(s, event, "work", record, &taa_data->kworker, 1);
551 	tep_get_field_val(s, event, "function", record, &taa_data->kworker_func, 1);
552 	return 0;
553 }
554 
555 /*
556  * timerlat_thread_analysis - Prints the analysis of a CPU that hit a stop tracing
557  *
558  * This is the core of the analysis.
559  */
timerlat_thread_analysis(struct timerlat_aa_data * taa_data,int cpu,int irq_thresh,int thread_thresh)560 static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu,
561 				     int irq_thresh, int thread_thresh)
562 {
563 	long long exp_irq_ts;
564 	int total;
565 	int irq;
566 
567 	/*
568 	 * IRQ latency or Thread latency?
569 	 */
570 	if (taa_data->tlat_irq_seqnum > taa_data->tlat_thread_seqnum) {
571 		irq = 1;
572 		total = taa_data->tlat_irq_latency;
573 	} else {
574 		irq = 0;
575 		total = taa_data->tlat_thread_latency;
576 	}
577 
578 	/*
579 	 * Expected IRQ arrival time using the trace clock as the base.
580 	 *
581 	 * TODO: Add a list of previous IRQ, and then run the list backwards.
582 	 */
583 	exp_irq_ts = taa_data->timer_irq_start_time - taa_data->timer_irq_start_delay;
584 	if (exp_irq_ts < taa_data->prev_irq_timstamp + taa_data->prev_irq_duration) {
585 		if (taa_data->prev_irq_timstamp < taa_data->timer_irq_start_time)
586 			printf("  Previous IRQ interference: %.*s up to  %9.2f us\n",
587 			       16, spaces,
588 			       ns_to_usf(taa_data->prev_irq_duration));
589 	}
590 
591 	/*
592 	 * The delay that the IRQ suffered before starting.
593 	 */
594 	printf("  IRQ handler delay: %.*s %16s  %9.2f us (%.2f %%)\n", 16, spaces,
595 	       (ns_to_usf(taa_data->timer_exit_from_idle) > 10) ? "(exit from idle)" : "",
596 	       ns_to_usf(taa_data->timer_irq_start_delay),
597 	       ns_to_per(total, taa_data->timer_irq_start_delay));
598 
599 	/*
600 	 * Timerlat IRQ.
601 	 */
602 	printf("  IRQ latency: %.*s %9.2f us\n", 40, spaces,
603 	       ns_to_usf(taa_data->tlat_irq_latency));
604 
605 	if (irq) {
606 		/*
607 		 * If the trace stopped due to IRQ, the other events will not happen
608 		 * because... the trace stopped :-).
609 		 *
610 		 * That is all folks, the stack trace was printed before the stop,
611 		 * so it will be displayed, it is the key.
612 		 */
613 		printf("  Blocking thread:\n");
614 		printf(" %.*s %24s:%-9llu\n", 6, spaces, taa_data->run_thread_comm,
615 		       taa_data->run_thread_pid);
616 	} else  {
617 		/*
618 		 * The duration of the IRQ handler that handled the timerlat IRQ.
619 		 */
620 		printf("  Timerlat IRQ duration: %.*s %9.2f us (%.2f %%)\n",
621 		       30, spaces,
622 		       ns_to_usf(taa_data->timer_irq_duration),
623 		       ns_to_per(total, taa_data->timer_irq_duration));
624 
625 		/*
626 		 * The amount of time that the current thread postponed the scheduler.
627 		 *
628 		 * Recalling that it is net from NMI/IRQ/Softirq interference, so there
629 		 * is no need to compute values here.
630 		 */
631 		printf("  Blocking thread: %.*s %9.2f us (%.2f %%)\n", 36, spaces,
632 		       ns_to_usf(taa_data->thread_blocking_duration),
633 		       ns_to_per(total, taa_data->thread_blocking_duration));
634 
635 		printf(" %.*s %24s:%-9llu %.*s %9.2f us\n", 6, spaces,
636 		       taa_data->run_thread_comm, taa_data->run_thread_pid,
637 		       12, spaces, ns_to_usf(taa_data->thread_blocking_duration));
638 	}
639 
640 	/*
641 	 * Print the stack trace!
642 	 */
643 	trace_seq_do_printf(taa_data->stack_seq);
644 
645 	/*
646 	 * NMIs can happen during the IRQ, so they are always possible.
647 	 */
648 	if (taa_data->thread_nmi_sum)
649 		printf("  NMI interference %.*s %9.2f us (%.2f %%)\n", 36, spaces,
650 		       ns_to_usf(taa_data->thread_nmi_sum),
651 		       ns_to_per(total, taa_data->thread_nmi_sum));
652 
653 	/*
654 	 * If it is an IRQ latency, the other factors can be skipped.
655 	 */
656 	if (irq)
657 		goto print_total;
658 
659 	/*
660 	 * Prints the interference caused by IRQs to the thread latency.
661 	 */
662 	if (taa_data->thread_irq_sum) {
663 		printf("  IRQ interference %.*s %9.2f us (%.2f %%)\n", 36, spaces,
664 		       ns_to_usf(taa_data->thread_irq_sum),
665 		       ns_to_per(total, taa_data->thread_irq_sum));
666 
667 		trace_seq_do_printf(taa_data->irqs_seq);
668 	}
669 
670 	/*
671 	 * Prints the interference caused by Softirqs to the thread latency.
672 	 */
673 	if (taa_data->thread_softirq_sum) {
674 		printf("  Softirq interference %.*s %9.2f us (%.2f %%)\n", 32, spaces,
675 		       ns_to_usf(taa_data->thread_softirq_sum),
676 		       ns_to_per(total, taa_data->thread_softirq_sum));
677 
678 		trace_seq_do_printf(taa_data->softirqs_seq);
679 	}
680 
681 	/*
682 	 * Prints the interference caused by other threads to the thread latency.
683 	 *
684 	 * If this happens, your timerlat is not the highest prio. OK, migration
685 	 * thread can happen. But otherwise, you are not measuring the "scheduling
686 	 * latency" only, and here is the difference from scheduling latency and
687 	 * timer handling latency.
688 	 */
689 	if (taa_data->thread_thread_sum) {
690 		printf("  Thread interference %.*s %9.2f us (%.2f %%)\n", 33, spaces,
691 		       ns_to_usf(taa_data->thread_thread_sum),
692 		       ns_to_per(total, taa_data->thread_thread_sum));
693 
694 		trace_seq_do_printf(taa_data->threads_seq);
695 	}
696 
697 	/*
698 	 * Done.
699 	 */
700 print_total:
701 	printf("------------------------------------------------------------------------\n");
702 	printf("  %s latency: %.*s %9.2f us (100%%)\n", irq ? "   IRQ" : "Thread",
703 	       37, spaces, ns_to_usf(total));
704 }
705 
timerlat_auto_analysis_collect_trace(struct timerlat_aa_context * taa_ctx)706 static int timerlat_auto_analysis_collect_trace(struct timerlat_aa_context *taa_ctx)
707 {
708 	struct trace_instance *trace = &taa_ctx->tool->trace;
709 	int retval;
710 
711 	retval = tracefs_iterate_raw_events(trace->tep,
712 					    trace->inst,
713 					    NULL,
714 					    0,
715 					    collect_registered_events,
716 					    trace);
717 		if (retval < 0) {
718 			err_msg("Error iterating on events\n");
719 			return 0;
720 		}
721 
722 	return 1;
723 }
724 
725 /**
726  * timerlat_auto_analysis - Analyze the collected data
727  */
timerlat_auto_analysis(int irq_thresh,int thread_thresh)728 void timerlat_auto_analysis(int irq_thresh, int thread_thresh)
729 {
730 	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
731 	unsigned long long max_exit_from_idle = 0;
732 	struct timerlat_aa_data *taa_data;
733 	int max_exit_from_idle_cpu;
734 	struct tep_handle *tep;
735 	int cpu;
736 
737 	timerlat_auto_analysis_collect_trace(taa_ctx);
738 
739 	/* bring stop tracing to the ns scale */
740 	irq_thresh = irq_thresh * 1000;
741 	thread_thresh = thread_thresh * 1000;
742 
743 	for (cpu = 0; cpu < taa_ctx->nr_cpus; cpu++) {
744 		taa_data = timerlat_aa_get_data(taa_ctx, cpu);
745 
746 		if (irq_thresh && taa_data->tlat_irq_latency >= irq_thresh) {
747 			printf("## CPU %d hit stop tracing, analyzing it ##\n", cpu);
748 			timerlat_thread_analysis(taa_data, cpu, irq_thresh, thread_thresh);
749 		} else if (thread_thresh && (taa_data->tlat_thread_latency) >= thread_thresh) {
750 			printf("## CPU %d hit stop tracing, analyzing it ##\n", cpu);
751 			timerlat_thread_analysis(taa_data, cpu, irq_thresh, thread_thresh);
752 		}
753 
754 		if (taa_data->max_exit_idle_latency > max_exit_from_idle) {
755 			max_exit_from_idle = taa_data->max_exit_idle_latency;
756 			max_exit_from_idle_cpu = cpu;
757 		}
758 
759 	}
760 
761 	if (max_exit_from_idle) {
762 		printf("\n");
763 		printf("Max timerlat IRQ latency from idle: %.2f us in cpu %d\n",
764 			ns_to_usf(max_exit_from_idle), max_exit_from_idle_cpu);
765 	}
766 	if (!taa_ctx->dump_tasks)
767 		return;
768 
769 	printf("\n");
770 	printf("Printing CPU tasks:\n");
771 	for (cpu = 0; cpu < taa_ctx->nr_cpus; cpu++) {
772 		taa_data = timerlat_aa_get_data(taa_ctx, cpu);
773 		tep = taa_ctx->tool->trace.tep;
774 
775 		printf("    [%.3d] %24s:%llu", cpu, taa_data->current_comm, taa_data->current_pid);
776 
777 		if (taa_data->kworker_func)
778 			printf(" kworker:%s:%s",
779 				tep_find_function(tep, taa_data->kworker) ? : "<...>",
780 				tep_find_function(tep, taa_data->kworker_func));
781 		printf("\n");
782 	}
783 
784 }
785 
786 /*
787  * timerlat_aa_destroy_seqs - Destroy seq files used to store parsed data
788  */
timerlat_aa_destroy_seqs(struct timerlat_aa_context * taa_ctx)789 static void timerlat_aa_destroy_seqs(struct timerlat_aa_context *taa_ctx)
790 {
791 	struct timerlat_aa_data *taa_data;
792 	int i;
793 
794 	if (!taa_ctx->taa_data)
795 		return;
796 
797 	for (i = 0; i < taa_ctx->nr_cpus; i++) {
798 		taa_data = timerlat_aa_get_data(taa_ctx, i);
799 
800 		if (taa_data->prev_irqs_seq) {
801 			trace_seq_destroy(taa_data->prev_irqs_seq);
802 			free(taa_data->prev_irqs_seq);
803 		}
804 
805 		if (taa_data->nmi_seq) {
806 			trace_seq_destroy(taa_data->nmi_seq);
807 			free(taa_data->nmi_seq);
808 		}
809 
810 		if (taa_data->irqs_seq) {
811 			trace_seq_destroy(taa_data->irqs_seq);
812 			free(taa_data->irqs_seq);
813 		}
814 
815 		if (taa_data->softirqs_seq) {
816 			trace_seq_destroy(taa_data->softirqs_seq);
817 			free(taa_data->softirqs_seq);
818 		}
819 
820 		if (taa_data->threads_seq) {
821 			trace_seq_destroy(taa_data->threads_seq);
822 			free(taa_data->threads_seq);
823 		}
824 
825 		if (taa_data->stack_seq) {
826 			trace_seq_destroy(taa_data->stack_seq);
827 			free(taa_data->stack_seq);
828 		}
829 	}
830 }
831 
832 /*
833  * timerlat_aa_init_seqs - Init seq files used to store parsed information
834  *
835  * Instead of keeping data structures to store raw data, use seq files to
836  * store parsed data.
837  *
838  * Allocates and initialize seq files.
839  *
840  * Returns 0 on success, -1 otherwise.
841  */
timerlat_aa_init_seqs(struct timerlat_aa_context * taa_ctx)842 static int timerlat_aa_init_seqs(struct timerlat_aa_context *taa_ctx)
843 {
844 	struct timerlat_aa_data *taa_data;
845 	int i;
846 
847 	for (i = 0; i < taa_ctx->nr_cpus; i++) {
848 
849 		taa_data = timerlat_aa_get_data(taa_ctx, i);
850 
851 		taa_data->prev_irqs_seq = calloc(1, sizeof(*taa_data->prev_irqs_seq));
852 		if (!taa_data->prev_irqs_seq)
853 			goto out_err;
854 
855 		trace_seq_init(taa_data->prev_irqs_seq);
856 
857 		taa_data->nmi_seq = calloc(1, sizeof(*taa_data->nmi_seq));
858 		if (!taa_data->nmi_seq)
859 			goto out_err;
860 
861 		trace_seq_init(taa_data->nmi_seq);
862 
863 		taa_data->irqs_seq = calloc(1, sizeof(*taa_data->irqs_seq));
864 		if (!taa_data->irqs_seq)
865 			goto out_err;
866 
867 		trace_seq_init(taa_data->irqs_seq);
868 
869 		taa_data->softirqs_seq = calloc(1, sizeof(*taa_data->softirqs_seq));
870 		if (!taa_data->softirqs_seq)
871 			goto out_err;
872 
873 		trace_seq_init(taa_data->softirqs_seq);
874 
875 		taa_data->threads_seq = calloc(1, sizeof(*taa_data->threads_seq));
876 		if (!taa_data->threads_seq)
877 			goto out_err;
878 
879 		trace_seq_init(taa_data->threads_seq);
880 
881 		taa_data->stack_seq = calloc(1, sizeof(*taa_data->stack_seq));
882 		if (!taa_data->stack_seq)
883 			goto out_err;
884 
885 		trace_seq_init(taa_data->stack_seq);
886 	}
887 
888 	return 0;
889 
890 out_err:
891 	timerlat_aa_destroy_seqs(taa_ctx);
892 	return -1;
893 }
894 
895 /*
896  * timerlat_aa_unregister_events - Unregister events used in the auto-analysis
897  */
timerlat_aa_unregister_events(struct osnoise_tool * tool,int dump_tasks)898 static void timerlat_aa_unregister_events(struct osnoise_tool *tool, int dump_tasks)
899 {
900 
901 	tep_unregister_event_handler(tool->trace.tep, -1, "ftrace", "timerlat",
902 				     timerlat_aa_handler, tool);
903 
904 	tracefs_event_disable(tool->trace.inst, "osnoise", NULL);
905 
906 	tep_unregister_event_handler(tool->trace.tep, -1, "osnoise", "nmi_noise",
907 				     timerlat_aa_nmi_handler, tool);
908 
909 	tep_unregister_event_handler(tool->trace.tep, -1, "osnoise", "irq_noise",
910 				     timerlat_aa_irq_handler, tool);
911 
912 	tep_unregister_event_handler(tool->trace.tep, -1, "osnoise", "softirq_noise",
913 				     timerlat_aa_softirq_handler, tool);
914 
915 	tep_unregister_event_handler(tool->trace.tep, -1, "osnoise", "thread_noise",
916 				     timerlat_aa_thread_handler, tool);
917 
918 	tep_unregister_event_handler(tool->trace.tep, -1, "ftrace", "kernel_stack",
919 				     timerlat_aa_stack_handler, tool);
920 	if (!dump_tasks)
921 		return;
922 
923 	tracefs_event_disable(tool->trace.inst, "sched", "sched_switch");
924 	tep_unregister_event_handler(tool->trace.tep, -1, "sched", "sched_switch",
925 				     timerlat_aa_sched_switch_handler, tool);
926 
927 	tracefs_event_disable(tool->trace.inst, "workqueue", "workqueue_execute_start");
928 	tep_unregister_event_handler(tool->trace.tep, -1, "workqueue", "workqueue_execute_start",
929 				     timerlat_aa_kworker_start_handler, tool);
930 }
931 
932 /*
933  * timerlat_aa_register_events - Register events used in the auto-analysis
934  *
935  * Returns 0 on success, -1 otherwise.
936  */
timerlat_aa_register_events(struct osnoise_tool * tool,int dump_tasks)937 static int timerlat_aa_register_events(struct osnoise_tool *tool, int dump_tasks)
938 {
939 	int retval;
940 
941 	tep_register_event_handler(tool->trace.tep, -1, "ftrace", "timerlat",
942 				timerlat_aa_handler, tool);
943 
944 
945 	/*
946 	 * register auto-analysis handlers.
947 	 */
948 	retval = tracefs_event_enable(tool->trace.inst, "osnoise", NULL);
949 	if (retval < 0 && !errno) {
950 		err_msg("Could not find osnoise events\n");
951 		goto out_err;
952 	}
953 
954 	tep_register_event_handler(tool->trace.tep, -1, "osnoise", "nmi_noise",
955 				   timerlat_aa_nmi_handler, tool);
956 
957 	tep_register_event_handler(tool->trace.tep, -1, "osnoise", "irq_noise",
958 				   timerlat_aa_irq_handler, tool);
959 
960 	tep_register_event_handler(tool->trace.tep, -1, "osnoise", "softirq_noise",
961 				   timerlat_aa_softirq_handler, tool);
962 
963 	tep_register_event_handler(tool->trace.tep, -1, "osnoise", "thread_noise",
964 				   timerlat_aa_thread_handler, tool);
965 
966 	tep_register_event_handler(tool->trace.tep, -1, "ftrace", "kernel_stack",
967 				   timerlat_aa_stack_handler, tool);
968 
969 	if (!dump_tasks)
970 		return 0;
971 
972 	/*
973 	 * Dump task events.
974 	 */
975 	retval = tracefs_event_enable(tool->trace.inst, "sched", "sched_switch");
976 	if (retval < 0 && !errno) {
977 		err_msg("Could not find sched_switch\n");
978 		goto out_err;
979 	}
980 
981 	tep_register_event_handler(tool->trace.tep, -1, "sched", "sched_switch",
982 				   timerlat_aa_sched_switch_handler, tool);
983 
984 	retval = tracefs_event_enable(tool->trace.inst, "workqueue", "workqueue_execute_start");
985 	if (retval < 0 && !errno) {
986 		err_msg("Could not find workqueue_execute_start\n");
987 		goto out_err;
988 	}
989 
990 	tep_register_event_handler(tool->trace.tep, -1, "workqueue", "workqueue_execute_start",
991 				   timerlat_aa_kworker_start_handler, tool);
992 
993 	return 0;
994 
995 out_err:
996 	timerlat_aa_unregister_events(tool, dump_tasks);
997 	return -1;
998 }
999 
1000 /**
1001  * timerlat_aa_destroy - Destroy timerlat auto-analysis
1002  */
timerlat_aa_destroy(void)1003 void timerlat_aa_destroy(void)
1004 {
1005 	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
1006 
1007 	if (!taa_ctx)
1008 		return;
1009 
1010 	if (!taa_ctx->taa_data)
1011 		goto out_ctx;
1012 
1013 	timerlat_aa_unregister_events(taa_ctx->tool, taa_ctx->dump_tasks);
1014 	timerlat_aa_destroy_seqs(taa_ctx);
1015 	free(taa_ctx->taa_data);
1016 out_ctx:
1017 	free(taa_ctx);
1018 }
1019 
1020 /**
1021  * timerlat_aa_init - Initialize timerlat auto-analysis
1022  *
1023  * Returns 0 on success, -1 otherwise.
1024  */
timerlat_aa_init(struct osnoise_tool * tool,int dump_tasks)1025 int timerlat_aa_init(struct osnoise_tool *tool, int dump_tasks)
1026 {
1027 	int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
1028 	struct timerlat_aa_context *taa_ctx;
1029 	int retval;
1030 
1031 	taa_ctx = calloc(1, sizeof(*taa_ctx));
1032 	if (!taa_ctx)
1033 		return -1;
1034 
1035 	__timerlat_aa_ctx = taa_ctx;
1036 
1037 	taa_ctx->nr_cpus = nr_cpus;
1038 	taa_ctx->tool = tool;
1039 	taa_ctx->dump_tasks = dump_tasks;
1040 
1041 	taa_ctx->taa_data = calloc(nr_cpus, sizeof(*taa_ctx->taa_data));
1042 	if (!taa_ctx->taa_data)
1043 		goto out_err;
1044 
1045 	retval = timerlat_aa_init_seqs(taa_ctx);
1046 	if (retval)
1047 		goto out_err;
1048 
1049 	retval = timerlat_aa_register_events(tool, dump_tasks);
1050 	if (retval)
1051 		goto out_err;
1052 
1053 	return 0;
1054 
1055 out_err:
1056 	timerlat_aa_destroy();
1057 	return -1;
1058 }
1059