xref: /openbmc/linux/tools/perf/builtin-sched.c (revision 0da85d1e)
1 #include "builtin.h"
2 #include "perf.h"
3 
4 #include "util/util.h"
5 #include "util/evlist.h"
6 #include "util/cache.h"
7 #include "util/evsel.h"
8 #include "util/symbol.h"
9 #include "util/thread.h"
10 #include "util/header.h"
11 #include "util/session.h"
12 #include "util/tool.h"
13 #include "util/cloexec.h"
14 
15 #include "util/parse-options.h"
16 #include "util/trace-event.h"
17 
18 #include "util/debug.h"
19 
20 #include <sys/prctl.h>
21 #include <sys/resource.h>
22 
23 #include <semaphore.h>
24 #include <pthread.h>
25 #include <math.h>
26 #include <api/fs/fs.h>
27 
28 #define PR_SET_NAME		15               /* Set process name */
29 #define MAX_CPUS		4096
30 #define COMM_LEN		20
31 #define SYM_LEN			129
32 #define MAX_PID			1024000
33 
34 struct sched_atom;
35 
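/*
 * Per-task replay state: the sequence of recorded sched_atoms for this
 * PID, the pthread that replays them, and the semaphores used to
 * hand-shake with the parent in 'perf sched replay'.
 */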
36 struct task_desc {
37 	unsigned long		nr;
38 	unsigned long		pid;
39 	char			comm[COMM_LEN];
40 
41 	unsigned long		nr_events;
42 	unsigned long		curr_event;
43 	struct sched_atom	**atoms;
44 
45 	pthread_t		thread;
46 	sem_t			sleep_sem;
47 
48 	sem_t			ready_for_work;
49 	sem_t			work_done_sem;
50 
51 	u64			cpu_usage;
52 };
53 
54 enum sched_event_type {
55 	SCHED_EVENT_RUN,
56 	SCHED_EVENT_SLEEP,
57 	SCHED_EVENT_WAKEUP,
58 	SCHED_EVENT_MIGRATION,
59 };
60 
61 struct sched_atom {
62 	enum sched_event_type	type;
63 	int			specific_wait;
64 	u64			timestamp;
65 	u64			duration;
66 	unsigned long		nr;
67 	sem_t			*wait_sem;
68 	struct task_desc	*wakee;
69 };
70 
71 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
72 
73 enum thread_state {
74 	THREAD_SLEEPING = 0,
75 	THREAD_WAIT_CPU,
76 	THREAD_SCHED_IN,
77 	THREAD_IGNORE
78 };
79 
80 struct work_atom {
81 	struct list_head	list;
82 	enum thread_state	state;
83 	u64			sched_out_time;
84 	u64			wake_up_time;
85 	u64			sched_in_time;
86 	u64			runtime;
87 };
88 
89 struct work_atoms {
90 	struct list_head	work_list;
91 	struct thread		*thread;
92 	struct rb_node		node;
93 	u64			max_lat;
94 	u64			max_lat_at;
95 	u64			total_lat;
96 	u64			nb_atoms;
97 	u64			total_runtime;
98 };
99 
100 typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
101 
102 struct perf_sched;
103 
104 struct trace_sched_handler {
105 	int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
106 			    struct perf_sample *sample, struct machine *machine);
107 
108 	int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
109 			     struct perf_sample *sample, struct machine *machine);
110 
111 	int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
112 			    struct perf_sample *sample, struct machine *machine);
113 
114 	/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
115 	int (*fork_event)(struct perf_sched *sched, union perf_event *event,
116 			  struct machine *machine);
117 
118 	int (*migrate_task_event)(struct perf_sched *sched,
119 				  struct perf_evsel *evsel,
120 				  struct perf_sample *sample,
121 				  struct machine *machine);
122 };
123 
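/*
 * All state for one invocation of the command: the perf_tool callbacks,
 * the per-mode tracepoint handler table, the task/mutex bookkeeping used
 * by replay, per-CPU tracking for map, and the rb-trees plus sort keys
 * used by the latency report.
 */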
124 struct perf_sched {
125 	struct perf_tool tool;
126 	const char	 *sort_order;
127 	unsigned long	 nr_tasks;
128 	struct task_desc **pid_to_task;
129 	struct task_desc **tasks;
130 	const struct trace_sched_handler *tp_handler;
131 	pthread_mutex_t	 start_work_mutex;
132 	pthread_mutex_t	 work_done_wait_mutex;
133 	int		 profile_cpu;
134 /*
135  * Track the current task - that way we can know whether there are any
136  * weird events, such as a task being switched away that is not current.
137  */
138 	int		 max_cpu;
139 	u32		 curr_pid[MAX_CPUS];
140 	struct thread	 *curr_thread[MAX_CPUS];
141 	char		 next_shortname1;
142 	char		 next_shortname2;
143 	unsigned int	 replay_repeat;
144 	unsigned long	 nr_run_events;
145 	unsigned long	 nr_sleep_events;
146 	unsigned long	 nr_wakeup_events;
147 	unsigned long	 nr_sleep_corrections;
148 	unsigned long	 nr_run_events_optimized;
149 	unsigned long	 targetless_wakeups;
150 	unsigned long	 multitarget_wakeups;
151 	unsigned long	 nr_runs;
152 	unsigned long	 nr_timestamps;
153 	unsigned long	 nr_unordered_timestamps;
154 	unsigned long	 nr_context_switch_bugs;
155 	unsigned long	 nr_events;
156 	unsigned long	 nr_lost_chunks;
157 	unsigned long	 nr_lost_events;
158 	u64		 run_measurement_overhead;
159 	u64		 sleep_measurement_overhead;
160 	u64		 start_time;
161 	u64		 cpu_usage;
162 	u64		 runavg_cpu_usage;
163 	u64		 parent_cpu_usage;
164 	u64		 runavg_parent_cpu_usage;
165 	u64		 sum_runtime;
166 	u64		 sum_fluct;
167 	u64		 run_avg;
168 	u64		 all_runtime;
169 	u64		 all_count;
170 	u64		 cpu_last_switched[MAX_CPUS];
171 	struct rb_root	 atom_root, sorted_atom_root;
172 	struct list_head sort_list, cmp_pid;
173 	bool force;
174 };
175 
176 static u64 get_nsecs(void)
177 {
178 	struct timespec ts;
179 
180 	clock_gettime(CLOCK_MONOTONIC, &ts);
181 
182 	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
183 }
184 
185 static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
186 {
187 	u64 T0 = get_nsecs(), T1;
188 
189 	do {
190 		T1 = get_nsecs();
191 	} while (T1 + sched->run_measurement_overhead < T0 + nsecs);
192 }
193 
194 static void sleep_nsecs(u64 nsecs)
195 {
196 	struct timespec ts;
197 
198 	ts.tv_nsec = nsecs % 999999999;
199 	ts.tv_sec = nsecs / 999999999;
200 
201 	nanosleep(&ts, NULL);
202 }
203 
204 static void calibrate_run_measurement_overhead(struct perf_sched *sched)
205 {
206 	u64 T0, T1, delta, min_delta = 1000000000ULL;
207 	int i;
208 
209 	for (i = 0; i < 10; i++) {
210 		T0 = get_nsecs();
211 		burn_nsecs(sched, 0);
212 		T1 = get_nsecs();
213 		delta = T1-T0;
214 		min_delta = min(min_delta, delta);
215 	}
216 	sched->run_measurement_overhead = min_delta;
217 
218 	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
219 }
220 
221 static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
222 {
223 	u64 T0, T1, delta, min_delta = 1000000000ULL;
224 	int i;
225 
226 	for (i = 0; i < 10; i++) {
227 		T0 = get_nsecs();
228 		sleep_nsecs(10000);
229 		T1 = get_nsecs();
230 		delta = T1-T0;
231 		min_delta = min(min_delta, delta);
232 	}
233 	min_delta -= 10000;
234 	sched->sleep_measurement_overhead = min_delta;
235 
236 	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
237 }
238 
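/*
 * Builders for the synthetic workload replayed by 'perf sched replay':
 * each tracepoint seen while reading the trace is converted into a
 * sched_atom appended to the owning task's atoms[] array.
 */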
239 static struct sched_atom *
240 get_new_event(struct task_desc *task, u64 timestamp)
241 {
242 	struct sched_atom *event = zalloc(sizeof(*event));
243 	unsigned long idx = task->nr_events;
244 	size_t size;
245 
246 	event->timestamp = timestamp;
247 	event->nr = idx;
248 
249 	task->nr_events++;
250 	size = sizeof(struct sched_atom *) * task->nr_events;
251 	task->atoms = realloc(task->atoms, size);
252 	BUG_ON(!task->atoms);
253 
254 	task->atoms[idx] = event;
255 
256 	return event;
257 }
258 
259 static struct sched_atom *last_event(struct task_desc *task)
260 {
261 	if (!task->nr_events)
262 		return NULL;
263 
264 	return task->atoms[task->nr_events - 1];
265 }
266 
267 static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
268 				u64 timestamp, u64 duration)
269 {
270 	struct sched_atom *event, *curr_event = last_event(task);
271 
272 	/*
273 	 * optimize an existing RUN event by merging this one
274 	 * into it:
275 	 */
276 	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
277 		sched->nr_run_events_optimized++;
278 		curr_event->duration += duration;
279 		return;
280 	}
281 
282 	event = get_new_event(task, timestamp);
283 
284 	event->type = SCHED_EVENT_RUN;
285 	event->duration = duration;
286 
287 	sched->nr_run_events++;
288 }
289 
290 static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
291 				   u64 timestamp, struct task_desc *wakee)
292 {
293 	struct sched_atom *event, *wakee_event;
294 
295 	event = get_new_event(task, timestamp);
296 	event->type = SCHED_EVENT_WAKEUP;
297 	event->wakee = wakee;
298 
299 	wakee_event = last_event(wakee);
300 	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
301 		sched->targetless_wakeups++;
302 		return;
303 	}
304 	if (wakee_event->wait_sem) {
305 		sched->multitarget_wakeups++;
306 		return;
307 	}
308 
309 	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
310 	sem_init(wakee_event->wait_sem, 0, 0);
311 	wakee_event->specific_wait = 1;
312 	event->wait_sem = wakee_event->wait_sem;
313 
314 	sched->nr_wakeup_events++;
315 }
316 
317 static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
318 				  u64 timestamp, u64 task_state __maybe_unused)
319 {
320 	struct sched_atom *event = get_new_event(task, timestamp);
321 
322 	event->type = SCHED_EVENT_SLEEP;
323 
324 	sched->nr_sleep_events++;
325 }
326 
327 static struct task_desc *register_pid(struct perf_sched *sched,
328 				      unsigned long pid, const char *comm)
329 {
330 	struct task_desc *task;
331 	static int pid_max;
332 
333 	if (sched->pid_to_task == NULL) {
334 		if (sysctl__read_int("kernel/pid_max", &pid_max) < 0)
335 			pid_max = MAX_PID;
336 		BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL);
337 	}
338 	if (pid >= (unsigned long)pid_max) {
339 		BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) *
340 			sizeof(struct task_desc *))) == NULL);
341 		while (pid >= (unsigned long)pid_max)
342 			sched->pid_to_task[pid_max++] = NULL;
343 	}
344 
345 	task = sched->pid_to_task[pid];
346 
347 	if (task)
348 		return task;
349 
350 	task = zalloc(sizeof(*task));
351 	task->pid = pid;
352 	task->nr = sched->nr_tasks;
353 	strcpy(task->comm, comm);
354 	/*
355 	 * every task starts in sleeping state - this gets ignored
356 	 * if there's no wakeup pointing to this sleep state:
357 	 */
358 	add_sched_event_sleep(sched, task, 0, 0);
359 
360 	sched->pid_to_task[pid] = task;
361 	sched->nr_tasks++;
362 	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
363 	BUG_ON(!sched->tasks);
364 	sched->tasks[task->nr] = task;
365 
366 	if (verbose)
367 		printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
368 
369 	return task;
370 }
371 
372 
373 static void print_task_traces(struct perf_sched *sched)
374 {
375 	struct task_desc *task;
376 	unsigned long i;
377 
378 	for (i = 0; i < sched->nr_tasks; i++) {
379 		task = sched->tasks[i];
380 		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
381 			task->nr, task->comm, task->pid, task->nr_events);
382 	}
383 }
384 
385 static void add_cross_task_wakeups(struct perf_sched *sched)
386 {
387 	struct task_desc *task1, *task2;
388 	unsigned long i, j;
389 
390 	for (i = 0; i < sched->nr_tasks; i++) {
391 		task1 = sched->tasks[i];
392 		j = i + 1;
393 		if (j == sched->nr_tasks)
394 			j = 0;
395 		task2 = sched->tasks[j];
396 		add_sched_event_wakeup(sched, task1, 0, task2);
397 	}
398 }
399 
400 static void perf_sched__process_event(struct perf_sched *sched,
401 				      struct sched_atom *atom)
402 {
403 	int ret = 0;
404 
405 	switch (atom->type) {
406 		case SCHED_EVENT_RUN:
407 			burn_nsecs(sched, atom->duration);
408 			break;
409 		case SCHED_EVENT_SLEEP:
410 			if (atom->wait_sem)
411 				ret = sem_wait(atom->wait_sem);
412 			BUG_ON(ret);
413 			break;
414 		case SCHED_EVENT_WAKEUP:
415 			if (atom->wait_sem)
416 				ret = sem_post(atom->wait_sem);
417 			BUG_ON(ret);
418 			break;
419 		case SCHED_EVENT_MIGRATION:
420 			break;
421 		default:
422 			BUG_ON(1);
423 	}
424 }
425 
426 static u64 get_cpu_usage_nsec_parent(void)
427 {
428 	struct rusage ru;
429 	u64 sum;
430 	int err;
431 
432 	err = getrusage(RUSAGE_SELF, &ru);
433 	BUG_ON(err);
434 
435 	sum =  ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
436 	sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;
437 
438 	return sum;
439 }
440 
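/*
 * Open a software task-clock counter for the calling task so each replay
 * thread can measure its own CPU time; when file descriptors run out and
 * -f was given, RLIMIT_NOFILE is raised and the open is retried.
 */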
441 static int self_open_counters(struct perf_sched *sched, unsigned long cur_task)
442 {
443 	struct perf_event_attr attr;
444 	char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE];
445 	int fd;
446 	struct rlimit limit;
447 	bool need_privilege = false;
448 
449 	memset(&attr, 0, sizeof(attr));
450 
451 	attr.type = PERF_TYPE_SOFTWARE;
452 	attr.config = PERF_COUNT_SW_TASK_CLOCK;
453 
454 force_again:
455 	fd = sys_perf_event_open(&attr, 0, -1, -1,
456 				 perf_event_open_cloexec_flag());
457 
458 	if (fd < 0) {
459 		if (errno == EMFILE) {
460 			if (sched->force) {
461 				BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1);
462 				limit.rlim_cur += sched->nr_tasks - cur_task;
463 				if (limit.rlim_cur > limit.rlim_max) {
464 					limit.rlim_max = limit.rlim_cur;
465 					need_privilege = true;
466 				}
467 				if (setrlimit(RLIMIT_NOFILE, &limit) == -1) {
468 					if (need_privilege && errno == EPERM)
469 						strcpy(info, "Need privilege\n");
470 				} else
471 					goto force_again;
472 			} else
473 				strcpy(info, "Try again with the -f option\n");
474 		}
475 		pr_err("Error: sys_perf_event_open() syscall returned "
476 		       "with %d (%s)\n%s", fd,
477 		       strerror_r(errno, sbuf, sizeof(sbuf)), info);
478 		exit(EXIT_FAILURE);
479 	}
480 	return fd;
481 }
482 
483 static u64 get_cpu_usage_nsec_self(int fd)
484 {
485 	u64 runtime;
486 	int ret;
487 
488 	ret = read(fd, &runtime, sizeof(runtime));
489 	BUG_ON(ret != sizeof(runtime));
490 
491 	return runtime;
492 }
493 
494 struct sched_thread_parms {
495 	struct task_desc  *task;
496 	struct perf_sched *sched;
497 	int fd;
498 };
499 
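/*
 * Body of every replay thread: announce readiness, wait for the parent
 * to release start_work_mutex, replay this task's atoms while sampling
 * the task-clock counter before and after, then report completion and
 * loop for the next iteration.
 */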
500 static void *thread_func(void *ctx)
501 {
502 	struct sched_thread_parms *parms = ctx;
503 	struct task_desc *this_task = parms->task;
504 	struct perf_sched *sched = parms->sched;
505 	u64 cpu_usage_0, cpu_usage_1;
506 	unsigned long i, ret;
507 	char comm2[22];
508 	int fd = parms->fd;
509 
510 	zfree(&parms);
511 
512 	sprintf(comm2, ":%s", this_task->comm);
513 	prctl(PR_SET_NAME, comm2);
514 	if (fd < 0)
515 		return NULL;
516 again:
517 	ret = sem_post(&this_task->ready_for_work);
518 	BUG_ON(ret);
519 	ret = pthread_mutex_lock(&sched->start_work_mutex);
520 	BUG_ON(ret);
521 	ret = pthread_mutex_unlock(&sched->start_work_mutex);
522 	BUG_ON(ret);
523 
524 	cpu_usage_0 = get_cpu_usage_nsec_self(fd);
525 
526 	for (i = 0; i < this_task->nr_events; i++) {
527 		this_task->curr_event = i;
528 		perf_sched__process_event(sched, this_task->atoms[i]);
529 	}
530 
531 	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
532 	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
533 	ret = sem_post(&this_task->work_done_sem);
534 	BUG_ON(ret);
535 
536 	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
537 	BUG_ON(ret);
538 	ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
539 	BUG_ON(ret);
540 
541 	goto again;
542 }
543 
544 static void create_tasks(struct perf_sched *sched)
545 {
546 	struct task_desc *task;
547 	pthread_attr_t attr;
548 	unsigned long i;
549 	int err;
550 
551 	err = pthread_attr_init(&attr);
552 	BUG_ON(err);
553 	err = pthread_attr_setstacksize(&attr,
554 			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
555 	BUG_ON(err);
556 	err = pthread_mutex_lock(&sched->start_work_mutex);
557 	BUG_ON(err);
558 	err = pthread_mutex_lock(&sched->work_done_wait_mutex);
559 	BUG_ON(err);
560 	for (i = 0; i < sched->nr_tasks; i++) {
561 		struct sched_thread_parms *parms = malloc(sizeof(*parms));
562 		BUG_ON(parms == NULL);
563 		parms->task = task = sched->tasks[i];
564 		parms->sched = sched;
565 		parms->fd = self_open_counters(sched, i);
566 		sem_init(&task->sleep_sem, 0, 0);
567 		sem_init(&task->ready_for_work, 0, 0);
568 		sem_init(&task->work_done_sem, 0, 0);
569 		task->curr_event = 0;
570 		err = pthread_create(&task->thread, &attr, thread_func, parms);
571 		BUG_ON(err);
572 	}
573 }
574 
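/*
 * Parent side of one replay iteration: wait until every thread is ready,
 * release them all at once by unlocking start_work_mutex, then collect
 * per-task and parent CPU usage and re-arm the semaphores for the next
 * run.
 */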
575 static void wait_for_tasks(struct perf_sched *sched)
576 {
577 	u64 cpu_usage_0, cpu_usage_1;
578 	struct task_desc *task;
579 	unsigned long i, ret;
580 
581 	sched->start_time = get_nsecs();
582 	sched->cpu_usage = 0;
583 	pthread_mutex_unlock(&sched->work_done_wait_mutex);
584 
585 	for (i = 0; i < sched->nr_tasks; i++) {
586 		task = sched->tasks[i];
587 		ret = sem_wait(&task->ready_for_work);
588 		BUG_ON(ret);
589 		sem_init(&task->ready_for_work, 0, 0);
590 	}
591 	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
592 	BUG_ON(ret);
593 
594 	cpu_usage_0 = get_cpu_usage_nsec_parent();
595 
596 	pthread_mutex_unlock(&sched->start_work_mutex);
597 
598 	for (i = 0; i < sched->nr_tasks; i++) {
599 		task = sched->tasks[i];
600 		ret = sem_wait(&task->work_done_sem);
601 		BUG_ON(ret);
602 		sem_init(&task->work_done_sem, 0, 0);
603 		sched->cpu_usage += task->cpu_usage;
604 		task->cpu_usage = 0;
605 	}
606 
607 	cpu_usage_1 = get_cpu_usage_nsec_parent();
608 	if (!sched->runavg_cpu_usage)
609 		sched->runavg_cpu_usage = sched->cpu_usage;
610 	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat;
611 
612 	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
613 	if (!sched->runavg_parent_cpu_usage)
614 		sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
615 	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) +
616 					 sched->parent_cpu_usage)/sched->replay_repeat;
617 
618 	ret = pthread_mutex_lock(&sched->start_work_mutex);
619 	BUG_ON(ret);
620 
621 	for (i = 0; i < sched->nr_tasks; i++) {
622 		task = sched->tasks[i];
623 		sem_init(&task->sleep_sem, 0, 0);
624 		task->curr_event = 0;
625 	}
626 }
627 
628 static void run_one_test(struct perf_sched *sched)
629 {
630 	u64 T0, T1, delta, avg_delta, fluct;
631 
632 	T0 = get_nsecs();
633 	wait_for_tasks(sched);
634 	T1 = get_nsecs();
635 
636 	delta = T1 - T0;
637 	sched->sum_runtime += delta;
638 	sched->nr_runs++;
639 
640 	avg_delta = sched->sum_runtime / sched->nr_runs;
641 	if (delta < avg_delta)
642 		fluct = avg_delta - delta;
643 	else
644 		fluct = delta - avg_delta;
645 	sched->sum_fluct += fluct;
646 	if (!sched->run_avg)
647 		sched->run_avg = delta;
648 	sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat;
649 
650 	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0);
651 
652 	printf("ravg: %0.2f, ", (double)sched->run_avg / 1e6);
653 
654 	printf("cpu: %0.2f / %0.2f",
655 		(double)sched->cpu_usage / 1e6, (double)sched->runavg_cpu_usage / 1e6);
656 
657 #if 0
658 	/*
659 	 * rusage statistics done by the parent; these are less
660 	 * accurate than the sched->sum_exec_runtime based statistics:
661 	 */
662 	printf(" [%0.2f / %0.2f]",
663 		(double)sched->parent_cpu_usage/1e6,
664 		(double)sched->runavg_parent_cpu_usage/1e6);
665 #endif
666 
667 	printf("\n");
668 
669 	if (sched->nr_sleep_corrections)
670 		printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
671 	sched->nr_sleep_corrections = 0;
672 }
673 
674 static void test_calibrations(struct perf_sched *sched)
675 {
676 	u64 T0, T1;
677 
678 	T0 = get_nsecs();
679 	burn_nsecs(sched, 1e6);
680 	T1 = get_nsecs();
681 
682 	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
683 
684 	T0 = get_nsecs();
685 	sleep_nsecs(1e6);
686 	T1 = get_nsecs();
687 
688 	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
689 }
690 
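/*
 * Replay-mode tracepoint handlers: rebuild the recorded task set and
 * their run/sleep/wakeup atoms from sched_wakeup, sched_switch and fork
 * events so the workload can be re-executed.
 */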
691 static int
692 replay_wakeup_event(struct perf_sched *sched,
693 		    struct perf_evsel *evsel, struct perf_sample *sample,
694 		    struct machine *machine __maybe_unused)
695 {
696 	const char *comm = perf_evsel__strval(evsel, sample, "comm");
697 	const u32 pid	 = perf_evsel__intval(evsel, sample, "pid");
698 	struct task_desc *waker, *wakee;
699 
700 	if (verbose) {
701 		printf("sched_wakeup event %p\n", evsel);
702 
703 		printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
704 	}
705 
706 	waker = register_pid(sched, sample->tid, "<unknown>");
707 	wakee = register_pid(sched, pid, comm);
708 
709 	add_sched_event_wakeup(sched, waker, sample->time, wakee);
710 	return 0;
711 }
712 
713 static int replay_switch_event(struct perf_sched *sched,
714 			       struct perf_evsel *evsel,
715 			       struct perf_sample *sample,
716 			       struct machine *machine __maybe_unused)
717 {
718 	const char *prev_comm  = perf_evsel__strval(evsel, sample, "prev_comm"),
719 		   *next_comm  = perf_evsel__strval(evsel, sample, "next_comm");
720 	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
721 		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
722 	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
723 	struct task_desc *prev, __maybe_unused *next;
724 	u64 timestamp0, timestamp = sample->time;
725 	int cpu = sample->cpu;
726 	s64 delta;
727 
728 	if (verbose)
729 		printf("sched_switch event %p\n", evsel);
730 
731 	if (cpu >= MAX_CPUS || cpu < 0)
732 		return 0;
733 
734 	timestamp0 = sched->cpu_last_switched[cpu];
735 	if (timestamp0)
736 		delta = timestamp - timestamp0;
737 	else
738 		delta = 0;
739 
740 	if (delta < 0) {
741 		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
742 		return -1;
743 	}
744 
745 	pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
746 		 prev_comm, prev_pid, next_comm, next_pid, delta);
747 
748 	prev = register_pid(sched, prev_pid, prev_comm);
749 	next = register_pid(sched, next_pid, next_comm);
750 
751 	sched->cpu_last_switched[cpu] = timestamp;
752 
753 	add_sched_event_run(sched, prev, timestamp, delta);
754 	add_sched_event_sleep(sched, prev, timestamp, prev_state);
755 
756 	return 0;
757 }
758 
759 static int replay_fork_event(struct perf_sched *sched,
760 			     union perf_event *event,
761 			     struct machine *machine)
762 {
763 	struct thread *child, *parent;
764 
765 	child = machine__findnew_thread(machine, event->fork.pid,
766 					event->fork.tid);
767 	parent = machine__findnew_thread(machine, event->fork.ppid,
768 					 event->fork.ptid);
769 
770 	if (child == NULL || parent == NULL) {
771 		pr_debug("thread does not exist on fork event: child %p, parent %p\n",
772 				 child, parent);
773 		return 0;
774 	}
775 
776 	if (verbose) {
777 		printf("fork event\n");
778 		printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
779 		printf("...  child: %s/%d\n", thread__comm_str(child), child->tid);
780 	}
781 
782 	register_pid(sched, parent->tid, thread__comm_str(parent));
783 	register_pid(sched, child->tid, thread__comm_str(child));
784 	return 0;
785 }
786 
787 struct sort_dimension {
788 	const char		*name;
789 	sort_fn_t		cmp;
790 	struct list_head	list;
791 };
792 
793 static int
794 thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
795 {
796 	struct sort_dimension *sort;
797 	int ret = 0;
798 
799 	BUG_ON(list_empty(list));
800 
801 	list_for_each_entry(sort, list, list) {
802 		ret = sort->cmp(l, r);
803 		if (ret)
804 			return ret;
805 	}
806 
807 	return ret;
808 }
809 
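/*
 * Latency data lives in an rb-tree of work_atoms, one node per thread,
 * ordered by the comparison keys in the given sort list (cmp_pid while
 * collecting, the user's --sort keys when re-sorting for output).
 */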
810 static struct work_atoms *
811 thread_atoms_search(struct rb_root *root, struct thread *thread,
812 			 struct list_head *sort_list)
813 {
814 	struct rb_node *node = root->rb_node;
815 	struct work_atoms key = { .thread = thread };
816 
817 	while (node) {
818 		struct work_atoms *atoms;
819 		int cmp;
820 
821 		atoms = container_of(node, struct work_atoms, node);
822 
823 		cmp = thread_lat_cmp(sort_list, &key, atoms);
824 		if (cmp > 0)
825 			node = node->rb_left;
826 		else if (cmp < 0)
827 			node = node->rb_right;
828 		else {
829 			BUG_ON(thread != atoms->thread);
830 			return atoms;
831 		}
832 	}
833 	return NULL;
834 }
835 
836 static void
837 __thread_latency_insert(struct rb_root *root, struct work_atoms *data,
838 			 struct list_head *sort_list)
839 {
840 	struct rb_node **new = &(root->rb_node), *parent = NULL;
841 
842 	while (*new) {
843 		struct work_atoms *this;
844 		int cmp;
845 
846 		this = container_of(*new, struct work_atoms, node);
847 		parent = *new;
848 
849 		cmp = thread_lat_cmp(sort_list, data, this);
850 
851 		if (cmp > 0)
852 			new = &((*new)->rb_left);
853 		else
854 			new = &((*new)->rb_right);
855 	}
856 
857 	rb_link_node(&data->node, parent, new);
858 	rb_insert_color(&data->node, root);
859 }
860 
861 static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
862 {
863 	struct work_atoms *atoms = zalloc(sizeof(*atoms));
864 	if (!atoms) {
865 		pr_err("No memory at %s\n", __func__);
866 		return -1;
867 	}
868 
869 	atoms->thread = thread__get(thread);
870 	INIT_LIST_HEAD(&atoms->work_list);
871 	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
872 	return 0;
873 }
874 
875 static char sched_out_state(u64 prev_state)
876 {
877 	const char *str = TASK_STATE_TO_CHAR_STR;
878 
879 	return str[prev_state];
880 }
881 
882 static int
883 add_sched_out_event(struct work_atoms *atoms,
884 		    char run_state,
885 		    u64 timestamp)
886 {
887 	struct work_atom *atom = zalloc(sizeof(*atom));
888 	if (!atom) {
889 		pr_err("No memory at %s\n", __func__);
890 		return -1;
891 	}
892 
893 	atom->sched_out_time = timestamp;
894 
895 	if (run_state == 'R') {
896 		atom->state = THREAD_WAIT_CPU;
897 		atom->wake_up_time = atom->sched_out_time;
898 	}
899 
900 	list_add_tail(&atom->list, &atoms->work_list);
901 	return 0;
902 }
903 
904 static void
905 add_runtime_event(struct work_atoms *atoms, u64 delta,
906 		  u64 timestamp __maybe_unused)
907 {
908 	struct work_atom *atom;
909 
910 	BUG_ON(list_empty(&atoms->work_list));
911 
912 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
913 
914 	atom->runtime += delta;
915 	atoms->total_runtime += delta;
916 }
917 
918 static void
919 add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
920 {
921 	struct work_atom *atom;
922 	u64 delta;
923 
924 	if (list_empty(&atoms->work_list))
925 		return;
926 
927 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
928 
929 	if (atom->state != THREAD_WAIT_CPU)
930 		return;
931 
932 	if (timestamp < atom->wake_up_time) {
933 		atom->state = THREAD_IGNORE;
934 		return;
935 	}
936 
937 	atom->state = THREAD_SCHED_IN;
938 	atom->sched_in_time = timestamp;
939 
940 	delta = atom->sched_in_time - atom->wake_up_time;
941 	atoms->total_lat += delta;
942 	if (delta > atoms->max_lat) {
943 		atoms->max_lat = delta;
944 		atoms->max_lat_at = timestamp;
945 	}
946 	atoms->nb_atoms++;
947 }
948 
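/*
 * Latency-mode handlers: a sched_switch appends a sched-out atom for the
 * task leaving the CPU and completes the pending atom of the task coming
 * in; the wakeup-to-sched-in delta of the completed atom is what the
 * latency report aggregates.
 */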
949 static int latency_switch_event(struct perf_sched *sched,
950 				struct perf_evsel *evsel,
951 				struct perf_sample *sample,
952 				struct machine *machine)
953 {
954 	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
955 		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
956 	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
957 	struct work_atoms *out_events, *in_events;
958 	struct thread *sched_out, *sched_in;
959 	u64 timestamp0, timestamp = sample->time;
960 	int cpu = sample->cpu;
961 	s64 delta;
962 
963 	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
964 
965 	timestamp0 = sched->cpu_last_switched[cpu];
966 	sched->cpu_last_switched[cpu] = timestamp;
967 	if (timestamp0)
968 		delta = timestamp - timestamp0;
969 	else
970 		delta = 0;
971 
972 	if (delta < 0) {
973 		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
974 		return -1;
975 	}
976 
977 	sched_out = machine__findnew_thread(machine, -1, prev_pid);
978 	sched_in = machine__findnew_thread(machine, -1, next_pid);
979 
980 	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
981 	if (!out_events) {
982 		if (thread_atoms_insert(sched, sched_out))
983 			return -1;
984 		out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
985 		if (!out_events) {
986 			pr_err("out-event: Internal tree error");
987 			return -1;
988 		}
989 	}
990 	if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
991 		return -1;
992 
993 	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
994 	if (!in_events) {
995 		if (thread_atoms_insert(sched, sched_in))
996 			return -1;
997 		in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
998 		if (!in_events) {
999 			pr_err("in-event: Internal tree error");
1000 			return -1;
1001 		}
1002 		/*
1003 		 * A task came in that we have not heard about yet,
1004 		 * add an initial atom in the runnable state:
1005 		 */
1006 		if (add_sched_out_event(in_events, 'R', timestamp))
1007 			return -1;
1008 	}
1009 	add_sched_in_event(in_events, timestamp);
1010 
1011 	return 0;
1012 }
1013 
1014 static int latency_runtime_event(struct perf_sched *sched,
1015 				 struct perf_evsel *evsel,
1016 				 struct perf_sample *sample,
1017 				 struct machine *machine)
1018 {
1019 	const u32 pid	   = perf_evsel__intval(evsel, sample, "pid");
1020 	const u64 runtime  = perf_evsel__intval(evsel, sample, "runtime");
1021 	struct thread *thread = machine__findnew_thread(machine, -1, pid);
1022 	struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
1023 	u64 timestamp = sample->time;
1024 	int cpu = sample->cpu;
1025 
1026 	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
1027 	if (!atoms) {
1028 		if (thread_atoms_insert(sched, thread))
1029 			return -1;
1030 		atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
1031 		if (!atoms) {
1032 			pr_err("in-event: Internal tree error");
1033 			return -1;
1034 		}
1035 		if (add_sched_out_event(atoms, 'R', timestamp))
1036 			return -1;
1037 	}
1038 
1039 	add_runtime_event(atoms, runtime, timestamp);
1040 	return 0;
1041 }
1042 
1043 static int latency_wakeup_event(struct perf_sched *sched,
1044 				struct perf_evsel *evsel,
1045 				struct perf_sample *sample,
1046 				struct machine *machine)
1047 {
1048 	const u32 pid	  = perf_evsel__intval(evsel, sample, "pid");
1049 	struct work_atoms *atoms;
1050 	struct work_atom *atom;
1051 	struct thread *wakee;
1052 	u64 timestamp = sample->time;
1053 
1054 	wakee = machine__findnew_thread(machine, -1, pid);
1055 	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1056 	if (!atoms) {
1057 		if (thread_atoms_insert(sched, wakee))
1058 			return -1;
1059 		atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1060 		if (!atoms) {
1061 			pr_err("wakeup-event: Internal tree error");
1062 			return -1;
1063 		}
1064 		if (add_sched_out_event(atoms, 'S', timestamp))
1065 			return -1;
1066 	}
1067 
1068 	BUG_ON(list_empty(&atoms->work_list));
1069 
1070 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1071 
1072 	/*
1073 	 * A wakeup event is not guaranteed to happen while the task
1074 	 * is off the run queue; it may also fire while the task is
1075 	 * still on the run queue and the wakeup merely changes
1076 	 * ->state to TASK_RUNNING. In that case we should not set
1077 	 * ->wake_up_time for the task being woken.
1078 	 *
1079 	 * You WILL be missing events if you've recorded only
1080 	 * one CPU, or are only looking at one, so don't
1081 	 * skip in this case.
1082 	 */
1083 	if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
1084 		return 0;
1085 
1086 	sched->nr_timestamps++;
1087 	if (atom->sched_out_time > timestamp) {
1088 		sched->nr_unordered_timestamps++;
1089 		return 0;
1090 	}
1091 
1092 	atom->state = THREAD_WAIT_CPU;
1093 	atom->wake_up_time = timestamp;
1094 	return 0;
1095 }
1096 
1097 static int latency_migrate_task_event(struct perf_sched *sched,
1098 				      struct perf_evsel *evsel,
1099 				      struct perf_sample *sample,
1100 				      struct machine *machine)
1101 {
1102 	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
1103 	u64 timestamp = sample->time;
1104 	struct work_atoms *atoms;
1105 	struct work_atom *atom;
1106 	struct thread *migrant;
1107 
1108 	/*
1109 	 * Only need to worry about migration when profiling one CPU.
1110 	 */
1111 	if (sched->profile_cpu == -1)
1112 		return 0;
1113 
1114 	migrant = machine__findnew_thread(machine, -1, pid);
1115 	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1116 	if (!atoms) {
1117 		if (thread_atoms_insert(sched, migrant))
1118 			return -1;
1119 		register_pid(sched, migrant->tid, thread__comm_str(migrant));
1120 		atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1121 		if (!atoms) {
1122 			pr_err("migration-event: Internal tree error");
1123 			return -1;
1124 		}
1125 		if (add_sched_out_event(atoms, 'R', timestamp))
1126 			return -1;
1127 	}
1128 
1129 	BUG_ON(list_empty(&atoms->work_list));
1130 
1131 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1132 	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
1133 
1134 	sched->nr_timestamps++;
1135 
1136 	if (atom->sched_out_time > timestamp)
1137 		sched->nr_unordered_timestamps++;
1138 
1139 	return 0;
1140 }
1141 
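/*
 * Print one row of the latency table for a single thread: total runtime,
 * number of switches, average, maximum and the timestamp of the maximum
 * delay. Idle (swapper) threads are skipped.
 */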
1142 static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
1143 {
1144 	int i;
1145 	int ret;
1146 	u64 avg;
1147 
1148 	if (!work_list->nb_atoms)
1149 		return;
1150 	/*
1151 	 * Ignore idle threads:
1152 	 */
1153 	if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
1154 		return;
1155 
1156 	sched->all_runtime += work_list->total_runtime;
1157 	sched->all_count   += work_list->nb_atoms;
1158 
1159 	ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
1160 
1161 	for (i = 0; i < 24 - ret; i++)
1162 		printf(" ");
1163 
1164 	avg = work_list->total_lat / work_list->nb_atoms;
1165 
1166 	printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %13.6f s\n",
1167 	      (double)work_list->total_runtime / 1e6,
1168 		 work_list->nb_atoms, (double)avg / 1e6,
1169 		 (double)work_list->max_lat / 1e6,
1170 		 (double)work_list->max_lat_at / 1e9);
1171 }
1172 
1173 static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
1174 {
1175 	if (l->thread->tid < r->thread->tid)
1176 		return -1;
1177 	if (l->thread->tid > r->thread->tid)
1178 		return 1;
1179 
1180 	return 0;
1181 }
1182 
1183 static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
1184 {
1185 	u64 avgl, avgr;
1186 
1187 	if (!l->nb_atoms)
1188 		return -1;
1189 
1190 	if (!r->nb_atoms)
1191 		return 1;
1192 
1193 	avgl = l->total_lat / l->nb_atoms;
1194 	avgr = r->total_lat / r->nb_atoms;
1195 
1196 	if (avgl < avgr)
1197 		return -1;
1198 	if (avgl > avgr)
1199 		return 1;
1200 
1201 	return 0;
1202 }
1203 
1204 static int max_cmp(struct work_atoms *l, struct work_atoms *r)
1205 {
1206 	if (l->max_lat < r->max_lat)
1207 		return -1;
1208 	if (l->max_lat > r->max_lat)
1209 		return 1;
1210 
1211 	return 0;
1212 }
1213 
1214 static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
1215 {
1216 	if (l->nb_atoms < r->nb_atoms)
1217 		return -1;
1218 	if (l->nb_atoms > r->nb_atoms)
1219 		return 1;
1220 
1221 	return 0;
1222 }
1223 
1224 static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
1225 {
1226 	if (l->total_runtime < r->total_runtime)
1227 		return -1;
1228 	if (l->total_runtime > r->total_runtime)
1229 		return 1;
1230 
1231 	return 0;
1232 }
1233 
1234 static int sort_dimension__add(const char *tok, struct list_head *list)
1235 {
1236 	size_t i;
1237 	static struct sort_dimension avg_sort_dimension = {
1238 		.name = "avg",
1239 		.cmp  = avg_cmp,
1240 	};
1241 	static struct sort_dimension max_sort_dimension = {
1242 		.name = "max",
1243 		.cmp  = max_cmp,
1244 	};
1245 	static struct sort_dimension pid_sort_dimension = {
1246 		.name = "pid",
1247 		.cmp  = pid_cmp,
1248 	};
1249 	static struct sort_dimension runtime_sort_dimension = {
1250 		.name = "runtime",
1251 		.cmp  = runtime_cmp,
1252 	};
1253 	static struct sort_dimension switch_sort_dimension = {
1254 		.name = "switch",
1255 		.cmp  = switch_cmp,
1256 	};
1257 	struct sort_dimension *available_sorts[] = {
1258 		&pid_sort_dimension,
1259 		&avg_sort_dimension,
1260 		&max_sort_dimension,
1261 		&switch_sort_dimension,
1262 		&runtime_sort_dimension,
1263 	};
1264 
1265 	for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
1266 		if (!strcmp(available_sorts[i]->name, tok)) {
1267 			list_add_tail(&available_sorts[i]->list, list);
1268 
1269 			return 0;
1270 		}
1271 	}
1272 
1273 	return -1;
1274 }
1275 
1276 static void perf_sched__sort_lat(struct perf_sched *sched)
1277 {
1278 	struct rb_node *node;
1279 
1280 	for (;;) {
1281 		struct work_atoms *data;
1282 		node = rb_first(&sched->atom_root);
1283 		if (!node)
1284 			break;
1285 
1286 		rb_erase(node, &sched->atom_root);
1287 		data = rb_entry(node, struct work_atoms, node);
1288 		__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
1289 	}
1290 }
1291 
1292 static int process_sched_wakeup_event(struct perf_tool *tool,
1293 				      struct perf_evsel *evsel,
1294 				      struct perf_sample *sample,
1295 				      struct machine *machine)
1296 {
1297 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1298 
1299 	if (sched->tp_handler->wakeup_event)
1300 		return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
1301 
1302 	return 0;
1303 }
1304 
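/*
 * Map mode: print one line per sched_switch, with a two-character
 * shortname column per CPU and a '*' marking the CPU where the switch
 * happened.
 */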
1305 static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
1306 			    struct perf_sample *sample, struct machine *machine)
1307 {
1308 	const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1309 	struct thread *sched_in;
1310 	int new_shortname;
1311 	u64 timestamp0, timestamp = sample->time;
1312 	s64 delta;
1313 	int cpu, this_cpu = sample->cpu;
1314 
1315 	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
1316 
1317 	if (this_cpu > sched->max_cpu)
1318 		sched->max_cpu = this_cpu;
1319 
1320 	timestamp0 = sched->cpu_last_switched[this_cpu];
1321 	sched->cpu_last_switched[this_cpu] = timestamp;
1322 	if (timestamp0)
1323 		delta = timestamp - timestamp0;
1324 	else
1325 		delta = 0;
1326 
1327 	if (delta < 0) {
1328 		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
1329 		return -1;
1330 	}
1331 
1332 	sched_in = machine__findnew_thread(machine, -1, next_pid);
1333 
1334 	sched->curr_thread[this_cpu] = sched_in;
1335 
1336 	printf("  ");
1337 
1338 	new_shortname = 0;
1339 	if (!sched_in->shortname[0]) {
1340 		if (!strcmp(thread__comm_str(sched_in), "swapper")) {
1341 			/*
1342 			 * Don't allocate a letter-number for swapper:0
1343 			 * as a shortname. Instead, we use '.' for it.
1344 			 */
1345 			sched_in->shortname[0] = '.';
1346 			sched_in->shortname[1] = ' ';
1347 		} else {
1348 			sched_in->shortname[0] = sched->next_shortname1;
1349 			sched_in->shortname[1] = sched->next_shortname2;
1350 
1351 			if (sched->next_shortname1 < 'Z') {
1352 				sched->next_shortname1++;
1353 			} else {
1354 				sched->next_shortname1 = 'A';
1355 				if (sched->next_shortname2 < '9')
1356 					sched->next_shortname2++;
1357 				else
1358 					sched->next_shortname2 = '0';
1359 			}
1360 		}
1361 		new_shortname = 1;
1362 	}
1363 
1364 	for (cpu = 0; cpu <= sched->max_cpu; cpu++) {
1365 		if (cpu != this_cpu)
1366 			printf(" ");
1367 		else
1368 			printf("*");
1369 
1370 		if (sched->curr_thread[cpu])
1371 			printf("%2s ", sched->curr_thread[cpu]->shortname);
1372 		else
1373 			printf("   ");
1374 	}
1375 
1376 	printf("  %12.6f secs ", (double)timestamp/1e9);
1377 	if (new_shortname) {
1378 		printf("%s => %s:%d\n",
1379 		       sched_in->shortname, thread__comm_str(sched_in), sched_in->tid);
1380 	} else {
1381 		printf("\n");
1382 	}
1383 
1384 	return 0;
1385 }
1386 
1387 static int process_sched_switch_event(struct perf_tool *tool,
1388 				      struct perf_evsel *evsel,
1389 				      struct perf_sample *sample,
1390 				      struct machine *machine)
1391 {
1392 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1393 	int this_cpu = sample->cpu, err = 0;
1394 	u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
1395 	    next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1396 
1397 	if (sched->curr_pid[this_cpu] != (u32)-1) {
1398 		/*
1399 		 * Are we trying to switch away a PID that is
1400 		 * not current?
1401 		 */
1402 		if (sched->curr_pid[this_cpu] != prev_pid)
1403 			sched->nr_context_switch_bugs++;
1404 	}
1405 
1406 	if (sched->tp_handler->switch_event)
1407 		err = sched->tp_handler->switch_event(sched, evsel, sample, machine);
1408 
1409 	sched->curr_pid[this_cpu] = next_pid;
1410 	return err;
1411 }
1412 
1413 static int process_sched_runtime_event(struct perf_tool *tool,
1414 				       struct perf_evsel *evsel,
1415 				       struct perf_sample *sample,
1416 				       struct machine *machine)
1417 {
1418 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1419 
1420 	if (sched->tp_handler->runtime_event)
1421 		return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
1422 
1423 	return 0;
1424 }
1425 
1426 static int perf_sched__process_fork_event(struct perf_tool *tool,
1427 					  union perf_event *event,
1428 					  struct perf_sample *sample,
1429 					  struct machine *machine)
1430 {
1431 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1432 
1433 	/* run the fork event through the perf machinery */
1434 	perf_event__process_fork(tool, event, sample, machine);
1435 
1436 	/* and then run additional processing needed for this command */
1437 	if (sched->tp_handler->fork_event)
1438 		return sched->tp_handler->fork_event(sched, event, machine);
1439 
1440 	return 0;
1441 }
1442 
1443 static int process_sched_migrate_task_event(struct perf_tool *tool,
1444 					    struct perf_evsel *evsel,
1445 					    struct perf_sample *sample,
1446 					    struct machine *machine)
1447 {
1448 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1449 
1450 	if (sched->tp_handler->migrate_task_event)
1451 		return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
1452 
1453 	return 0;
1454 }
1455 
1456 typedef int (*tracepoint_handler)(struct perf_tool *tool,
1457 				  struct perf_evsel *evsel,
1458 				  struct perf_sample *sample,
1459 				  struct machine *machine);
1460 
1461 static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
1462 						 union perf_event *event __maybe_unused,
1463 						 struct perf_sample *sample,
1464 						 struct perf_evsel *evsel,
1465 						 struct machine *machine)
1466 {
1467 	int err = 0;
1468 
1469 	if (evsel->handler != NULL) {
1470 		tracepoint_handler f = evsel->handler;
1471 		err = f(tool, evsel, sample, machine);
1472 	}
1473 
1474 	return err;
1475 }
1476 
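/*
 * Open the perf.data file, wire the sched tracepoints to the handlers of
 * the selected mode and process all events, collecting lost-event
 * statistics along the way.
 */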
1477 static int perf_sched__read_events(struct perf_sched *sched)
1478 {
1479 	const struct perf_evsel_str_handler handlers[] = {
1480 		{ "sched:sched_switch",	      process_sched_switch_event, },
1481 		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
1482 		{ "sched:sched_wakeup",	      process_sched_wakeup_event, },
1483 		{ "sched:sched_wakeup_new",   process_sched_wakeup_event, },
1484 		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
1485 	};
1486 	struct perf_session *session;
1487 	struct perf_data_file file = {
1488 		.path = input_name,
1489 		.mode = PERF_DATA_MODE_READ,
1490 		.force = sched->force,
1491 	};
1492 	int rc = -1;
1493 
1494 	session = perf_session__new(&file, false, &sched->tool);
1495 	if (session == NULL) {
1496 		pr_debug("No memory for session\n");
1497 		return -1;
1498 	}
1499 
1500 	symbol__init(&session->header.env);
1501 
1502 	if (perf_session__set_tracepoints_handlers(session, handlers))
1503 		goto out_delete;
1504 
1505 	if (perf_session__has_traces(session, "record -R")) {
1506 		int err = perf_session__process_events(session);
1507 		if (err) {
1508 			pr_err("Failed to process events, error %d", err);
1509 			goto out_delete;
1510 		}
1511 
1512 		sched->nr_events      = session->evlist->stats.nr_events[0];
1513 		sched->nr_lost_events = session->evlist->stats.total_lost;
1514 		sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
1515 	}
1516 
1517 	rc = 0;
1518 out_delete:
1519 	perf_session__delete(session);
1520 	return rc;
1521 }
1522 
1523 static void print_bad_events(struct perf_sched *sched)
1524 {
1525 	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
1526 		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
1527 			(double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
1528 			sched->nr_unordered_timestamps, sched->nr_timestamps);
1529 	}
1530 	if (sched->nr_lost_events && sched->nr_events) {
1531 		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
1532 			(double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
1533 			sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
1534 	}
1535 	if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
1536 		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
1537 			(double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
1538 			sched->nr_context_switch_bugs, sched->nr_timestamps);
1539 		if (sched->nr_lost_events)
1540 			printf(" (due to lost events?)");
1541 		printf("\n");
1542 	}
1543 }
1544 
1545 static int perf_sched__lat(struct perf_sched *sched)
1546 {
1547 	struct rb_node *next;
1548 
1549 	setup_pager();
1550 
1551 	if (perf_sched__read_events(sched))
1552 		return -1;
1553 
1554 	perf_sched__sort_lat(sched);
1555 
1556 	printf("\n -----------------------------------------------------------------------------------------------------------------\n");
1557 	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at       |\n");
1558 	printf(" -----------------------------------------------------------------------------------------------------------------\n");
1559 
1560 	next = rb_first(&sched->sorted_atom_root);
1561 
1562 	while (next) {
1563 		struct work_atoms *work_list;
1564 
1565 		work_list = rb_entry(next, struct work_atoms, node);
1566 		output_lat_thread(sched, work_list);
1567 		next = rb_next(next);
1568 		thread__zput(work_list->thread);
1569 	}
1570 
1571 	printf(" -----------------------------------------------------------------------------------------------------------------\n");
1572 	printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
1573 		(double)sched->all_runtime / 1e6, sched->all_count);
1574 
1575 	printf(" ---------------------------------------------------\n");
1576 
1577 	print_bad_events(sched);
1578 	printf("\n");
1579 
1580 	return 0;
1581 }
1582 
1583 static int perf_sched__map(struct perf_sched *sched)
1584 {
1585 	sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);
1586 
1587 	setup_pager();
1588 	if (perf_sched__read_events(sched))
1589 		return -1;
1590 	print_bad_events(sched);
1591 	return 0;
1592 }
1593 
1594 static int perf_sched__replay(struct perf_sched *sched)
1595 {
1596 	unsigned long i;
1597 
1598 	calibrate_run_measurement_overhead(sched);
1599 	calibrate_sleep_measurement_overhead(sched);
1600 
1601 	test_calibrations(sched);
1602 
1603 	if (perf_sched__read_events(sched))
1604 		return -1;
1605 
1606 	printf("nr_run_events:        %ld\n", sched->nr_run_events);
1607 	printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
1608 	printf("nr_wakeup_events:     %ld\n", sched->nr_wakeup_events);
1609 
1610 	if (sched->targetless_wakeups)
1611 		printf("target-less wakeups:  %ld\n", sched->targetless_wakeups);
1612 	if (sched->multitarget_wakeups)
1613 		printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
1614 	if (sched->nr_run_events_optimized)
1615 		printf("run atoms optimized: %ld\n",
1616 			sched->nr_run_events_optimized);
1617 
1618 	print_task_traces(sched);
1619 	add_cross_task_wakeups(sched);
1620 
1621 	create_tasks(sched);
1622 	printf("------------------------------------------------------------\n");
1623 	for (i = 0; i < sched->replay_repeat; i++)
1624 		run_one_test(sched);
1625 
1626 	return 0;
1627 }
1628 
1629 static void setup_sorting(struct perf_sched *sched, const struct option *options,
1630 			  const char * const usage_msg[])
1631 {
1632 	char *tmp, *tok, *str = strdup(sched->sort_order);
1633 
1634 	for (tok = strtok_r(str, ", ", &tmp);
1635 			tok; tok = strtok_r(NULL, ", ", &tmp)) {
1636 		if (sort_dimension__add(tok, &sched->sort_list) < 0) {
1637 			error("Unknown --sort key: `%s'", tok);
1638 			usage_with_options(usage_msg, options);
1639 		}
1640 	}
1641 
1642 	free(str);
1643 
1644 	sort_dimension__add("pid", &sched->cmp_pid);
1645 }
1646 
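/*
 * 'perf sched record' is a thin wrapper around 'perf record' that
 * prepends the scheduler tracepoint events to whatever the user passed
 * on the command line.
 */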
1647 static int __cmd_record(int argc, const char **argv)
1648 {
1649 	unsigned int rec_argc, i, j;
1650 	const char **rec_argv;
1651 	const char * const record_args[] = {
1652 		"record",
1653 		"-a",
1654 		"-R",
1655 		"-m", "1024",
1656 		"-c", "1",
1657 		"-e", "sched:sched_switch",
1658 		"-e", "sched:sched_stat_wait",
1659 		"-e", "sched:sched_stat_sleep",
1660 		"-e", "sched:sched_stat_iowait",
1661 		"-e", "sched:sched_stat_runtime",
1662 		"-e", "sched:sched_process_fork",
1663 		"-e", "sched:sched_wakeup",
1664 		"-e", "sched:sched_wakeup_new",
1665 		"-e", "sched:sched_migrate_task",
1666 	};
1667 
1668 	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
1669 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
1670 
1671 	if (rec_argv == NULL)
1672 		return -ENOMEM;
1673 
1674 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
1675 		rec_argv[i] = strdup(record_args[i]);
1676 
1677 	for (j = 1; j < (unsigned int)argc; j++, i++)
1678 		rec_argv[i] = argv[j];
1679 
1680 	BUG_ON(i != rec_argc);
1681 
1682 	return cmd_record(i, rec_argv, NULL);
1683 }
1684 
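/*
 * Entry point: parse the subcommand, pick the matching handler table
 * (latency, map or replay) and dispatch; 'script' is aliased to
 * 'perf script' and 'record' forwards to __cmd_record().
 */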
1685 int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
1686 {
1687 	const char default_sort_order[] = "avg, max, switch, runtime";
1688 	struct perf_sched sched = {
1689 		.tool = {
1690 			.sample		 = perf_sched__process_tracepoint_sample,
1691 			.comm		 = perf_event__process_comm,
1692 			.lost		 = perf_event__process_lost,
1693 			.fork		 = perf_sched__process_fork_event,
1694 			.ordered_events = true,
1695 		},
1696 		.cmp_pid	      = LIST_HEAD_INIT(sched.cmp_pid),
1697 		.sort_list	      = LIST_HEAD_INIT(sched.sort_list),
1698 		.start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
1699 		.work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
1700 		.sort_order	      = default_sort_order,
1701 		.replay_repeat	      = 10,
1702 		.profile_cpu	      = -1,
1703 		.next_shortname1      = 'A',
1704 		.next_shortname2      = '0',
1705 	};
1706 	const struct option latency_options[] = {
1707 	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
1708 		   "sort by key(s): runtime, switch, avg, max"),
1709 	OPT_INCR('v', "verbose", &verbose,
1710 		    "be more verbose (show symbol address, etc)"),
1711 	OPT_INTEGER('C', "CPU", &sched.profile_cpu,
1712 		    "CPU to profile on"),
1713 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1714 		    "dump raw trace in ASCII"),
1715 	OPT_END()
1716 	};
1717 	const struct option replay_options[] = {
1718 	OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
1719 		     "repeat the workload replay N times (-1: infinite)"),
1720 	OPT_INCR('v', "verbose", &verbose,
1721 		    "be more verbose (show symbol address, etc)"),
1722 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1723 		    "dump raw trace in ASCII"),
1724 	OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
1725 	OPT_END()
1726 	};
1727 	const struct option sched_options[] = {
1728 	OPT_STRING('i', "input", &input_name, "file",
1729 		    "input file name"),
1730 	OPT_INCR('v', "verbose", &verbose,
1731 		    "be more verbose (show symbol address, etc)"),
1732 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1733 		    "dump raw trace in ASCII"),
1734 	OPT_END()
1735 	};
1736 	const char * const latency_usage[] = {
1737 		"perf sched latency [<options>]",
1738 		NULL
1739 	};
1740 	const char * const replay_usage[] = {
1741 		"perf sched replay [<options>]",
1742 		NULL
1743 	};
1744 	const char *const sched_subcommands[] = { "record", "latency", "map",
1745 						  "replay", "script", NULL };
1746 	const char *sched_usage[] = {
1747 		NULL,
1748 		NULL
1749 	};
1750 	struct trace_sched_handler lat_ops  = {
1751 		.wakeup_event	    = latency_wakeup_event,
1752 		.switch_event	    = latency_switch_event,
1753 		.runtime_event	    = latency_runtime_event,
1754 		.migrate_task_event = latency_migrate_task_event,
1755 	};
1756 	struct trace_sched_handler map_ops  = {
1757 		.switch_event	    = map_switch_event,
1758 	};
1759 	struct trace_sched_handler replay_ops  = {
1760 		.wakeup_event	    = replay_wakeup_event,
1761 		.switch_event	    = replay_switch_event,
1762 		.fork_event	    = replay_fork_event,
1763 	};
1764 	unsigned int i;
1765 
1766 	for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
1767 		sched.curr_pid[i] = -1;
1768 
1769 	argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands,
1770 					sched_usage, PARSE_OPT_STOP_AT_NON_OPTION);
1771 	if (!argc)
1772 		usage_with_options(sched_usage, sched_options);
1773 
1774 	/*
1775 	 * Aliased to 'perf script' for now:
1776 	 */
1777 	if (!strcmp(argv[0], "script"))
1778 		return cmd_script(argc, argv, prefix);
1779 
1780 	if (!strncmp(argv[0], "rec", 3)) {
1781 		return __cmd_record(argc, argv);
1782 	} else if (!strncmp(argv[0], "lat", 3)) {
1783 		sched.tp_handler = &lat_ops;
1784 		if (argc > 1) {
1785 			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
1786 			if (argc)
1787 				usage_with_options(latency_usage, latency_options);
1788 		}
1789 		setup_sorting(&sched, latency_options, latency_usage);
1790 		return perf_sched__lat(&sched);
1791 	} else if (!strcmp(argv[0], "map")) {
1792 		sched.tp_handler = &map_ops;
1793 		setup_sorting(&sched, latency_options, latency_usage);
1794 		return perf_sched__map(&sched);
1795 	} else if (!strncmp(argv[0], "rep", 3)) {
1796 		sched.tp_handler = &replay_ops;
1797 		if (argc) {
1798 			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
1799 			if (argc)
1800 				usage_with_options(replay_usage, replay_options);
1801 		}
1802 		return perf_sched__replay(&sched);
1803 	} else {
1804 		usage_with_options(sched_usage, sched_options);
1805 	}
1806 
1807 	return 0;
1808 }
1809