xref: /openbmc/linux/tools/perf/builtin-sched.c (revision 8c0b9ee8)
1 #include "builtin.h"
2 #include "perf.h"
3 
4 #include "util/util.h"
5 #include "util/evlist.h"
6 #include "util/cache.h"
7 #include "util/evsel.h"
8 #include "util/symbol.h"
9 #include "util/thread.h"
10 #include "util/header.h"
11 #include "util/session.h"
12 #include "util/tool.h"
13 #include "util/cloexec.h"
14 
15 #include "util/parse-options.h"
16 #include "util/trace-event.h"
17 
18 #include "util/debug.h"
19 
20 #include <sys/prctl.h>
21 #include <sys/resource.h>
22 
23 #include <semaphore.h>
24 #include <pthread.h>
25 #include <math.h>
26 
27 #define PR_SET_NAME		15               /* Set process name */
28 #define MAX_CPUS		4096
29 #define COMM_LEN		20
30 #define SYM_LEN			129
31 #define MAX_PID			65536
32 
33 struct sched_atom;
34 
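/*
 * Replay-side description of one recorded task: its pid, comm, the
 * ordered array of sched atoms it will re-execute, and the semaphores
 * used to synchronize the task thread with the parent during
 * 'perf sched replay'.
 */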
35 struct task_desc {
36 	unsigned long		nr;
37 	unsigned long		pid;
38 	char			comm[COMM_LEN];
39 
40 	unsigned long		nr_events;
41 	unsigned long		curr_event;
42 	struct sched_atom	**atoms;
43 
44 	pthread_t		thread;
45 	sem_t			sleep_sem;
46 
47 	sem_t			ready_for_work;
48 	sem_t			work_done_sem;
49 
50 	u64			cpu_usage;
51 };
52 
53 enum sched_event_type {
54 	SCHED_EVENT_RUN,
55 	SCHED_EVENT_SLEEP,
56 	SCHED_EVENT_WAKEUP,
57 	SCHED_EVENT_MIGRATION,
58 };
59 
60 struct sched_atom {
61 	enum sched_event_type	type;
62 	int			specific_wait;
63 	u64			timestamp;
64 	u64			duration;
65 	unsigned long		nr;
66 	sem_t			*wait_sem;
67 	struct task_desc	*wakee;
68 };
69 
70 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
71 
72 enum thread_state {
73 	THREAD_SLEEPING = 0,
74 	THREAD_WAIT_CPU,
75 	THREAD_SCHED_IN,
76 	THREAD_IGNORE
77 };
78 
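/*
 * One scheduling interval as seen by the latency analyzer: the
 * timestamps of sched-out, wake-up and sched-in, plus the runtime
 * accumulated while the atom was current. Atoms are strung on the
 * per-thread work_atoms list below.
 */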
79 struct work_atom {
80 	struct list_head	list;
81 	enum thread_state	state;
82 	u64			sched_out_time;
83 	u64			wake_up_time;
84 	u64			sched_in_time;
85 	u64			runtime;
86 };
87 
88 struct work_atoms {
89 	struct list_head	work_list;
90 	struct thread		*thread;
91 	struct rb_node		node;
92 	u64			max_lat;
93 	u64			max_lat_at;
94 	u64			total_lat;
95 	u64			nb_atoms;
96 	u64			total_runtime;
97 };
98 
99 typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
100 
101 struct perf_sched;
102 
103 struct trace_sched_handler {
104 	int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
105 			    struct perf_sample *sample, struct machine *machine);
106 
107 	int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
108 			     struct perf_sample *sample, struct machine *machine);
109 
110 	int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
111 			    struct perf_sample *sample, struct machine *machine);
112 
113 	/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
114 	int (*fork_event)(struct perf_sched *sched, union perf_event *event,
115 			  struct machine *machine);
116 
117 	int (*migrate_task_event)(struct perf_sched *sched,
118 				  struct perf_evsel *evsel,
119 				  struct perf_sample *sample,
120 				  struct machine *machine);
121 };
122 
123 struct perf_sched {
124 	struct perf_tool tool;
125 	const char	 *sort_order;
126 	unsigned long	 nr_tasks;
127 	struct task_desc *pid_to_task[MAX_PID];
128 	struct task_desc **tasks;
129 	const struct trace_sched_handler *tp_handler;
130 	pthread_mutex_t	 start_work_mutex;
131 	pthread_mutex_t	 work_done_wait_mutex;
132 	int		 profile_cpu;
133 /*
134  * Track the current task - that way we can detect weird events,
135  * such as a task being switched away that is not current.
136  */
137 	int		 max_cpu;
138 	u32		 curr_pid[MAX_CPUS];
139 	struct thread	 *curr_thread[MAX_CPUS];
140 	char		 next_shortname1;
141 	char		 next_shortname2;
142 	unsigned int	 replay_repeat;
143 	unsigned long	 nr_run_events;
144 	unsigned long	 nr_sleep_events;
145 	unsigned long	 nr_wakeup_events;
146 	unsigned long	 nr_sleep_corrections;
147 	unsigned long	 nr_run_events_optimized;
148 	unsigned long	 targetless_wakeups;
149 	unsigned long	 multitarget_wakeups;
150 	unsigned long	 nr_runs;
151 	unsigned long	 nr_timestamps;
152 	unsigned long	 nr_unordered_timestamps;
153 	unsigned long	 nr_context_switch_bugs;
154 	unsigned long	 nr_events;
155 	unsigned long	 nr_lost_chunks;
156 	unsigned long	 nr_lost_events;
157 	u64		 run_measurement_overhead;
158 	u64		 sleep_measurement_overhead;
159 	u64		 start_time;
160 	u64		 cpu_usage;
161 	u64		 runavg_cpu_usage;
162 	u64		 parent_cpu_usage;
163 	u64		 runavg_parent_cpu_usage;
164 	u64		 sum_runtime;
165 	u64		 sum_fluct;
166 	u64		 run_avg;
167 	u64		 all_runtime;
168 	u64		 all_count;
169 	u64		 cpu_last_switched[MAX_CPUS];
170 	struct rb_root	 atom_root, sorted_atom_root;
171 	struct list_head sort_list, cmp_pid;
172 };
173 
174 static u64 get_nsecs(void)
175 {
176 	struct timespec ts;
177 
178 	clock_gettime(CLOCK_MONOTONIC, &ts);
179 
180 	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
181 }
182 
183 static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
184 {
185 	u64 T0 = get_nsecs(), T1;
186 
187 	do {
188 		T1 = get_nsecs();
189 	} while (T1 + sched->run_measurement_overhead < T0 + nsecs);
190 }
191 
192 static void sleep_nsecs(u64 nsecs)
193 {
194 	struct timespec ts;
195 
196 	ts.tv_nsec = nsecs % 1000000000;
197 	ts.tv_sec = nsecs / 1000000000;
198 
199 	nanosleep(&ts, NULL);
200 }
201 
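/*
 * Estimate the fixed cost of the timing loop and of a minimal
 * nanosleep: burn_nsecs() subtracts the measured run overhead so that
 * replayed run intervals come out close to the recorded durations.
 */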
202 static void calibrate_run_measurement_overhead(struct perf_sched *sched)
203 {
204 	u64 T0, T1, delta, min_delta = 1000000000ULL;
205 	int i;
206 
207 	for (i = 0; i < 10; i++) {
208 		T0 = get_nsecs();
209 		burn_nsecs(sched, 0);
210 		T1 = get_nsecs();
211 		delta = T1-T0;
212 		min_delta = min(min_delta, delta);
213 	}
214 	sched->run_measurement_overhead = min_delta;
215 
216 	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
217 }
218 
219 static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
220 {
221 	u64 T0, T1, delta, min_delta = 1000000000ULL;
222 	int i;
223 
224 	for (i = 0; i < 10; i++) {
225 		T0 = get_nsecs();
226 		sleep_nsecs(10000);
227 		T1 = get_nsecs();
228 		delta = T1-T0;
229 		min_delta = min(min_delta, delta);
230 	}
231 	min_delta -= 10000;
232 	sched->sleep_measurement_overhead = min_delta;
233 
234 	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
235 }
236 
237 static struct sched_atom *
238 get_new_event(struct task_desc *task, u64 timestamp)
239 {
240 	struct sched_atom *event = zalloc(sizeof(*event));
241 	unsigned long idx = task->nr_events;
242 	size_t size;
243 
244 	event->timestamp = timestamp;
245 	event->nr = idx;
246 
247 	task->nr_events++;
248 	size = sizeof(struct sched_atom *) * task->nr_events;
249 	task->atoms = realloc(task->atoms, size);
250 	BUG_ON(!task->atoms);
251 
252 	task->atoms[idx] = event;
253 
254 	return event;
255 }
256 
257 static struct sched_atom *last_event(struct task_desc *task)
258 {
259 	if (!task->nr_events)
260 		return NULL;
261 
262 	return task->atoms[task->nr_events - 1];
263 }
264 
265 static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
266 				u64 timestamp, u64 duration)
267 {
268 	struct sched_atom *event, *curr_event = last_event(task);
269 
270 	/*
271 	 * optimize an existing RUN event by merging this one
272 	 * into it:
273 	 */
274 	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
275 		sched->nr_run_events_optimized++;
276 		curr_event->duration += duration;
277 		return;
278 	}
279 
280 	event = get_new_event(task, timestamp);
281 
282 	event->type = SCHED_EVENT_RUN;
283 	event->duration = duration;
284 
285 	sched->nr_run_events++;
286 }
287 
288 static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
289 				   u64 timestamp, struct task_desc *wakee)
290 {
291 	struct sched_atom *event, *wakee_event;
292 
293 	event = get_new_event(task, timestamp);
294 	event->type = SCHED_EVENT_WAKEUP;
295 	event->wakee = wakee;
296 
297 	wakee_event = last_event(wakee);
298 	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
299 		sched->targetless_wakeups++;
300 		return;
301 	}
302 	if (wakee_event->wait_sem) {
303 		sched->multitarget_wakeups++;
304 		return;
305 	}
306 
307 	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
308 	sem_init(wakee_event->wait_sem, 0, 0);
309 	wakee_event->specific_wait = 1;
310 	event->wait_sem = wakee_event->wait_sem;
311 
312 	sched->nr_wakeup_events++;
313 }
314 
315 static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
316 				  u64 timestamp, u64 task_state __maybe_unused)
317 {
318 	struct sched_atom *event = get_new_event(task, timestamp);
319 
320 	event->type = SCHED_EVENT_SLEEP;
321 
322 	sched->nr_sleep_events++;
323 }
324 
325 static struct task_desc *register_pid(struct perf_sched *sched,
326 				      unsigned long pid, const char *comm)
327 {
328 	struct task_desc *task;
329 
330 	BUG_ON(pid >= MAX_PID);
331 
332 	task = sched->pid_to_task[pid];
333 
334 	if (task)
335 		return task;
336 
337 	task = zalloc(sizeof(*task));
338 	task->pid = pid;
339 	task->nr = sched->nr_tasks;
340 	strcpy(task->comm, comm);
341 	/*
342 	 * every task starts in sleeping state - this gets ignored
343 	 * if there's no wakeup pointing to this sleep state:
344 	 */
345 	add_sched_event_sleep(sched, task, 0, 0);
346 
347 	sched->pid_to_task[pid] = task;
348 	sched->nr_tasks++;
349 	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
350 	BUG_ON(!sched->tasks);
351 	sched->tasks[task->nr] = task;
352 
353 	if (verbose)
354 		printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
355 
356 	return task;
357 }
358 
359 
360 static void print_task_traces(struct perf_sched *sched)
361 {
362 	struct task_desc *task;
363 	unsigned long i;
364 
365 	for (i = 0; i < sched->nr_tasks; i++) {
366 		task = sched->tasks[i];
367 		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
368 			task->nr, task->comm, task->pid, task->nr_events);
369 	}
370 }
371 
372 static void add_cross_task_wakeups(struct perf_sched *sched)
373 {
374 	struct task_desc *task1, *task2;
375 	unsigned long i, j;
376 
377 	for (i = 0; i < sched->nr_tasks; i++) {
378 		task1 = sched->tasks[i];
379 		j = i + 1;
380 		if (j == sched->nr_tasks)
381 			j = 0;
382 		task2 = sched->tasks[j];
383 		add_sched_event_wakeup(sched, task1, 0, task2);
384 	}
385 }
386 
387 static void perf_sched__process_event(struct perf_sched *sched,
388 				      struct sched_atom *atom)
389 {
390 	int ret = 0;
391 
392 	switch (atom->type) {
393 		case SCHED_EVENT_RUN:
394 			burn_nsecs(sched, atom->duration);
395 			break;
396 		case SCHED_EVENT_SLEEP:
397 			if (atom->wait_sem)
398 				ret = sem_wait(atom->wait_sem);
399 			BUG_ON(ret);
400 			break;
401 		case SCHED_EVENT_WAKEUP:
402 			if (atom->wait_sem)
403 				ret = sem_post(atom->wait_sem);
404 			BUG_ON(ret);
405 			break;
406 		case SCHED_EVENT_MIGRATION:
407 			break;
408 		default:
409 			BUG_ON(1);
410 	}
411 }
412 
413 static u64 get_cpu_usage_nsec_parent(void)
414 {
415 	struct rusage ru;
416 	u64 sum;
417 	int err;
418 
419 	err = getrusage(RUSAGE_SELF, &ru);
420 	BUG_ON(err);
421 
422 	sum =  ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
423 	sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;
424 
425 	return sum;
426 }
427 
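/*
 * Open a per-thread software task-clock counter so that each replay
 * thread can read its own CPU time directly from the kernel.
 */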
428 static int self_open_counters(void)
429 {
430 	struct perf_event_attr attr;
431 	char sbuf[STRERR_BUFSIZE];
432 	int fd;
433 
434 	memset(&attr, 0, sizeof(attr));
435 
436 	attr.type = PERF_TYPE_SOFTWARE;
437 	attr.config = PERF_COUNT_SW_TASK_CLOCK;
438 
439 	fd = sys_perf_event_open(&attr, 0, -1, -1,
440 				 perf_event_open_cloexec_flag());
441 
442 	if (fd < 0)
443 		pr_err("Error: sys_perf_event_open() syscall returned "
444 		       "with %d (%s)\n", fd,
445 		       strerror_r(errno, sbuf, sizeof(sbuf)));
446 	return fd;
447 }
448 
449 static u64 get_cpu_usage_nsec_self(int fd)
450 {
451 	u64 runtime;
452 	int ret;
453 
454 	ret = read(fd, &runtime, sizeof(runtime));
455 	BUG_ON(ret != sizeof(runtime));
456 
457 	return runtime;
458 }
459 
460 struct sched_thread_parms {
461 	struct task_desc  *task;
462 	struct perf_sched *sched;
463 };
464 
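/*
 * Body of every replay thread: announce readiness, wait for the parent
 * to release start_work_mutex, replay this task's atoms while measuring
 * its CPU usage, signal completion, then block on work_done_wait_mutex
 * until the next iteration.
 */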
465 static void *thread_func(void *ctx)
466 {
467 	struct sched_thread_parms *parms = ctx;
468 	struct task_desc *this_task = parms->task;
469 	struct perf_sched *sched = parms->sched;
470 	u64 cpu_usage_0, cpu_usage_1;
471 	unsigned long i, ret;
472 	char comm2[22];
473 	int fd;
474 
475 	zfree(&parms);
476 
477 	sprintf(comm2, ":%s", this_task->comm);
478 	prctl(PR_SET_NAME, comm2);
479 	fd = self_open_counters();
480 	if (fd < 0)
481 		return NULL;
482 again:
483 	ret = sem_post(&this_task->ready_for_work);
484 	BUG_ON(ret);
485 	ret = pthread_mutex_lock(&sched->start_work_mutex);
486 	BUG_ON(ret);
487 	ret = pthread_mutex_unlock(&sched->start_work_mutex);
488 	BUG_ON(ret);
489 
490 	cpu_usage_0 = get_cpu_usage_nsec_self(fd);
491 
492 	for (i = 0; i < this_task->nr_events; i++) {
493 		this_task->curr_event = i;
494 		perf_sched__process_event(sched, this_task->atoms[i]);
495 	}
496 
497 	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
498 	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
499 	ret = sem_post(&this_task->work_done_sem);
500 	BUG_ON(ret);
501 
502 	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
503 	BUG_ON(ret);
504 	ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
505 	BUG_ON(ret);
506 
507 	goto again;
508 }
509 
510 static void create_tasks(struct perf_sched *sched)
511 {
512 	struct task_desc *task;
513 	pthread_attr_t attr;
514 	unsigned long i;
515 	int err;
516 
517 	err = pthread_attr_init(&attr);
518 	BUG_ON(err);
519 	err = pthread_attr_setstacksize(&attr,
520 			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
521 	BUG_ON(err);
522 	err = pthread_mutex_lock(&sched->start_work_mutex);
523 	BUG_ON(err);
524 	err = pthread_mutex_lock(&sched->work_done_wait_mutex);
525 	BUG_ON(err);
526 	for (i = 0; i < sched->nr_tasks; i++) {
527 		struct sched_thread_parms *parms = malloc(sizeof(*parms));
528 		BUG_ON(parms == NULL);
529 		parms->task = task = sched->tasks[i];
530 		parms->sched = sched;
531 		sem_init(&task->sleep_sem, 0, 0);
532 		sem_init(&task->ready_for_work, 0, 0);
533 		sem_init(&task->work_done_sem, 0, 0);
534 		task->curr_event = 0;
535 		err = pthread_create(&task->thread, &attr, thread_func, parms);
536 		BUG_ON(err);
537 	}
538 }
539 
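/*
 * Parent side of the replay barrier: collect all ready_for_work posts,
 * release the workers by unlocking start_work_mutex, wait for their
 * work_done_sem posts, and fold the per-task and parent CPU usage into
 * decaying running averages.
 */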
540 static void wait_for_tasks(struct perf_sched *sched)
541 {
542 	u64 cpu_usage_0, cpu_usage_1;
543 	struct task_desc *task;
544 	unsigned long i, ret;
545 
546 	sched->start_time = get_nsecs();
547 	sched->cpu_usage = 0;
548 	pthread_mutex_unlock(&sched->work_done_wait_mutex);
549 
550 	for (i = 0; i < sched->nr_tasks; i++) {
551 		task = sched->tasks[i];
552 		ret = sem_wait(&task->ready_for_work);
553 		BUG_ON(ret);
554 		sem_init(&task->ready_for_work, 0, 0);
555 	}
556 	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
557 	BUG_ON(ret);
558 
559 	cpu_usage_0 = get_cpu_usage_nsec_parent();
560 
561 	pthread_mutex_unlock(&sched->start_work_mutex);
562 
563 	for (i = 0; i < sched->nr_tasks; i++) {
564 		task = sched->tasks[i];
565 		ret = sem_wait(&task->work_done_sem);
566 		BUG_ON(ret);
567 		sem_init(&task->work_done_sem, 0, 0);
568 		sched->cpu_usage += task->cpu_usage;
569 		task->cpu_usage = 0;
570 	}
571 
572 	cpu_usage_1 = get_cpu_usage_nsec_parent();
573 	if (!sched->runavg_cpu_usage)
574 		sched->runavg_cpu_usage = sched->cpu_usage;
575 	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * 9 + sched->cpu_usage) / 10;
576 
577 	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
578 	if (!sched->runavg_parent_cpu_usage)
579 		sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
580 	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * 9 +
581 					 sched->parent_cpu_usage)/10;
582 
583 	ret = pthread_mutex_lock(&sched->start_work_mutex);
584 	BUG_ON(ret);
585 
586 	for (i = 0; i < sched->nr_tasks; i++) {
587 		task = sched->tasks[i];
588 		sem_init(&task->sleep_sem, 0, 0);
589 		task->curr_event = 0;
590 	}
591 }
592 
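/*
 * One replay iteration: the wall-clock delta of wait_for_tasks() feeds
 * a decaying average, run_avg = (run_avg * 9 + delta) / 10, i.e. an
 * exponential moving average that gives a weight of 1/10 to the newest
 * sample.
 */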
593 static void run_one_test(struct perf_sched *sched)
594 {
595 	u64 T0, T1, delta, avg_delta, fluct;
596 
597 	T0 = get_nsecs();
598 	wait_for_tasks(sched);
599 	T1 = get_nsecs();
600 
601 	delta = T1 - T0;
602 	sched->sum_runtime += delta;
603 	sched->nr_runs++;
604 
605 	avg_delta = sched->sum_runtime / sched->nr_runs;
606 	if (delta < avg_delta)
607 		fluct = avg_delta - delta;
608 	else
609 		fluct = delta - avg_delta;
610 	sched->sum_fluct += fluct;
611 	if (!sched->run_avg)
612 		sched->run_avg = delta;
613 	sched->run_avg = (sched->run_avg * 9 + delta) / 10;
614 
615 	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0);
616 
617 	printf("ravg: %0.2f, ", (double)sched->run_avg / 1e6);
618 
619 	printf("cpu: %0.2f / %0.2f",
620 		(double)sched->cpu_usage / 1e6, (double)sched->runavg_cpu_usage / 1e6);
621 
622 #if 0
623 	/*
624 	 * rusage statistics done by the parent, these are less
625 	 * accurate than the sched->sum_exec_runtime based statistics:
626 	 */
627 	printf(" [%0.2f / %0.2f]",
628 		(double)sched->parent_cpu_usage/1e6,
629 		(double)sched->runavg_parent_cpu_usage/1e6);
630 #endif
631 
632 	printf("\n");
633 
634 	if (sched->nr_sleep_corrections)
635 		printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
636 	sched->nr_sleep_corrections = 0;
637 }
638 
639 static void test_calibrations(struct perf_sched *sched)
640 {
641 	u64 T0, T1;
642 
643 	T0 = get_nsecs();
644 	burn_nsecs(sched, 1e6);
645 	T1 = get_nsecs();
646 
647 	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
648 
649 	T0 = get_nsecs();
650 	sleep_nsecs(1e6);
651 	T1 = get_nsecs();
652 
653 	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
654 }
655 
656 static int
657 replay_wakeup_event(struct perf_sched *sched,
658 		    struct perf_evsel *evsel, struct perf_sample *sample,
659 		    struct machine *machine __maybe_unused)
660 {
661 	const char *comm = perf_evsel__strval(evsel, sample, "comm");
662 	const u32 pid	 = perf_evsel__intval(evsel, sample, "pid");
663 	struct task_desc *waker, *wakee;
664 
665 	if (verbose) {
666 		printf("sched_wakeup event %p\n", evsel);
667 
668 		printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
669 	}
670 
671 	waker = register_pid(sched, sample->tid, "<unknown>");
672 	wakee = register_pid(sched, pid, comm);
673 
674 	add_sched_event_wakeup(sched, waker, sample->time, wakee);
675 	return 0;
676 }
677 
678 static int replay_switch_event(struct perf_sched *sched,
679 			       struct perf_evsel *evsel,
680 			       struct perf_sample *sample,
681 			       struct machine *machine __maybe_unused)
682 {
683 	const char *prev_comm  = perf_evsel__strval(evsel, sample, "prev_comm"),
684 		   *next_comm  = perf_evsel__strval(evsel, sample, "next_comm");
685 	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
686 		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
687 	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
688 	struct task_desc *prev, __maybe_unused *next;
689 	u64 timestamp0, timestamp = sample->time;
690 	int cpu = sample->cpu;
691 	s64 delta;
692 
693 	if (verbose)
694 		printf("sched_switch event %p\n", evsel);
695 
696 	if (cpu >= MAX_CPUS || cpu < 0)
697 		return 0;
698 
699 	timestamp0 = sched->cpu_last_switched[cpu];
700 	if (timestamp0)
701 		delta = timestamp - timestamp0;
702 	else
703 		delta = 0;
704 
705 	if (delta < 0) {
706 		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
707 		return -1;
708 	}
709 
710 	pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
711 		 prev_comm, prev_pid, next_comm, next_pid, delta);
712 
713 	prev = register_pid(sched, prev_pid, prev_comm);
714 	next = register_pid(sched, next_pid, next_comm);
715 
716 	sched->cpu_last_switched[cpu] = timestamp;
717 
718 	add_sched_event_run(sched, prev, timestamp, delta);
719 	add_sched_event_sleep(sched, prev, timestamp, prev_state);
720 
721 	return 0;
722 }
723 
724 static int replay_fork_event(struct perf_sched *sched,
725 			     union perf_event *event,
726 			     struct machine *machine)
727 {
728 	struct thread *child, *parent;
729 
730 	child = machine__findnew_thread(machine, event->fork.pid,
731 					event->fork.tid);
732 	parent = machine__findnew_thread(machine, event->fork.ppid,
733 					 event->fork.ptid);
734 
735 	if (child == NULL || parent == NULL) {
736 		pr_debug("thread does not exist on fork event: child %p, parent %p\n",
737 				 child, parent);
738 		return 0;
739 	}
740 
741 	if (verbose) {
742 		printf("fork event\n");
743 		printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
744 		printf("...  child: %s/%d\n", thread__comm_str(child), child->tid);
745 	}
746 
747 	register_pid(sched, parent->tid, thread__comm_str(parent));
748 	register_pid(sched, child->tid, thread__comm_str(child));
749 	return 0;
750 }
751 
752 struct sort_dimension {
753 	const char		*name;
754 	sort_fn_t		cmp;
755 	struct list_head	list;
756 };
757 
758 static int
759 thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
760 {
761 	struct sort_dimension *sort;
762 	int ret = 0;
763 
764 	BUG_ON(list_empty(list));
765 
766 	list_for_each_entry(sort, list, list) {
767 		ret = sort->cmp(l, r);
768 		if (ret)
769 			return ret;
770 	}
771 
772 	return ret;
773 }
774 
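/*
 * Per-thread work_atoms containers live in an rb-tree ordered by the
 * cmp_pid sort list; lookup and insertion below share thread_lat_cmp()
 * as the comparison function.
 */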
775 static struct work_atoms *
776 thread_atoms_search(struct rb_root *root, struct thread *thread,
777 			 struct list_head *sort_list)
778 {
779 	struct rb_node *node = root->rb_node;
780 	struct work_atoms key = { .thread = thread };
781 
782 	while (node) {
783 		struct work_atoms *atoms;
784 		int cmp;
785 
786 		atoms = container_of(node, struct work_atoms, node);
787 
788 		cmp = thread_lat_cmp(sort_list, &key, atoms);
789 		if (cmp > 0)
790 			node = node->rb_left;
791 		else if (cmp < 0)
792 			node = node->rb_right;
793 		else {
794 			BUG_ON(thread != atoms->thread);
795 			return atoms;
796 		}
797 	}
798 	return NULL;
799 }
800 
801 static void
802 __thread_latency_insert(struct rb_root *root, struct work_atoms *data,
803 			 struct list_head *sort_list)
804 {
805 	struct rb_node **new = &(root->rb_node), *parent = NULL;
806 
807 	while (*new) {
808 		struct work_atoms *this;
809 		int cmp;
810 
811 		this = container_of(*new, struct work_atoms, node);
812 		parent = *new;
813 
814 		cmp = thread_lat_cmp(sort_list, data, this);
815 
816 		if (cmp > 0)
817 			new = &((*new)->rb_left);
818 		else
819 			new = &((*new)->rb_right);
820 	}
821 
822 	rb_link_node(&data->node, parent, new);
823 	rb_insert_color(&data->node, root);
824 }
825 
826 static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
827 {
828 	struct work_atoms *atoms = zalloc(sizeof(*atoms));
829 	if (!atoms) {
830 		pr_err("No memory at %s\n", __func__);
831 		return -1;
832 	}
833 
834 	atoms->thread = thread;
835 	INIT_LIST_HEAD(&atoms->work_list);
836 	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
837 	return 0;
838 }
839 
840 static char sched_out_state(u64 prev_state)
841 {
842 	const char *str = TASK_STATE_TO_CHAR_STR;
843 
844 	return str[prev_state];
845 }
846 
847 static int
848 add_sched_out_event(struct work_atoms *atoms,
849 		    char run_state,
850 		    u64 timestamp)
851 {
852 	struct work_atom *atom = zalloc(sizeof(*atom));
853 	if (!atom) {
854 		pr_err("No memory at %s\n", __func__);
855 		return -1;
856 	}
857 
858 	atom->sched_out_time = timestamp;
859 
860 	if (run_state == 'R') {
861 		atom->state = THREAD_WAIT_CPU;
862 		atom->wake_up_time = atom->sched_out_time;
863 	}
864 
865 	list_add_tail(&atom->list, &atoms->work_list);
866 	return 0;
867 }
868 
869 static void
870 add_runtime_event(struct work_atoms *atoms, u64 delta,
871 		  u64 timestamp __maybe_unused)
872 {
873 	struct work_atom *atom;
874 
875 	BUG_ON(list_empty(&atoms->work_list));
876 
877 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
878 
879 	atom->runtime += delta;
880 	atoms->total_runtime += delta;
881 }
882 
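/*
 * The wakeup latency of an atom is sched_in_time - wake_up_time; it is
 * accumulated into total_lat and tracked as max_lat together with the
 * timestamp at which the maximum occurred.
 */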
883 static void
884 add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
885 {
886 	struct work_atom *atom;
887 	u64 delta;
888 
889 	if (list_empty(&atoms->work_list))
890 		return;
891 
892 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
893 
894 	if (atom->state != THREAD_WAIT_CPU)
895 		return;
896 
897 	if (timestamp < atom->wake_up_time) {
898 		atom->state = THREAD_IGNORE;
899 		return;
900 	}
901 
902 	atom->state = THREAD_SCHED_IN;
903 	atom->sched_in_time = timestamp;
904 
905 	delta = atom->sched_in_time - atom->wake_up_time;
906 	atoms->total_lat += delta;
907 	if (delta > atoms->max_lat) {
908 		atoms->max_lat = delta;
909 		atoms->max_lat_at = timestamp;
910 	}
911 	atoms->nb_atoms++;
912 }
913 
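/*
 * sched_switch handler for 'perf sched latency': close the current
 * atom of the task being switched out and open (or update) an atom
 * for the task being switched in.
 */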
914 static int latency_switch_event(struct perf_sched *sched,
915 				struct perf_evsel *evsel,
916 				struct perf_sample *sample,
917 				struct machine *machine)
918 {
919 	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
920 		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
921 	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
922 	struct work_atoms *out_events, *in_events;
923 	struct thread *sched_out, *sched_in;
924 	u64 timestamp0, timestamp = sample->time;
925 	int cpu = sample->cpu;
926 	s64 delta;
927 
928 	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
929 
930 	timestamp0 = sched->cpu_last_switched[cpu];
931 	sched->cpu_last_switched[cpu] = timestamp;
932 	if (timestamp0)
933 		delta = timestamp - timestamp0;
934 	else
935 		delta = 0;
936 
937 	if (delta < 0) {
938 		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
939 		return -1;
940 	}
941 
942 	sched_out = machine__findnew_thread(machine, -1, prev_pid);
943 	sched_in = machine__findnew_thread(machine, -1, next_pid);
944 
945 	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
946 	if (!out_events) {
947 		if (thread_atoms_insert(sched, sched_out))
948 			return -1;
949 		out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
950 		if (!out_events) {
951 			pr_err("out-event: Internal tree error");
952 			return -1;
953 		}
954 	}
955 	if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
956 		return -1;
957 
958 	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
959 	if (!in_events) {
960 		if (thread_atoms_insert(sched, sched_in))
961 			return -1;
962 		in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
963 		if (!in_events) {
964 			pr_err("in-event: Internal tree error");
965 			return -1;
966 		}
967 		/*
968 		 * Task came in that we have not heard about yet,
969 		 * add an initial atom in runnable state:
970 		 */
971 		if (add_sched_out_event(in_events, 'R', timestamp))
972 			return -1;
973 	}
974 	add_sched_in_event(in_events, timestamp);
975 
976 	return 0;
977 }
978 
979 static int latency_runtime_event(struct perf_sched *sched,
980 				 struct perf_evsel *evsel,
981 				 struct perf_sample *sample,
982 				 struct machine *machine)
983 {
984 	const u32 pid	   = perf_evsel__intval(evsel, sample, "pid");
985 	const u64 runtime  = perf_evsel__intval(evsel, sample, "runtime");
986 	struct thread *thread = machine__findnew_thread(machine, -1, pid);
987 	struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
988 	u64 timestamp = sample->time;
989 	int cpu = sample->cpu;
990 
991 	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
992 	if (!atoms) {
993 		if (thread_atoms_insert(sched, thread))
994 			return -1;
995 		atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
996 		if (!atoms) {
997 			pr_err("in-event: Internal tree error");
998 			return -1;
999 		}
1000 		if (add_sched_out_event(atoms, 'R', timestamp))
1001 			return -1;
1002 	}
1003 
1004 	add_runtime_event(atoms, runtime, timestamp);
1005 	return 0;
1006 }
1007 
1008 static int latency_wakeup_event(struct perf_sched *sched,
1009 				struct perf_evsel *evsel,
1010 				struct perf_sample *sample,
1011 				struct machine *machine)
1012 {
1013 	const u32 pid	  = perf_evsel__intval(evsel, sample, "pid");
1014 	struct work_atoms *atoms;
1015 	struct work_atom *atom;
1016 	struct thread *wakee;
1017 	u64 timestamp = sample->time;
1018 
1019 	wakee = machine__findnew_thread(machine, -1, pid);
1020 	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1021 	if (!atoms) {
1022 		if (thread_atoms_insert(sched, wakee))
1023 			return -1;
1024 		atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1025 		if (!atoms) {
1026 			pr_err("wakeup-event: Internal tree error");
1027 			return -1;
1028 		}
1029 		if (add_sched_out_event(atoms, 'S', timestamp))
1030 			return -1;
1031 	}
1032 
1033 	BUG_ON(list_empty(&atoms->work_list));
1034 
1035 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1036 
1037 	/*
1038 	 * A wakeup event is not guaranteed to happen while the task is
1039 	 * off the run queue; it may also fire while the task is already
1040 	 * on the run queue and merely changes ->state to TASK_RUNNING.
1041 	 * In that case we should not set ->wake_up_time for a task that
1042 	 * is already on the run queue.
1043 	 *
1044 	 * You WILL be missing events if you've recorded only
1045 	 * one CPU, or are only looking at one, so don't
1046 	 * skip in this case.
1047 	 */
1048 	if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
1049 		return 0;
1050 
1051 	sched->nr_timestamps++;
1052 	if (atom->sched_out_time > timestamp) {
1053 		sched->nr_unordered_timestamps++;
1054 		return 0;
1055 	}
1056 
1057 	atom->state = THREAD_WAIT_CPU;
1058 	atom->wake_up_time = timestamp;
1059 	return 0;
1060 }
1061 
1062 static int latency_migrate_task_event(struct perf_sched *sched,
1063 				      struct perf_evsel *evsel,
1064 				      struct perf_sample *sample,
1065 				      struct machine *machine)
1066 {
1067 	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
1068 	u64 timestamp = sample->time;
1069 	struct work_atoms *atoms;
1070 	struct work_atom *atom;
1071 	struct thread *migrant;
1072 
1073 	/*
1074 	 * Only need to worry about migration when profiling one CPU.
1075 	 */
1076 	if (sched->profile_cpu == -1)
1077 		return 0;
1078 
1079 	migrant = machine__findnew_thread(machine, -1, pid);
1080 	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1081 	if (!atoms) {
1082 		if (thread_atoms_insert(sched, migrant))
1083 			return -1;
1084 		register_pid(sched, migrant->tid, thread__comm_str(migrant));
1085 		atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1086 		if (!atoms) {
1087 			pr_err("migration-event: Internal tree error");
1088 			return -1;
1089 		}
1090 		if (add_sched_out_event(atoms, 'R', timestamp))
1091 			return -1;
1092 	}
1093 
1094 	BUG_ON(list_empty(&atoms->work_list));
1095 
1096 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1097 	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
1098 
1099 	sched->nr_timestamps++;
1100 
1101 	if (atom->sched_out_time > timestamp)
1102 		sched->nr_unordered_timestamps++;
1103 
1104 	return 0;
1105 }
1106 
1107 static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
1108 {
1109 	int i;
1110 	int ret;
1111 	u64 avg;
1112 
1113 	if (!work_list->nb_atoms)
1114 		return;
1115 	/*
1116 	 * Ignore idle threads:
1117 	 */
1118 	if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
1119 		return;
1120 
1121 	sched->all_runtime += work_list->total_runtime;
1122 	sched->all_count   += work_list->nb_atoms;
1123 
1124 	ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
1125 
1126 	for (i = 0; i < 24 - ret; i++)
1127 		printf(" ");
1128 
1129 	avg = work_list->total_lat / work_list->nb_atoms;
1130 
1131 	printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %13.6f s\n",
1132 	      (double)work_list->total_runtime / 1e6,
1133 		 work_list->nb_atoms, (double)avg / 1e6,
1134 		 (double)work_list->max_lat / 1e6,
1135 		 (double)work_list->max_lat_at / 1e9);
1136 }
1137 
1138 static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
1139 {
1140 	if (l->thread->tid < r->thread->tid)
1141 		return -1;
1142 	if (l->thread->tid > r->thread->tid)
1143 		return 1;
1144 
1145 	return 0;
1146 }
1147 
1148 static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
1149 {
1150 	u64 avgl, avgr;
1151 
1152 	if (!l->nb_atoms)
1153 		return -1;
1154 
1155 	if (!r->nb_atoms)
1156 		return 1;
1157 
1158 	avgl = l->total_lat / l->nb_atoms;
1159 	avgr = r->total_lat / r->nb_atoms;
1160 
1161 	if (avgl < avgr)
1162 		return -1;
1163 	if (avgl > avgr)
1164 		return 1;
1165 
1166 	return 0;
1167 }
1168 
1169 static int max_cmp(struct work_atoms *l, struct work_atoms *r)
1170 {
1171 	if (l->max_lat < r->max_lat)
1172 		return -1;
1173 	if (l->max_lat > r->max_lat)
1174 		return 1;
1175 
1176 	return 0;
1177 }
1178 
1179 static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
1180 {
1181 	if (l->nb_atoms < r->nb_atoms)
1182 		return -1;
1183 	if (l->nb_atoms > r->nb_atoms)
1184 		return 1;
1185 
1186 	return 0;
1187 }
1188 
1189 static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
1190 {
1191 	if (l->total_runtime < r->total_runtime)
1192 		return -1;
1193 	if (l->total_runtime > r->total_runtime)
1194 		return 1;
1195 
1196 	return 0;
1197 }
1198 
1199 static int sort_dimension__add(const char *tok, struct list_head *list)
1200 {
1201 	size_t i;
1202 	static struct sort_dimension avg_sort_dimension = {
1203 		.name = "avg",
1204 		.cmp  = avg_cmp,
1205 	};
1206 	static struct sort_dimension max_sort_dimension = {
1207 		.name = "max",
1208 		.cmp  = max_cmp,
1209 	};
1210 	static struct sort_dimension pid_sort_dimension = {
1211 		.name = "pid",
1212 		.cmp  = pid_cmp,
1213 	};
1214 	static struct sort_dimension runtime_sort_dimension = {
1215 		.name = "runtime",
1216 		.cmp  = runtime_cmp,
1217 	};
1218 	static struct sort_dimension switch_sort_dimension = {
1219 		.name = "switch",
1220 		.cmp  = switch_cmp,
1221 	};
1222 	struct sort_dimension *available_sorts[] = {
1223 		&pid_sort_dimension,
1224 		&avg_sort_dimension,
1225 		&max_sort_dimension,
1226 		&switch_sort_dimension,
1227 		&runtime_sort_dimension,
1228 	};
1229 
1230 	for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
1231 		if (!strcmp(available_sorts[i]->name, tok)) {
1232 			list_add_tail(&available_sorts[i]->list, list);
1233 
1234 			return 0;
1235 		}
1236 	}
1237 
1238 	return -1;
1239 }
1240 
1241 static void perf_sched__sort_lat(struct perf_sched *sched)
1242 {
1243 	struct rb_node *node;
1244 
1245 	for (;;) {
1246 		struct work_atoms *data;
1247 		node = rb_first(&sched->atom_root);
1248 		if (!node)
1249 			break;
1250 
1251 		rb_erase(node, &sched->atom_root);
1252 		data = rb_entry(node, struct work_atoms, node);
1253 		__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
1254 	}
1255 }
1256 
1257 static int process_sched_wakeup_event(struct perf_tool *tool,
1258 				      struct perf_evsel *evsel,
1259 				      struct perf_sample *sample,
1260 				      struct machine *machine)
1261 {
1262 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1263 
1264 	if (sched->tp_handler->wakeup_event)
1265 		return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
1266 
1267 	return 0;
1268 }
1269 
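/*
 * 'perf sched map' handler: print one row per sched_switch, with a
 * column per CPU showing the two-character shortname of the task
 * currently running there ('.' for swapper, '*' marking the CPU where
 * this switch happened).
 */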
1270 static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
1271 			    struct perf_sample *sample, struct machine *machine)
1272 {
1273 	const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1274 	struct thread *sched_in;
1275 	int new_shortname;
1276 	u64 timestamp0, timestamp = sample->time;
1277 	s64 delta;
1278 	int cpu, this_cpu = sample->cpu;
1279 
1280 	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
1281 
1282 	if (this_cpu > sched->max_cpu)
1283 		sched->max_cpu = this_cpu;
1284 
1285 	timestamp0 = sched->cpu_last_switched[this_cpu];
1286 	sched->cpu_last_switched[this_cpu] = timestamp;
1287 	if (timestamp0)
1288 		delta = timestamp - timestamp0;
1289 	else
1290 		delta = 0;
1291 
1292 	if (delta < 0) {
1293 		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
1294 		return -1;
1295 	}
1296 
1297 	sched_in = machine__findnew_thread(machine, -1, next_pid);
1298 
1299 	sched->curr_thread[this_cpu] = sched_in;
1300 
1301 	printf("  ");
1302 
1303 	new_shortname = 0;
1304 	if (!sched_in->shortname[0]) {
1305 		if (!strcmp(thread__comm_str(sched_in), "swapper")) {
1306 			/*
1307 			 * Don't allocate a letter-number for swapper:0
1308 			 * as a shortname. Instead, we use '.' for it.
1309 			 */
1310 			sched_in->shortname[0] = '.';
1311 			sched_in->shortname[1] = ' ';
1312 		} else {
1313 			sched_in->shortname[0] = sched->next_shortname1;
1314 			sched_in->shortname[1] = sched->next_shortname2;
1315 
1316 			if (sched->next_shortname1 < 'Z') {
1317 				sched->next_shortname1++;
1318 			} else {
1319 				sched->next_shortname1 = 'A';
1320 				if (sched->next_shortname2 < '9')
1321 					sched->next_shortname2++;
1322 				else
1323 					sched->next_shortname2 = '0';
1324 			}
1325 		}
1326 		new_shortname = 1;
1327 	}
1328 
1329 	for (cpu = 0; cpu <= sched->max_cpu; cpu++) {
1330 		if (cpu != this_cpu)
1331 			printf(" ");
1332 		else
1333 			printf("*");
1334 
1335 		if (sched->curr_thread[cpu])
1336 			printf("%2s ", sched->curr_thread[cpu]->shortname);
1337 		else
1338 			printf("   ");
1339 	}
1340 
1341 	printf("  %12.6f secs ", (double)timestamp/1e9);
1342 	if (new_shortname) {
1343 		printf("%s => %s:%d\n",
1344 		       sched_in->shortname, thread__comm_str(sched_in), sched_in->tid);
1345 	} else {
1346 		printf("\n");
1347 	}
1348 
1349 	return 0;
1350 }
1351 
1352 static int process_sched_switch_event(struct perf_tool *tool,
1353 				      struct perf_evsel *evsel,
1354 				      struct perf_sample *sample,
1355 				      struct machine *machine)
1356 {
1357 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1358 	int this_cpu = sample->cpu, err = 0;
1359 	u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
1360 	    next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1361 
1362 	if (sched->curr_pid[this_cpu] != (u32)-1) {
1363 		/*
1364 		 * Are we trying to switch away a PID that is
1365 		 * not current?
1366 		 */
1367 		if (sched->curr_pid[this_cpu] != prev_pid)
1368 			sched->nr_context_switch_bugs++;
1369 	}
1370 
1371 	if (sched->tp_handler->switch_event)
1372 		err = sched->tp_handler->switch_event(sched, evsel, sample, machine);
1373 
1374 	sched->curr_pid[this_cpu] = next_pid;
1375 	return err;
1376 }
1377 
1378 static int process_sched_runtime_event(struct perf_tool *tool,
1379 				       struct perf_evsel *evsel,
1380 				       struct perf_sample *sample,
1381 				       struct machine *machine)
1382 {
1383 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1384 
1385 	if (sched->tp_handler->runtime_event)
1386 		return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
1387 
1388 	return 0;
1389 }
1390 
1391 static int perf_sched__process_fork_event(struct perf_tool *tool,
1392 					  union perf_event *event,
1393 					  struct perf_sample *sample,
1394 					  struct machine *machine)
1395 {
1396 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1397 
1398 	/* run the fork event through the perf machinery */
1399 	perf_event__process_fork(tool, event, sample, machine);
1400 
1401 	/* and then run additional processing needed for this command */
1402 	if (sched->tp_handler->fork_event)
1403 		return sched->tp_handler->fork_event(sched, event, machine);
1404 
1405 	return 0;
1406 }
1407 
1408 static int process_sched_migrate_task_event(struct perf_tool *tool,
1409 					    struct perf_evsel *evsel,
1410 					    struct perf_sample *sample,
1411 					    struct machine *machine)
1412 {
1413 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1414 
1415 	if (sched->tp_handler->migrate_task_event)
1416 		return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
1417 
1418 	return 0;
1419 }
1420 
1421 typedef int (*tracepoint_handler)(struct perf_tool *tool,
1422 				  struct perf_evsel *evsel,
1423 				  struct perf_sample *sample,
1424 				  struct machine *machine);
1425 
1426 static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
1427 						 union perf_event *event __maybe_unused,
1428 						 struct perf_sample *sample,
1429 						 struct perf_evsel *evsel,
1430 						 struct machine *machine)
1431 {
1432 	int err = 0;
1433 
1434 	if (evsel->handler != NULL) {
1435 		tracepoint_handler f = evsel->handler;
1436 		err = f(tool, evsel, sample, machine);
1437 	}
1438 
1439 	return err;
1440 }
1441 
1442 static int perf_sched__read_events(struct perf_sched *sched,
1443 				   struct perf_session **psession)
1444 {
1445 	const struct perf_evsel_str_handler handlers[] = {
1446 		{ "sched:sched_switch",	      process_sched_switch_event, },
1447 		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
1448 		{ "sched:sched_wakeup",	      process_sched_wakeup_event, },
1449 		{ "sched:sched_wakeup_new",   process_sched_wakeup_event, },
1450 		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
1451 	};
1452 	struct perf_session *session;
1453 	struct perf_data_file file = {
1454 		.path = input_name,
1455 		.mode = PERF_DATA_MODE_READ,
1456 	};
1457 
1458 	session = perf_session__new(&file, false, &sched->tool);
1459 	if (session == NULL) {
1460 		pr_debug("No memory for session\n");
1461 		return -1;
1462 	}
1463 
1464 	symbol__init(&session->header.env);
1465 
1466 	if (perf_session__set_tracepoints_handlers(session, handlers))
1467 		goto out_delete;
1468 
1469 	if (perf_session__has_traces(session, "record -R")) {
1470 		int err = perf_session__process_events(session, &sched->tool);
1471 		if (err) {
1472 			pr_err("Failed to process events, error %d", err);
1473 			goto out_delete;
1474 		}
1475 
1476 		sched->nr_events      = session->stats.nr_events[0];
1477 		sched->nr_lost_events = session->stats.total_lost;
1478 		sched->nr_lost_chunks = session->stats.nr_events[PERF_RECORD_LOST];
1479 	}
1480 
1481 	if (psession)
1482 		*psession = session;
1483 	else
1484 		perf_session__delete(session);
1485 
1486 	return 0;
1487 
1488 out_delete:
1489 	perf_session__delete(session);
1490 	return -1;
1491 }
1492 
1493 static void print_bad_events(struct perf_sched *sched)
1494 {
1495 	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
1496 		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
1497 			(double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
1498 			sched->nr_unordered_timestamps, sched->nr_timestamps);
1499 	}
1500 	if (sched->nr_lost_events && sched->nr_events) {
1501 		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
1502 			(double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
1503 			sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
1504 	}
1505 	if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
1506 		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
1507 			(double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
1508 			sched->nr_context_switch_bugs, sched->nr_timestamps);
1509 		if (sched->nr_lost_events)
1510 			printf(" (due to lost events?)");
1511 		printf("\n");
1512 	}
1513 }
1514 
1515 static int perf_sched__lat(struct perf_sched *sched)
1516 {
1517 	struct rb_node *next;
1518 	struct perf_session *session;
1519 
1520 	setup_pager();
1521 
1522 	/* save session -- references to threads are held in work_list */
1523 	if (perf_sched__read_events(sched, &session))
1524 		return -1;
1525 
1526 	perf_sched__sort_lat(sched);
1527 
1528 	printf("\n -----------------------------------------------------------------------------------------------------------------\n");
1529 	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at       |\n");
1530 	printf(" -----------------------------------------------------------------------------------------------------------------\n");
1531 
1532 	next = rb_first(&sched->sorted_atom_root);
1533 
1534 	while (next) {
1535 		struct work_atoms *work_list;
1536 
1537 		work_list = rb_entry(next, struct work_atoms, node);
1538 		output_lat_thread(sched, work_list);
1539 		next = rb_next(next);
1540 	}
1541 
1542 	printf(" -----------------------------------------------------------------------------------------------------------------\n");
1543 	printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
1544 		(double)sched->all_runtime / 1e6, sched->all_count);
1545 
1546 	printf(" ---------------------------------------------------\n");
1547 
1548 	print_bad_events(sched);
1549 	printf("\n");
1550 
1551 	perf_session__delete(session);
1552 	return 0;
1553 }
1554 
1555 static int perf_sched__map(struct perf_sched *sched)
1556 {
1557 	sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);
1558 
1559 	setup_pager();
1560 	if (perf_sched__read_events(sched, NULL))
1561 		return -1;
1562 	print_bad_events(sched);
1563 	return 0;
1564 }
1565 
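/*
 * 'perf sched replay': calibrate timing overhead, rebuild the recorded
 * tasks and their atoms from the trace, spawn one thread per task and
 * re-execute the workload replay_repeat times.
 */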
1566 static int perf_sched__replay(struct perf_sched *sched)
1567 {
1568 	unsigned long i;
1569 
1570 	calibrate_run_measurement_overhead(sched);
1571 	calibrate_sleep_measurement_overhead(sched);
1572 
1573 	test_calibrations(sched);
1574 
1575 	if (perf_sched__read_events(sched, NULL))
1576 		return -1;
1577 
1578 	printf("nr_run_events:        %ld\n", sched->nr_run_events);
1579 	printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
1580 	printf("nr_wakeup_events:     %ld\n", sched->nr_wakeup_events);
1581 
1582 	if (sched->targetless_wakeups)
1583 		printf("target-less wakeups:  %ld\n", sched->targetless_wakeups);
1584 	if (sched->multitarget_wakeups)
1585 		printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
1586 	if (sched->nr_run_events_optimized)
1587 		printf("run atoms optimized: %ld\n",
1588 			sched->nr_run_events_optimized);
1589 
1590 	print_task_traces(sched);
1591 	add_cross_task_wakeups(sched);
1592 
1593 	create_tasks(sched);
1594 	printf("------------------------------------------------------------\n");
1595 	for (i = 0; i < sched->replay_repeat; i++)
1596 		run_one_test(sched);
1597 
1598 	return 0;
1599 }
1600 
1601 static void setup_sorting(struct perf_sched *sched, const struct option *options,
1602 			  const char * const usage_msg[])
1603 {
1604 	char *tmp, *tok, *str = strdup(sched->sort_order);
1605 
1606 	for (tok = strtok_r(str, ", ", &tmp);
1607 			tok; tok = strtok_r(NULL, ", ", &tmp)) {
1608 		if (sort_dimension__add(tok, &sched->sort_list) < 0) {
1609 			error("Unknown --sort key: `%s'", tok);
1610 			usage_with_options(usage_msg, options);
1611 		}
1612 	}
1613 
1614 	free(str);
1615 
1616 	sort_dimension__add("pid", &sched->cmp_pid);
1617 }
1618 
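/*
 * Typical workflow (illustrative examples, not taken verbatim from the
 * source): record a system-wide scheduling trace, then analyze it with
 * one of the subcommands, e.g.:
 *
 *   perf sched record -- sleep 1
 *   perf sched latency --sort max
 *   perf sched map
 *   perf sched replay -r 10
 *
 * 'perf sched record' forwards any extra arguments to 'perf record' on
 * top of the fixed event list below.
 */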
1619 static int __cmd_record(int argc, const char **argv)
1620 {
1621 	unsigned int rec_argc, i, j;
1622 	const char **rec_argv;
1623 	const char * const record_args[] = {
1624 		"record",
1625 		"-a",
1626 		"-R",
1627 		"-m", "1024",
1628 		"-c", "1",
1629 		"-e", "sched:sched_switch",
1630 		"-e", "sched:sched_stat_wait",
1631 		"-e", "sched:sched_stat_sleep",
1632 		"-e", "sched:sched_stat_iowait",
1633 		"-e", "sched:sched_stat_runtime",
1634 		"-e", "sched:sched_process_fork",
1635 		"-e", "sched:sched_wakeup",
1636 		"-e", "sched:sched_wakeup_new",
1637 		"-e", "sched:sched_migrate_task",
1638 	};
1639 
1640 	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
1641 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
1642 
1643 	if (rec_argv == NULL)
1644 		return -ENOMEM;
1645 
1646 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
1647 		rec_argv[i] = strdup(record_args[i]);
1648 
1649 	for (j = 1; j < (unsigned int)argc; j++, i++)
1650 		rec_argv[i] = argv[j];
1651 
1652 	BUG_ON(i != rec_argc);
1653 
1654 	return cmd_record(i, rec_argv, NULL);
1655 }
1656 
1657 int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
1658 {
1659 	const char default_sort_order[] = "avg, max, switch, runtime";
1660 	struct perf_sched sched = {
1661 		.tool = {
1662 			.sample		 = perf_sched__process_tracepoint_sample,
1663 			.comm		 = perf_event__process_comm,
1664 			.lost		 = perf_event__process_lost,
1665 			.fork		 = perf_sched__process_fork_event,
1666 			.ordered_events = true,
1667 		},
1668 		.cmp_pid	      = LIST_HEAD_INIT(sched.cmp_pid),
1669 		.sort_list	      = LIST_HEAD_INIT(sched.sort_list),
1670 		.start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
1671 		.work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
1672 		.sort_order	      = default_sort_order,
1673 		.replay_repeat	      = 10,
1674 		.profile_cpu	      = -1,
1675 		.next_shortname1      = 'A',
1676 		.next_shortname2      = '0',
1677 	};
1678 	const struct option latency_options[] = {
1679 	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
1680 		   "sort by key(s): runtime, switch, avg, max"),
1681 	OPT_INCR('v', "verbose", &verbose,
1682 		    "be more verbose (show symbol address, etc)"),
1683 	OPT_INTEGER('C', "CPU", &sched.profile_cpu,
1684 		    "CPU to profile on"),
1685 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1686 		    "dump raw trace in ASCII"),
1687 	OPT_END()
1688 	};
1689 	const struct option replay_options[] = {
1690 	OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
1691 		     "repeat the workload replay N times (-1: infinite)"),
1692 	OPT_INCR('v', "verbose", &verbose,
1693 		    "be more verbose (show symbol address, etc)"),
1694 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1695 		    "dump raw trace in ASCII"),
1696 	OPT_END()
1697 	};
1698 	const struct option sched_options[] = {
1699 	OPT_STRING('i', "input", &input_name, "file",
1700 		    "input file name"),
1701 	OPT_INCR('v', "verbose", &verbose,
1702 		    "be more verbose (show symbol address, etc)"),
1703 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1704 		    "dump raw trace in ASCII"),
1705 	OPT_END()
1706 	};
1707 	const char * const latency_usage[] = {
1708 		"perf sched latency [<options>]",
1709 		NULL
1710 	};
1711 	const char * const replay_usage[] = {
1712 		"perf sched replay [<options>]",
1713 		NULL
1714 	};
1715 	const char *const sched_subcommands[] = { "record", "latency", "map",
1716 						  "replay", "script", NULL };
1717 	const char *sched_usage[] = {
1718 		NULL,
1719 		NULL
1720 	};
1721 	struct trace_sched_handler lat_ops  = {
1722 		.wakeup_event	    = latency_wakeup_event,
1723 		.switch_event	    = latency_switch_event,
1724 		.runtime_event	    = latency_runtime_event,
1725 		.migrate_task_event = latency_migrate_task_event,
1726 	};
1727 	struct trace_sched_handler map_ops  = {
1728 		.switch_event	    = map_switch_event,
1729 	};
1730 	struct trace_sched_handler replay_ops  = {
1731 		.wakeup_event	    = replay_wakeup_event,
1732 		.switch_event	    = replay_switch_event,
1733 		.fork_event	    = replay_fork_event,
1734 	};
1735 	unsigned int i;
1736 
1737 	for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
1738 		sched.curr_pid[i] = -1;
1739 
1740 	argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands,
1741 					sched_usage, PARSE_OPT_STOP_AT_NON_OPTION);
1742 	if (!argc)
1743 		usage_with_options(sched_usage, sched_options);
1744 
1745 	/*
1746 	 * Aliased to 'perf script' for now:
1747 	 */
1748 	if (!strcmp(argv[0], "script"))
1749 		return cmd_script(argc, argv, prefix);
1750 
1751 	if (!strncmp(argv[0], "rec", 3)) {
1752 		return __cmd_record(argc, argv);
1753 	} else if (!strncmp(argv[0], "lat", 3)) {
1754 		sched.tp_handler = &lat_ops;
1755 		if (argc > 1) {
1756 			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
1757 			if (argc)
1758 				usage_with_options(latency_usage, latency_options);
1759 		}
1760 		setup_sorting(&sched, latency_options, latency_usage);
1761 		return perf_sched__lat(&sched);
1762 	} else if (!strcmp(argv[0], "map")) {
1763 		sched.tp_handler = &map_ops;
1764 		setup_sorting(&sched, latency_options, latency_usage);
1765 		return perf_sched__map(&sched);
1766 	} else if (!strncmp(argv[0], "rep", 3)) {
1767 		sched.tp_handler = &replay_ops;
1768 		if (argc) {
1769 			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
1770 			if (argc)
1771 				usage_with_options(replay_usage, replay_options);
1772 		}
1773 		return perf_sched__replay(&sched);
1774 	} else {
1775 		usage_with_options(sched_usage, sched_options);
1776 	}
1777 
1778 	return 0;
1779 }
1780