xref: /openbmc/linux/tools/perf/builtin-sched.c (revision 1eb4c977)
1 #include "builtin.h"
2 #include "perf.h"
3 
4 #include "util/util.h"
5 #include "util/evlist.h"
6 #include "util/cache.h"
7 #include "util/evsel.h"
8 #include "util/symbol.h"
9 #include "util/thread.h"
10 #include "util/header.h"
11 #include "util/session.h"
12 #include "util/tool.h"
13 
14 #include "util/parse-options.h"
15 #include "util/trace-event.h"
16 
17 #include "util/debug.h"
18 
19 #include <sys/prctl.h>
20 
21 #include <semaphore.h>
22 #include <pthread.h>
23 #include <math.h>
24 
25 static const char		*input_name;
26 
27 static char			default_sort_order[] = "avg, max, switch, runtime";
28 static const char		*sort_order = default_sort_order;
29 
30 static int			profile_cpu = -1;
31 
32 #define PR_SET_NAME		15               /* Set process name */
33 #define MAX_CPUS		4096
34 
35 static u64			run_measurement_overhead;
36 static u64			sleep_measurement_overhead;
37 
38 #define COMM_LEN		20
39 #define SYM_LEN			129
40 
41 #define MAX_PID			65536
42 
43 static unsigned long		nr_tasks;
44 
45 struct sched_atom;
46 
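/*
 * Replay model: each traced PID becomes a task_desc, and its activity is
 * flattened into an ordered array of sched_atoms (run, sleep, wakeup,
 * migration) that the per-task replay threads later execute in order.
 */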
47 struct task_desc {
48 	unsigned long		nr;
49 	unsigned long		pid;
50 	char			comm[COMM_LEN];
51 
52 	unsigned long		nr_events;
53 	unsigned long		curr_event;
54 	struct sched_atom	**atoms;
55 
56 	pthread_t		thread;
57 	sem_t			sleep_sem;
58 
59 	sem_t			ready_for_work;
60 	sem_t			work_done_sem;
61 
62 	u64			cpu_usage;
63 };
64 
65 enum sched_event_type {
66 	SCHED_EVENT_RUN,
67 	SCHED_EVENT_SLEEP,
68 	SCHED_EVENT_WAKEUP,
69 	SCHED_EVENT_MIGRATION,
70 };
71 
72 struct sched_atom {
73 	enum sched_event_type	type;
74 	int			specific_wait;
75 	u64			timestamp;
76 	u64			duration;
77 	unsigned long		nr;
78 	sem_t			*wait_sem;
79 	struct task_desc	*wakee;
80 };
81 
82 static struct task_desc		*pid_to_task[MAX_PID];
83 
84 static struct task_desc		**tasks;
85 
86 static pthread_mutex_t		start_work_mutex = PTHREAD_MUTEX_INITIALIZER;
87 static u64			start_time;
88 
89 static pthread_mutex_t		work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER;
90 
91 static unsigned long		nr_run_events;
92 static unsigned long		nr_sleep_events;
93 static unsigned long		nr_wakeup_events;
94 
95 static unsigned long		nr_sleep_corrections;
96 static unsigned long		nr_run_events_optimized;
97 
98 static unsigned long		targetless_wakeups;
99 static unsigned long		multitarget_wakeups;
100 
101 static u64			cpu_usage;
102 static u64			runavg_cpu_usage;
103 static u64			parent_cpu_usage;
104 static u64			runavg_parent_cpu_usage;
105 
106 static unsigned long		nr_runs;
107 static u64			sum_runtime;
108 static u64			sum_fluct;
109 static u64			run_avg;
110 
111 static unsigned int		replay_repeat = 10;
112 static unsigned long		nr_timestamps;
113 static unsigned long		nr_unordered_timestamps;
114 static unsigned long		nr_state_machine_bugs;
115 static unsigned long		nr_context_switch_bugs;
116 static unsigned long		nr_events;
117 static unsigned long		nr_lost_chunks;
118 static unsigned long		nr_lost_events;
119 
120 #define TASK_STATE_TO_CHAR_STR "RSDTtZX"
121 
122 enum thread_state {
123 	THREAD_SLEEPING = 0,
124 	THREAD_WAIT_CPU,
125 	THREAD_SCHED_IN,
126 	THREAD_IGNORE
127 };
128 
129 struct work_atom {
130 	struct list_head	list;
131 	enum thread_state	state;
132 	u64			sched_out_time;
133 	u64			wake_up_time;
134 	u64			sched_in_time;
135 	u64			runtime;
136 };
137 
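/*
 * Per-thread accounting for 'perf sched latency': a list of work_atoms
 * (one per scheduling interval) plus aggregated runtime and wakeup-latency
 * statistics, kept in an rbtree ordered by the active sort keys.
 */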
138 struct work_atoms {
139 	struct list_head	work_list;
140 	struct thread		*thread;
141 	struct rb_node		node;
142 	u64			max_lat;
143 	u64			max_lat_at;
144 	u64			total_lat;
145 	u64			nb_atoms;
146 	u64			total_runtime;
147 };
148 
149 typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
150 
151 static struct rb_root		atom_root, sorted_atom_root;
152 
153 static u64			all_runtime;
154 static u64			all_count;
155 
156 
157 static u64 get_nsecs(void)
158 {
159 	struct timespec ts;
160 
161 	clock_gettime(CLOCK_MONOTONIC, &ts);
162 
163 	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
164 }
165 
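/*
 * Busy-loop for roughly @nsecs: keep reading the clock until the requested
 * interval has elapsed, compensating for the measured cost of the timing
 * calls themselves (run_measurement_overhead).
 */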
166 static void burn_nsecs(u64 nsecs)
167 {
168 	u64 T0 = get_nsecs(), T1;
169 
170 	do {
171 		T1 = get_nsecs();
172 	} while (T1 + run_measurement_overhead < T0 + nsecs);
173 }
174 
175 static void sleep_nsecs(u64 nsecs)
176 {
177 	struct timespec ts;
178 
179 	ts.tv_nsec = nsecs % 999999999;
180 	ts.tv_sec = nsecs / 999999999;
181 
182 	nanosleep(&ts, NULL);
183 }
184 
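/*
 * Measure the minimum cost of a burn_nsecs(0) call over ten trials, so that
 * burn_nsecs() can subtract this overhead and replayed run atoms are not
 * systematically too long.
 */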
185 static void calibrate_run_measurement_overhead(void)
186 {
187 	u64 T0, T1, delta, min_delta = 1000000000ULL;
188 	int i;
189 
190 	for (i = 0; i < 10; i++) {
191 		T0 = get_nsecs();
192 		burn_nsecs(0);
193 		T1 = get_nsecs();
194 		delta = T1-T0;
195 		min_delta = min(min_delta, delta);
196 	}
197 	run_measurement_overhead = min_delta;
198 
199 	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
200 }
201 
202 static void calibrate_sleep_measurement_overhead(void)
203 {
204 	u64 T0, T1, delta, min_delta = 1000000000ULL;
205 	int i;
206 
207 	for (i = 0; i < 10; i++) {
208 		T0 = get_nsecs();
209 		sleep_nsecs(10000);
210 		T1 = get_nsecs();
211 		delta = T1-T0;
212 		min_delta = min(min_delta, delta);
213 	}
214 	min_delta -= 10000;
215 	sleep_measurement_overhead = min_delta;
216 
217 	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
218 }
219 
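/*
 * Append a new, zero-initialized sched_atom to @task's event array (growing
 * the array with realloc) and return it for the caller to fill in.
 */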
220 static struct sched_atom *
221 get_new_event(struct task_desc *task, u64 timestamp)
222 {
223 	struct sched_atom *event = zalloc(sizeof(*event));
224 	unsigned long idx = task->nr_events;
225 	size_t size;
226 
227 	event->timestamp = timestamp;
228 	event->nr = idx;
229 
230 	task->nr_events++;
231 	size = sizeof(struct sched_atom *) * task->nr_events;
232 	task->atoms = realloc(task->atoms, size);
233 	BUG_ON(!task->atoms);
234 
235 	task->atoms[idx] = event;
236 
237 	return event;
238 }
239 
240 static struct sched_atom *last_event(struct task_desc *task)
241 {
242 	if (!task->nr_events)
243 		return NULL;
244 
245 	return task->atoms[task->nr_events - 1];
246 }
247 
248 static void
249 add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration)
250 {
251 	struct sched_atom *event, *curr_event = last_event(task);
252 
253 	/*
254 	 * optimize an existing RUN event by merging this one
255 	 * into it:
256 	 */
257 	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
258 		nr_run_events_optimized++;
259 		curr_event->duration += duration;
260 		return;
261 	}
262 
263 	event = get_new_event(task, timestamp);
264 
265 	event->type = SCHED_EVENT_RUN;
266 	event->duration = duration;
267 
268 	nr_run_events++;
269 }
270 
271 static void
272 add_sched_event_wakeup(struct task_desc *task, u64 timestamp,
273 		       struct task_desc *wakee)
274 {
275 	struct sched_atom *event, *wakee_event;
276 
277 	event = get_new_event(task, timestamp);
278 	event->type = SCHED_EVENT_WAKEUP;
279 	event->wakee = wakee;
280 
281 	wakee_event = last_event(wakee);
282 	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
283 		targetless_wakeups++;
284 		return;
285 	}
286 	if (wakee_event->wait_sem) {
287 		multitarget_wakeups++;
288 		return;
289 	}
290 
291 	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
292 	sem_init(wakee_event->wait_sem, 0, 0);
293 	wakee_event->specific_wait = 1;
294 	event->wait_sem = wakee_event->wait_sem;
295 
296 	nr_wakeup_events++;
297 }
298 
299 static void
300 add_sched_event_sleep(struct task_desc *task, u64 timestamp,
301 		      u64 task_state __used)
302 {
303 	struct sched_atom *event = get_new_event(task, timestamp);
304 
305 	event->type = SCHED_EVENT_SLEEP;
306 
307 	nr_sleep_events++;
308 }
309 
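/*
 * Look up (or create) the synthetic task that mirrors a traced PID.  Newly
 * registered tasks start with an implicit sleep event so that a later
 * wakeup has something to target.
 */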
310 static struct task_desc *register_pid(unsigned long pid, const char *comm)
311 {
312 	struct task_desc *task;
313 
314 	BUG_ON(pid >= MAX_PID);
315 
316 	task = pid_to_task[pid];
317 
318 	if (task)
319 		return task;
320 
321 	task = zalloc(sizeof(*task));
322 	task->pid = pid;
323 	task->nr = nr_tasks;
324 	strcpy(task->comm, comm);
325 	/*
326 	 * every task starts in sleeping state - this gets ignored
327 	 * if there's no wakeup pointing to this sleep state:
328 	 */
329 	add_sched_event_sleep(task, 0, 0);
330 
331 	pid_to_task[pid] = task;
332 	nr_tasks++;
333 	tasks = realloc(tasks, nr_tasks*sizeof(struct task_desc *));
334 	BUG_ON(!tasks);
335 	tasks[task->nr] = task;
336 
337 	if (verbose)
338 		printf("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm);
339 
340 	return task;
341 }
342 
343 
344 static void print_task_traces(void)
345 {
346 	struct task_desc *task;
347 	unsigned long i;
348 
349 	for (i = 0; i < nr_tasks; i++) {
350 		task = tasks[i];
351 		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
352 			task->nr, task->comm, task->pid, task->nr_events);
353 	}
354 }
355 
356 static void add_cross_task_wakeups(void)
357 {
358 	struct task_desc *task1, *task2;
359 	unsigned long i, j;
360 
361 	for (i = 0; i < nr_tasks; i++) {
362 		task1 = tasks[i];
363 		j = i + 1;
364 		if (j == nr_tasks)
365 			j = 0;
366 		task2 = tasks[j];
367 		add_sched_event_wakeup(task1, 0, task2);
368 	}
369 }
370 
371 static void
372 process_sched_event(struct task_desc *this_task __used, struct sched_atom *atom)
373 {
374 	int ret = 0;
375 
376 	switch (atom->type) {
377 		case SCHED_EVENT_RUN:
378 			burn_nsecs(atom->duration);
379 			break;
380 		case SCHED_EVENT_SLEEP:
381 			if (atom->wait_sem)
382 				ret = sem_wait(atom->wait_sem);
383 			BUG_ON(ret);
384 			break;
385 		case SCHED_EVENT_WAKEUP:
386 			if (atom->wait_sem)
387 				ret = sem_post(atom->wait_sem);
388 			BUG_ON(ret);
389 			break;
390 		case SCHED_EVENT_MIGRATION:
391 			break;
392 		default:
393 			BUG_ON(1);
394 	}
395 }
396 
397 static u64 get_cpu_usage_nsec_parent(void)
398 {
399 	struct rusage ru;
400 	u64 sum;
401 	int err;
402 
403 	err = getrusage(RUSAGE_SELF, &ru);
404 	BUG_ON(err);
405 
406 	sum =  ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
407 	sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;
408 
409 	return sum;
410 }
411 
412 static int self_open_counters(void)
413 {
414 	struct perf_event_attr attr;
415 	int fd;
416 
417 	memset(&attr, 0, sizeof(attr));
418 
419 	attr.type = PERF_TYPE_SOFTWARE;
420 	attr.config = PERF_COUNT_SW_TASK_CLOCK;
421 
422 	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
423 
424 	if (fd < 0)
425 		die("Error: sys_perf_event_open() syscall returned "
426 		    "with %d (%s)\n", fd, strerror(errno));
427 	return fd;
428 }
429 
430 static u64 get_cpu_usage_nsec_self(int fd)
431 {
432 	u64 runtime;
433 	int ret;
434 
435 	ret = read(fd, &runtime, sizeof(runtime));
436 	BUG_ON(ret != sizeof(runtime));
437 
438 	return runtime;
439 }
440 
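/*
 * Per-task replay thread: rename itself after the traced task, open a
 * task-clock counter, then loop forever: signal ready_for_work, block until
 * the parent releases start_work_mutex, replay all recorded atoms, report
 * the consumed CPU time and signal work_done_sem.
 */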
441 static void *thread_func(void *ctx)
442 {
443 	struct task_desc *this_task = ctx;
444 	u64 cpu_usage_0, cpu_usage_1;
445 	unsigned long i, ret;
446 	char comm2[22];
447 	int fd;
448 
449 	sprintf(comm2, ":%s", this_task->comm);
450 	prctl(PR_SET_NAME, comm2);
451 	fd = self_open_counters();
452 
453 again:
454 	ret = sem_post(&this_task->ready_for_work);
455 	BUG_ON(ret);
456 	ret = pthread_mutex_lock(&start_work_mutex);
457 	BUG_ON(ret);
458 	ret = pthread_mutex_unlock(&start_work_mutex);
459 	BUG_ON(ret);
460 
461 	cpu_usage_0 = get_cpu_usage_nsec_self(fd);
462 
463 	for (i = 0; i < this_task->nr_events; i++) {
464 		this_task->curr_event = i;
465 		process_sched_event(this_task, this_task->atoms[i]);
466 	}
467 
468 	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
469 	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
470 	ret = sem_post(&this_task->work_done_sem);
471 	BUG_ON(ret);
472 
473 	ret = pthread_mutex_lock(&work_done_wait_mutex);
474 	BUG_ON(ret);
475 	ret = pthread_mutex_unlock(&work_done_wait_mutex);
476 	BUG_ON(ret);
477 
478 	goto again;
479 }
480 
481 static void create_tasks(void)
482 {
483 	struct task_desc *task;
484 	pthread_attr_t attr;
485 	unsigned long i;
486 	int err;
487 
488 	err = pthread_attr_init(&attr);
489 	BUG_ON(err);
490 	err = pthread_attr_setstacksize(&attr,
491 			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
492 	BUG_ON(err);
493 	err = pthread_mutex_lock(&start_work_mutex);
494 	BUG_ON(err);
495 	err = pthread_mutex_lock(&work_done_wait_mutex);
496 	BUG_ON(err);
497 	for (i = 0; i < nr_tasks; i++) {
498 		task = tasks[i];
499 		sem_init(&task->sleep_sem, 0, 0);
500 		sem_init(&task->ready_for_work, 0, 0);
501 		sem_init(&task->work_done_sem, 0, 0);
502 		task->curr_event = 0;
503 		err = pthread_create(&task->thread, &attr, thread_func, task);
504 		BUG_ON(err);
505 	}
506 }
507 
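/*
 * Drive one replay iteration from the parent: wait for every worker to
 * report ready, release them by unlocking start_work_mutex, then collect
 * work_done_sem and the per-task CPU usage.  The two mutexes act as start
 * and finish barriers.
 */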
508 static void wait_for_tasks(void)
509 {
510 	u64 cpu_usage_0, cpu_usage_1;
511 	struct task_desc *task;
512 	unsigned long i, ret;
513 
514 	start_time = get_nsecs();
515 	cpu_usage = 0;
516 	pthread_mutex_unlock(&work_done_wait_mutex);
517 
518 	for (i = 0; i < nr_tasks; i++) {
519 		task = tasks[i];
520 		ret = sem_wait(&task->ready_for_work);
521 		BUG_ON(ret);
522 		sem_init(&task->ready_for_work, 0, 0);
523 	}
524 	ret = pthread_mutex_lock(&work_done_wait_mutex);
525 	BUG_ON(ret);
526 
527 	cpu_usage_0 = get_cpu_usage_nsec_parent();
528 
529 	pthread_mutex_unlock(&start_work_mutex);
530 
531 	for (i = 0; i < nr_tasks; i++) {
532 		task = tasks[i];
533 		ret = sem_wait(&task->work_done_sem);
534 		BUG_ON(ret);
535 		sem_init(&task->work_done_sem, 0, 0);
536 		cpu_usage += task->cpu_usage;
537 		task->cpu_usage = 0;
538 	}
539 
540 	cpu_usage_1 = get_cpu_usage_nsec_parent();
541 	if (!runavg_cpu_usage)
542 		runavg_cpu_usage = cpu_usage;
543 	runavg_cpu_usage = (runavg_cpu_usage*9 + cpu_usage)/10;
544 
545 	parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
546 	if (!runavg_parent_cpu_usage)
547 		runavg_parent_cpu_usage = parent_cpu_usage;
548 	runavg_parent_cpu_usage = (runavg_parent_cpu_usage*9 +
549 				   parent_cpu_usage)/10;
550 
551 	ret = pthread_mutex_lock(&start_work_mutex);
552 	BUG_ON(ret);
553 
554 	for (i = 0; i < nr_tasks; i++) {
555 		task = tasks[i];
556 		sem_init(&task->sleep_sem, 0, 0);
557 		task->curr_event = 0;
558 	}
559 }
560 
561 static void run_one_test(void)
562 {
563 	u64 T0, T1, delta, avg_delta, fluct;
564 
565 	T0 = get_nsecs();
566 	wait_for_tasks();
567 	T1 = get_nsecs();
568 
569 	delta = T1 - T0;
570 	sum_runtime += delta;
571 	nr_runs++;
572 
573 	avg_delta = sum_runtime / nr_runs;
574 	if (delta < avg_delta)
575 		fluct = avg_delta - delta;
576 	else
577 		fluct = delta - avg_delta;
578 	sum_fluct += fluct;
579 	if (!run_avg)
580 		run_avg = delta;
581 	run_avg = (run_avg*9 + delta)/10;
582 
583 	printf("#%-3ld: %0.3f, ",
584 		nr_runs, (double)delta/1000000.0);
585 
586 	printf("ravg: %0.2f, ",
587 		(double)run_avg/1e6);
588 
589 	printf("cpu: %0.2f / %0.2f",
590 		(double)cpu_usage/1e6, (double)runavg_cpu_usage/1e6);
591 
592 #if 0
593 	/*
594 	 * rusage statistics gathered by the parent; these are less
595 	 * accurate than the sum_exec_runtime based statistics:
596 	 */
597 	printf(" [%0.2f / %0.2f]",
598 		(double)parent_cpu_usage/1e6,
599 		(double)runavg_parent_cpu_usage/1e6);
600 #endif
601 
602 	printf("\n");
603 
604 	if (nr_sleep_corrections)
605 		printf(" (%ld sleep corrections)\n", nr_sleep_corrections);
606 	nr_sleep_corrections = 0;
607 }
608 
609 static void test_calibrations(void)
610 {
611 	u64 T0, T1;
612 
613 	T0 = get_nsecs();
614 	burn_nsecs(1e6);
615 	T1 = get_nsecs();
616 
617 	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
618 
619 	T0 = get_nsecs();
620 	sleep_nsecs(1e6);
621 	T1 = get_nsecs();
622 
623 	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
624 }
625 
626 #define FILL_FIELD(ptr, field, event, data)	\
627 	ptr.field = (typeof(ptr.field)) raw_field_value(event, #field, data)
628 
629 #define FILL_ARRAY(ptr, array, event, data)			\
630 do {								\
631 	void *__array = raw_field_ptr(event, #array, data);	\
632 	memcpy(ptr.array, __array, sizeof(ptr.array));	\
633 } while(0)
634 
635 #define FILL_COMMON_FIELDS(ptr, event, data)			\
636 do {								\
637 	FILL_FIELD(ptr, common_type, event, data);		\
638 	FILL_FIELD(ptr, common_flags, event, data);		\
639 	FILL_FIELD(ptr, common_preempt_count, event, data);	\
640 	FILL_FIELD(ptr, common_pid, event, data);		\
641 	FILL_FIELD(ptr, common_tgid, event, data);		\
642 } while (0)
643 
644 
645 
646 struct trace_switch_event {
647 	u32 size;
648 
649 	u16 common_type;
650 	u8 common_flags;
651 	u8 common_preempt_count;
652 	u32 common_pid;
653 	u32 common_tgid;
654 
655 	char prev_comm[16];
656 	u32 prev_pid;
657 	u32 prev_prio;
658 	u64 prev_state;
659 	char next_comm[16];
660 	u32 next_pid;
661 	u32 next_prio;
662 };
663 
664 struct trace_runtime_event {
665 	u32 size;
666 
667 	u16 common_type;
668 	u8 common_flags;
669 	u8 common_preempt_count;
670 	u32 common_pid;
671 	u32 common_tgid;
672 
673 	char comm[16];
674 	u32 pid;
675 	u64 runtime;
676 	u64 vruntime;
677 };
678 
679 struct trace_wakeup_event {
680 	u32 size;
681 
682 	u16 common_type;
683 	u8 common_flags;
684 	u8 common_preempt_count;
685 	u32 common_pid;
686 	u32 common_tgid;
687 
688 	char comm[16];
689 	u32 pid;
690 
691 	u32 prio;
692 	u32 success;
693 	u32 cpu;
694 };
695 
696 struct trace_fork_event {
697 	u32 size;
698 
699 	u16 common_type;
700 	u8 common_flags;
701 	u8 common_preempt_count;
702 	u32 common_pid;
703 	u32 common_tgid;
704 
705 	char parent_comm[16];
706 	u32 parent_pid;
707 	char child_comm[16];
708 	u32 child_pid;
709 };
710 
711 struct trace_migrate_task_event {
712 	u32 size;
713 
714 	u16 common_type;
715 	u8 common_flags;
716 	u8 common_preempt_count;
717 	u32 common_pid;
718 	u32 common_tgid;
719 
720 	char comm[16];
721 	u32 pid;
722 
723 	u32 prio;
724 	u32 cpu;
725 };
726 
727 struct trace_sched_handler {
728 	void (*switch_event)(struct trace_switch_event *,
729 			     struct machine *,
730 			     struct event *,
731 			     int cpu,
732 			     u64 timestamp,
733 			     struct thread *thread);
734 
735 	void (*runtime_event)(struct trace_runtime_event *,
736 			      struct machine *,
737 			      struct event *,
738 			      int cpu,
739 			      u64 timestamp,
740 			      struct thread *thread);
741 
742 	void (*wakeup_event)(struct trace_wakeup_event *,
743 			     struct machine *,
744 			     struct event *,
745 			     int cpu,
746 			     u64 timestamp,
747 			     struct thread *thread);
748 
749 	void (*fork_event)(struct trace_fork_event *,
750 			   struct event *,
751 			   int cpu,
752 			   u64 timestamp,
753 			   struct thread *thread);
754 
755 	void (*migrate_task_event)(struct trace_migrate_task_event *,
756 			   struct machine *machine,
757 			   struct event *,
758 			   int cpu,
759 			   u64 timestamp,
760 			   struct thread *thread);
761 };
762 
763 
764 static void
765 replay_wakeup_event(struct trace_wakeup_event *wakeup_event,
766 		    struct machine *machine __used,
767 		    struct event *event,
768 		    int cpu __used,
769 		    u64 timestamp __used,
770 		    struct thread *thread __used)
771 {
772 	struct task_desc *waker, *wakee;
773 
774 	if (verbose) {
775 		printf("sched_wakeup event %p\n", event);
776 
777 		printf(" ... pid %d woke up %s/%d\n",
778 			wakeup_event->common_pid,
779 			wakeup_event->comm,
780 			wakeup_event->pid);
781 	}
782 
783 	waker = register_pid(wakeup_event->common_pid, "<unknown>");
784 	wakee = register_pid(wakeup_event->pid, wakeup_event->comm);
785 
786 	add_sched_event_wakeup(waker, timestamp, wakee);
787 }
788 
789 static u64 cpu_last_switched[MAX_CPUS];
790 
791 static void
792 replay_switch_event(struct trace_switch_event *switch_event,
793 		    struct machine *machine __used,
794 		    struct event *event,
795 		    int cpu,
796 		    u64 timestamp,
797 		    struct thread *thread __used)
798 {
799 	struct task_desc *prev, __used *next;
800 	u64 timestamp0;
801 	s64 delta;
802 
803 	if (verbose)
804 		printf("sched_switch event %p\n", event);
805 
806 	if (cpu >= MAX_CPUS || cpu < 0)
807 		return;
808 
809 	timestamp0 = cpu_last_switched[cpu];
810 	if (timestamp0)
811 		delta = timestamp - timestamp0;
812 	else
813 		delta = 0;
814 
815 	if (delta < 0)
816 		die("hm, delta: %" PRId64 " < 0 ?\n", delta);
817 
818 	if (verbose) {
819 		printf(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
820 			switch_event->prev_comm, switch_event->prev_pid,
821 			switch_event->next_comm, switch_event->next_pid,
822 			delta);
823 	}
824 
825 	prev = register_pid(switch_event->prev_pid, switch_event->prev_comm);
826 	next = register_pid(switch_event->next_pid, switch_event->next_comm);
827 
828 	cpu_last_switched[cpu] = timestamp;
829 
830 	add_sched_event_run(prev, timestamp, delta);
831 	add_sched_event_sleep(prev, timestamp, switch_event->prev_state);
832 }
833 
834 
835 static void
836 replay_fork_event(struct trace_fork_event *fork_event,
837 		  struct event *event,
838 		  int cpu __used,
839 		  u64 timestamp __used,
840 		  struct thread *thread __used)
841 {
842 	if (verbose) {
843 		printf("sched_fork event %p\n", event);
844 		printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
845 		printf("...  child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
846 	}
847 	register_pid(fork_event->parent_pid, fork_event->parent_comm);
848 	register_pid(fork_event->child_pid, fork_event->child_comm);
849 }
850 
851 static struct trace_sched_handler replay_ops  = {
852 	.wakeup_event		= replay_wakeup_event,
853 	.switch_event		= replay_switch_event,
854 	.fork_event		= replay_fork_event,
855 };
856 
857 struct sort_dimension {
858 	const char		*name;
859 	sort_fn_t		cmp;
860 	struct list_head	list;
861 };
862 
863 static LIST_HEAD(cmp_pid);
864 
865 static int
866 thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
867 {
868 	struct sort_dimension *sort;
869 	int ret = 0;
870 
871 	BUG_ON(list_empty(list));
872 
873 	list_for_each_entry(sort, list, list) {
874 		ret = sort->cmp(l, r);
875 		if (ret)
876 			return ret;
877 	}
878 
879 	return ret;
880 }
881 
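/*
 * Find the work_atoms node for @thread in the latency rbtree.  The tree is
 * ordered by the comparison chain in @sort_list (cmp_pid here), so lookups
 * and inserts must use the same sort list.
 */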
882 static struct work_atoms *
883 thread_atoms_search(struct rb_root *root, struct thread *thread,
884 			 struct list_head *sort_list)
885 {
886 	struct rb_node *node = root->rb_node;
887 	struct work_atoms key = { .thread = thread };
888 
889 	while (node) {
890 		struct work_atoms *atoms;
891 		int cmp;
892 
893 		atoms = container_of(node, struct work_atoms, node);
894 
895 		cmp = thread_lat_cmp(sort_list, &key, atoms);
896 		if (cmp > 0)
897 			node = node->rb_left;
898 		else if (cmp < 0)
899 			node = node->rb_right;
900 		else {
901 			BUG_ON(thread != atoms->thread);
902 			return atoms;
903 		}
904 	}
905 	return NULL;
906 }
907 
908 static void
909 __thread_latency_insert(struct rb_root *root, struct work_atoms *data,
910 			 struct list_head *sort_list)
911 {
912 	struct rb_node **new = &(root->rb_node), *parent = NULL;
913 
914 	while (*new) {
915 		struct work_atoms *this;
916 		int cmp;
917 
918 		this = container_of(*new, struct work_atoms, node);
919 		parent = *new;
920 
921 		cmp = thread_lat_cmp(sort_list, data, this);
922 
923 		if (cmp > 0)
924 			new = &((*new)->rb_left);
925 		else
926 			new = &((*new)->rb_right);
927 	}
928 
929 	rb_link_node(&data->node, parent, new);
930 	rb_insert_color(&data->node, root);
931 }
932 
933 static void thread_atoms_insert(struct thread *thread)
934 {
935 	struct work_atoms *atoms = zalloc(sizeof(*atoms));
936 	if (!atoms)
937 		die("No memory");
938 
939 	atoms->thread = thread;
940 	INIT_LIST_HEAD(&atoms->work_list);
941 	__thread_latency_insert(&atom_root, atoms, &cmp_pid);
942 }
943 
944 static void
945 latency_fork_event(struct trace_fork_event *fork_event __used,
946 		   struct event *event __used,
947 		   int cpu __used,
948 		   u64 timestamp __used,
949 		   struct thread *thread __used)
950 {
951 	/* should insert the newcomer */
952 }
953 
954 __used
955 static char sched_out_state(struct trace_switch_event *switch_event)
956 {
957 	const char *str = TASK_STATE_TO_CHAR_STR;
958 
959 	return str[switch_event->prev_state];
960 }
961 
962 static void
963 add_sched_out_event(struct work_atoms *atoms,
964 		    char run_state,
965 		    u64 timestamp)
966 {
967 	struct work_atom *atom = zalloc(sizeof(*atom));
968 	if (!atom)
969 		die("No memory");
970 
971 	atom->sched_out_time = timestamp;
972 
973 	if (run_state == 'R') {
974 		atom->state = THREAD_WAIT_CPU;
975 		atom->wake_up_time = atom->sched_out_time;
976 	}
977 
978 	list_add_tail(&atom->list, &atoms->work_list);
979 }
980 
981 static void
982 add_runtime_event(struct work_atoms *atoms, u64 delta, u64 timestamp __used)
983 {
984 	struct work_atom *atom;
985 
986 	BUG_ON(list_empty(&atoms->work_list));
987 
988 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
989 
990 	atom->runtime += delta;
991 	atoms->total_runtime += delta;
992 }
993 
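/*
 * Complete the most recent atom on a sched-in: the wakeup-to-sched-in delta
 * is the wakeup latency that the 'latency' subcommand reports as avg/max
 * per task.
 */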
994 static void
995 add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
996 {
997 	struct work_atom *atom;
998 	u64 delta;
999 
1000 	if (list_empty(&atoms->work_list))
1001 		return;
1002 
1003 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1004 
1005 	if (atom->state != THREAD_WAIT_CPU)
1006 		return;
1007 
1008 	if (timestamp < atom->wake_up_time) {
1009 		atom->state = THREAD_IGNORE;
1010 		return;
1011 	}
1012 
1013 	atom->state = THREAD_SCHED_IN;
1014 	atom->sched_in_time = timestamp;
1015 
1016 	delta = atom->sched_in_time - atom->wake_up_time;
1017 	atoms->total_lat += delta;
1018 	if (delta > atoms->max_lat) {
1019 		atoms->max_lat = delta;
1020 		atoms->max_lat_at = timestamp;
1021 	}
1022 	atoms->nb_atoms++;
1023 }
1024 
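/*
 * sched_switch handler for 'perf sched latency': close out an atom for the
 * task scheduled out and account wakeup latency for the task scheduled in,
 * creating rbtree nodes on first sight of a thread.
 */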
1025 static void
1026 latency_switch_event(struct trace_switch_event *switch_event,
1027 		     struct machine *machine,
1028 		     struct event *event __used,
1029 		     int cpu,
1030 		     u64 timestamp,
1031 		     struct thread *thread __used)
1032 {
1033 	struct work_atoms *out_events, *in_events;
1034 	struct thread *sched_out, *sched_in;
1035 	u64 timestamp0;
1036 	s64 delta;
1037 
1038 	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
1039 
1040 	timestamp0 = cpu_last_switched[cpu];
1041 	cpu_last_switched[cpu] = timestamp;
1042 	if (timestamp0)
1043 		delta = timestamp - timestamp0;
1044 	else
1045 		delta = 0;
1046 
1047 	if (delta < 0)
1048 		die("hm, delta: %" PRId64 " < 0 ?\n", delta);
1049 
1050 
1051 	sched_out = machine__findnew_thread(machine, switch_event->prev_pid);
1052 	sched_in = machine__findnew_thread(machine, switch_event->next_pid);
1053 
1054 	out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
1055 	if (!out_events) {
1056 		thread_atoms_insert(sched_out);
1057 		out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
1058 		if (!out_events)
1059 			die("out-event: Internal tree error");
1060 	}
1061 	add_sched_out_event(out_events, sched_out_state(switch_event), timestamp);
1062 
1063 	in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
1064 	if (!in_events) {
1065 		thread_atoms_insert(sched_in);
1066 		in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
1067 		if (!in_events)
1068 			die("in-event: Internal tree error");
1069 		/*
1070 		 * Task came in that we have not heard about yet,
1071 		 * add an initial atom in runnable state:
1072 		 */
1073 		add_sched_out_event(in_events, 'R', timestamp);
1074 	}
1075 	add_sched_in_event(in_events, timestamp);
1076 }
1077 
1078 static void
1079 latency_runtime_event(struct trace_runtime_event *runtime_event,
1080 		     struct machine *machine,
1081 		     struct event *event __used,
1082 		     int cpu,
1083 		     u64 timestamp,
1084 		     struct thread *this_thread __used)
1085 {
1086 	struct thread *thread = machine__findnew_thread(machine, runtime_event->pid);
1087 	struct work_atoms *atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
1088 
1089 	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
1090 	if (!atoms) {
1091 		thread_atoms_insert(thread);
1092 		atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
1093 		if (!atoms)
1094 			die("in-event: Internal tree error");
1095 		add_sched_out_event(atoms, 'R', timestamp);
1096 	}
1097 
1098 	add_runtime_event(atoms, runtime_event->runtime, timestamp);
1099 }
1100 
1101 static void
1102 latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
1103 		     struct machine *machine,
1104 		     struct event *__event __used,
1105 		     int cpu __used,
1106 		     u64 timestamp,
1107 		     struct thread *thread __used)
1108 {
1109 	struct work_atoms *atoms;
1110 	struct work_atom *atom;
1111 	struct thread *wakee;
1112 
1113 	/* Note for later, it may be interesting to observe the failing cases */
1114 	if (!wakeup_event->success)
1115 		return;
1116 
1117 	wakee = machine__findnew_thread(machine, wakeup_event->pid);
1118 	atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
1119 	if (!atoms) {
1120 		thread_atoms_insert(wakee);
1121 		atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
1122 		if (!atoms)
1123 			die("wakeup-event: Internal tree error");
1124 		add_sched_out_event(atoms, 'S', timestamp);
1125 	}
1126 
1127 	BUG_ON(list_empty(&atoms->work_list));
1128 
1129 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1130 
1131 	/*
1132 	 * You WILL be missing events if you've recorded only
1133 	 * one CPU, or are only looking at one, so don't
1134 	 * make useless noise.
1135 	 */
1136 	if (profile_cpu == -1 && atom->state != THREAD_SLEEPING)
1137 		nr_state_machine_bugs++;
1138 
1139 	nr_timestamps++;
1140 	if (atom->sched_out_time > timestamp) {
1141 		nr_unordered_timestamps++;
1142 		return;
1143 	}
1144 
1145 	atom->state = THREAD_WAIT_CPU;
1146 	atom->wake_up_time = timestamp;
1147 }
1148 
1149 static void
1150 latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event,
1151 		     struct machine *machine,
1152 		     struct event *__event __used,
1153 		     int cpu __used,
1154 		     u64 timestamp,
1155 		     struct thread *thread __used)
1156 {
1157 	struct work_atoms *atoms;
1158 	struct work_atom *atom;
1159 	struct thread *migrant;
1160 
1161 	/*
1162 	 * Only need to worry about migration when profiling one CPU.
1163 	 */
1164 	if (profile_cpu == -1)
1165 		return;
1166 
1167 	migrant = machine__findnew_thread(machine, migrate_task_event->pid);
1168 	atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid);
1169 	if (!atoms) {
1170 		thread_atoms_insert(migrant);
1171 		register_pid(migrant->pid, migrant->comm);
1172 		atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid);
1173 		if (!atoms)
1174 			die("migration-event: Internal tree error");
1175 		add_sched_out_event(atoms, 'R', timestamp);
1176 	}
1177 
1178 	BUG_ON(list_empty(&atoms->work_list));
1179 
1180 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1181 	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
1182 
1183 	nr_timestamps++;
1184 
1185 	if (atom->sched_out_time > timestamp)
1186 		nr_unordered_timestamps++;
1187 }
1188 
1189 static struct trace_sched_handler lat_ops  = {
1190 	.wakeup_event		= latency_wakeup_event,
1191 	.switch_event		= latency_switch_event,
1192 	.runtime_event		= latency_runtime_event,
1193 	.fork_event		= latency_fork_event,
1194 	.migrate_task_event	= latency_migrate_task_event,
1195 };
1196 
1197 static void output_lat_thread(struct work_atoms *work_list)
1198 {
1199 	int i;
1200 	int ret;
1201 	u64 avg;
1202 
1203 	if (!work_list->nb_atoms)
1204 		return;
1205 	/*
1206 	 * Ignore idle threads:
1207 	 */
1208 	if (!strcmp(work_list->thread->comm, "swapper"))
1209 		return;
1210 
1211 	all_runtime += work_list->total_runtime;
1212 	all_count += work_list->nb_atoms;
1213 
1214 	ret = printf("  %s:%d ", work_list->thread->comm, work_list->thread->pid);
1215 
1216 	for (i = 0; i < 24 - ret; i++)
1217 		printf(" ");
1218 
1219 	avg = work_list->total_lat / work_list->nb_atoms;
1220 
1221 	printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %9.6f s\n",
1222 	      (double)work_list->total_runtime / 1e6,
1223 		 work_list->nb_atoms, (double)avg / 1e6,
1224 		 (double)work_list->max_lat / 1e6,
1225 		 (double)work_list->max_lat_at / 1e9);
1226 }
1227 
1228 static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
1229 {
1230 	if (l->thread->pid < r->thread->pid)
1231 		return -1;
1232 	if (l->thread->pid > r->thread->pid)
1233 		return 1;
1234 
1235 	return 0;
1236 }
1237 
1238 static struct sort_dimension pid_sort_dimension = {
1239 	.name			= "pid",
1240 	.cmp			= pid_cmp,
1241 };
1242 
1243 static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
1244 {
1245 	u64 avgl, avgr;
1246 
1247 	if (!l->nb_atoms)
1248 		return -1;
1249 
1250 	if (!r->nb_atoms)
1251 		return 1;
1252 
1253 	avgl = l->total_lat / l->nb_atoms;
1254 	avgr = r->total_lat / r->nb_atoms;
1255 
1256 	if (avgl < avgr)
1257 		return -1;
1258 	if (avgl > avgr)
1259 		return 1;
1260 
1261 	return 0;
1262 }
1263 
1264 static struct sort_dimension avg_sort_dimension = {
1265 	.name			= "avg",
1266 	.cmp			= avg_cmp,
1267 };
1268 
1269 static int max_cmp(struct work_atoms *l, struct work_atoms *r)
1270 {
1271 	if (l->max_lat < r->max_lat)
1272 		return -1;
1273 	if (l->max_lat > r->max_lat)
1274 		return 1;
1275 
1276 	return 0;
1277 }
1278 
1279 static struct sort_dimension max_sort_dimension = {
1280 	.name			= "max",
1281 	.cmp			= max_cmp,
1282 };
1283 
1284 static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
1285 {
1286 	if (l->nb_atoms < r->nb_atoms)
1287 		return -1;
1288 	if (l->nb_atoms > r->nb_atoms)
1289 		return 1;
1290 
1291 	return 0;
1292 }
1293 
1294 static struct sort_dimension switch_sort_dimension = {
1295 	.name			= "switch",
1296 	.cmp			= switch_cmp,
1297 };
1298 
1299 static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
1300 {
1301 	if (l->total_runtime < r->total_runtime)
1302 		return -1;
1303 	if (l->total_runtime > r->total_runtime)
1304 		return 1;
1305 
1306 	return 0;
1307 }
1308 
1309 static struct sort_dimension runtime_sort_dimension = {
1310 	.name			= "runtime",
1311 	.cmp			= runtime_cmp,
1312 };
1313 
1314 static struct sort_dimension *available_sorts[] = {
1315 	&pid_sort_dimension,
1316 	&avg_sort_dimension,
1317 	&max_sort_dimension,
1318 	&switch_sort_dimension,
1319 	&runtime_sort_dimension,
1320 };
1321 
1322 #define NB_AVAILABLE_SORTS	(int)(sizeof(available_sorts) / sizeof(struct sort_dimension *))
1323 
1324 static LIST_HEAD(sort_list);
1325 
1326 static int sort_dimension__add(const char *tok, struct list_head *list)
1327 {
1328 	int i;
1329 
1330 	for (i = 0; i < NB_AVAILABLE_SORTS; i++) {
1331 		if (!strcmp(available_sorts[i]->name, tok)) {
1332 			list_add_tail(&available_sorts[i]->list, list);
1333 
1334 			return 0;
1335 		}
1336 	}
1337 
1338 	return -1;
1339 }
1340 
1341 static void setup_sorting(void);
1342 
1343 static void sort_lat(void)
1344 {
1345 	struct rb_node *node;
1346 
1347 	for (;;) {
1348 		struct work_atoms *data;
1349 		node = rb_first(&atom_root);
1350 		if (!node)
1351 			break;
1352 
1353 		rb_erase(node, &atom_root);
1354 		data = rb_entry(node, struct work_atoms, node);
1355 		__thread_latency_insert(&sorted_atom_root, data, &sort_list);
1356 	}
1357 }
1358 
1359 static struct trace_sched_handler *trace_handler;
1360 
1361 static void
1362 process_sched_wakeup_event(struct perf_tool *tool __used,
1363 			   struct event *event,
1364 			   struct perf_sample *sample,
1365 			   struct machine *machine,
1366 			   struct thread *thread)
1367 {
1368 	void *data = sample->raw_data;
1369 	struct trace_wakeup_event wakeup_event;
1370 
1371 	FILL_COMMON_FIELDS(wakeup_event, event, data);
1372 
1373 	FILL_ARRAY(wakeup_event, comm, event, data);
1374 	FILL_FIELD(wakeup_event, pid, event, data);
1375 	FILL_FIELD(wakeup_event, prio, event, data);
1376 	FILL_FIELD(wakeup_event, success, event, data);
1377 	FILL_FIELD(wakeup_event, cpu, event, data);
1378 
1379 	if (trace_handler->wakeup_event)
1380 		trace_handler->wakeup_event(&wakeup_event, machine, event,
1381 					    sample->cpu, sample->time, thread);
1382 }
1383 
1384 /*
1385  * Track the current task - that way we can know whether there are any
1386  * weird events, such as a task being switched away that is not current.
1387  */
1388 static int max_cpu;
1389 
1390 static u32 curr_pid[MAX_CPUS] = { [0 ... MAX_CPUS-1] = -1 };
1391 
1392 static struct thread *curr_thread[MAX_CPUS];
1393 
1394 static char next_shortname1 = 'A';
1395 static char next_shortname2 = '0';
1396 
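/*
 * sched_switch handler for 'perf sched map': print one column per CPU,
 * marking the CPU that switched with '*'.  Threads are lazily assigned
 * two-character shortnames (A0..Z9) the first time they appear.
 */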
1397 static void
1398 map_switch_event(struct trace_switch_event *switch_event,
1399 		 struct machine *machine,
1400 		 struct event *event __used,
1401 		 int this_cpu,
1402 		 u64 timestamp,
1403 		 struct thread *thread __used)
1404 {
1405 	struct thread *sched_out __used, *sched_in;
1406 	int new_shortname;
1407 	u64 timestamp0;
1408 	s64 delta;
1409 	int cpu;
1410 
1411 	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
1412 
1413 	if (this_cpu > max_cpu)
1414 		max_cpu = this_cpu;
1415 
1416 	timestamp0 = cpu_last_switched[this_cpu];
1417 	cpu_last_switched[this_cpu] = timestamp;
1418 	if (timestamp0)
1419 		delta = timestamp - timestamp0;
1420 	else
1421 		delta = 0;
1422 
1423 	if (delta < 0)
1424 		die("hm, delta: %" PRId64 " < 0 ?\n", delta);
1425 
1426 
1427 	sched_out = machine__findnew_thread(machine, switch_event->prev_pid);
1428 	sched_in = machine__findnew_thread(machine, switch_event->next_pid);
1429 
1430 	curr_thread[this_cpu] = sched_in;
1431 
1432 	printf("  ");
1433 
1434 	new_shortname = 0;
1435 	if (!sched_in->shortname[0]) {
1436 		sched_in->shortname[0] = next_shortname1;
1437 		sched_in->shortname[1] = next_shortname2;
1438 
1439 		if (next_shortname1 < 'Z') {
1440 			next_shortname1++;
1441 		} else {
1442 			next_shortname1='A';
1443 			if (next_shortname2 < '9') {
1444 				next_shortname2++;
1445 			} else {
1446 				next_shortname2='0';
1447 			}
1448 		}
1449 		new_shortname = 1;
1450 	}
1451 
1452 	for (cpu = 0; cpu <= max_cpu; cpu++) {
1453 		if (cpu != this_cpu)
1454 			printf(" ");
1455 		else
1456 			printf("*");
1457 
1458 		if (curr_thread[cpu]) {
1459 			if (curr_thread[cpu]->pid)
1460 				printf("%2s ", curr_thread[cpu]->shortname);
1461 			else
1462 				printf(".  ");
1463 		} else
1464 			printf("   ");
1465 	}
1466 
1467 	printf("  %12.6f secs ", (double)timestamp/1e9);
1468 	if (new_shortname) {
1469 		printf("%s => %s:%d\n",
1470 			sched_in->shortname, sched_in->comm, sched_in->pid);
1471 	} else {
1472 		printf("\n");
1473 	}
1474 }
1475 
1476 static void
1477 process_sched_switch_event(struct perf_tool *tool __used,
1478 			   struct event *event,
1479 			   struct perf_sample *sample,
1480 			   struct machine *machine,
1481 			   struct thread *thread)
1482 {
1483 	int this_cpu = sample->cpu;
1484 	void *data = sample->raw_data;
1485 	struct trace_switch_event switch_event;
1486 
1487 	FILL_COMMON_FIELDS(switch_event, event, data);
1488 
1489 	FILL_ARRAY(switch_event, prev_comm, event, data);
1490 	FILL_FIELD(switch_event, prev_pid, event, data);
1491 	FILL_FIELD(switch_event, prev_prio, event, data);
1492 	FILL_FIELD(switch_event, prev_state, event, data);
1493 	FILL_ARRAY(switch_event, next_comm, event, data);
1494 	FILL_FIELD(switch_event, next_pid, event, data);
1495 	FILL_FIELD(switch_event, next_prio, event, data);
1496 
1497 	if (curr_pid[this_cpu] != (u32)-1) {
1498 		/*
1499 		 * Are we trying to switch away a PID that is
1500 		 * not current?
1501 		 */
1502 		if (curr_pid[this_cpu] != switch_event.prev_pid)
1503 			nr_context_switch_bugs++;
1504 	}
1505 	if (trace_handler->switch_event)
1506 		trace_handler->switch_event(&switch_event, machine, event,
1507 					    this_cpu, sample->time, thread);
1508 
1509 	curr_pid[this_cpu] = switch_event.next_pid;
1510 }
1511 
1512 static void
1513 process_sched_runtime_event(struct perf_tool *tool __used,
1514 			    struct event *event,
1515 			    struct perf_sample *sample,
1516 			    struct machine *machine,
1517 			    struct thread *thread)
1518 {
1519 	void *data = sample->raw_data;
1520 	struct trace_runtime_event runtime_event;
1521 
1522 	FILL_ARRAY(runtime_event, comm, event, data);
1523 	FILL_FIELD(runtime_event, pid, event, data);
1524 	FILL_FIELD(runtime_event, runtime, event, data);
1525 	FILL_FIELD(runtime_event, vruntime, event, data);
1526 
1527 	if (trace_handler->runtime_event)
1528 		trace_handler->runtime_event(&runtime_event, machine, event,
1529 					     sample->cpu, sample->time, thread);
1530 }
1531 
1532 static void
1533 process_sched_fork_event(struct perf_tool *tool __used,
1534 			 struct event *event,
1535 			 struct perf_sample *sample,
1536 			 struct machine *machine __used,
1537 			 struct thread *thread)
1538 {
1539 	void *data = sample->raw_data;
1540 	struct trace_fork_event fork_event;
1541 
1542 	FILL_COMMON_FIELDS(fork_event, event, data);
1543 
1544 	FILL_ARRAY(fork_event, parent_comm, event, data);
1545 	FILL_FIELD(fork_event, parent_pid, event, data);
1546 	FILL_ARRAY(fork_event, child_comm, event, data);
1547 	FILL_FIELD(fork_event, child_pid, event, data);
1548 
1549 	if (trace_handler->fork_event)
1550 		trace_handler->fork_event(&fork_event, event,
1551 					  sample->cpu, sample->time, thread);
1552 }
1553 
1554 static void
1555 process_sched_exit_event(struct perf_tool *tool __used,
1556 			 struct event *event,
1557 			 struct perf_sample *sample __used,
1558 			 struct machine *machine __used,
1559 			 struct thread *thread __used)
1560 {
1561 	if (verbose)
1562 		printf("sched_exit event %p\n", event);
1563 }
1564 
1565 static void
1566 process_sched_migrate_task_event(struct perf_tool *tool __used,
1567 				 struct event *event,
1568 				 struct perf_sample *sample,
1569 				 struct machine *machine,
1570 				 struct thread *thread)
1571 {
1572 	void *data = sample->raw_data;
1573 	struct trace_migrate_task_event migrate_task_event;
1574 
1575 	FILL_COMMON_FIELDS(migrate_task_event, event, data);
1576 
1577 	FILL_ARRAY(migrate_task_event, comm, event, data);
1578 	FILL_FIELD(migrate_task_event, pid, event, data);
1579 	FILL_FIELD(migrate_task_event, prio, event, data);
1580 	FILL_FIELD(migrate_task_event, cpu, event, data);
1581 
1582 	if (trace_handler->migrate_task_event)
1583 		trace_handler->migrate_task_event(&migrate_task_event, machine,
1584 						  event, sample->cpu,
1585 						  sample->time, thread);
1586 }
1587 
1588 typedef void (*tracepoint_handler)(struct perf_tool *tool, struct event *event,
1589 				   struct perf_sample *sample,
1590 				   struct machine *machine,
1591 				   struct thread *thread);
1592 
1593 static int perf_sched__process_tracepoint_sample(struct perf_tool *tool,
1594 						 union perf_event *event __used,
1595 						 struct perf_sample *sample,
1596 						 struct perf_evsel *evsel,
1597 						 struct machine *machine)
1598 {
1599 	struct thread *thread = machine__findnew_thread(machine, sample->pid);
1600 
1601 	if (thread == NULL) {
1602 		pr_debug("problem processing %s event, skipping it.\n",
1603 			 evsel->name);
1604 		return -1;
1605 	}
1606 
1607 	evsel->hists.stats.total_period += sample->period;
1608 	hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);
1609 
1610 	if (evsel->handler.func != NULL) {
1611 		tracepoint_handler f = evsel->handler.func;
1612 
1613 		if (evsel->handler.data == NULL)
1614 			evsel->handler.data = trace_find_event(evsel->attr.config);
1615 
1616 		f(tool, evsel->handler.data, sample, machine, thread);
1617 	}
1618 
1619 	return 0;
1620 }
1621 
1622 static struct perf_tool perf_sched = {
1623 	.sample			= perf_sched__process_tracepoint_sample,
1624 	.comm			= perf_event__process_comm,
1625 	.lost			= perf_event__process_lost,
1626 	.fork			= perf_event__process_task,
1627 	.ordered_samples	= true,
1628 };
1629 
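/*
 * Open the perf.data session, attach the tracepoint handlers above, process
 * all events and record lost-event statistics.  Callers either let this
 * destroy the session or take ownership of it via @psession.
 */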
1630 static void read_events(bool destroy, struct perf_session **psession)
1631 {
1632 	int err = -EINVAL;
1633 	const struct perf_evsel_str_handler handlers[] = {
1634 		{ "sched:sched_switch",	      process_sched_switch_event, },
1635 		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
1636 		{ "sched:sched_wakeup",	      process_sched_wakeup_event, },
1637 		{ "sched:sched_wakeup_new",   process_sched_wakeup_event, },
1638 		{ "sched:sched_process_fork", process_sched_fork_event, },
1639 		{ "sched:sched_process_exit", process_sched_exit_event, },
1640 		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
1641 	};
1642 	struct perf_session *session = perf_session__new(input_name, O_RDONLY,
1643 							 0, false, &perf_sched);
1644 	if (session == NULL)
1645 		die("No memory");
1646 
1647 	err = perf_evlist__set_tracepoints_handlers_array(session->evlist, handlers);
1648 	assert(err == 0);
1649 
1650 	if (perf_session__has_traces(session, "record -R")) {
1651 		err = perf_session__process_events(session, &perf_sched);
1652 		if (err)
1653 			die("Failed to process events, error %d", err);
1654 
1655 		nr_events      = session->hists.stats.nr_events[0];
1656 		nr_lost_events = session->hists.stats.total_lost;
1657 		nr_lost_chunks = session->hists.stats.nr_events[PERF_RECORD_LOST];
1658 	}
1659 
1660 	if (destroy)
1661 		perf_session__delete(session);
1662 
1663 	if (psession)
1664 		*psession = session;
1665 }
1666 
1667 static void print_bad_events(void)
1668 {
1669 	if (nr_unordered_timestamps && nr_timestamps) {
1670 		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
1671 			(double)nr_unordered_timestamps/(double)nr_timestamps*100.0,
1672 			nr_unordered_timestamps, nr_timestamps);
1673 	}
1674 	if (nr_lost_events && nr_events) {
1675 		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
1676 			(double)nr_lost_events/(double)nr_events*100.0,
1677 			nr_lost_events, nr_events, nr_lost_chunks);
1678 	}
1679 	if (nr_state_machine_bugs && nr_timestamps) {
1680 		printf("  INFO: %.3f%% state machine bugs (%ld out of %ld)",
1681 			(double)nr_state_machine_bugs/(double)nr_timestamps*100.0,
1682 			nr_state_machine_bugs, nr_timestamps);
1683 		if (nr_lost_events)
1684 			printf(" (due to lost events?)");
1685 		printf("\n");
1686 	}
1687 	if (nr_context_switch_bugs && nr_timestamps) {
1688 		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
1689 			(double)nr_context_switch_bugs/(double)nr_timestamps*100.0,
1690 			nr_context_switch_bugs, nr_timestamps);
1691 		if (nr_lost_events)
1692 			printf(" (due to lost events?)");
1693 		printf("\n");
1694 	}
1695 }
1696 
1697 static void __cmd_lat(void)
1698 {
1699 	struct rb_node *next;
1700 	struct perf_session *session;
1701 
1702 	setup_pager();
1703 	read_events(false, &session);
1704 	sort_lat();
1705 
1706 	printf("\n ---------------------------------------------------------------------------------------------------------------\n");
1707 	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at     |\n");
1708 	printf(" ---------------------------------------------------------------------------------------------------------------\n");
1709 
1710 	next = rb_first(&sorted_atom_root);
1711 
1712 	while (next) {
1713 		struct work_atoms *work_list;
1714 
1715 		work_list = rb_entry(next, struct work_atoms, node);
1716 		output_lat_thread(work_list);
1717 		next = rb_next(next);
1718 	}
1719 
1720 	printf(" -----------------------------------------------------------------------------------------\n");
1721 	printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
1722 		(double)all_runtime/1e6, all_count);
1723 
1724 	printf(" ---------------------------------------------------\n");
1725 
1726 	print_bad_events();
1727 	printf("\n");
1728 
1729 	perf_session__delete(session);
1730 }
1731 
1732 static struct trace_sched_handler map_ops  = {
1733 	.wakeup_event		= NULL,
1734 	.switch_event		= map_switch_event,
1735 	.runtime_event		= NULL,
1736 	.fork_event		= NULL,
1737 };
1738 
1739 static void __cmd_map(void)
1740 {
1741 	max_cpu = sysconf(_SC_NPROCESSORS_CONF);
1742 
1743 	setup_pager();
1744 	read_events(true, NULL);
1745 	print_bad_events();
1746 }
1747 
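/*
 * 'perf sched replay': calibrate the timing primitives, rebuild the traced
 * tasks and their run/sleep/wakeup atoms from the recorded events, then
 * spawn one thread per task and re-execute the schedule replay_repeat
 * times, printing per-iteration runtime and CPU-usage averages.
 */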
1748 static void __cmd_replay(void)
1749 {
1750 	unsigned long i;
1751 
1752 	calibrate_run_measurement_overhead();
1753 	calibrate_sleep_measurement_overhead();
1754 
1755 	test_calibrations();
1756 
1757 	read_events(true, NULL);
1758 
1759 	printf("nr_run_events:        %ld\n", nr_run_events);
1760 	printf("nr_sleep_events:      %ld\n", nr_sleep_events);
1761 	printf("nr_wakeup_events:     %ld\n", nr_wakeup_events);
1762 
1763 	if (targetless_wakeups)
1764 		printf("target-less wakeups:  %ld\n", targetless_wakeups);
1765 	if (multitarget_wakeups)
1766 		printf("multi-target wakeups: %ld\n", multitarget_wakeups);
1767 	if (nr_run_events_optimized)
1768 		printf("run atoms optimized: %ld\n",
1769 			nr_run_events_optimized);
1770 
1771 	print_task_traces();
1772 	add_cross_task_wakeups();
1773 
1774 	create_tasks();
1775 	printf("------------------------------------------------------------\n");
1776 	for (i = 0; i < replay_repeat; i++)
1777 		run_one_test();
1778 }
1779 
1780 
1781 static const char * const sched_usage[] = {
1782 	"perf sched [<options>] {record|latency|map|replay|script}",
1783 	NULL
1784 };
1785 
1786 static const struct option sched_options[] = {
1787 	OPT_STRING('i', "input", &input_name, "file",
1788 		    "input file name"),
1789 	OPT_INCR('v', "verbose", &verbose,
1790 		    "be more verbose (show symbol address, etc)"),
1791 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1792 		    "dump raw trace in ASCII"),
1793 	OPT_END()
1794 };
1795 
1796 static const char * const latency_usage[] = {
1797 	"perf sched latency [<options>]",
1798 	NULL
1799 };
1800 
1801 static const struct option latency_options[] = {
1802 	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
1803 		   "sort by key(s): runtime, switch, avg, max"),
1804 	OPT_INCR('v', "verbose", &verbose,
1805 		    "be more verbose (show symbol address, etc)"),
1806 	OPT_INTEGER('C', "CPU", &profile_cpu,
1807 		    "CPU to profile on"),
1808 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1809 		    "dump raw trace in ASCII"),
1810 	OPT_END()
1811 };
1812 
1813 static const char * const replay_usage[] = {
1814 	"perf sched replay [<options>]",
1815 	NULL
1816 };
1817 
1818 static const struct option replay_options[] = {
1819 	OPT_UINTEGER('r', "repeat", &replay_repeat,
1820 		     "repeat the workload replay N times (-1: infinite)"),
1821 	OPT_INCR('v', "verbose", &verbose,
1822 		    "be more verbose (show symbol address, etc)"),
1823 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1824 		    "dump raw trace in ASCII"),
1825 	OPT_END()
1826 };
1827 
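/*
 * Parse the --sort string (e.g. "avg,max,switch,runtime") into a list of
 * sort_dimension callbacks; unknown keys abort with the latency usage.  A
 * separate pid-only list (cmp_pid) keys the rbtree used while events are
 * being collected.
 */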
1828 static void setup_sorting(void)
1829 {
1830 	char *tmp, *tok, *str = strdup(sort_order);
1831 
1832 	for (tok = strtok_r(str, ", ", &tmp);
1833 			tok; tok = strtok_r(NULL, ", ", &tmp)) {
1834 		if (sort_dimension__add(tok, &sort_list) < 0) {
1835 			error("Unknown --sort key: `%s'", tok);
1836 			usage_with_options(latency_usage, latency_options);
1837 		}
1838 	}
1839 
1840 	free(str);
1841 
1842 	sort_dimension__add("pid", &cmp_pid);
1843 }
1844 
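/*
 * Baseline argument vector for 'perf sched record'; any extra arguments
 * given on the command line (for example a workload to run, as in
 * "perf sched record -- sleep 1") are appended in __cmd_record().
 */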
1845 static const char *record_args[] = {
1846 	"record",
1847 	"-a",
1848 	"-R",
1849 	"-f",
1850 	"-m", "1024",
1851 	"-c", "1",
1852 	"-e", "sched:sched_switch",
1853 	"-e", "sched:sched_stat_wait",
1854 	"-e", "sched:sched_stat_sleep",
1855 	"-e", "sched:sched_stat_iowait",
1856 	"-e", "sched:sched_stat_runtime",
1857 	"-e", "sched:sched_process_exit",
1858 	"-e", "sched:sched_process_fork",
1859 	"-e", "sched:sched_wakeup",
1860 	"-e", "sched:sched_migrate_task",
1861 };
1862 
1863 static int __cmd_record(int argc, const char **argv)
1864 {
1865 	unsigned int rec_argc, i, j;
1866 	const char **rec_argv;
1867 
1868 	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
1869 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
1870 
1871 	if (rec_argv == NULL)
1872 		return -ENOMEM;
1873 
1874 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
1875 		rec_argv[i] = strdup(record_args[i]);
1876 
1877 	for (j = 1; j < (unsigned int)argc; j++, i++)
1878 		rec_argv[i] = argv[j];
1879 
1880 	BUG_ON(i != rec_argc);
1881 
1882 	return cmd_record(i, rec_argv, NULL);
1883 }
1884 
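/*
 * Entry point: dispatch to record/latency/map/replay, or fall through to
 * 'perf script' for the "script" alias.  Subcommand prefixes are accepted
 * ("rec", "lat", "rep").
 */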
1885 int cmd_sched(int argc, const char **argv, const char *prefix __used)
1886 {
1887 	argc = parse_options(argc, argv, sched_options, sched_usage,
1888 			     PARSE_OPT_STOP_AT_NON_OPTION);
1889 	if (!argc)
1890 		usage_with_options(sched_usage, sched_options);
1891 
1892 	/*
1893 	 * Aliased to 'perf script' for now:
1894 	 */
1895 	if (!strcmp(argv[0], "script"))
1896 		return cmd_script(argc, argv, prefix);
1897 
1898 	symbol__init();
1899 	if (!strncmp(argv[0], "rec", 3)) {
1900 		return __cmd_record(argc, argv);
1901 	} else if (!strncmp(argv[0], "lat", 3)) {
1902 		trace_handler = &lat_ops;
1903 		if (argc > 1) {
1904 			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
1905 			if (argc)
1906 				usage_with_options(latency_usage, latency_options);
1907 		}
1908 		setup_sorting();
1909 		__cmd_lat();
1910 	} else if (!strcmp(argv[0], "map")) {
1911 		trace_handler = &map_ops;
1912 		setup_sorting();
1913 		__cmd_map();
1914 	} else if (!strncmp(argv[0], "rep", 3)) {
1915 		trace_handler = &replay_ops;
1916 		if (argc) {
1917 			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
1918 			if (argc)
1919 				usage_with_options(replay_usage, replay_options);
1920 		}
1921 		__cmd_replay();
1922 	} else {
1923 		usage_with_options(sched_usage, sched_options);
1924 	}
1925 
1926 	return 0;
1927 }
1928