1 // SPDX-License-Identifier: GPL-2.0 2 #include "builtin.h" 3 #include "perf.h" 4 #include "perf-sys.h" 5 6 #include "util/cpumap.h" 7 #include "util/evlist.h" 8 #include "util/evsel.h" 9 #include "util/evsel_fprintf.h" 10 #include "util/symbol.h" 11 #include "util/thread.h" 12 #include "util/header.h" 13 #include "util/session.h" 14 #include "util/tool.h" 15 #include "util/cloexec.h" 16 #include "util/thread_map.h" 17 #include "util/color.h" 18 #include "util/stat.h" 19 #include "util/string2.h" 20 #include "util/callchain.h" 21 #include "util/time-utils.h" 22 23 #include <subcmd/pager.h> 24 #include <subcmd/parse-options.h> 25 #include "util/trace-event.h" 26 27 #include "util/debug.h" 28 #include "util/event.h" 29 30 #include <linux/kernel.h> 31 #include <linux/log2.h> 32 #include <linux/zalloc.h> 33 #include <sys/prctl.h> 34 #include <sys/resource.h> 35 #include <inttypes.h> 36 37 #include <errno.h> 38 #include <semaphore.h> 39 #include <pthread.h> 40 #include <math.h> 41 #include <api/fs/fs.h> 42 #include <perf/cpumap.h> 43 #include <linux/time64.h> 44 #include <linux/err.h> 45 46 #include <linux/ctype.h> 47 48 #define PR_SET_NAME 15 /* Set process name */ 49 #define MAX_CPUS 4096 50 #define COMM_LEN 20 51 #define SYM_LEN 129 52 #define MAX_PID 1024000 53 54 static const char *cpu_list; 55 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); 56 57 struct sched_atom; 58 59 struct task_desc { 60 unsigned long nr; 61 unsigned long pid; 62 char comm[COMM_LEN]; 63 64 unsigned long nr_events; 65 unsigned long curr_event; 66 struct sched_atom **atoms; 67 68 pthread_t thread; 69 sem_t sleep_sem; 70 71 sem_t ready_for_work; 72 sem_t work_done_sem; 73 74 u64 cpu_usage; 75 }; 76 77 enum sched_event_type { 78 SCHED_EVENT_RUN, 79 SCHED_EVENT_SLEEP, 80 SCHED_EVENT_WAKEUP, 81 SCHED_EVENT_MIGRATION, 82 }; 83 84 struct sched_atom { 85 enum sched_event_type type; 86 int specific_wait; 87 u64 timestamp; 88 u64 duration; 89 unsigned long nr; 90 sem_t *wait_sem; 91 struct task_desc *wakee; 92 }; 93 94 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP" 95 96 /* task state bitmask, copied from include/linux/sched.h */ 97 #define TASK_RUNNING 0 98 #define TASK_INTERRUPTIBLE 1 99 #define TASK_UNINTERRUPTIBLE 2 100 #define __TASK_STOPPED 4 101 #define __TASK_TRACED 8 102 /* in tsk->exit_state */ 103 #define EXIT_DEAD 16 104 #define EXIT_ZOMBIE 32 105 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) 106 /* in tsk->state again */ 107 #define TASK_DEAD 64 108 #define TASK_WAKEKILL 128 109 #define TASK_WAKING 256 110 #define TASK_PARKED 512 111 112 enum thread_state { 113 THREAD_SLEEPING = 0, 114 THREAD_WAIT_CPU, 115 THREAD_SCHED_IN, 116 THREAD_IGNORE 117 }; 118 119 struct work_atom { 120 struct list_head list; 121 enum thread_state state; 122 u64 sched_out_time; 123 u64 wake_up_time; 124 u64 sched_in_time; 125 u64 runtime; 126 }; 127 128 struct work_atoms { 129 struct list_head work_list; 130 struct thread *thread; 131 struct rb_node node; 132 u64 max_lat; 133 u64 max_lat_at; 134 u64 total_lat; 135 u64 nb_atoms; 136 u64 total_runtime; 137 int num_merged; 138 }; 139 140 typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *); 141 142 struct perf_sched; 143 144 struct trace_sched_handler { 145 int (*switch_event)(struct perf_sched *sched, struct evsel *evsel, 146 struct perf_sample *sample, struct machine *machine); 147 148 int (*runtime_event)(struct perf_sched *sched, struct evsel *evsel, 149 struct perf_sample *sample, struct machine *machine); 150 151 int (*wakeup_event)(struct perf_sched *sched, struct evsel 
*evsel, 152 struct perf_sample *sample, struct machine *machine); 153 154 /* PERF_RECORD_FORK event, not sched_process_fork tracepoint */ 155 int (*fork_event)(struct perf_sched *sched, union perf_event *event, 156 struct machine *machine); 157 158 int (*migrate_task_event)(struct perf_sched *sched, 159 struct evsel *evsel, 160 struct perf_sample *sample, 161 struct machine *machine); 162 }; 163 164 #define COLOR_PIDS PERF_COLOR_BLUE 165 #define COLOR_CPUS PERF_COLOR_BG_RED 166 167 struct perf_sched_map { 168 DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS); 169 int *comp_cpus; 170 bool comp; 171 struct perf_thread_map *color_pids; 172 const char *color_pids_str; 173 struct perf_cpu_map *color_cpus; 174 const char *color_cpus_str; 175 struct perf_cpu_map *cpus; 176 const char *cpus_str; 177 }; 178 179 struct perf_sched { 180 struct perf_tool tool; 181 const char *sort_order; 182 unsigned long nr_tasks; 183 struct task_desc **pid_to_task; 184 struct task_desc **tasks; 185 const struct trace_sched_handler *tp_handler; 186 pthread_mutex_t start_work_mutex; 187 pthread_mutex_t work_done_wait_mutex; 188 int profile_cpu; 189 /* 190 * Track the current task - that way we can know whether there's any 191 * weird events, such as a task being switched away that is not current. 192 */ 193 int max_cpu; 194 u32 curr_pid[MAX_CPUS]; 195 struct thread *curr_thread[MAX_CPUS]; 196 char next_shortname1; 197 char next_shortname2; 198 unsigned int replay_repeat; 199 unsigned long nr_run_events; 200 unsigned long nr_sleep_events; 201 unsigned long nr_wakeup_events; 202 unsigned long nr_sleep_corrections; 203 unsigned long nr_run_events_optimized; 204 unsigned long targetless_wakeups; 205 unsigned long multitarget_wakeups; 206 unsigned long nr_runs; 207 unsigned long nr_timestamps; 208 unsigned long nr_unordered_timestamps; 209 unsigned long nr_context_switch_bugs; 210 unsigned long nr_events; 211 unsigned long nr_lost_chunks; 212 unsigned long nr_lost_events; 213 u64 run_measurement_overhead; 214 u64 sleep_measurement_overhead; 215 u64 start_time; 216 u64 cpu_usage; 217 u64 runavg_cpu_usage; 218 u64 parent_cpu_usage; 219 u64 runavg_parent_cpu_usage; 220 u64 sum_runtime; 221 u64 sum_fluct; 222 u64 run_avg; 223 u64 all_runtime; 224 u64 all_count; 225 u64 cpu_last_switched[MAX_CPUS]; 226 struct rb_root_cached atom_root, sorted_atom_root, merged_atom_root; 227 struct list_head sort_list, cmp_pid; 228 bool force; 229 bool skip_merge; 230 struct perf_sched_map map; 231 232 /* options for timehist command */ 233 bool summary; 234 bool summary_only; 235 bool idle_hist; 236 bool show_callchain; 237 unsigned int max_stack; 238 bool show_cpu_visual; 239 bool show_wakeups; 240 bool show_next; 241 bool show_migrations; 242 bool show_state; 243 u64 skipped_samples; 244 const char *time_str; 245 struct perf_time_interval ptime; 246 struct perf_time_interval hist_time; 247 }; 248 249 /* per thread run time data */ 250 struct thread_runtime { 251 u64 last_time; /* time of previous sched in/out event */ 252 u64 dt_run; /* run time */ 253 u64 dt_sleep; /* time between CPU access by sleep (off cpu) */ 254 u64 dt_iowait; /* time between CPU access by iowait (off cpu) */ 255 u64 dt_preempt; /* time between CPU access by preempt (off cpu) */ 256 u64 dt_delay; /* time between wakeup and sched-in */ 257 u64 ready_to_run; /* time of wakeup */ 258 259 struct stats run_stats; 260 u64 total_run_time; 261 u64 total_sleep_time; 262 u64 total_iowait_time; 263 u64 total_preempt_time; 264 u64 total_delay_time; 265 266 int last_state; 267 268 char 
shortname[3]; 269 bool comm_changed; 270 271 u64 migrations; 272 }; 273 274 /* per event run time data */ 275 struct evsel_runtime { 276 u64 *last_time; /* time this event was last seen per cpu */ 277 u32 ncpu; /* highest cpu slot allocated */ 278 }; 279 280 /* per cpu idle time data */ 281 struct idle_thread_runtime { 282 struct thread_runtime tr; 283 struct thread *last_thread; 284 struct rb_root_cached sorted_root; 285 struct callchain_root callchain; 286 struct callchain_cursor cursor; 287 }; 288 289 /* track idle times per cpu */ 290 static struct thread **idle_threads; 291 static int idle_max_cpu; 292 static char idle_comm[] = "<idle>"; 293 294 static u64 get_nsecs(void) 295 { 296 struct timespec ts; 297 298 clock_gettime(CLOCK_MONOTONIC, &ts); 299 300 return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec; 301 } 302 303 static void burn_nsecs(struct perf_sched *sched, u64 nsecs) 304 { 305 u64 T0 = get_nsecs(), T1; 306 307 do { 308 T1 = get_nsecs(); 309 } while (T1 + sched->run_measurement_overhead < T0 + nsecs); 310 } 311 312 static void sleep_nsecs(u64 nsecs) 313 { 314 struct timespec ts; 315 316 ts.tv_nsec = nsecs % 999999999; 317 ts.tv_sec = nsecs / 999999999; 318 319 nanosleep(&ts, NULL); 320 } 321 322 static void calibrate_run_measurement_overhead(struct perf_sched *sched) 323 { 324 u64 T0, T1, delta, min_delta = NSEC_PER_SEC; 325 int i; 326 327 for (i = 0; i < 10; i++) { 328 T0 = get_nsecs(); 329 burn_nsecs(sched, 0); 330 T1 = get_nsecs(); 331 delta = T1-T0; 332 min_delta = min(min_delta, delta); 333 } 334 sched->run_measurement_overhead = min_delta; 335 336 printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta); 337 } 338 339 static void calibrate_sleep_measurement_overhead(struct perf_sched *sched) 340 { 341 u64 T0, T1, delta, min_delta = NSEC_PER_SEC; 342 int i; 343 344 for (i = 0; i < 10; i++) { 345 T0 = get_nsecs(); 346 sleep_nsecs(10000); 347 T1 = get_nsecs(); 348 delta = T1-T0; 349 min_delta = min(min_delta, delta); 350 } 351 min_delta -= 10000; 352 sched->sleep_measurement_overhead = min_delta; 353 354 printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta); 355 } 356 357 static struct sched_atom * 358 get_new_event(struct task_desc *task, u64 timestamp) 359 { 360 struct sched_atom *event = zalloc(sizeof(*event)); 361 unsigned long idx = task->nr_events; 362 size_t size; 363 364 event->timestamp = timestamp; 365 event->nr = idx; 366 367 task->nr_events++; 368 size = sizeof(struct sched_atom *) * task->nr_events; 369 task->atoms = realloc(task->atoms, size); 370 BUG_ON(!task->atoms); 371 372 task->atoms[idx] = event; 373 374 return event; 375 } 376 377 static struct sched_atom *last_event(struct task_desc *task) 378 { 379 if (!task->nr_events) 380 return NULL; 381 382 return task->atoms[task->nr_events - 1]; 383 } 384 385 static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task, 386 u64 timestamp, u64 duration) 387 { 388 struct sched_atom *event, *curr_event = last_event(task); 389 390 /* 391 * optimize an existing RUN event by merging this one 392 * to it: 393 */ 394 if (curr_event && curr_event->type == SCHED_EVENT_RUN) { 395 sched->nr_run_events_optimized++; 396 curr_event->duration += duration; 397 return; 398 } 399 400 event = get_new_event(task, timestamp); 401 402 event->type = SCHED_EVENT_RUN; 403 event->duration = duration; 404 405 sched->nr_run_events++; 406 } 407 408 static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task, 409 u64 timestamp, struct task_desc *wakee) 410 { 411 struct sched_atom 
*event, *wakee_event; 412 413 event = get_new_event(task, timestamp); 414 event->type = SCHED_EVENT_WAKEUP; 415 event->wakee = wakee; 416 417 wakee_event = last_event(wakee); 418 if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) { 419 sched->targetless_wakeups++; 420 return; 421 } 422 if (wakee_event->wait_sem) { 423 sched->multitarget_wakeups++; 424 return; 425 } 426 427 wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem)); 428 sem_init(wakee_event->wait_sem, 0, 0); 429 wakee_event->specific_wait = 1; 430 event->wait_sem = wakee_event->wait_sem; 431 432 sched->nr_wakeup_events++; 433 } 434 435 static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task, 436 u64 timestamp, u64 task_state __maybe_unused) 437 { 438 struct sched_atom *event = get_new_event(task, timestamp); 439 440 event->type = SCHED_EVENT_SLEEP; 441 442 sched->nr_sleep_events++; 443 } 444 445 static struct task_desc *register_pid(struct perf_sched *sched, 446 unsigned long pid, const char *comm) 447 { 448 struct task_desc *task; 449 static int pid_max; 450 451 if (sched->pid_to_task == NULL) { 452 if (sysctl__read_int("kernel/pid_max", &pid_max) < 0) 453 pid_max = MAX_PID; 454 BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL); 455 } 456 if (pid >= (unsigned long)pid_max) { 457 BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) * 458 sizeof(struct task_desc *))) == NULL); 459 while (pid >= (unsigned long)pid_max) 460 sched->pid_to_task[pid_max++] = NULL; 461 } 462 463 task = sched->pid_to_task[pid]; 464 465 if (task) 466 return task; 467 468 task = zalloc(sizeof(*task)); 469 task->pid = pid; 470 task->nr = sched->nr_tasks; 471 strcpy(task->comm, comm); 472 /* 473 * every task starts in sleeping state - this gets ignored 474 * if there's no wakeup pointing to this sleep state: 475 */ 476 add_sched_event_sleep(sched, task, 0, 0); 477 478 sched->pid_to_task[pid] = task; 479 sched->nr_tasks++; 480 sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *)); 481 BUG_ON(!sched->tasks); 482 sched->tasks[task->nr] = task; 483 484 if (verbose > 0) 485 printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm); 486 487 return task; 488 } 489 490 491 static void print_task_traces(struct perf_sched *sched) 492 { 493 struct task_desc *task; 494 unsigned long i; 495 496 for (i = 0; i < sched->nr_tasks; i++) { 497 task = sched->tasks[i]; 498 printf("task %6ld (%20s:%10ld), nr_events: %ld\n", 499 task->nr, task->comm, task->pid, task->nr_events); 500 } 501 } 502 503 static void add_cross_task_wakeups(struct perf_sched *sched) 504 { 505 struct task_desc *task1, *task2; 506 unsigned long i, j; 507 508 for (i = 0; i < sched->nr_tasks; i++) { 509 task1 = sched->tasks[i]; 510 j = i + 1; 511 if (j == sched->nr_tasks) 512 j = 0; 513 task2 = sched->tasks[j]; 514 add_sched_event_wakeup(sched, task1, 0, task2); 515 } 516 } 517 518 static void perf_sched__process_event(struct perf_sched *sched, 519 struct sched_atom *atom) 520 { 521 int ret = 0; 522 523 switch (atom->type) { 524 case SCHED_EVENT_RUN: 525 burn_nsecs(sched, atom->duration); 526 break; 527 case SCHED_EVENT_SLEEP: 528 if (atom->wait_sem) 529 ret = sem_wait(atom->wait_sem); 530 BUG_ON(ret); 531 break; 532 case SCHED_EVENT_WAKEUP: 533 if (atom->wait_sem) 534 ret = sem_post(atom->wait_sem); 535 BUG_ON(ret); 536 break; 537 case SCHED_EVENT_MIGRATION: 538 break; 539 default: 540 BUG_ON(1); 541 } 542 } 543 544 static u64 get_cpu_usage_nsec_parent(void) 545 { 546 
struct rusage ru; 547 u64 sum; 548 int err; 549 550 err = getrusage(RUSAGE_SELF, &ru); 551 BUG_ON(err); 552 553 sum = ru.ru_utime.tv_sec * NSEC_PER_SEC + ru.ru_utime.tv_usec * NSEC_PER_USEC; 554 sum += ru.ru_stime.tv_sec * NSEC_PER_SEC + ru.ru_stime.tv_usec * NSEC_PER_USEC; 555 556 return sum; 557 } 558 559 static int self_open_counters(struct perf_sched *sched, unsigned long cur_task) 560 { 561 struct perf_event_attr attr; 562 char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE]; 563 int fd; 564 struct rlimit limit; 565 bool need_privilege = false; 566 567 memset(&attr, 0, sizeof(attr)); 568 569 attr.type = PERF_TYPE_SOFTWARE; 570 attr.config = PERF_COUNT_SW_TASK_CLOCK; 571 572 force_again: 573 fd = sys_perf_event_open(&attr, 0, -1, -1, 574 perf_event_open_cloexec_flag()); 575 576 if (fd < 0) { 577 if (errno == EMFILE) { 578 if (sched->force) { 579 BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1); 580 limit.rlim_cur += sched->nr_tasks - cur_task; 581 if (limit.rlim_cur > limit.rlim_max) { 582 limit.rlim_max = limit.rlim_cur; 583 need_privilege = true; 584 } 585 if (setrlimit(RLIMIT_NOFILE, &limit) == -1) { 586 if (need_privilege && errno == EPERM) 587 strcpy(info, "Need privilege\n"); 588 } else 589 goto force_again; 590 } else 591 strcpy(info, "Have a try with -f option\n"); 592 } 593 pr_err("Error: sys_perf_event_open() syscall returned " 594 "with %d (%s)\n%s", fd, 595 str_error_r(errno, sbuf, sizeof(sbuf)), info); 596 exit(EXIT_FAILURE); 597 } 598 return fd; 599 } 600 601 static u64 get_cpu_usage_nsec_self(int fd) 602 { 603 u64 runtime; 604 int ret; 605 606 ret = read(fd, &runtime, sizeof(runtime)); 607 BUG_ON(ret != sizeof(runtime)); 608 609 return runtime; 610 } 611 612 struct sched_thread_parms { 613 struct task_desc *task; 614 struct perf_sched *sched; 615 int fd; 616 }; 617 618 static void *thread_func(void *ctx) 619 { 620 struct sched_thread_parms *parms = ctx; 621 struct task_desc *this_task = parms->task; 622 struct perf_sched *sched = parms->sched; 623 u64 cpu_usage_0, cpu_usage_1; 624 unsigned long i, ret; 625 char comm2[22]; 626 int fd = parms->fd; 627 628 zfree(&parms); 629 630 sprintf(comm2, ":%s", this_task->comm); 631 prctl(PR_SET_NAME, comm2); 632 if (fd < 0) 633 return NULL; 634 again: 635 ret = sem_post(&this_task->ready_for_work); 636 BUG_ON(ret); 637 ret = pthread_mutex_lock(&sched->start_work_mutex); 638 BUG_ON(ret); 639 ret = pthread_mutex_unlock(&sched->start_work_mutex); 640 BUG_ON(ret); 641 642 cpu_usage_0 = get_cpu_usage_nsec_self(fd); 643 644 for (i = 0; i < this_task->nr_events; i++) { 645 this_task->curr_event = i; 646 perf_sched__process_event(sched, this_task->atoms[i]); 647 } 648 649 cpu_usage_1 = get_cpu_usage_nsec_self(fd); 650 this_task->cpu_usage = cpu_usage_1 - cpu_usage_0; 651 ret = sem_post(&this_task->work_done_sem); 652 BUG_ON(ret); 653 654 ret = pthread_mutex_lock(&sched->work_done_wait_mutex); 655 BUG_ON(ret); 656 ret = pthread_mutex_unlock(&sched->work_done_wait_mutex); 657 BUG_ON(ret); 658 659 goto again; 660 } 661 662 static void create_tasks(struct perf_sched *sched) 663 { 664 struct task_desc *task; 665 pthread_attr_t attr; 666 unsigned long i; 667 int err; 668 669 err = pthread_attr_init(&attr); 670 BUG_ON(err); 671 err = pthread_attr_setstacksize(&attr, 672 (size_t) max(16 * 1024, PTHREAD_STACK_MIN)); 673 BUG_ON(err); 674 err = pthread_mutex_lock(&sched->start_work_mutex); 675 BUG_ON(err); 676 err = pthread_mutex_lock(&sched->work_done_wait_mutex); 677 BUG_ON(err); 678 for (i = 0; i < sched->nr_tasks; i++) { 679 struct sched_thread_parms 
*parms = malloc(sizeof(*parms)); 680 BUG_ON(parms == NULL); 681 parms->task = task = sched->tasks[i]; 682 parms->sched = sched; 683 parms->fd = self_open_counters(sched, i); 684 sem_init(&task->sleep_sem, 0, 0); 685 sem_init(&task->ready_for_work, 0, 0); 686 sem_init(&task->work_done_sem, 0, 0); 687 task->curr_event = 0; 688 err = pthread_create(&task->thread, &attr, thread_func, parms); 689 BUG_ON(err); 690 } 691 } 692 693 static void wait_for_tasks(struct perf_sched *sched) 694 { 695 u64 cpu_usage_0, cpu_usage_1; 696 struct task_desc *task; 697 unsigned long i, ret; 698 699 sched->start_time = get_nsecs(); 700 sched->cpu_usage = 0; 701 pthread_mutex_unlock(&sched->work_done_wait_mutex); 702 703 for (i = 0; i < sched->nr_tasks; i++) { 704 task = sched->tasks[i]; 705 ret = sem_wait(&task->ready_for_work); 706 BUG_ON(ret); 707 sem_init(&task->ready_for_work, 0, 0); 708 } 709 ret = pthread_mutex_lock(&sched->work_done_wait_mutex); 710 BUG_ON(ret); 711 712 cpu_usage_0 = get_cpu_usage_nsec_parent(); 713 714 pthread_mutex_unlock(&sched->start_work_mutex); 715 716 for (i = 0; i < sched->nr_tasks; i++) { 717 task = sched->tasks[i]; 718 ret = sem_wait(&task->work_done_sem); 719 BUG_ON(ret); 720 sem_init(&task->work_done_sem, 0, 0); 721 sched->cpu_usage += task->cpu_usage; 722 task->cpu_usage = 0; 723 } 724 725 cpu_usage_1 = get_cpu_usage_nsec_parent(); 726 if (!sched->runavg_cpu_usage) 727 sched->runavg_cpu_usage = sched->cpu_usage; 728 sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat; 729 730 sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0; 731 if (!sched->runavg_parent_cpu_usage) 732 sched->runavg_parent_cpu_usage = sched->parent_cpu_usage; 733 sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) + 734 sched->parent_cpu_usage)/sched->replay_repeat; 735 736 ret = pthread_mutex_lock(&sched->start_work_mutex); 737 BUG_ON(ret); 738 739 for (i = 0; i < sched->nr_tasks; i++) { 740 task = sched->tasks[i]; 741 sem_init(&task->sleep_sem, 0, 0); 742 task->curr_event = 0; 743 } 744 } 745 746 static void run_one_test(struct perf_sched *sched) 747 { 748 u64 T0, T1, delta, avg_delta, fluct; 749 750 T0 = get_nsecs(); 751 wait_for_tasks(sched); 752 T1 = get_nsecs(); 753 754 delta = T1 - T0; 755 sched->sum_runtime += delta; 756 sched->nr_runs++; 757 758 avg_delta = sched->sum_runtime / sched->nr_runs; 759 if (delta < avg_delta) 760 fluct = avg_delta - delta; 761 else 762 fluct = delta - avg_delta; 763 sched->sum_fluct += fluct; 764 if (!sched->run_avg) 765 sched->run_avg = delta; 766 sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat; 767 768 printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / NSEC_PER_MSEC); 769 770 printf("ravg: %0.2f, ", (double)sched->run_avg / NSEC_PER_MSEC); 771 772 printf("cpu: %0.2f / %0.2f", 773 (double)sched->cpu_usage / NSEC_PER_MSEC, (double)sched->runavg_cpu_usage / NSEC_PER_MSEC); 774 775 #if 0 776 /* 777 * rusage statistics done by the parent, these are less 778 * accurate than the sched->sum_exec_runtime based statistics: 779 */ 780 printf(" [%0.2f / %0.2f]", 781 (double)sched->parent_cpu_usage / NSEC_PER_MSEC, 782 (double)sched->runavg_parent_cpu_usage / NSEC_PER_MSEC); 783 #endif 784 785 printf("\n"); 786 787 if (sched->nr_sleep_corrections) 788 printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections); 789 sched->nr_sleep_corrections = 0; 790 } 791 792 static void test_calibrations(struct perf_sched 
*sched) 793 { 794 u64 T0, T1; 795 796 T0 = get_nsecs(); 797 burn_nsecs(sched, NSEC_PER_MSEC); 798 T1 = get_nsecs(); 799 800 printf("the run test took %" PRIu64 " nsecs\n", T1 - T0); 801 802 T0 = get_nsecs(); 803 sleep_nsecs(NSEC_PER_MSEC); 804 T1 = get_nsecs(); 805 806 printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0); 807 } 808 809 static int 810 replay_wakeup_event(struct perf_sched *sched, 811 struct evsel *evsel, struct perf_sample *sample, 812 struct machine *machine __maybe_unused) 813 { 814 const char *comm = evsel__strval(evsel, sample, "comm"); 815 const u32 pid = evsel__intval(evsel, sample, "pid"); 816 struct task_desc *waker, *wakee; 817 818 if (verbose > 0) { 819 printf("sched_wakeup event %p\n", evsel); 820 821 printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid); 822 } 823 824 waker = register_pid(sched, sample->tid, "<unknown>"); 825 wakee = register_pid(sched, pid, comm); 826 827 add_sched_event_wakeup(sched, waker, sample->time, wakee); 828 return 0; 829 } 830 831 static int replay_switch_event(struct perf_sched *sched, 832 struct evsel *evsel, 833 struct perf_sample *sample, 834 struct machine *machine __maybe_unused) 835 { 836 const char *prev_comm = evsel__strval(evsel, sample, "prev_comm"), 837 *next_comm = evsel__strval(evsel, sample, "next_comm"); 838 const u32 prev_pid = evsel__intval(evsel, sample, "prev_pid"), 839 next_pid = evsel__intval(evsel, sample, "next_pid"); 840 const u64 prev_state = evsel__intval(evsel, sample, "prev_state"); 841 struct task_desc *prev, __maybe_unused *next; 842 u64 timestamp0, timestamp = sample->time; 843 int cpu = sample->cpu; 844 s64 delta; 845 846 if (verbose > 0) 847 printf("sched_switch event %p\n", evsel); 848 849 if (cpu >= MAX_CPUS || cpu < 0) 850 return 0; 851 852 timestamp0 = sched->cpu_last_switched[cpu]; 853 if (timestamp0) 854 delta = timestamp - timestamp0; 855 else 856 delta = 0; 857 858 if (delta < 0) { 859 pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta); 860 return -1; 861 } 862 863 pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n", 864 prev_comm, prev_pid, next_comm, next_pid, delta); 865 866 prev = register_pid(sched, prev_pid, prev_comm); 867 next = register_pid(sched, next_pid, next_comm); 868 869 sched->cpu_last_switched[cpu] = timestamp; 870 871 add_sched_event_run(sched, prev, timestamp, delta); 872 add_sched_event_sleep(sched, prev, timestamp, prev_state); 873 874 return 0; 875 } 876 877 static int replay_fork_event(struct perf_sched *sched, 878 union perf_event *event, 879 struct machine *machine) 880 { 881 struct thread *child, *parent; 882 883 child = machine__findnew_thread(machine, event->fork.pid, 884 event->fork.tid); 885 parent = machine__findnew_thread(machine, event->fork.ppid, 886 event->fork.ptid); 887 888 if (child == NULL || parent == NULL) { 889 pr_debug("thread does not exist on fork event: child %p, parent %p\n", 890 child, parent); 891 goto out_put; 892 } 893 894 if (verbose > 0) { 895 printf("fork event\n"); 896 printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid); 897 printf("... 
child: %s/%d\n", thread__comm_str(child), child->tid); 898 } 899 900 register_pid(sched, parent->tid, thread__comm_str(parent)); 901 register_pid(sched, child->tid, thread__comm_str(child)); 902 out_put: 903 thread__put(child); 904 thread__put(parent); 905 return 0; 906 } 907 908 struct sort_dimension { 909 const char *name; 910 sort_fn_t cmp; 911 struct list_head list; 912 }; 913 914 /* 915 * handle runtime stats saved per thread 916 */ 917 static struct thread_runtime *thread__init_runtime(struct thread *thread) 918 { 919 struct thread_runtime *r; 920 921 r = zalloc(sizeof(struct thread_runtime)); 922 if (!r) 923 return NULL; 924 925 init_stats(&r->run_stats); 926 thread__set_priv(thread, r); 927 928 return r; 929 } 930 931 static struct thread_runtime *thread__get_runtime(struct thread *thread) 932 { 933 struct thread_runtime *tr; 934 935 tr = thread__priv(thread); 936 if (tr == NULL) { 937 tr = thread__init_runtime(thread); 938 if (tr == NULL) 939 pr_debug("Failed to malloc memory for runtime data.\n"); 940 } 941 942 return tr; 943 } 944 945 static int 946 thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r) 947 { 948 struct sort_dimension *sort; 949 int ret = 0; 950 951 BUG_ON(list_empty(list)); 952 953 list_for_each_entry(sort, list, list) { 954 ret = sort->cmp(l, r); 955 if (ret) 956 return ret; 957 } 958 959 return ret; 960 } 961 962 static struct work_atoms * 963 thread_atoms_search(struct rb_root_cached *root, struct thread *thread, 964 struct list_head *sort_list) 965 { 966 struct rb_node *node = root->rb_root.rb_node; 967 struct work_atoms key = { .thread = thread }; 968 969 while (node) { 970 struct work_atoms *atoms; 971 int cmp; 972 973 atoms = container_of(node, struct work_atoms, node); 974 975 cmp = thread_lat_cmp(sort_list, &key, atoms); 976 if (cmp > 0) 977 node = node->rb_left; 978 else if (cmp < 0) 979 node = node->rb_right; 980 else { 981 BUG_ON(thread != atoms->thread); 982 return atoms; 983 } 984 } 985 return NULL; 986 } 987 988 static void 989 __thread_latency_insert(struct rb_root_cached *root, struct work_atoms *data, 990 struct list_head *sort_list) 991 { 992 struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL; 993 bool leftmost = true; 994 995 while (*new) { 996 struct work_atoms *this; 997 int cmp; 998 999 this = container_of(*new, struct work_atoms, node); 1000 parent = *new; 1001 1002 cmp = thread_lat_cmp(sort_list, data, this); 1003 1004 if (cmp > 0) 1005 new = &((*new)->rb_left); 1006 else { 1007 new = &((*new)->rb_right); 1008 leftmost = false; 1009 } 1010 } 1011 1012 rb_link_node(&data->node, parent, new); 1013 rb_insert_color_cached(&data->node, root, leftmost); 1014 } 1015 1016 static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread) 1017 { 1018 struct work_atoms *atoms = zalloc(sizeof(*atoms)); 1019 if (!atoms) { 1020 pr_err("No memory at %s\n", __func__); 1021 return -1; 1022 } 1023 1024 atoms->thread = thread__get(thread); 1025 INIT_LIST_HEAD(&atoms->work_list); 1026 __thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid); 1027 return 0; 1028 } 1029 1030 static char sched_out_state(u64 prev_state) 1031 { 1032 const char *str = TASK_STATE_TO_CHAR_STR; 1033 1034 return str[prev_state]; 1035 } 1036 1037 static int 1038 add_sched_out_event(struct work_atoms *atoms, 1039 char run_state, 1040 u64 timestamp) 1041 { 1042 struct work_atom *atom = zalloc(sizeof(*atom)); 1043 if (!atom) { 1044 pr_err("Non memory at %s", __func__); 1045 return -1; 1046 } 1047 1048 atom->sched_out_time 
= timestamp; 1049 1050 if (run_state == 'R') { 1051 atom->state = THREAD_WAIT_CPU; 1052 atom->wake_up_time = atom->sched_out_time; 1053 } 1054 1055 list_add_tail(&atom->list, &atoms->work_list); 1056 return 0; 1057 } 1058 1059 static void 1060 add_runtime_event(struct work_atoms *atoms, u64 delta, 1061 u64 timestamp __maybe_unused) 1062 { 1063 struct work_atom *atom; 1064 1065 BUG_ON(list_empty(&atoms->work_list)); 1066 1067 atom = list_entry(atoms->work_list.prev, struct work_atom, list); 1068 1069 atom->runtime += delta; 1070 atoms->total_runtime += delta; 1071 } 1072 1073 static void 1074 add_sched_in_event(struct work_atoms *atoms, u64 timestamp) 1075 { 1076 struct work_atom *atom; 1077 u64 delta; 1078 1079 if (list_empty(&atoms->work_list)) 1080 return; 1081 1082 atom = list_entry(atoms->work_list.prev, struct work_atom, list); 1083 1084 if (atom->state != THREAD_WAIT_CPU) 1085 return; 1086 1087 if (timestamp < atom->wake_up_time) { 1088 atom->state = THREAD_IGNORE; 1089 return; 1090 } 1091 1092 atom->state = THREAD_SCHED_IN; 1093 atom->sched_in_time = timestamp; 1094 1095 delta = atom->sched_in_time - atom->wake_up_time; 1096 atoms->total_lat += delta; 1097 if (delta > atoms->max_lat) { 1098 atoms->max_lat = delta; 1099 atoms->max_lat_at = timestamp; 1100 } 1101 atoms->nb_atoms++; 1102 } 1103 1104 static int latency_switch_event(struct perf_sched *sched, 1105 struct evsel *evsel, 1106 struct perf_sample *sample, 1107 struct machine *machine) 1108 { 1109 const u32 prev_pid = evsel__intval(evsel, sample, "prev_pid"), 1110 next_pid = evsel__intval(evsel, sample, "next_pid"); 1111 const u64 prev_state = evsel__intval(evsel, sample, "prev_state"); 1112 struct work_atoms *out_events, *in_events; 1113 struct thread *sched_out, *sched_in; 1114 u64 timestamp0, timestamp = sample->time; 1115 int cpu = sample->cpu, err = -1; 1116 s64 delta; 1117 1118 BUG_ON(cpu >= MAX_CPUS || cpu < 0); 1119 1120 timestamp0 = sched->cpu_last_switched[cpu]; 1121 sched->cpu_last_switched[cpu] = timestamp; 1122 if (timestamp0) 1123 delta = timestamp - timestamp0; 1124 else 1125 delta = 0; 1126 1127 if (delta < 0) { 1128 pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta); 1129 return -1; 1130 } 1131 1132 sched_out = machine__findnew_thread(machine, -1, prev_pid); 1133 sched_in = machine__findnew_thread(machine, -1, next_pid); 1134 if (sched_out == NULL || sched_in == NULL) 1135 goto out_put; 1136 1137 out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid); 1138 if (!out_events) { 1139 if (thread_atoms_insert(sched, sched_out)) 1140 goto out_put; 1141 out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid); 1142 if (!out_events) { 1143 pr_err("out-event: Internal tree error"); 1144 goto out_put; 1145 } 1146 } 1147 if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp)) 1148 return -1; 1149 1150 in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid); 1151 if (!in_events) { 1152 if (thread_atoms_insert(sched, sched_in)) 1153 goto out_put; 1154 in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid); 1155 if (!in_events) { 1156 pr_err("in-event: Internal tree error"); 1157 goto out_put; 1158 } 1159 /* 1160 * Take came in we have not heard about yet, 1161 * add in an initial atom in runnable state: 1162 */ 1163 if (add_sched_out_event(in_events, 'R', timestamp)) 1164 goto out_put; 1165 } 1166 add_sched_in_event(in_events, timestamp); 1167 err = 0; 1168 out_put: 1169 thread__put(sched_out); 1170 
thread__put(sched_in); 1171 return err; 1172 } 1173 1174 static int latency_runtime_event(struct perf_sched *sched, 1175 struct evsel *evsel, 1176 struct perf_sample *sample, 1177 struct machine *machine) 1178 { 1179 const u32 pid = evsel__intval(evsel, sample, "pid"); 1180 const u64 runtime = evsel__intval(evsel, sample, "runtime"); 1181 struct thread *thread = machine__findnew_thread(machine, -1, pid); 1182 struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid); 1183 u64 timestamp = sample->time; 1184 int cpu = sample->cpu, err = -1; 1185 1186 if (thread == NULL) 1187 return -1; 1188 1189 BUG_ON(cpu >= MAX_CPUS || cpu < 0); 1190 if (!atoms) { 1191 if (thread_atoms_insert(sched, thread)) 1192 goto out_put; 1193 atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid); 1194 if (!atoms) { 1195 pr_err("in-event: Internal tree error"); 1196 goto out_put; 1197 } 1198 if (add_sched_out_event(atoms, 'R', timestamp)) 1199 goto out_put; 1200 } 1201 1202 add_runtime_event(atoms, runtime, timestamp); 1203 err = 0; 1204 out_put: 1205 thread__put(thread); 1206 return err; 1207 } 1208 1209 static int latency_wakeup_event(struct perf_sched *sched, 1210 struct evsel *evsel, 1211 struct perf_sample *sample, 1212 struct machine *machine) 1213 { 1214 const u32 pid = evsel__intval(evsel, sample, "pid"); 1215 struct work_atoms *atoms; 1216 struct work_atom *atom; 1217 struct thread *wakee; 1218 u64 timestamp = sample->time; 1219 int err = -1; 1220 1221 wakee = machine__findnew_thread(machine, -1, pid); 1222 if (wakee == NULL) 1223 return -1; 1224 atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid); 1225 if (!atoms) { 1226 if (thread_atoms_insert(sched, wakee)) 1227 goto out_put; 1228 atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid); 1229 if (!atoms) { 1230 pr_err("wakeup-event: Internal tree error"); 1231 goto out_put; 1232 } 1233 if (add_sched_out_event(atoms, 'S', timestamp)) 1234 goto out_put; 1235 } 1236 1237 BUG_ON(list_empty(&atoms->work_list)); 1238 1239 atom = list_entry(atoms->work_list.prev, struct work_atom, list); 1240 1241 /* 1242 * As we do not guarantee the wakeup event happens when 1243 * task is out of run queue, also may happen when task is 1244 * on run queue and wakeup only change ->state to TASK_RUNNING, 1245 * then we should not set the ->wake_up_time when wake up a 1246 * task which is on run queue. 1247 * 1248 * You WILL be missing events if you've recorded only 1249 * one CPU, or are only looking at only one, so don't 1250 * skip in this case. 1251 */ 1252 if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING) 1253 goto out_ok; 1254 1255 sched->nr_timestamps++; 1256 if (atom->sched_out_time > timestamp) { 1257 sched->nr_unordered_timestamps++; 1258 goto out_ok; 1259 } 1260 1261 atom->state = THREAD_WAIT_CPU; 1262 atom->wake_up_time = timestamp; 1263 out_ok: 1264 err = 0; 1265 out_put: 1266 thread__put(wakee); 1267 return err; 1268 } 1269 1270 static int latency_migrate_task_event(struct perf_sched *sched, 1271 struct evsel *evsel, 1272 struct perf_sample *sample, 1273 struct machine *machine) 1274 { 1275 const u32 pid = evsel__intval(evsel, sample, "pid"); 1276 u64 timestamp = sample->time; 1277 struct work_atoms *atoms; 1278 struct work_atom *atom; 1279 struct thread *migrant; 1280 int err = -1; 1281 1282 /* 1283 * Only need to worry about migration when profiling one CPU. 
1284 */ 1285 if (sched->profile_cpu == -1) 1286 return 0; 1287 1288 migrant = machine__findnew_thread(machine, -1, pid); 1289 if (migrant == NULL) 1290 return -1; 1291 atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid); 1292 if (!atoms) { 1293 if (thread_atoms_insert(sched, migrant)) 1294 goto out_put; 1295 register_pid(sched, migrant->tid, thread__comm_str(migrant)); 1296 atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid); 1297 if (!atoms) { 1298 pr_err("migration-event: Internal tree error"); 1299 goto out_put; 1300 } 1301 if (add_sched_out_event(atoms, 'R', timestamp)) 1302 goto out_put; 1303 } 1304 1305 BUG_ON(list_empty(&atoms->work_list)); 1306 1307 atom = list_entry(atoms->work_list.prev, struct work_atom, list); 1308 atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp; 1309 1310 sched->nr_timestamps++; 1311 1312 if (atom->sched_out_time > timestamp) 1313 sched->nr_unordered_timestamps++; 1314 err = 0; 1315 out_put: 1316 thread__put(migrant); 1317 return err; 1318 } 1319 1320 static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list) 1321 { 1322 int i; 1323 int ret; 1324 u64 avg; 1325 char max_lat_at[32]; 1326 1327 if (!work_list->nb_atoms) 1328 return; 1329 /* 1330 * Ignore idle threads: 1331 */ 1332 if (!strcmp(thread__comm_str(work_list->thread), "swapper")) 1333 return; 1334 1335 sched->all_runtime += work_list->total_runtime; 1336 sched->all_count += work_list->nb_atoms; 1337 1338 if (work_list->num_merged > 1) 1339 ret = printf(" %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged); 1340 else 1341 ret = printf(" %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid); 1342 1343 for (i = 0; i < 24 - ret; i++) 1344 printf(" "); 1345 1346 avg = work_list->total_lat / work_list->nb_atoms; 1347 timestamp__scnprintf_usec(work_list->max_lat_at, max_lat_at, sizeof(max_lat_at)); 1348 1349 printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %13s s\n", 1350 (double)work_list->total_runtime / NSEC_PER_MSEC, 1351 work_list->nb_atoms, (double)avg / NSEC_PER_MSEC, 1352 (double)work_list->max_lat / NSEC_PER_MSEC, 1353 max_lat_at); 1354 } 1355 1356 static int pid_cmp(struct work_atoms *l, struct work_atoms *r) 1357 { 1358 if (l->thread == r->thread) 1359 return 0; 1360 if (l->thread->tid < r->thread->tid) 1361 return -1; 1362 if (l->thread->tid > r->thread->tid) 1363 return 1; 1364 return (int)(l->thread - r->thread); 1365 } 1366 1367 static int avg_cmp(struct work_atoms *l, struct work_atoms *r) 1368 { 1369 u64 avgl, avgr; 1370 1371 if (!l->nb_atoms) 1372 return -1; 1373 1374 if (!r->nb_atoms) 1375 return 1; 1376 1377 avgl = l->total_lat / l->nb_atoms; 1378 avgr = r->total_lat / r->nb_atoms; 1379 1380 if (avgl < avgr) 1381 return -1; 1382 if (avgl > avgr) 1383 return 1; 1384 1385 return 0; 1386 } 1387 1388 static int max_cmp(struct work_atoms *l, struct work_atoms *r) 1389 { 1390 if (l->max_lat < r->max_lat) 1391 return -1; 1392 if (l->max_lat > r->max_lat) 1393 return 1; 1394 1395 return 0; 1396 } 1397 1398 static int switch_cmp(struct work_atoms *l, struct work_atoms *r) 1399 { 1400 if (l->nb_atoms < r->nb_atoms) 1401 return -1; 1402 if (l->nb_atoms > r->nb_atoms) 1403 return 1; 1404 1405 return 0; 1406 } 1407 1408 static int runtime_cmp(struct work_atoms *l, struct work_atoms *r) 1409 { 1410 if (l->total_runtime < r->total_runtime) 1411 return -1; 1412 if (l->total_runtime > r->total_runtime) 1413 return 1; 1414 1415 return 0; 1416 } 1417 1418 
static int sort_dimension__add(const char *tok, struct list_head *list) 1419 { 1420 size_t i; 1421 static struct sort_dimension avg_sort_dimension = { 1422 .name = "avg", 1423 .cmp = avg_cmp, 1424 }; 1425 static struct sort_dimension max_sort_dimension = { 1426 .name = "max", 1427 .cmp = max_cmp, 1428 }; 1429 static struct sort_dimension pid_sort_dimension = { 1430 .name = "pid", 1431 .cmp = pid_cmp, 1432 }; 1433 static struct sort_dimension runtime_sort_dimension = { 1434 .name = "runtime", 1435 .cmp = runtime_cmp, 1436 }; 1437 static struct sort_dimension switch_sort_dimension = { 1438 .name = "switch", 1439 .cmp = switch_cmp, 1440 }; 1441 struct sort_dimension *available_sorts[] = { 1442 &pid_sort_dimension, 1443 &avg_sort_dimension, 1444 &max_sort_dimension, 1445 &switch_sort_dimension, 1446 &runtime_sort_dimension, 1447 }; 1448 1449 for (i = 0; i < ARRAY_SIZE(available_sorts); i++) { 1450 if (!strcmp(available_sorts[i]->name, tok)) { 1451 list_add_tail(&available_sorts[i]->list, list); 1452 1453 return 0; 1454 } 1455 } 1456 1457 return -1; 1458 } 1459 1460 static void perf_sched__sort_lat(struct perf_sched *sched) 1461 { 1462 struct rb_node *node; 1463 struct rb_root_cached *root = &sched->atom_root; 1464 again: 1465 for (;;) { 1466 struct work_atoms *data; 1467 node = rb_first_cached(root); 1468 if (!node) 1469 break; 1470 1471 rb_erase_cached(node, root); 1472 data = rb_entry(node, struct work_atoms, node); 1473 __thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list); 1474 } 1475 if (root == &sched->atom_root) { 1476 root = &sched->merged_atom_root; 1477 goto again; 1478 } 1479 } 1480 1481 static int process_sched_wakeup_event(struct perf_tool *tool, 1482 struct evsel *evsel, 1483 struct perf_sample *sample, 1484 struct machine *machine) 1485 { 1486 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 1487 1488 if (sched->tp_handler->wakeup_event) 1489 return sched->tp_handler->wakeup_event(sched, evsel, sample, machine); 1490 1491 return 0; 1492 } 1493 1494 union map_priv { 1495 void *ptr; 1496 bool color; 1497 }; 1498 1499 static bool thread__has_color(struct thread *thread) 1500 { 1501 union map_priv priv = { 1502 .ptr = thread__priv(thread), 1503 }; 1504 1505 return priv.color; 1506 } 1507 1508 static struct thread* 1509 map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid, pid_t tid) 1510 { 1511 struct thread *thread = machine__findnew_thread(machine, pid, tid); 1512 union map_priv priv = { 1513 .color = false, 1514 }; 1515 1516 if (!sched->map.color_pids || !thread || thread__priv(thread)) 1517 return thread; 1518 1519 if (thread_map__has(sched->map.color_pids, tid)) 1520 priv.color = true; 1521 1522 thread__set_priv(thread, priv.ptr); 1523 return thread; 1524 } 1525 1526 static int map_switch_event(struct perf_sched *sched, struct evsel *evsel, 1527 struct perf_sample *sample, struct machine *machine) 1528 { 1529 const u32 next_pid = evsel__intval(evsel, sample, "next_pid"); 1530 struct thread *sched_in; 1531 struct thread_runtime *tr; 1532 int new_shortname; 1533 u64 timestamp0, timestamp = sample->time; 1534 s64 delta; 1535 int i, this_cpu = sample->cpu; 1536 int cpus_nr; 1537 bool new_cpu = false; 1538 const char *color = PERF_COLOR_NORMAL; 1539 char stimestamp[32]; 1540 1541 BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0); 1542 1543 if (this_cpu > sched->max_cpu) 1544 sched->max_cpu = this_cpu; 1545 1546 if (sched->map.comp) { 1547 cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS); 1548 if 
(!test_and_set_bit(this_cpu, sched->map.comp_cpus_mask)) { 1549 sched->map.comp_cpus[cpus_nr++] = this_cpu; 1550 new_cpu = true; 1551 } 1552 } else 1553 cpus_nr = sched->max_cpu; 1554 1555 timestamp0 = sched->cpu_last_switched[this_cpu]; 1556 sched->cpu_last_switched[this_cpu] = timestamp; 1557 if (timestamp0) 1558 delta = timestamp - timestamp0; 1559 else 1560 delta = 0; 1561 1562 if (delta < 0) { 1563 pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta); 1564 return -1; 1565 } 1566 1567 sched_in = map__findnew_thread(sched, machine, -1, next_pid); 1568 if (sched_in == NULL) 1569 return -1; 1570 1571 tr = thread__get_runtime(sched_in); 1572 if (tr == NULL) { 1573 thread__put(sched_in); 1574 return -1; 1575 } 1576 1577 sched->curr_thread[this_cpu] = thread__get(sched_in); 1578 1579 printf(" "); 1580 1581 new_shortname = 0; 1582 if (!tr->shortname[0]) { 1583 if (!strcmp(thread__comm_str(sched_in), "swapper")) { 1584 /* 1585 * Don't allocate a letter-number for swapper:0 1586 * as a shortname. Instead, we use '.' for it. 1587 */ 1588 tr->shortname[0] = '.'; 1589 tr->shortname[1] = ' '; 1590 } else { 1591 tr->shortname[0] = sched->next_shortname1; 1592 tr->shortname[1] = sched->next_shortname2; 1593 1594 if (sched->next_shortname1 < 'Z') { 1595 sched->next_shortname1++; 1596 } else { 1597 sched->next_shortname1 = 'A'; 1598 if (sched->next_shortname2 < '9') 1599 sched->next_shortname2++; 1600 else 1601 sched->next_shortname2 = '0'; 1602 } 1603 } 1604 new_shortname = 1; 1605 } 1606 1607 for (i = 0; i < cpus_nr; i++) { 1608 int cpu = sched->map.comp ? sched->map.comp_cpus[i] : i; 1609 struct thread *curr_thread = sched->curr_thread[cpu]; 1610 struct thread_runtime *curr_tr; 1611 const char *pid_color = color; 1612 const char *cpu_color = color; 1613 1614 if (curr_thread && thread__has_color(curr_thread)) 1615 pid_color = COLOR_PIDS; 1616 1617 if (sched->map.cpus && !cpu_map__has(sched->map.cpus, cpu)) 1618 continue; 1619 1620 if (sched->map.color_cpus && cpu_map__has(sched->map.color_cpus, cpu)) 1621 cpu_color = COLOR_CPUS; 1622 1623 if (cpu != this_cpu) 1624 color_fprintf(stdout, color, " "); 1625 else 1626 color_fprintf(stdout, cpu_color, "*"); 1627 1628 if (sched->curr_thread[cpu]) { 1629 curr_tr = thread__get_runtime(sched->curr_thread[cpu]); 1630 if (curr_tr == NULL) { 1631 thread__put(sched_in); 1632 return -1; 1633 } 1634 color_fprintf(stdout, pid_color, "%2s ", curr_tr->shortname); 1635 } else 1636 color_fprintf(stdout, color, " "); 1637 } 1638 1639 if (sched->map.cpus && !cpu_map__has(sched->map.cpus, this_cpu)) 1640 goto out; 1641 1642 timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp)); 1643 color_fprintf(stdout, color, " %12s secs ", stimestamp); 1644 if (new_shortname || tr->comm_changed || (verbose > 0 && sched_in->tid)) { 1645 const char *pid_color = color; 1646 1647 if (thread__has_color(sched_in)) 1648 pid_color = COLOR_PIDS; 1649 1650 color_fprintf(stdout, pid_color, "%s => %s:%d", 1651 tr->shortname, thread__comm_str(sched_in), sched_in->tid); 1652 tr->comm_changed = false; 1653 } 1654 1655 if (sched->map.comp && new_cpu) 1656 color_fprintf(stdout, color, " (CPU %d)", this_cpu); 1657 1658 out: 1659 color_fprintf(stdout, color, "\n"); 1660 1661 thread__put(sched_in); 1662 1663 return 0; 1664 } 1665 1666 static int process_sched_switch_event(struct perf_tool *tool, 1667 struct evsel *evsel, 1668 struct perf_sample *sample, 1669 struct machine *machine) 1670 { 1671 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 1672 int this_cpu = sample->cpu, 
err = 0; 1673 u32 prev_pid = evsel__intval(evsel, sample, "prev_pid"), 1674 next_pid = evsel__intval(evsel, sample, "next_pid"); 1675 1676 if (sched->curr_pid[this_cpu] != (u32)-1) { 1677 /* 1678 * Are we trying to switch away a PID that is 1679 * not current? 1680 */ 1681 if (sched->curr_pid[this_cpu] != prev_pid) 1682 sched->nr_context_switch_bugs++; 1683 } 1684 1685 if (sched->tp_handler->switch_event) 1686 err = sched->tp_handler->switch_event(sched, evsel, sample, machine); 1687 1688 sched->curr_pid[this_cpu] = next_pid; 1689 return err; 1690 } 1691 1692 static int process_sched_runtime_event(struct perf_tool *tool, 1693 struct evsel *evsel, 1694 struct perf_sample *sample, 1695 struct machine *machine) 1696 { 1697 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 1698 1699 if (sched->tp_handler->runtime_event) 1700 return sched->tp_handler->runtime_event(sched, evsel, sample, machine); 1701 1702 return 0; 1703 } 1704 1705 static int perf_sched__process_fork_event(struct perf_tool *tool, 1706 union perf_event *event, 1707 struct perf_sample *sample, 1708 struct machine *machine) 1709 { 1710 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 1711 1712 /* run the fork event through the perf machineruy */ 1713 perf_event__process_fork(tool, event, sample, machine); 1714 1715 /* and then run additional processing needed for this command */ 1716 if (sched->tp_handler->fork_event) 1717 return sched->tp_handler->fork_event(sched, event, machine); 1718 1719 return 0; 1720 } 1721 1722 static int process_sched_migrate_task_event(struct perf_tool *tool, 1723 struct evsel *evsel, 1724 struct perf_sample *sample, 1725 struct machine *machine) 1726 { 1727 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 1728 1729 if (sched->tp_handler->migrate_task_event) 1730 return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine); 1731 1732 return 0; 1733 } 1734 1735 typedef int (*tracepoint_handler)(struct perf_tool *tool, 1736 struct evsel *evsel, 1737 struct perf_sample *sample, 1738 struct machine *machine); 1739 1740 static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused, 1741 union perf_event *event __maybe_unused, 1742 struct perf_sample *sample, 1743 struct evsel *evsel, 1744 struct machine *machine) 1745 { 1746 int err = 0; 1747 1748 if (evsel->handler != NULL) { 1749 tracepoint_handler f = evsel->handler; 1750 err = f(tool, evsel, sample, machine); 1751 } 1752 1753 return err; 1754 } 1755 1756 static int perf_sched__process_comm(struct perf_tool *tool __maybe_unused, 1757 union perf_event *event, 1758 struct perf_sample *sample, 1759 struct machine *machine) 1760 { 1761 struct thread *thread; 1762 struct thread_runtime *tr; 1763 int err; 1764 1765 err = perf_event__process_comm(tool, event, sample, machine); 1766 if (err) 1767 return err; 1768 1769 thread = machine__find_thread(machine, sample->pid, sample->tid); 1770 if (!thread) { 1771 pr_err("Internal error: can't find thread\n"); 1772 return -1; 1773 } 1774 1775 tr = thread__get_runtime(thread); 1776 if (tr == NULL) { 1777 thread__put(thread); 1778 return -1; 1779 } 1780 1781 tr->comm_changed = true; 1782 thread__put(thread); 1783 1784 return 0; 1785 } 1786 1787 static int perf_sched__read_events(struct perf_sched *sched) 1788 { 1789 const struct evsel_str_handler handlers[] = { 1790 { "sched:sched_switch", process_sched_switch_event, }, 1791 { "sched:sched_stat_runtime", process_sched_runtime_event, }, 1792 { "sched:sched_wakeup", 
process_sched_wakeup_event, }, 1793 { "sched:sched_wakeup_new", process_sched_wakeup_event, }, 1794 { "sched:sched_migrate_task", process_sched_migrate_task_event, }, 1795 }; 1796 struct perf_session *session; 1797 struct perf_data data = { 1798 .path = input_name, 1799 .mode = PERF_DATA_MODE_READ, 1800 .force = sched->force, 1801 }; 1802 int rc = -1; 1803 1804 session = perf_session__new(&data, false, &sched->tool); 1805 if (IS_ERR(session)) { 1806 pr_debug("Error creating perf session"); 1807 return PTR_ERR(session); 1808 } 1809 1810 symbol__init(&session->header.env); 1811 1812 if (perf_session__set_tracepoints_handlers(session, handlers)) 1813 goto out_delete; 1814 1815 if (perf_session__has_traces(session, "record -R")) { 1816 int err = perf_session__process_events(session); 1817 if (err) { 1818 pr_err("Failed to process events, error %d", err); 1819 goto out_delete; 1820 } 1821 1822 sched->nr_events = session->evlist->stats.nr_events[0]; 1823 sched->nr_lost_events = session->evlist->stats.total_lost; 1824 sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST]; 1825 } 1826 1827 rc = 0; 1828 out_delete: 1829 perf_session__delete(session); 1830 return rc; 1831 } 1832 1833 /* 1834 * scheduling times are printed as msec.usec 1835 */ 1836 static inline void print_sched_time(unsigned long long nsecs, int width) 1837 { 1838 unsigned long msecs; 1839 unsigned long usecs; 1840 1841 msecs = nsecs / NSEC_PER_MSEC; 1842 nsecs -= msecs * NSEC_PER_MSEC; 1843 usecs = nsecs / NSEC_PER_USEC; 1844 printf("%*lu.%03lu ", width, msecs, usecs); 1845 } 1846 1847 /* 1848 * returns runtime data for event, allocating memory for it the 1849 * first time it is used. 1850 */ 1851 static struct evsel_runtime *evsel__get_runtime(struct evsel *evsel) 1852 { 1853 struct evsel_runtime *r = evsel->priv; 1854 1855 if (r == NULL) { 1856 r = zalloc(sizeof(struct evsel_runtime)); 1857 evsel->priv = r; 1858 } 1859 1860 return r; 1861 } 1862 1863 /* 1864 * save last time event was seen per cpu 1865 */ 1866 static void evsel__save_time(struct evsel *evsel, u64 timestamp, u32 cpu) 1867 { 1868 struct evsel_runtime *r = evsel__get_runtime(evsel); 1869 1870 if (r == NULL) 1871 return; 1872 1873 if ((cpu >= r->ncpu) || (r->last_time == NULL)) { 1874 int i, n = __roundup_pow_of_two(cpu+1); 1875 void *p = r->last_time; 1876 1877 p = realloc(r->last_time, n * sizeof(u64)); 1878 if (!p) 1879 return; 1880 1881 r->last_time = p; 1882 for (i = r->ncpu; i < n; ++i) 1883 r->last_time[i] = (u64) 0; 1884 1885 r->ncpu = n; 1886 } 1887 1888 r->last_time[cpu] = timestamp; 1889 } 1890 1891 /* returns last time this event was seen on the given cpu */ 1892 static u64 evsel__get_time(struct evsel *evsel, u32 cpu) 1893 { 1894 struct evsel_runtime *r = evsel__get_runtime(evsel); 1895 1896 if ((r == NULL) || (r->last_time == NULL) || (cpu >= r->ncpu)) 1897 return 0; 1898 1899 return r->last_time[cpu]; 1900 } 1901 1902 static int comm_width = 30; 1903 1904 static char *timehist_get_commstr(struct thread *thread) 1905 { 1906 static char str[32]; 1907 const char *comm = thread__comm_str(thread); 1908 pid_t tid = thread->tid; 1909 pid_t pid = thread->pid_; 1910 int n; 1911 1912 if (pid == 0) 1913 n = scnprintf(str, sizeof(str), "%s", comm); 1914 1915 else if (tid != pid) 1916 n = scnprintf(str, sizeof(str), "%s[%d/%d]", comm, tid, pid); 1917 1918 else 1919 n = scnprintf(str, sizeof(str), "%s[%d]", comm, tid); 1920 1921 if (n > comm_width) 1922 comm_width = n; 1923 1924 return str; 1925 } 1926 1927 static void timehist_header(struct 
perf_sched *sched) 1928 { 1929 u32 ncpus = sched->max_cpu + 1; 1930 u32 i, j; 1931 1932 printf("%15s %6s ", "time", "cpu"); 1933 1934 if (sched->show_cpu_visual) { 1935 printf(" "); 1936 for (i = 0, j = 0; i < ncpus; ++i) { 1937 printf("%x", j++); 1938 if (j > 15) 1939 j = 0; 1940 } 1941 printf(" "); 1942 } 1943 1944 printf(" %-*s %9s %9s %9s", comm_width, 1945 "task name", "wait time", "sch delay", "run time"); 1946 1947 if (sched->show_state) 1948 printf(" %s", "state"); 1949 1950 printf("\n"); 1951 1952 /* 1953 * units row 1954 */ 1955 printf("%15s %-6s ", "", ""); 1956 1957 if (sched->show_cpu_visual) 1958 printf(" %*s ", ncpus, ""); 1959 1960 printf(" %-*s %9s %9s %9s", comm_width, 1961 "[tid/pid]", "(msec)", "(msec)", "(msec)"); 1962 1963 if (sched->show_state) 1964 printf(" %5s", ""); 1965 1966 printf("\n"); 1967 1968 /* 1969 * separator 1970 */ 1971 printf("%.15s %.6s ", graph_dotted_line, graph_dotted_line); 1972 1973 if (sched->show_cpu_visual) 1974 printf(" %.*s ", ncpus, graph_dotted_line); 1975 1976 printf(" %.*s %.9s %.9s %.9s", comm_width, 1977 graph_dotted_line, graph_dotted_line, graph_dotted_line, 1978 graph_dotted_line); 1979 1980 if (sched->show_state) 1981 printf(" %.5s", graph_dotted_line); 1982 1983 printf("\n"); 1984 } 1985 1986 static char task_state_char(struct thread *thread, int state) 1987 { 1988 static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; 1989 unsigned bit = state ? ffs(state) : 0; 1990 1991 /* 'I' for idle */ 1992 if (thread->tid == 0) 1993 return 'I'; 1994 1995 return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; 1996 } 1997 1998 static void timehist_print_sample(struct perf_sched *sched, 1999 struct evsel *evsel, 2000 struct perf_sample *sample, 2001 struct addr_location *al, 2002 struct thread *thread, 2003 u64 t, int state) 2004 { 2005 struct thread_runtime *tr = thread__priv(thread); 2006 const char *next_comm = evsel__strval(evsel, sample, "next_comm"); 2007 const u32 next_pid = evsel__intval(evsel, sample, "next_pid"); 2008 u32 max_cpus = sched->max_cpu + 1; 2009 char tstr[64]; 2010 char nstr[30]; 2011 u64 wait_time; 2012 2013 if (cpu_list && !test_bit(sample->cpu, cpu_bitmap)) 2014 return; 2015 2016 timestamp__scnprintf_usec(t, tstr, sizeof(tstr)); 2017 printf("%15s [%04d] ", tstr, sample->cpu); 2018 2019 if (sched->show_cpu_visual) { 2020 u32 i; 2021 char c; 2022 2023 printf(" "); 2024 for (i = 0; i < max_cpus; ++i) { 2025 /* flag idle times with 'i'; others are sched events */ 2026 if (i == sample->cpu) 2027 c = (thread->tid == 0) ? 
'i' : 's'; 2028 else 2029 c = ' '; 2030 printf("%c", c); 2031 } 2032 printf(" "); 2033 } 2034 2035 printf(" %-*s ", comm_width, timehist_get_commstr(thread)); 2036 2037 wait_time = tr->dt_sleep + tr->dt_iowait + tr->dt_preempt; 2038 print_sched_time(wait_time, 6); 2039 2040 print_sched_time(tr->dt_delay, 6); 2041 print_sched_time(tr->dt_run, 6); 2042 2043 if (sched->show_state) 2044 printf(" %5c ", task_state_char(thread, state)); 2045 2046 if (sched->show_next) { 2047 snprintf(nstr, sizeof(nstr), "next: %s[%d]", next_comm, next_pid); 2048 printf(" %-*s", comm_width, nstr); 2049 } 2050 2051 if (sched->show_wakeups && !sched->show_next) 2052 printf(" %-*s", comm_width, ""); 2053 2054 if (thread->tid == 0) 2055 goto out; 2056 2057 if (sched->show_callchain) 2058 printf(" "); 2059 2060 sample__fprintf_sym(sample, al, 0, 2061 EVSEL__PRINT_SYM | EVSEL__PRINT_ONELINE | 2062 EVSEL__PRINT_CALLCHAIN_ARROW | 2063 EVSEL__PRINT_SKIP_IGNORED, 2064 &callchain_cursor, symbol_conf.bt_stop_list, stdout); 2065 2066 out: 2067 printf("\n"); 2068 } 2069 2070 /* 2071 * Explanation of delta-time stats: 2072 * 2073 * t = time of current schedule out event 2074 * tprev = time of previous sched out event 2075 * also time of schedule-in event for current task 2076 * last_time = time of last sched change event for current task 2077 * (i.e, time process was last scheduled out) 2078 * ready_to_run = time of wakeup for current task 2079 * 2080 * -----|------------|------------|------------|------ 2081 * last ready tprev t 2082 * time to run 2083 * 2084 * |-------- dt_wait --------| 2085 * |- dt_delay -|-- dt_run --| 2086 * 2087 * dt_run = run time of current task 2088 * dt_wait = time between last schedule out event for task and tprev 2089 * represents time spent off the cpu 2090 * dt_delay = time between wakeup and schedule-in of task 2091 */ 2092 2093 static void timehist_update_runtime_stats(struct thread_runtime *r, 2094 u64 t, u64 tprev) 2095 { 2096 r->dt_delay = 0; 2097 r->dt_sleep = 0; 2098 r->dt_iowait = 0; 2099 r->dt_preempt = 0; 2100 r->dt_run = 0; 2101 2102 if (tprev) { 2103 r->dt_run = t - tprev; 2104 if (r->ready_to_run) { 2105 if (r->ready_to_run > tprev) 2106 pr_debug("time travel: wakeup time for task > previous sched_switch event\n"); 2107 else 2108 r->dt_delay = tprev - r->ready_to_run; 2109 } 2110 2111 if (r->last_time > tprev) 2112 pr_debug("time travel: last sched out time for task > previous sched_switch event\n"); 2113 else if (r->last_time) { 2114 u64 dt_wait = tprev - r->last_time; 2115 2116 if (r->last_state == TASK_RUNNING) 2117 r->dt_preempt = dt_wait; 2118 else if (r->last_state == TASK_UNINTERRUPTIBLE) 2119 r->dt_iowait = dt_wait; 2120 else 2121 r->dt_sleep = dt_wait; 2122 } 2123 } 2124 2125 update_stats(&r->run_stats, r->dt_run); 2126 2127 r->total_run_time += r->dt_run; 2128 r->total_delay_time += r->dt_delay; 2129 r->total_sleep_time += r->dt_sleep; 2130 r->total_iowait_time += r->dt_iowait; 2131 r->total_preempt_time += r->dt_preempt; 2132 } 2133 2134 static bool is_idle_sample(struct perf_sample *sample, 2135 struct evsel *evsel) 2136 { 2137 /* pid 0 == swapper == idle task */ 2138 if (strcmp(evsel__name(evsel), "sched:sched_switch") == 0) 2139 return evsel__intval(evsel, sample, "prev_pid") == 0; 2140 2141 return sample->pid == 0; 2142 } 2143 2144 static void save_task_callchain(struct perf_sched *sched, 2145 struct perf_sample *sample, 2146 struct evsel *evsel, 2147 struct machine *machine) 2148 { 2149 struct callchain_cursor *cursor = &callchain_cursor; 2150 struct thread *thread; 
2151 2152 /* want main thread for process - has maps */ 2153 thread = machine__findnew_thread(machine, sample->pid, sample->pid); 2154 if (thread == NULL) { 2155 pr_debug("Failed to get thread for pid %d.\n", sample->pid); 2156 return; 2157 } 2158 2159 if (!sched->show_callchain || sample->callchain == NULL) 2160 return; 2161 2162 if (thread__resolve_callchain(thread, cursor, evsel, sample, 2163 NULL, NULL, sched->max_stack + 2) != 0) { 2164 if (verbose > 0) 2165 pr_err("Failed to resolve callchain. Skipping\n"); 2166 2167 return; 2168 } 2169 2170 callchain_cursor_commit(cursor); 2171 2172 while (true) { 2173 struct callchain_cursor_node *node; 2174 struct symbol *sym; 2175 2176 node = callchain_cursor_current(cursor); 2177 if (node == NULL) 2178 break; 2179 2180 sym = node->ms.sym; 2181 if (sym) { 2182 if (!strcmp(sym->name, "schedule") || 2183 !strcmp(sym->name, "__schedule") || 2184 !strcmp(sym->name, "preempt_schedule")) 2185 sym->ignore = 1; 2186 } 2187 2188 callchain_cursor_advance(cursor); 2189 } 2190 } 2191 2192 static int init_idle_thread(struct thread *thread) 2193 { 2194 struct idle_thread_runtime *itr; 2195 2196 thread__set_comm(thread, idle_comm, 0); 2197 2198 itr = zalloc(sizeof(*itr)); 2199 if (itr == NULL) 2200 return -ENOMEM; 2201 2202 init_stats(&itr->tr.run_stats); 2203 callchain_init(&itr->callchain); 2204 callchain_cursor_reset(&itr->cursor); 2205 thread__set_priv(thread, itr); 2206 2207 return 0; 2208 } 2209 2210 /* 2211 * Track idle stats per cpu by maintaining a local thread 2212 * struct for the idle task on each cpu. 2213 */ 2214 static int init_idle_threads(int ncpu) 2215 { 2216 int i, ret; 2217 2218 idle_threads = zalloc(ncpu * sizeof(struct thread *)); 2219 if (!idle_threads) 2220 return -ENOMEM; 2221 2222 idle_max_cpu = ncpu; 2223 2224 /* allocate the actual thread struct if needed */ 2225 for (i = 0; i < ncpu; ++i) { 2226 idle_threads[i] = thread__new(0, 0); 2227 if (idle_threads[i] == NULL) 2228 return -ENOMEM; 2229 2230 ret = init_idle_thread(idle_threads[i]); 2231 if (ret < 0) 2232 return ret; 2233 } 2234 2235 return 0; 2236 } 2237 2238 static void free_idle_threads(void) 2239 { 2240 int i; 2241 2242 if (idle_threads == NULL) 2243 return; 2244 2245 for (i = 0; i < idle_max_cpu; ++i) { 2246 if ((idle_threads[i])) 2247 thread__delete(idle_threads[i]); 2248 } 2249 2250 free(idle_threads); 2251 } 2252 2253 static struct thread *get_idle_thread(int cpu) 2254 { 2255 /* 2256 * expand/allocate array of pointers to local thread 2257 * structs if needed 2258 */ 2259 if ((cpu >= idle_max_cpu) || (idle_threads == NULL)) { 2260 int i, j = __roundup_pow_of_two(cpu+1); 2261 void *p; 2262 2263 p = realloc(idle_threads, j * sizeof(struct thread *)); 2264 if (!p) 2265 return NULL; 2266 2267 idle_threads = (struct thread **) p; 2268 for (i = idle_max_cpu; i < j; ++i) 2269 idle_threads[i] = NULL; 2270 2271 idle_max_cpu = j; 2272 } 2273 2274 /* allocate a new thread struct if needed */ 2275 if (idle_threads[cpu] == NULL) { 2276 idle_threads[cpu] = thread__new(0, 0); 2277 2278 if (idle_threads[cpu]) { 2279 if (init_idle_thread(idle_threads[cpu]) < 0) 2280 return NULL; 2281 } 2282 } 2283 2284 return idle_threads[cpu]; 2285 } 2286 2287 static void save_idle_callchain(struct perf_sched *sched, 2288 struct idle_thread_runtime *itr, 2289 struct perf_sample *sample) 2290 { 2291 if (!sched->show_callchain || sample->callchain == NULL) 2292 return; 2293 2294 callchain_cursor__copy(&itr->cursor, &callchain_cursor); 2295 } 2296 2297 static struct thread *timehist_get_thread(struct 
perf_sched *sched, 2298 struct perf_sample *sample, 2299 struct machine *machine, 2300 struct evsel *evsel) 2301 { 2302 struct thread *thread; 2303 2304 if (is_idle_sample(sample, evsel)) { 2305 thread = get_idle_thread(sample->cpu); 2306 if (thread == NULL) 2307 pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu); 2308 2309 } else { 2310 /* there were samples with tid 0 but non-zero pid */ 2311 thread = machine__findnew_thread(machine, sample->pid, 2312 sample->tid ?: sample->pid); 2313 if (thread == NULL) { 2314 pr_debug("Failed to get thread for tid %d. skipping sample.\n", 2315 sample->tid); 2316 } 2317 2318 save_task_callchain(sched, sample, evsel, machine); 2319 if (sched->idle_hist) { 2320 struct thread *idle; 2321 struct idle_thread_runtime *itr; 2322 2323 idle = get_idle_thread(sample->cpu); 2324 if (idle == NULL) { 2325 pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu); 2326 return NULL; 2327 } 2328 2329 itr = thread__priv(idle); 2330 if (itr == NULL) 2331 return NULL; 2332 2333 itr->last_thread = thread; 2334 2335 /* copy task callchain when entering to idle */ 2336 if (evsel__intval(evsel, sample, "next_pid") == 0) 2337 save_idle_callchain(sched, itr, sample); 2338 } 2339 } 2340 2341 return thread; 2342 } 2343 2344 static bool timehist_skip_sample(struct perf_sched *sched, 2345 struct thread *thread, 2346 struct evsel *evsel, 2347 struct perf_sample *sample) 2348 { 2349 bool rc = false; 2350 2351 if (thread__is_filtered(thread)) { 2352 rc = true; 2353 sched->skipped_samples++; 2354 } 2355 2356 if (sched->idle_hist) { 2357 if (strcmp(evsel__name(evsel), "sched:sched_switch")) 2358 rc = true; 2359 else if (evsel__intval(evsel, sample, "prev_pid") != 0 && 2360 evsel__intval(evsel, sample, "next_pid") != 0) 2361 rc = true; 2362 } 2363 2364 return rc; 2365 } 2366 2367 static void timehist_print_wakeup_event(struct perf_sched *sched, 2368 struct evsel *evsel, 2369 struct perf_sample *sample, 2370 struct machine *machine, 2371 struct thread *awakened) 2372 { 2373 struct thread *thread; 2374 char tstr[64]; 2375 2376 thread = machine__findnew_thread(machine, sample->pid, sample->tid); 2377 if (thread == NULL) 2378 return; 2379 2380 /* show wakeup unless both awakee and awaker are filtered */ 2381 if (timehist_skip_sample(sched, thread, evsel, sample) && 2382 timehist_skip_sample(sched, awakened, evsel, sample)) { 2383 return; 2384 } 2385 2386 timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr)); 2387 printf("%15s [%04d] ", tstr, sample->cpu); 2388 if (sched->show_cpu_visual) 2389 printf(" %*s ", sched->max_cpu + 1, ""); 2390 2391 printf(" %-*s ", comm_width, timehist_get_commstr(thread)); 2392 2393 /* dt spacer */ 2394 printf(" %9s %9s %9s ", "", "", ""); 2395 2396 printf("awakened: %s", timehist_get_commstr(awakened)); 2397 2398 printf("\n"); 2399 } 2400 2401 static int timehist_sched_wakeup_event(struct perf_tool *tool, 2402 union perf_event *event __maybe_unused, 2403 struct evsel *evsel, 2404 struct perf_sample *sample, 2405 struct machine *machine) 2406 { 2407 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 2408 struct thread *thread; 2409 struct thread_runtime *tr = NULL; 2410 /* want pid of awakened task not pid in sample */ 2411 const u32 pid = evsel__intval(evsel, sample, "pid"); 2412 2413 thread = machine__findnew_thread(machine, 0, pid); 2414 if (thread == NULL) 2415 return -1; 2416 2417 tr = thread__get_runtime(thread); 2418 if (tr == NULL) 2419 return -1; 2420 2421 if (tr->ready_to_run == 0) 2422 tr->ready_to_run = 
sample->time; 2423 2424 /* show wakeups if requested */ 2425 if (sched->show_wakeups && 2426 !perf_time__skip_sample(&sched->ptime, sample->time)) 2427 timehist_print_wakeup_event(sched, evsel, sample, machine, thread); 2428 2429 return 0; 2430 } 2431 2432 static void timehist_print_migration_event(struct perf_sched *sched, 2433 struct evsel *evsel, 2434 struct perf_sample *sample, 2435 struct machine *machine, 2436 struct thread *migrated) 2437 { 2438 struct thread *thread; 2439 char tstr[64]; 2440 u32 max_cpus = sched->max_cpu + 1; 2441 u32 ocpu, dcpu; 2442 2443 if (sched->summary_only) 2444 return; 2445 2446 max_cpus = sched->max_cpu + 1; 2447 ocpu = evsel__intval(evsel, sample, "orig_cpu"); 2448 dcpu = evsel__intval(evsel, sample, "dest_cpu"); 2449 2450 thread = machine__findnew_thread(machine, sample->pid, sample->tid); 2451 if (thread == NULL) 2452 return; 2453 2454 if (timehist_skip_sample(sched, thread, evsel, sample) && 2455 timehist_skip_sample(sched, migrated, evsel, sample)) { 2456 return; 2457 } 2458 2459 timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr)); 2460 printf("%15s [%04d] ", tstr, sample->cpu); 2461 2462 if (sched->show_cpu_visual) { 2463 u32 i; 2464 char c; 2465 2466 printf(" "); 2467 for (i = 0; i < max_cpus; ++i) { 2468 c = (i == sample->cpu) ? 'm' : ' '; 2469 printf("%c", c); 2470 } 2471 printf(" "); 2472 } 2473 2474 printf(" %-*s ", comm_width, timehist_get_commstr(thread)); 2475 2476 /* dt spacer */ 2477 printf(" %9s %9s %9s ", "", "", ""); 2478 2479 printf("migrated: %s", timehist_get_commstr(migrated)); 2480 printf(" cpu %d => %d", ocpu, dcpu); 2481 2482 printf("\n"); 2483 } 2484 2485 static int timehist_migrate_task_event(struct perf_tool *tool, 2486 union perf_event *event __maybe_unused, 2487 struct evsel *evsel, 2488 struct perf_sample *sample, 2489 struct machine *machine) 2490 { 2491 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 2492 struct thread *thread; 2493 struct thread_runtime *tr = NULL; 2494 /* want pid of migrated task not pid in sample */ 2495 const u32 pid = evsel__intval(evsel, sample, "pid"); 2496 2497 thread = machine__findnew_thread(machine, 0, pid); 2498 if (thread == NULL) 2499 return -1; 2500 2501 tr = thread__get_runtime(thread); 2502 if (tr == NULL) 2503 return -1; 2504 2505 tr->migrations++; 2506 2507 /* show migrations if requested */ 2508 timehist_print_migration_event(sched, evsel, sample, machine, thread); 2509 2510 return 0; 2511 } 2512 2513 static int timehist_sched_change_event(struct perf_tool *tool, 2514 union perf_event *event, 2515 struct evsel *evsel, 2516 struct perf_sample *sample, 2517 struct machine *machine) 2518 { 2519 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 2520 struct perf_time_interval *ptime = &sched->ptime; 2521 struct addr_location al; 2522 struct thread *thread; 2523 struct thread_runtime *tr = NULL; 2524 u64 tprev, t = sample->time; 2525 int rc = 0; 2526 int state = evsel__intval(evsel, sample, "prev_state"); 2527 2528 if (machine__resolve(machine, &al, sample) < 0) { 2529 pr_err("problem processing %d event. 
skipping it\n", 2530 event->header.type); 2531 rc = -1; 2532 goto out; 2533 } 2534 2535 thread = timehist_get_thread(sched, sample, machine, evsel); 2536 if (thread == NULL) { 2537 rc = -1; 2538 goto out; 2539 } 2540 2541 if (timehist_skip_sample(sched, thread, evsel, sample)) 2542 goto out; 2543 2544 tr = thread__get_runtime(thread); 2545 if (tr == NULL) { 2546 rc = -1; 2547 goto out; 2548 } 2549 2550 tprev = evsel__get_time(evsel, sample->cpu); 2551 2552 /* 2553 * If a start time is given: 2554 * - if the sample time is before the window of interest, skip the sample 2555 * - if tprev is before the window, clamp it to the start of the window 2556 */ 2557 if (ptime->start && ptime->start > t) 2558 goto out; 2559 2560 if (tprev && ptime->start > tprev) 2561 tprev = ptime->start; 2562 2563 /* 2564 * If an end time is given: 2565 * - if the previous sched event is already past the window, we are done 2566 * - if the sample time is past the window, clamp it to the window end 2567 * so that stats for the window of interest are closed out 2568 */ 2569 if (ptime->end) { 2570 if (tprev > ptime->end) 2571 goto out; 2572 2573 if (t > ptime->end) 2574 t = ptime->end; 2575 } 2576 2577 if (!sched->idle_hist || thread->tid == 0) { 2578 timehist_update_runtime_stats(tr, t, tprev); 2579 2580 if (sched->idle_hist) { 2581 struct idle_thread_runtime *itr = (void *)tr; 2582 struct thread_runtime *last_tr; 2583 2584 BUG_ON(thread->tid != 0); 2585 2586 if (itr->last_thread == NULL) 2587 goto out; 2588 2589 /* add current idle time as last thread's runtime */ 2590 last_tr = thread__get_runtime(itr->last_thread); 2591 if (last_tr == NULL) 2592 goto out; 2593 2594 timehist_update_runtime_stats(last_tr, t, tprev); 2595 /* 2596 * clear the last thread's delta times since they are not 2597 * updated here and would otherwise show stale values next 2598 * time; only the total run time and run stats matter.
2599 */ 2600 last_tr->dt_run = 0; 2601 last_tr->dt_delay = 0; 2602 last_tr->dt_sleep = 0; 2603 last_tr->dt_iowait = 0; 2604 last_tr->dt_preempt = 0; 2605 2606 if (itr->cursor.nr) 2607 callchain_append(&itr->callchain, &itr->cursor, t - tprev); 2608 2609 itr->last_thread = NULL; 2610 } 2611 } 2612 2613 if (!sched->summary_only) 2614 timehist_print_sample(sched, evsel, sample, &al, thread, t, state); 2615 2616 out: 2617 if (sched->hist_time.start == 0 && t >= ptime->start) 2618 sched->hist_time.start = t; 2619 if (ptime->end == 0 || t <= ptime->end) 2620 sched->hist_time.end = t; 2621 2622 if (tr) { 2623 /* time of this sched_switch event becomes last time task seen */ 2624 tr->last_time = sample->time; 2625 2626 /* last state is used to determine where to account wait time */ 2627 tr->last_state = state; 2628 2629 /* sched out event for task so reset ready to run time */ 2630 tr->ready_to_run = 0; 2631 } 2632 2633 evsel__save_time(evsel, sample->time, sample->cpu); 2634 2635 return rc; 2636 } 2637 2638 static int timehist_sched_switch_event(struct perf_tool *tool, 2639 union perf_event *event, 2640 struct evsel *evsel, 2641 struct perf_sample *sample, 2642 struct machine *machine __maybe_unused) 2643 { 2644 return timehist_sched_change_event(tool, event, evsel, sample, machine); 2645 } 2646 2647 static int process_lost(struct perf_tool *tool __maybe_unused, 2648 union perf_event *event, 2649 struct perf_sample *sample, 2650 struct machine *machine __maybe_unused) 2651 { 2652 char tstr[64]; 2653 2654 timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr)); 2655 printf("%15s ", tstr); 2656 printf("lost %" PRI_lu64 " events on cpu %d\n", event->lost.lost, sample->cpu); 2657 2658 return 0; 2659 } 2660 2661 2662 static void print_thread_runtime(struct thread *t, 2663 struct thread_runtime *r) 2664 { 2665 double mean = avg_stats(&r->run_stats); 2666 float stddev; 2667 2668 printf("%*s %5d %9" PRIu64 " ", 2669 comm_width, timehist_get_commstr(t), t->ppid, 2670 (u64) r->run_stats.n); 2671 2672 print_sched_time(r->total_run_time, 8); 2673 stddev = rel_stddev_stats(stddev_stats(&r->run_stats), mean); 2674 print_sched_time(r->run_stats.min, 6); 2675 printf(" "); 2676 print_sched_time((u64) mean, 6); 2677 printf(" "); 2678 print_sched_time(r->run_stats.max, 6); 2679 printf(" "); 2680 printf("%5.2f", stddev); 2681 printf(" %5" PRIu64, r->migrations); 2682 printf("\n"); 2683 } 2684 2685 static void print_thread_waittime(struct thread *t, 2686 struct thread_runtime *r) 2687 { 2688 printf("%*s %5d %9" PRIu64 " ", 2689 comm_width, timehist_get_commstr(t), t->ppid, 2690 (u64) r->run_stats.n); 2691 2692 print_sched_time(r->total_run_time, 8); 2693 print_sched_time(r->total_sleep_time, 6); 2694 printf(" "); 2695 print_sched_time(r->total_iowait_time, 6); 2696 printf(" "); 2697 print_sched_time(r->total_preempt_time, 6); 2698 printf(" "); 2699 print_sched_time(r->total_delay_time, 6); 2700 printf("\n"); 2701 } 2702 2703 struct total_run_stats { 2704 struct perf_sched *sched; 2705 u64 sched_count; 2706 u64 task_count; 2707 u64 total_run_time; 2708 }; 2709 2710 static int __show_thread_runtime(struct thread *t, void *priv) 2711 { 2712 struct total_run_stats *stats = priv; 2713 struct thread_runtime *r; 2714 2715 if (thread__is_filtered(t)) 2716 return 0; 2717 2718 r = thread__priv(t); 2719 if (r && r->run_stats.n) { 2720 stats->task_count++; 2721 stats->sched_count += r->run_stats.n; 2722 stats->total_run_time += r->total_run_time; 2723 2724 if (stats->sched->show_state) 2725 print_thread_waittime(t, r); 2726 
else 2727 print_thread_runtime(t, r); 2728 } 2729 2730 return 0; 2731 } 2732 2733 static int show_thread_runtime(struct thread *t, void *priv) 2734 { 2735 if (t->dead) 2736 return 0; 2737 2738 return __show_thread_runtime(t, priv); 2739 } 2740 2741 static int show_deadthread_runtime(struct thread *t, void *priv) 2742 { 2743 if (!t->dead) 2744 return 0; 2745 2746 return __show_thread_runtime(t, priv); 2747 } 2748 2749 static size_t callchain__fprintf_folded(FILE *fp, struct callchain_node *node) 2750 { 2751 const char *sep = " <- "; 2752 struct callchain_list *chain; 2753 size_t ret = 0; 2754 char bf[1024]; 2755 bool first; 2756 2757 if (node == NULL) 2758 return 0; 2759 2760 ret = callchain__fprintf_folded(fp, node->parent); 2761 first = (ret == 0); 2762 2763 list_for_each_entry(chain, &node->val, list) { 2764 if (chain->ip >= PERF_CONTEXT_MAX) 2765 continue; 2766 if (chain->ms.sym && chain->ms.sym->ignore) 2767 continue; 2768 ret += fprintf(fp, "%s%s", first ? "" : sep, 2769 callchain_list__sym_name(chain, bf, sizeof(bf), 2770 false)); 2771 first = false; 2772 } 2773 2774 return ret; 2775 } 2776 2777 static size_t timehist_print_idlehist_callchain(struct rb_root_cached *root) 2778 { 2779 size_t ret = 0; 2780 FILE *fp = stdout; 2781 struct callchain_node *chain; 2782 struct rb_node *rb_node = rb_first_cached(root); 2783 2784 printf(" %16s %8s %s\n", "Idle time (msec)", "Count", "Callchains"); 2785 printf(" %.16s %.8s %.50s\n", graph_dotted_line, graph_dotted_line, 2786 graph_dotted_line); 2787 2788 while (rb_node) { 2789 chain = rb_entry(rb_node, struct callchain_node, rb_node); 2790 rb_node = rb_next(rb_node); 2791 2792 ret += fprintf(fp, " "); 2793 print_sched_time(chain->hit, 12); 2794 ret += 16; /* print_sched_time returns 2nd arg + 4 */ 2795 ret += fprintf(fp, " %8d ", chain->count); 2796 ret += callchain__fprintf_folded(fp, chain); 2797 ret += fprintf(fp, "\n"); 2798 } 2799 2800 return ret; 2801 } 2802 2803 static void timehist_print_summary(struct perf_sched *sched, 2804 struct perf_session *session) 2805 { 2806 struct machine *m = &session->machines.host; 2807 struct total_run_stats totals; 2808 u64 task_count; 2809 struct thread *t; 2810 struct thread_runtime *r; 2811 int i; 2812 u64 hist_time = sched->hist_time.end - sched->hist_time.start; 2813 2814 memset(&totals, 0, sizeof(totals)); 2815 totals.sched = sched; 2816 2817 if (sched->idle_hist) { 2818 printf("\nIdle-time summary\n"); 2819 printf("%*s parent sched-out ", comm_width, "comm"); 2820 printf(" idle-time min-idle avg-idle max-idle stddev migrations\n"); 2821 } else if (sched->show_state) { 2822 printf("\nWait-time summary\n"); 2823 printf("%*s parent sched-in ", comm_width, "comm"); 2824 printf(" run-time sleep iowait preempt delay\n"); 2825 } else { 2826 printf("\nRuntime summary\n"); 2827 printf("%*s parent sched-in ", comm_width, "comm"); 2828 printf(" run-time min-run avg-run max-run stddev migrations\n"); 2829 } 2830 printf("%*s (count) ", comm_width, ""); 2831 printf(" (msec) (msec) (msec) (msec) %s\n", 2832 sched->show_state ? 
"(msec)" : "%"); 2833 printf("%.117s\n", graph_dotted_line); 2834 2835 machine__for_each_thread(m, show_thread_runtime, &totals); 2836 task_count = totals.task_count; 2837 if (!task_count) 2838 printf("<no still running tasks>\n"); 2839 2840 printf("\nTerminated tasks:\n"); 2841 machine__for_each_thread(m, show_deadthread_runtime, &totals); 2842 if (task_count == totals.task_count) 2843 printf("<no terminated tasks>\n"); 2844 2845 /* CPU idle stats not tracked when samples were skipped */ 2846 if (sched->skipped_samples && !sched->idle_hist) 2847 return; 2848 2849 printf("\nIdle stats:\n"); 2850 for (i = 0; i < idle_max_cpu; ++i) { 2851 t = idle_threads[i]; 2852 if (!t) 2853 continue; 2854 2855 r = thread__priv(t); 2856 if (r && r->run_stats.n) { 2857 totals.sched_count += r->run_stats.n; 2858 printf(" CPU %2d idle for ", i); 2859 print_sched_time(r->total_run_time, 6); 2860 printf(" msec (%6.2f%%)\n", 100.0 * r->total_run_time / hist_time); 2861 } else 2862 printf(" CPU %2d idle entire time window\n", i); 2863 } 2864 2865 if (sched->idle_hist && sched->show_callchain) { 2866 callchain_param.mode = CHAIN_FOLDED; 2867 callchain_param.value = CCVAL_PERIOD; 2868 2869 callchain_register_param(&callchain_param); 2870 2871 printf("\nIdle stats by callchain:\n"); 2872 for (i = 0; i < idle_max_cpu; ++i) { 2873 struct idle_thread_runtime *itr; 2874 2875 t = idle_threads[i]; 2876 if (!t) 2877 continue; 2878 2879 itr = thread__priv(t); 2880 if (itr == NULL) 2881 continue; 2882 2883 callchain_param.sort(&itr->sorted_root.rb_root, &itr->callchain, 2884 0, &callchain_param); 2885 2886 printf(" CPU %2d:", i); 2887 print_sched_time(itr->tr.total_run_time, 6); 2888 printf(" msec\n"); 2889 timehist_print_idlehist_callchain(&itr->sorted_root); 2890 printf("\n"); 2891 } 2892 } 2893 2894 printf("\n" 2895 " Total number of unique tasks: %" PRIu64 "\n" 2896 "Total number of context switches: %" PRIu64 "\n", 2897 totals.task_count, totals.sched_count); 2898 2899 printf(" Total run time (msec): "); 2900 print_sched_time(totals.total_run_time, 2); 2901 printf("\n"); 2902 2903 printf(" Total scheduling time (msec): "); 2904 print_sched_time(hist_time, 2); 2905 printf(" (x %d)\n", sched->max_cpu); 2906 } 2907 2908 typedef int (*sched_handler)(struct perf_tool *tool, 2909 union perf_event *event, 2910 struct evsel *evsel, 2911 struct perf_sample *sample, 2912 struct machine *machine); 2913 2914 static int perf_timehist__process_sample(struct perf_tool *tool, 2915 union perf_event *event, 2916 struct perf_sample *sample, 2917 struct evsel *evsel, 2918 struct machine *machine) 2919 { 2920 struct perf_sched *sched = container_of(tool, struct perf_sched, tool); 2921 int err = 0; 2922 int this_cpu = sample->cpu; 2923 2924 if (this_cpu > sched->max_cpu) 2925 sched->max_cpu = this_cpu; 2926 2927 if (evsel->handler != NULL) { 2928 sched_handler f = evsel->handler; 2929 2930 err = f(tool, event, evsel, sample, machine); 2931 } 2932 2933 return err; 2934 } 2935 2936 static int timehist_check_attr(struct perf_sched *sched, 2937 struct evlist *evlist) 2938 { 2939 struct evsel *evsel; 2940 struct evsel_runtime *er; 2941 2942 list_for_each_entry(evsel, &evlist->core.entries, core.node) { 2943 er = evsel__get_runtime(evsel); 2944 if (er == NULL) { 2945 pr_err("Failed to allocate memory for evsel runtime data\n"); 2946 return -1; 2947 } 2948 2949 if (sched->show_callchain && !evsel__has_callchain(evsel)) { 2950 pr_info("Samples do not have callchains.\n"); 2951 sched->show_callchain = 0; 2952 symbol_conf.use_callchain = 0; 2953 } 
2954 } 2955 2956 return 0; 2957 } 2958 2959 static int perf_sched__timehist(struct perf_sched *sched) 2960 { 2961 const struct evsel_str_handler handlers[] = { 2962 { "sched:sched_switch", timehist_sched_switch_event, }, 2963 { "sched:sched_wakeup", timehist_sched_wakeup_event, }, 2964 { "sched:sched_wakeup_new", timehist_sched_wakeup_event, }, 2965 }; 2966 const struct evsel_str_handler migrate_handlers[] = { 2967 { "sched:sched_migrate_task", timehist_migrate_task_event, }, 2968 }; 2969 struct perf_data data = { 2970 .path = input_name, 2971 .mode = PERF_DATA_MODE_READ, 2972 .force = sched->force, 2973 }; 2974 2975 struct perf_session *session; 2976 struct evlist *evlist; 2977 int err = -1; 2978 2979 /* 2980 * event handlers for timehist option 2981 */ 2982 sched->tool.sample = perf_timehist__process_sample; 2983 sched->tool.mmap = perf_event__process_mmap; 2984 sched->tool.comm = perf_event__process_comm; 2985 sched->tool.exit = perf_event__process_exit; 2986 sched->tool.fork = perf_event__process_fork; 2987 sched->tool.lost = process_lost; 2988 sched->tool.attr = perf_event__process_attr; 2989 sched->tool.tracing_data = perf_event__process_tracing_data; 2990 sched->tool.build_id = perf_event__process_build_id; 2991 2992 sched->tool.ordered_events = true; 2993 sched->tool.ordering_requires_timestamps = true; 2994 2995 symbol_conf.use_callchain = sched->show_callchain; 2996 2997 session = perf_session__new(&data, false, &sched->tool); 2998 if (IS_ERR(session)) 2999 return PTR_ERR(session); 3000 3001 if (cpu_list) { 3002 err = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap); 3003 if (err < 0) 3004 goto out; 3005 } 3006 3007 evlist = session->evlist; 3008 3009 symbol__init(&session->header.env); 3010 3011 if (perf_time__parse_str(&sched->ptime, sched->time_str) != 0) { 3012 pr_err("Invalid time string\n"); 3013 err = -EINVAL; goto out; /* don't leak the session on a bad --time string */ 3014 } 3015 3016 if (timehist_check_attr(sched, evlist) != 0) 3017 goto out; 3018 3019 setup_pager(); 3020 3021 /* setup per-evsel handlers */ 3022 if (perf_session__set_tracepoints_handlers(session, handlers)) 3023 goto out; 3024 3025 /* sched_switch event at a minimum needs to exist */ 3026 if (!perf_evlist__find_tracepoint_by_name(session->evlist, 3027 "sched:sched_switch")) { 3028 pr_err("No sched_switch events found.
Have you run 'perf sched record'?\n"); 3029 goto out; 3030 } 3031 3032 if (sched->show_migrations && 3033 perf_session__set_tracepoints_handlers(session, migrate_handlers)) 3034 goto out; 3035 3036 /* pre-allocate struct for per-CPU idle stats */ 3037 sched->max_cpu = session->header.env.nr_cpus_online; 3038 if (sched->max_cpu == 0) 3039 sched->max_cpu = 4; 3040 if (init_idle_threads(sched->max_cpu)) 3041 goto out; 3042 3043 /* summary_only implies summary option, but don't overwrite summary if set */ 3044 if (sched->summary_only) 3045 sched->summary = sched->summary_only; 3046 3047 if (!sched->summary_only) 3048 timehist_header(sched); 3049 3050 err = perf_session__process_events(session); 3051 if (err) { 3052 pr_err("Failed to process events, error %d", err); 3053 goto out; 3054 } 3055 3056 sched->nr_events = evlist->stats.nr_events[0]; 3057 sched->nr_lost_events = evlist->stats.total_lost; 3058 sched->nr_lost_chunks = evlist->stats.nr_events[PERF_RECORD_LOST]; 3059 3060 if (sched->summary) 3061 timehist_print_summary(sched, session); 3062 3063 out: 3064 free_idle_threads(); 3065 perf_session__delete(session); 3066 3067 return err; 3068 } 3069 3070 3071 static void print_bad_events(struct perf_sched *sched) 3072 { 3073 if (sched->nr_unordered_timestamps && sched->nr_timestamps) { 3074 printf(" INFO: %.3f%% unordered timestamps (%ld out of %ld)\n", 3075 (double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0, 3076 sched->nr_unordered_timestamps, sched->nr_timestamps); 3077 } 3078 if (sched->nr_lost_events && sched->nr_events) { 3079 printf(" INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n", 3080 (double)sched->nr_lost_events/(double)sched->nr_events * 100.0, 3081 sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks); 3082 } 3083 if (sched->nr_context_switch_bugs && sched->nr_timestamps) { 3084 printf(" INFO: %.3f%% context switch bugs (%ld out of %ld)", 3085 (double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0, 3086 sched->nr_context_switch_bugs, sched->nr_timestamps); 3087 if (sched->nr_lost_events) 3088 printf(" (due to lost events?)"); 3089 printf("\n"); 3090 } 3091 } 3092 3093 static void __merge_work_atoms(struct rb_root_cached *root, struct work_atoms *data) 3094 { 3095 struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL; 3096 struct work_atoms *this; 3097 const char *comm = thread__comm_str(data->thread), *this_comm; 3098 bool leftmost = true; 3099 3100 while (*new) { 3101 int cmp; 3102 3103 this = container_of(*new, struct work_atoms, node); 3104 parent = *new; 3105 3106 this_comm = thread__comm_str(this->thread); 3107 cmp = strcmp(comm, this_comm); 3108 if (cmp > 0) { 3109 new = &((*new)->rb_left); 3110 } else if (cmp < 0) { 3111 new = &((*new)->rb_right); 3112 leftmost = false; 3113 } else { 3114 this->num_merged++; 3115 this->total_runtime += data->total_runtime; 3116 this->nb_atoms += data->nb_atoms; 3117 this->total_lat += data->total_lat; 3118 list_splice(&data->work_list, &this->work_list); 3119 if (this->max_lat < data->max_lat) { 3120 this->max_lat = data->max_lat; 3121 this->max_lat_at = data->max_lat_at; 3122 } 3123 zfree(&data); 3124 return; 3125 } 3126 } 3127 3128 data->num_merged++; 3129 rb_link_node(&data->node, parent, new); 3130 rb_insert_color_cached(&data->node, root, leftmost); 3131 } 3132 3133 static void perf_sched__merge_lat(struct perf_sched *sched) 3134 { 3135 struct work_atoms *data; 3136 struct rb_node *node; 3137 3138 if (sched->skip_merge) 3139 return; 3140 3141 while ((node = 
rb_first_cached(&sched->atom_root))) { 3142 rb_erase_cached(node, &sched->atom_root); 3143 data = rb_entry(node, struct work_atoms, node); 3144 __merge_work_atoms(&sched->merged_atom_root, data); 3145 } 3146 } 3147 3148 static int perf_sched__lat(struct perf_sched *sched) 3149 { 3150 struct rb_node *next; 3151 3152 setup_pager(); 3153 3154 if (perf_sched__read_events(sched)) 3155 return -1; 3156 3157 perf_sched__merge_lat(sched); 3158 perf_sched__sort_lat(sched); 3159 3160 printf("\n -----------------------------------------------------------------------------------------------------------------\n"); 3161 printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at |\n"); 3162 printf(" -----------------------------------------------------------------------------------------------------------------\n"); 3163 3164 next = rb_first_cached(&sched->sorted_atom_root); 3165 3166 while (next) { 3167 struct work_atoms *work_list; 3168 3169 work_list = rb_entry(next, struct work_atoms, node); 3170 output_lat_thread(sched, work_list); 3171 next = rb_next(next); 3172 thread__zput(work_list->thread); 3173 } 3174 3175 printf(" -----------------------------------------------------------------------------------------------------------------\n"); 3176 printf(" TOTAL: |%11.3f ms |%9" PRIu64 " |\n", 3177 (double)sched->all_runtime / NSEC_PER_MSEC, sched->all_count); 3178 3179 printf(" ---------------------------------------------------\n"); 3180 3181 print_bad_events(sched); 3182 printf("\n"); 3183 3184 return 0; 3185 } 3186 3187 static int setup_map_cpus(struct perf_sched *sched) 3188 { 3189 struct perf_cpu_map *map; 3190 3191 sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF); 3192 3193 if (sched->map.comp) { 3194 sched->map.comp_cpus = zalloc(sched->max_cpu * sizeof(int)); 3195 if (!sched->map.comp_cpus) 3196 return -1; 3197 } 3198 3199 if (!sched->map.cpus_str) 3200 return 0; 3201 3202 map = perf_cpu_map__new(sched->map.cpus_str); 3203 if (!map) { 3204 pr_err("failed to get cpus map from %s\n", sched->map.cpus_str); 3205 return -1; 3206 } 3207 3208 sched->map.cpus = map; 3209 return 0; 3210 } 3211 3212 static int setup_color_pids(struct perf_sched *sched) 3213 { 3214 struct perf_thread_map *map; 3215 3216 if (!sched->map.color_pids_str) 3217 return 0; 3218 3219 map = thread_map__new_by_tid_str(sched->map.color_pids_str); 3220 if (!map) { 3221 pr_err("failed to get thread map from %s\n", sched->map.color_pids_str); 3222 return -1; 3223 } 3224 3225 sched->map.color_pids = map; 3226 return 0; 3227 } 3228 3229 static int setup_color_cpus(struct perf_sched *sched) 3230 { 3231 struct perf_cpu_map *map; 3232 3233 if (!sched->map.color_cpus_str) 3234 return 0; 3235 3236 map = perf_cpu_map__new(sched->map.color_cpus_str); 3237 if (!map) { 3238 pr_err("failed to get thread map from %s\n", sched->map.color_cpus_str); 3239 return -1; 3240 } 3241 3242 sched->map.color_cpus = map; 3243 return 0; 3244 } 3245 3246 static int perf_sched__map(struct perf_sched *sched) 3247 { 3248 if (setup_map_cpus(sched)) 3249 return -1; 3250 3251 if (setup_color_pids(sched)) 3252 return -1; 3253 3254 if (setup_color_cpus(sched)) 3255 return -1; 3256 3257 setup_pager(); 3258 if (perf_sched__read_events(sched)) 3259 return -1; 3260 print_bad_events(sched); 3261 return 0; 3262 } 3263 3264 static int perf_sched__replay(struct perf_sched *sched) 3265 { 3266 unsigned long i; 3267 3268 calibrate_run_measurement_overhead(sched); 3269 calibrate_sleep_measurement_overhead(sched); 3270 3271 test_calibrations(sched); 
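	/*
	 * Replay flow: with the run/sleep measurement overheads calibrated
	 * above, read the recorded trace to rebuild each task's event atoms,
	 * print the per-task statistics, wire up cross-task wakeup
	 * semaphores, then create one thread per recorded task and re-run
	 * the whole workload replay_repeat times.
	 */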
3272 3273 if (perf_sched__read_events(sched)) 3274 return -1; 3275 3276 printf("nr_run_events: %ld\n", sched->nr_run_events); 3277 printf("nr_sleep_events: %ld\n", sched->nr_sleep_events); 3278 printf("nr_wakeup_events: %ld\n", sched->nr_wakeup_events); 3279 3280 if (sched->targetless_wakeups) 3281 printf("target-less wakeups: %ld\n", sched->targetless_wakeups); 3282 if (sched->multitarget_wakeups) 3283 printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups); 3284 if (sched->nr_run_events_optimized) 3285 printf("run atoms optimized: %ld\n", 3286 sched->nr_run_events_optimized); 3287 3288 print_task_traces(sched); 3289 add_cross_task_wakeups(sched); 3290 3291 create_tasks(sched); 3292 printf("------------------------------------------------------------\n"); 3293 for (i = 0; i < sched->replay_repeat; i++) 3294 run_one_test(sched); 3295 3296 return 0; 3297 } 3298 3299 static void setup_sorting(struct perf_sched *sched, const struct option *options, 3300 const char * const usage_msg[]) 3301 { 3302 char *tmp, *tok, *str = strdup(sched->sort_order); 3303 3304 for (tok = strtok_r(str, ", ", &tmp); 3305 tok; tok = strtok_r(NULL, ", ", &tmp)) { 3306 if (sort_dimension__add(tok, &sched->sort_list) < 0) { 3307 usage_with_options_msg(usage_msg, options, 3308 "Unknown --sort key: `%s'", tok); 3309 } 3310 } 3311 3312 free(str); 3313 3314 sort_dimension__add("pid", &sched->cmp_pid); 3315 } 3316 3317 static int __cmd_record(int argc, const char **argv) 3318 { 3319 unsigned int rec_argc, i, j; 3320 const char **rec_argv; 3321 const char * const record_args[] = { 3322 "record", 3323 "-a", 3324 "-R", 3325 "-m", "1024", 3326 "-c", "1", 3327 "-e", "sched:sched_switch", 3328 "-e", "sched:sched_stat_wait", 3329 "-e", "sched:sched_stat_sleep", 3330 "-e", "sched:sched_stat_iowait", 3331 "-e", "sched:sched_stat_runtime", 3332 "-e", "sched:sched_process_fork", 3333 "-e", "sched:sched_wakeup", 3334 "-e", "sched:sched_wakeup_new", 3335 "-e", "sched:sched_migrate_task", 3336 }; 3337 3338 rec_argc = ARRAY_SIZE(record_args) + argc - 1; 3339 rec_argv = calloc(rec_argc + 1, sizeof(char *)); 3340 3341 if (rec_argv == NULL) 3342 return -ENOMEM; 3343 3344 for (i = 0; i < ARRAY_SIZE(record_args); i++) 3345 rec_argv[i] = strdup(record_args[i]); 3346 3347 for (j = 1; j < (unsigned int)argc; j++, i++) 3348 rec_argv[i] = argv[j]; 3349 3350 BUG_ON(i != rec_argc); 3351 3352 return cmd_record(i, rec_argv); 3353 } 3354 3355 int cmd_sched(int argc, const char **argv) 3356 { 3357 static const char default_sort_order[] = "avg, max, switch, runtime"; 3358 struct perf_sched sched = { 3359 .tool = { 3360 .sample = perf_sched__process_tracepoint_sample, 3361 .comm = perf_sched__process_comm, 3362 .namespaces = perf_event__process_namespaces, 3363 .lost = perf_event__process_lost, 3364 .fork = perf_sched__process_fork_event, 3365 .ordered_events = true, 3366 }, 3367 .cmp_pid = LIST_HEAD_INIT(sched.cmp_pid), 3368 .sort_list = LIST_HEAD_INIT(sched.sort_list), 3369 .start_work_mutex = PTHREAD_MUTEX_INITIALIZER, 3370 .work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER, 3371 .sort_order = default_sort_order, 3372 .replay_repeat = 10, 3373 .profile_cpu = -1, 3374 .next_shortname1 = 'A', 3375 .next_shortname2 = '0', 3376 .skip_merge = 0, 3377 .show_callchain = 1, 3378 .max_stack = 5, 3379 }; 3380 const struct option sched_options[] = { 3381 OPT_STRING('i', "input", &input_name, "file", 3382 "input file name"), 3383 OPT_INCR('v', "verbose", &verbose, 3384 "be more verbose (show symbol address, etc)"), 3385 OPT_BOOLEAN('D', "dump-raw-trace", 
&dump_trace, 3386 "dump raw trace in ASCII"), 3387 OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"), 3388 OPT_END() 3389 }; 3390 const struct option latency_options[] = { 3391 OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]", 3392 "sort by key(s): runtime, switch, avg, max"), 3393 OPT_INTEGER('C', "CPU", &sched.profile_cpu, 3394 "CPU to profile on"), 3395 OPT_BOOLEAN('p', "pids", &sched.skip_merge, 3396 "latency stats per pid instead of per comm"), 3397 OPT_PARENT(sched_options) 3398 }; 3399 const struct option replay_options[] = { 3400 OPT_UINTEGER('r', "repeat", &sched.replay_repeat, 3401 "repeat the workload replay N times (-1: infinite)"), 3402 OPT_PARENT(sched_options) 3403 }; 3404 const struct option map_options[] = { 3405 OPT_BOOLEAN(0, "compact", &sched.map.comp, 3406 "map output in compact mode"), 3407 OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids", 3408 "highlight given pids in map"), 3409 OPT_STRING(0, "color-cpus", &sched.map.color_cpus_str, "cpus", 3410 "highlight given CPUs in map"), 3411 OPT_STRING(0, "cpus", &sched.map.cpus_str, "cpus", 3412 "display given CPUs in map"), 3413 OPT_PARENT(sched_options) 3414 }; 3415 const struct option timehist_options[] = { 3416 OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name, 3417 "file", "vmlinux pathname"), 3418 OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, 3419 "file", "kallsyms pathname"), 3420 OPT_BOOLEAN('g', "call-graph", &sched.show_callchain, 3421 "Display call chains if present (default on)"), 3422 OPT_UINTEGER(0, "max-stack", &sched.max_stack, 3423 "Maximum number of functions to display in a backtrace."), 3424 OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory", 3425 "Look for files with symbols relative to this directory"), 3426 OPT_BOOLEAN('s', "summary", &sched.summary_only, 3427 "Show only the sched event summary with statistics"), 3428 OPT_BOOLEAN('S', "with-summary", &sched.summary, 3429 "Show all sched events and the summary with statistics"), 3430 OPT_BOOLEAN('w', "wakeups", &sched.show_wakeups, "Show wakeup events"), 3431 OPT_BOOLEAN('n', "next", &sched.show_next, "Show next task"), 3432 OPT_BOOLEAN('M', "migrations", &sched.show_migrations, "Show migration events"), 3433 OPT_BOOLEAN('V', "cpu-visual", &sched.show_cpu_visual, "Add CPU visual"), 3434 OPT_BOOLEAN('I', "idle-hist", &sched.idle_hist, "Show idle events only"), 3435 OPT_STRING(0, "time", &sched.time_str, "str", 3436 "Time span for analysis (start,stop)"), 3437 OPT_BOOLEAN(0, "state", &sched.show_state, "Show task state when sched-out"), 3438 OPT_STRING('p', "pid", &symbol_conf.pid_list_str, "pid[,pid...]", 3439 "analyze events only for given process id(s)"), 3440 OPT_STRING('t', "tid", &symbol_conf.tid_list_str, "tid[,tid...]", 3441 "analyze events only for given thread id(s)"), 3442 OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"), 3443 OPT_PARENT(sched_options) 3444 }; 3445 3446 const char * const latency_usage[] = { 3447 "perf sched latency [<options>]", 3448 NULL 3449 }; 3450 const char * const replay_usage[] = { 3451 "perf sched replay [<options>]", 3452 NULL 3453 }; 3454 const char * const map_usage[] = { 3455 "perf sched map [<options>]", 3456 NULL 3457 }; 3458 const char * const timehist_usage[] = { 3459 "perf sched timehist [<options>]", 3460 NULL 3461 }; 3462 const char *const sched_subcommands[] = { "record", "latency", "map", 3463 "replay", "script", 3464 "timehist", NULL }; 3465 const char *sched_usage[] = { 3466 NULL, 3467 NULL 3468 }; 3469 struct trace_sched_handler lat_ops = { 3470
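		/*
		 * Tracepoint handlers backing 'perf sched latency': wakeups
		 * record when a task becomes runnable, switches account
		 * scheduling delay and run time as work atoms, runtime events
		 * accumulate execution time, and migrations are tracked per
		 * task.
		 */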
.wakeup_event = latency_wakeup_event, 3471 .switch_event = latency_switch_event, 3472 .runtime_event = latency_runtime_event, 3473 .migrate_task_event = latency_migrate_task_event, 3474 }; 3475 struct trace_sched_handler map_ops = { 3476 .switch_event = map_switch_event, 3477 }; 3478 struct trace_sched_handler replay_ops = { 3479 .wakeup_event = replay_wakeup_event, 3480 .switch_event = replay_switch_event, 3481 .fork_event = replay_fork_event, 3482 }; 3483 unsigned int i; 3484 3485 for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++) 3486 sched.curr_pid[i] = -1; 3487 3488 argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands, 3489 sched_usage, PARSE_OPT_STOP_AT_NON_OPTION); 3490 if (!argc) 3491 usage_with_options(sched_usage, sched_options); 3492 3493 /* 3494 * Aliased to 'perf script' for now: 3495 */ 3496 if (!strcmp(argv[0], "script")) 3497 return cmd_script(argc, argv); 3498 3499 if (!strncmp(argv[0], "rec", 3)) { 3500 return __cmd_record(argc, argv); 3501 } else if (!strncmp(argv[0], "lat", 3)) { 3502 sched.tp_handler = &lat_ops; 3503 if (argc > 1) { 3504 argc = parse_options(argc, argv, latency_options, latency_usage, 0); 3505 if (argc) 3506 usage_with_options(latency_usage, latency_options); 3507 } 3508 setup_sorting(&sched, latency_options, latency_usage); 3509 return perf_sched__lat(&sched); 3510 } else if (!strcmp(argv[0], "map")) { 3511 if (argc) { 3512 argc = parse_options(argc, argv, map_options, map_usage, 0); 3513 if (argc) 3514 usage_with_options(map_usage, map_options); 3515 } 3516 sched.tp_handler = &map_ops; 3517 setup_sorting(&sched, latency_options, latency_usage); 3518 return perf_sched__map(&sched); 3519 } else if (!strncmp(argv[0], "rep", 3)) { 3520 sched.tp_handler = &replay_ops; 3521 if (argc) { 3522 argc = parse_options(argc, argv, replay_options, replay_usage, 0); 3523 if (argc) 3524 usage_with_options(replay_usage, replay_options); 3525 } 3526 return perf_sched__replay(&sched); 3527 } else if (!strcmp(argv[0], "timehist")) { 3528 if (argc) { 3529 argc = parse_options(argc, argv, timehist_options, 3530 timehist_usage, 0); 3531 if (argc) 3532 usage_with_options(timehist_usage, timehist_options); 3533 } 3534 if ((sched.show_wakeups || sched.show_next) && 3535 sched.summary_only) { 3536 pr_err(" Error: -s and -[n|w] are mutually exclusive.\n"); 3537 parse_options_usage(timehist_usage, timehist_options, "s", true); 3538 if (sched.show_wakeups) 3539 parse_options_usage(NULL, timehist_options, "w", true); 3540 if (sched.show_next) 3541 parse_options_usage(NULL, timehist_options, "n", true); 3542 return -EINVAL; 3543 } 3544 3545 return perf_sched__timehist(&sched); 3546 } else { 3547 usage_with_options(sched_usage, sched_options); 3548 } 3549 3550 return 0; 3551 } 3552
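/*
 * Illustrative invocations of the subcommands wired up above (examples
 * only; see tools/perf/Documentation/perf-sched.txt for the authoritative
 * usage):
 *
 *   perf sched record -- sleep 1      # record sched_* tracepoints system-wide
 *   perf sched latency --sort max     # per-task wakeup latency summary
 *   perf sched map --color-pids 1234  # per-CPU view of which task is running
 *   perf sched replay -r 5            # re-execute the recorded workload 5 times
 *   perf sched timehist -w -V         # per-event time history with wakeups
 *   perf sched timehist -s            # summary only (mutually exclusive with -w/-n)
 */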