// SPDX-License-Identifier: GPL-2.0-only
/*
 * thread-stack.c: Synthesize a thread's stack using call / return events
 * Copyright (c) 2014, Intel Corporation.
 */

#include <linux/rbtree.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/zalloc.h>
#include <errno.h>
#include <stdlib.h>
#include "thread.h"
#include "event.h"
#include "machine.h"
#include "env.h"
#include "debug.h"
#include "symbol.h"
#include "comm.h"
#include "call-path.h"
#include "thread-stack.h"

#define STACK_GROWTH 2048

/*
 * State of retpoline detection.
 *
 * RETPOLINE_NONE: no retpoline detection
 * X86_RETPOLINE_POSSIBLE: x86 retpoline possible
 * X86_RETPOLINE_DETECTED: x86 retpoline detected
 */
enum retpoline_state_t {
	RETPOLINE_NONE,
	X86_RETPOLINE_POSSIBLE,
	X86_RETPOLINE_DETECTED,
};

/**
 * struct thread_stack_entry - thread stack entry.
 * @ret_addr: return address
 * @timestamp: timestamp (if known)
 * @ref: external reference (e.g. db_id of sample)
 * @branch_count: the branch count when the entry was created
 * @insn_count: the instruction count when the entry was created
 * @cyc_count: the cycle count when the entry was created
 * @db_id: id used for db-export
 * @cp: call path
 * @no_call: a 'call' was not seen
 * @trace_end: a 'call' but trace ended
 * @non_call: a branch but not a 'call' to the start of a different symbol
 */
struct thread_stack_entry {
	u64 ret_addr;
	u64 timestamp;
	u64 ref;
	u64 branch_count;
	u64 insn_count;
	u64 cyc_count;
	u64 db_id;
	struct call_path *cp;
	bool no_call;
	bool trace_end;
	bool non_call;
};

/**
 * struct thread_stack - thread stack constructed from 'call' and 'return'
 *                       branch samples.
 * @stack: array that holds the stack
 * @cnt: number of entries in the stack
 * @sz: current maximum stack size
 * @trace_nr: current trace number
 * @branch_count: running branch count
 * @insn_count: running instruction count
 * @cyc_count: running cycle count
 * @kernel_start: kernel start address
 * @last_time: last timestamp
 * @crp: call/return processor
 * @comm: current comm
 * @arr_sz: size of array if this is the first element of an array
 * @rstate: used to detect retpolines
 */
struct thread_stack {
	struct thread_stack_entry *stack;
	size_t cnt;
	size_t sz;
	u64 trace_nr;
	u64 branch_count;
	u64 insn_count;
	u64 cyc_count;
	u64 kernel_start;
	u64 last_time;
	struct call_return_processor *crp;
	struct comm *comm;
	unsigned int arr_sz;
	enum retpoline_state_t rstate;
};

/*
 * Assume pid == tid == 0 identifies the idle task as defined by
 * perf_session__register_idle_thread(). The idle task is really 1 task per cpu,
 * and therefore requires a stack for each cpu.
 */
static inline bool thread_stack__per_cpu(struct thread *thread)
{
	return !(thread->tid || thread->pid_);
}

static int thread_stack__grow(struct thread_stack *ts)
{
	struct thread_stack_entry *new_stack;
	size_t sz, new_sz;

	new_sz = ts->sz + STACK_GROWTH;
	sz = new_sz * sizeof(struct thread_stack_entry);

	new_stack = realloc(ts->stack, sz);
	if (!new_stack)
		return -ENOMEM;

	ts->stack = new_stack;
	ts->sz = new_sz;

	return 0;
}

static int thread_stack__init(struct thread_stack *ts, struct thread *thread,
			      struct call_return_processor *crp)
{
	int err;

	err = thread_stack__grow(ts);
	if (err)
		return err;

	if (thread->mg && thread->mg->machine) {
		struct machine *machine = thread->mg->machine;
		const char *arch = perf_env__arch(machine->env);

		ts->kernel_start = machine__kernel_start(machine);
		if (!strcmp(arch, "x86"))
			ts->rstate = X86_RETPOLINE_POSSIBLE;
	} else {
		ts->kernel_start = 1ULL << 63;
	}
	ts->crp = crp;

	return 0;
}

static struct thread_stack *thread_stack__new(struct thread *thread, int cpu,
					      struct call_return_processor *crp)
{
	struct thread_stack *ts = thread->ts, *new_ts;
	unsigned int old_sz = ts ? ts->arr_sz : 0;
	unsigned int new_sz = 1;

	if (thread_stack__per_cpu(thread) && cpu > 0)
		new_sz = roundup_pow_of_two(cpu + 1);

	if (!ts || new_sz > old_sz) {
		new_ts = calloc(new_sz, sizeof(*ts));
		if (!new_ts)
			return NULL;
		if (ts)
			memcpy(new_ts, ts, old_sz * sizeof(*ts));
		new_ts->arr_sz = new_sz;
		zfree(&thread->ts);
		thread->ts = new_ts;
		ts = new_ts;
	}

	if (thread_stack__per_cpu(thread) && cpu > 0 &&
	    (unsigned int)cpu < ts->arr_sz)
		ts += cpu;

	if (!ts->stack &&
	    thread_stack__init(ts, thread, crp))
		return NULL;

	return ts;
}

static struct thread_stack *thread__cpu_stack(struct thread *thread, int cpu)
{
	struct thread_stack *ts = thread->ts;

	if (cpu < 0)
		cpu = 0;

	if (!ts || (unsigned int)cpu >= ts->arr_sz)
		return NULL;

	ts += cpu;

	if (!ts->stack)
		return NULL;

	return ts;
}

static inline struct thread_stack *thread__stack(struct thread *thread,
						 int cpu)
{
	if (!thread)
		return NULL;

	if (thread_stack__per_cpu(thread))
		return thread__cpu_stack(thread, cpu);

	return thread->ts;
}

static int thread_stack__push(struct thread_stack *ts, u64 ret_addr,
			      bool trace_end)
{
	int err = 0;

	if (ts->cnt == ts->sz) {
		err = thread_stack__grow(ts);
		if (err) {
			pr_warning("Out of memory: discarding thread stack\n");
			ts->cnt = 0;
		}
	}

	ts->stack[ts->cnt].trace_end = trace_end;
	ts->stack[ts->cnt++].ret_addr = ret_addr;

	return err;
}

static void thread_stack__pop(struct thread_stack *ts, u64 ret_addr)
{
	size_t i;

	/*
	 * In some cases there may be functions which are not seen to return.
	 * For example when setjmp / longjmp has been used. Or the perf context
	 * switch in the kernel which doesn't stop and start tracing in exactly
	 * the same code path. When that happens the return address will be
	 * further down the stack. If the return address is not found at all,
	 * we assume the opposite (i.e. this is a return for a call that wasn't
	 * seen for some reason) and leave the stack alone.
	 */
	for (i = ts->cnt; i; ) {
		if (ts->stack[--i].ret_addr == ret_addr) {
			ts->cnt = i;
			return;
		}
	}
}
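
/* Pop consecutive 'trace end' entries off the top of the stack */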
static void thread_stack__pop_trace_end(struct thread_stack *ts)
{
	size_t i;

	for (i = ts->cnt; i; ) {
		if (ts->stack[--i].trace_end)
			ts->cnt = i;
		else
			return;
	}
}

static bool thread_stack__in_kernel(struct thread_stack *ts)
{
	if (!ts->cnt)
		return false;

	return ts->stack[ts->cnt - 1].cp->in_kernel;
}

static int thread_stack__call_return(struct thread *thread,
				     struct thread_stack *ts, size_t idx,
				     u64 timestamp, u64 ref, bool no_return)
{
	struct call_return_processor *crp = ts->crp;
	struct thread_stack_entry *tse;
	struct call_return cr = {
		.thread = thread,
		.comm = ts->comm,
		.db_id = 0,
	};
	u64 *parent_db_id;

	tse = &ts->stack[idx];
	cr.cp = tse->cp;
	cr.call_time = tse->timestamp;
	cr.return_time = timestamp;
	cr.branch_count = ts->branch_count - tse->branch_count;
	cr.insn_count = ts->insn_count - tse->insn_count;
	cr.cyc_count = ts->cyc_count - tse->cyc_count;
	cr.db_id = tse->db_id;
	cr.call_ref = tse->ref;
	cr.return_ref = ref;
	if (tse->no_call)
		cr.flags |= CALL_RETURN_NO_CALL;
	if (no_return)
		cr.flags |= CALL_RETURN_NO_RETURN;
	if (tse->non_call)
		cr.flags |= CALL_RETURN_NON_CALL;

	/*
	 * The parent db_id must be assigned before exporting the child. Note
	 * it is not possible to export the parent first because its
	 * information is not complete until its 'return' has been processed.
	 */
	parent_db_id = idx ? &(tse - 1)->db_id : NULL;

	return crp->process(&cr, parent_db_id, crp->data);
}

static int __thread_stack__flush(struct thread *thread, struct thread_stack *ts)
{
	struct call_return_processor *crp = ts->crp;
	int err;

	if (!crp) {
		ts->cnt = 0;
		return 0;
	}

	while (ts->cnt) {
		err = thread_stack__call_return(thread, ts, --ts->cnt,
						ts->last_time, 0, true);
		if (err) {
			pr_err("Error flushing thread stack!\n");
			ts->cnt = 0;
			return err;
		}
	}

	return 0;
}

int thread_stack__flush(struct thread *thread)
{
	struct thread_stack *ts = thread->ts;
	unsigned int pos;
	int err = 0;

	if (ts) {
		for (pos = 0; pos < ts->arr_sz; pos++) {
			int ret = __thread_stack__flush(thread, ts + pos);

			if (ret)
				err = ret;
		}
	}

	return err;
}
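
/*
 * Keep the stack up to date from a single branch: push the return address on a
 * 'call', pop back to a matching return address on a 'return'. The push/pop is
 * skipped when a call/return processor is attached, i.e. when
 * thread_stack__process() is in use.
 */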
int thread_stack__event(struct thread *thread, int cpu, u32 flags, u64 from_ip,
			u64 to_ip, u16 insn_len, u64 trace_nr)
{
	struct thread_stack *ts = thread__stack(thread, cpu);

	if (!thread)
		return -EINVAL;

	if (!ts) {
		ts = thread_stack__new(thread, cpu, NULL);
		if (!ts) {
			pr_warning("Out of memory: no thread stack\n");
			return -ENOMEM;
		}
		ts->trace_nr = trace_nr;
	}

	/*
	 * When the trace is discontinuous, the trace_nr changes. In that case
	 * the stack might be completely invalid. Better to report nothing than
	 * to report something misleading, so flush the stack.
	 */
	if (trace_nr != ts->trace_nr) {
		if (ts->trace_nr)
			__thread_stack__flush(thread, ts);
		ts->trace_nr = trace_nr;
	}

	/* Stop here if thread_stack__process() is in use */
	if (ts->crp)
		return 0;

	if (flags & PERF_IP_FLAG_CALL) {
		u64 ret_addr;

		if (!to_ip)
			return 0;
		ret_addr = from_ip + insn_len;
		if (ret_addr == to_ip)
			return 0; /* Zero-length calls are excluded */
		return thread_stack__push(ts, ret_addr,
					  flags & PERF_IP_FLAG_TRACE_END);
	} else if (flags & PERF_IP_FLAG_TRACE_BEGIN) {
		/*
		 * If the caller did not change the trace number (which would
		 * have flushed the stack) then try to make sense of the stack.
		 * Possibly, tracing began after returning to the current
		 * address, so try to pop that. Also, do not expect a call made
		 * when the trace ended, to return, so pop that.
		 */
		thread_stack__pop(ts, to_ip);
		thread_stack__pop_trace_end(ts);
	} else if ((flags & PERF_IP_FLAG_RETURN) && from_ip) {
		thread_stack__pop(ts, to_ip);
	}

	return 0;
}

void thread_stack__set_trace_nr(struct thread *thread, int cpu, u64 trace_nr)
{
	struct thread_stack *ts = thread__stack(thread, cpu);

	if (!ts)
		return;

	if (trace_nr != ts->trace_nr) {
		if (ts->trace_nr)
			__thread_stack__flush(thread, ts);
		ts->trace_nr = trace_nr;
	}
}

static void __thread_stack__free(struct thread *thread, struct thread_stack *ts)
{
	__thread_stack__flush(thread, ts);
	zfree(&ts->stack);
}

static void thread_stack__reset(struct thread *thread, struct thread_stack *ts)
{
	unsigned int arr_sz = ts->arr_sz;

	__thread_stack__free(thread, ts);
	memset(ts, 0, sizeof(*ts));
	ts->arr_sz = arr_sz;
}

void thread_stack__free(struct thread *thread)
{
	struct thread_stack *ts = thread->ts;
	unsigned int pos;

	if (ts) {
		for (pos = 0; pos < ts->arr_sz; pos++)
			__thread_stack__free(thread, ts + pos);
		zfree(&thread->ts);
	}
}

static inline u64 callchain_context(u64 ip, u64 kernel_start)
{
	return ip < kernel_start ? PERF_CONTEXT_USER : PERF_CONTEXT_KERNEL;
}
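
/*
 * Synthesize a callchain into @chain: a context marker (PERF_CONTEXT_USER or
 * PERF_CONTEXT_KERNEL) followed by the sampled ip, then the stacked return
 * addresses from the top of the stack downwards, inserting a new context
 * marker whenever the user/kernel context changes.
 */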
void thread_stack__sample(struct thread *thread, int cpu,
			  struct ip_callchain *chain,
			  size_t sz, u64 ip, u64 kernel_start)
{
	struct thread_stack *ts = thread__stack(thread, cpu);
	u64 context = callchain_context(ip, kernel_start);
	u64 last_context;
	size_t i, j;

	if (sz < 2) {
		chain->nr = 0;
		return;
	}

	chain->ips[0] = context;
	chain->ips[1] = ip;

	if (!ts) {
		chain->nr = 2;
		return;
	}

	last_context = context;

	for (i = 2, j = 1; i < sz && j <= ts->cnt; i++, j++) {
		ip = ts->stack[ts->cnt - j].ret_addr;
		context = callchain_context(ip, kernel_start);
		if (context != last_context) {
			if (i >= sz - 1)
				break;
			chain->ips[i++] = context;
			last_context = context;
		}
		chain->ips[i] = ip;
	}

	chain->nr = i;
}

struct call_return_processor *
call_return_processor__new(int (*process)(struct call_return *cr, u64 *parent_db_id, void *data),
			   void *data)
{
	struct call_return_processor *crp;

	crp = zalloc(sizeof(struct call_return_processor));
	if (!crp)
		return NULL;
	crp->cpr = call_path_root__new();
	if (!crp->cpr)
		goto out_free;
	crp->process = process;
	crp->data = data;
	return crp;

out_free:
	free(crp);
	return NULL;
}

void call_return_processor__free(struct call_return_processor *crp)
{
	if (crp) {
		call_path_root__free(crp->cpr);
		free(crp);
	}
}

static int thread_stack__push_cp(struct thread_stack *ts, u64 ret_addr,
				 u64 timestamp, u64 ref, struct call_path *cp,
				 bool no_call, bool trace_end)
{
	struct thread_stack_entry *tse;
	int err;

	if (!cp)
		return -ENOMEM;

	if (ts->cnt == ts->sz) {
		err = thread_stack__grow(ts);
		if (err)
			return err;
	}

	tse = &ts->stack[ts->cnt++];
	tse->ret_addr = ret_addr;
	tse->timestamp = timestamp;
	tse->ref = ref;
	tse->branch_count = ts->branch_count;
	tse->insn_count = ts->insn_count;
	tse->cyc_count = ts->cyc_count;
	tse->cp = cp;
	tse->no_call = no_call;
	tse->trace_end = trace_end;
	tse->non_call = false;
	tse->db_id = 0;

	return 0;
}
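
/*
 * Pop entries until one matching @ret_addr (or, for a stack of depth 1, the
 * symbol @sym) has been reported as returned. Entries skipped on the way down
 * are reported with the 'no return' flag. Returns 1 if the stack is empty or
 * nothing matches, in which case the caller falls back to
 * thread_stack__no_call_return().
 */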
static int thread_stack__pop_cp(struct thread *thread, struct thread_stack *ts,
				u64 ret_addr, u64 timestamp, u64 ref,
				struct symbol *sym)
{
	int err;

	if (!ts->cnt)
		return 1;

	if (ts->cnt == 1) {
		struct thread_stack_entry *tse = &ts->stack[0];

		if (tse->cp->sym == sym)
			return thread_stack__call_return(thread, ts, --ts->cnt,
							 timestamp, ref, false);
	}

	if (ts->stack[ts->cnt - 1].ret_addr == ret_addr &&
	    !ts->stack[ts->cnt - 1].non_call) {
		return thread_stack__call_return(thread, ts, --ts->cnt,
						 timestamp, ref, false);
	} else {
		size_t i = ts->cnt - 1;

		while (i--) {
			if (ts->stack[i].ret_addr != ret_addr ||
			    ts->stack[i].non_call)
				continue;
			i += 1;
			while (ts->cnt > i) {
				err = thread_stack__call_return(thread, ts,
								--ts->cnt,
								timestamp, ref,
								true);
				if (err)
					return err;
			}
			return thread_stack__call_return(thread, ts, --ts->cnt,
							 timestamp, ref, false);
		}
	}

	return 1;
}

static int thread_stack__bottom(struct thread_stack *ts,
				struct perf_sample *sample,
				struct addr_location *from_al,
				struct addr_location *to_al, u64 ref)
{
	struct call_path_root *cpr = ts->crp->cpr;
	struct call_path *cp;
	struct symbol *sym;
	u64 ip;

	if (sample->ip) {
		ip = sample->ip;
		sym = from_al->sym;
	} else if (sample->addr) {
		ip = sample->addr;
		sym = to_al->sym;
	} else {
		return 0;
	}

	cp = call_path__findnew(cpr, &cpr->call_path, sym, ip,
				ts->kernel_start);

	return thread_stack__push_cp(ts, ip, sample->time, ref, cp,
				     true, false);
}

static int thread_stack__pop_ks(struct thread *thread, struct thread_stack *ts,
				struct perf_sample *sample, u64 ref)
{
	u64 tm = sample->time;
	int err;

	/* Return to userspace, so pop all kernel addresses */
	while (thread_stack__in_kernel(ts)) {
		err = thread_stack__call_return(thread, ts, --ts->cnt,
						tm, ref, true);
		if (err)
			return err;
	}

	return 0;
}

static int thread_stack__no_call_return(struct thread *thread,
					struct thread_stack *ts,
					struct perf_sample *sample,
					struct addr_location *from_al,
					struct addr_location *to_al, u64 ref)
{
	struct call_path_root *cpr = ts->crp->cpr;
	struct call_path *root = &cpr->call_path;
	struct symbol *fsym = from_al->sym;
	struct symbol *tsym = to_al->sym;
	struct call_path *cp, *parent;
	u64 ks = ts->kernel_start;
	u64 addr = sample->addr;
	u64 tm = sample->time;
	u64 ip = sample->ip;
	int err;

	if (ip >= ks && addr < ks) {
		/* Return to userspace, so pop all kernel addresses */
		err = thread_stack__pop_ks(thread, ts, sample, ref);
		if (err)
			return err;

		/* If the stack is empty, push the userspace address */
		if (!ts->cnt) {
			cp = call_path__findnew(cpr, root, tsym, addr, ks);
			return thread_stack__push_cp(ts, 0, tm, ref, cp, true,
						     false);
		}
	} else if (thread_stack__in_kernel(ts) && ip < ks) {
		/* Return to userspace, so pop all kernel addresses */
		err = thread_stack__pop_ks(thread, ts, sample, ref);
		if (err)
			return err;
	}

	if (ts->cnt)
		parent = ts->stack[ts->cnt - 1].cp;
	else
		parent = root;

	if (parent->sym == from_al->sym) {
		/*
		 * At the bottom of the stack, assume the missing 'call' was
		 * before the trace started. So, pop the current symbol and push
		 * the 'to' symbol.
		 */
		if (ts->cnt == 1) {
			err = thread_stack__call_return(thread, ts, --ts->cnt,
							tm, ref, false);
			if (err)
				return err;
		}

		if (!ts->cnt) {
			cp = call_path__findnew(cpr, root, tsym, addr, ks);

			return thread_stack__push_cp(ts, addr, tm, ref, cp,
						     true, false);
		}

		/*
		 * Otherwise assume the 'return' is being used as a jump (e.g.
		 * retpoline) and just push the 'to' symbol.
		 */
		cp = call_path__findnew(cpr, parent, tsym, addr, ks);

		err = thread_stack__push_cp(ts, 0, tm, ref, cp, true, false);
		if (!err)
			ts->stack[ts->cnt - 1].non_call = true;

		return err;
	}

	/*
	 * Assume 'parent' has not yet returned, so push 'to', and then push and
	 * pop 'from'.
	 */

	cp = call_path__findnew(cpr, parent, tsym, addr, ks);

	err = thread_stack__push_cp(ts, addr, tm, ref, cp, true, false);
	if (err)
		return err;

	cp = call_path__findnew(cpr, cp, fsym, ip, ks);

	err = thread_stack__push_cp(ts, ip, tm, ref, cp, true, false);
	if (err)
		return err;

	return thread_stack__call_return(thread, ts, --ts->cnt, tm, ref, false);
}
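
/*
 * Tracing has begun again: if the entry on top of the stack was pushed when
 * the trace ended, report it as returned now.
 */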
static int thread_stack__trace_begin(struct thread *thread,
				     struct thread_stack *ts, u64 timestamp,
				     u64 ref)
{
	struct thread_stack_entry *tse;
	int err;

	if (!ts->cnt)
		return 0;

	/* Pop trace end */
	tse = &ts->stack[ts->cnt - 1];
	if (tse->trace_end) {
		err = thread_stack__call_return(thread, ts, --ts->cnt,
						timestamp, ref, false);
		if (err)
			return err;
	}

	return 0;
}

static int thread_stack__trace_end(struct thread_stack *ts,
				   struct perf_sample *sample, u64 ref)
{
	struct call_path_root *cpr = ts->crp->cpr;
	struct call_path *cp;
	u64 ret_addr;

	/* No point having 'trace end' on the bottom of the stack */
	if (!ts->cnt || (ts->cnt == 1 && ts->stack[0].ref == ref))
		return 0;

	cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp, NULL, 0,
				ts->kernel_start);

	ret_addr = sample->ip + sample->insn_len;

	return thread_stack__push_cp(ts, ret_addr, sample->time, ref, cp,
				     false, true);
}
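
/* Match the symbol names used by x86 indirect branch thunks (retpolines) */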
static bool is_x86_retpoline(const char *name)
{
	const char *p = strstr(name, "__x86_indirect_thunk_");

	return p == name || !strcmp(name, "__indirect_thunk_start");
}

/*
 * x86 retpoline functions pollute the call graph. This function removes them.
 * This does not handle function return thunks, nor is there any improvement
 * for the handling of inline thunks or extern thunks.
 */
static int thread_stack__x86_retpoline(struct thread_stack *ts,
				       struct perf_sample *sample,
				       struct addr_location *to_al)
{
	struct thread_stack_entry *tse = &ts->stack[ts->cnt - 1];
	struct call_path_root *cpr = ts->crp->cpr;
	struct symbol *sym = tse->cp->sym;
	struct symbol *tsym = to_al->sym;
	struct call_path *cp;

	if (sym && is_x86_retpoline(sym->name)) {
		/*
		 * This is an x86 retpoline fn. It pollutes the call graph by
		 * showing up everywhere there is an indirect branch, but does
		 * not itself mean anything. Here the top-of-stack is removed,
		 * by decrementing the stack count, and then further down, the
		 * resulting top-of-stack is replaced with the actual target.
		 * The result is that the retpoline functions will no longer
		 * appear in the call graph. Note this only affects the call
		 * graph, since all the original branches are left unchanged.
		 */
		ts->cnt -= 1;
		sym = ts->stack[ts->cnt - 2].cp->sym;
		if (sym && sym == tsym && to_al->addr != tsym->start) {
			/*
			 * Target is back to the middle of the symbol we came
			 * from so assume it is an indirect jmp and forget it
			 * altogether.
			 */
			ts->cnt -= 1;
			return 0;
		}
	} else if (sym && sym == tsym) {
		/*
		 * Target is back to the symbol we came from so assume it is an
		 * indirect jmp and forget it altogether.
		 */
		ts->cnt -= 1;
		return 0;
	}

	cp = call_path__findnew(cpr, ts->stack[ts->cnt - 2].cp, tsym,
				sample->addr, ts->kernel_start);
	if (!cp)
		return -ENOMEM;

	/* Replace the top-of-stack with the actual target */
	ts->stack[ts->cnt - 1].cp = cp;

	return 0;
}
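
/*
 * Process one branch sample with a call/return processor attached: maintain
 * the thread's call path and report each completed call/return pair via @crp
 * (e.g. for db-export). Supersedes any stack built by thread_stack__event().
 */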
int thread_stack__process(struct thread *thread, struct comm *comm,
			  struct perf_sample *sample,
			  struct addr_location *from_al,
			  struct addr_location *to_al, u64 ref,
			  struct call_return_processor *crp)
{
	struct thread_stack *ts = thread__stack(thread, sample->cpu);
	enum retpoline_state_t rstate;
	int err = 0;

	if (ts && !ts->crp) {
		/* Supersede thread_stack__event() */
		thread_stack__reset(thread, ts);
		ts = NULL;
	}

	if (!ts) {
		ts = thread_stack__new(thread, sample->cpu, crp);
		if (!ts)
			return -ENOMEM;
		ts->comm = comm;
	}

	rstate = ts->rstate;
	if (rstate == X86_RETPOLINE_DETECTED)
		ts->rstate = X86_RETPOLINE_POSSIBLE;

	/* Flush stack on exec */
	if (ts->comm != comm && thread->pid_ == thread->tid) {
		err = __thread_stack__flush(thread, ts);
		if (err)
			return err;
		ts->comm = comm;
	}

	/* If the stack is empty, put the current symbol on the stack */
	if (!ts->cnt) {
		err = thread_stack__bottom(ts, sample, from_al, to_al, ref);
		if (err)
			return err;
	}

	ts->branch_count += 1;
	ts->insn_count += sample->insn_cnt;
	ts->cyc_count += sample->cyc_cnt;
	ts->last_time = sample->time;

	if (sample->flags & PERF_IP_FLAG_CALL) {
		bool trace_end = sample->flags & PERF_IP_FLAG_TRACE_END;
		struct call_path_root *cpr = ts->crp->cpr;
		struct call_path *cp;
		u64 ret_addr;

		if (!sample->ip || !sample->addr)
			return 0;

		ret_addr = sample->ip + sample->insn_len;
		if (ret_addr == sample->addr)
			return 0; /* Zero-length calls are excluded */

		cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
					to_al->sym, sample->addr,
					ts->kernel_start);
		err = thread_stack__push_cp(ts, ret_addr, sample->time, ref,
					    cp, false, trace_end);

		/*
		 * A call to the same symbol, but not to the start of the
		 * symbol, may be the start of an x86 retpoline.
		 */
		if (!err && rstate == X86_RETPOLINE_POSSIBLE && to_al->sym &&
		    from_al->sym == to_al->sym &&
		    to_al->addr != to_al->sym->start)
			ts->rstate = X86_RETPOLINE_DETECTED;

	} else if (sample->flags & PERF_IP_FLAG_RETURN) {
		if (!sample->addr) {
			u32 return_from_kernel = PERF_IP_FLAG_SYSCALLRET |
						 PERF_IP_FLAG_INTERRUPT;

			if (!(sample->flags & return_from_kernel))
				return 0;

			/* Pop kernel stack */
			return thread_stack__pop_ks(thread, ts, sample, ref);
		}

		if (!sample->ip)
			return 0;

		/* x86 retpoline 'return' doesn't match the stack */
		if (rstate == X86_RETPOLINE_DETECTED && ts->cnt > 2 &&
		    ts->stack[ts->cnt - 1].ret_addr != sample->addr)
			return thread_stack__x86_retpoline(ts, sample, to_al);

		err = thread_stack__pop_cp(thread, ts, sample->addr,
					   sample->time, ref, from_al->sym);
		if (err) {
			if (err < 0)
				return err;
			err = thread_stack__no_call_return(thread, ts, sample,
							   from_al, to_al, ref);
		}
	} else if (sample->flags & PERF_IP_FLAG_TRACE_BEGIN) {
		err = thread_stack__trace_begin(thread, ts, sample->time, ref);
	} else if (sample->flags & PERF_IP_FLAG_TRACE_END) {
		err = thread_stack__trace_end(ts, sample, ref);
	} else if (sample->flags & PERF_IP_FLAG_BRANCH &&
		   from_al->sym != to_al->sym && to_al->sym &&
		   to_al->addr == to_al->sym->start) {
		struct call_path_root *cpr = ts->crp->cpr;
		struct call_path *cp;

		/*
		 * The compiler might optimize a call/ret combination by making
		 * it a jmp. Make that visible by recording on the stack a
		 * branch to the start of a different symbol. Note, that means
		 * when a ret pops the stack, all jmps must be popped off first.
		 */
		cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
					to_al->sym, sample->addr,
					ts->kernel_start);
		err = thread_stack__push_cp(ts, 0, sample->time, ref, cp, false,
					    false);
		if (!err)
			ts->stack[ts->cnt - 1].non_call = true;
	}

	return err;
}

size_t thread_stack__depth(struct thread *thread, int cpu)
{
	struct thread_stack *ts = thread__stack(thread, cpu);

	if (!ts)
		return 0;
	return ts->cnt;
}