// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
 *
 */
#include <linux/sched/task_stack.h>
#include <linux/stacktrace.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/spinlock.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/init.h>

#include <asm/setup.h>

#include "trace.h"

#define STACK_TRACE_ENTRIES 500

static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES];
static unsigned stack_trace_index[STACK_TRACE_ENTRIES];

static unsigned int stack_trace_nr_entries;
static unsigned long stack_trace_max_size;
static arch_spinlock_t stack_trace_max_lock =
	(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;

DEFINE_PER_CPU(int, disable_stack_tracer);
static DEFINE_MUTEX(stack_sysctl_mutex);

int stack_tracer_enabled;

static void print_max_stack(void)
{
	long i;
	int size;

	pr_emerg("        Depth    Size   Location    (%d entries)\n"
		 "        -----    ----   --------\n",
		 stack_trace_nr_entries);

	for (i = 0; i < stack_trace_nr_entries; i++) {
		if (i + 1 == stack_trace_nr_entries)
			size = stack_trace_index[i];
		else
			size = stack_trace_index[i] - stack_trace_index[i+1];

		pr_emerg("%3ld) %8d   %5d   %pS\n", i, stack_trace_index[i],
			 size, (void *)stack_dump_trace[i]);
	}
}

/*
 * The stack tracer looks for a maximum stack at each call from a function. It
 * registers a callback from ftrace, and in that callback it examines the stack
 * size. It determines the stack size from the variable passed in, which is the
 * address of a local variable in the stack_trace_call() callback function.
 * The stack size is calculated from the address of that local variable to the
 * top of the current stack. If that size is smaller than the currently saved
 * max stack size, nothing more is done.
 *
 * If the size of the stack is greater than the maximum recorded size, then the
 * following algorithm takes place.
 *
 * For architectures (like x86) that store the function's return address before
 * saving the function's local variables, the stack will look something like
 * this:
 *
 *   [ top of stack ]
 *    0: sys call entry frame
 *   10: return addr to entry code
 *   11: start of sys_foo frame
 *   20: return addr to sys_foo
 *   21: start of kernel_func_bar frame
 *   30: return addr to kernel_func_bar
 *   31: [ do trace stack here ]
 *
 * stack_trace_save() is called, returning all the functions it finds in the
 * current stack, which would be (from the bottom of the stack to the top):
 *
 *   return addr to kernel_func_bar
 *   return addr to sys_foo
 *   return addr to entry code
 *
 * Now, to figure out the size of each function's local variables, a search of
 * the stack is made to find these values. When a match is made, it is added to
 * the stack_dump_trace[] array. The offset into the stack is saved in the
 * stack_trace_index[] array. The above example would show:
 *
 *        stack_dump_trace[]        |   stack_trace_index[]
 *        ------------------        +   -------------------
 *  return addr to kernel_func_bar  |          30
 *  return addr to sys_foo          |          20
 *  return addr to entry            |          10
 *
 * The print_max_stack() function above uses these values to print the size of
 * each function's portion of the stack.
 *
 *  for (i = 0; i < nr_entries; i++) {
 *     size = i == nr_entries - 1 ? stack_trace_index[i] :
 *                     stack_trace_index[i] - stack_trace_index[i+1]
 *     print "%d %d %d %s\n", i, stack_trace_index[i], size, stack_dump_trace[i]);
 *  }
 *
 * The above shows
 *
 *     depth size location
 *     ----- ---- --------
 *  0    30   10  kernel_func_bar
 *  1    20   10  sys_foo
 *  2    10   10  entry code
 *
 * Now for architectures that may save the return address after the function's
 * local variables (saving the link register before calling nested functions),
 * the stack will look a little different:
 *
 *   [ top of stack ]
 *    0: sys call entry frame
 *   10: start of sys_foo frame
 *   19: return addr to entry code << lr saved before calling kernel_func_bar
 *   20: start of kernel_func_bar frame
 *   29: return addr to sys_foo << lr saved before calling next function
 *   30: [ do trace stack here ]
 *
 * Although the functions returned by stack_trace_save() may be the same, their
 * placement in the stack will be different. Using the same algorithm as above
 * would yield:
 *
 *        stack_dump_trace[]        |   stack_trace_index[]
 *        ------------------        +   -------------------
 *  return addr to kernel_func_bar  |          30
 *  return addr to sys_foo          |          29
 *  return addr to entry            |          19
 *
 * Where the mapping is off by one:
 *
 *   kernel_func_bar stack frame size is 29 - 19 not 30 - 29!
 *
 * To fix this, if the architecture defines ARCH_FTRACE_SHIFT_STACK_TRACER,
 * the values in stack_trace_index[] are shifted by one and the number of
 * stack trace entries is decremented by one.
 *
 *        stack_dump_trace[]        |   stack_trace_index[]
 *        ------------------        +   -------------------
 *  return addr to kernel_func_bar  |          29
 *  return addr to sys_foo          |          19
 *
 * Although the entry function is no longer displayed, the size reported for
 * the first function (sys_foo) still includes the entry function's portion of
 * the stack.
 */
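/*
 * Illustrative only, a minimal sketch of the size calculation described
 * above (check_stack() below is the real implementation): assuming the
 * task stack is THREAD_SIZE bytes and THREAD_SIZE aligned, the space in
 * use is the distance from a local variable up to the top of the stack:
 *
 *	unsigned long used = THREAD_SIZE -
 *		((unsigned long)&some_local & (THREAD_SIZE - 1));
 *
 * where &some_local is a stand-in for the address check_stack() receives.
 */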
static void check_stack(unsigned long ip, unsigned long *stack)
{
	unsigned long this_size, flags; unsigned long *p, *top, *start;
	static int tracer_frame;
	int frame_size = READ_ONCE(tracer_frame);
	int i, x;

	this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
	this_size = THREAD_SIZE - this_size;
	/* Remove the frame of the tracer */
	this_size -= frame_size;

	if (this_size <= stack_trace_max_size)
		return;

	/* we do not handle interrupt stacks yet */
	if (!object_is_on_stack(stack))
		return;

	/* Can't do this from NMI context (can cause deadlocks) */
	if (in_nmi())
		return;

	local_irq_save(flags);
	arch_spin_lock(&stack_trace_max_lock);

	/* In case another CPU set the tracer_frame on us */
	if (unlikely(!frame_size))
		this_size -= tracer_frame;

	/* a race could have already updated it */
	if (this_size <= stack_trace_max_size)
		goto out;

	stack_trace_max_size = this_size;

	stack_trace_nr_entries = stack_trace_save(stack_dump_trace,
					ARRAY_SIZE(stack_dump_trace) - 1,
					0);

	/* Skip over the overhead of the stack tracer itself */
	for (i = 0; i < stack_trace_nr_entries; i++) {
		if (stack_dump_trace[i] == ip)
			break;
	}

	/*
	 * Some archs may not have the passed in ip in the dump.
	 * If that happens, we need to show everything.
	 */
	if (i == stack_trace_nr_entries)
		i = 0;

	/*
	 * Now find where in the stack these are.
	 */
	x = 0;
	start = stack;
	top = (unsigned long *)
		(((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);

	/*
	 * Loop through all the entries. One of the entries may
	 * for some reason be missed on the stack, so we may
	 * have to account for them. If they are all there, this
	 * loop will only happen once. This code only takes place
	 * on a new max, so it is far from a fast path.
	 */
	while (i < stack_trace_nr_entries) {
		int found = 0;

		stack_trace_index[x] = this_size;
		p = start;

		for (; p < top && i < stack_trace_nr_entries; p++) {
			/*
			 * The READ_ONCE_NOCHECK is used to let KASAN know that
			 * this is not a stack-out-of-bounds error.
			 */
			if ((READ_ONCE_NOCHECK(*p)) == stack_dump_trace[i]) {
				stack_dump_trace[x] = stack_dump_trace[i++];
				this_size = stack_trace_index[x++] =
					(top - p) * sizeof(unsigned long);
				found = 1;
				/* Start the search from here */
				start = p + 1;
				/*
				 * We do not want to show the overhead
				 * of the stack tracer stack in the
				 * max stack. If we haven't figured
				 * out what that is, then figure it out
				 * now.
				 */
				if (unlikely(!tracer_frame)) {
					tracer_frame = (p - stack) *
						sizeof(unsigned long);
					stack_trace_max_size -= tracer_frame;
				}
			}
		}

		if (!found)
			i++;
	}

#ifdef ARCH_FTRACE_SHIFT_STACK_TRACER
	/*
	 * Some archs will store the link register before calling
	 * nested functions. This means the saved return address
	 * comes after the local storage, and we need to shift
	 * for that.
	 */
	if (x > 1) {
		memmove(&stack_trace_index[0], &stack_trace_index[1],
			sizeof(stack_trace_index[0]) * (x - 1));
		x--;
	}
#endif

	stack_trace_nr_entries = x;

	if (task_stack_end_corrupted(current)) {
		print_max_stack();
		BUG();
	}

 out:
	arch_spin_unlock(&stack_trace_max_lock);
	local_irq_restore(flags);
}
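/*
 * The #ifdef above only requires that ARCH_FTRACE_SHIFT_STACK_TRACER be
 * visible here. A sketch of the usual opt-in (an assumption about the
 * typical pattern, not something this file mandates): an architecture
 * that saves the link register after its local variables defines the
 * macro in its asm/ftrace.h, for example:
 *
 *	#define ARCH_FTRACE_SHIFT_STACK_TRACER 1
 */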
static void
stack_trace_call(unsigned long ip, unsigned long parent_ip,
		 struct ftrace_ops *op, struct pt_regs *pt_regs)
{
	unsigned long stack;

	preempt_disable_notrace();

	/* no atomic needed, we only modify this variable by this cpu */
	__this_cpu_inc(disable_stack_tracer);
	if (__this_cpu_read(disable_stack_tracer) != 1)
		goto out;

	/* If rcu is not watching, then save stack trace can fail */
	if (!rcu_is_watching())
		goto out;

	ip += MCOUNT_INSN_SIZE;

	check_stack(ip, &stack);

 out:
	__this_cpu_dec(disable_stack_tracer);
	/* prevent recursion in schedule */
	preempt_enable_notrace();
}

static struct ftrace_ops trace_ops __read_mostly =
{
	.func = stack_trace_call,
	.flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
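/*
 * Enabling and disabling the stack tracer is simply registering and
 * unregistering trace_ops (see stack_trace_sysctl() and stack_trace_init()
 * below). Once registered, stack_trace_call() runs at the entry of every
 * function ftrace can trace, optionally limited by the stack_trace_filter
 * file when CONFIG_DYNAMIC_FTRACE is enabled.
 */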
static ssize_t
stack_max_size_read(struct file *filp, char __user *ubuf,
		    size_t count, loff_t *ppos)
{
	unsigned long *ptr = filp->private_data;
	char buf[64];
	int r;

	r = snprintf(buf, sizeof(buf), "%ld\n", *ptr);
	if (r > sizeof(buf))
		r = sizeof(buf);
	return simple_read_from_buffer(ubuf, count, ppos, buf, r);
}

static ssize_t
stack_max_size_write(struct file *filp, const char __user *ubuf,
		     size_t count, loff_t *ppos)
{
	long *ptr = filp->private_data;
	unsigned long val, flags;
	int ret;

	ret = kstrtoul_from_user(ubuf, count, 10, &val);
	if (ret)
		return ret;

	local_irq_save(flags);

	/*
	 * In case we trace inside arch_spin_lock() or after it (NMI),
	 * we would deadlock on stack_trace_max_lock, so we also need
	 * to increment the percpu disable_stack_tracer here.
	 */
	__this_cpu_inc(disable_stack_tracer);

	arch_spin_lock(&stack_trace_max_lock);
	*ptr = val;
	arch_spin_unlock(&stack_trace_max_lock);

	__this_cpu_dec(disable_stack_tracer);
	local_irq_restore(flags);

	return count;
}

static const struct file_operations stack_max_size_fops = {
	.open		= tracing_open_generic,
	.read		= stack_max_size_read,
	.write		= stack_max_size_write,
	.llseek		= default_llseek,
};

static void *
__next(struct seq_file *m, loff_t *pos)
{
	long n = *pos - 1;

	if (n >= stack_trace_nr_entries)
		return NULL;

	m->private = (void *)n;
	return &m->private;
}

static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return __next(m, pos);
}

static void *t_start(struct seq_file *m, loff_t *pos)
{
	local_irq_disable();

	__this_cpu_inc(disable_stack_tracer);

	arch_spin_lock(&stack_trace_max_lock);

	if (*pos == 0)
		return SEQ_START_TOKEN;

	return __next(m, pos);
}

static void t_stop(struct seq_file *m, void *p)
{
	arch_spin_unlock(&stack_trace_max_lock);

	__this_cpu_dec(disable_stack_tracer);

	local_irq_enable();
}

static void trace_lookup_stack(struct seq_file *m, long i)
{
	unsigned long addr = stack_dump_trace[i];

	seq_printf(m, "%pS\n", (void *)addr);
}

static void print_disabled(struct seq_file *m)
{
	seq_puts(m, "#\n"
		 "#  Stack tracer disabled\n"
		 "#\n"
		 "# To enable the stack tracer, either add 'stacktrace' to the\n"
		 "# kernel command line\n"
		 "# or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'\n"
		 "#\n");
}

static int t_show(struct seq_file *m, void *v)
{
	long i;
	int size;

	if (v == SEQ_START_TOKEN) {
		seq_printf(m, "        Depth    Size   Location"
			   "    (%d entries)\n"
			   "        -----    ----   --------\n",
			   stack_trace_nr_entries);

		if (!stack_tracer_enabled && !stack_trace_max_size)
			print_disabled(m);

		return 0;
	}

	i = *(long *)v;

	if (i >= stack_trace_nr_entries)
		return 0;

	if (i + 1 == stack_trace_nr_entries)
		size = stack_trace_index[i];
	else
		size = stack_trace_index[i] - stack_trace_index[i+1];

	seq_printf(m, "%3ld) %8d   %5d   ", i, stack_trace_index[i], size);

	trace_lookup_stack(m, i);

	return 0;
}

static const struct seq_operations stack_trace_seq_ops = {
	.start		= t_start,
	.next		= t_next,
	.stop		= t_stop,
	.show		= t_show,
};

static int stack_trace_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &stack_trace_seq_ops);
}

static const struct file_operations stack_trace_fops = {
	.open		= stack_trace_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
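/*
 * Typical usage from user space, shown as a sketch (the files below are
 * the ones created by stack_trace_init(); the tracefs mount point is
 * commonly /sys/kernel/tracing or /sys/kernel/debug/tracing):
 *
 *	# echo 1 > /proc/sys/kernel/stack_tracer_enabled
 *	# cat /sys/kernel/tracing/stack_max_size
 *	# cat /sys/kernel/tracing/stack_trace
 */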
#ifdef CONFIG_DYNAMIC_FTRACE

static int
stack_trace_filter_open(struct inode *inode, struct file *file)
{
	struct ftrace_ops *ops = inode->i_private;

	return ftrace_regex_open(ops, FTRACE_ITER_FILTER,
				 inode, file);
}

static const struct file_operations stack_trace_filter_fops = {
	.open		= stack_trace_filter_open,
	.read		= seq_read,
	.write		= ftrace_filter_write,
	.llseek		= tracing_lseek,
	.release	= ftrace_regex_release,
};

#endif /* CONFIG_DYNAMIC_FTRACE */

int
stack_trace_sysctl(struct ctl_table *table, int write,
		   void __user *buffer, size_t *lenp,
		   loff_t *ppos)
{
	int was_enabled;
	int ret;

	mutex_lock(&stack_sysctl_mutex);
	was_enabled = !!stack_tracer_enabled;

	ret = proc_dointvec(table, write, buffer, lenp, ppos);

	if (ret || !write || (was_enabled == !!stack_tracer_enabled))
		goto out;

	if (stack_tracer_enabled)
		register_ftrace_function(&trace_ops);
	else
		unregister_ftrace_function(&trace_ops);
 out:
	mutex_unlock(&stack_sysctl_mutex);
	return ret;
}

static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata;

static __init int enable_stacktrace(char *str)
{
	int len;

	if ((len = str_has_prefix(str, "_filter=")))
		strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE);

	stack_tracer_enabled = 1;
	return 1;
}
__setup("stacktrace", enable_stacktrace);

static __init int stack_trace_init(void)
{
	struct dentry *d_tracer;

	d_tracer = tracing_init_dentry();
	if (IS_ERR(d_tracer))
		return 0;

	trace_create_file("stack_max_size", 0644, d_tracer,
			  &stack_trace_max_size, &stack_max_size_fops);

	trace_create_file("stack_trace", 0444, d_tracer,
			  NULL, &stack_trace_fops);

#ifdef CONFIG_DYNAMIC_FTRACE
	trace_create_file("stack_trace_filter", 0644, d_tracer,
			  &trace_ops, &stack_trace_filter_fops);
#endif

	if (stack_trace_filter_buf[0])
		ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1);

	if (stack_tracer_enabled)
		register_ftrace_function(&trace_ops);

	return 0;
}

device_initcall(stack_trace_init);
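/*
 * Boot-time usage sketch, based on enable_stacktrace() above (the function
 * list is only an example): adding "stacktrace" to the kernel command line
 * enables the tracer at boot, and "stacktrace_filter=" can pre-load a
 * function filter, e.g.:
 *
 *	stacktrace
 *	stacktrace_filter=kmem_cache_alloc,vfs_read
 *
 * As written, either form sets stack_tracer_enabled.
 */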