/*
 * Code for replacing ftrace calls with jumps.
 *
 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
 *
 * Thanks goes to Ingo Molnar, for suggesting the idea.
 * Mathieu Desnoyers, for suggesting postponing the modifications.
 * Arjan van de Ven, for keeping me straight, and explaining to me
 * the dangers of modifying code on the run.
 */

#include <linux/spinlock.h>
#include <linux/hardirq.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/list.h>

#include <trace/syscall.h>

#include <asm/cacheflush.h>
#include <asm/ftrace.h>
#include <asm/nops.h>
#include <asm/nmi.h>


#ifdef CONFIG_DYNAMIC_FTRACE

int ftrace_arch_code_modify_prepare(void)
{
	set_kernel_text_rw();
	return 0;
}

int ftrace_arch_code_modify_post_process(void)
{
	set_kernel_text_ro();
	return 0;
}

union ftrace_code_union {
	char code[MCOUNT_INSN_SIZE];
	struct {
		char e8;
		int offset;
	} __attribute__((packed));
};

static int ftrace_calc_offset(long ip, long addr)
{
	return (int)(addr - ip);
}

static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
{
	static union ftrace_code_union calc;

	calc.e8		= 0xe8;
	calc.offset	= ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);

	/*
	 * No locking needed, this must be called via kstop_machine
	 * which in essence is like running on a uniprocessor machine.
	 */
	return calc.code;
}
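
/*
 * For illustration only (the addresses below are made up): a call site at
 * ip = 0x1000 that should call addr = 0x2000 gets the 5-byte near call
 *
 *	e8 fb 0f 00 00
 *
 * because the rel32 operand of CALL (opcode 0xe8) is measured from the end
 * of the instruction: 0x2000 - (0x1000 + MCOUNT_INSN_SIZE) = 0xffb, stored
 * little endian after the opcode byte.
 */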

/*
 * Modifying code must take extra care. On an SMP machine, if
 * the code being modified is also being executed on another CPU,
 * that CPU will have undefined results and possibly take a GPF.
 * We use kstop_machine to stop other CPUs from executing code.
 * But this does not stop NMIs from happening. We still need
 * to protect against that. We separate out the modification of
 * the code to take care of this.
 *
 * Two buffers are added: an IP buffer and a "code" buffer.
 *
 * 1) Put the instruction pointer into the IP buffer
 *    and the new code into the "code" buffer.
 * 2) Wait for any running NMIs to finish and set a flag that says
 *    we are modifying code; this is done in an atomic operation.
 * 3) Write the code.
 * 4) Clear the flag.
 * 5) Wait for any running NMIs to finish.
 *
 * If an NMI is executed, the first thing it does is call
 * "ftrace_nmi_enter". This will check if the flag is set to write,
 * and if it is, it will write what is in the IP and "code" buffers.
 *
 * The trick is that it does not matter if more than one CPU writes
 * to the code location, as long as they all write the same content.
 * Also, if a CPU is executing that code, it is OK to write to the
 * location as long as the contents being written are the same as
 * what is already there.
 */

#define MOD_CODE_WRITE_FLAG (1 << 31)	/* set when NMI should do the write */
static atomic_t nmi_running = ATOMIC_INIT(0);
static int mod_code_status;		/* holds return value of text write */
static void *mod_code_ip;		/* holds the IP to write to */
static void *mod_code_newcode;		/* holds the text to write to the IP */

static unsigned nmi_wait_count;
static atomic_t nmi_update_count = ATOMIC_INIT(0);

int ftrace_arch_read_dyn_info(char *buf, int size)
{
	int r;

	r = snprintf(buf, size, "%u %u",
		     nmi_wait_count,
		     atomic_read(&nmi_update_count));
	return r;
}

static void clear_mod_flag(void)
{
	int old = atomic_read(&nmi_running);

	for (;;) {
		int new = old & ~MOD_CODE_WRITE_FLAG;

		if (old == new)
			break;

		old = atomic_cmpxchg(&nmi_running, old, new);
	}
}

static void ftrace_mod_code(void)
{
	/*
	 * Yes, more than one CPU can be writing to mod_code_status
	 * (and to the code itself). But if one write fails, then they
	 * all should fail, and if one succeeds, then they all should
	 * succeed, since every writer writes the same bytes.
	 */
	mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
					     MCOUNT_INSN_SIZE);

	/* if we fail, then kill any new writers */
	if (mod_code_status)
		clear_mod_flag();
}

void ftrace_nmi_enter(void)
{
	if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
		smp_rmb();
		ftrace_mod_code();
		atomic_inc(&nmi_update_count);
	}
	/* Must have previous changes seen before executions */
	smp_mb();
}

void ftrace_nmi_exit(void)
{
	/* Finish all executions before clearing nmi_running */
	smp_mb();
	atomic_dec(&nmi_running);
}

static void wait_for_nmi_and_set_mod_flag(void)
{
	if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG))
		return;

	do {
		cpu_relax();
	} while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG));

	nmi_wait_count++;
}

static void wait_for_nmi(void)
{
	if (!atomic_read(&nmi_running))
		return;

	do {
		cpu_relax();
	} while (atomic_read(&nmi_running));

	nmi_wait_count++;
}

static int
do_ftrace_mod_code(unsigned long ip, void *new_code)
{
	mod_code_ip = (void *)ip;
	mod_code_newcode = new_code;

	/* The buffers need to be visible before we let NMIs write them */
	smp_mb();

	wait_for_nmi_and_set_mod_flag();

	/* Make sure all running NMIs have finished before we write the code */
	smp_mb();

	ftrace_mod_code();

	/* Make sure the write happens before clearing the bit */
	smp_mb();

	clear_mod_flag();
	wait_for_nmi();

	return mod_code_status;
}
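
/*
 * A rough sketch of the protocol above (one possible interleaving,
 * shown only for illustration):
 *
 *   modifying CPU                           CPU taking an NMI
 *   -------------                           -----------------
 *   fill mod_code_ip / mod_code_newcode
 *   smp_mb()
 *   cmpxchg(nmi_running, 0, FLAG)           atomic_inc_return(&nmi_running)
 *     (spins while NMIs are in flight)        sees MOD_CODE_WRITE_FLAG set
 *   ftrace_mod_code()                         ftrace_mod_code()
 *   clear_mod_flag()                        atomic_dec(&nmi_running)
 *   wait_for_nmi()
 *
 * Both sides may write the same bytes to the same address, which is
 * harmless as noted above. nmi_running packs the count of in-flight NMIs
 * in the low bits and MOD_CODE_WRITE_FLAG in bit 31, so a single atomic
 * operation both registers the NMI and tells it whether it must do the
 * write itself.
 */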

static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];

static unsigned char *ftrace_nop_replace(void)
{
	return ftrace_nop;
}

static int
ftrace_modify_code(unsigned long ip, unsigned char *old_code,
		   unsigned char *new_code)
{
	unsigned char replaced[MCOUNT_INSN_SIZE];

	/*
	 * Note: Due to modules and __init, code can
	 * disappear and change; we need to protect against faulting
	 * as well as code changing. We do this by using the
	 * probe_kernel_* functions.
	 *
	 * No real locking needed, this code is run through
	 * kstop_machine, or before SMP starts.
	 */

	/* read the text we want to modify */
	if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
		return -EFAULT;

	/* Make sure it is what we expect it to be */
	if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
		return -EINVAL;

	/* replace the text with the new text */
	if (do_ftrace_mod_code(ip, new_code))
		return -EPERM;

	sync_core();

	return 0;
}

int ftrace_make_nop(struct module *mod,
		    struct dyn_ftrace *rec, unsigned long addr)
{
	unsigned char *new, *old;
	unsigned long ip = rec->ip;

	old = ftrace_call_replace(ip, addr);
	new = ftrace_nop_replace();

	return ftrace_modify_code(rec->ip, old, new);
}

int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
{
	unsigned char *new, *old;
	unsigned long ip = rec->ip;

	old = ftrace_nop_replace();
	new = ftrace_call_replace(ip, addr);

	return ftrace_modify_code(rec->ip, old, new);
}

int ftrace_update_ftrace_func(ftrace_func_t func)
{
	unsigned long ip = (unsigned long)(&ftrace_call);
	unsigned char old[MCOUNT_INSN_SIZE], *new;
	int ret;

	memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
	new = ftrace_call_replace(ip, (unsigned long)func);
	ret = ftrace_modify_code(ip, old, new);

	return ret;
}

int __init ftrace_dyn_arch_init(void *data)
{
	extern const unsigned char ftrace_test_p6nop[];
	extern const unsigned char ftrace_test_nop5[];
	extern const unsigned char ftrace_test_jmp[];
	int faulted = 0;

	/*
	 * There is no good nop for all x86 archs.
	 * We will default to using the P6_NOP5, but first we
	 * will test to make sure that the nop will actually
	 * work on this CPU. If it faults, we will then
	 * go to a less efficient 5-byte nop. If that fails
	 * we then just use a jmp as our nop. This isn't the most
	 * efficient nop, but we cannot use a multi-part nop,
	 * since we would then risk being preempted in the middle
	 * of that nop, and if tracing were enabled at that point,
	 * it might cause a system crash.
	 *
	 * TODO: check the cpuid to determine the best nop.
	 */
	asm volatile (
		"ftrace_test_jmp:"
		"jmp ftrace_test_p6nop\n"
		"nop\n"
		"nop\n"
		"nop\n"  /* 2 byte jmp + 3 bytes */
		"ftrace_test_p6nop:"
		P6_NOP5
		"jmp 1f\n"
		"ftrace_test_nop5:"
		".byte 0x66,0x66,0x66,0x66,0x90\n"
		"1:"
		".section .fixup, \"ax\"\n"
		"2:	movl $1, %0\n"
		"	jmp ftrace_test_nop5\n"
		"3:	movl $2, %0\n"
		"	jmp 1b\n"
		".previous\n"
		_ASM_EXTABLE(ftrace_test_p6nop, 2b)
		_ASM_EXTABLE(ftrace_test_nop5, 3b)
		: "=r"(faulted) : "0" (faulted));

	switch (faulted) {
	case 0:
		pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n");
		memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
		break;
	case 1:
		pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n");
		memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
		break;
	case 2:
		pr_info("ftrace: converting mcount calls to jmp . + 5\n");
		memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
		break;
	}

	/* The return code is returned via data */
	*(unsigned long *)data = 0;

	return 0;
}
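
/*
 * Putting the pieces above together (hypothetical setup, for illustration
 * only): if the P6 nop was selected at boot and a traced call site lives
 * at rec->ip, then
 *
 *   ftrace_make_nop()  rewrites "e8 <rel32>"        -> "0f 1f 44 00 00"
 *   ftrace_make_call() rewrites "0f 1f 44 00 00"    -> "e8 <rel32 to addr>"
 *
 * In both cases ftrace_modify_code() first reads the site and verifies
 * with memcmp() that the old bytes are exactly what was expected before
 * do_ftrace_mod_code() writes the new ones.
 */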
#endif

#ifdef CONFIG_FUNCTION_GRAPH_TRACER

#ifdef CONFIG_DYNAMIC_FTRACE
extern void ftrace_graph_call(void);

static int ftrace_mod_jmp(unsigned long ip,
			  int old_offset, int new_offset)
{
	unsigned char code[MCOUNT_INSN_SIZE];

	if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
		return -EFAULT;

	if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
		return -EINVAL;

	*(int *)(&code[1]) = new_offset;

	if (do_ftrace_mod_code(ip, &code))
		return -EPERM;

	return 0;
}

int ftrace_enable_ftrace_graph_caller(void)
{
	unsigned long ip = (unsigned long)(&ftrace_graph_call);
	int old_offset, new_offset;

	old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
	new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);

	return ftrace_mod_jmp(ip, old_offset, new_offset);
}

int ftrace_disable_ftrace_graph_caller(void)
{
	unsigned long ip = (unsigned long)(&ftrace_graph_call);
	int old_offset, new_offset;

	old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
	new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);

	return ftrace_mod_jmp(ip, old_offset, new_offset);
}
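
/*
 * In other words (for illustration): the ftrace_graph_call site, set up in
 * the arch's ftrace assembly, is a 5-byte "jmp rel32" (opcode 0xe9).
 * Enabling the graph tracer retargets that jump from ftrace_stub to
 * ftrace_graph_caller, and disabling it points the jump back at
 * ftrace_stub. ftrace_mod_jmp() only patches the rel32 operand, after
 * checking that the current operand matches the expected old target.
 */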

#endif /* !CONFIG_DYNAMIC_FTRACE */

/*
 * Hook the return address and push it on the stack of return addresses
 * in the current thread info.
 */
void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
			   unsigned long frame_pointer)
{
	unsigned long old;
	int faulted;
	struct ftrace_graph_ent trace;
	unsigned long return_hooker = (unsigned long)
				&return_to_handler;

	if (unlikely(atomic_read(&current->tracing_graph_pause)))
		return;

	/*
	 * Protect against faults, even if they shouldn't
	 * happen. This tool is too intrusive to
	 * ignore such a protection.
	 */
	asm volatile(
		"1: " _ASM_MOV " (%[parent]), %[old]\n"
		"2: " _ASM_MOV " %[return_hooker], (%[parent])\n"
		"   movl $0, %[faulted]\n"
		"3:\n"

		".section .fixup, \"ax\"\n"
		"4: movl $1, %[faulted]\n"
		"   jmp 3b\n"
		".previous\n"

		_ASM_EXTABLE(1b, 4b)
		_ASM_EXTABLE(2b, 4b)

		: [old] "=&r" (old), [faulted] "=r" (faulted)
		: [parent] "r" (parent), [return_hooker] "r" (return_hooker)
		: "memory"
	);

	if (unlikely(faulted)) {
		ftrace_graph_stop();
		WARN_ON(1);
		return;
	}

	if (ftrace_push_return_trace(old, self_addr, &trace.depth,
				     frame_pointer) == -EBUSY) {
		*parent = old;
		return;
	}

	trace.func = self_addr;

	/* Only trace if the calling function expects to */
	if (!ftrace_graph_entry(&trace)) {
		current->curr_ret_stack--;
		*parent = old;
	}
}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */

#ifdef CONFIG_FTRACE_SYSCALLS

extern unsigned long __start_syscalls_metadata[];
extern unsigned long __stop_syscalls_metadata[];
extern unsigned long *sys_call_table;

static struct syscall_metadata **syscalls_metadata;

static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
{
	struct syscall_metadata *start;
	struct syscall_metadata *stop;
	char str[KSYM_SYMBOL_LEN];

	start = (struct syscall_metadata *)__start_syscalls_metadata;
	stop = (struct syscall_metadata *)__stop_syscalls_metadata;
	kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);

	for ( ; start < stop; start++) {
		if (start->name && !strcmp(start->name, str))
			return start;
	}
	return NULL;
}

struct syscall_metadata *syscall_nr_to_meta(int nr)
{
	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
		return NULL;

	return syscalls_metadata[nr];
}

int syscall_name_to_nr(char *name)
{
	int i;

	if (!syscalls_metadata)
		return -1;

	for (i = 0; i < NR_syscalls; i++) {
		if (syscalls_metadata[i]) {
			if (!strcmp(syscalls_metadata[i]->name, name))
				return i;
		}
	}
	return -1;
}

void set_syscall_enter_id(int num, int id)
{
	syscalls_metadata[num]->enter_id = id;
}

void set_syscall_exit_id(int num, int id)
{
	syscalls_metadata[num]->exit_id = id;
}

static int __init arch_init_ftrace_syscalls(void)
{
	int i;
	struct syscall_metadata *meta;
	unsigned long **psys_syscall_table = &sys_call_table;

	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
					NR_syscalls, GFP_KERNEL);
	if (!syscalls_metadata) {
		WARN_ON(1);
		return -ENOMEM;
	}

	for (i = 0; i < NR_syscalls; i++) {
		meta = find_syscall_meta(psys_syscall_table[i]);
		syscalls_metadata[i] = meta;
	}
	return 0;
}
arch_initcall(arch_init_ftrace_syscalls);
#endif
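
/*
 * The table built above is a plain syscall number -> metadata map. As a
 * sketch (the numbers are arch dependent and only hypothetical here): if
 * entry 5 of sys_call_table points at sys_open, kallsyms_lookup() resolves
 * that address to the string "sys_open", find_syscall_meta() matches it
 * against the metadata records in the __syscalls_metadata section, and
 * syscall_nr_to_meta(5) / syscall_name_to_nr("sys_open") then translate
 * between number and name for the syscall tracer.
 */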