1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Kernel Probes Jump Optimization (Optprobes) 4 * 5 * Copyright (C) IBM Corporation, 2002, 2004 6 * Copyright (C) Hitachi Ltd., 2012 7 */ 8 #include <linux/kprobes.h> 9 #include <linux/perf_event.h> 10 #include <linux/ptrace.h> 11 #include <linux/string.h> 12 #include <linux/slab.h> 13 #include <linux/hardirq.h> 14 #include <linux/preempt.h> 15 #include <linux/extable.h> 16 #include <linux/kdebug.h> 17 #include <linux/kallsyms.h> 18 #include <linux/ftrace.h> 19 #include <linux/objtool.h> 20 #include <linux/pgtable.h> 21 #include <linux/static_call.h> 22 23 #include <asm/text-patching.h> 24 #include <asm/cacheflush.h> 25 #include <asm/desc.h> 26 #include <linux/uaccess.h> 27 #include <asm/alternative.h> 28 #include <asm/insn.h> 29 #include <asm/debugreg.h> 30 #include <asm/set_memory.h> 31 #include <asm/sections.h> 32 #include <asm/nospec-branch.h> 33 34 #include "common.h" 35 36 unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) 37 { 38 struct optimized_kprobe *op; 39 struct kprobe *kp; 40 long offs; 41 int i; 42 43 for (i = 0; i < JMP32_INSN_SIZE; i++) { 44 kp = get_kprobe((void *)addr - i); 45 /* This function only handles jump-optimized kprobe */ 46 if (kp && kprobe_optimized(kp)) { 47 op = container_of(kp, struct optimized_kprobe, kp); 48 /* If op->list is not empty, op is under optimizing */ 49 if (list_empty(&op->list)) 50 goto found; 51 } 52 } 53 54 return addr; 55 found: 56 /* 57 * If the kprobe can be optimized, original bytes which can be 58 * overwritten by jump destination address. In this case, original 59 * bytes must be recovered from op->optinsn.copied_insn buffer. 60 */ 61 if (copy_from_kernel_nofault(buf, (void *)addr, 62 MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) 63 return 0UL; 64 65 if (addr == (unsigned long)kp->addr) { 66 buf[0] = kp->opcode; 67 memcpy(buf + 1, op->optinsn.copied_insn, DISP32_SIZE); 68 } else { 69 offs = addr - (unsigned long)kp->addr - 1; 70 memcpy(buf, op->optinsn.copied_insn + offs, DISP32_SIZE - offs); 71 } 72 73 return (unsigned long)buf; 74 } 75 76 static void synthesize_clac(kprobe_opcode_t *addr) 77 { 78 /* 79 * Can't be static_cpu_has() due to how objtool treats this feature bit. 80 * This isn't a fast path anyway. 81 */ 82 if (!boot_cpu_has(X86_FEATURE_SMAP)) 83 return; 84 85 /* Replace the NOP3 with CLAC */ 86 addr[0] = 0x0f; 87 addr[1] = 0x01; 88 addr[2] = 0xca; 89 } 90 91 /* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ 92 static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) 93 { 94 #ifdef CONFIG_X86_64 95 *addr++ = 0x48; 96 *addr++ = 0xbf; 97 #else 98 *addr++ = 0xb8; 99 #endif 100 *(unsigned long *)addr = val; 101 } 102 103 asm ( 104 ".pushsection .rodata\n" 105 "optprobe_template_func:\n" 106 ".global optprobe_template_entry\n" 107 "optprobe_template_entry:\n" 108 #ifdef CONFIG_X86_64 109 /* We don't bother saving the ss register */ 110 " pushq %rsp\n" 111 " pushfq\n" 112 ".global optprobe_template_clac\n" 113 "optprobe_template_clac:\n" 114 ASM_NOP3 115 SAVE_REGS_STRING 116 " movq %rsp, %rsi\n" 117 ".global optprobe_template_val\n" 118 "optprobe_template_val:\n" 119 ASM_NOP5 120 ASM_NOP5 121 ".global optprobe_template_call\n" 122 "optprobe_template_call:\n" 123 ASM_NOP5 124 /* Move flags to rsp */ 125 " movq 18*8(%rsp), %rdx\n" 126 " movq %rdx, 19*8(%rsp)\n" 127 RESTORE_REGS_STRING 128 /* Skip flags entry */ 129 " addq $8, %rsp\n" 130 " popfq\n" 131 #else /* CONFIG_X86_32 */ 132 " pushl %esp\n" 133 " pushfl\n" 134 ".global optprobe_template_clac\n" 135 "optprobe_template_clac:\n" 136 ASM_NOP3 137 SAVE_REGS_STRING 138 " movl %esp, %edx\n" 139 ".global optprobe_template_val\n" 140 "optprobe_template_val:\n" 141 ASM_NOP5 142 ".global optprobe_template_call\n" 143 "optprobe_template_call:\n" 144 ASM_NOP5 145 /* Move flags into esp */ 146 " movl 14*4(%esp), %edx\n" 147 " movl %edx, 15*4(%esp)\n" 148 RESTORE_REGS_STRING 149 /* Skip flags entry */ 150 " addl $4, %esp\n" 151 " popfl\n" 152 #endif 153 ".global optprobe_template_end\n" 154 "optprobe_template_end:\n" 155 ".popsection\n"); 156 157 void optprobe_template_func(void); 158 STACK_FRAME_NON_STANDARD(optprobe_template_func); 159 160 #define TMPL_CLAC_IDX \ 161 ((long)optprobe_template_clac - (long)optprobe_template_entry) 162 #define TMPL_MOVE_IDX \ 163 ((long)optprobe_template_val - (long)optprobe_template_entry) 164 #define TMPL_CALL_IDX \ 165 ((long)optprobe_template_call - (long)optprobe_template_entry) 166 #define TMPL_END_IDX \ 167 ((long)optprobe_template_end - (long)optprobe_template_entry) 168 169 /* Optimized kprobe call back function: called from optinsn */ 170 static void 171 optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) 172 { 173 /* This is possible if op is under delayed unoptimizing */ 174 if (kprobe_disabled(&op->kp)) 175 return; 176 177 preempt_disable(); 178 if (kprobe_running()) { 179 kprobes_inc_nmissed_count(&op->kp); 180 } else { 181 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); 182 /* Save skipped registers */ 183 regs->cs = __KERNEL_CS; 184 #ifdef CONFIG_X86_32 185 regs->gs = 0; 186 #endif 187 regs->ip = (unsigned long)op->kp.addr + INT3_INSN_SIZE; 188 regs->orig_ax = ~0UL; 189 190 __this_cpu_write(current_kprobe, &op->kp); 191 kcb->kprobe_status = KPROBE_HIT_ACTIVE; 192 opt_pre_handler(&op->kp, regs); 193 __this_cpu_write(current_kprobe, NULL); 194 } 195 preempt_enable(); 196 } 197 NOKPROBE_SYMBOL(optimized_callback); 198 199 static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real) 200 { 201 struct insn insn; 202 int len = 0, ret; 203 204 while (len < JMP32_INSN_SIZE) { 205 ret = __copy_instruction(dest + len, src + len, real + len, &insn); 206 if (!ret || !can_boost(&insn, src + len)) 207 return -EINVAL; 208 len += ret; 209 } 210 /* Check whether the address range is reserved */ 211 if (ftrace_text_reserved(src, src + len - 1) || 212 alternatives_text_reserved(src, src + len - 1) || 213 jump_label_text_reserved(src, src + len - 1) || 214 static_call_text_reserved(src, src + len - 1)) 215 return -EBUSY; 216 217 return len; 218 } 219 220 /* Check whether insn is indirect jump */ 221 static int __insn_is_indirect_jump(struct insn *insn) 222 { 223 return ((insn->opcode.bytes[0] == 0xff && 224 (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ 225 insn->opcode.bytes[0] == 0xea); /* Segment based jump */ 226 } 227 228 /* Check whether insn jumps into specified address range */ 229 static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) 230 { 231 unsigned long target = 0; 232 233 switch (insn->opcode.bytes[0]) { 234 case 0xe0: /* loopne */ 235 case 0xe1: /* loope */ 236 case 0xe2: /* loop */ 237 case 0xe3: /* jcxz */ 238 case 0xe9: /* near relative jump */ 239 case 0xeb: /* short relative jump */ 240 break; 241 case 0x0f: 242 if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ 243 break; 244 return 0; 245 default: 246 if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ 247 break; 248 return 0; 249 } 250 target = (unsigned long)insn->next_byte + insn->immediate.value; 251 252 return (start <= target && target <= start + len); 253 } 254 255 static int insn_is_indirect_jump(struct insn *insn) 256 { 257 int ret = __insn_is_indirect_jump(insn); 258 259 #ifdef CONFIG_RETPOLINE 260 /* 261 * Jump to x86_indirect_thunk_* is treated as an indirect jump. 262 * Note that even with CONFIG_RETPOLINE=y, the kernel compiled with 263 * older gcc may use indirect jump. So we add this check instead of 264 * replace indirect-jump check. 265 */ 266 if (!ret) 267 ret = insn_jump_into_range(insn, 268 (unsigned long)__indirect_thunk_start, 269 (unsigned long)__indirect_thunk_end - 270 (unsigned long)__indirect_thunk_start); 271 #endif 272 return ret; 273 } 274 275 static bool is_padding_int3(unsigned long addr, unsigned long eaddr) 276 { 277 unsigned char ops; 278 279 for (; addr < eaddr; addr++) { 280 if (get_kernel_nofault(ops, (void *)addr) < 0 || 281 ops != INT3_INSN_OPCODE) 282 return false; 283 } 284 285 return true; 286 } 287 288 /* Decode whole function to ensure any instructions don't jump into target */ 289 static int can_optimize(unsigned long paddr) 290 { 291 unsigned long addr, size = 0, offset = 0; 292 struct insn insn; 293 kprobe_opcode_t buf[MAX_INSN_SIZE]; 294 295 /* Lookup symbol including addr */ 296 if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) 297 return 0; 298 299 /* 300 * Do not optimize in the entry code due to the unstable 301 * stack handling and registers setup. 302 */ 303 if (((paddr >= (unsigned long)__entry_text_start) && 304 (paddr < (unsigned long)__entry_text_end))) 305 return 0; 306 307 /* Check there is enough space for a relative jump. */ 308 if (size - offset < JMP32_INSN_SIZE) 309 return 0; 310 311 /* Decode instructions */ 312 addr = paddr - offset; 313 while (addr < paddr - offset + size) { /* Decode until function end */ 314 unsigned long recovered_insn; 315 int ret; 316 317 if (search_exception_tables(addr)) 318 /* 319 * Since some fixup code will jumps into this function, 320 * we can't optimize kprobe in this function. 321 */ 322 return 0; 323 recovered_insn = recover_probed_instruction(buf, addr); 324 if (!recovered_insn) 325 return 0; 326 327 ret = insn_decode_kernel(&insn, (void *)recovered_insn); 328 if (ret < 0) 329 return 0; 330 331 /* 332 * In the case of detecting unknown breakpoint, this could be 333 * a padding INT3 between functions. Let's check that all the 334 * rest of the bytes are also INT3. 335 */ 336 if (insn.opcode.bytes[0] == INT3_INSN_OPCODE) 337 return is_padding_int3(addr, paddr - offset + size) ? 1 : 0; 338 339 /* Recover address */ 340 insn.kaddr = (void *)addr; 341 insn.next_byte = (void *)(addr + insn.length); 342 /* Check any instructions don't jump into target */ 343 if (insn_is_indirect_jump(&insn) || 344 insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE, 345 DISP32_SIZE)) 346 return 0; 347 addr += insn.length; 348 } 349 350 return 1; 351 } 352 353 /* Check optimized_kprobe can actually be optimized. */ 354 int arch_check_optimized_kprobe(struct optimized_kprobe *op) 355 { 356 int i; 357 struct kprobe *p; 358 359 for (i = 1; i < op->optinsn.size; i++) { 360 p = get_kprobe(op->kp.addr + i); 361 if (p && !kprobe_disabled(p)) 362 return -EEXIST; 363 } 364 365 return 0; 366 } 367 368 /* Check the addr is within the optimized instructions. */ 369 int arch_within_optimized_kprobe(struct optimized_kprobe *op, 370 unsigned long addr) 371 { 372 return ((unsigned long)op->kp.addr <= addr && 373 (unsigned long)op->kp.addr + op->optinsn.size > addr); 374 } 375 376 /* Free optimized instruction slot */ 377 static 378 void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) 379 { 380 u8 *slot = op->optinsn.insn; 381 if (slot) { 382 int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE; 383 384 /* Record the perf event before freeing the slot */ 385 if (dirty) 386 perf_event_text_poke(slot, slot, len, NULL, 0); 387 388 free_optinsn_slot(slot, dirty); 389 op->optinsn.insn = NULL; 390 op->optinsn.size = 0; 391 } 392 } 393 394 void arch_remove_optimized_kprobe(struct optimized_kprobe *op) 395 { 396 __arch_remove_optimized_kprobe(op, 1); 397 } 398 399 /* 400 * Copy replacing target instructions 401 * Target instructions MUST be relocatable (checked inside) 402 * This is called when new aggr(opt)probe is allocated or reused. 403 */ 404 int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, 405 struct kprobe *__unused) 406 { 407 u8 *buf = NULL, *slot; 408 int ret, len; 409 long rel; 410 411 if (!can_optimize((unsigned long)op->kp.addr)) 412 return -EILSEQ; 413 414 buf = kzalloc(MAX_OPTINSN_SIZE, GFP_KERNEL); 415 if (!buf) 416 return -ENOMEM; 417 418 op->optinsn.insn = slot = get_optinsn_slot(); 419 if (!slot) { 420 ret = -ENOMEM; 421 goto out; 422 } 423 424 /* 425 * Verify if the address gap is in 2GB range, because this uses 426 * a relative jump. 427 */ 428 rel = (long)slot - (long)op->kp.addr + JMP32_INSN_SIZE; 429 if (abs(rel) > 0x7fffffff) { 430 ret = -ERANGE; 431 goto err; 432 } 433 434 /* Copy arch-dep-instance from template */ 435 memcpy(buf, optprobe_template_entry, TMPL_END_IDX); 436 437 /* Copy instructions into the out-of-line buffer */ 438 ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr, 439 slot + TMPL_END_IDX); 440 if (ret < 0) 441 goto err; 442 op->optinsn.size = ret; 443 len = TMPL_END_IDX + op->optinsn.size; 444 445 synthesize_clac(buf + TMPL_CLAC_IDX); 446 447 /* Set probe information */ 448 synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); 449 450 /* Set probe function call */ 451 synthesize_relcall(buf + TMPL_CALL_IDX, 452 slot + TMPL_CALL_IDX, optimized_callback); 453 454 /* Set returning jmp instruction at the tail of out-of-line buffer */ 455 synthesize_reljump(buf + len, slot + len, 456 (u8 *)op->kp.addr + op->optinsn.size); 457 len += JMP32_INSN_SIZE; 458 459 /* 460 * Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also 461 * used in __arch_remove_optimized_kprobe(). 462 */ 463 464 /* We have to use text_poke() for instruction buffer because it is RO */ 465 perf_event_text_poke(slot, NULL, 0, buf, len); 466 text_poke(slot, buf, len); 467 468 ret = 0; 469 out: 470 kfree(buf); 471 return ret; 472 473 err: 474 __arch_remove_optimized_kprobe(op, 0); 475 goto out; 476 } 477 478 /* 479 * Replace breakpoints (INT3) with relative jumps (JMP.d32). 480 * Caller must call with locking kprobe_mutex and text_mutex. 481 * 482 * The caller will have installed a regular kprobe and after that issued 483 * syncrhonize_rcu_tasks(), this ensures that the instruction(s) that live in 484 * the 4 bytes after the INT3 are unused and can now be overwritten. 485 */ 486 void arch_optimize_kprobes(struct list_head *oplist) 487 { 488 struct optimized_kprobe *op, *tmp; 489 u8 insn_buff[JMP32_INSN_SIZE]; 490 491 list_for_each_entry_safe(op, tmp, oplist, list) { 492 s32 rel = (s32)((long)op->optinsn.insn - 493 ((long)op->kp.addr + JMP32_INSN_SIZE)); 494 495 WARN_ON(kprobe_disabled(&op->kp)); 496 497 /* Backup instructions which will be replaced by jump address */ 498 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_INSN_SIZE, 499 DISP32_SIZE); 500 501 insn_buff[0] = JMP32_INSN_OPCODE; 502 *(s32 *)(&insn_buff[1]) = rel; 503 504 text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL); 505 506 list_del_init(&op->list); 507 } 508 } 509 510 /* 511 * Replace a relative jump (JMP.d32) with a breakpoint (INT3). 512 * 513 * After that, we can restore the 4 bytes after the INT3 to undo what 514 * arch_optimize_kprobes() scribbled. This is safe since those bytes will be 515 * unused once the INT3 lands. 516 */ 517 void arch_unoptimize_kprobe(struct optimized_kprobe *op) 518 { 519 u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, }; 520 u8 old[JMP32_INSN_SIZE]; 521 u8 *addr = op->kp.addr; 522 523 memcpy(old, op->kp.addr, JMP32_INSN_SIZE); 524 memcpy(new + INT3_INSN_SIZE, 525 op->optinsn.copied_insn, 526 JMP32_INSN_SIZE - INT3_INSN_SIZE); 527 528 text_poke(addr, new, INT3_INSN_SIZE); 529 text_poke_sync(); 530 text_poke(addr + INT3_INSN_SIZE, 531 new + INT3_INSN_SIZE, 532 JMP32_INSN_SIZE - INT3_INSN_SIZE); 533 text_poke_sync(); 534 535 perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE); 536 } 537 538 /* 539 * Recover original instructions and breakpoints from relative jumps. 540 * Caller must call with locking kprobe_mutex. 541 */ 542 extern void arch_unoptimize_kprobes(struct list_head *oplist, 543 struct list_head *done_list) 544 { 545 struct optimized_kprobe *op, *tmp; 546 547 list_for_each_entry_safe(op, tmp, oplist, list) { 548 arch_unoptimize_kprobe(op); 549 list_move(&op->list, done_list); 550 } 551 } 552 553 int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) 554 { 555 struct optimized_kprobe *op; 556 557 if (p->flags & KPROBE_FLAG_OPTIMIZED) { 558 /* This kprobe is really able to run optimized path. */ 559 op = container_of(p, struct optimized_kprobe, kp); 560 /* Detour through copied instructions */ 561 regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; 562 if (!reenter) 563 reset_current_kprobe(); 564 return 1; 565 } 566 return 0; 567 } 568 NOKPROBE_SYMBOL(setup_detour_execution); 569