/*
 * User-space Probes (UProbes) for x86
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2008-2011
 * Authors:
 *	Srikar Dronamraju
 *	Jim Keniston
 */
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/ptrace.h>
#include <linux/uprobes.h>
#include <linux/uaccess.h>

#include <linux/kdebug.h>
#include <asm/processor.h>
#include <asm/insn.h>

/* Post-execution fixups. */

/* Adjust IP back to vicinity of actual insn */
#define UPROBE_FIX_IP		0x01

/* Adjust the return address of a call insn */
#define UPROBE_FIX_CALL		0x02

/* Instruction will modify TF, don't change it */
#define UPROBE_FIX_SETF		0x04

#define UPROBE_FIX_RIP_SI	0x08
#define UPROBE_FIX_RIP_DI	0x10
#define UPROBE_FIX_RIP_BX	0x20
#define UPROBE_FIX_RIP_MASK	\
	(UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX)

#define	UPROBE_TRAP_NR		UINT_MAX

/* Adaptations for mhiramat x86 decoder v14. */
#define OPCODE1(insn)		((insn)->opcode.bytes[0])
#define OPCODE2(insn)		((insn)->opcode.bytes[1])
#define OPCODE3(insn)		((insn)->opcode.bytes[2])
#define MODRM_REG(insn)		X86_MODRM_REG((insn)->modrm.value)

#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
	(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
	  (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
	  (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
	  (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
	 << (row % 32))

/*
 * Good-instruction tables for 32-bit apps.  This is non-const and volatile
 * to keep gcc from statically optimizing it out, as variable_test_bit makes
 * some versions of gcc think only *(unsigned long*) is used.
 */
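/*
 * For example, a pair of rows such as W(0x40, ...) | W(0x50, ...) is shifted
 * by (row % 32), so each pair packs 32 good/bad flags into one u32 element of
 * the table; test_bit(opcode_byte, table) then reads back a single flag.
 */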
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static volatile u32 good_insns_32[256 / 32] = {
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
	/*      ----------------------------------------------         */
	W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */
	W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
	W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */
	W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */
	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
	W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
	W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
	W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
	W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
	W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
	W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
	W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)   /* f0 */
	/*      ----------------------------------------------         */
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
};
#else
#define good_insns_32	NULL
#endif

/* Good-instruction tables for 64-bit apps */
#if defined(CONFIG_X86_64)
static volatile u32 good_insns_64[256 / 32] = {
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
	/*      ----------------------------------------------         */
	W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */
	W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
	W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */
	W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */
	W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
	W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
	W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
	W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
	W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
	W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
	W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
	W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)   /* f0 */
	/*      ----------------------------------------------         */
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
};
#else
#define good_insns_64	NULL
#endif

/* Using this for both 64-bit and 32-bit apps */
static volatile u32 good_2byte_insns[256 / 32] = {
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
	/*      ----------------------------------------------         */
	W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
	W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
	W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
	W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
	W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
	W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
	W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
	W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
	W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
	W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
	W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)   /* f0 */
	/*      ----------------------------------------------         */
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
};
#undef W

/*
 * opcodes we'll probably never support:
 *
 *  6c-6d, e4-e5, ec-ed - in
 *  6e-6f, e6-e7, ee-ef - out
 *  cc, cd - int3, int
 *  cf - iret
 *  d6 - illegal instruction
 *  f1 - int1/icebp
 *  f4 - hlt
 *  fa, fb - cli, sti
 *  0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2
 *
 * invalid opcodes in 64-bit mode:
 *
 *  06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5
 *  63 - we support this opcode in x86_64 but not in i386.
 *
 * opcodes we may need to refine support for:
 *
 *  0f - 2-byte instructions: For many of these instructions, the validity
 *  depends on the prefix and/or the reg field.  On such instructions, we
 *  just consider the opcode combination valid if it corresponds to any
 *  valid instruction.
 *
 *  8f - Group 1 - only reg = 0 is OK
 *  c6-c7 - Group 11 - only reg = 0 is OK
 *  d9-df - fpu insns with some illegal encodings
 *  f2, f3 - repnz, repz prefixes.  These are also the first byte for
 *  certain floating-point instructions, such as addsd.
 *
 *  fe - Group 4 - only reg = 0 or 1 is OK
 *  ff - Group 5 - only reg = 0-6 is OK
 *
 * others -- Do we need to support these?
 *
 *  0f - (floating-point?) prefetch instructions
 *  07, 17, 1f - pop es, pop ss, pop ds
 *  26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
 *	but 64 and 65 (fs: and gs:) seem to be used, so we support them
 *  67 - addr16 prefix
 *  ce - into
 *  f0 - lock prefix
 */

/*
 * TODO:
 * - Where necessary, examine the modrm byte and allow only valid instructions
 *   in the different Groups and fpu instructions.
 */
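/*
 * Rough validation flow for a probed instruction: uprobe_init_insn() below
 * decodes auprobe->insn with the in-kernel decoder, rejects instructions
 * carrying a bad prefix (segment overrides, lock), and then looks up the
 * first opcode byte in good_insns_32/good_insns_64, falling back to
 * good_2byte_insns for two-byte (0f xx) opcodes.
 */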
static bool is_prefix_bad(struct insn *insn)
{
	int i;

	for (i = 0; i < insn->prefixes.nbytes; i++) {
		switch (insn->prefixes.bytes[i]) {
		case 0x26:	/* INAT_PFX_ES   */
		case 0x2E:	/* INAT_PFX_CS   */
		case 0x36:	/* INAT_PFX_SS   */
		case 0x3E:	/* INAT_PFX_DS   */
		case 0xF0:	/* INAT_PFX_LOCK */
			return true;
		}
	}
	return false;
}

static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64)
{
	u32 volatile *good_insns;

	insn_init(insn, auprobe->insn, x86_64);
	/* has the side-effect of processing the entire instruction */
	insn_get_length(insn);
	if (WARN_ON_ONCE(!insn_complete(insn)))
		return -ENOEXEC;

	if (is_prefix_bad(insn))
		return -ENOTSUPP;

	if (x86_64)
		good_insns = good_insns_64;
	else
		good_insns = good_insns_32;

	if (test_bit(OPCODE1(insn), (unsigned long *)good_insns))
		return 0;

	if (insn->opcode.nbytes == 2) {
		if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
			return 0;
	}

	return -ENOTSUPP;
}

#ifdef CONFIG_X86_64
static inline bool is_64bit_mm(struct mm_struct *mm)
{
	return	!config_enabled(CONFIG_IA32_EMULATION) ||
		!(mm->context.ia32_compat == TIF_IA32);
}
/*
 * If arch_uprobe->insn doesn't use rip-relative addressing, return
 * immediately.  Otherwise, rewrite the instruction so that it accesses
 * its memory operand indirectly through a scratch register.  Set
 * defparam->fixups accordingly.  (The contents of the scratch register
 * will be saved before we single-step the modified instruction,
 * and restored afterward).
 *
 * We do this because a rip-relative instruction can access only a
 * relatively small area (+/- 2 GB from the instruction), and the XOL
 * area typically lies beyond that area.  At least for instructions
 * that store to memory, we can't execute the original instruction
 * and "fix things up" later, because the misdirected store could be
 * disastrous.
 *
 * Some useful facts about rip-relative instructions:
 *
 *  - There's always a modrm byte with bit layout "00 reg 101".
 *  - There's never a SIB byte.
 *  - The displacement is always 4 bytes.
 *  - REX.B=1 bit in REX prefix, which normally extends r/m field,
 *    has no effect on rip-relative mode.  It doesn't make modrm byte
 *    with r/m=101 refer to register 1101 = R13.
 */
static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
{
	u8 *cursor;
	u8 reg;
	u8 reg2;

	if (!insn_rip_relative(insn))
		return;

	/*
	 * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm.
	 * Clear REX.b bit (extension of MODRM.rm field):
	 * we want to encode low numbered reg, not r8+.
	 */
	if (insn->rex_prefix.nbytes) {
		cursor = auprobe->insn + insn_offset_rex_prefix(insn);
		/* REX byte has 0100wrxb layout, clearing REX.b bit */
		*cursor &= 0xfe;
	}
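	/*
	 * E.g. a REX prefix of 0x41 (0100 0001, b=1) becomes 0x40 here, so
	 * that the register we substitute into MODRM.rm below is si/di/bx
	 * rather than r14/r15/r11.
	 */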
	/*
	 * Similar treatment for VEX3 prefix.
	 * TODO: add XOP/EVEX treatment when insn decoder supports them
	 */
	if (insn->vex_prefix.nbytes == 3) {
		/*
		 * vex2:     c5    rvvvvLpp   (has no b bit)
		 * vex3/xop: c4/8f rxbmmmmm wvvvvLpp
		 * evex:     62    rxbR00mm wvvvv1pp zllBVaaa
		 *   (evex will need setting of both b and x since
		 *   in non-sib encoding evex.x is 4th bit of MODRM.rm)
		 * Setting VEX3.b (setting because it has inverted meaning):
		 */
		cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1;
		*cursor |= 0x20;
	}

	/*
	 * Convert from rip-relative addressing to register-relative addressing
	 * via a scratch register.
	 *
	 * This is tricky since there are insns with modrm byte
	 * which also use registers not encoded in modrm byte:
	 * [i]div/[i]mul: implicitly use dx:ax
	 * shift ops: implicitly use cx
	 * cmpxchg: implicitly uses ax
	 * cmpxchg8/16b: implicitly uses dx:ax and bx:cx
	 *   Encoding: 0f c7/1 modrm
	 *   The code below thinks that reg=1 (cx), chooses si as scratch.
	 * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m.
	 *   First appeared in Haswell (BMI2 insn).  It is vex-encoded.
	 *   Example where none of bx,cx,dx can be used as scratch reg:
	 *   c4 e2 63 f6 0d disp32   mulx disp32(%rip),%ebx,%ecx
	 * [v]pcmpistri: implicitly uses cx, xmm0
	 * [v]pcmpistrm: implicitly uses xmm0
	 * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0
	 * [v]pcmpestrm: implicitly uses ax, dx, xmm0
	 *   Evil SSE4.2 string comparison ops from hell.
	 * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination.
	 *   Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm.
	 *   Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi).
	 *   AMD says it has no 3-operand form (vex.vvvv must be 1111)
	 *   and that it can have only register operands, not mem
	 *   (its modrm byte must have mode=11).
	 *   If these restrictions will ever be lifted,
	 *   we'll need code to prevent selection of di as scratch reg!
	 *
	 * Summary: I don't know any insns with modrm byte which
	 * use SI register implicitly.  DI register is used only
	 * by one insn (maskmovq) and BX register is used
	 * only by one too (cmpxchg8b).
	 * BP is stack-segment based (may be a problem?).
	 * AX, DX, CX are off-limits (many implicit users).
	 * SP is unusable (it's stack pointer - think about "pop mem";
	 * also, rsp+disp32 needs sib encoding -> insn length change).
	 */

	reg = MODRM_REG(insn);	/* Fetch modrm.reg */
	reg2 = 0xff;		/* Fetch vex.vvvv */
	if (insn->vex_prefix.nbytes == 2)
		reg2 = insn->vex_prefix.bytes[1];
	else if (insn->vex_prefix.nbytes == 3)
		reg2 = insn->vex_prefix.bytes[2];
	/*
	 * TODO: add XOP, EVEX vvvv reading.
	 *
	 * vex.vvvv field is in bits 6-3, bits are inverted.
	 * But in 32-bit mode, high-order bit may be ignored.
	 * Therefore, let's consider only 3 low-order bits.
	 */
	reg2 = ((reg2 >> 3) & 0x7) ^ 0x7;
	/*
	 * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15.
	 *
	 * Choose scratch reg.  Order is important: must not select bx
	 * if we can use si (cmpxchg8b case!)
	 */
	if (reg != 6 && reg2 != 6) {
		reg2 = 6;
		auprobe->defparam.fixups |= UPROBE_FIX_RIP_SI;
	} else if (reg != 7 && reg2 != 7) {
		reg2 = 7;
		auprobe->defparam.fixups |= UPROBE_FIX_RIP_DI;
		/* TODO (paranoia): force maskmovq to not use di */
	} else {
		reg2 = 3;
		auprobe->defparam.fixups |= UPROBE_FIX_RIP_BX;
	}
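	/*
	 * E.g. for the mulx example above (c4 e2 63 f6 0d): modrm.reg = 1 (cx)
	 * and vex.vvvv decodes to 3 (bx), so neither rules out si and
	 * UPROBE_FIX_RIP_SI is chosen.  For a plain "89 05" (mov %eax,
	 * disp32(%rip)) both reg and reg2 are 0 (ax), with the same result.
	 */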
	/*
	 * Point cursor at the modrm byte.  The next 4 bytes are the
	 * displacement.  Beyond the displacement, for some instructions,
	 * is the immediate operand.
	 */
	cursor = auprobe->insn + insn_offset_modrm(insn);
	/*
	 * Change modrm from "00 reg 101" to "10 reg reg2".  Example:
	 * 89 05 disp32  mov %eax,disp32(%rip) becomes
	 * 89 86 disp32  mov %eax,disp32(%rsi)
	 */
	*cursor = 0x80 | (reg << 3) | reg2;
}

static inline unsigned long *
scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	if (auprobe->defparam.fixups & UPROBE_FIX_RIP_SI)
		return &regs->si;
	if (auprobe->defparam.fixups & UPROBE_FIX_RIP_DI)
		return &regs->di;
	return &regs->bx;
}

/*
 * If we're emulating a rip-relative instruction, save the contents
 * of the scratch register and store the target address in that register.
 */
static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) {
		struct uprobe_task *utask = current->utask;
		unsigned long *sr = scratch_reg(auprobe, regs);

		utask->autask.saved_scratch_register = *sr;
		*sr = utask->vaddr + auprobe->defparam.ilen;
	}
}

static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) {
		struct uprobe_task *utask = current->utask;
		unsigned long *sr = scratch_reg(auprobe, regs);

		*sr = utask->autask.saved_scratch_register;
	}
}
#else /* 32-bit: */
static inline bool is_64bit_mm(struct mm_struct *mm)
{
	return false;
}
/*
 * No RIP-relative addressing on 32-bit
 */
static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
{
}
static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
}
static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
}
#endif /* CONFIG_X86_64 */

struct uprobe_xol_ops {
	bool	(*emulate)(struct arch_uprobe *, struct pt_regs *);
	int	(*pre_xol)(struct arch_uprobe *, struct pt_regs *);
	int	(*post_xol)(struct arch_uprobe *, struct pt_regs *);
	void	(*abort)(struct arch_uprobe *, struct pt_regs *);
};

static inline int sizeof_long(void)
{
	return is_ia32_task() ? 4 : 8;
}

static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	riprel_pre_xol(auprobe, regs);
	return 0;
}

static int push_ret_address(struct pt_regs *regs, unsigned long ip)
{
	unsigned long new_sp = regs->sp - sizeof_long();

	if (copy_to_user((void __user *)new_sp, &ip, sizeof_long()))
		return -EFAULT;

	regs->sp = new_sp;
	return 0;
}
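/*
 * push_ret_address() mimics the stack side effect of a "call": it writes the
 * given return address just below the current stack pointer and only commits
 * the new ->sp if the user-space copy succeeded.  It is shared by
 * default_post_xol_op() (to rewrite the return address after a single-stepped
 * call) and by branch_emulate_op() below (to emulate a call without
 * executing it).
 */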
/*
 * We have to fix things up as follows:
 *
 * Typically, the new ip is relative to the copied instruction.  We need
 * to make it relative to the original instruction (FIX_IP).  Exceptions
 * are return instructions and absolute or indirect jump or call instructions.
 *
 * If the single-stepped instruction was a call, the return address that
 * is atop the stack is the address following the copied instruction.  We
 * need to make it the address following the original instruction (FIX_CALL).
 *
 * If the original instruction was a rip-relative instruction such as
 * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
 * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)".
 * We need to restore the contents of the scratch register
 * (FIX_RIP_reg).
 */
static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	struct uprobe_task *utask = current->utask;

	riprel_post_xol(auprobe, regs);
	if (auprobe->defparam.fixups & UPROBE_FIX_IP) {
		long correction = utask->vaddr - utask->xol_vaddr;
		regs->ip += correction;
	} else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
		regs->sp += sizeof_long(); /* Pop incorrect return address */
		if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen))
			return -ERESTART;
	}
	/* popf; tell the caller to not touch TF */
	if (auprobe->defparam.fixups & UPROBE_FIX_SETF)
		utask->autask.saved_tf = true;

	return 0;
}

static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	riprel_post_xol(auprobe, regs);
}

static struct uprobe_xol_ops default_xol_ops = {
	.pre_xol  = default_pre_xol_op,
	.post_xol = default_post_xol_op,
	.abort	  = default_abort_op,
};

static bool branch_is_call(struct arch_uprobe *auprobe)
{
	return auprobe->branch.opc1 == 0xe8;
}

#define CASE_COND					\
	COND(70, 71, XF(OF))				\
	COND(72, 73, XF(CF))				\
	COND(74, 75, XF(ZF))				\
	COND(78, 79, XF(SF))				\
	COND(7a, 7b, XF(PF))				\
	COND(76, 77, XF(CF) || XF(ZF))			\
	COND(7c, 7d, XF(SF) != XF(OF))			\
	COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF))

#define COND(op_y, op_n, expr)				\
	case 0x ## op_y: DO((expr) != 0)		\
	case 0x ## op_n: DO((expr) == 0)

#define XF(xf)	(!!(flags & X86_EFLAGS_ ## xf))

static bool is_cond_jmp_opcode(u8 opcode)
{
	switch (opcode) {
#define DO(expr)	\
		return true;
	CASE_COND
#undef	DO

	default:
		return false;
	}
}

static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	unsigned long flags = regs->flags;

	switch (auprobe->branch.opc1) {
#define DO(expr)	\
		return expr;
	CASE_COND
#undef	DO

	default:	/* not a conditional jmp */
		return true;
	}
}

#undef	XF
#undef	COND
#undef	CASE_COND

static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	unsigned long new_ip = regs->ip += auprobe->branch.ilen;
	unsigned long offs = (long)auprobe->branch.offs;

	if (branch_is_call(auprobe)) {
		/*
		 * If it fails we execute this (mangled, see the comment in
		 * branch_clear_offset) insn out-of-line.  In the likely case
		 * this should trigger the trap, and the probed application
		 * should die or restart the same insn after it handles the
		 * signal; arch_uprobe_post_xol() won't even be called.
		 *
		 * But there is a corner case, see the comment in ->post_xol().
		 */
		if (push_ret_address(regs, new_ip))
			return false;
	} else if (!check_jmp_cond(auprobe, regs)) {
		offs = 0;
	}

	regs->ip = new_ip + offs;
	return true;
}
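/*
 * For example, emulating a short "je" with ilen = 2 and offs = 0x10:
 * branch_emulate_op() first advances ->ip past the insn (new_ip = ip + 2),
 * then check_jmp_cond() tests ZF (opc1 0x74), and ->ip becomes new_ip + 0x10
 * if the branch is taken or just new_ip if it is not.
 */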
static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	BUG_ON(!branch_is_call(auprobe));
	/*
	 * We can only get here if branch_emulate_op() failed to push the ret
	 * address _and_ another thread expanded our stack before the (mangled)
	 * "call" insn was executed out-of-line.  Just restore ->sp and restart.
	 * We could also restore ->ip and try to call branch_emulate_op() again.
	 */
	regs->sp += sizeof_long();
	return -ERESTART;
}

static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn)
{
	/*
	 * Turn this insn into "call 1f; 1:", this is what we will execute
	 * out-of-line if ->emulate() fails.  We only need this to generate
	 * a trap, so that the probed task receives the correct signal with
	 * the properly filled siginfo.
	 *
	 * But see the comment in ->post_xol(), in the unlikely case it can
	 * succeed.  So we need to ensure that the new ->ip can not fall into
	 * the non-canonical area and trigger #GP.
	 *
	 * We could turn it into (say) "pushf", but then we would need to
	 * divorce ->insn[] and ->ixol[].  We need to preserve the 1st byte
	 * of ->insn[] for set_orig_insn().
	 */
	memset(auprobe->insn + insn_offset_immediate(insn),
		0, insn->immediate.nbytes);
}

static struct uprobe_xol_ops branch_xol_ops = {
	.emulate  = branch_emulate_op,
	.post_xol = branch_post_xol_op,
};

/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{
	u8 opc1 = OPCODE1(insn);
	int i;

	switch (opc1) {
	case 0xeb:	/* jmp 8 */
	case 0xe9:	/* jmp 32 */
	case 0x90:	/* prefix* + nop; same as jmp with .offs = 0 */
		break;

	case 0xe8:	/* call relative */
		branch_clear_offset(auprobe, insn);
		break;

	case 0x0f:
		if (insn->opcode.nbytes != 2)
			return -ENOSYS;
		/*
		 * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches
		 * OPCODE1() of the "short" jmp which checks the same condition.
		 */
		opc1 = OPCODE2(insn) - 0x10;
	default:
		if (!is_cond_jmp_opcode(opc1))
			return -ENOSYS;
	}

	/*
	 * 16-bit overrides such as CALLW (66 e8 nn nn) are not supported.
	 * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix.
	 * No one uses these insns, reject any branch insns with such prefix.
	 */
	for (i = 0; i < insn->prefixes.nbytes; i++) {
		if (insn->prefixes.bytes[i] == 0x66)
			return -ENOTSUPP;
	}

	auprobe->branch.opc1 = opc1;
	auprobe->branch.ilen = insn->length;
	auprobe->branch.offs = insn->immediate.value;

	auprobe->ops = &branch_xol_ops;
	return 0;
}
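/*
 * Note that the 0x0f case above relies on the fixed distance between the
 * near and short forms of a conditional jmp: e.g. "je rel32" is 0f 84, and
 * 0x84 - 0x10 = 0x74, the short "je", so one opc1 value covers both forms.
 */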
/**
 * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
 * @mm: the probed address space.
 * @auprobe: the probepoint information.
 * @addr: virtual address at which to install the probepoint
 * Return 0 on success or a -ve number on error.
 */
int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
{
	struct insn insn;
	u8 fix_ip_or_call = UPROBE_FIX_IP;
	int ret;

	ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm));
	if (ret)
		return ret;

	ret = branch_setup_xol_ops(auprobe, &insn);
	if (ret != -ENOSYS)
		return ret;

	/*
	 * Figure out which fixups default_post_xol_op() will need to perform,
	 * and annotate defparam->fixups accordingly.
	 */
	switch (OPCODE1(&insn)) {
	case 0x9d:		/* popf */
		auprobe->defparam.fixups |= UPROBE_FIX_SETF;
		break;
	case 0xc3:		/* ret or lret -- ip is correct */
	case 0xcb:
	case 0xc2:
	case 0xca:
	case 0xea:		/* jmp absolute -- ip is correct */
		fix_ip_or_call = 0;
		break;
	case 0x9a:		/* call absolute - Fix return addr, not ip */
		fix_ip_or_call = UPROBE_FIX_CALL;
		break;
	case 0xff:
		switch (MODRM_REG(&insn)) {
		case 2: case 3:			/* call or lcall, indirect */
			fix_ip_or_call = UPROBE_FIX_CALL;
			break;
		case 4: case 5:			/* jmp or ljmp, indirect */
			fix_ip_or_call = 0;
			break;
		}
		/* fall through */
	default:
		riprel_analyze(auprobe, &insn);
	}

	auprobe->defparam.ilen = insn.length;
	auprobe->defparam.fixups |= fix_ip_or_call;

	auprobe->ops = &default_xol_ops;
	return 0;
}

/*
 * arch_uprobe_pre_xol - prepare to execute out of line.
 * @auprobe: the probepoint information.
 * @regs: reflects the saved user state of current task.
 */
int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	struct uprobe_task *utask = current->utask;

	if (auprobe->ops->pre_xol) {
		int err = auprobe->ops->pre_xol(auprobe, regs);
		if (err)
			return err;
	}

	regs->ip = utask->xol_vaddr;
	utask->autask.saved_trap_nr = current->thread.trap_nr;
	current->thread.trap_nr = UPROBE_TRAP_NR;

	utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF);
	regs->flags |= X86_EFLAGS_TF;
	if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))
		set_task_blockstep(current, false);

	return 0;
}

/*
 * If the xol insn itself traps and generates a signal (say,
 * SIGILL/SIGSEGV/etc), then detect the case where a single-stepped
 * instruction jumps back to its own address.  It is assumed that anything
 * like do_page_fault/do_trap/etc sets thread.trap_nr != -1.
 *
 * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr,
 * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to
 * UPROBE_TRAP_NR == -1 set by arch_uprobe_pre_xol().
 */
bool arch_uprobe_xol_was_trapped(struct task_struct *t)
{
	if (t->thread.trap_nr != UPROBE_TRAP_NR)
		return true;

	return false;
}
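/*
 * To summarize the single-step round trip: arch_uprobe_pre_xol() points ->ip
 * at the xol slot and sets TF, the DIE_DEBUG notifier brings us back, and
 * arch_uprobe_post_xol() below applies the per-insn fixups (or
 * arch_uprobe_abort_xol() restores the original state if a signal or trap
 * interrupted the step).
 */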
/*
 * Called after single-stepping.  To avoid the SMP problems that can
 * occur when we temporarily put back the original opcode to
 * single-step, we single-stepped a copy of the instruction.
 *
 * This function prepares to resume execution after the single-step.
 */
int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	struct uprobe_task *utask = current->utask;
	bool send_sigtrap = utask->autask.saved_tf;
	int err = 0;

	WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
	current->thread.trap_nr = utask->autask.saved_trap_nr;

	if (auprobe->ops->post_xol) {
		err = auprobe->ops->post_xol(auprobe, regs);
		if (err) {
			/*
			 * Restore ->ip for restart or post mortem analysis.
			 * ->post_xol() must not return -ERESTART unless this
			 * is really possible.
			 */
			regs->ip = utask->vaddr;
			if (err == -ERESTART)
				err = 0;
			send_sigtrap = false;
		}
	}
	/*
	 * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP
	 * so we can get an extra SIGTRAP if we do not clear TF.  We need
	 * to examine the opcode to make it right.
	 */
	if (send_sigtrap)
		send_sig(SIGTRAP, current, 0);

	if (!utask->autask.saved_tf)
		regs->flags &= ~X86_EFLAGS_TF;

	return err;
}

/* callback routine for handling exceptions. */
int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{
	struct die_args *args = data;
	struct pt_regs *regs = args->regs;
	int ret = NOTIFY_DONE;

	/* We are only interested in userspace traps */
	if (regs && !user_mode_vm(regs))
		return NOTIFY_DONE;

	switch (val) {
	case DIE_INT3:
		if (uprobe_pre_sstep_notifier(regs))
			ret = NOTIFY_STOP;

		break;

	case DIE_DEBUG:
		if (uprobe_post_sstep_notifier(regs))
			ret = NOTIFY_STOP;

	default:
		break;
	}

	return ret;
}

/*
 * This function gets called when the XOL instruction either gets trapped or
 * the thread has a fatal signal.  Reset the instruction pointer to its
 * probed address for the potential restart or for post mortem analysis.
 */
void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	struct uprobe_task *utask = current->utask;

	if (auprobe->ops->abort)
		auprobe->ops->abort(auprobe, regs);

	current->thread.trap_nr = utask->autask.saved_trap_nr;
	regs->ip = utask->vaddr;
	/* clear TF if it was set by us in arch_uprobe_pre_xol() */
	if (!utask->autask.saved_tf)
		regs->flags &= ~X86_EFLAGS_TF;
}

static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	if (auprobe->ops->emulate)
		return auprobe->ops->emulate(auprobe, regs);
	return false;
}

bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	bool ret = __skip_sstep(auprobe, regs);
	if (ret && (regs->flags & X86_EFLAGS_TF))
		send_sig(SIGTRAP, current, 0);
	return ret;
}

unsigned long
arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
{
	int rasize = sizeof_long(), nleft;
	unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */

	if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize))
		return -1;

	/* check whether the address has already been hijacked */
	if (orig_ret_vaddr == trampoline_vaddr)
		return orig_ret_vaddr;

	nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
	if (likely(!nleft))
		return orig_ret_vaddr;

	if (nleft != rasize) {
		pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, "
			"%%ip=%#lx\n", current->pid, regs->sp, regs->ip);

		force_sig_info(SIGSEGV, SEND_SIG_FORCED, current);
	}

	return -1;
}