/* Support for MMIO probes.
 * Benefits a lot from code borrowed from kprobes.
 * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
 *     2007 Alexander Eichner
 *     2008 Pekka Paalanen <pq@iki.fi>
 */

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/ptrace.h>
#include <linux/preempt.h>
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/mutex.h>
#include <linux/io.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <linux/errno.h>
#include <asm/debugreg.h>
#include <linux/mmiotrace.h>

#define KMMIO_PAGE_HASH_BITS 4
#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)

struct kmmio_fault_page {
	struct list_head list;
	struct kmmio_fault_page *release_next;
	unsigned long page; /* location of the fault page */

	/*
	 * Number of times this page has been registered as a part
	 * of a probe. If zero, page is disarmed and this may be freed.
	 * Used only by writers (RCU).
	 */
	int count;
};

struct kmmio_delayed_release {
	struct rcu_head rcu;
	struct kmmio_fault_page *release_list;
};

struct kmmio_context {
	struct kmmio_fault_page *fpage;
	struct kmmio_probe *probe;
	unsigned long saved_flags;
	unsigned long addr;
	int active;
};

static DEFINE_SPINLOCK(kmmio_lock);

/* Protected by kmmio_lock */
unsigned int kmmio_count;

/* Read-protected by RCU, write-protected by kmmio_lock. */
static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
static LIST_HEAD(kmmio_probes);

static struct list_head *kmmio_page_list(unsigned long page)
{
	return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
}

/* Accessed per-cpu */
static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);

/*
 * This is basically a dynamic stabbing problem:
 * we could use the existing prio tree code, or one of the possibly
 * better alternatives:
 * The Interval Skip List: A Data Structure for Finding All Intervals That
 * Overlap a Point (might be simple)
 * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
 */
/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
{
	struct kmmio_probe *p;
	list_for_each_entry_rcu(p, &kmmio_probes, list) {
		if (addr >= p->addr && addr <= (p->addr + p->len))
			return p;
	}
	return NULL;
}

/* You must be holding RCU read lock. */
static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
{
	struct list_head *head;
	struct kmmio_fault_page *p;

	page &= PAGE_MASK;
	head = kmmio_page_list(page);
	list_for_each_entry_rcu(p, head, list) {
		if (p->page == page)
			return p;
	}
	return NULL;
}

static void set_page_present(unsigned long addr, bool present,
						unsigned int *pglevel)
{
	pteval_t pteval;
	pmdval_t pmdval;
	unsigned int level;
	pmd_t *pmd;
	pte_t *pte = lookup_address(addr, &level);

	if (!pte) {
		pr_err("kmmio: no pte for page 0x%08lx\n", addr);
		return;
	}

	if (pglevel)
		*pglevel = level;

	switch (level) {
	case PG_LEVEL_2M:
		pmd = (pmd_t *)pte;
		pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT;
		if (present)
			pmdval |= _PAGE_PRESENT;
		set_pmd(pmd, __pmd(pmdval));
		break;

	case PG_LEVEL_4K:
		pteval = pte_val(*pte) & ~_PAGE_PRESENT;
		if (present)
			pteval |= _PAGE_PRESENT;
		set_pte_atomic(pte, __pte(pteval));
		break;

	default:
		pr_err("kmmio: unexpected page level 0x%x.\n", level);
		return;
	}

	__flush_tlb_one(addr);
}

/** Mark the given page as not present. Access to it will trigger a fault. */
static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
{
	set_page_present(page & PAGE_MASK, false, pglevel);
}

/** Mark the given page as present. */
static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
{
	set_page_present(page & PAGE_MASK, true, pglevel);
}

/*
 * This is being called from do_page_fault().
 *
 * We may be in an interrupt or a critical section. Also prefetching may
 * trigger a page fault. We may be in the middle of a process switch.
 * We cannot take any locks, because we could be executing especially
 * within a kmmio critical section.
 *
 * Local interrupts are disabled, so preemption cannot happen.
 * Do not enable interrupts, do not sleep, and watch out for other CPUs.
 */
/*
 * Interrupts are disabled on entry as trap3 is an interrupt gate
 * and they remain disabled throughout this function.
 */
int kmmio_handler(struct pt_regs *regs, unsigned long addr)
{
	struct kmmio_context *ctx;
	struct kmmio_fault_page *faultpage;
	int ret = 0; /* default to fault not handled */

	/*
	 * Preemption is now disabled to prevent process switch during
	 * single stepping. We can only handle one active kmmio trace
	 * per cpu, so ensure that we finish it before something else
	 * gets to run. We also hold the RCU read lock over single
	 * stepping to avoid looking up the probe and kmmio_fault_page
	 * again.
	 */
	preempt_disable();
	rcu_read_lock();

	faultpage = get_kmmio_fault_page(addr);
	if (!faultpage) {
		/*
		 * Either this page fault is not caused by kmmio, or
		 * another CPU just pulled the kmmio probe from under
		 * our feet. The latter case should not be possible.
		 */
		goto no_kmmio;
	}

	ctx = &get_cpu_var(kmmio_ctx);
	if (ctx->active) {
		disarm_kmmio_fault_page(faultpage->page, NULL);
		if (addr == ctx->addr) {
			/*
			 * On SMP we sometimes get recursive probe hits on the
			 * same address. Context is already saved, fall out.
			 */
			pr_debug("kmmio: duplicate probe hit on CPU %d, for "
						"address 0x%08lx.\n",
						smp_processor_id(), addr);
			ret = 1;
			goto no_kmmio_ctx;
		}
		/*
		 * Prevent overwriting already in-flight context.
		 * This should not happen, let's hope disarming at least
		 * prevents a panic.
		 */
		pr_emerg("kmmio: recursive probe hit on CPU %d, "
					"for address 0x%08lx. Ignoring.\n",
					smp_processor_id(), addr);
		pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
					ctx->addr);
		goto no_kmmio_ctx;
	}
	ctx->active++;

	ctx->fpage = faultpage;
	ctx->probe = get_kmmio_probe(addr);
	ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
	ctx->addr = addr;

	if (ctx->probe && ctx->probe->pre_handler)
		ctx->probe->pre_handler(ctx->probe, regs, addr);

	/*
	 * Enable single-stepping and disable interrupts for the faulting
	 * context. Local interrupts must not get enabled during stepping.
	 */
	regs->flags |= X86_EFLAGS_TF;
	regs->flags &= ~X86_EFLAGS_IF;

	/* Now we set the present bit in the PTE and single-step. */
	disarm_kmmio_fault_page(ctx->fpage->page, NULL);

	/*
	 * If another cpu accesses the same page while we are stepping,
	 * the access will not be caught. It will simply succeed and the
	 * only downside is we lose the event. If this becomes a problem,
	 * the user should drop to single cpu before tracing.
	 */

	put_cpu_var(kmmio_ctx);
	return 1; /* fault handled */

no_kmmio_ctx:
	put_cpu_var(kmmio_ctx);
no_kmmio:
	rcu_read_unlock();
	preempt_enable_no_resched();
	return ret;
}

/*
 * Interrupts are disabled on entry as trap1 is an interrupt gate
 * and they remain disabled throughout this function.
 * This must always get called as the pair to kmmio_handler().
 */
static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
{
	int ret = 0;
	struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);

	if (!ctx->active) {
		pr_debug("kmmio: spurious debug trap on CPU %d.\n",
						smp_processor_id());
		goto out;
	}

	if (ctx->probe && ctx->probe->post_handler)
		ctx->probe->post_handler(ctx->probe, condition, regs);

	arm_kmmio_fault_page(ctx->fpage->page, NULL);

	regs->flags &= ~X86_EFLAGS_TF;
	regs->flags |= ctx->saved_flags;

	/* These were acquired in kmmio_handler(). */
	ctx->active--;
	BUG_ON(ctx->active);
	rcu_read_unlock();
	preempt_enable_no_resched();

	/*
	 * If somebody else is single-stepping across a probe point, flags
	 * will have TF set, in which case, continue the remaining processing
	 * of do_debug, as if this is not a probe hit.
	 */
	if (!(regs->flags & X86_EFLAGS_TF))
		ret = 1;
out:
	put_cpu_var(kmmio_ctx);
	return ret;
}

/* You must be holding kmmio_lock. */
static int add_kmmio_fault_page(unsigned long page)
{
	struct kmmio_fault_page *f;

	page &= PAGE_MASK;
	f = get_kmmio_fault_page(page);
	if (f) {
		if (!f->count)
			arm_kmmio_fault_page(f->page, NULL);
		f->count++;
		return 0;
	}

	f = kmalloc(sizeof(*f), GFP_ATOMIC);
	if (!f)
		return -1;

	f->count = 1;
	f->page = page;
	list_add_rcu(&f->list, kmmio_page_list(f->page));

	arm_kmmio_fault_page(f->page, NULL);

	return 0;
}

/* You must be holding kmmio_lock. */
static void release_kmmio_fault_page(unsigned long page,
				struct kmmio_fault_page **release_list)
{
	struct kmmio_fault_page *f;

	page &= PAGE_MASK;
	f = get_kmmio_fault_page(page);
	if (!f)
		return;

	f->count--;
	BUG_ON(f->count < 0);
	if (!f->count) {
		disarm_kmmio_fault_page(f->page, NULL);
		f->release_next = *release_list;
		*release_list = f;
	}
}

/*
 * With page-unaligned ioremaps, one or two armed pages may contain
 * addresses from outside the intended mapping. Events for these addresses
 * are currently silently dropped. The events may result only from programming
 * mistakes by accessing addresses before the beginning or past the end of a
 * mapping.
 */
int register_kmmio_probe(struct kmmio_probe *p)
{
	unsigned long flags;
	int ret = 0;
	unsigned long size = 0;
	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);

	spin_lock_irqsave(&kmmio_lock, flags);
	if (get_kmmio_probe(p->addr)) {
		ret = -EEXIST;
		goto out;
	}
	kmmio_count++;
	list_add_rcu(&p->list, &kmmio_probes);
	while (size < size_lim) {
		if (add_kmmio_fault_page(p->addr + size))
			pr_err("kmmio: Unable to set page fault.\n");
		size += PAGE_SIZE;
	}
out:
	spin_unlock_irqrestore(&kmmio_lock, flags);
	/*
	 * XXX: What should I do here?
	 * Here was a call to global_flush_tlb(), but it does not exist
	 * anymore. It seems it's not needed after all.
	 */
	return ret;
}
EXPORT_SYMBOL(register_kmmio_probe);

static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
{
	struct kmmio_delayed_release *dr = container_of(
						head,
						struct kmmio_delayed_release,
						rcu);
	struct kmmio_fault_page *p = dr->release_list;
	while (p) {
		struct kmmio_fault_page *next = p->release_next;
		BUG_ON(p->count);
		kfree(p);
		p = next;
	}
	kfree(dr);
}

static void remove_kmmio_fault_pages(struct rcu_head *head)
{
	struct kmmio_delayed_release *dr = container_of(
						head,
						struct kmmio_delayed_release,
						rcu);
	struct kmmio_fault_page *p = dr->release_list;
	struct kmmio_fault_page **prevp = &dr->release_list;
	unsigned long flags;
	spin_lock_irqsave(&kmmio_lock, flags);
	while (p) {
		if (!p->count) {
			list_del_rcu(&p->list);
			/* Entry stays on the release list for freeing. */
			prevp = &p->release_next;
		} else {
			/*
			 * Page was re-registered in the meantime: unlink it
			 * from the release list and keep it in the table.
			 * Do not advance prevp, or we would write into the
			 * node we just unlinked instead of the list.
			 */
			*prevp = p->release_next;
		}
		p = p->release_next;
	}
	spin_unlock_irqrestore(&kmmio_lock, flags);
	/* This is the real RCU destroy call. */
	call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
}

/*
 * Remove a kmmio probe. You have to synchronize_rcu() before you can be
 * sure that the callbacks will not be called anymore. Only after that
 * you may actually release your struct kmmio_probe.
 *
 * Unregistering a kmmio fault page has three steps:
 * 1. release_kmmio_fault_page()
 *    Disarm the page, wait a grace period to let all faults finish.
 * 2. remove_kmmio_fault_pages()
 *    Remove the pages from kmmio_page_table.
 * 3. rcu_free_kmmio_fault_pages()
 *    Actually free the kmmio_fault_page structs via RCU.
444 */ 445 void unregister_kmmio_probe(struct kmmio_probe *p) 446 { 447 unsigned long flags; 448 unsigned long size = 0; 449 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); 450 struct kmmio_fault_page *release_list = NULL; 451 struct kmmio_delayed_release *drelease; 452 453 spin_lock_irqsave(&kmmio_lock, flags); 454 while (size < size_lim) { 455 release_kmmio_fault_page(p->addr + size, &release_list); 456 size += PAGE_SIZE; 457 } 458 list_del_rcu(&p->list); 459 kmmio_count--; 460 spin_unlock_irqrestore(&kmmio_lock, flags); 461 462 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); 463 if (!drelease) { 464 pr_crit("kmmio: leaking kmmio_fault_page objects.\n"); 465 return; 466 } 467 drelease->release_list = release_list; 468 469 /* 470 * This is not really RCU here. We have just disarmed a set of 471 * pages so that they cannot trigger page faults anymore. However, 472 * we cannot remove the pages from kmmio_page_table, 473 * because a probe hit might be in flight on another CPU. The 474 * pages are collected into a list, and they will be removed from 475 * kmmio_page_table when it is certain that no probe hit related to 476 * these pages can be in flight. RCU grace period sounds like a 477 * good choice. 478 * 479 * If we removed the pages too early, kmmio page fault handler might 480 * not find the respective kmmio_fault_page and determine it's not 481 * a kmmio fault, when it actually is. This would lead to madness. 482 */ 483 call_rcu(&drelease->rcu, remove_kmmio_fault_pages); 484 } 485 EXPORT_SYMBOL(unregister_kmmio_probe); 486 487 static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, 488 void *args) 489 { 490 struct die_args *arg = args; 491 492 if (val == DIE_DEBUG && (arg->err & DR_STEP)) 493 if (post_kmmio_handler(arg->err, arg->regs) == 1) 494 return NOTIFY_STOP; 495 496 return NOTIFY_DONE; 497 } 498 499 static struct notifier_block nb_die = { 500 .notifier_call = kmmio_die_notifier 501 }; 502 503 static int __init init_kmmio(void) 504 { 505 int i; 506 for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) 507 INIT_LIST_HEAD(&kmmio_page_table[i]); 508 return register_die_notifier(&nb_die); 509 } 510 fs_initcall(init_kmmio); /* should be before device_initcall() */ 511