/* Support for MMIO probes.
 * Borrows much of its code from kprobes.
 * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
 *     2007 Alexander Eichner
 *     2008 Pekka Paalanen <pq@iki.fi>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/ptrace.h>
#include <linux/preempt.h>
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/mutex.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <linux/errno.h>
#include <asm/debugreg.h>
#include <linux/mmiotrace.h>

#define KMMIO_PAGE_HASH_BITS 4
#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)

struct kmmio_fault_page {
	struct list_head list;
	struct kmmio_fault_page *release_next;
	unsigned long page; /* location of the fault page */
	pteval_t old_presence; /* page presence prior to arming */
	bool armed;

	/*
	 * Number of times this page has been registered as a part
	 * of a probe. If zero, the page is disarmed and this may be freed.
	 * Used only by writers (RCU) and post_kmmio_handler().
	 * Protected by kmmio_lock when linked into kmmio_page_table.
	 */
	int count;
};

struct kmmio_delayed_release {
	struct rcu_head rcu;
	struct kmmio_fault_page *release_list;
};

struct kmmio_context {
	struct kmmio_fault_page *fpage;
	struct kmmio_probe *probe;
	unsigned long saved_flags;
	unsigned long addr;
	int active;
};

static DEFINE_SPINLOCK(kmmio_lock);

/* Protected by kmmio_lock */
unsigned int kmmio_count;

/* Read-protected by RCU, write-protected by kmmio_lock. */
static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
static LIST_HEAD(kmmio_probes);

static struct list_head *kmmio_page_list(unsigned long page)
{
	return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
}

/* Accessed per-cpu */
static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);

/*
 * Finding the probe that covers an address is basically a dynamic stabbing
 * problem. The existing prio tree code could be used, or possibly better
 * structures:
 * "The Interval Skip List: A Data Structure for Finding All Intervals That
 * Overlap a Point" (might be simple), or
 * "Space Efficient Dynamic Stabbing with Fast Queries" - Mikkel Thorup.
 */
/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
{
	struct kmmio_probe *p;
	list_for_each_entry_rcu(p, &kmmio_probes, list) {
		if (addr >= p->addr && addr < (p->addr + p->len))
			return p;
	}
	return NULL;
}

/* You must be holding RCU read lock. */
static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
{
	struct list_head *head;
	struct kmmio_fault_page *f;

	page &= PAGE_MASK;
	head = kmmio_page_list(page);
	list_for_each_entry_rcu(f, head, list) {
		if (f->page == page)
			return f;
	}
	return NULL;
}

static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
{
	pmdval_t v = pmd_val(*pmd);
	if (clear) {
		*old = v & _PAGE_PRESENT;
		v &= ~_PAGE_PRESENT;
	} else	/* presume this has been called with clear==true previously */
		v |= *old;
	set_pmd(pmd, __pmd(v));
}

static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
{
	pteval_t v = pte_val(*pte);
	if (clear) {
		*old = v & _PAGE_PRESENT;
		v &= ~_PAGE_PRESENT;
	} else	/* presume this has been called with clear==true previously */
		v |= *old;
	set_pte_atomic(pte, __pte(v));
}

static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
{
	unsigned int level;
	pte_t *pte = lookup_address(f->page, &level);

	if (!pte) {
		pr_err("no pte for page 0x%08lx\n", f->page);
		return -1;
	}

	switch (level) {
	case PG_LEVEL_2M:
		clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
		break;
	case PG_LEVEL_4K:
		clear_pte_presence(pte, clear, &f->old_presence);
		break;
	default:
		pr_err("unexpected page level 0x%x.\n", level);
		return -1;
	}

	__flush_tlb_one(f->page);
	return 0;
}

/*
 * Mark the given page as not present. Access to it will trigger a fault.
 *
 * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
 * protection is ignored here. RCU read lock is assumed held, so the struct
 * will not disappear unexpectedly. Furthermore, the caller must guarantee
 * that double arming the same virtual address (page) cannot occur.
 *
 * Double disarming, on the other hand, is allowed, and may occur when a
 * fault and an mmiotrace shutdown happen simultaneously.
 */
static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
{
	int ret;
	WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
	if (f->armed) {
		pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n",
			   f->page, f->count, !!f->old_presence);
	}
	ret = clear_page_presence(f, true);
	WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"),
		  f->page);
	f->armed = true;
	return ret;
}

/** Restore the given page to saved presence state. */
static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
{
	int ret = clear_page_presence(f, false);
	WARN_ONCE(ret < 0,
		  KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
	f->armed = false;
}

/*
 * This is being called from do_page_fault().
 *
 * We may be in an interrupt or a critical section. Also prefetching may
 * trigger a page fault. We may be in the middle of a process switch.
 * We cannot take any locks, because we could already be executing
 * within a kmmio critical section.
 *
 * Local interrupts are disabled, so preemption cannot happen.
 * Do not enable interrupts, do not sleep, and watch out for other CPUs.
 */
/*
 * Interrupts are disabled on entry as the page fault is handled through
 * an interrupt gate, and they remain disabled throughout this function.
 */
int kmmio_handler(struct pt_regs *regs, unsigned long addr)
{
	struct kmmio_context *ctx;
	struct kmmio_fault_page *faultpage;
	int ret = 0; /* default to fault not handled */

	/*
	 * Preemption is now disabled to prevent process switch during
	 * single stepping. We can only handle one active kmmio trace
	 * per cpu, so ensure that we finish it before something else
	 * gets to run. We also hold the RCU read lock over single
	 * stepping to avoid looking up the probe and kmmio_fault_page
	 * again.
	 */
	preempt_disable();
	rcu_read_lock();

	faultpage = get_kmmio_fault_page(addr);
	if (!faultpage) {
		/*
		 * Either this page fault is not caused by kmmio, or
		 * another CPU just pulled the kmmio probe from under
		 * our feet. The latter case should not be possible.
		 */
		goto no_kmmio;
	}

	ctx = &get_cpu_var(kmmio_ctx);
	if (ctx->active) {
		if (addr == ctx->addr) {
			/*
			 * A second fault on the same page means some other
			 * condition needs handling by do_page_fault(); the
			 * page really not being present is the most common.
			 */
			pr_debug("secondary hit for 0x%08lx CPU %d.\n",
				 addr, smp_processor_id());

			if (!faultpage->old_presence)
				pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
					addr, smp_processor_id());
		} else {
			/*
			 * Prevent overwriting an already in-flight context.
			 * This should not happen; let's hope disarming at
			 * least prevents a panic.
			 */
			pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
				 smp_processor_id(), addr);
			pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
			disarm_kmmio_fault_page(faultpage);
		}
		goto no_kmmio_ctx;
	}
	ctx->active++;

	ctx->fpage = faultpage;
	ctx->probe = get_kmmio_probe(addr);
	ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
	ctx->addr = addr;

	if (ctx->probe && ctx->probe->pre_handler)
		ctx->probe->pre_handler(ctx->probe, regs, addr);

	/*
	 * Enable single-stepping and disable interrupts for the faulting
	 * context. Local interrupts must not get enabled during stepping.
	 */
	regs->flags |= X86_EFLAGS_TF;
	regs->flags &= ~X86_EFLAGS_IF;

	/* Now we set the present bit in the PTE and single step. */
	disarm_kmmio_fault_page(ctx->fpage);

	/*
	 * If another cpu accesses the same page while we are stepping,
	 * the access will not be caught. It will simply succeed and the
	 * only downside is we lose the event. If this becomes a problem,
	 * the user should drop to single cpu before tracing.
	 */

	put_cpu_var(kmmio_ctx);
	return 1; /* fault handled */

no_kmmio_ctx:
	put_cpu_var(kmmio_ctx);
no_kmmio:
	rcu_read_unlock();
	preempt_enable_no_resched();
	return ret;
}

/*
 * Interrupts are disabled on entry as trap1 is an interrupt gate
 * and they remain disabled throughout this function.
 * This must always get called as the pair to kmmio_handler().
 */
static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
{
	int ret = 0;
	struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);

	if (!ctx->active) {
		/*
		 * Debug traps without an active context are due to either
		 * something external causing them (e.g. using a debugger
		 * while mmio tracing is enabled), or erroneous behaviour.
		 */
		pr_warning("unexpected debug trap on CPU %d.\n",
			   smp_processor_id());
		goto out;
	}

	if (ctx->probe && ctx->probe->post_handler)
		ctx->probe->post_handler(ctx->probe, condition, regs);

	/* Prevent racing against release_kmmio_fault_page(). */
	spin_lock(&kmmio_lock);
	if (ctx->fpage->count)
		arm_kmmio_fault_page(ctx->fpage);
	spin_unlock(&kmmio_lock);

	regs->flags &= ~X86_EFLAGS_TF;
	regs->flags |= ctx->saved_flags;

	/* These were acquired in kmmio_handler(). */
	ctx->active--;
	BUG_ON(ctx->active);
	rcu_read_unlock();
	preempt_enable_no_resched();

	/*
	 * If somebody else is single-stepping across a probe point, flags
	 * will have TF set; in that case, continue the remaining processing
	 * of do_debug, as if this is not a probe hit.
	 */
	if (!(regs->flags & X86_EFLAGS_TF))
		ret = 1;
out:
	put_cpu_var(kmmio_ctx);
	return ret;
}

/* You must be holding kmmio_lock. */
static int add_kmmio_fault_page(unsigned long page)
{
	struct kmmio_fault_page *f;

	page &= PAGE_MASK;
	f = get_kmmio_fault_page(page);
	if (f) {
		if (!f->count)
			arm_kmmio_fault_page(f);
		f->count++;
		return 0;
	}

	f = kzalloc(sizeof(*f), GFP_ATOMIC);
	if (!f)
		return -1;

	f->count = 1;
	f->page = page;

	if (arm_kmmio_fault_page(f)) {
		kfree(f);
		return -1;
	}

	list_add_rcu(&f->list, kmmio_page_list(f->page));

	return 0;
}

/* You must be holding kmmio_lock. */
static void release_kmmio_fault_page(unsigned long page,
				struct kmmio_fault_page **release_list)
{
	struct kmmio_fault_page *f;

	page &= PAGE_MASK;
	f = get_kmmio_fault_page(page);
	if (!f)
		return;

	f->count--;
	BUG_ON(f->count < 0);
	if (!f->count) {
		disarm_kmmio_fault_page(f);
		f->release_next = *release_list;
		*release_list = f;
	}
}

/*
 * With page-unaligned ioremaps, one or two armed pages may contain
 * addresses from outside the intended mapping. Events for these addresses
 * are currently silently dropped. Such events can only result from
 * programming mistakes: accessing addresses before the beginning or past
 * the end of a mapping.
 */
int register_kmmio_probe(struct kmmio_probe *p)
{
	unsigned long flags;
	int ret = 0;
	unsigned long size = 0;
	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);

	spin_lock_irqsave(&kmmio_lock, flags);
	if (get_kmmio_probe(p->addr)) {
		ret = -EEXIST;
		goto out;
	}
	kmmio_count++;
	list_add_rcu(&p->list, &kmmio_probes);
	while (size < size_lim) {
		if (add_kmmio_fault_page(p->addr + size))
			pr_err("Unable to set page fault.\n");
		size += PAGE_SIZE;
	}
out:
	spin_unlock_irqrestore(&kmmio_lock, flags);
	/*
	 * XXX: What should I do here?
	 * Here was a call to global_flush_tlb(), but it does not exist
	 * anymore. It seems it's not needed after all.
	 */
	return ret;
}
EXPORT_SYMBOL(register_kmmio_probe);
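
/*
 * A minimal, illustrative sketch of a caller arming a probe over an
 * ioremapped window (the mmio tracer is the in-tree user). The names
 * my_pre(), my_probe, my_base, phys_addr and MY_LEN are hypothetical and
 * exist only for this example; struct kmmio_probe and the handler
 * signatures come from <linux/mmiotrace.h>. Because arming is page
 * granular, a page-unaligned mapping also arms the surrounding bytes on
 * the first and last page; events for those addresses are silently
 * dropped, as noted above register_kmmio_probe().
 *
 *	static void my_pre(struct kmmio_probe *p, struct pt_regs *regs,
 *			   unsigned long addr)
 *	{
 *		pr_info("MMIO access at 0x%08lx\n", addr);
 *	}
 *
 *	static struct kmmio_probe my_probe = {
 *		.pre_handler = my_pre,
 *	};
 *	static void __iomem *my_base;
 *
 *	my_base = ioremap(phys_addr, MY_LEN);
 *	my_probe.addr = (unsigned long)my_base;
 *	my_probe.len = MY_LEN;
 *	if (register_kmmio_probe(&my_probe))
 *		pr_err("a probe already covers this address\n");
 */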
438 */ 439 return ret; 440 } 441 EXPORT_SYMBOL(register_kmmio_probe); 442 443 static void rcu_free_kmmio_fault_pages(struct rcu_head *head) 444 { 445 struct kmmio_delayed_release *dr = container_of( 446 head, 447 struct kmmio_delayed_release, 448 rcu); 449 struct kmmio_fault_page *f = dr->release_list; 450 while (f) { 451 struct kmmio_fault_page *next = f->release_next; 452 BUG_ON(f->count); 453 kfree(f); 454 f = next; 455 } 456 kfree(dr); 457 } 458 459 static void remove_kmmio_fault_pages(struct rcu_head *head) 460 { 461 struct kmmio_delayed_release *dr = 462 container_of(head, struct kmmio_delayed_release, rcu); 463 struct kmmio_fault_page *f = dr->release_list; 464 struct kmmio_fault_page **prevp = &dr->release_list; 465 unsigned long flags; 466 467 spin_lock_irqsave(&kmmio_lock, flags); 468 while (f) { 469 if (!f->count) { 470 list_del_rcu(&f->list); 471 prevp = &f->release_next; 472 } else { 473 *prevp = f->release_next; 474 } 475 f = f->release_next; 476 } 477 spin_unlock_irqrestore(&kmmio_lock, flags); 478 479 /* This is the real RCU destroy call. */ 480 call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); 481 } 482 483 /* 484 * Remove a kmmio probe. You have to synchronize_rcu() before you can be 485 * sure that the callbacks will not be called anymore. Only after that 486 * you may actually release your struct kmmio_probe. 487 * 488 * Unregistering a kmmio fault page has three steps: 489 * 1. release_kmmio_fault_page() 490 * Disarm the page, wait a grace period to let all faults finish. 491 * 2. remove_kmmio_fault_pages() 492 * Remove the pages from kmmio_page_table. 493 * 3. rcu_free_kmmio_fault_pages() 494 * Actually free the kmmio_fault_page structs as with RCU. 495 */ 496 void unregister_kmmio_probe(struct kmmio_probe *p) 497 { 498 unsigned long flags; 499 unsigned long size = 0; 500 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); 501 struct kmmio_fault_page *release_list = NULL; 502 struct kmmio_delayed_release *drelease; 503 504 spin_lock_irqsave(&kmmio_lock, flags); 505 while (size < size_lim) { 506 release_kmmio_fault_page(p->addr + size, &release_list); 507 size += PAGE_SIZE; 508 } 509 list_del_rcu(&p->list); 510 kmmio_count--; 511 spin_unlock_irqrestore(&kmmio_lock, flags); 512 513 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); 514 if (!drelease) { 515 pr_crit("leaking kmmio_fault_page objects.\n"); 516 return; 517 } 518 drelease->release_list = release_list; 519 520 /* 521 * This is not really RCU here. We have just disarmed a set of 522 * pages so that they cannot trigger page faults anymore. However, 523 * we cannot remove the pages from kmmio_page_table, 524 * because a probe hit might be in flight on another CPU. The 525 * pages are collected into a list, and they will be removed from 526 * kmmio_page_table when it is certain that no probe hit related to 527 * these pages can be in flight. RCU grace period sounds like a 528 * good choice. 529 * 530 * If we removed the pages too early, kmmio page fault handler might 531 * not find the respective kmmio_fault_page and determine it's not 532 * a kmmio fault, when it actually is. This would lead to madness. 
533 */ 534 call_rcu(&drelease->rcu, remove_kmmio_fault_pages); 535 } 536 EXPORT_SYMBOL(unregister_kmmio_probe); 537 538 static int 539 kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) 540 { 541 struct die_args *arg = args; 542 unsigned long* dr6_p = (unsigned long *)ERR_PTR(arg->err); 543 544 if (val == DIE_DEBUG && (*dr6_p & DR_STEP)) 545 if (post_kmmio_handler(*dr6_p, arg->regs) == 1) { 546 /* 547 * Reset the BS bit in dr6 (pointed by args->err) to 548 * denote completion of processing 549 */ 550 *dr6_p &= ~DR_STEP; 551 return NOTIFY_STOP; 552 } 553 554 return NOTIFY_DONE; 555 } 556 557 static struct notifier_block nb_die = { 558 .notifier_call = kmmio_die_notifier 559 }; 560 561 int kmmio_init(void) 562 { 563 int i; 564 565 for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) 566 INIT_LIST_HEAD(&kmmio_page_table[i]); 567 568 return register_die_notifier(&nb_die); 569 } 570 571 void kmmio_cleanup(void) 572 { 573 int i; 574 575 unregister_die_notifier(&nb_die); 576 for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) { 577 WARN_ONCE(!list_empty(&kmmio_page_table[i]), 578 KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n"); 579 } 580 } 581