/*
 * Support for MMIO probes.
 * Borrows much code from kprobes.
 * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
 *     2007 Alexander Eichner
 *     2008 Pekka Paalanen <pq@iki.fi>
 */

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/ptrace.h>
#include <linux/preempt.h>
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/mutex.h>
#include <linux/io.h>
#include <linux/errno.h>
#include <linux/mmiotrace.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/debugreg.h>

#define KMMIO_PAGE_HASH_BITS 4
#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)

struct kmmio_fault_page {
	struct list_head list;
	struct kmmio_fault_page *release_next;
	unsigned long page; /* location of the fault page */
	bool old_presence; /* page presence prior to arming */
	bool armed;

	/*
	 * Number of times this page has been registered as a part
	 * of a probe. If zero, page is disarmed and this may be freed.
	 * Used only by writers (RCU) and post_kmmio_handler().
	 * Protected by kmmio_lock, when linked into kmmio_page_table.
	 */
	int count;
};

struct kmmio_delayed_release {
	struct rcu_head rcu;
	struct kmmio_fault_page *release_list;
};

struct kmmio_context {
	struct kmmio_fault_page *fpage;
	struct kmmio_probe *probe;
	unsigned long saved_flags;
	unsigned long addr;
	int active;
};

static DEFINE_SPINLOCK(kmmio_lock);

/* Protected by kmmio_lock */
unsigned int kmmio_count;

/* Read-protected by RCU, write-protected by kmmio_lock. */
static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
static LIST_HEAD(kmmio_probes);

static struct list_head *kmmio_page_list(unsigned long page)
{
	return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
}

/* Accessed per-cpu */
static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);

/*
 * This is basically a dynamic stabbing problem:
 * Could use the existing prio tree code or
 * Possible better implementations:
 * The Interval Skip List: A Data Structure for Finding All Intervals That
 * Overlap a Point (might be simple)
 * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
 */
/* Get the kmmio probe at this addr (if any). You must be holding RCU read lock. */
static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
{
	struct kmmio_probe *p;
	list_for_each_entry_rcu(p, &kmmio_probes, list) {
		if (addr >= p->addr && addr <= (p->addr + p->len))
			return p;
	}
	return NULL;
}
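
/*
 * Illustrative example (hypothetical values): a probe registered with
 * .addr = 0xf8000000 and .len = 0x4000 matches any access in the
 * inclusive range [0xf8000000, 0xf8004000]. The linear scan above is
 * acceptable as long as only a handful of probes are registered.
 */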

/* You must be holding RCU read lock. */
static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
{
	struct list_head *head;
	struct kmmio_fault_page *p;

	page &= PAGE_MASK;
	head = kmmio_page_list(page);
	list_for_each_entry_rcu(p, head, list) {
		if (p->page == page)
			return p;
	}
	return NULL;
}

static void set_pmd_presence(pmd_t *pmd, bool present, bool *old)
{
	pmdval_t v = pmd_val(*pmd);
	*old = !!(v & _PAGE_PRESENT);
	v &= ~_PAGE_PRESENT;
	if (present)
		v |= _PAGE_PRESENT;
	set_pmd(pmd, __pmd(v));
}

static void set_pte_presence(pte_t *pte, bool present, bool *old)
{
	pteval_t v = pte_val(*pte);
	*old = !!(v & _PAGE_PRESENT);
	v &= ~_PAGE_PRESENT;
	if (present)
		v |= _PAGE_PRESENT;
	set_pte_atomic(pte, __pte(v));
}

static int set_page_presence(unsigned long addr, bool present, bool *old)
{
	unsigned int level;
	pte_t *pte = lookup_address(addr, &level);

	if (!pte) {
		pr_err("kmmio: no pte for page 0x%08lx\n", addr);
		return -1;
	}

	switch (level) {
	case PG_LEVEL_2M:
		set_pmd_presence((pmd_t *)pte, present, old);
		break;
	case PG_LEVEL_4K:
		set_pte_presence(pte, present, old);
		break;
	default:
		pr_err("kmmio: unexpected page level 0x%x.\n", level);
		return -1;
	}

	__flush_tlb_one(addr);
	return 0;
}

/*
 * Mark the given page as not present. Access to it will trigger a fault.
 *
 * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
 * protection is ignored here. RCU read lock is assumed held, so the struct
 * will not disappear unexpectedly. Furthermore, the caller must guarantee
 * that double arming the same virtual address (page) cannot occur.
 *
 * Double disarming, on the other hand, is allowed, and may occur when a
 * fault and mmiotrace shutdown happen simultaneously.
 */
static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
{
	int ret;
	WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n");
	if (f->armed) {
		pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n",
				f->page, f->count, f->old_presence);
	}
	ret = set_page_presence(f->page, false, &f->old_presence);
	WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page);
	f->armed = true;
	return ret;
}

/* Restore the given page to saved presence state. */
static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
{
	bool tmp;
	int ret = set_page_presence(f->page, f->old_presence, &tmp);
	WARN_ONCE(ret < 0,
			KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
	f->armed = false;
}
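
/*
 * Life cycle of an armed page, as implemented above: arming clears
 * _PAGE_PRESENT (saving the previous bit in old_presence), so the next
 * access faults; kmmio_handler() disarms the page, the faulting
 * instruction is single-stepped, and post_kmmio_handler() re-arms the
 * page if it is still registered.
 */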

/*
 * This is being called from do_page_fault().
 *
 * We may be in an interrupt or a critical section. Also prefetching may
 * trigger a page fault. We may be in the middle of a process switch.
 * We cannot take any locks, because we could be executing especially
 * within a kmmio critical section.
 *
 * Local interrupts are disabled, so preemption cannot happen.
 * Do not enable interrupts, do not sleep, and watch out for other CPUs.
 */
/*
 * Interrupts are disabled on entry as the page fault (trap 14) uses an
 * interrupt gate, and they remain disabled throughout this function.
 */
int kmmio_handler(struct pt_regs *regs, unsigned long addr)
{
	struct kmmio_context *ctx;
	struct kmmio_fault_page *faultpage;
	int ret = 0; /* default to fault not handled */

	/*
	 * Preemption is now disabled to prevent process switch during
	 * single stepping. We can only handle one active kmmio trace
	 * per cpu, so ensure that we finish it before something else
	 * gets to run. We also hold the RCU read lock over single
	 * stepping to avoid looking up the probe and kmmio_fault_page
	 * again.
	 */
	preempt_disable();
	rcu_read_lock();

	faultpage = get_kmmio_fault_page(addr);
	if (!faultpage) {
		/*
		 * Either this page fault is not caused by kmmio, or
		 * another CPU just pulled the kmmio probe from under
		 * our feet. The latter case should not be possible.
		 */
		goto no_kmmio;
	}

	ctx = &get_cpu_var(kmmio_ctx);
	if (ctx->active) {
		if (addr == ctx->addr) {
			/*
			 * A second fault on the same page means some other
			 * condition needs handling by do_page_fault(); the
			 * page really not being present is the most common.
			 */
			pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n",
					addr, smp_processor_id());

			if (!faultpage->old_presence)
				pr_info("kmmio: unexpected secondary hit for "
					"address 0x%08lx on CPU %d.\n", addr,
					smp_processor_id());
		} else {
			/*
			 * Prevent overwriting an already in-flight context.
			 * This should not happen; let's hope disarming at
			 * least prevents a panic.
			 */
			pr_emerg("kmmio: recursive probe hit on CPU %d, "
					"for address 0x%08lx. Ignoring.\n",
					smp_processor_id(), addr);
			pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
					ctx->addr);
			disarm_kmmio_fault_page(faultpage);
		}
		goto no_kmmio_ctx;
	}
	ctx->active++;

	ctx->fpage = faultpage;
	ctx->probe = get_kmmio_probe(addr);
	ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
	ctx->addr = addr;

	if (ctx->probe && ctx->probe->pre_handler)
		ctx->probe->pre_handler(ctx->probe, regs, addr);

	/*
	 * Enable single-stepping and disable interrupts for the faulting
	 * context. Local interrupts must not get enabled during stepping.
	 */
	regs->flags |= X86_EFLAGS_TF;
	regs->flags &= ~X86_EFLAGS_IF;

	/* Now we set present bit in PTE and single step. */
	disarm_kmmio_fault_page(ctx->fpage);

	/*
	 * If another cpu accesses the same page while we are stepping,
	 * the access will not be caught. It will simply succeed and the
	 * only downside is we lose the event. If this becomes a problem,
	 * the user should drop to single cpu before tracing.
	 */

	put_cpu_var(kmmio_ctx);
	return 1; /* fault handled */

no_kmmio_ctx:
	put_cpu_var(kmmio_ctx);
no_kmmio:
	rcu_read_unlock();
	preempt_enable_no_resched();
	return ret;
}
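
/*
 * Worked example of the flag handling above (illustrative): if the
 * faulting context had IF=1 and TF=0, saved_flags holds X86_EFLAGS_IF.
 * The instruction is then re-executed with TF=1 and IF=0, and
 * post_kmmio_handler() clears TF and ORs saved_flags back in,
 * restoring IF=1 and TF=0.
 */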

/*
 * Interrupts are disabled on entry as trap 1 (the debug exception) uses
 * an interrupt gate, and they remain disabled throughout this function.
 * This must always get called as the pair to kmmio_handler().
 */
static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
{
	int ret = 0;
	struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);

	if (!ctx->active) {
		pr_debug("kmmio: spurious debug trap on CPU %d.\n",
				smp_processor_id());
		goto out;
	}

	if (ctx->probe && ctx->probe->post_handler)
		ctx->probe->post_handler(ctx->probe, condition, regs);

	/* Prevent racing against release_kmmio_fault_page(). */
	spin_lock(&kmmio_lock);
	if (ctx->fpage->count)
		arm_kmmio_fault_page(ctx->fpage);
	spin_unlock(&kmmio_lock);

	regs->flags &= ~X86_EFLAGS_TF;
	regs->flags |= ctx->saved_flags;

	/* These were acquired in kmmio_handler(). */
	ctx->active--;
	BUG_ON(ctx->active);
	rcu_read_unlock();
	preempt_enable_no_resched();

	/*
	 * If somebody else is single-stepping across a probe point, flags
	 * will have TF set, in which case, continue the remaining processing
	 * of do_debug, as if this is not a probe hit.
	 */
	if (!(regs->flags & X86_EFLAGS_TF))
		ret = 1;
out:
	put_cpu_var(kmmio_ctx);
	return ret;
}

/* You must be holding kmmio_lock. */
static int add_kmmio_fault_page(unsigned long page)
{
	struct kmmio_fault_page *f;

	page &= PAGE_MASK;
	f = get_kmmio_fault_page(page);
	if (f) {
		if (!f->count)
			arm_kmmio_fault_page(f);
		f->count++;
		return 0;
	}

	f = kzalloc(sizeof(*f), GFP_ATOMIC);
	if (!f)
		return -1;

	f->count = 1;
	f->page = page;

	if (arm_kmmio_fault_page(f)) {
		kfree(f);
		return -1;
	}

	list_add_rcu(&f->list, kmmio_page_list(f->page));

	return 0;
}

/* You must be holding kmmio_lock. */
static void release_kmmio_fault_page(unsigned long page,
				struct kmmio_fault_page **release_list)
{
	struct kmmio_fault_page *f;

	page &= PAGE_MASK;
	f = get_kmmio_fault_page(page);
	if (!f)
		return;

	f->count--;
	BUG_ON(f->count < 0);
	if (!f->count) {
		disarm_kmmio_fault_page(f);
		f->release_next = *release_list;
		*release_list = f;
	}
}

/*
 * With page-unaligned ioremaps, one or two armed pages may contain
 * addresses from outside the intended mapping. Events for these addresses
 * are currently silently dropped. Such events can only result from
 * programming mistakes: accessing addresses before the beginning or past
 * the end of a mapping.
 */
int register_kmmio_probe(struct kmmio_probe *p)
{
	unsigned long flags;
	int ret = 0;
	unsigned long size = 0;
	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);

	spin_lock_irqsave(&kmmio_lock, flags);
	if (get_kmmio_probe(p->addr)) {
		ret = -EEXIST;
		goto out;
	}
	kmmio_count++;
	list_add_rcu(&p->list, &kmmio_probes);
	while (size < size_lim) {
		if (add_kmmio_fault_page(p->addr + size))
			pr_err("kmmio: Unable to set page fault.\n");
		size += PAGE_SIZE;
	}
out:
	spin_unlock_irqrestore(&kmmio_lock, flags);
	/*
	 * XXX: What should I do here?
	 * Here was a call to global_flush_tlb(), but it does not exist
	 * anymore. It seems it's not needed after all.
	 */
	return ret;
}
EXPORT_SYMBOL(register_kmmio_probe);
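
/*
 * Page coverage example (hypothetical values, PAGE_SIZE = 0x1000): for
 * p->addr = 0xd0000ff0 and p->len = 0x20, size_lim = 0x20 + 0xff0 =
 * 0x1010, so the loop above arms the two pages 0xd0000000 and
 * 0xd0001000. This is the page-unaligned case described in the comment
 * before register_kmmio_probe().
 */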

static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
{
	struct kmmio_delayed_release *dr = container_of(
						head,
						struct kmmio_delayed_release,
						rcu);
	struct kmmio_fault_page *p = dr->release_list;
	while (p) {
		struct kmmio_fault_page *next = p->release_next;
		BUG_ON(p->count);
		kfree(p);
		p = next;
	}
	kfree(dr);
}

static void remove_kmmio_fault_pages(struct rcu_head *head)
{
	struct kmmio_delayed_release *dr =
		container_of(head, struct kmmio_delayed_release, rcu);
	struct kmmio_fault_page *p = dr->release_list;
	struct kmmio_fault_page **prevp = &dr->release_list;
	unsigned long flags;

	spin_lock_irqsave(&kmmio_lock, flags);
	while (p) {
		if (!p->count) {
			list_del_rcu(&p->list);
			prevp = &p->release_next;
		} else {
			*prevp = p->release_next;
		}
		p = p->release_next;
	}
	spin_unlock_irqrestore(&kmmio_lock, flags);

	/* This is the real RCU destroy call. */
	call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
}

/*
 * Remove a kmmio probe. You have to synchronize_rcu() before you can be
 * sure that the callbacks will not be called anymore. Only after that
 * you may actually release your struct kmmio_probe.
 *
 * Unregistering a kmmio fault page has three steps:
 * 1. release_kmmio_fault_page()
 *    Disarm the page, wait a grace period to let all faults finish.
 * 2. remove_kmmio_fault_pages()
 *    Remove the pages from kmmio_page_table.
 * 3. rcu_free_kmmio_fault_pages()
 *    Actually free the kmmio_fault_page structs, from an RCU callback.
 */
void unregister_kmmio_probe(struct kmmio_probe *p)
{
	unsigned long flags;
	unsigned long size = 0;
	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
	struct kmmio_fault_page *release_list = NULL;
	struct kmmio_delayed_release *drelease;

	spin_lock_irqsave(&kmmio_lock, flags);
	while (size < size_lim) {
		release_kmmio_fault_page(p->addr + size, &release_list);
		size += PAGE_SIZE;
	}
	list_del_rcu(&p->list);
	kmmio_count--;
	spin_unlock_irqrestore(&kmmio_lock, flags);

	drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
	if (!drelease) {
		pr_crit("kmmio: leaking kmmio_fault_page objects.\n");
		return;
	}
	drelease->release_list = release_list;

	/*
	 * This is not really RCU here. We have just disarmed a set of
	 * pages so that they cannot trigger page faults anymore. However,
	 * we cannot remove the pages from kmmio_page_table,
	 * because a probe hit might be in flight on another CPU. The
	 * pages are collected into a list, and they will be removed from
	 * kmmio_page_table when it is certain that no probe hit related to
	 * these pages can be in flight. An RCU grace period sounds like a
	 * good choice.
	 *
	 * If we removed the pages too early, the kmmio page fault handler
	 * might not find the respective kmmio_fault_page and would conclude
	 * the fault is not a kmmio fault, when it actually is. This would
	 * lead to madness.
	 */
	call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
}
EXPORT_SYMBOL(unregister_kmmio_probe);
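
/*
 * The single-step trap armed in kmmio_handler() arrives through the die
 * notifier chain below: DIE_DEBUG with DR_STEP set identifies it, and
 * NOTIFY_STOP keeps the rest of do_debug() from treating it as an
 * ordinary debug trap.
 */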
static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
								void *args)
{
	struct die_args *arg = args;

	if (val == DIE_DEBUG && (arg->err & DR_STEP))
		if (post_kmmio_handler(arg->err, arg->regs) == 1)
			return NOTIFY_STOP;

	return NOTIFY_DONE;
}

static struct notifier_block nb_die = {
	.notifier_call = kmmio_die_notifier
};

static int __init init_kmmio(void)
{
	int i;
	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
		INIT_LIST_HEAD(&kmmio_page_table[i]);
	return register_die_notifier(&nb_die);
}
fs_initcall(init_kmmio); /* should be before device_initcall() */
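
/*
 * Usage sketch (hypothetical caller; see mmio-mod.c for a real one).
 * The names ioremapped_base and mapping_size are placeholders:
 *
 *	static void my_pre(struct kmmio_probe *p, struct pt_regs *regs,
 *				unsigned long addr)
 *	{
 *		... record the access to addr ...
 *	}
 *
 *	static struct kmmio_probe probe = {
 *		.addr = (unsigned long)ioremapped_base,
 *		.len = mapping_size,
 *		.pre_handler = my_pre,
 *	};
 *
 *	register_kmmio_probe(&probe);
 *	...
 *	unregister_kmmio_probe(&probe);
 *	synchronize_rcu();	(before freeing the probe, see above)
 */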