/*
 *  linux/kernel/profile.c
 *  Simple profiling. Manages a direct-mapped profile hit count buffer,
 *  with configurable resolution, support for restricting the cpus on
 *  which profiling is done, and switching between cpu time and
 *  schedule() calls via kernel command line parameters passed at boot.
 *
 *  Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
 *	Red Hat, July 2004
 *  Consolidation of architecture support code for profiling,
 *	William Irwin, Oracle, July 2004
 *  Amortized hit count accounting via per-cpu open-addressed hashtables
 *	to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
 */

#include <linux/module.h>
#include <linux/profile.h>
#include <linux/bootmem.h>
#include <linux/notifier.h>
#include <linux/mm.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <asm/sections.h>
#include <asm/irq_regs.h>
#include <asm/ptrace.h>

struct profile_hit {
	u32 pc, hits;
};
#define PROFILE_GRPSHIFT 3
#define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT)
#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)

/* Oprofile timer tick hook */
static int (*timer_hook)(struct pt_regs *) __read_mostly;

static atomic_t *prof_buffer;
static unsigned long prof_len, prof_shift;

int prof_on __read_mostly;
EXPORT_SYMBOL_GPL(prof_on);

static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
static DEFINE_PER_CPU(int, cpu_profile_flip);
static DEFINE_MUTEX(profile_flip_mutex);
#endif /* CONFIG_SMP */

static int __init profile_setup(char *str)
{
	static char __initdata schedstr[] = "schedule";
	static char __initdata sleepstr[] = "sleep";
	static char __initdata kvmstr[] = "kvm";
	int par;

	if (!strncmp(str, sleepstr, strlen(sleepstr))) {
#ifdef CONFIG_SCHEDSTATS
		prof_on = SLEEP_PROFILING;
		if (str[strlen(sleepstr)] == ',')
			str += strlen(sleepstr) + 1;
		if (get_option(&str, &par))
			prof_shift = par;
		printk(KERN_INFO
			"kernel sleep profiling enabled (shift: %ld)\n",
			prof_shift);
#else
		printk(KERN_WARNING
			"kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
#endif /* CONFIG_SCHEDSTATS */
	} else if (!strncmp(str, schedstr, strlen(schedstr))) {
		prof_on = SCHED_PROFILING;
		if (str[strlen(schedstr)] == ',')
			str += strlen(schedstr) + 1;
		if (get_option(&str, &par))
			prof_shift = par;
		printk(KERN_INFO
			"kernel schedule profiling enabled (shift: %ld)\n",
			prof_shift);
	} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
		prof_on = KVM_PROFILING;
		if (str[strlen(kvmstr)] == ',')
			str += strlen(kvmstr) + 1;
		if (get_option(&str, &par))
			prof_shift = par;
		printk(KERN_INFO
			"kernel KVM profiling enabled (shift: %ld)\n",
			prof_shift);
	} else if (get_option(&str, &par)) {
		prof_shift = par;
		prof_on = CPU_PROFILING;
		printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
			prof_shift);
	}
	return 1;
}
__setup("profile=", profile_setup);

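/*
 * A few illustrative boot command lines, matching the parsing above
 * (only a sketch of the syntax, not an exhaustive list):
 *
 *	profile=2		CPU-time profiling, one counter per
 *				2^2 = 4 bytes of kernel text
 *	profile=schedule,4	count schedule() calls, one counter per
 *				16 bytes of text
 *	profile=sleep,8		sleep profiling (needs CONFIG_SCHEDSTATS)
 *	profile=kvm,4		KVM profiling
 */
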
void __init profile_init(void)
{
	if (!prof_on)
		return;

	/* only text is profiled */
	prof_len = (_etext - _stext) >> prof_shift;
	prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t));
}

/* Profile event notifications */

static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
static BLOCKING_NOTIFIER_HEAD(munmap_notifier);

void profile_task_exit(struct task_struct *task)
{
	blocking_notifier_call_chain(&task_exit_notifier, 0, task);
}

int profile_handoff_task(struct task_struct *task)
{
	int ret;
	ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
	return (ret == NOTIFY_OK) ? 1 : 0;
}

void profile_munmap(unsigned long addr)
{
	blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
}

int task_handoff_register(struct notifier_block *n)
{
	return atomic_notifier_chain_register(&task_free_notifier, n);
}
EXPORT_SYMBOL_GPL(task_handoff_register);

int task_handoff_unregister(struct notifier_block *n)
{
	return atomic_notifier_chain_unregister(&task_free_notifier, n);
}
EXPORT_SYMBOL_GPL(task_handoff_unregister);

int profile_event_register(enum profile_type type, struct notifier_block *n)
{
	int err = -EINVAL;

	switch (type) {
	case PROFILE_TASK_EXIT:
		err = blocking_notifier_chain_register(
				&task_exit_notifier, n);
		break;
	case PROFILE_MUNMAP:
		err = blocking_notifier_chain_register(
				&munmap_notifier, n);
		break;
	}

	return err;
}
EXPORT_SYMBOL_GPL(profile_event_register);

int profile_event_unregister(enum profile_type type, struct notifier_block *n)
{
	int err = -EINVAL;

	switch (type) {
	case PROFILE_TASK_EXIT:
		err = blocking_notifier_chain_unregister(
				&task_exit_notifier, n);
		break;
	case PROFILE_MUNMAP:
		err = blocking_notifier_chain_unregister(
				&munmap_notifier, n);
		break;
	}

	return err;
}
EXPORT_SYMBOL_GPL(profile_event_unregister);

int register_timer_hook(int (*hook)(struct pt_regs *))
{
	if (timer_hook)
		return -EBUSY;
	timer_hook = hook;
	return 0;
}
EXPORT_SYMBOL_GPL(register_timer_hook);

void unregister_timer_hook(int (*hook)(struct pt_regs *))
{
	WARN_ON(hook != timer_hook);
	timer_hook = NULL;
	/* make sure all CPUs see the NULL hook */
	synchronize_sched();  /* Allow ongoing interrupts to complete. */
}
EXPORT_SYMBOL_GPL(unregister_timer_hook);


#ifdef CONFIG_SMP
/*
 * Each cpu has a pair of open-addressed hashtables for pending
 * profile hits. read_profile() IPI's all cpus to request them
 * to flip buffers and flushes their contents to prof_buffer itself.
 * Flip requests are serialized by the profile_flip_mutex. The sole
 * use of having a second hashtable is for avoiding cacheline
 * contention that would otherwise happen during flushes of pending
 * profile hits required for the accuracy of reported profile hits
 * and so resurrect the interrupt livelock issue.
 *
 * The open-addressed hashtables are indexed by profile buffer slot
 * and hold the number of pending hits to that profile buffer slot on
 * a cpu in an entry. When the hashtable overflows, all pending hits
 * are accounted to their corresponding profile buffer slots with
 * atomic_add() and the hashtable emptied. As numerous pending hits
 * may be accounted to a profile buffer slot in a hashtable entry,
 * this amortizes a number of atomic profile buffer increments likely
 * to be far larger than the number of entries in the hashtable,
 * particularly given that the number of distinct profile buffer
 * positions to which hits are accounted during short intervals (e.g.
 * several seconds) is usually very small. Exclusion from buffer
 * flipping is provided by interrupt disablement (note that for
 * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
 * process context).
 * The hash function is meant to be lightweight as opposed to strong,
 * and was vaguely inspired by ppc64 firmware-supported inverted
 * pagetable hash functions, but uses a full hashtable full of finite
 * collision chains, not just pairs of them.
 *
 * -- wli
 */
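/*
 * A rough sketch of how profile_hits() below indexes these tables.  With
 * the defaults above and a 4096-byte page, NR_PROFILE_HIT is 512 entries
 * split into 64 groups of PROFILE_GRPSZ == 8.  For a (shifted, clamped)
 * profile slot pc:
 *
 *	primary   = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
 *	secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
 *
 * Each probe scans one group of 8 entries linearly, either bumping an
 * entry that already holds this pc or claiming an empty one; on a full
 * group the index advances by the secondary stride (mod NR_PROFILE_HIT)
 * until it wraps back to the primary group, at which point the current
 * hit is accounted straight to prof_buffer and the whole table flushed.
 */
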
static void __profile_flip_buffers(void *unused)
{
	int cpu = smp_processor_id();

	per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
}

static void profile_flip_buffers(void)
{
	int i, j, cpu;

	mutex_lock(&profile_flip_mutex);
	j = per_cpu(cpu_profile_flip, get_cpu());
	put_cpu();
	on_each_cpu(__profile_flip_buffers, NULL, 1);
	for_each_online_cpu(cpu) {
		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
		for (i = 0; i < NR_PROFILE_HIT; ++i) {
			if (!hits[i].hits) {
				if (hits[i].pc)
					hits[i].pc = 0;
				continue;
			}
			atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
			hits[i].hits = hits[i].pc = 0;
		}
	}
	mutex_unlock(&profile_flip_mutex);
}

static void profile_discard_flip_buffers(void)
{
	int i, cpu;

	mutex_lock(&profile_flip_mutex);
	i = per_cpu(cpu_profile_flip, get_cpu());
	put_cpu();
	on_each_cpu(__profile_flip_buffers, NULL, 1);
	for_each_online_cpu(cpu) {
		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
		memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
	}
	mutex_unlock(&profile_flip_mutex);
}

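/*
 * profile_flip_buffers() is driven by read_profile() below: after the
 * flip it drains the table each CPU had been filling, while interrupt
 * context carries on undisturbed in the other one.
 * profile_discard_flip_buffers() is the write_profile() counterpart; it
 * throws the pending hits away instead of folding them into prof_buffer.
 */
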
void profile_hits(int type, void *__pc, unsigned int nr_hits)
{
	unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
	int i, j, cpu;
	struct profile_hit *hits;

	if (prof_on != type || !prof_buffer)
		return;
	pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
	i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	cpu = get_cpu();
	hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
	if (!hits) {
		put_cpu();
		return;
	}
	/*
	 * We buffer the global profiler buffer into a per-CPU
	 * queue and thus reduce the number of global (and possibly
	 * NUMA-alien) accesses. The write-queue is self-coalescing:
	 */
	local_irq_save(flags);
	do {
		for (j = 0; j < PROFILE_GRPSZ; ++j) {
			if (hits[i + j].pc == pc) {
				hits[i + j].hits += nr_hits;
				goto out;
			} else if (!hits[i + j].hits) {
				hits[i + j].pc = pc;
				hits[i + j].hits = nr_hits;
				goto out;
			}
		}
		i = (i + secondary) & (NR_PROFILE_HIT - 1);
	} while (i != primary);

	/*
	 * Add the current hit(s) and flush the write-queue out
	 * to the global buffer:
	 */
	atomic_add(nr_hits, &prof_buffer[pc]);
	for (i = 0; i < NR_PROFILE_HIT; ++i) {
		atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
		hits[i].pc = hits[i].hits = 0;
	}
out:
	local_irq_restore(flags);
	put_cpu();
}

static int __devinit profile_cpu_callback(struct notifier_block *info,
					unsigned long action, void *__cpu)
{
	int node, cpu = (unsigned long)__cpu;
	struct page *page;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		node = cpu_to_node(cpu);
		per_cpu(cpu_profile_flip, cpu) = 0;
		if (!per_cpu(cpu_profile_hits, cpu)[1]) {
			page = alloc_pages_node(node,
					GFP_KERNEL | __GFP_ZERO,
					0);
			if (!page)
				return NOTIFY_BAD;
			per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
		}
		if (!per_cpu(cpu_profile_hits, cpu)[0]) {
			page = alloc_pages_node(node,
					GFP_KERNEL | __GFP_ZERO,
					0);
			if (!page)
				goto out_free;
			per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
		}
		break;
out_free:
		page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
		per_cpu(cpu_profile_hits, cpu)[1] = NULL;
		__free_page(page);
		return NOTIFY_BAD;
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		cpu_set(cpu, prof_cpu_mask);
		break;
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		cpu_clear(cpu, prof_cpu_mask);
		if (per_cpu(cpu_profile_hits, cpu)[0]) {
			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
			per_cpu(cpu_profile_hits, cpu)[0] = NULL;
			__free_page(page);
		}
		if (per_cpu(cpu_profile_hits, cpu)[1]) {
			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
			per_cpu(cpu_profile_hits, cpu)[1] = NULL;
			__free_page(page);
		}
		break;
	}
	return NOTIFY_OK;
}
#else /* !CONFIG_SMP */
#define profile_flip_buffers()		do { } while (0)
#define profile_discard_flip_buffers()	do { } while (0)
#define profile_cpu_callback		NULL

void profile_hits(int type, void *__pc, unsigned int nr_hits)
{
	unsigned long pc;

	if (prof_on != type || !prof_buffer)
		return;
	pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
	atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
}
#endif /* !CONFIG_SMP */
EXPORT_SYMBOL_GPL(profile_hits);

void profile_tick(int type)
{
	struct pt_regs *regs = get_irq_regs();

	if (type == CPU_PROFILING && timer_hook)
		timer_hook(regs);
	if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask))
		profile_hit(type, (void *)profile_pc(regs));
}

#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <asm/uaccess.h>
#include <asm/ptrace.h>

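/*
 * /proc/irq/prof_cpu_mask restricts which CPUs account profile hits:
 * profile_tick() only calls profile_hit() when the current CPU is set in
 * prof_cpu_mask.  The file holds a hex cpumask (the usual
 * cpumask_scnprintf()/cpumask_parse_user() format); writing e.g. "3"
 * limits profiling to CPUs 0 and 1.  The default mask allows all CPUs.
 */
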
static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
			int count, int *eof, void *data)
{
	int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
	if (count - len < 2)
		return -EINVAL;
	len += sprintf(page + len, "\n");
	return len;
}

static int prof_cpu_mask_write_proc(struct file *file,
			const char __user *buffer, unsigned long count, void *data)
{
	cpumask_t *mask = (cpumask_t *)data;
	unsigned long full_count = count, err;
	cpumask_t new_value;

	err = cpumask_parse_user(buffer, count, new_value);
	if (err)
		return err;

	*mask = new_value;
	return full_count;
}

void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
{
	struct proc_dir_entry *entry;

	/* create /proc/irq/prof_cpu_mask */
	entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
	if (!entry)
		return;
	entry->data = (void *)&prof_cpu_mask;
	entry->read_proc = prof_cpu_mask_read_proc;
	entry->write_proc = prof_cpu_mask_write_proc;
}

/*
 * This function accesses profiling information. The returned data is
 * binary: the sampling step and the actual contents of the profile
 * buffer. Use of the program readprofile is recommended in order to
 * get meaningful info out of these data.
 */
static ssize_t
read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	unsigned long p = *ppos;
	ssize_t read;
	char *pnt;
	unsigned int sample_step = 1 << prof_shift;

	profile_flip_buffers();
	if (p >= (prof_len+1)*sizeof(unsigned int))
		return 0;
	if (count > (prof_len+1)*sizeof(unsigned int) - p)
		count = (prof_len+1)*sizeof(unsigned int) - p;
	read = 0;

	while (p < sizeof(unsigned int) && count > 0) {
		if (put_user(*((char *)(&sample_step)+p), buf))
			return -EFAULT;
		buf++; p++; count--; read++;
	}
	pnt = (char *)prof_buffer + p - sizeof(atomic_t);
	if (copy_to_user(buf, (void *)pnt, count))
		return -EFAULT;
	read += count;
	*ppos += read;
	return read;
}

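/*
 * For illustration, the byte layout read_profile() above hands out,
 * assuming the common case of 4-byte unsigned int and atomic_t:
 *
 *	bytes 0..3	sample_step == 1 << prof_shift
 *	bytes 4..	prof_len counters; counter i covers kernel text
 *			starting at _stext + (i << prof_shift), with
 *			out-of-range pcs clamped into the last slot by
 *			profile_hits()
 *
 * This is the format readprofile(1) consumes.
 */
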
/*
 * Writing to /proc/profile resets the counters
 *
 * Writing a 'profiling multiplier' value into it also re-sets the profiling
 * interrupt frequency, on architectures that support this.
 */
static ssize_t write_profile(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
#ifdef CONFIG_SMP
	extern int setup_profiling_timer(unsigned int multiplier);

	if (count == sizeof(int)) {
		unsigned int multiplier;

		if (copy_from_user(&multiplier, buf, sizeof(int)))
			return -EFAULT;

		if (setup_profiling_timer(multiplier))
			return -EINVAL;
	}
#endif
	profile_discard_flip_buffers();
	memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
	return count;
}

static const struct file_operations proc_profile_operations = {
	.read		= read_profile,
	.write		= write_profile,
};

#ifdef CONFIG_SMP
static void __init profile_nop(void *unused)
{
}

static int __init create_hash_tables(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		int node = cpu_to_node(cpu);
		struct page *page;

		page = alloc_pages_node(node,
				GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
				0);
		if (!page)
			goto out_cleanup;
		per_cpu(cpu_profile_hits, cpu)[1]
				= (struct profile_hit *)page_address(page);
		page = alloc_pages_node(node,
				GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
				0);
		if (!page)
			goto out_cleanup;
		per_cpu(cpu_profile_hits, cpu)[0]
				= (struct profile_hit *)page_address(page);
	}
	return 0;
out_cleanup:
	prof_on = 0;
	smp_mb();
	on_each_cpu(profile_nop, NULL, 1);
	for_each_online_cpu(cpu) {
		struct page *page;

		if (per_cpu(cpu_profile_hits, cpu)[0]) {
			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
			per_cpu(cpu_profile_hits, cpu)[0] = NULL;
			__free_page(page);
		}
		if (per_cpu(cpu_profile_hits, cpu)[1]) {
			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
			per_cpu(cpu_profile_hits, cpu)[1] = NULL;
			__free_page(page);
		}
	}
	return -1;
}
#else
#define create_hash_tables()			({ 0; })
#endif

static int __init create_proc_profile(void)
{
	struct proc_dir_entry *entry;

	if (!prof_on)
		return 0;
	if (create_hash_tables())
		return -1;
	entry = proc_create("profile", S_IWUSR | S_IRUGO,
			NULL, &proc_profile_operations);
	if (!entry)
		return 0;
	entry->size = (1+prof_len) * sizeof(atomic_t);
	hotcpu_notifier(profile_cpu_callback, 0);
	return 0;
}
module_init(create_proc_profile);
#endif /* CONFIG_PROC_FS */