/*
 *  linux/kernel/profile.c
 *  Simple profiling. Manages a direct-mapped profile hit count buffer,
 *  with configurable resolution, support for restricting the cpus on
 *  which profiling is done, and switching between cpu time and
 *  schedule() calls via kernel command line parameters passed at boot.
 *
 *  Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
 *	Red Hat, July 2004
 *  Consolidation of architecture support code for profiling,
 *	William Irwin, Oracle, July 2004
 *  Amortized hit count accounting via per-cpu open-addressed hashtables
 *	to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/profile.h>
#include <linux/bootmem.h>
#include <linux/notifier.h>
#include <linux/mm.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <asm/sections.h>
#include <asm/semaphore.h>

struct profile_hit {
	u32 pc, hits;
};
#define PROFILE_GRPSHIFT 3
#define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT)
#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)

/* Oprofile timer tick hook */
int (*timer_hook)(struct pt_regs *);

static atomic_t *prof_buffer;
static unsigned long prof_len, prof_shift;
static int prof_on;
static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
static DEFINE_PER_CPU(int, cpu_profile_flip);
static DECLARE_MUTEX(profile_flip_mutex);
#endif /* CONFIG_SMP */

static int __init profile_setup(char *str)
{
	int par;

	if (!strncmp(str, "schedule", 8)) {
		prof_on = SCHED_PROFILING;
		printk(KERN_INFO "kernel schedule profiling enabled\n");
		/* allow an optional ",<shift>" after "schedule" */
		if (str[8] == ',')
			str += 9;
		if (get_option(&str, &par))
			prof_shift = par;
	} else if (get_option(&str, &par)) {
		prof_shift = par;
		prof_on = CPU_PROFILING;
		printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
			prof_shift);
	}
	return 1;
}
__setup("profile=", profile_setup);


void __init profile_init(void)
{
	if (!prof_on)
		return;

	/* only text is profiled */
	prof_len = (_etext - _stext) >> prof_shift;
	prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t));
}
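
/*
 * Sizing example (illustrative, not from the original source): a
 * hypothetical "profile=2" boot parameter sets prof_shift to 2, so
 * every profile slot covers 1 << 2 = 4 bytes of kernel text and
 * profile_init() allocates (_etext - _stext) / 4 counters.
 * "profile=schedule,2" selects schedule() call profiling at the same
 * granularity.
 */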

/* Profile event notifications */

#ifdef CONFIG_PROFILING

static DECLARE_RWSEM(profile_rwsem);
static DEFINE_RWLOCK(handoff_lock);
static struct notifier_block *task_exit_notifier;
static struct notifier_block *task_free_notifier;
static struct notifier_block *munmap_notifier;

void profile_task_exit(struct task_struct *task)
{
	down_read(&profile_rwsem);
	notifier_call_chain(&task_exit_notifier, 0, task);
	up_read(&profile_rwsem);
}

int profile_handoff_task(struct task_struct *task)
{
	int ret;
	read_lock(&handoff_lock);
	ret = notifier_call_chain(&task_free_notifier, 0, task);
	read_unlock(&handoff_lock);
	return (ret == NOTIFY_OK) ? 1 : 0;
}

void profile_munmap(unsigned long addr)
{
	down_read(&profile_rwsem);
	notifier_call_chain(&munmap_notifier, 0, (void *)addr);
	up_read(&profile_rwsem);
}

int task_handoff_register(struct notifier_block *n)
{
	int err = -EINVAL;

	write_lock(&handoff_lock);
	err = notifier_chain_register(&task_free_notifier, n);
	write_unlock(&handoff_lock);
	return err;
}

int task_handoff_unregister(struct notifier_block *n)
{
	int err = -EINVAL;

	write_lock(&handoff_lock);
	err = notifier_chain_unregister(&task_free_notifier, n);
	write_unlock(&handoff_lock);
	return err;
}

int profile_event_register(enum profile_type type, struct notifier_block *n)
{
	int err = -EINVAL;

	down_write(&profile_rwsem);

	switch (type) {
	case PROFILE_TASK_EXIT:
		err = notifier_chain_register(&task_exit_notifier, n);
		break;
	case PROFILE_MUNMAP:
		err = notifier_chain_register(&munmap_notifier, n);
		break;
	}

	up_write(&profile_rwsem);

	return err;
}


int profile_event_unregister(enum profile_type type, struct notifier_block *n)
{
	int err = -EINVAL;

	down_write(&profile_rwsem);

	switch (type) {
	case PROFILE_TASK_EXIT:
		err = notifier_chain_unregister(&task_exit_notifier, n);
		break;
	case PROFILE_MUNMAP:
		err = notifier_chain_unregister(&munmap_notifier, n);
		break;
	}

	up_write(&profile_rwsem);
	return err;
}

int register_timer_hook(int (*hook)(struct pt_regs *))
{
	if (timer_hook)
		return -EBUSY;
	timer_hook = hook;
	return 0;
}

void unregister_timer_hook(int (*hook)(struct pt_regs *))
{
	WARN_ON(hook != timer_hook);
	timer_hook = NULL;
	/* make sure all CPUs see the NULL hook */
	synchronize_kernel();
}

EXPORT_SYMBOL_GPL(register_timer_hook);
EXPORT_SYMBOL_GPL(unregister_timer_hook);
EXPORT_SYMBOL_GPL(task_handoff_register);
EXPORT_SYMBOL_GPL(task_handoff_unregister);

#endif /* CONFIG_PROFILING */

EXPORT_SYMBOL_GPL(profile_event_register);
EXPORT_SYMBOL_GPL(profile_event_unregister);
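
#if 0
/*
 * Hypothetical client of the notifier API above -- an illustrative
 * sketch only, not part of the original file; the callback and
 * notifier_block names are invented.
 */
static int example_task_exit_notify(struct notifier_block *self,
				    unsigned long val, void *data)
{
	struct task_struct *task = data;

	printk(KERN_DEBUG "profile: task %d exiting\n", task->pid);
	return NOTIFY_OK;
}

static struct notifier_block example_task_exit_nb = {
	.notifier_call	= example_task_exit_notify,
};

/*
 * Registration would be done with:
 *	profile_event_register(PROFILE_TASK_EXIT, &example_task_exit_nb);
 * after which the callback runs from profile_task_exit() with
 * profile_rwsem held for read.
 */
#endif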

#ifdef CONFIG_SMP
/*
 * Each cpu has a pair of open-addressed hashtables for pending
 * profile hits. read_profile() IPI's all cpus to request them
 * to flip buffers and flushes their contents to prof_buffer itself.
 * Flip requests are serialized by the profile_flip_mutex. The only
 * reason for having a second hashtable at all is to avoid the
 * cacheline contention that would otherwise occur during the flushes
 * of pending profile hits (flushes are required for the accuracy of
 * reported profile hits), which would resurrect the interrupt
 * livelock issue.
 *
 * The open-addressed hashtables are indexed by profile buffer slot
 * and hold the number of pending hits to that profile buffer slot on
 * a cpu in an entry. When the hashtable overflows, all pending hits
 * are accounted to their corresponding profile buffer slots with
 * atomic_add() and the hashtable emptied. As numerous pending hits
 * may be accounted to a profile buffer slot in a hashtable entry,
 * this amortizes a number of atomic profile buffer increments likely
 * to be far larger than the number of entries in the hashtable,
 * particularly given that the number of distinct profile buffer
 * positions to which hits are accounted during short intervals (e.g.
 * several seconds) is usually very small. Exclusion from buffer
 * flipping is provided by interrupt disablement (note that for
 * SCHED_PROFILING profile_hit() may be called from process context).
 * The hash function is meant to be lightweight as opposed to strong,
 * and was vaguely inspired by ppc64 firmware-supported inverted
 * pagetable hash functions, but uses a full hashtable of finite
 * collision chains, not just pairs of them.
 *
 * -- wli
 */
static void __profile_flip_buffers(void *unused)
{
	int cpu = smp_processor_id();

	per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
}

static void profile_flip_buffers(void)
{
	int i, j, cpu;

	down(&profile_flip_mutex);
	j = per_cpu(cpu_profile_flip, get_cpu());
	put_cpu();
	on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
	for_each_online_cpu(cpu) {
		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
		for (i = 0; i < NR_PROFILE_HIT; ++i) {
			if (!hits[i].hits) {
				if (hits[i].pc)
					hits[i].pc = 0;
				continue;
			}
			atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
			hits[i].hits = hits[i].pc = 0;
		}
	}
	up(&profile_flip_mutex);
}

static void profile_discard_flip_buffers(void)
{
	int i, cpu;

	down(&profile_flip_mutex);
	i = per_cpu(cpu_profile_flip, get_cpu());
	put_cpu();
	on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
	for_each_online_cpu(cpu) {
		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
		memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
	}
	up(&profile_flip_mutex);
}

void profile_hit(int type, void *__pc)
{
	unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
	int i, j, cpu;
	struct profile_hit *hits;

	if (prof_on != type || !prof_buffer)
		return;
	pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
	i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	cpu = get_cpu();
	hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
	if (!hits) {
		put_cpu();
		return;
	}
	local_irq_save(flags);
	do {
		for (j = 0; j < PROFILE_GRPSZ; ++j) {
			if (hits[i + j].pc == pc) {
				hits[i + j].hits++;
				goto out;
			} else if (!hits[i + j].hits) {
				hits[i + j].pc = pc;
				hits[i + j].hits = 1;
				goto out;
			}
		}
		i = (i + secondary) & (NR_PROFILE_HIT - 1);
	} while (i != primary);
	/* hashtable full: account this hit and flush everything */
	atomic_inc(&prof_buffer[pc]);
	for (i = 0; i < NR_PROFILE_HIT; ++i) {
		atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
		hits[i].pc = hits[i].hits = 0;
	}
out:
	local_irq_restore(flags);
	put_cpu();
}
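
/*
 * Worked example of the probe sequence in profile_hit() above, with
 * illustrative numbers that are not from the original source: assume
 * PAGE_SIZE is 4096, so NR_PROFILE_HIT is 512, PROFILE_GRPSZ is 8 and
 * NR_PROFILE_GRP is 64.  Once pc has been reduced to a profile buffer
 * slot index, say pc == 100:
 *
 *	primary   = (100 & 63) << 3         = 36 * 8 = 288
 *	secondary = (~(100 << 1) & 63) << 3 = 55 * 8 = 440
 *
 * Entries 288..295 (group 36) are scanned first, then groups 27, 18,
 * and so on.  The group stride (55 here) is always odd and therefore
 * coprime to the 64 groups, so every group is examined before the
 * do/while loop gives up and falls back to flushing the whole table.
 */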

#ifdef CONFIG_HOTPLUG_CPU
static int __devinit profile_cpu_callback(struct notifier_block *info,
					unsigned long action, void *__cpu)
{
	int node, cpu = (unsigned long)__cpu;
	struct page *page;

	switch (action) {
	case CPU_UP_PREPARE:
		node = cpu_to_node(cpu);
		per_cpu(cpu_profile_flip, cpu) = 0;
		if (!per_cpu(cpu_profile_hits, cpu)[1]) {
			page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
			if (!page)
				return NOTIFY_BAD;
			per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
		}
		if (!per_cpu(cpu_profile_hits, cpu)[0]) {
			page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
			if (!page)
				goto out_free;
			per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
		}
		break;
	out_free:
		page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
		per_cpu(cpu_profile_hits, cpu)[1] = NULL;
		__free_page(page);
		return NOTIFY_BAD;
	case CPU_ONLINE:
		cpu_set(cpu, prof_cpu_mask);
		break;
	case CPU_UP_CANCELED:
	case CPU_DEAD:
		cpu_clear(cpu, prof_cpu_mask);
		if (per_cpu(cpu_profile_hits, cpu)[0]) {
			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
			per_cpu(cpu_profile_hits, cpu)[0] = NULL;
			__free_page(page);
		}
		if (per_cpu(cpu_profile_hits, cpu)[1]) {
			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
			per_cpu(cpu_profile_hits, cpu)[1] = NULL;
			__free_page(page);
		}
		break;
	}
	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
#else /* !CONFIG_SMP */
#define profile_flip_buffers()		do { } while (0)
#define profile_discard_flip_buffers()	do { } while (0)

void profile_hit(int type, void *__pc)
{
	unsigned long pc;

	if (prof_on != type || !prof_buffer)
		return;
	pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
	atomic_inc(&prof_buffer[min(pc, prof_len - 1)]);
}
#endif /* !CONFIG_SMP */

void profile_tick(int type, struct pt_regs *regs)
{
	if (type == CPU_PROFILING && timer_hook)
		timer_hook(regs);
	if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask))
		profile_hit(type, (void *)profile_pc(regs));
}

#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <asm/uaccess.h>
#include <asm/ptrace.h>

static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
			int count, int *eof, void *data)
{
	int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
	if (count - len < 2)
		return -EINVAL;
	len += sprintf(page + len, "\n");
	return len;
}

static int prof_cpu_mask_write_proc(struct file *file, const char __user *buffer,
					unsigned long count, void *data)
{
	cpumask_t *mask = (cpumask_t *)data;
	unsigned long full_count = count, err;
	cpumask_t new_value;

	err = cpumask_parse(buffer, count, new_value);
	if (err)
		return err;

	*mask = new_value;
	return full_count;
}

void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
{
	struct proc_dir_entry *entry;

	/* create /proc/irq/prof_cpu_mask */
	if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir)))
		return;
	entry->nlink = 1;
	entry->data = (void *)&prof_cpu_mask;
	entry->read_proc = prof_cpu_mask_read_proc;
	entry->write_proc = prof_cpu_mask_write_proc;
}
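
/*
 * Illustrative use of /proc/irq/prof_cpu_mask (a hypothetical shell
 * command, not from this file): the mask is parsed by cpumask_parse()
 * as a hex cpu bitmask, so on a 4-cpu box
 *
 *	echo 3 > /proc/irq/prof_cpu_mask
 *
 * restricts profile hit accounting in profile_tick() to cpus 0 and 1.
 */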

/*
 * This function accesses profiling information. The returned data is
 * binary: the first sizeof(unsigned int) bytes hold the sampling step
 * (1 << prof_shift) and the rest is the actual contents of the
 * profile buffer, one counter per profiled text slot. Use of the
 * program readprofile is recommended in order to get meaningful info
 * out of these data.
 */
static ssize_t
read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	unsigned long p = *ppos;
	ssize_t read;
	char *pnt;
	unsigned int sample_step = 1 << prof_shift;

	profile_flip_buffers();
	if (p >= (prof_len+1)*sizeof(unsigned int))
		return 0;
	if (count > (prof_len+1)*sizeof(unsigned int) - p)
		count = (prof_len+1)*sizeof(unsigned int) - p;
	read = 0;

	while (p < sizeof(unsigned int) && count > 0) {
		put_user(*((char *)(&sample_step)+p),buf);
		buf++; p++; count--; read++;
	}
	pnt = (char *)prof_buffer + p - sizeof(atomic_t);
	if (copy_to_user(buf,(void *)pnt,count))
		return -EFAULT;
	read += count;
	*ppos += read;
	return read;
}

/*
 * Writing to /proc/profile resets the counters.
 *
 * Writing a 'profiling multiplier' value into it also resets the profiling
 * interrupt frequency, on architectures that support this.
 */
static ssize_t write_profile(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
#ifdef CONFIG_SMP
	extern int setup_profiling_timer(unsigned int multiplier);

	if (count == sizeof(int)) {
		unsigned int multiplier;

		if (copy_from_user(&multiplier, buf, sizeof(int)))
			return -EFAULT;

		if (setup_profiling_timer(multiplier))
			return -EINVAL;
	}
#endif
	profile_discard_flip_buffers();
	memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
	return count;
}

static struct file_operations proc_profile_operations = {
	.read		= read_profile,
	.write		= write_profile,
};

#ifdef CONFIG_SMP
static void __init profile_nop(void *unused)
{
}

static int __init create_hash_tables(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		int node = cpu_to_node(cpu);
		struct page *page;

		page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
		if (!page)
			goto out_cleanup;
		per_cpu(cpu_profile_hits, cpu)[1]
				= (struct profile_hit *)page_address(page);
		page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
		if (!page)
			goto out_cleanup;
		per_cpu(cpu_profile_hits, cpu)[0]
				= (struct profile_hit *)page_address(page);
	}
	return 0;
out_cleanup:
	prof_on = 0;
	mb();	/* make sure the cleared prof_on is visible to all cpus */
	on_each_cpu(profile_nop, NULL, 0, 1);
	for_each_online_cpu(cpu) {
		struct page *page;

		if (per_cpu(cpu_profile_hits, cpu)[0]) {
			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
			per_cpu(cpu_profile_hits, cpu)[0] = NULL;
			__free_page(page);
		}
		if (per_cpu(cpu_profile_hits, cpu)[1]) {
			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
			per_cpu(cpu_profile_hits, cpu)[1] = NULL;
			__free_page(page);
		}
	}
	return -1;
}
#else
#define create_hash_tables()	({ 0; })
#endif

static int __init create_proc_profile(void)
{
	struct proc_dir_entry *entry;

	if (!prof_on)
		return 0;
	if (create_hash_tables())
		return -1;
	if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL)))
		return 0;
	entry->proc_fops = &proc_profile_operations;
	entry->size = (1 + prof_len) * sizeof(atomic_t);
	hotcpu_notifier(profile_cpu_callback, 0);
	return 0;
}
module_init(create_proc_profile);
#endif /* CONFIG_PROC_FS */