1 /* 2 * Performance events core code: 3 * 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 8 * 9 * For licensing details see kernel-base/COPYING 10 */ 11 12 #include <linux/fs.h> 13 #include <linux/mm.h> 14 #include <linux/cpu.h> 15 #include <linux/smp.h> 16 #include <linux/idr.h> 17 #include <linux/file.h> 18 #include <linux/poll.h> 19 #include <linux/slab.h> 20 #include <linux/hash.h> 21 #include <linux/tick.h> 22 #include <linux/sysfs.h> 23 #include <linux/dcache.h> 24 #include <linux/percpu.h> 25 #include <linux/ptrace.h> 26 #include <linux/reboot.h> 27 #include <linux/vmstat.h> 28 #include <linux/device.h> 29 #include <linux/export.h> 30 #include <linux/vmalloc.h> 31 #include <linux/hardirq.h> 32 #include <linux/rculist.h> 33 #include <linux/uaccess.h> 34 #include <linux/syscalls.h> 35 #include <linux/anon_inodes.h> 36 #include <linux/kernel_stat.h> 37 #include <linux/perf_event.h> 38 #include <linux/ftrace_event.h> 39 #include <linux/hw_breakpoint.h> 40 #include <linux/mm_types.h> 41 #include <linux/cgroup.h> 42 #include <linux/module.h> 43 #include <linux/mman.h> 44 #include <linux/compat.h> 45 46 #include "internal.h" 47 48 #include <asm/irq_regs.h> 49 50 struct remote_function_call { 51 struct task_struct *p; 52 int (*func)(void *info); 53 void *info; 54 int ret; 55 }; 56 57 static void remote_function(void *data) 58 { 59 struct remote_function_call *tfc = data; 60 struct task_struct *p = tfc->p; 61 62 if (p) { 63 tfc->ret = -EAGAIN; 64 if (task_cpu(p) != smp_processor_id() || !task_curr(p)) 65 return; 66 } 67 68 tfc->ret = tfc->func(tfc->info); 69 } 70 71 /** 72 * task_function_call - call a function on the cpu on which a task runs 73 * @p: the task to evaluate 74 * @func: the function to be called 75 * @info: the function call argument 76 * 77 * Calls the function @func when the task is currently running. This might 78 * be on the current CPU, which just calls the function directly 79 * 80 * returns: @func return value, or 81 * -ESRCH - when the process isn't running 82 * -EAGAIN - when the process moved away 83 */ 84 static int 85 task_function_call(struct task_struct *p, int (*func) (void *info), void *info) 86 { 87 struct remote_function_call data = { 88 .p = p, 89 .func = func, 90 .info = info, 91 .ret = -ESRCH, /* No such (running) process */ 92 }; 93 94 if (task_curr(p)) 95 smp_call_function_single(task_cpu(p), remote_function, &data, 1); 96 97 return data.ret; 98 } 99 100 /** 101 * cpu_function_call - call a function on the cpu 102 * @func: the function to be called 103 * @info: the function call argument 104 * 105 * Calls the function @func on the remote cpu. 
106 * 107 * returns: @func return value or -ENXIO when the cpu is offline 108 */ 109 static int cpu_function_call(int cpu, int (*func) (void *info), void *info) 110 { 111 struct remote_function_call data = { 112 .p = NULL, 113 .func = func, 114 .info = info, 115 .ret = -ENXIO, /* No such CPU */ 116 }; 117 118 smp_call_function_single(cpu, remote_function, &data, 1); 119 120 return data.ret; 121 } 122 123 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ 124 PERF_FLAG_FD_OUTPUT |\ 125 PERF_FLAG_PID_CGROUP |\ 126 PERF_FLAG_FD_CLOEXEC) 127 128 /* 129 * branch priv levels that need permission checks 130 */ 131 #define PERF_SAMPLE_BRANCH_PERM_PLM \ 132 (PERF_SAMPLE_BRANCH_KERNEL |\ 133 PERF_SAMPLE_BRANCH_HV) 134 135 enum event_type_t { 136 EVENT_FLEXIBLE = 0x1, 137 EVENT_PINNED = 0x2, 138 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 139 }; 140 141 /* 142 * perf_sched_events : >0 events exist 143 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 144 */ 145 struct static_key_deferred perf_sched_events __read_mostly; 146 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 147 static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); 148 149 static atomic_t nr_mmap_events __read_mostly; 150 static atomic_t nr_comm_events __read_mostly; 151 static atomic_t nr_task_events __read_mostly; 152 static atomic_t nr_freq_events __read_mostly; 153 154 static LIST_HEAD(pmus); 155 static DEFINE_MUTEX(pmus_lock); 156 static struct srcu_struct pmus_srcu; 157 158 /* 159 * perf event paranoia level: 160 * -1 - not paranoid at all 161 * 0 - disallow raw tracepoint access for unpriv 162 * 1 - disallow cpu events for unpriv 163 * 2 - disallow kernel profiling for unpriv 164 */ 165 int sysctl_perf_event_paranoid __read_mostly = 1; 166 167 /* Minimum for 512 kiB + 1 user control page */ 168 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ 169 170 /* 171 * max perf event sample rate 172 */ 173 #define DEFAULT_MAX_SAMPLE_RATE 100000 174 #define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE) 175 #define DEFAULT_CPU_TIME_MAX_PERCENT 25 176 177 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; 178 179 static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); 180 static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; 181 182 static int perf_sample_allowed_ns __read_mostly = 183 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; 184 185 void update_perf_cpu_limits(void) 186 { 187 u64 tmp = perf_sample_period_ns; 188 189 tmp *= sysctl_perf_cpu_time_max_percent; 190 do_div(tmp, 100); 191 ACCESS_ONCE(perf_sample_allowed_ns) = tmp; 192 } 193 194 static int perf_rotate_context(struct perf_cpu_context *cpuctx); 195 196 int perf_proc_update_handler(struct ctl_table *table, int write, 197 void __user *buffer, size_t *lenp, 198 loff_t *ppos) 199 { 200 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 201 202 if (ret || !write) 203 return ret; 204 205 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); 206 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; 207 update_perf_cpu_limits(); 208 209 return 0; 210 } 211 212 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; 213 214 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, 215 void __user *buffer, size_t *lenp, 216 loff_t *ppos) 217 { 218 int ret = proc_dointvec(table, write, buffer, lenp, ppos); 219 220 if 
(ret || !write) 221 return ret; 222 223 update_perf_cpu_limits(); 224 225 return 0; 226 } 227 228 /* 229 * perf samples are done in some very critical code paths (NMIs). 230 * If they take too much CPU time, the system can lock up and not 231 * get any real work done. This will drop the sample rate when 232 * we detect that events are taking too long. 233 */ 234 #define NR_ACCUMULATED_SAMPLES 128 235 static DEFINE_PER_CPU(u64, running_sample_length); 236 237 static void perf_duration_warn(struct irq_work *w) 238 { 239 u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); 240 u64 avg_local_sample_len; 241 u64 local_samples_len; 242 243 local_samples_len = __get_cpu_var(running_sample_length); 244 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; 245 246 printk_ratelimited(KERN_WARNING 247 "perf interrupt took too long (%lld > %lld), lowering " 248 "kernel.perf_event_max_sample_rate to %d\n", 249 avg_local_sample_len, allowed_ns >> 1, 250 sysctl_perf_event_sample_rate); 251 } 252 253 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); 254 255 void perf_sample_event_took(u64 sample_len_ns) 256 { 257 u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); 258 u64 avg_local_sample_len; 259 u64 local_samples_len; 260 261 if (allowed_ns == 0) 262 return; 263 264 /* decay the counter by 1 average sample */ 265 local_samples_len = __get_cpu_var(running_sample_length); 266 local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; 267 local_samples_len += sample_len_ns; 268 __get_cpu_var(running_sample_length) = local_samples_len; 269 270 /* 271 * note: this will be biased artifically low until we have 272 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us 273 * from having to maintain a count. 274 */ 275 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; 276 277 if (avg_local_sample_len <= allowed_ns) 278 return; 279 280 if (max_samples_per_tick <= 1) 281 return; 282 283 max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2); 284 sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; 285 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; 286 287 update_perf_cpu_limits(); 288 289 if (!irq_work_queue(&perf_duration_work)) { 290 early_printk("perf interrupt took too long (%lld > %lld), lowering " 291 "kernel.perf_event_max_sample_rate to %d\n", 292 avg_local_sample_len, allowed_ns >> 1, 293 sysctl_perf_event_sample_rate); 294 } 295 } 296 297 static atomic64_t perf_event_id; 298 299 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, 300 enum event_type_t event_type); 301 302 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 303 enum event_type_t event_type, 304 struct task_struct *task); 305 306 static void update_context_time(struct perf_event_context *ctx); 307 static u64 perf_event_time(struct perf_event *event); 308 309 void __weak perf_event_print_debug(void) { } 310 311 extern __weak const char *perf_pmu_name(void) 312 { 313 return "pmu"; 314 } 315 316 static inline u64 perf_clock(void) 317 { 318 return local_clock(); 319 } 320 321 static inline struct perf_cpu_context * 322 __get_cpu_context(struct perf_event_context *ctx) 323 { 324 return this_cpu_ptr(ctx->pmu->pmu_cpu_context); 325 } 326 327 static void perf_ctx_lock(struct perf_cpu_context *cpuctx, 328 struct perf_event_context *ctx) 329 { 330 raw_spin_lock(&cpuctx->ctx.lock); 331 if (ctx) 332 raw_spin_lock(&ctx->lock); 333 } 334 335 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, 336 struct perf_event_context *ctx) 337 { 338 if 
(ctx) 339 raw_spin_unlock(&ctx->lock); 340 raw_spin_unlock(&cpuctx->ctx.lock); 341 } 342 343 #ifdef CONFIG_CGROUP_PERF 344 345 /* 346 * perf_cgroup_info keeps track of time_enabled for a cgroup. 347 * This is a per-cpu dynamically allocated data structure. 348 */ 349 struct perf_cgroup_info { 350 u64 time; 351 u64 timestamp; 352 }; 353 354 struct perf_cgroup { 355 struct cgroup_subsys_state css; 356 struct perf_cgroup_info __percpu *info; 357 }; 358 359 /* 360 * Must ensure cgroup is pinned (css_get) before calling 361 * this function. In other words, we cannot call this function 362 * if there is no cgroup event for the current CPU context. 363 */ 364 static inline struct perf_cgroup * 365 perf_cgroup_from_task(struct task_struct *task) 366 { 367 return container_of(task_css(task, perf_event_cgrp_id), 368 struct perf_cgroup, css); 369 } 370 371 static inline bool 372 perf_cgroup_match(struct perf_event *event) 373 { 374 struct perf_event_context *ctx = event->ctx; 375 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 376 377 /* @event doesn't care about cgroup */ 378 if (!event->cgrp) 379 return true; 380 381 /* wants specific cgroup scope but @cpuctx isn't associated with any */ 382 if (!cpuctx->cgrp) 383 return false; 384 385 /* 386 * Cgroup scoping is recursive. An event enabled for a cgroup is 387 * also enabled for all its descendant cgroups. If @cpuctx's 388 * cgroup is a descendant of @event's (the test covers identity 389 * case), it's a match. 390 */ 391 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup, 392 event->cgrp->css.cgroup); 393 } 394 395 static inline void perf_put_cgroup(struct perf_event *event) 396 { 397 css_put(&event->cgrp->css); 398 } 399 400 static inline void perf_detach_cgroup(struct perf_event *event) 401 { 402 perf_put_cgroup(event); 403 event->cgrp = NULL; 404 } 405 406 static inline int is_cgroup_event(struct perf_event *event) 407 { 408 return event->cgrp != NULL; 409 } 410 411 static inline u64 perf_cgroup_event_time(struct perf_event *event) 412 { 413 struct perf_cgroup_info *t; 414 415 t = per_cpu_ptr(event->cgrp->info, event->cpu); 416 return t->time; 417 } 418 419 static inline void __update_cgrp_time(struct perf_cgroup *cgrp) 420 { 421 struct perf_cgroup_info *info; 422 u64 now; 423 424 now = perf_clock(); 425 426 info = this_cpu_ptr(cgrp->info); 427 428 info->time += now - info->timestamp; 429 info->timestamp = now; 430 } 431 432 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) 433 { 434 struct perf_cgroup *cgrp_out = cpuctx->cgrp; 435 if (cgrp_out) 436 __update_cgrp_time(cgrp_out); 437 } 438 439 static inline void update_cgrp_time_from_event(struct perf_event *event) 440 { 441 struct perf_cgroup *cgrp; 442 443 /* 444 * ensure we access cgroup data only when needed and 445 * when we know the cgroup is pinned (css_get) 446 */ 447 if (!is_cgroup_event(event)) 448 return; 449 450 cgrp = perf_cgroup_from_task(current); 451 /* 452 * Do not update time when cgroup is not active 453 */ 454 if (cgrp == event->cgrp) 455 __update_cgrp_time(event->cgrp); 456 } 457 458 static inline void 459 perf_cgroup_set_timestamp(struct task_struct *task, 460 struct perf_event_context *ctx) 461 { 462 struct perf_cgroup *cgrp; 463 struct perf_cgroup_info *info; 464 465 /* 466 * ctx->lock held by caller 467 * ensure we do not access cgroup data 468 * unless we have the cgroup pinned (css_get) 469 */ 470 if (!task || !ctx->nr_cgroups) 471 return; 472 473 cgrp = perf_cgroup_from_task(task); 474 info = this_cpu_ptr(cgrp->info); 475 
info->timestamp = ctx->timestamp; 476 } 477 478 #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ 479 #define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ 480 481 /* 482 * reschedule events based on the cgroup constraint of task. 483 * 484 * mode SWOUT : schedule out everything 485 * mode SWIN : schedule in based on cgroup for next 486 */ 487 void perf_cgroup_switch(struct task_struct *task, int mode) 488 { 489 struct perf_cpu_context *cpuctx; 490 struct pmu *pmu; 491 unsigned long flags; 492 493 /* 494 * disable interrupts to avoid geting nr_cgroup 495 * changes via __perf_event_disable(). Also 496 * avoids preemption. 497 */ 498 local_irq_save(flags); 499 500 /* 501 * we reschedule only in the presence of cgroup 502 * constrained events. 503 */ 504 rcu_read_lock(); 505 506 list_for_each_entry_rcu(pmu, &pmus, entry) { 507 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 508 if (cpuctx->unique_pmu != pmu) 509 continue; /* ensure we process each cpuctx once */ 510 511 /* 512 * perf_cgroup_events says at least one 513 * context on this CPU has cgroup events. 514 * 515 * ctx->nr_cgroups reports the number of cgroup 516 * events for a context. 517 */ 518 if (cpuctx->ctx.nr_cgroups > 0) { 519 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 520 perf_pmu_disable(cpuctx->ctx.pmu); 521 522 if (mode & PERF_CGROUP_SWOUT) { 523 cpu_ctx_sched_out(cpuctx, EVENT_ALL); 524 /* 525 * must not be done before ctxswout due 526 * to event_filter_match() in event_sched_out() 527 */ 528 cpuctx->cgrp = NULL; 529 } 530 531 if (mode & PERF_CGROUP_SWIN) { 532 WARN_ON_ONCE(cpuctx->cgrp); 533 /* 534 * set cgrp before ctxsw in to allow 535 * event_filter_match() to not have to pass 536 * task around 537 */ 538 cpuctx->cgrp = perf_cgroup_from_task(task); 539 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); 540 } 541 perf_pmu_enable(cpuctx->ctx.pmu); 542 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 543 } 544 } 545 546 rcu_read_unlock(); 547 548 local_irq_restore(flags); 549 } 550 551 static inline void perf_cgroup_sched_out(struct task_struct *task, 552 struct task_struct *next) 553 { 554 struct perf_cgroup *cgrp1; 555 struct perf_cgroup *cgrp2 = NULL; 556 557 /* 558 * we come here when we know perf_cgroup_events > 0 559 */ 560 cgrp1 = perf_cgroup_from_task(task); 561 562 /* 563 * next is NULL when called from perf_event_enable_on_exec() 564 * that will systematically cause a cgroup_switch() 565 */ 566 if (next) 567 cgrp2 = perf_cgroup_from_task(next); 568 569 /* 570 * only schedule out current cgroup events if we know 571 * that we are switching to a different cgroup. Otherwise, 572 * do no touch the cgroup events. 573 */ 574 if (cgrp1 != cgrp2) 575 perf_cgroup_switch(task, PERF_CGROUP_SWOUT); 576 } 577 578 static inline void perf_cgroup_sched_in(struct task_struct *prev, 579 struct task_struct *task) 580 { 581 struct perf_cgroup *cgrp1; 582 struct perf_cgroup *cgrp2 = NULL; 583 584 /* 585 * we come here when we know perf_cgroup_events > 0 586 */ 587 cgrp1 = perf_cgroup_from_task(task); 588 589 /* prev can never be NULL */ 590 cgrp2 = perf_cgroup_from_task(prev); 591 592 /* 593 * only need to schedule in cgroup events if we are changing 594 * cgroup during ctxsw. Cgroup events were not scheduled 595 * out of ctxsw out if that was not the case. 
596 */ 597 if (cgrp1 != cgrp2) 598 perf_cgroup_switch(task, PERF_CGROUP_SWIN); 599 } 600 601 static inline int perf_cgroup_connect(int fd, struct perf_event *event, 602 struct perf_event_attr *attr, 603 struct perf_event *group_leader) 604 { 605 struct perf_cgroup *cgrp; 606 struct cgroup_subsys_state *css; 607 struct fd f = fdget(fd); 608 int ret = 0; 609 610 if (!f.file) 611 return -EBADF; 612 613 css = css_tryget_online_from_dir(f.file->f_dentry, 614 &perf_event_cgrp_subsys); 615 if (IS_ERR(css)) { 616 ret = PTR_ERR(css); 617 goto out; 618 } 619 620 cgrp = container_of(css, struct perf_cgroup, css); 621 event->cgrp = cgrp; 622 623 /* 624 * all events in a group must monitor 625 * the same cgroup because a task belongs 626 * to only one perf cgroup at a time 627 */ 628 if (group_leader && group_leader->cgrp != cgrp) { 629 perf_detach_cgroup(event); 630 ret = -EINVAL; 631 } 632 out: 633 fdput(f); 634 return ret; 635 } 636 637 static inline void 638 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) 639 { 640 struct perf_cgroup_info *t; 641 t = per_cpu_ptr(event->cgrp->info, event->cpu); 642 event->shadow_ctx_time = now - t->timestamp; 643 } 644 645 static inline void 646 perf_cgroup_defer_enabled(struct perf_event *event) 647 { 648 /* 649 * when the current task's perf cgroup does not match 650 * the event's, we need to remember to call the 651 * perf_mark_enable() function the first time a task with 652 * a matching perf cgroup is scheduled in. 653 */ 654 if (is_cgroup_event(event) && !perf_cgroup_match(event)) 655 event->cgrp_defer_enabled = 1; 656 } 657 658 static inline void 659 perf_cgroup_mark_enabled(struct perf_event *event, 660 struct perf_event_context *ctx) 661 { 662 struct perf_event *sub; 663 u64 tstamp = perf_event_time(event); 664 665 if (!event->cgrp_defer_enabled) 666 return; 667 668 event->cgrp_defer_enabled = 0; 669 670 event->tstamp_enabled = tstamp - event->total_time_enabled; 671 list_for_each_entry(sub, &event->sibling_list, group_entry) { 672 if (sub->state >= PERF_EVENT_STATE_INACTIVE) { 673 sub->tstamp_enabled = tstamp - sub->total_time_enabled; 674 sub->cgrp_defer_enabled = 0; 675 } 676 } 677 } 678 #else /* !CONFIG_CGROUP_PERF */ 679 680 static inline bool 681 perf_cgroup_match(struct perf_event *event) 682 { 683 return true; 684 } 685 686 static inline void perf_detach_cgroup(struct perf_event *event) 687 {} 688 689 static inline int is_cgroup_event(struct perf_event *event) 690 { 691 return 0; 692 } 693 694 static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) 695 { 696 return 0; 697 } 698 699 static inline void update_cgrp_time_from_event(struct perf_event *event) 700 { 701 } 702 703 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) 704 { 705 } 706 707 static inline void perf_cgroup_sched_out(struct task_struct *task, 708 struct task_struct *next) 709 { 710 } 711 712 static inline void perf_cgroup_sched_in(struct task_struct *prev, 713 struct task_struct *task) 714 { 715 } 716 717 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, 718 struct perf_event_attr *attr, 719 struct perf_event *group_leader) 720 { 721 return -EINVAL; 722 } 723 724 static inline void 725 perf_cgroup_set_timestamp(struct task_struct *task, 726 struct perf_event_context *ctx) 727 { 728 } 729 730 void 731 perf_cgroup_switch(struct task_struct *task, struct task_struct *next) 732 { 733 } 734 735 static inline void 736 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) 737 { 738 } 739 740 
static inline u64 perf_cgroup_event_time(struct perf_event *event) 741 { 742 return 0; 743 } 744 745 static inline void 746 perf_cgroup_defer_enabled(struct perf_event *event) 747 { 748 } 749 750 static inline void 751 perf_cgroup_mark_enabled(struct perf_event *event, 752 struct perf_event_context *ctx) 753 { 754 } 755 #endif 756 757 /* 758 * set default to be dependent on timer tick just 759 * like original code 760 */ 761 #define PERF_CPU_HRTIMER (1000 / HZ) 762 /* 763 * function must be called with interrupts disbled 764 */ 765 static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr) 766 { 767 struct perf_cpu_context *cpuctx; 768 enum hrtimer_restart ret = HRTIMER_NORESTART; 769 int rotations = 0; 770 771 WARN_ON(!irqs_disabled()); 772 773 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); 774 775 rotations = perf_rotate_context(cpuctx); 776 777 /* 778 * arm timer if needed 779 */ 780 if (rotations) { 781 hrtimer_forward_now(hr, cpuctx->hrtimer_interval); 782 ret = HRTIMER_RESTART; 783 } 784 785 return ret; 786 } 787 788 /* CPU is going down */ 789 void perf_cpu_hrtimer_cancel(int cpu) 790 { 791 struct perf_cpu_context *cpuctx; 792 struct pmu *pmu; 793 unsigned long flags; 794 795 if (WARN_ON(cpu != smp_processor_id())) 796 return; 797 798 local_irq_save(flags); 799 800 rcu_read_lock(); 801 802 list_for_each_entry_rcu(pmu, &pmus, entry) { 803 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 804 805 if (pmu->task_ctx_nr == perf_sw_context) 806 continue; 807 808 hrtimer_cancel(&cpuctx->hrtimer); 809 } 810 811 rcu_read_unlock(); 812 813 local_irq_restore(flags); 814 } 815 816 static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) 817 { 818 struct hrtimer *hr = &cpuctx->hrtimer; 819 struct pmu *pmu = cpuctx->ctx.pmu; 820 int timer; 821 822 /* no multiplexing needed for SW PMU */ 823 if (pmu->task_ctx_nr == perf_sw_context) 824 return; 825 826 /* 827 * check default is sane, if not set then force to 828 * default interval (1/tick) 829 */ 830 timer = pmu->hrtimer_interval_ms; 831 if (timer < 1) 832 timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; 833 834 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); 835 836 hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); 837 hr->function = perf_cpu_hrtimer_handler; 838 } 839 840 static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx) 841 { 842 struct hrtimer *hr = &cpuctx->hrtimer; 843 struct pmu *pmu = cpuctx->ctx.pmu; 844 845 /* not for SW PMU */ 846 if (pmu->task_ctx_nr == perf_sw_context) 847 return; 848 849 if (hrtimer_active(hr)) 850 return; 851 852 if (!hrtimer_callback_running(hr)) 853 __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval, 854 0, HRTIMER_MODE_REL_PINNED, 0); 855 } 856 857 void perf_pmu_disable(struct pmu *pmu) 858 { 859 int *count = this_cpu_ptr(pmu->pmu_disable_count); 860 if (!(*count)++) 861 pmu->pmu_disable(pmu); 862 } 863 864 void perf_pmu_enable(struct pmu *pmu) 865 { 866 int *count = this_cpu_ptr(pmu->pmu_disable_count); 867 if (!--(*count)) 868 pmu->pmu_enable(pmu); 869 } 870 871 static DEFINE_PER_CPU(struct list_head, rotation_list); 872 873 /* 874 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized 875 * because they're strictly cpu affine and rotate_start is called with IRQs 876 * disabled, while rotate_context is called from IRQ context. 
877 */ 878 static void perf_pmu_rotate_start(struct pmu *pmu) 879 { 880 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 881 struct list_head *head = &__get_cpu_var(rotation_list); 882 883 WARN_ON(!irqs_disabled()); 884 885 if (list_empty(&cpuctx->rotation_list)) 886 list_add(&cpuctx->rotation_list, head); 887 } 888 889 static void get_ctx(struct perf_event_context *ctx) 890 { 891 WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); 892 } 893 894 static void put_ctx(struct perf_event_context *ctx) 895 { 896 if (atomic_dec_and_test(&ctx->refcount)) { 897 if (ctx->parent_ctx) 898 put_ctx(ctx->parent_ctx); 899 if (ctx->task) 900 put_task_struct(ctx->task); 901 kfree_rcu(ctx, rcu_head); 902 } 903 } 904 905 static void unclone_ctx(struct perf_event_context *ctx) 906 { 907 if (ctx->parent_ctx) { 908 put_ctx(ctx->parent_ctx); 909 ctx->parent_ctx = NULL; 910 } 911 ctx->generation++; 912 } 913 914 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) 915 { 916 /* 917 * only top level events have the pid namespace they were created in 918 */ 919 if (event->parent) 920 event = event->parent; 921 922 return task_tgid_nr_ns(p, event->ns); 923 } 924 925 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) 926 { 927 /* 928 * only top level events have the pid namespace they were created in 929 */ 930 if (event->parent) 931 event = event->parent; 932 933 return task_pid_nr_ns(p, event->ns); 934 } 935 936 /* 937 * If we inherit events we want to return the parent event id 938 * to userspace. 939 */ 940 static u64 primary_event_id(struct perf_event *event) 941 { 942 u64 id = event->id; 943 944 if (event->parent) 945 id = event->parent->id; 946 947 return id; 948 } 949 950 /* 951 * Get the perf_event_context for a task and lock it. 952 * This has to cope with with the fact that until it is locked, 953 * the context could get moved to another task. 954 */ 955 static struct perf_event_context * 956 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) 957 { 958 struct perf_event_context *ctx; 959 960 retry: 961 /* 962 * One of the few rules of preemptible RCU is that one cannot do 963 * rcu_read_unlock() while holding a scheduler (or nested) lock when 964 * part of the read side critical section was preemptible -- see 965 * rcu_read_unlock_special(). 966 * 967 * Since ctx->lock nests under rq->lock we must ensure the entire read 968 * side critical section is non-preemptible. 969 */ 970 preempt_disable(); 971 rcu_read_lock(); 972 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); 973 if (ctx) { 974 /* 975 * If this context is a clone of another, it might 976 * get swapped for another underneath us by 977 * perf_event_task_sched_out, though the 978 * rcu_read_lock() protects us from any context 979 * getting freed. Lock the context and check if it 980 * got swapped before we could get the lock, and retry 981 * if so. If we locked the right context, then it 982 * can't get swapped on us any more. 
983 */ 984 raw_spin_lock_irqsave(&ctx->lock, *flags); 985 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { 986 raw_spin_unlock_irqrestore(&ctx->lock, *flags); 987 rcu_read_unlock(); 988 preempt_enable(); 989 goto retry; 990 } 991 992 if (!atomic_inc_not_zero(&ctx->refcount)) { 993 raw_spin_unlock_irqrestore(&ctx->lock, *flags); 994 ctx = NULL; 995 } 996 } 997 rcu_read_unlock(); 998 preempt_enable(); 999 return ctx; 1000 } 1001 1002 /* 1003 * Get the context for a task and increment its pin_count so it 1004 * can't get swapped to another task. This also increments its 1005 * reference count so that the context can't get freed. 1006 */ 1007 static struct perf_event_context * 1008 perf_pin_task_context(struct task_struct *task, int ctxn) 1009 { 1010 struct perf_event_context *ctx; 1011 unsigned long flags; 1012 1013 ctx = perf_lock_task_context(task, ctxn, &flags); 1014 if (ctx) { 1015 ++ctx->pin_count; 1016 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1017 } 1018 return ctx; 1019 } 1020 1021 static void perf_unpin_context(struct perf_event_context *ctx) 1022 { 1023 unsigned long flags; 1024 1025 raw_spin_lock_irqsave(&ctx->lock, flags); 1026 --ctx->pin_count; 1027 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1028 } 1029 1030 /* 1031 * Update the record of the current time in a context. 1032 */ 1033 static void update_context_time(struct perf_event_context *ctx) 1034 { 1035 u64 now = perf_clock(); 1036 1037 ctx->time += now - ctx->timestamp; 1038 ctx->timestamp = now; 1039 } 1040 1041 static u64 perf_event_time(struct perf_event *event) 1042 { 1043 struct perf_event_context *ctx = event->ctx; 1044 1045 if (is_cgroup_event(event)) 1046 return perf_cgroup_event_time(event); 1047 1048 return ctx ? ctx->time : 0; 1049 } 1050 1051 /* 1052 * Update the total_time_enabled and total_time_running fields for a event. 1053 * The caller of this function needs to hold the ctx->lock. 1054 */ 1055 static void update_event_times(struct perf_event *event) 1056 { 1057 struct perf_event_context *ctx = event->ctx; 1058 u64 run_end; 1059 1060 if (event->state < PERF_EVENT_STATE_INACTIVE || 1061 event->group_leader->state < PERF_EVENT_STATE_INACTIVE) 1062 return; 1063 /* 1064 * in cgroup mode, time_enabled represents 1065 * the time the event was enabled AND active 1066 * tasks were in the monitored cgroup. This is 1067 * independent of the activity of the context as 1068 * there may be a mix of cgroup and non-cgroup events. 1069 * 1070 * That is why we treat cgroup events differently 1071 * here. 1072 */ 1073 if (is_cgroup_event(event)) 1074 run_end = perf_cgroup_event_time(event); 1075 else if (ctx->is_active) 1076 run_end = ctx->time; 1077 else 1078 run_end = event->tstamp_stopped; 1079 1080 event->total_time_enabled = run_end - event->tstamp_enabled; 1081 1082 if (event->state == PERF_EVENT_STATE_INACTIVE) 1083 run_end = event->tstamp_stopped; 1084 else 1085 run_end = perf_event_time(event); 1086 1087 event->total_time_running = run_end - event->tstamp_running; 1088 1089 } 1090 1091 /* 1092 * Update total_time_enabled and total_time_running for all events in a group. 
1093 */ 1094 static void update_group_times(struct perf_event *leader) 1095 { 1096 struct perf_event *event; 1097 1098 update_event_times(leader); 1099 list_for_each_entry(event, &leader->sibling_list, group_entry) 1100 update_event_times(event); 1101 } 1102 1103 static struct list_head * 1104 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) 1105 { 1106 if (event->attr.pinned) 1107 return &ctx->pinned_groups; 1108 else 1109 return &ctx->flexible_groups; 1110 } 1111 1112 /* 1113 * Add a event from the lists for its context. 1114 * Must be called with ctx->mutex and ctx->lock held. 1115 */ 1116 static void 1117 list_add_event(struct perf_event *event, struct perf_event_context *ctx) 1118 { 1119 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); 1120 event->attach_state |= PERF_ATTACH_CONTEXT; 1121 1122 /* 1123 * If we're a stand alone event or group leader, we go to the context 1124 * list, group events are kept attached to the group so that 1125 * perf_group_detach can, at all times, locate all siblings. 1126 */ 1127 if (event->group_leader == event) { 1128 struct list_head *list; 1129 1130 if (is_software_event(event)) 1131 event->group_flags |= PERF_GROUP_SOFTWARE; 1132 1133 list = ctx_group_list(event, ctx); 1134 list_add_tail(&event->group_entry, list); 1135 } 1136 1137 if (is_cgroup_event(event)) 1138 ctx->nr_cgroups++; 1139 1140 if (has_branch_stack(event)) 1141 ctx->nr_branch_stack++; 1142 1143 list_add_rcu(&event->event_entry, &ctx->event_list); 1144 if (!ctx->nr_events) 1145 perf_pmu_rotate_start(ctx->pmu); 1146 ctx->nr_events++; 1147 if (event->attr.inherit_stat) 1148 ctx->nr_stat++; 1149 1150 ctx->generation++; 1151 } 1152 1153 /* 1154 * Initialize event state based on the perf_event_attr::disabled. 1155 */ 1156 static inline void perf_event__state_init(struct perf_event *event) 1157 { 1158 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF : 1159 PERF_EVENT_STATE_INACTIVE; 1160 } 1161 1162 /* 1163 * Called at perf_event creation and when events are attached/detached from a 1164 * group. 
1165 */ 1166 static void perf_event__read_size(struct perf_event *event) 1167 { 1168 int entry = sizeof(u64); /* value */ 1169 int size = 0; 1170 int nr = 1; 1171 1172 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 1173 size += sizeof(u64); 1174 1175 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 1176 size += sizeof(u64); 1177 1178 if (event->attr.read_format & PERF_FORMAT_ID) 1179 entry += sizeof(u64); 1180 1181 if (event->attr.read_format & PERF_FORMAT_GROUP) { 1182 nr += event->group_leader->nr_siblings; 1183 size += sizeof(u64); 1184 } 1185 1186 size += entry * nr; 1187 event->read_size = size; 1188 } 1189 1190 static void perf_event__header_size(struct perf_event *event) 1191 { 1192 struct perf_sample_data *data; 1193 u64 sample_type = event->attr.sample_type; 1194 u16 size = 0; 1195 1196 perf_event__read_size(event); 1197 1198 if (sample_type & PERF_SAMPLE_IP) 1199 size += sizeof(data->ip); 1200 1201 if (sample_type & PERF_SAMPLE_ADDR) 1202 size += sizeof(data->addr); 1203 1204 if (sample_type & PERF_SAMPLE_PERIOD) 1205 size += sizeof(data->period); 1206 1207 if (sample_type & PERF_SAMPLE_WEIGHT) 1208 size += sizeof(data->weight); 1209 1210 if (sample_type & PERF_SAMPLE_READ) 1211 size += event->read_size; 1212 1213 if (sample_type & PERF_SAMPLE_DATA_SRC) 1214 size += sizeof(data->data_src.val); 1215 1216 if (sample_type & PERF_SAMPLE_TRANSACTION) 1217 size += sizeof(data->txn); 1218 1219 event->header_size = size; 1220 } 1221 1222 static void perf_event__id_header_size(struct perf_event *event) 1223 { 1224 struct perf_sample_data *data; 1225 u64 sample_type = event->attr.sample_type; 1226 u16 size = 0; 1227 1228 if (sample_type & PERF_SAMPLE_TID) 1229 size += sizeof(data->tid_entry); 1230 1231 if (sample_type & PERF_SAMPLE_TIME) 1232 size += sizeof(data->time); 1233 1234 if (sample_type & PERF_SAMPLE_IDENTIFIER) 1235 size += sizeof(data->id); 1236 1237 if (sample_type & PERF_SAMPLE_ID) 1238 size += sizeof(data->id); 1239 1240 if (sample_type & PERF_SAMPLE_STREAM_ID) 1241 size += sizeof(data->stream_id); 1242 1243 if (sample_type & PERF_SAMPLE_CPU) 1244 size += sizeof(data->cpu_entry); 1245 1246 event->id_header_size = size; 1247 } 1248 1249 static void perf_group_attach(struct perf_event *event) 1250 { 1251 struct perf_event *group_leader = event->group_leader, *pos; 1252 1253 /* 1254 * We can have double attach due to group movement in perf_event_open. 1255 */ 1256 if (event->attach_state & PERF_ATTACH_GROUP) 1257 return; 1258 1259 event->attach_state |= PERF_ATTACH_GROUP; 1260 1261 if (group_leader == event) 1262 return; 1263 1264 if (group_leader->group_flags & PERF_GROUP_SOFTWARE && 1265 !is_software_event(event)) 1266 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; 1267 1268 list_add_tail(&event->group_entry, &group_leader->sibling_list); 1269 group_leader->nr_siblings++; 1270 1271 perf_event__header_size(group_leader); 1272 1273 list_for_each_entry(pos, &group_leader->sibling_list, group_entry) 1274 perf_event__header_size(pos); 1275 } 1276 1277 /* 1278 * Remove a event from the lists for its context. 1279 * Must be called with ctx->mutex and ctx->lock held. 1280 */ 1281 static void 1282 list_del_event(struct perf_event *event, struct perf_event_context *ctx) 1283 { 1284 struct perf_cpu_context *cpuctx; 1285 /* 1286 * We can have double detach due to exit/hot-unplug + close. 
1287 */ 1288 if (!(event->attach_state & PERF_ATTACH_CONTEXT)) 1289 return; 1290 1291 event->attach_state &= ~PERF_ATTACH_CONTEXT; 1292 1293 if (is_cgroup_event(event)) { 1294 ctx->nr_cgroups--; 1295 cpuctx = __get_cpu_context(ctx); 1296 /* 1297 * if there are no more cgroup events 1298 * then cler cgrp to avoid stale pointer 1299 * in update_cgrp_time_from_cpuctx() 1300 */ 1301 if (!ctx->nr_cgroups) 1302 cpuctx->cgrp = NULL; 1303 } 1304 1305 if (has_branch_stack(event)) 1306 ctx->nr_branch_stack--; 1307 1308 ctx->nr_events--; 1309 if (event->attr.inherit_stat) 1310 ctx->nr_stat--; 1311 1312 list_del_rcu(&event->event_entry); 1313 1314 if (event->group_leader == event) 1315 list_del_init(&event->group_entry); 1316 1317 update_group_times(event); 1318 1319 /* 1320 * If event was in error state, then keep it 1321 * that way, otherwise bogus counts will be 1322 * returned on read(). The only way to get out 1323 * of error state is by explicit re-enabling 1324 * of the event 1325 */ 1326 if (event->state > PERF_EVENT_STATE_OFF) 1327 event->state = PERF_EVENT_STATE_OFF; 1328 1329 ctx->generation++; 1330 } 1331 1332 static void perf_group_detach(struct perf_event *event) 1333 { 1334 struct perf_event *sibling, *tmp; 1335 struct list_head *list = NULL; 1336 1337 /* 1338 * We can have double detach due to exit/hot-unplug + close. 1339 */ 1340 if (!(event->attach_state & PERF_ATTACH_GROUP)) 1341 return; 1342 1343 event->attach_state &= ~PERF_ATTACH_GROUP; 1344 1345 /* 1346 * If this is a sibling, remove it from its group. 1347 */ 1348 if (event->group_leader != event) { 1349 list_del_init(&event->group_entry); 1350 event->group_leader->nr_siblings--; 1351 goto out; 1352 } 1353 1354 if (!list_empty(&event->group_entry)) 1355 list = &event->group_entry; 1356 1357 /* 1358 * If this was a group event with sibling events then 1359 * upgrade the siblings to singleton events by adding them 1360 * to whatever list we are on. 
1361 */ 1362 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { 1363 if (list) 1364 list_move_tail(&sibling->group_entry, list); 1365 sibling->group_leader = sibling; 1366 1367 /* Inherit group flags from the previous leader */ 1368 sibling->group_flags = event->group_flags; 1369 } 1370 1371 out: 1372 perf_event__header_size(event->group_leader); 1373 1374 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) 1375 perf_event__header_size(tmp); 1376 } 1377 1378 static inline int 1379 event_filter_match(struct perf_event *event) 1380 { 1381 return (event->cpu == -1 || event->cpu == smp_processor_id()) 1382 && perf_cgroup_match(event); 1383 } 1384 1385 static void 1386 event_sched_out(struct perf_event *event, 1387 struct perf_cpu_context *cpuctx, 1388 struct perf_event_context *ctx) 1389 { 1390 u64 tstamp = perf_event_time(event); 1391 u64 delta; 1392 /* 1393 * An event which could not be activated because of 1394 * filter mismatch still needs to have its timings 1395 * maintained, otherwise bogus information is return 1396 * via read() for time_enabled, time_running: 1397 */ 1398 if (event->state == PERF_EVENT_STATE_INACTIVE 1399 && !event_filter_match(event)) { 1400 delta = tstamp - event->tstamp_stopped; 1401 event->tstamp_running += delta; 1402 event->tstamp_stopped = tstamp; 1403 } 1404 1405 if (event->state != PERF_EVENT_STATE_ACTIVE) 1406 return; 1407 1408 perf_pmu_disable(event->pmu); 1409 1410 event->state = PERF_EVENT_STATE_INACTIVE; 1411 if (event->pending_disable) { 1412 event->pending_disable = 0; 1413 event->state = PERF_EVENT_STATE_OFF; 1414 } 1415 event->tstamp_stopped = tstamp; 1416 event->pmu->del(event, 0); 1417 event->oncpu = -1; 1418 1419 if (!is_software_event(event)) 1420 cpuctx->active_oncpu--; 1421 ctx->nr_active--; 1422 if (event->attr.freq && event->attr.sample_freq) 1423 ctx->nr_freq--; 1424 if (event->attr.exclusive || !cpuctx->active_oncpu) 1425 cpuctx->exclusive = 0; 1426 1427 perf_pmu_enable(event->pmu); 1428 } 1429 1430 static void 1431 group_sched_out(struct perf_event *group_event, 1432 struct perf_cpu_context *cpuctx, 1433 struct perf_event_context *ctx) 1434 { 1435 struct perf_event *event; 1436 int state = group_event->state; 1437 1438 event_sched_out(group_event, cpuctx, ctx); 1439 1440 /* 1441 * Schedule out siblings (if any): 1442 */ 1443 list_for_each_entry(event, &group_event->sibling_list, group_entry) 1444 event_sched_out(event, cpuctx, ctx); 1445 1446 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) 1447 cpuctx->exclusive = 0; 1448 } 1449 1450 struct remove_event { 1451 struct perf_event *event; 1452 bool detach_group; 1453 }; 1454 1455 /* 1456 * Cross CPU call to remove a performance event 1457 * 1458 * We disable the event on the hardware level first. After that we 1459 * remove it from the context list. 
1460 */ 1461 static int __perf_remove_from_context(void *info) 1462 { 1463 struct remove_event *re = info; 1464 struct perf_event *event = re->event; 1465 struct perf_event_context *ctx = event->ctx; 1466 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1467 1468 raw_spin_lock(&ctx->lock); 1469 event_sched_out(event, cpuctx, ctx); 1470 if (re->detach_group) 1471 perf_group_detach(event); 1472 list_del_event(event, ctx); 1473 if (!ctx->nr_events && cpuctx->task_ctx == ctx) { 1474 ctx->is_active = 0; 1475 cpuctx->task_ctx = NULL; 1476 } 1477 raw_spin_unlock(&ctx->lock); 1478 1479 return 0; 1480 } 1481 1482 1483 /* 1484 * Remove the event from a task's (or a CPU's) list of events. 1485 * 1486 * CPU events are removed with a smp call. For task events we only 1487 * call when the task is on a CPU. 1488 * 1489 * If event->ctx is a cloned context, callers must make sure that 1490 * every task struct that event->ctx->task could possibly point to 1491 * remains valid. This is OK when called from perf_release since 1492 * that only calls us on the top-level context, which can't be a clone. 1493 * When called from perf_event_exit_task, it's OK because the 1494 * context has been detached from its task. 1495 */ 1496 static void perf_remove_from_context(struct perf_event *event, bool detach_group) 1497 { 1498 struct perf_event_context *ctx = event->ctx; 1499 struct task_struct *task = ctx->task; 1500 struct remove_event re = { 1501 .event = event, 1502 .detach_group = detach_group, 1503 }; 1504 1505 lockdep_assert_held(&ctx->mutex); 1506 1507 if (!task) { 1508 /* 1509 * Per cpu events are removed via an smp call and 1510 * the removal is always successful. 1511 */ 1512 cpu_function_call(event->cpu, __perf_remove_from_context, &re); 1513 return; 1514 } 1515 1516 retry: 1517 if (!task_function_call(task, __perf_remove_from_context, &re)) 1518 return; 1519 1520 raw_spin_lock_irq(&ctx->lock); 1521 /* 1522 * If we failed to find a running task, but find the context active now 1523 * that we've acquired the ctx->lock, retry. 1524 */ 1525 if (ctx->is_active) { 1526 raw_spin_unlock_irq(&ctx->lock); 1527 goto retry; 1528 } 1529 1530 /* 1531 * Since the task isn't running, its safe to remove the event, us 1532 * holding the ctx->lock ensures the task won't get scheduled in. 1533 */ 1534 if (detach_group) 1535 perf_group_detach(event); 1536 list_del_event(event, ctx); 1537 raw_spin_unlock_irq(&ctx->lock); 1538 } 1539 1540 /* 1541 * Cross CPU call to disable a performance event 1542 */ 1543 int __perf_event_disable(void *info) 1544 { 1545 struct perf_event *event = info; 1546 struct perf_event_context *ctx = event->ctx; 1547 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1548 1549 /* 1550 * If this is a per-task event, need to check whether this 1551 * event's task is the current task on this cpu. 1552 * 1553 * Can trigger due to concurrent perf_event_context_sched_out() 1554 * flipping contexts around. 1555 */ 1556 if (ctx->task && cpuctx->task_ctx != ctx) 1557 return -EINVAL; 1558 1559 raw_spin_lock(&ctx->lock); 1560 1561 /* 1562 * If the event is on, turn it off. 1563 * If it is in error state, leave it in error state. 
1564 */ 1565 if (event->state >= PERF_EVENT_STATE_INACTIVE) { 1566 update_context_time(ctx); 1567 update_cgrp_time_from_event(event); 1568 update_group_times(event); 1569 if (event == event->group_leader) 1570 group_sched_out(event, cpuctx, ctx); 1571 else 1572 event_sched_out(event, cpuctx, ctx); 1573 event->state = PERF_EVENT_STATE_OFF; 1574 } 1575 1576 raw_spin_unlock(&ctx->lock); 1577 1578 return 0; 1579 } 1580 1581 /* 1582 * Disable a event. 1583 * 1584 * If event->ctx is a cloned context, callers must make sure that 1585 * every task struct that event->ctx->task could possibly point to 1586 * remains valid. This condition is satisifed when called through 1587 * perf_event_for_each_child or perf_event_for_each because they 1588 * hold the top-level event's child_mutex, so any descendant that 1589 * goes to exit will block in sync_child_event. 1590 * When called from perf_pending_event it's OK because event->ctx 1591 * is the current context on this CPU and preemption is disabled, 1592 * hence we can't get into perf_event_task_sched_out for this context. 1593 */ 1594 void perf_event_disable(struct perf_event *event) 1595 { 1596 struct perf_event_context *ctx = event->ctx; 1597 struct task_struct *task = ctx->task; 1598 1599 if (!task) { 1600 /* 1601 * Disable the event on the cpu that it's on 1602 */ 1603 cpu_function_call(event->cpu, __perf_event_disable, event); 1604 return; 1605 } 1606 1607 retry: 1608 if (!task_function_call(task, __perf_event_disable, event)) 1609 return; 1610 1611 raw_spin_lock_irq(&ctx->lock); 1612 /* 1613 * If the event is still active, we need to retry the cross-call. 1614 */ 1615 if (event->state == PERF_EVENT_STATE_ACTIVE) { 1616 raw_spin_unlock_irq(&ctx->lock); 1617 /* 1618 * Reload the task pointer, it might have been changed by 1619 * a concurrent perf_event_context_sched_out(). 1620 */ 1621 task = ctx->task; 1622 goto retry; 1623 } 1624 1625 /* 1626 * Since we have the lock this context can't be scheduled 1627 * in, so we can change the state safely. 1628 */ 1629 if (event->state == PERF_EVENT_STATE_INACTIVE) { 1630 update_group_times(event); 1631 event->state = PERF_EVENT_STATE_OFF; 1632 } 1633 raw_spin_unlock_irq(&ctx->lock); 1634 } 1635 EXPORT_SYMBOL_GPL(perf_event_disable); 1636 1637 static void perf_set_shadow_time(struct perf_event *event, 1638 struct perf_event_context *ctx, 1639 u64 tstamp) 1640 { 1641 /* 1642 * use the correct time source for the time snapshot 1643 * 1644 * We could get by without this by leveraging the 1645 * fact that to get to this function, the caller 1646 * has most likely already called update_context_time() 1647 * and update_cgrp_time_xx() and thus both timestamp 1648 * are identical (or very close). Given that tstamp is, 1649 * already adjusted for cgroup, we could say that: 1650 * tstamp - ctx->timestamp 1651 * is equivalent to 1652 * tstamp - cgrp->timestamp. 1653 * 1654 * Then, in perf_output_read(), the calculation would 1655 * work with no changes because: 1656 * - event is guaranteed scheduled in 1657 * - no scheduled out in between 1658 * - thus the timestamp would be the same 1659 * 1660 * But this is a bit hairy. 1661 * 1662 * So instead, we have an explicit cgroup call to remain 1663 * within the time time source all along. We believe it 1664 * is cleaner and simpler to understand. 
1665 */ 1666 if (is_cgroup_event(event)) 1667 perf_cgroup_set_shadow_time(event, tstamp); 1668 else 1669 event->shadow_ctx_time = tstamp - ctx->timestamp; 1670 } 1671 1672 #define MAX_INTERRUPTS (~0ULL) 1673 1674 static void perf_log_throttle(struct perf_event *event, int enable); 1675 1676 static int 1677 event_sched_in(struct perf_event *event, 1678 struct perf_cpu_context *cpuctx, 1679 struct perf_event_context *ctx) 1680 { 1681 u64 tstamp = perf_event_time(event); 1682 int ret = 0; 1683 1684 lockdep_assert_held(&ctx->lock); 1685 1686 if (event->state <= PERF_EVENT_STATE_OFF) 1687 return 0; 1688 1689 event->state = PERF_EVENT_STATE_ACTIVE; 1690 event->oncpu = smp_processor_id(); 1691 1692 /* 1693 * Unthrottle events, since we scheduled we might have missed several 1694 * ticks already, also for a heavily scheduling task there is little 1695 * guarantee it'll get a tick in a timely manner. 1696 */ 1697 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { 1698 perf_log_throttle(event, 1); 1699 event->hw.interrupts = 0; 1700 } 1701 1702 /* 1703 * The new state must be visible before we turn it on in the hardware: 1704 */ 1705 smp_wmb(); 1706 1707 perf_pmu_disable(event->pmu); 1708 1709 if (event->pmu->add(event, PERF_EF_START)) { 1710 event->state = PERF_EVENT_STATE_INACTIVE; 1711 event->oncpu = -1; 1712 ret = -EAGAIN; 1713 goto out; 1714 } 1715 1716 event->tstamp_running += tstamp - event->tstamp_stopped; 1717 1718 perf_set_shadow_time(event, ctx, tstamp); 1719 1720 if (!is_software_event(event)) 1721 cpuctx->active_oncpu++; 1722 ctx->nr_active++; 1723 if (event->attr.freq && event->attr.sample_freq) 1724 ctx->nr_freq++; 1725 1726 if (event->attr.exclusive) 1727 cpuctx->exclusive = 1; 1728 1729 out: 1730 perf_pmu_enable(event->pmu); 1731 1732 return ret; 1733 } 1734 1735 static int 1736 group_sched_in(struct perf_event *group_event, 1737 struct perf_cpu_context *cpuctx, 1738 struct perf_event_context *ctx) 1739 { 1740 struct perf_event *event, *partial_group = NULL; 1741 struct pmu *pmu = ctx->pmu; 1742 u64 now = ctx->time; 1743 bool simulate = false; 1744 1745 if (group_event->state == PERF_EVENT_STATE_OFF) 1746 return 0; 1747 1748 pmu->start_txn(pmu); 1749 1750 if (event_sched_in(group_event, cpuctx, ctx)) { 1751 pmu->cancel_txn(pmu); 1752 perf_cpu_hrtimer_restart(cpuctx); 1753 return -EAGAIN; 1754 } 1755 1756 /* 1757 * Schedule in siblings as one group (if any): 1758 */ 1759 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 1760 if (event_sched_in(event, cpuctx, ctx)) { 1761 partial_group = event; 1762 goto group_error; 1763 } 1764 } 1765 1766 if (!pmu->commit_txn(pmu)) 1767 return 0; 1768 1769 group_error: 1770 /* 1771 * Groups can be scheduled in as one unit only, so undo any 1772 * partial group before returning: 1773 * The events up to the failed event are scheduled out normally, 1774 * tstamp_stopped will be updated. 1775 * 1776 * The failed events and the remaining siblings need to have 1777 * their timings updated as if they had gone thru event_sched_in() 1778 * and event_sched_out(). This is required to get consistent timings 1779 * across the group. This also takes care of the case where the group 1780 * could never be scheduled by ensuring tstamp_stopped is set to mark 1781 * the time the event was actually stopped, such that time delta 1782 * calculation in update_event_times() is correct. 
1783 */ 1784 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 1785 if (event == partial_group) 1786 simulate = true; 1787 1788 if (simulate) { 1789 event->tstamp_running += now - event->tstamp_stopped; 1790 event->tstamp_stopped = now; 1791 } else { 1792 event_sched_out(event, cpuctx, ctx); 1793 } 1794 } 1795 event_sched_out(group_event, cpuctx, ctx); 1796 1797 pmu->cancel_txn(pmu); 1798 1799 perf_cpu_hrtimer_restart(cpuctx); 1800 1801 return -EAGAIN; 1802 } 1803 1804 /* 1805 * Work out whether we can put this event group on the CPU now. 1806 */ 1807 static int group_can_go_on(struct perf_event *event, 1808 struct perf_cpu_context *cpuctx, 1809 int can_add_hw) 1810 { 1811 /* 1812 * Groups consisting entirely of software events can always go on. 1813 */ 1814 if (event->group_flags & PERF_GROUP_SOFTWARE) 1815 return 1; 1816 /* 1817 * If an exclusive group is already on, no other hardware 1818 * events can go on. 1819 */ 1820 if (cpuctx->exclusive) 1821 return 0; 1822 /* 1823 * If this group is exclusive and there are already 1824 * events on the CPU, it can't go on. 1825 */ 1826 if (event->attr.exclusive && cpuctx->active_oncpu) 1827 return 0; 1828 /* 1829 * Otherwise, try to add it if all previous groups were able 1830 * to go on. 1831 */ 1832 return can_add_hw; 1833 } 1834 1835 static void add_event_to_ctx(struct perf_event *event, 1836 struct perf_event_context *ctx) 1837 { 1838 u64 tstamp = perf_event_time(event); 1839 1840 list_add_event(event, ctx); 1841 perf_group_attach(event); 1842 event->tstamp_enabled = tstamp; 1843 event->tstamp_running = tstamp; 1844 event->tstamp_stopped = tstamp; 1845 } 1846 1847 static void task_ctx_sched_out(struct perf_event_context *ctx); 1848 static void 1849 ctx_sched_in(struct perf_event_context *ctx, 1850 struct perf_cpu_context *cpuctx, 1851 enum event_type_t event_type, 1852 struct task_struct *task); 1853 1854 static void perf_event_sched_in(struct perf_cpu_context *cpuctx, 1855 struct perf_event_context *ctx, 1856 struct task_struct *task) 1857 { 1858 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); 1859 if (ctx) 1860 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); 1861 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); 1862 if (ctx) 1863 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); 1864 } 1865 1866 /* 1867 * Cross CPU call to install and enable a performance event 1868 * 1869 * Must be called with ctx->mutex held 1870 */ 1871 static int __perf_install_in_context(void *info) 1872 { 1873 struct perf_event *event = info; 1874 struct perf_event_context *ctx = event->ctx; 1875 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1876 struct perf_event_context *task_ctx = cpuctx->task_ctx; 1877 struct task_struct *task = current; 1878 1879 perf_ctx_lock(cpuctx, task_ctx); 1880 perf_pmu_disable(cpuctx->ctx.pmu); 1881 1882 /* 1883 * If there was an active task_ctx schedule it out. 1884 */ 1885 if (task_ctx) 1886 task_ctx_sched_out(task_ctx); 1887 1888 /* 1889 * If the context we're installing events in is not the 1890 * active task_ctx, flip them. 1891 */ 1892 if (ctx->task && task_ctx != ctx) { 1893 if (task_ctx) 1894 raw_spin_unlock(&task_ctx->lock); 1895 raw_spin_lock(&ctx->lock); 1896 task_ctx = ctx; 1897 } 1898 1899 if (task_ctx) { 1900 cpuctx->task_ctx = task_ctx; 1901 task = task_ctx->task; 1902 } 1903 1904 cpu_ctx_sched_out(cpuctx, EVENT_ALL); 1905 1906 update_context_time(ctx); 1907 /* 1908 * update cgrp time only if current cgrp 1909 * matches event->cgrp. 
Must be done before 1910 * calling add_event_to_ctx() 1911 */ 1912 update_cgrp_time_from_event(event); 1913 1914 add_event_to_ctx(event, ctx); 1915 1916 /* 1917 * Schedule everything back in 1918 */ 1919 perf_event_sched_in(cpuctx, task_ctx, task); 1920 1921 perf_pmu_enable(cpuctx->ctx.pmu); 1922 perf_ctx_unlock(cpuctx, task_ctx); 1923 1924 return 0; 1925 } 1926 1927 /* 1928 * Attach a performance event to a context 1929 * 1930 * First we add the event to the list with the hardware enable bit 1931 * in event->hw_config cleared. 1932 * 1933 * If the event is attached to a task which is on a CPU we use a smp 1934 * call to enable it in the task context. The task might have been 1935 * scheduled away, but we check this in the smp call again. 1936 */ 1937 static void 1938 perf_install_in_context(struct perf_event_context *ctx, 1939 struct perf_event *event, 1940 int cpu) 1941 { 1942 struct task_struct *task = ctx->task; 1943 1944 lockdep_assert_held(&ctx->mutex); 1945 1946 event->ctx = ctx; 1947 if (event->cpu != -1) 1948 event->cpu = cpu; 1949 1950 if (!task) { 1951 /* 1952 * Per cpu events are installed via an smp call and 1953 * the install is always successful. 1954 */ 1955 cpu_function_call(cpu, __perf_install_in_context, event); 1956 return; 1957 } 1958 1959 retry: 1960 if (!task_function_call(task, __perf_install_in_context, event)) 1961 return; 1962 1963 raw_spin_lock_irq(&ctx->lock); 1964 /* 1965 * If we failed to find a running task, but find the context active now 1966 * that we've acquired the ctx->lock, retry. 1967 */ 1968 if (ctx->is_active) { 1969 raw_spin_unlock_irq(&ctx->lock); 1970 goto retry; 1971 } 1972 1973 /* 1974 * Since the task isn't running, its safe to add the event, us holding 1975 * the ctx->lock ensures the task won't get scheduled in. 1976 */ 1977 add_event_to_ctx(event, ctx); 1978 raw_spin_unlock_irq(&ctx->lock); 1979 } 1980 1981 /* 1982 * Put a event into inactive state and update time fields. 1983 * Enabling the leader of a group effectively enables all 1984 * the group members that aren't explicitly disabled, so we 1985 * have to update their ->tstamp_enabled also. 1986 * Note: this works for group members as well as group leaders 1987 * since the non-leader members' sibling_lists will be empty. 1988 */ 1989 static void __perf_event_mark_enabled(struct perf_event *event) 1990 { 1991 struct perf_event *sub; 1992 u64 tstamp = perf_event_time(event); 1993 1994 event->state = PERF_EVENT_STATE_INACTIVE; 1995 event->tstamp_enabled = tstamp - event->total_time_enabled; 1996 list_for_each_entry(sub, &event->sibling_list, group_entry) { 1997 if (sub->state >= PERF_EVENT_STATE_INACTIVE) 1998 sub->tstamp_enabled = tstamp - sub->total_time_enabled; 1999 } 2000 } 2001 2002 /* 2003 * Cross CPU call to enable a performance event 2004 */ 2005 static int __perf_event_enable(void *info) 2006 { 2007 struct perf_event *event = info; 2008 struct perf_event_context *ctx = event->ctx; 2009 struct perf_event *leader = event->group_leader; 2010 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 2011 int err; 2012 2013 /* 2014 * There's a time window between 'ctx->is_active' check 2015 * in perf_event_enable function and this place having: 2016 * - IRQs on 2017 * - ctx->lock unlocked 2018 * 2019 * where the task could be killed and 'ctx' deactivated 2020 * by perf_event_exit_task. 
2021 */ 2022 if (!ctx->is_active) 2023 return -EINVAL; 2024 2025 raw_spin_lock(&ctx->lock); 2026 update_context_time(ctx); 2027 2028 if (event->state >= PERF_EVENT_STATE_INACTIVE) 2029 goto unlock; 2030 2031 /* 2032 * set current task's cgroup time reference point 2033 */ 2034 perf_cgroup_set_timestamp(current, ctx); 2035 2036 __perf_event_mark_enabled(event); 2037 2038 if (!event_filter_match(event)) { 2039 if (is_cgroup_event(event)) 2040 perf_cgroup_defer_enabled(event); 2041 goto unlock; 2042 } 2043 2044 /* 2045 * If the event is in a group and isn't the group leader, 2046 * then don't put it on unless the group is on. 2047 */ 2048 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) 2049 goto unlock; 2050 2051 if (!group_can_go_on(event, cpuctx, 1)) { 2052 err = -EEXIST; 2053 } else { 2054 if (event == leader) 2055 err = group_sched_in(event, cpuctx, ctx); 2056 else 2057 err = event_sched_in(event, cpuctx, ctx); 2058 } 2059 2060 if (err) { 2061 /* 2062 * If this event can't go on and it's part of a 2063 * group, then the whole group has to come off. 2064 */ 2065 if (leader != event) { 2066 group_sched_out(leader, cpuctx, ctx); 2067 perf_cpu_hrtimer_restart(cpuctx); 2068 } 2069 if (leader->attr.pinned) { 2070 update_group_times(leader); 2071 leader->state = PERF_EVENT_STATE_ERROR; 2072 } 2073 } 2074 2075 unlock: 2076 raw_spin_unlock(&ctx->lock); 2077 2078 return 0; 2079 } 2080 2081 /* 2082 * Enable a event. 2083 * 2084 * If event->ctx is a cloned context, callers must make sure that 2085 * every task struct that event->ctx->task could possibly point to 2086 * remains valid. This condition is satisfied when called through 2087 * perf_event_for_each_child or perf_event_for_each as described 2088 * for perf_event_disable. 2089 */ 2090 void perf_event_enable(struct perf_event *event) 2091 { 2092 struct perf_event_context *ctx = event->ctx; 2093 struct task_struct *task = ctx->task; 2094 2095 if (!task) { 2096 /* 2097 * Enable the event on the cpu that it's on 2098 */ 2099 cpu_function_call(event->cpu, __perf_event_enable, event); 2100 return; 2101 } 2102 2103 raw_spin_lock_irq(&ctx->lock); 2104 if (event->state >= PERF_EVENT_STATE_INACTIVE) 2105 goto out; 2106 2107 /* 2108 * If the event is in error state, clear that first. 2109 * That way, if we see the event in error state below, we 2110 * know that it has gone back into error state, as distinct 2111 * from the task having been scheduled away before the 2112 * cross-call arrived. 2113 */ 2114 if (event->state == PERF_EVENT_STATE_ERROR) 2115 event->state = PERF_EVENT_STATE_OFF; 2116 2117 retry: 2118 if (!ctx->is_active) { 2119 __perf_event_mark_enabled(event); 2120 goto out; 2121 } 2122 2123 raw_spin_unlock_irq(&ctx->lock); 2124 2125 if (!task_function_call(task, __perf_event_enable, event)) 2126 return; 2127 2128 raw_spin_lock_irq(&ctx->lock); 2129 2130 /* 2131 * If the context is active and the event is still off, 2132 * we need to retry the cross-call. 
2133 */ 2134 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { 2135 /* 2136 * task could have been flipped by a concurrent 2137 * perf_event_context_sched_out() 2138 */ 2139 task = ctx->task; 2140 goto retry; 2141 } 2142 2143 out: 2144 raw_spin_unlock_irq(&ctx->lock); 2145 } 2146 EXPORT_SYMBOL_GPL(perf_event_enable); 2147 2148 int perf_event_refresh(struct perf_event *event, int refresh) 2149 { 2150 /* 2151 * not supported on inherited events 2152 */ 2153 if (event->attr.inherit || !is_sampling_event(event)) 2154 return -EINVAL; 2155 2156 atomic_add(refresh, &event->event_limit); 2157 perf_event_enable(event); 2158 2159 return 0; 2160 } 2161 EXPORT_SYMBOL_GPL(perf_event_refresh); 2162 2163 static void ctx_sched_out(struct perf_event_context *ctx, 2164 struct perf_cpu_context *cpuctx, 2165 enum event_type_t event_type) 2166 { 2167 struct perf_event *event; 2168 int is_active = ctx->is_active; 2169 2170 ctx->is_active &= ~event_type; 2171 if (likely(!ctx->nr_events)) 2172 return; 2173 2174 update_context_time(ctx); 2175 update_cgrp_time_from_cpuctx(cpuctx); 2176 if (!ctx->nr_active) 2177 return; 2178 2179 perf_pmu_disable(ctx->pmu); 2180 if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) { 2181 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 2182 group_sched_out(event, cpuctx, ctx); 2183 } 2184 2185 if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) { 2186 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 2187 group_sched_out(event, cpuctx, ctx); 2188 } 2189 perf_pmu_enable(ctx->pmu); 2190 } 2191 2192 /* 2193 * Test whether two contexts are equivalent, i.e. whether they have both been 2194 * cloned from the same version of the same context. 2195 * 2196 * Equivalence is measured using a generation number in the context that is 2197 * incremented on each modification to it; see unclone_ctx(), list_add_event() 2198 * and list_del_event(). 2199 */ 2200 static int context_equiv(struct perf_event_context *ctx1, 2201 struct perf_event_context *ctx2) 2202 { 2203 /* Pinning disables the swap optimization */ 2204 if (ctx1->pin_count || ctx2->pin_count) 2205 return 0; 2206 2207 /* If ctx1 is the parent of ctx2 */ 2208 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) 2209 return 1; 2210 2211 /* If ctx2 is the parent of ctx1 */ 2212 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) 2213 return 1; 2214 2215 /* 2216 * If ctx1 and ctx2 have the same parent; we flatten the parent 2217 * hierarchy, see perf_event_init_context(). 2218 */ 2219 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && 2220 ctx1->parent_gen == ctx2->parent_gen) 2221 return 1; 2222 2223 /* Unmatched */ 2224 return 0; 2225 } 2226 2227 static void __perf_event_sync_stat(struct perf_event *event, 2228 struct perf_event *next_event) 2229 { 2230 u64 value; 2231 2232 if (!event->attr.inherit_stat) 2233 return; 2234 2235 /* 2236 * Update the event value, we cannot use perf_event_read() 2237 * because we're in the middle of a context switch and have IRQs 2238 * disabled, which upsets smp_call_function_single(), however 2239 * we know the event must be on the current CPU, therefore we 2240 * don't need to use it. 
2241 */ 2242 switch (event->state) { 2243 case PERF_EVENT_STATE_ACTIVE: 2244 event->pmu->read(event); 2245 /* fall-through */ 2246 2247 case PERF_EVENT_STATE_INACTIVE: 2248 update_event_times(event); 2249 break; 2250 2251 default: 2252 break; 2253 } 2254 2255 /* 2256 * In order to keep per-task stats reliable we need to flip the event 2257 * values when we flip the contexts. 2258 */ 2259 value = local64_read(&next_event->count); 2260 value = local64_xchg(&event->count, value); 2261 local64_set(&next_event->count, value); 2262 2263 swap(event->total_time_enabled, next_event->total_time_enabled); 2264 swap(event->total_time_running, next_event->total_time_running); 2265 2266 /* 2267 * Since we swizzled the values, update the user visible data too. 2268 */ 2269 perf_event_update_userpage(event); 2270 perf_event_update_userpage(next_event); 2271 } 2272 2273 static void perf_event_sync_stat(struct perf_event_context *ctx, 2274 struct perf_event_context *next_ctx) 2275 { 2276 struct perf_event *event, *next_event; 2277 2278 if (!ctx->nr_stat) 2279 return; 2280 2281 update_context_time(ctx); 2282 2283 event = list_first_entry(&ctx->event_list, 2284 struct perf_event, event_entry); 2285 2286 next_event = list_first_entry(&next_ctx->event_list, 2287 struct perf_event, event_entry); 2288 2289 while (&event->event_entry != &ctx->event_list && 2290 &next_event->event_entry != &next_ctx->event_list) { 2291 2292 __perf_event_sync_stat(event, next_event); 2293 2294 event = list_next_entry(event, event_entry); 2295 next_event = list_next_entry(next_event, event_entry); 2296 } 2297 } 2298 2299 static void perf_event_context_sched_out(struct task_struct *task, int ctxn, 2300 struct task_struct *next) 2301 { 2302 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; 2303 struct perf_event_context *next_ctx; 2304 struct perf_event_context *parent, *next_parent; 2305 struct perf_cpu_context *cpuctx; 2306 int do_switch = 1; 2307 2308 if (likely(!ctx)) 2309 return; 2310 2311 cpuctx = __get_cpu_context(ctx); 2312 if (!cpuctx->task_ctx) 2313 return; 2314 2315 rcu_read_lock(); 2316 next_ctx = next->perf_event_ctxp[ctxn]; 2317 if (!next_ctx) 2318 goto unlock; 2319 2320 parent = rcu_dereference(ctx->parent_ctx); 2321 next_parent = rcu_dereference(next_ctx->parent_ctx); 2322 2323 /* If neither context have a parent context; they cannot be clones. */ 2324 if (!parent || !next_parent) 2325 goto unlock; 2326 2327 if (next_parent == ctx || next_ctx == parent || next_parent == parent) { 2328 /* 2329 * Looks like the two contexts are clones, so we might be 2330 * able to optimize the context switch. We lock both 2331 * contexts and check that they are clones under the 2332 * lock (including re-checking that neither has been 2333 * uncloned in the meantime). It doesn't matter which 2334 * order we take the locks because no other cpu could 2335 * be trying to lock both of these tasks. 
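 *
 * The common case is a parent and child that both carry clones of the
 * same inherited events: rather than unscheduling every event of @task
 * and rescheduling every event of @next, we just swap the two equivalent
 * context pointers (and fix up ctx->task), which is much cheaper.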
2336 */ 2337 raw_spin_lock(&ctx->lock); 2338 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); 2339 if (context_equiv(ctx, next_ctx)) { 2340 /* 2341 * XXX do we need a memory barrier of sorts 2342 * wrt to rcu_dereference() of perf_event_ctxp 2343 */ 2344 task->perf_event_ctxp[ctxn] = next_ctx; 2345 next->perf_event_ctxp[ctxn] = ctx; 2346 ctx->task = next; 2347 next_ctx->task = task; 2348 do_switch = 0; 2349 2350 perf_event_sync_stat(ctx, next_ctx); 2351 } 2352 raw_spin_unlock(&next_ctx->lock); 2353 raw_spin_unlock(&ctx->lock); 2354 } 2355 unlock: 2356 rcu_read_unlock(); 2357 2358 if (do_switch) { 2359 raw_spin_lock(&ctx->lock); 2360 ctx_sched_out(ctx, cpuctx, EVENT_ALL); 2361 cpuctx->task_ctx = NULL; 2362 raw_spin_unlock(&ctx->lock); 2363 } 2364 } 2365 2366 #define for_each_task_context_nr(ctxn) \ 2367 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) 2368 2369 /* 2370 * Called from scheduler to remove the events of the current task, 2371 * with interrupts disabled. 2372 * 2373 * We stop each event and update the event value in event->count. 2374 * 2375 * This does not protect us against NMI, but disable() 2376 * sets the disabled bit in the control field of event _before_ 2377 * accessing the event control register. If a NMI hits, then it will 2378 * not restart the event. 2379 */ 2380 void __perf_event_task_sched_out(struct task_struct *task, 2381 struct task_struct *next) 2382 { 2383 int ctxn; 2384 2385 for_each_task_context_nr(ctxn) 2386 perf_event_context_sched_out(task, ctxn, next); 2387 2388 /* 2389 * if cgroup events exist on this CPU, then we need 2390 * to check if we have to switch out PMU state. 2391 * cgroup event are system-wide mode only 2392 */ 2393 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2394 perf_cgroup_sched_out(task, next); 2395 } 2396 2397 static void task_ctx_sched_out(struct perf_event_context *ctx) 2398 { 2399 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 2400 2401 if (!cpuctx->task_ctx) 2402 return; 2403 2404 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 2405 return; 2406 2407 ctx_sched_out(ctx, cpuctx, EVENT_ALL); 2408 cpuctx->task_ctx = NULL; 2409 } 2410 2411 /* 2412 * Called with IRQs disabled 2413 */ 2414 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, 2415 enum event_type_t event_type) 2416 { 2417 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); 2418 } 2419 2420 static void 2421 ctx_pinned_sched_in(struct perf_event_context *ctx, 2422 struct perf_cpu_context *cpuctx) 2423 { 2424 struct perf_event *event; 2425 2426 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 2427 if (event->state <= PERF_EVENT_STATE_OFF) 2428 continue; 2429 if (!event_filter_match(event)) 2430 continue; 2431 2432 /* may need to reset tstamp_enabled */ 2433 if (is_cgroup_event(event)) 2434 perf_cgroup_mark_enabled(event, ctx); 2435 2436 if (group_can_go_on(event, cpuctx, 1)) 2437 group_sched_in(event, cpuctx, ctx); 2438 2439 /* 2440 * If this pinned group hasn't been scheduled, 2441 * put it in error state. 
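 *
 * Once in ERROR state the event stays off the PMU; a read() on its fd
 * returns 0 (see perf_read_hw()) and it is only considered again after
 * user space explicitly re-enables it, which first moves it from ERROR
 * back to OFF (see perf_event_enable()).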
2442 */ 2443 if (event->state == PERF_EVENT_STATE_INACTIVE) { 2444 update_group_times(event); 2445 event->state = PERF_EVENT_STATE_ERROR; 2446 } 2447 } 2448 } 2449 2450 static void 2451 ctx_flexible_sched_in(struct perf_event_context *ctx, 2452 struct perf_cpu_context *cpuctx) 2453 { 2454 struct perf_event *event; 2455 int can_add_hw = 1; 2456 2457 list_for_each_entry(event, &ctx->flexible_groups, group_entry) { 2458 /* Ignore events in OFF or ERROR state */ 2459 if (event->state <= PERF_EVENT_STATE_OFF) 2460 continue; 2461 /* 2462 * Listen to the 'cpu' scheduling filter constraint 2463 * of events: 2464 */ 2465 if (!event_filter_match(event)) 2466 continue; 2467 2468 /* may need to reset tstamp_enabled */ 2469 if (is_cgroup_event(event)) 2470 perf_cgroup_mark_enabled(event, ctx); 2471 2472 if (group_can_go_on(event, cpuctx, can_add_hw)) { 2473 if (group_sched_in(event, cpuctx, ctx)) 2474 can_add_hw = 0; 2475 } 2476 } 2477 } 2478 2479 static void 2480 ctx_sched_in(struct perf_event_context *ctx, 2481 struct perf_cpu_context *cpuctx, 2482 enum event_type_t event_type, 2483 struct task_struct *task) 2484 { 2485 u64 now; 2486 int is_active = ctx->is_active; 2487 2488 ctx->is_active |= event_type; 2489 if (likely(!ctx->nr_events)) 2490 return; 2491 2492 now = perf_clock(); 2493 ctx->timestamp = now; 2494 perf_cgroup_set_timestamp(task, ctx); 2495 /* 2496 * First go through the list and put on any pinned groups 2497 * in order to give them the best chance of going on. 2498 */ 2499 if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) 2500 ctx_pinned_sched_in(ctx, cpuctx); 2501 2502 /* Then walk through the lower prio flexible groups */ 2503 if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) 2504 ctx_flexible_sched_in(ctx, cpuctx); 2505 } 2506 2507 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2508 enum event_type_t event_type, 2509 struct task_struct *task) 2510 { 2511 struct perf_event_context *ctx = &cpuctx->ctx; 2512 2513 ctx_sched_in(ctx, cpuctx, event_type, task); 2514 } 2515 2516 static void perf_event_context_sched_in(struct perf_event_context *ctx, 2517 struct task_struct *task) 2518 { 2519 struct perf_cpu_context *cpuctx; 2520 2521 cpuctx = __get_cpu_context(ctx); 2522 if (cpuctx->task_ctx == ctx) 2523 return; 2524 2525 perf_ctx_lock(cpuctx, ctx); 2526 perf_pmu_disable(ctx->pmu); 2527 /* 2528 * We want to keep the following priority order: 2529 * cpu pinned (that don't need to move), task pinned, 2530 * cpu flexible, task flexible. 2531 */ 2532 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2533 2534 if (ctx->nr_events) 2535 cpuctx->task_ctx = ctx; 2536 2537 perf_event_sched_in(cpuctx, cpuctx->task_ctx, task); 2538 2539 perf_pmu_enable(ctx->pmu); 2540 perf_ctx_unlock(cpuctx, ctx); 2541 2542 /* 2543 * Since these rotations are per-cpu, we need to ensure the 2544 * cpu-context we got scheduled on is actually rotating. 2545 */ 2546 perf_pmu_rotate_start(ctx->pmu); 2547 } 2548 2549 /* 2550 * When sampling the branck stack in system-wide, it may be necessary 2551 * to flush the stack on context switch. This happens when the branch 2552 * stack does not tag its entries with the pid of the current task. 2553 * Otherwise it becomes impossible to associate a branch entry with a 2554 * task. This ambiguity is more likely to appear when the branch stack 2555 * supports priv level filtering and the user sets it to monitor only 2556 * at the user level (which could be a useful measurement in system-wide 2557 * mode). 
In that case, the risk is high of having a branch stack with 2558 * branch from multiple tasks. Flushing may mean dropping the existing 2559 * entries or stashing them somewhere in the PMU specific code layer. 2560 * 2561 * This function provides the context switch callback to the lower code 2562 * layer. It is invoked ONLY when there is at least one system-wide context 2563 * with at least one active event using taken branch sampling. 2564 */ 2565 static void perf_branch_stack_sched_in(struct task_struct *prev, 2566 struct task_struct *task) 2567 { 2568 struct perf_cpu_context *cpuctx; 2569 struct pmu *pmu; 2570 unsigned long flags; 2571 2572 /* no need to flush branch stack if not changing task */ 2573 if (prev == task) 2574 return; 2575 2576 local_irq_save(flags); 2577 2578 rcu_read_lock(); 2579 2580 list_for_each_entry_rcu(pmu, &pmus, entry) { 2581 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 2582 2583 /* 2584 * check if the context has at least one 2585 * event using PERF_SAMPLE_BRANCH_STACK 2586 */ 2587 if (cpuctx->ctx.nr_branch_stack > 0 2588 && pmu->flush_branch_stack) { 2589 2590 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 2591 2592 perf_pmu_disable(pmu); 2593 2594 pmu->flush_branch_stack(); 2595 2596 perf_pmu_enable(pmu); 2597 2598 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 2599 } 2600 } 2601 2602 rcu_read_unlock(); 2603 2604 local_irq_restore(flags); 2605 } 2606 2607 /* 2608 * Called from scheduler to add the events of the current task 2609 * with interrupts disabled. 2610 * 2611 * We restore the event value and then enable it. 2612 * 2613 * This does not protect us against NMI, but enable() 2614 * sets the enabled bit in the control field of event _before_ 2615 * accessing the event control register. If a NMI hits, then it will 2616 * keep the event running. 2617 */ 2618 void __perf_event_task_sched_in(struct task_struct *prev, 2619 struct task_struct *task) 2620 { 2621 struct perf_event_context *ctx; 2622 int ctxn; 2623 2624 for_each_task_context_nr(ctxn) { 2625 ctx = task->perf_event_ctxp[ctxn]; 2626 if (likely(!ctx)) 2627 continue; 2628 2629 perf_event_context_sched_in(ctx, task); 2630 } 2631 /* 2632 * if cgroup events exist on this CPU, then we need 2633 * to check if we have to switch in PMU state. 2634 * cgroup event are system-wide mode only 2635 */ 2636 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2637 perf_cgroup_sched_in(prev, task); 2638 2639 /* check for system-wide branch_stack events */ 2640 if (atomic_read(&__get_cpu_var(perf_branch_stack_events))) 2641 perf_branch_stack_sched_in(prev, task); 2642 } 2643 2644 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2645 { 2646 u64 frequency = event->attr.sample_freq; 2647 u64 sec = NSEC_PER_SEC; 2648 u64 divisor, dividend; 2649 2650 int count_fls, nsec_fls, frequency_fls, sec_fls; 2651 2652 count_fls = fls64(count); 2653 nsec_fls = fls64(nsec); 2654 frequency_fls = fls64(frequency); 2655 sec_fls = 30; 2656 2657 /* 2658 * We got @count in @nsec, with a target of sample_freq HZ 2659 * the target period becomes: 2660 * 2661 * @count * 10^9 2662 * period = ------------------- 2663 * @nsec * sample_freq 2664 * 2665 */ 2666 2667 /* 2668 * Reduce accuracy by one bit such that @a and @b converge 2669 * to a similar magnitude. 
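 *
 * As a concrete example of the formula above: 1,000,000 events counted
 * over a 10,000,000 ns tick with sample_freq = 1000 Hz gives
 *
 *     period = 1,000,000 * 10^9 / (10,000,000 * 1000) = 100,000
 *
 * i.e. one sample every 100,000 events. The REDUCE_FLS() dance below only
 * exists to evaluate that expression without overflowing 64 bits.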
2670 */ 2671 #define REDUCE_FLS(a, b) \ 2672 do { \ 2673 if (a##_fls > b##_fls) { \ 2674 a >>= 1; \ 2675 a##_fls--; \ 2676 } else { \ 2677 b >>= 1; \ 2678 b##_fls--; \ 2679 } \ 2680 } while (0) 2681 2682 /* 2683 * Reduce accuracy until either term fits in a u64, then proceed with 2684 * the other, so that finally we can do a u64/u64 division. 2685 */ 2686 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { 2687 REDUCE_FLS(nsec, frequency); 2688 REDUCE_FLS(sec, count); 2689 } 2690 2691 if (count_fls + sec_fls > 64) { 2692 divisor = nsec * frequency; 2693 2694 while (count_fls + sec_fls > 64) { 2695 REDUCE_FLS(count, sec); 2696 divisor >>= 1; 2697 } 2698 2699 dividend = count * sec; 2700 } else { 2701 dividend = count * sec; 2702 2703 while (nsec_fls + frequency_fls > 64) { 2704 REDUCE_FLS(nsec, frequency); 2705 dividend >>= 1; 2706 } 2707 2708 divisor = nsec * frequency; 2709 } 2710 2711 if (!divisor) 2712 return dividend; 2713 2714 return div64_u64(dividend, divisor); 2715 } 2716 2717 static DEFINE_PER_CPU(int, perf_throttled_count); 2718 static DEFINE_PER_CPU(u64, perf_throttled_seq); 2719 2720 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) 2721 { 2722 struct hw_perf_event *hwc = &event->hw; 2723 s64 period, sample_period; 2724 s64 delta; 2725 2726 period = perf_calculate_period(event, nsec, count); 2727 2728 delta = (s64)(period - hwc->sample_period); 2729 delta = (delta + 7) / 8; /* low pass filter */ 2730 2731 sample_period = hwc->sample_period + delta; 2732 2733 if (!sample_period) 2734 sample_period = 1; 2735 2736 hwc->sample_period = sample_period; 2737 2738 if (local64_read(&hwc->period_left) > 8*sample_period) { 2739 if (disable) 2740 event->pmu->stop(event, PERF_EF_UPDATE); 2741 2742 local64_set(&hwc->period_left, 0); 2743 2744 if (disable) 2745 event->pmu->start(event, PERF_EF_RELOAD); 2746 } 2747 } 2748 2749 /* 2750 * combine freq adjustment with unthrottling to avoid two passes over the 2751 * events. At the same time, make sure, having freq events does not change 2752 * the rate of unthrottling as that would introduce bias. 
2753 */ 2754 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, 2755 int needs_unthr) 2756 { 2757 struct perf_event *event; 2758 struct hw_perf_event *hwc; 2759 u64 now, period = TICK_NSEC; 2760 s64 delta; 2761 2762 /* 2763 * only need to iterate over all events iff: 2764 * - context have events in frequency mode (needs freq adjust) 2765 * - there are events to unthrottle on this cpu 2766 */ 2767 if (!(ctx->nr_freq || needs_unthr)) 2768 return; 2769 2770 raw_spin_lock(&ctx->lock); 2771 perf_pmu_disable(ctx->pmu); 2772 2773 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 2774 if (event->state != PERF_EVENT_STATE_ACTIVE) 2775 continue; 2776 2777 if (!event_filter_match(event)) 2778 continue; 2779 2780 perf_pmu_disable(event->pmu); 2781 2782 hwc = &event->hw; 2783 2784 if (hwc->interrupts == MAX_INTERRUPTS) { 2785 hwc->interrupts = 0; 2786 perf_log_throttle(event, 1); 2787 event->pmu->start(event, 0); 2788 } 2789 2790 if (!event->attr.freq || !event->attr.sample_freq) 2791 goto next; 2792 2793 /* 2794 * stop the event and update event->count 2795 */ 2796 event->pmu->stop(event, PERF_EF_UPDATE); 2797 2798 now = local64_read(&event->count); 2799 delta = now - hwc->freq_count_stamp; 2800 hwc->freq_count_stamp = now; 2801 2802 /* 2803 * restart the event 2804 * reload only if value has changed 2805 * we have stopped the event so tell that 2806 * to perf_adjust_period() to avoid stopping it 2807 * twice. 2808 */ 2809 if (delta > 0) 2810 perf_adjust_period(event, period, delta, false); 2811 2812 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); 2813 next: 2814 perf_pmu_enable(event->pmu); 2815 } 2816 2817 perf_pmu_enable(ctx->pmu); 2818 raw_spin_unlock(&ctx->lock); 2819 } 2820 2821 /* 2822 * Round-robin a context's events: 2823 */ 2824 static void rotate_ctx(struct perf_event_context *ctx) 2825 { 2826 /* 2827 * Rotate the first entry last of non-pinned groups. Rotation might be 2828 * disabled by the inheritance code. 2829 */ 2830 if (!ctx->rotate_disable) 2831 list_rotate_left(&ctx->flexible_groups); 2832 } 2833 2834 /* 2835 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized 2836 * because they're strictly cpu affine and rotate_start is called with IRQs 2837 * disabled, while rotate_context is called from IRQ context. 
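 *
 * (Background: this rotation is what implements counter multiplexing.
 * When a context has more flexible events than the PMU has counters,
 * each tick rotates the flexible list so that every group gets its turn
 * on the hardware. User space can then estimate the un-multiplexed value
 * from the enabled/running times it reads back, roughly:
 *
 *     estimate = raw_count * time_enabled / time_running
 *
 * which is only exact if the event rate was steady while the group was
 * scheduled out.)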
2838 */ 2839 static int perf_rotate_context(struct perf_cpu_context *cpuctx) 2840 { 2841 struct perf_event_context *ctx = NULL; 2842 int rotate = 0, remove = 1; 2843 2844 if (cpuctx->ctx.nr_events) { 2845 remove = 0; 2846 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 2847 rotate = 1; 2848 } 2849 2850 ctx = cpuctx->task_ctx; 2851 if (ctx && ctx->nr_events) { 2852 remove = 0; 2853 if (ctx->nr_events != ctx->nr_active) 2854 rotate = 1; 2855 } 2856 2857 if (!rotate) 2858 goto done; 2859 2860 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 2861 perf_pmu_disable(cpuctx->ctx.pmu); 2862 2863 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2864 if (ctx) 2865 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); 2866 2867 rotate_ctx(&cpuctx->ctx); 2868 if (ctx) 2869 rotate_ctx(ctx); 2870 2871 perf_event_sched_in(cpuctx, ctx, current); 2872 2873 perf_pmu_enable(cpuctx->ctx.pmu); 2874 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 2875 done: 2876 if (remove) 2877 list_del_init(&cpuctx->rotation_list); 2878 2879 return rotate; 2880 } 2881 2882 #ifdef CONFIG_NO_HZ_FULL 2883 bool perf_event_can_stop_tick(void) 2884 { 2885 if (atomic_read(&nr_freq_events) || 2886 __this_cpu_read(perf_throttled_count)) 2887 return false; 2888 else 2889 return true; 2890 } 2891 #endif 2892 2893 void perf_event_task_tick(void) 2894 { 2895 struct list_head *head = &__get_cpu_var(rotation_list); 2896 struct perf_cpu_context *cpuctx, *tmp; 2897 struct perf_event_context *ctx; 2898 int throttled; 2899 2900 WARN_ON(!irqs_disabled()); 2901 2902 __this_cpu_inc(perf_throttled_seq); 2903 throttled = __this_cpu_xchg(perf_throttled_count, 0); 2904 2905 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { 2906 ctx = &cpuctx->ctx; 2907 perf_adjust_freq_unthr_context(ctx, throttled); 2908 2909 ctx = cpuctx->task_ctx; 2910 if (ctx) 2911 perf_adjust_freq_unthr_context(ctx, throttled); 2912 } 2913 } 2914 2915 static int event_enable_on_exec(struct perf_event *event, 2916 struct perf_event_context *ctx) 2917 { 2918 if (!event->attr.enable_on_exec) 2919 return 0; 2920 2921 event->attr.enable_on_exec = 0; 2922 if (event->state >= PERF_EVENT_STATE_INACTIVE) 2923 return 0; 2924 2925 __perf_event_mark_enabled(event); 2926 2927 return 1; 2928 } 2929 2930 /* 2931 * Enable all of a task's events that have been marked enable-on-exec. 2932 * This expects task == current. 2933 */ 2934 static void perf_event_enable_on_exec(struct perf_event_context *ctx) 2935 { 2936 struct perf_event *event; 2937 unsigned long flags; 2938 int enabled = 0; 2939 int ret; 2940 2941 local_irq_save(flags); 2942 if (!ctx || !ctx->nr_events) 2943 goto out; 2944 2945 /* 2946 * We must ctxsw out cgroup events to avoid conflict 2947 * when invoking perf_task_event_sched_in() later on 2948 * in this function. Otherwise we end up trying to 2949 * ctxswin cgroup events which are already scheduled 2950 * in. 2951 */ 2952 perf_cgroup_sched_out(current, NULL); 2953 2954 raw_spin_lock(&ctx->lock); 2955 task_ctx_sched_out(ctx); 2956 2957 list_for_each_entry(event, &ctx->event_list, event_entry) { 2958 ret = event_enable_on_exec(event, ctx); 2959 if (ret) 2960 enabled = 1; 2961 } 2962 2963 /* 2964 * Unclone this context if we enabled any event. 
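 *
 * This is the path that makes the usual profiling pattern work: open the
 * events with attr.disabled = 1 and attr.enable_on_exec = 1, fork(), and
 * the counters only start ticking once the child exec()s the workload,
 * so none of the setup in between is measured. Illustrative user-space
 * sketch (perf_event_open() has no libc wrapper, hence syscall()):
 *
 *     attr.disabled       = 1;
 *     attr.enable_on_exec = 1;
 *     fd = syscall(__NR_perf_event_open, &attr, child_pid, -1, -1, 0);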
2965 */ 2966 if (enabled) 2967 unclone_ctx(ctx); 2968 2969 raw_spin_unlock(&ctx->lock); 2970 2971 /* 2972 * Also calls ctxswin for cgroup events, if any: 2973 */ 2974 perf_event_context_sched_in(ctx, ctx->task); 2975 out: 2976 local_irq_restore(flags); 2977 } 2978 2979 void perf_event_exec(void) 2980 { 2981 struct perf_event_context *ctx; 2982 int ctxn; 2983 2984 rcu_read_lock(); 2985 for_each_task_context_nr(ctxn) { 2986 ctx = current->perf_event_ctxp[ctxn]; 2987 if (!ctx) 2988 continue; 2989 2990 perf_event_enable_on_exec(ctx); 2991 } 2992 rcu_read_unlock(); 2993 } 2994 2995 /* 2996 * Cross CPU call to read the hardware event 2997 */ 2998 static void __perf_event_read(void *info) 2999 { 3000 struct perf_event *event = info; 3001 struct perf_event_context *ctx = event->ctx; 3002 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 3003 3004 /* 3005 * If this is a task context, we need to check whether it is 3006 * the current task context of this cpu. If not it has been 3007 * scheduled out before the smp call arrived. In that case 3008 * event->count would have been updated to a recent sample 3009 * when the event was scheduled out. 3010 */ 3011 if (ctx->task && cpuctx->task_ctx != ctx) 3012 return; 3013 3014 raw_spin_lock(&ctx->lock); 3015 if (ctx->is_active) { 3016 update_context_time(ctx); 3017 update_cgrp_time_from_event(event); 3018 } 3019 update_event_times(event); 3020 if (event->state == PERF_EVENT_STATE_ACTIVE) 3021 event->pmu->read(event); 3022 raw_spin_unlock(&ctx->lock); 3023 } 3024 3025 static inline u64 perf_event_count(struct perf_event *event) 3026 { 3027 return local64_read(&event->count) + atomic64_read(&event->child_count); 3028 } 3029 3030 static u64 perf_event_read(struct perf_event *event) 3031 { 3032 /* 3033 * If event is enabled and currently active on a CPU, update the 3034 * value in the event structure: 3035 */ 3036 if (event->state == PERF_EVENT_STATE_ACTIVE) { 3037 smp_call_function_single(event->oncpu, 3038 __perf_event_read, event, 1); 3039 } else if (event->state == PERF_EVENT_STATE_INACTIVE) { 3040 struct perf_event_context *ctx = event->ctx; 3041 unsigned long flags; 3042 3043 raw_spin_lock_irqsave(&ctx->lock, flags); 3044 /* 3045 * may read while context is not active 3046 * (e.g., thread is blocked), in that case 3047 * we cannot update context time 3048 */ 3049 if (ctx->is_active) { 3050 update_context_time(ctx); 3051 update_cgrp_time_from_event(event); 3052 } 3053 update_event_times(event); 3054 raw_spin_unlock_irqrestore(&ctx->lock, flags); 3055 } 3056 3057 return perf_event_count(event); 3058 } 3059 3060 /* 3061 * Initialize the perf_event context in a task_struct: 3062 */ 3063 static void __perf_event_init_context(struct perf_event_context *ctx) 3064 { 3065 raw_spin_lock_init(&ctx->lock); 3066 mutex_init(&ctx->mutex); 3067 INIT_LIST_HEAD(&ctx->pinned_groups); 3068 INIT_LIST_HEAD(&ctx->flexible_groups); 3069 INIT_LIST_HEAD(&ctx->event_list); 3070 atomic_set(&ctx->refcount, 1); 3071 } 3072 3073 static struct perf_event_context * 3074 alloc_perf_context(struct pmu *pmu, struct task_struct *task) 3075 { 3076 struct perf_event_context *ctx; 3077 3078 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); 3079 if (!ctx) 3080 return NULL; 3081 3082 __perf_event_init_context(ctx); 3083 if (task) { 3084 ctx->task = task; 3085 get_task_struct(task); 3086 } 3087 ctx->pmu = pmu; 3088 3089 return ctx; 3090 } 3091 3092 static struct task_struct * 3093 find_lively_task_by_vpid(pid_t vpid) 3094 { 3095 struct task_struct *task; 3096 int err; 3097 3098 
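	/*
	 * Resolve the pid under rcu_read_lock(): the task_struct returned by
	 * find_task_by_vpid() is only guaranteed to stay around while the RCU
	 * read side is held, so take a real reference with get_task_struct()
	 * before dropping it. A vpid of 0 means the calling task itself.
	 */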
rcu_read_lock(); 3099 if (!vpid) 3100 task = current; 3101 else 3102 task = find_task_by_vpid(vpid); 3103 if (task) 3104 get_task_struct(task); 3105 rcu_read_unlock(); 3106 3107 if (!task) 3108 return ERR_PTR(-ESRCH); 3109 3110 /* Reuse ptrace permission checks for now. */ 3111 err = -EACCES; 3112 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 3113 goto errout; 3114 3115 return task; 3116 errout: 3117 put_task_struct(task); 3118 return ERR_PTR(err); 3119 3120 } 3121 3122 /* 3123 * Returns a matching context with refcount and pincount. 3124 */ 3125 static struct perf_event_context * 3126 find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) 3127 { 3128 struct perf_event_context *ctx; 3129 struct perf_cpu_context *cpuctx; 3130 unsigned long flags; 3131 int ctxn, err; 3132 3133 if (!task) { 3134 /* Must be root to operate on a CPU event: */ 3135 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 3136 return ERR_PTR(-EACCES); 3137 3138 /* 3139 * We could be clever and allow to attach a event to an 3140 * offline CPU and activate it when the CPU comes up, but 3141 * that's for later. 3142 */ 3143 if (!cpu_online(cpu)) 3144 return ERR_PTR(-ENODEV); 3145 3146 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 3147 ctx = &cpuctx->ctx; 3148 get_ctx(ctx); 3149 ++ctx->pin_count; 3150 3151 return ctx; 3152 } 3153 3154 err = -EINVAL; 3155 ctxn = pmu->task_ctx_nr; 3156 if (ctxn < 0) 3157 goto errout; 3158 3159 retry: 3160 ctx = perf_lock_task_context(task, ctxn, &flags); 3161 if (ctx) { 3162 unclone_ctx(ctx); 3163 ++ctx->pin_count; 3164 raw_spin_unlock_irqrestore(&ctx->lock, flags); 3165 } else { 3166 ctx = alloc_perf_context(pmu, task); 3167 err = -ENOMEM; 3168 if (!ctx) 3169 goto errout; 3170 3171 err = 0; 3172 mutex_lock(&task->perf_event_mutex); 3173 /* 3174 * If it has already passed perf_event_exit_task(). 3175 * we must see PF_EXITING, it takes this mutex too. 
3176 */ 3177 if (task->flags & PF_EXITING) 3178 err = -ESRCH; 3179 else if (task->perf_event_ctxp[ctxn]) 3180 err = -EAGAIN; 3181 else { 3182 get_ctx(ctx); 3183 ++ctx->pin_count; 3184 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); 3185 } 3186 mutex_unlock(&task->perf_event_mutex); 3187 3188 if (unlikely(err)) { 3189 put_ctx(ctx); 3190 3191 if (err == -EAGAIN) 3192 goto retry; 3193 goto errout; 3194 } 3195 } 3196 3197 return ctx; 3198 3199 errout: 3200 return ERR_PTR(err); 3201 } 3202 3203 static void perf_event_free_filter(struct perf_event *event); 3204 3205 static void free_event_rcu(struct rcu_head *head) 3206 { 3207 struct perf_event *event; 3208 3209 event = container_of(head, struct perf_event, rcu_head); 3210 if (event->ns) 3211 put_pid_ns(event->ns); 3212 perf_event_free_filter(event); 3213 kfree(event); 3214 } 3215 3216 static void ring_buffer_put(struct ring_buffer *rb); 3217 static void ring_buffer_attach(struct perf_event *event, 3218 struct ring_buffer *rb); 3219 3220 static void unaccount_event_cpu(struct perf_event *event, int cpu) 3221 { 3222 if (event->parent) 3223 return; 3224 3225 if (has_branch_stack(event)) { 3226 if (!(event->attach_state & PERF_ATTACH_TASK)) 3227 atomic_dec(&per_cpu(perf_branch_stack_events, cpu)); 3228 } 3229 if (is_cgroup_event(event)) 3230 atomic_dec(&per_cpu(perf_cgroup_events, cpu)); 3231 } 3232 3233 static void unaccount_event(struct perf_event *event) 3234 { 3235 if (event->parent) 3236 return; 3237 3238 if (event->attach_state & PERF_ATTACH_TASK) 3239 static_key_slow_dec_deferred(&perf_sched_events); 3240 if (event->attr.mmap || event->attr.mmap_data) 3241 atomic_dec(&nr_mmap_events); 3242 if (event->attr.comm) 3243 atomic_dec(&nr_comm_events); 3244 if (event->attr.task) 3245 atomic_dec(&nr_task_events); 3246 if (event->attr.freq) 3247 atomic_dec(&nr_freq_events); 3248 if (is_cgroup_event(event)) 3249 static_key_slow_dec_deferred(&perf_sched_events); 3250 if (has_branch_stack(event)) 3251 static_key_slow_dec_deferred(&perf_sched_events); 3252 3253 unaccount_event_cpu(event, event->cpu); 3254 } 3255 3256 static void __free_event(struct perf_event *event) 3257 { 3258 if (!event->parent) { 3259 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 3260 put_callchain_buffers(); 3261 } 3262 3263 if (event->destroy) 3264 event->destroy(event); 3265 3266 if (event->ctx) 3267 put_ctx(event->ctx); 3268 3269 if (event->pmu) 3270 module_put(event->pmu->module); 3271 3272 call_rcu(&event->rcu_head, free_event_rcu); 3273 } 3274 3275 static void _free_event(struct perf_event *event) 3276 { 3277 irq_work_sync(&event->pending); 3278 3279 unaccount_event(event); 3280 3281 if (event->rb) { 3282 /* 3283 * Can happen when we close an event with re-directed output. 3284 * 3285 * Since we have a 0 refcount, perf_mmap_close() will skip 3286 * over us; possibly making our ring_buffer_put() the last. 3287 */ 3288 mutex_lock(&event->mmap_mutex); 3289 ring_buffer_attach(event, NULL); 3290 mutex_unlock(&event->mmap_mutex); 3291 } 3292 3293 if (is_cgroup_event(event)) 3294 perf_detach_cgroup(event); 3295 3296 __free_event(event); 3297 } 3298 3299 /* 3300 * Used to free events which have a known refcount of 1, such as in error paths 3301 * where the event isn't exposed yet and inherited events. 
3302 */ 3303 static void free_event(struct perf_event *event) 3304 { 3305 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, 3306 "unexpected event refcount: %ld; ptr=%p\n", 3307 atomic_long_read(&event->refcount), event)) { 3308 /* leak to avoid use-after-free */ 3309 return; 3310 } 3311 3312 _free_event(event); 3313 } 3314 3315 /* 3316 * Called when the last reference to the file is gone. 3317 */ 3318 static void put_event(struct perf_event *event) 3319 { 3320 struct perf_event_context *ctx = event->ctx; 3321 struct task_struct *owner; 3322 3323 if (!atomic_long_dec_and_test(&event->refcount)) 3324 return; 3325 3326 rcu_read_lock(); 3327 owner = ACCESS_ONCE(event->owner); 3328 /* 3329 * Matches the smp_wmb() in perf_event_exit_task(). If we observe 3330 * !owner it means the list deletion is complete and we can indeed 3331 * free this event, otherwise we need to serialize on 3332 * owner->perf_event_mutex. 3333 */ 3334 smp_read_barrier_depends(); 3335 if (owner) { 3336 /* 3337 * Since delayed_put_task_struct() also drops the last 3338 * task reference we can safely take a new reference 3339 * while holding the rcu_read_lock(). 3340 */ 3341 get_task_struct(owner); 3342 } 3343 rcu_read_unlock(); 3344 3345 if (owner) { 3346 mutex_lock(&owner->perf_event_mutex); 3347 /* 3348 * We have to re-check the event->owner field, if it is cleared 3349 * we raced with perf_event_exit_task(), acquiring the mutex 3350 * ensured they're done, and we can proceed with freeing the 3351 * event. 3352 */ 3353 if (event->owner) 3354 list_del_init(&event->owner_entry); 3355 mutex_unlock(&owner->perf_event_mutex); 3356 put_task_struct(owner); 3357 } 3358 3359 WARN_ON_ONCE(ctx->parent_ctx); 3360 /* 3361 * There are two ways this annotation is useful: 3362 * 3363 * 1) there is a lock recursion from perf_event_exit_task 3364 * see the comment there. 3365 * 3366 * 2) there is a lock-inversion with mmap_sem through 3367 * perf_event_read_group(), which takes faults while 3368 * holding ctx->mutex, however this is called after 3369 * the last filedesc died, so there is no possibility 3370 * to trigger the AB-BA case. 
3371 */ 3372 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); 3373 perf_remove_from_context(event, true); 3374 mutex_unlock(&ctx->mutex); 3375 3376 _free_event(event); 3377 } 3378 3379 int perf_event_release_kernel(struct perf_event *event) 3380 { 3381 put_event(event); 3382 return 0; 3383 } 3384 EXPORT_SYMBOL_GPL(perf_event_release_kernel); 3385 3386 static int perf_release(struct inode *inode, struct file *file) 3387 { 3388 put_event(file->private_data); 3389 return 0; 3390 } 3391 3392 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 3393 { 3394 struct perf_event *child; 3395 u64 total = 0; 3396 3397 *enabled = 0; 3398 *running = 0; 3399 3400 mutex_lock(&event->child_mutex); 3401 total += perf_event_read(event); 3402 *enabled += event->total_time_enabled + 3403 atomic64_read(&event->child_total_time_enabled); 3404 *running += event->total_time_running + 3405 atomic64_read(&event->child_total_time_running); 3406 3407 list_for_each_entry(child, &event->child_list, child_list) { 3408 total += perf_event_read(child); 3409 *enabled += child->total_time_enabled; 3410 *running += child->total_time_running; 3411 } 3412 mutex_unlock(&event->child_mutex); 3413 3414 return total; 3415 } 3416 EXPORT_SYMBOL_GPL(perf_event_read_value); 3417 3418 static int perf_event_read_group(struct perf_event *event, 3419 u64 read_format, char __user *buf) 3420 { 3421 struct perf_event *leader = event->group_leader, *sub; 3422 int n = 0, size = 0, ret = -EFAULT; 3423 struct perf_event_context *ctx = leader->ctx; 3424 u64 values[5]; 3425 u64 count, enabled, running; 3426 3427 mutex_lock(&ctx->mutex); 3428 count = perf_event_read_value(leader, &enabled, &running); 3429 3430 values[n++] = 1 + leader->nr_siblings; 3431 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 3432 values[n++] = enabled; 3433 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 3434 values[n++] = running; 3435 values[n++] = count; 3436 if (read_format & PERF_FORMAT_ID) 3437 values[n++] = primary_event_id(leader); 3438 3439 size = n * sizeof(u64); 3440 3441 if (copy_to_user(buf, values, size)) 3442 goto unlock; 3443 3444 ret = size; 3445 3446 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 3447 n = 0; 3448 3449 values[n++] = perf_event_read_value(sub, &enabled, &running); 3450 if (read_format & PERF_FORMAT_ID) 3451 values[n++] = primary_event_id(sub); 3452 3453 size = n * sizeof(u64); 3454 3455 if (copy_to_user(buf + ret, values, size)) { 3456 ret = -EFAULT; 3457 goto unlock; 3458 } 3459 3460 ret += size; 3461 } 3462 unlock: 3463 mutex_unlock(&ctx->mutex); 3464 3465 return ret; 3466 } 3467 3468 static int perf_event_read_one(struct perf_event *event, 3469 u64 read_format, char __user *buf) 3470 { 3471 u64 enabled, running; 3472 u64 values[4]; 3473 int n = 0; 3474 3475 values[n++] = perf_event_read_value(event, &enabled, &running); 3476 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 3477 values[n++] = enabled; 3478 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 3479 values[n++] = running; 3480 if (read_format & PERF_FORMAT_ID) 3481 values[n++] = primary_event_id(event); 3482 3483 if (copy_to_user(buf, values, n * sizeof(u64))) 3484 return -EFAULT; 3485 3486 return n * sizeof(u64); 3487 } 3488 3489 /* 3490 * Read the performance event - simple non blocking version for now 3491 */ 3492 static ssize_t 3493 perf_read_hw(struct perf_event *event, char __user *buf, size_t count) 3494 { 3495 u64 read_format = event->attr.read_format; 3496 int ret; 3497 3498 /* 3499 * Return end-of-file for a read on 
an event that is in 3500 * error state (i.e. because it was pinned but it couldn't be 3501 * scheduled on to the CPU at some point). 3502 */ 3503 if (event->state == PERF_EVENT_STATE_ERROR) 3504 return 0; 3505 3506 if (count < event->read_size) 3507 return -ENOSPC; 3508 3509 WARN_ON_ONCE(event->ctx->parent_ctx); 3510 if (read_format & PERF_FORMAT_GROUP) 3511 ret = perf_event_read_group(event, read_format, buf); 3512 else 3513 ret = perf_event_read_one(event, read_format, buf); 3514 3515 return ret; 3516 } 3517 3518 static ssize_t 3519 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) 3520 { 3521 struct perf_event *event = file->private_data; 3522 3523 return perf_read_hw(event, buf, count); 3524 } 3525 3526 static unsigned int perf_poll(struct file *file, poll_table *wait) 3527 { 3528 struct perf_event *event = file->private_data; 3529 struct ring_buffer *rb; 3530 unsigned int events = POLL_HUP; 3531 3532 /* 3533 * Pin the event->rb by taking event->mmap_mutex; otherwise 3534 * perf_event_set_output() can swizzle our rb and make us miss wakeups. 3535 */ 3536 mutex_lock(&event->mmap_mutex); 3537 rb = event->rb; 3538 if (rb) 3539 events = atomic_xchg(&rb->poll, 0); 3540 mutex_unlock(&event->mmap_mutex); 3541 3542 poll_wait(file, &event->waitq, wait); 3543 3544 return events; 3545 } 3546 3547 static void perf_event_reset(struct perf_event *event) 3548 { 3549 (void)perf_event_read(event); 3550 local64_set(&event->count, 0); 3551 perf_event_update_userpage(event); 3552 } 3553 3554 /* 3555 * Holding the top-level event's child_mutex means that any 3556 * descendant process that has inherited this event will block 3557 * in sync_child_event if it goes to exit, thus satisfying the 3558 * task existence requirements of perf_event_enable/disable.
3559 */ 3560 static void perf_event_for_each_child(struct perf_event *event, 3561 void (*func)(struct perf_event *)) 3562 { 3563 struct perf_event *child; 3564 3565 WARN_ON_ONCE(event->ctx->parent_ctx); 3566 mutex_lock(&event->child_mutex); 3567 func(event); 3568 list_for_each_entry(child, &event->child_list, child_list) 3569 func(child); 3570 mutex_unlock(&event->child_mutex); 3571 } 3572 3573 static void perf_event_for_each(struct perf_event *event, 3574 void (*func)(struct perf_event *)) 3575 { 3576 struct perf_event_context *ctx = event->ctx; 3577 struct perf_event *sibling; 3578 3579 WARN_ON_ONCE(ctx->parent_ctx); 3580 mutex_lock(&ctx->mutex); 3581 event = event->group_leader; 3582 3583 perf_event_for_each_child(event, func); 3584 list_for_each_entry(sibling, &event->sibling_list, group_entry) 3585 perf_event_for_each_child(sibling, func); 3586 mutex_unlock(&ctx->mutex); 3587 } 3588 3589 static int perf_event_period(struct perf_event *event, u64 __user *arg) 3590 { 3591 struct perf_event_context *ctx = event->ctx; 3592 int ret = 0, active; 3593 u64 value; 3594 3595 if (!is_sampling_event(event)) 3596 return -EINVAL; 3597 3598 if (copy_from_user(&value, arg, sizeof(value))) 3599 return -EFAULT; 3600 3601 if (!value) 3602 return -EINVAL; 3603 3604 raw_spin_lock_irq(&ctx->lock); 3605 if (event->attr.freq) { 3606 if (value > sysctl_perf_event_sample_rate) { 3607 ret = -EINVAL; 3608 goto unlock; 3609 } 3610 3611 event->attr.sample_freq = value; 3612 } else { 3613 event->attr.sample_period = value; 3614 event->hw.sample_period = value; 3615 } 3616 3617 active = (event->state == PERF_EVENT_STATE_ACTIVE); 3618 if (active) { 3619 perf_pmu_disable(ctx->pmu); 3620 event->pmu->stop(event, PERF_EF_UPDATE); 3621 } 3622 3623 local64_set(&event->hw.period_left, 0); 3624 3625 if (active) { 3626 event->pmu->start(event, PERF_EF_RELOAD); 3627 perf_pmu_enable(ctx->pmu); 3628 } 3629 3630 unlock: 3631 raw_spin_unlock_irq(&ctx->lock); 3632 3633 return ret; 3634 } 3635 3636 static const struct file_operations perf_fops; 3637 3638 static inline int perf_fget_light(int fd, struct fd *p) 3639 { 3640 struct fd f = fdget(fd); 3641 if (!f.file) 3642 return -EBADF; 3643 3644 if (f.file->f_op != &perf_fops) { 3645 fdput(f); 3646 return -EBADF; 3647 } 3648 *p = f; 3649 return 0; 3650 } 3651 3652 static int perf_event_set_output(struct perf_event *event, 3653 struct perf_event *output_event); 3654 static int perf_event_set_filter(struct perf_event *event, void __user *arg); 3655 3656 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 3657 { 3658 struct perf_event *event = file->private_data; 3659 void (*func)(struct perf_event *); 3660 u32 flags = arg; 3661 3662 switch (cmd) { 3663 case PERF_EVENT_IOC_ENABLE: 3664 func = perf_event_enable; 3665 break; 3666 case PERF_EVENT_IOC_DISABLE: 3667 func = perf_event_disable; 3668 break; 3669 case PERF_EVENT_IOC_RESET: 3670 func = perf_event_reset; 3671 break; 3672 3673 case PERF_EVENT_IOC_REFRESH: 3674 return perf_event_refresh(event, arg); 3675 3676 case PERF_EVENT_IOC_PERIOD: 3677 return perf_event_period(event, (u64 __user *)arg); 3678 3679 case PERF_EVENT_IOC_ID: 3680 { 3681 u64 id = primary_event_id(event); 3682 3683 if (copy_to_user((void __user *)arg, &id, sizeof(id))) 3684 return -EFAULT; 3685 return 0; 3686 } 3687 3688 case PERF_EVENT_IOC_SET_OUTPUT: 3689 { 3690 int ret; 3691 if (arg != -1) { 3692 struct perf_event *output_event; 3693 struct fd output; 3694 ret = perf_fget_light(arg, &output); 3695 if (ret) 3696 return ret; 3697 
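		/*
		 * PERF_EVENT_IOC_SET_OUTPUT redirects this event's output into
		 * another event's ring buffer, subject to the compatibility
		 * checks in perf_event_set_output(). Typical user-space usage
		 * (illustrative): ioctl(fd_aux, PERF_EVENT_IOC_SET_OUTPUT,
		 * fd_leader) so that a single mmap()ed buffer carries both
		 * streams; passing -1 instead of an fd removes the redirection.
		 */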
output_event = output.file->private_data; 3698 ret = perf_event_set_output(event, output_event); 3699 fdput(output); 3700 } else { 3701 ret = perf_event_set_output(event, NULL); 3702 } 3703 return ret; 3704 } 3705 3706 case PERF_EVENT_IOC_SET_FILTER: 3707 return perf_event_set_filter(event, (void __user *)arg); 3708 3709 default: 3710 return -ENOTTY; 3711 } 3712 3713 if (flags & PERF_IOC_FLAG_GROUP) 3714 perf_event_for_each(event, func); 3715 else 3716 perf_event_for_each_child(event, func); 3717 3718 return 0; 3719 } 3720 3721 #ifdef CONFIG_COMPAT 3722 static long perf_compat_ioctl(struct file *file, unsigned int cmd, 3723 unsigned long arg) 3724 { 3725 switch (_IOC_NR(cmd)) { 3726 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): 3727 case _IOC_NR(PERF_EVENT_IOC_ID): 3728 /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ 3729 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { 3730 cmd &= ~IOCSIZE_MASK; 3731 cmd |= sizeof(void *) << IOCSIZE_SHIFT; 3732 } 3733 break; 3734 } 3735 return perf_ioctl(file, cmd, arg); 3736 } 3737 #else 3738 # define perf_compat_ioctl NULL 3739 #endif 3740 3741 int perf_event_task_enable(void) 3742 { 3743 struct perf_event *event; 3744 3745 mutex_lock(¤t->perf_event_mutex); 3746 list_for_each_entry(event, ¤t->perf_event_list, owner_entry) 3747 perf_event_for_each_child(event, perf_event_enable); 3748 mutex_unlock(¤t->perf_event_mutex); 3749 3750 return 0; 3751 } 3752 3753 int perf_event_task_disable(void) 3754 { 3755 struct perf_event *event; 3756 3757 mutex_lock(¤t->perf_event_mutex); 3758 list_for_each_entry(event, ¤t->perf_event_list, owner_entry) 3759 perf_event_for_each_child(event, perf_event_disable); 3760 mutex_unlock(¤t->perf_event_mutex); 3761 3762 return 0; 3763 } 3764 3765 static int perf_event_index(struct perf_event *event) 3766 { 3767 if (event->hw.state & PERF_HES_STOPPED) 3768 return 0; 3769 3770 if (event->state != PERF_EVENT_STATE_ACTIVE) 3771 return 0; 3772 3773 return event->pmu->event_idx(event); 3774 } 3775 3776 static void calc_timer_values(struct perf_event *event, 3777 u64 *now, 3778 u64 *enabled, 3779 u64 *running) 3780 { 3781 u64 ctx_time; 3782 3783 *now = perf_clock(); 3784 ctx_time = event->shadow_ctx_time + *now; 3785 *enabled = ctx_time - event->tstamp_enabled; 3786 *running = ctx_time - event->tstamp_running; 3787 } 3788 3789 static void perf_event_init_userpage(struct perf_event *event) 3790 { 3791 struct perf_event_mmap_page *userpg; 3792 struct ring_buffer *rb; 3793 3794 rcu_read_lock(); 3795 rb = rcu_dereference(event->rb); 3796 if (!rb) 3797 goto unlock; 3798 3799 userpg = rb->user_page; 3800 3801 /* Allow new userspace to detect that bit 0 is deprecated */ 3802 userpg->cap_bit0_is_deprecated = 1; 3803 userpg->size = offsetof(struct perf_event_mmap_page, __reserved); 3804 3805 unlock: 3806 rcu_read_unlock(); 3807 } 3808 3809 void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) 3810 { 3811 } 3812 3813 /* 3814 * Callers need to ensure there can be no nesting of this function, otherwise 3815 * the seqlock logic goes bad. We can not serialize this because the arch 3816 * code calls this from NMI context. 
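 *
 * The user-space side pairs with this by re-reading everything whenever
 * the lock word changed underneath it, roughly (illustrative sketch, not
 * part of this file):
 *
 *     struct perf_event_mmap_page *pc = mmap_base;
 *     u32 seq;
 *     s64 offset;
 *     u64 enabled, running;
 *
 *     do {
 *             seq = pc->lock;
 *             barrier();
 *             offset  = pc->offset;
 *             enabled = pc->time_enabled;
 *             running = pc->time_running;
 *             barrier();
 *     } while (pc->lock != seq);
 *
 * Where the hardware and ABI allow it, a non-zero pc->index additionally
 * lets user space read the counter directly (e.g. rdpmc(pc->index - 1)
 * on x86) and add pc->offset to the result.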
3817 */ 3818 void perf_event_update_userpage(struct perf_event *event) 3819 { 3820 struct perf_event_mmap_page *userpg; 3821 struct ring_buffer *rb; 3822 u64 enabled, running, now; 3823 3824 rcu_read_lock(); 3825 rb = rcu_dereference(event->rb); 3826 if (!rb) 3827 goto unlock; 3828 3829 /* 3830 * compute total_time_enabled, total_time_running 3831 * based on snapshot values taken when the event 3832 * was last scheduled in. 3833 * 3834 * we cannot simply called update_context_time() 3835 * because of locking issue as we can be called in 3836 * NMI context 3837 */ 3838 calc_timer_values(event, &now, &enabled, &running); 3839 3840 userpg = rb->user_page; 3841 /* 3842 * Disable preemption so as to not let the corresponding user-space 3843 * spin too long if we get preempted. 3844 */ 3845 preempt_disable(); 3846 ++userpg->lock; 3847 barrier(); 3848 userpg->index = perf_event_index(event); 3849 userpg->offset = perf_event_count(event); 3850 if (userpg->index) 3851 userpg->offset -= local64_read(&event->hw.prev_count); 3852 3853 userpg->time_enabled = enabled + 3854 atomic64_read(&event->child_total_time_enabled); 3855 3856 userpg->time_running = running + 3857 atomic64_read(&event->child_total_time_running); 3858 3859 arch_perf_update_userpage(userpg, now); 3860 3861 barrier(); 3862 ++userpg->lock; 3863 preempt_enable(); 3864 unlock: 3865 rcu_read_unlock(); 3866 } 3867 3868 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 3869 { 3870 struct perf_event *event = vma->vm_file->private_data; 3871 struct ring_buffer *rb; 3872 int ret = VM_FAULT_SIGBUS; 3873 3874 if (vmf->flags & FAULT_FLAG_MKWRITE) { 3875 if (vmf->pgoff == 0) 3876 ret = 0; 3877 return ret; 3878 } 3879 3880 rcu_read_lock(); 3881 rb = rcu_dereference(event->rb); 3882 if (!rb) 3883 goto unlock; 3884 3885 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 3886 goto unlock; 3887 3888 vmf->page = perf_mmap_to_page(rb, vmf->pgoff); 3889 if (!vmf->page) 3890 goto unlock; 3891 3892 get_page(vmf->page); 3893 vmf->page->mapping = vma->vm_file->f_mapping; 3894 vmf->page->index = vmf->pgoff; 3895 3896 ret = 0; 3897 unlock: 3898 rcu_read_unlock(); 3899 3900 return ret; 3901 } 3902 3903 static void ring_buffer_attach(struct perf_event *event, 3904 struct ring_buffer *rb) 3905 { 3906 struct ring_buffer *old_rb = NULL; 3907 unsigned long flags; 3908 3909 if (event->rb) { 3910 /* 3911 * Should be impossible, we set this when removing 3912 * event->rb_entry and wait/clear when adding event->rb_entry. 3913 */ 3914 WARN_ON_ONCE(event->rcu_pending); 3915 3916 old_rb = event->rb; 3917 event->rcu_batches = get_state_synchronize_rcu(); 3918 event->rcu_pending = 1; 3919 3920 spin_lock_irqsave(&old_rb->event_lock, flags); 3921 list_del_rcu(&event->rb_entry); 3922 spin_unlock_irqrestore(&old_rb->event_lock, flags); 3923 } 3924 3925 if (event->rcu_pending && rb) { 3926 cond_synchronize_rcu(event->rcu_batches); 3927 event->rcu_pending = 0; 3928 } 3929 3930 if (rb) { 3931 spin_lock_irqsave(&rb->event_lock, flags); 3932 list_add_rcu(&event->rb_entry, &rb->event_list); 3933 spin_unlock_irqrestore(&rb->event_lock, flags); 3934 } 3935 3936 rcu_assign_pointer(event->rb, rb); 3937 3938 if (old_rb) { 3939 ring_buffer_put(old_rb); 3940 /* 3941 * Since we detached before setting the new rb, so that we 3942 * could attach the new rb, we could have missed a wakeup. 3943 * Provide it now. 
3944 */ 3945 wake_up_all(&event->waitq); 3946 } 3947 } 3948 3949 static void ring_buffer_wakeup(struct perf_event *event) 3950 { 3951 struct ring_buffer *rb; 3952 3953 rcu_read_lock(); 3954 rb = rcu_dereference(event->rb); 3955 if (rb) { 3956 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) 3957 wake_up_all(&event->waitq); 3958 } 3959 rcu_read_unlock(); 3960 } 3961 3962 static void rb_free_rcu(struct rcu_head *rcu_head) 3963 { 3964 struct ring_buffer *rb; 3965 3966 rb = container_of(rcu_head, struct ring_buffer, rcu_head); 3967 rb_free(rb); 3968 } 3969 3970 static struct ring_buffer *ring_buffer_get(struct perf_event *event) 3971 { 3972 struct ring_buffer *rb; 3973 3974 rcu_read_lock(); 3975 rb = rcu_dereference(event->rb); 3976 if (rb) { 3977 if (!atomic_inc_not_zero(&rb->refcount)) 3978 rb = NULL; 3979 } 3980 rcu_read_unlock(); 3981 3982 return rb; 3983 } 3984 3985 static void ring_buffer_put(struct ring_buffer *rb) 3986 { 3987 if (!atomic_dec_and_test(&rb->refcount)) 3988 return; 3989 3990 WARN_ON_ONCE(!list_empty(&rb->event_list)); 3991 3992 call_rcu(&rb->rcu_head, rb_free_rcu); 3993 } 3994 3995 static void perf_mmap_open(struct vm_area_struct *vma) 3996 { 3997 struct perf_event *event = vma->vm_file->private_data; 3998 3999 atomic_inc(&event->mmap_count); 4000 atomic_inc(&event->rb->mmap_count); 4001 } 4002 4003 /* 4004 * A buffer can be mmap()ed multiple times; either directly through the same 4005 * event, or through other events by use of perf_event_set_output(). 4006 * 4007 * In order to undo the VM accounting done by perf_mmap() we need to destroy 4008 * the buffer here, where we still have a VM context. This means we need 4009 * to detach all events redirecting to us. 4010 */ 4011 static void perf_mmap_close(struct vm_area_struct *vma) 4012 { 4013 struct perf_event *event = vma->vm_file->private_data; 4014 4015 struct ring_buffer *rb = ring_buffer_get(event); 4016 struct user_struct *mmap_user = rb->mmap_user; 4017 int mmap_locked = rb->mmap_locked; 4018 unsigned long size = perf_data_size(rb); 4019 4020 atomic_dec(&rb->mmap_count); 4021 4022 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) 4023 goto out_put; 4024 4025 ring_buffer_attach(event, NULL); 4026 mutex_unlock(&event->mmap_mutex); 4027 4028 /* If there's still other mmap()s of this buffer, we're done. */ 4029 if (atomic_read(&rb->mmap_count)) 4030 goto out_put; 4031 4032 /* 4033 * No other mmap()s, detach from all other events that might redirect 4034 * into the now unreachable buffer. Somewhat complicated by the 4035 * fact that rb::event_lock otherwise nests inside mmap_mutex. 4036 */ 4037 again: 4038 rcu_read_lock(); 4039 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { 4040 if (!atomic_long_inc_not_zero(&event->refcount)) { 4041 /* 4042 * This event is en-route to free_event() which will 4043 * detach it and remove it from the list. 4044 */ 4045 continue; 4046 } 4047 rcu_read_unlock(); 4048 4049 mutex_lock(&event->mmap_mutex); 4050 /* 4051 * Check we didn't race with perf_event_set_output() which can 4052 * swizzle the rb from under us while we were waiting to 4053 * acquire mmap_mutex. 4054 * 4055 * If we find a different rb; ignore this event, a next 4056 * iteration will no longer find it on the list. We have to 4057 * still restart the iteration to make sure we're not now 4058 * iterating the wrong list. 
4059 */ 4060 if (event->rb == rb) 4061 ring_buffer_attach(event, NULL); 4062 4063 mutex_unlock(&event->mmap_mutex); 4064 put_event(event); 4065 4066 /* 4067 * Restart the iteration; either we're on the wrong list or 4068 * destroyed its integrity by doing a deletion. 4069 */ 4070 goto again; 4071 } 4072 rcu_read_unlock(); 4073 4074 /* 4075 * It could be there's still a few 0-ref events on the list; they'll 4076 * get cleaned up by free_event() -- they'll also still have their 4077 * ref on the rb and will free it whenever they are done with it. 4078 * 4079 * Aside from that, this buffer is 'fully' detached and unmapped, 4080 * undo the VM accounting. 4081 */ 4082 4083 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); 4084 vma->vm_mm->pinned_vm -= mmap_locked; 4085 free_uid(mmap_user); 4086 4087 out_put: 4088 ring_buffer_put(rb); /* could be last */ 4089 } 4090 4091 static const struct vm_operations_struct perf_mmap_vmops = { 4092 .open = perf_mmap_open, 4093 .close = perf_mmap_close, 4094 .fault = perf_mmap_fault, 4095 .page_mkwrite = perf_mmap_fault, 4096 }; 4097 4098 static int perf_mmap(struct file *file, struct vm_area_struct *vma) 4099 { 4100 struct perf_event *event = file->private_data; 4101 unsigned long user_locked, user_lock_limit; 4102 struct user_struct *user = current_user(); 4103 unsigned long locked, lock_limit; 4104 struct ring_buffer *rb; 4105 unsigned long vma_size; 4106 unsigned long nr_pages; 4107 long user_extra, extra; 4108 int ret = 0, flags = 0; 4109 4110 /* 4111 * Don't allow mmap() of inherited per-task counters. This would 4112 * create a performance issue due to all children writing to the 4113 * same rb. 4114 */ 4115 if (event->cpu == -1 && event->attr.inherit) 4116 return -EINVAL; 4117 4118 if (!(vma->vm_flags & VM_SHARED)) 4119 return -EINVAL; 4120 4121 vma_size = vma->vm_end - vma->vm_start; 4122 nr_pages = (vma_size / PAGE_SIZE) - 1; 4123 4124 /* 4125 * If we have rb pages ensure they're a power-of-two number, so we 4126 * can do bitmasks instead of modulo. 4127 */ 4128 if (nr_pages != 0 && !is_power_of_2(nr_pages)) 4129 return -EINVAL; 4130 4131 if (vma_size != PAGE_SIZE * (1 + nr_pages)) 4132 return -EINVAL; 4133 4134 if (vma->vm_pgoff != 0) 4135 return -EINVAL; 4136 4137 WARN_ON_ONCE(event->ctx->parent_ctx); 4138 again: 4139 mutex_lock(&event->mmap_mutex); 4140 if (event->rb) { 4141 if (event->rb->nr_pages != nr_pages) { 4142 ret = -EINVAL; 4143 goto unlock; 4144 } 4145 4146 if (!atomic_inc_not_zero(&event->rb->mmap_count)) { 4147 /* 4148 * Raced against perf_mmap_close() through 4149 * perf_event_set_output(). Try again, hope for better 4150 * luck. 
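 *
 * (Note that every mmap() of one event must use the same size, which is
 * what the nr_pages check above enforces: user space always maps
 * 1 + 2^n pages, i.e. mmap(NULL, (1 + nr_pages) * page_size,
 * PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0), where the extra first page
 * is the control page and nr_pages must be 0 or a power of two.)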
4151 */ 4152 mutex_unlock(&event->mmap_mutex); 4153 goto again; 4154 } 4155 4156 goto unlock; 4157 } 4158 4159 user_extra = nr_pages + 1; 4160 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); 4161 4162 /* 4163 * Increase the limit linearly with more CPUs: 4164 */ 4165 user_lock_limit *= num_online_cpus(); 4166 4167 user_locked = atomic_long_read(&user->locked_vm) + user_extra; 4168 4169 extra = 0; 4170 if (user_locked > user_lock_limit) 4171 extra = user_locked - user_lock_limit; 4172 4173 lock_limit = rlimit(RLIMIT_MEMLOCK); 4174 lock_limit >>= PAGE_SHIFT; 4175 locked = vma->vm_mm->pinned_vm + extra; 4176 4177 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && 4178 !capable(CAP_IPC_LOCK)) { 4179 ret = -EPERM; 4180 goto unlock; 4181 } 4182 4183 WARN_ON(event->rb); 4184 4185 if (vma->vm_flags & VM_WRITE) 4186 flags |= RING_BUFFER_WRITABLE; 4187 4188 rb = rb_alloc(nr_pages, 4189 event->attr.watermark ? event->attr.wakeup_watermark : 0, 4190 event->cpu, flags); 4191 4192 if (!rb) { 4193 ret = -ENOMEM; 4194 goto unlock; 4195 } 4196 4197 atomic_set(&rb->mmap_count, 1); 4198 rb->mmap_locked = extra; 4199 rb->mmap_user = get_current_user(); 4200 4201 atomic_long_add(user_extra, &user->locked_vm); 4202 vma->vm_mm->pinned_vm += extra; 4203 4204 ring_buffer_attach(event, rb); 4205 4206 perf_event_init_userpage(event); 4207 perf_event_update_userpage(event); 4208 4209 unlock: 4210 if (!ret) 4211 atomic_inc(&event->mmap_count); 4212 mutex_unlock(&event->mmap_mutex); 4213 4214 /* 4215 * Since pinned accounting is per vm we cannot allow fork() to copy our 4216 * vma. 4217 */ 4218 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; 4219 vma->vm_ops = &perf_mmap_vmops; 4220 4221 return ret; 4222 } 4223 4224 static int perf_fasync(int fd, struct file *filp, int on) 4225 { 4226 struct inode *inode = file_inode(filp); 4227 struct perf_event *event = filp->private_data; 4228 int retval; 4229 4230 mutex_lock(&inode->i_mutex); 4231 retval = fasync_helper(fd, filp, on, &event->fasync); 4232 mutex_unlock(&inode->i_mutex); 4233 4234 if (retval < 0) 4235 return retval; 4236 4237 return 0; 4238 } 4239 4240 static const struct file_operations perf_fops = { 4241 .llseek = no_llseek, 4242 .release = perf_release, 4243 .read = perf_read, 4244 .poll = perf_poll, 4245 .unlocked_ioctl = perf_ioctl, 4246 .compat_ioctl = perf_compat_ioctl, 4247 .mmap = perf_mmap, 4248 .fasync = perf_fasync, 4249 }; 4250 4251 /* 4252 * Perf event wakeup 4253 * 4254 * If there's data, ensure we set the poll() state and publish everything 4255 * to user-space before waking everybody up. 4256 */ 4257 4258 void perf_event_wakeup(struct perf_event *event) 4259 { 4260 ring_buffer_wakeup(event); 4261 4262 if (event->pending_kill) { 4263 kill_fasync(&event->fasync, SIGIO, event->pending_kill); 4264 event->pending_kill = 0; 4265 } 4266 } 4267 4268 static void perf_pending_event(struct irq_work *entry) 4269 { 4270 struct perf_event *event = container_of(entry, 4271 struct perf_event, pending); 4272 4273 if (event->pending_disable) { 4274 event->pending_disable = 0; 4275 __perf_event_disable(event); 4276 } 4277 4278 if (event->pending_wakeup) { 4279 event->pending_wakeup = 0; 4280 perf_event_wakeup(event); 4281 } 4282 } 4283 4284 /* 4285 * We assume there is only KVM supporting the callbacks. 4286 * Later on, we might change it to a list if there is 4287 * another virtualization implementation supporting the callbacks. 
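 *
 * Registration is a single call; a hypervisor module would do something
 * like the following (sketch, callback names as used by KVM on x86):
 *
 *     static struct perf_guest_info_callbacks kvm_guest_cbs = {
 *             .is_in_guest    = kvm_is_in_guest,
 *             .is_user_mode   = kvm_is_user_mode,
 *             .get_guest_ip   = kvm_get_guest_ip,
 *     };
 *
 *     perf_register_guest_info_callbacks(&kvm_guest_cbs);
 *
 * and unregister again on module exit, so that PMI handlers can tell
 * whether a sample hit while a guest was running and fetch the guest IP.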
4288 */ 4289 struct perf_guest_info_callbacks *perf_guest_cbs; 4290 4291 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) 4292 { 4293 perf_guest_cbs = cbs; 4294 return 0; 4295 } 4296 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); 4297 4298 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) 4299 { 4300 perf_guest_cbs = NULL; 4301 return 0; 4302 } 4303 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); 4304 4305 static void 4306 perf_output_sample_regs(struct perf_output_handle *handle, 4307 struct pt_regs *regs, u64 mask) 4308 { 4309 int bit; 4310 4311 for_each_set_bit(bit, (const unsigned long *) &mask, 4312 sizeof(mask) * BITS_PER_BYTE) { 4313 u64 val; 4314 4315 val = perf_reg_value(regs, bit); 4316 perf_output_put(handle, val); 4317 } 4318 } 4319 4320 static void perf_sample_regs_user(struct perf_regs_user *regs_user, 4321 struct pt_regs *regs) 4322 { 4323 if (!user_mode(regs)) { 4324 if (current->mm) 4325 regs = task_pt_regs(current); 4326 else 4327 regs = NULL; 4328 } 4329 4330 if (regs) { 4331 regs_user->regs = regs; 4332 regs_user->abi = perf_reg_abi(current); 4333 } 4334 } 4335 4336 /* 4337 * Get remaining task size from user stack pointer. 4338 * 4339 * It'd be better to take the stack vma map and limit this more 4340 * precisely, but there's no way to get it safely under interrupt, 4341 * so use TASK_SIZE as the limit. 4342 */ 4343 static u64 perf_ustack_task_size(struct pt_regs *regs) 4344 { 4345 unsigned long addr = perf_user_stack_pointer(regs); 4346 4347 if (!addr || addr >= TASK_SIZE) 4348 return 0; 4349 4350 return TASK_SIZE - addr; 4351 } 4352 4353 static u16 4354 perf_sample_ustack_size(u16 stack_size, u16 header_size, 4355 struct pt_regs *regs) 4356 { 4357 u64 task_size; 4358 4359 /* No regs, no stack pointer, no dump. */ 4360 if (!regs) 4361 return 0; 4362 4363 /* 4364 * Check whether the requested stack size fits into: 4365 * - TASK_SIZE 4366 * If it doesn't, limit the size to TASK_SIZE. 4367 * 4368 * - the remaining sample size 4369 * If it doesn't, shrink the stack size so it fits 4370 * into the remaining sample size. 4371 */ 4372 4373 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs)); 4374 stack_size = min(stack_size, (u16) task_size); 4375 4376 /* Current header size plus static size and dynamic size. */ 4377 header_size += 2 * sizeof(u64); 4378 4379 /* Do we fit in with the current stack dump size? */ 4380 if ((u16) (header_size + stack_size) < header_size) { 4381 /* 4382 * If we overflow the maximum size for the sample, 4383 * we customize the stack dump size to fit in. 4384 */ 4385 stack_size = USHRT_MAX - header_size - sizeof(u64); 4386 stack_size = round_up(stack_size, sizeof(u64)); 4387 } 4388 4389 return stack_size; 4390 } 4391 4392 static void 4393 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, 4394 struct pt_regs *regs) 4395 { 4396 /* Case of a kernel thread, nothing to dump */ 4397 if (!regs) { 4398 u64 size = 0; 4399 perf_output_put(handle, size); 4400 } else { 4401 unsigned long sp; 4402 unsigned int rem; 4403 u64 dyn_size; 4404 4405 /* 4406 * We dump: 4407 * static size 4408 * - the size requested by user or the best one we can fit 4409 * in to the sample max size 4410 * data 4411 * - user stack dump data 4412 * dynamic size 4413 * - the actual dumped size 4414 */ 4415 4416 /* Static size. */ 4417 perf_output_put(handle, dump_size); 4418 4419 /* Data.
*/ 4420 sp = perf_user_stack_pointer(regs); 4421 rem = __output_copy_user(handle, (void *) sp, dump_size); 4422 dyn_size = dump_size - rem; 4423 4424 perf_output_skip(handle, rem); 4425 4426 /* Dynamic size. */ 4427 perf_output_put(handle, dyn_size); 4428 } 4429 } 4430 4431 static void __perf_event_header__init_id(struct perf_event_header *header, 4432 struct perf_sample_data *data, 4433 struct perf_event *event) 4434 { 4435 u64 sample_type = event->attr.sample_type; 4436 4437 data->type = sample_type; 4438 header->size += event->id_header_size; 4439 4440 if (sample_type & PERF_SAMPLE_TID) { 4441 /* namespace issues */ 4442 data->tid_entry.pid = perf_event_pid(event, current); 4443 data->tid_entry.tid = perf_event_tid(event, current); 4444 } 4445 4446 if (sample_type & PERF_SAMPLE_TIME) 4447 data->time = perf_clock(); 4448 4449 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) 4450 data->id = primary_event_id(event); 4451 4452 if (sample_type & PERF_SAMPLE_STREAM_ID) 4453 data->stream_id = event->id; 4454 4455 if (sample_type & PERF_SAMPLE_CPU) { 4456 data->cpu_entry.cpu = raw_smp_processor_id(); 4457 data->cpu_entry.reserved = 0; 4458 } 4459 } 4460 4461 void perf_event_header__init_id(struct perf_event_header *header, 4462 struct perf_sample_data *data, 4463 struct perf_event *event) 4464 { 4465 if (event->attr.sample_id_all) 4466 __perf_event_header__init_id(header, data, event); 4467 } 4468 4469 static void __perf_event__output_id_sample(struct perf_output_handle *handle, 4470 struct perf_sample_data *data) 4471 { 4472 u64 sample_type = data->type; 4473 4474 if (sample_type & PERF_SAMPLE_TID) 4475 perf_output_put(handle, data->tid_entry); 4476 4477 if (sample_type & PERF_SAMPLE_TIME) 4478 perf_output_put(handle, data->time); 4479 4480 if (sample_type & PERF_SAMPLE_ID) 4481 perf_output_put(handle, data->id); 4482 4483 if (sample_type & PERF_SAMPLE_STREAM_ID) 4484 perf_output_put(handle, data->stream_id); 4485 4486 if (sample_type & PERF_SAMPLE_CPU) 4487 perf_output_put(handle, data->cpu_entry); 4488 4489 if (sample_type & PERF_SAMPLE_IDENTIFIER) 4490 perf_output_put(handle, data->id); 4491 } 4492 4493 void perf_event__output_id_sample(struct perf_event *event, 4494 struct perf_output_handle *handle, 4495 struct perf_sample_data *sample) 4496 { 4497 if (event->attr.sample_id_all) 4498 __perf_event__output_id_sample(handle, sample); 4499 } 4500 4501 static void perf_output_read_one(struct perf_output_handle *handle, 4502 struct perf_event *event, 4503 u64 enabled, u64 running) 4504 { 4505 u64 read_format = event->attr.read_format; 4506 u64 values[4]; 4507 int n = 0; 4508 4509 values[n++] = perf_event_count(event); 4510 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 4511 values[n++] = enabled + 4512 atomic64_read(&event->child_total_time_enabled); 4513 } 4514 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 4515 values[n++] = running + 4516 atomic64_read(&event->child_total_time_running); 4517 } 4518 if (read_format & PERF_FORMAT_ID) 4519 values[n++] = primary_event_id(event); 4520 4521 __output_copy(handle, values, n * sizeof(u64)); 4522 } 4523 4524 /* 4525 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. 
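 *
 * For reference, a rough sketch of what perf_output_read_one() above and
 * perf_output_read_group() below emit, derived from their code rather than
 * from separate ABI documentation:
 *
 *	read_one:   value, [time_enabled], [time_running], [id]
 *	read_group: nr, [time_enabled], [time_running],
 *	            then { value, [id] } for the leader and each sibling
 *
 * where the bracketed fields are present only when the corresponding
 * PERF_FORMAT_* bit is set in attr.read_format.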
4526 */ 4527 static void perf_output_read_group(struct perf_output_handle *handle, 4528 struct perf_event *event, 4529 u64 enabled, u64 running) 4530 { 4531 struct perf_event *leader = event->group_leader, *sub; 4532 u64 read_format = event->attr.read_format; 4533 u64 values[5]; 4534 int n = 0; 4535 4536 values[n++] = 1 + leader->nr_siblings; 4537 4538 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 4539 values[n++] = enabled; 4540 4541 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 4542 values[n++] = running; 4543 4544 if (leader != event) 4545 leader->pmu->read(leader); 4546 4547 values[n++] = perf_event_count(leader); 4548 if (read_format & PERF_FORMAT_ID) 4549 values[n++] = primary_event_id(leader); 4550 4551 __output_copy(handle, values, n * sizeof(u64)); 4552 4553 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 4554 n = 0; 4555 4556 if ((sub != event) && 4557 (sub->state == PERF_EVENT_STATE_ACTIVE)) 4558 sub->pmu->read(sub); 4559 4560 values[n++] = perf_event_count(sub); 4561 if (read_format & PERF_FORMAT_ID) 4562 values[n++] = primary_event_id(sub); 4563 4564 __output_copy(handle, values, n * sizeof(u64)); 4565 } 4566 } 4567 4568 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ 4569 PERF_FORMAT_TOTAL_TIME_RUNNING) 4570 4571 static void perf_output_read(struct perf_output_handle *handle, 4572 struct perf_event *event) 4573 { 4574 u64 enabled = 0, running = 0, now; 4575 u64 read_format = event->attr.read_format; 4576 4577 /* 4578 * compute total_time_enabled, total_time_running 4579 * based on snapshot values taken when the event 4580 * was last scheduled in. 4581 * 4582 * we cannot simply call update_context_time() 4583 * because of locking issues, as we are called in 4584 * NMI context 4585 */ 4586 if (read_format & PERF_FORMAT_TOTAL_TIMES) 4587 calc_timer_values(event, &now, &enabled, &running); 4588 4589 if (event->attr.read_format & PERF_FORMAT_GROUP) 4590 perf_output_read_group(handle, event, enabled, running); 4591 else 4592 perf_output_read_one(handle, event, enabled, running); 4593 } 4594 4595 void perf_output_sample(struct perf_output_handle *handle, 4596 struct perf_event_header *header, 4597 struct perf_sample_data *data, 4598 struct perf_event *event) 4599 { 4600 u64 sample_type = data->type; 4601 4602 perf_output_put(handle, *header); 4603 4604 if (sample_type & PERF_SAMPLE_IDENTIFIER) 4605 perf_output_put(handle, data->id); 4606 4607 if (sample_type & PERF_SAMPLE_IP) 4608 perf_output_put(handle, data->ip); 4609 4610 if (sample_type & PERF_SAMPLE_TID) 4611 perf_output_put(handle, data->tid_entry); 4612 4613 if (sample_type & PERF_SAMPLE_TIME) 4614 perf_output_put(handle, data->time); 4615 4616 if (sample_type & PERF_SAMPLE_ADDR) 4617 perf_output_put(handle, data->addr); 4618 4619 if (sample_type & PERF_SAMPLE_ID) 4620 perf_output_put(handle, data->id); 4621 4622 if (sample_type & PERF_SAMPLE_STREAM_ID) 4623 perf_output_put(handle, data->stream_id); 4624 4625 if (sample_type & PERF_SAMPLE_CPU) 4626 perf_output_put(handle, data->cpu_entry); 4627 4628 if (sample_type & PERF_SAMPLE_PERIOD) 4629 perf_output_put(handle, data->period); 4630 4631 if (sample_type & PERF_SAMPLE_READ) 4632 perf_output_read(handle, event); 4633 4634 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 4635 if (data->callchain) { 4636 int size = 1; 4637 4638 if (data->callchain) 4639 size += data->callchain->nr; 4640 4641 size *= sizeof(u64); 4642 4643 __output_copy(handle, data->callchain, size); 4644 } else { 4645 u64 nr = 0; 4646 perf_output_put(handle, nr); 4647 } 4648
} 4649 4650 if (sample_type & PERF_SAMPLE_RAW) { 4651 if (data->raw) { 4652 perf_output_put(handle, data->raw->size); 4653 __output_copy(handle, data->raw->data, 4654 data->raw->size); 4655 } else { 4656 struct { 4657 u32 size; 4658 u32 data; 4659 } raw = { 4660 .size = sizeof(u32), 4661 .data = 0, 4662 }; 4663 perf_output_put(handle, raw); 4664 } 4665 } 4666 4667 if (sample_type & PERF_SAMPLE_BRANCH_STACK) { 4668 if (data->br_stack) { 4669 size_t size; 4670 4671 size = data->br_stack->nr 4672 * sizeof(struct perf_branch_entry); 4673 4674 perf_output_put(handle, data->br_stack->nr); 4675 perf_output_copy(handle, data->br_stack->entries, size); 4676 } else { 4677 /* 4678 * we always store at least the value of nr 4679 */ 4680 u64 nr = 0; 4681 perf_output_put(handle, nr); 4682 } 4683 } 4684 4685 if (sample_type & PERF_SAMPLE_REGS_USER) { 4686 u64 abi = data->regs_user.abi; 4687 4688 /* 4689 * If there are no regs to dump, notice it through 4690 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). 4691 */ 4692 perf_output_put(handle, abi); 4693 4694 if (abi) { 4695 u64 mask = event->attr.sample_regs_user; 4696 perf_output_sample_regs(handle, 4697 data->regs_user.regs, 4698 mask); 4699 } 4700 } 4701 4702 if (sample_type & PERF_SAMPLE_STACK_USER) { 4703 perf_output_sample_ustack(handle, 4704 data->stack_user_size, 4705 data->regs_user.regs); 4706 } 4707 4708 if (sample_type & PERF_SAMPLE_WEIGHT) 4709 perf_output_put(handle, data->weight); 4710 4711 if (sample_type & PERF_SAMPLE_DATA_SRC) 4712 perf_output_put(handle, data->data_src.val); 4713 4714 if (sample_type & PERF_SAMPLE_TRANSACTION) 4715 perf_output_put(handle, data->txn); 4716 4717 if (!event->attr.watermark) { 4718 int wakeup_events = event->attr.wakeup_events; 4719 4720 if (wakeup_events) { 4721 struct ring_buffer *rb = handle->rb; 4722 int events = local_inc_return(&rb->events); 4723 4724 if (events >= wakeup_events) { 4725 local_sub(wakeup_events, &rb->events); 4726 local_inc(&rb->wakeup); 4727 } 4728 } 4729 } 4730 } 4731 4732 void perf_prepare_sample(struct perf_event_header *header, 4733 struct perf_sample_data *data, 4734 struct perf_event *event, 4735 struct pt_regs *regs) 4736 { 4737 u64 sample_type = event->attr.sample_type; 4738 4739 header->type = PERF_RECORD_SAMPLE; 4740 header->size = sizeof(*header) + event->header_size; 4741 4742 header->misc = 0; 4743 header->misc |= perf_misc_flags(regs); 4744 4745 __perf_event_header__init_id(header, data, event); 4746 4747 if (sample_type & PERF_SAMPLE_IP) 4748 data->ip = perf_instruction_pointer(regs); 4749 4750 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 4751 int size = 1; 4752 4753 data->callchain = perf_callchain(event, regs); 4754 4755 if (data->callchain) 4756 size += data->callchain->nr; 4757 4758 header->size += size * sizeof(u64); 4759 } 4760 4761 if (sample_type & PERF_SAMPLE_RAW) { 4762 int size = sizeof(u32); 4763 4764 if (data->raw) 4765 size += data->raw->size; 4766 else 4767 size += sizeof(u32); 4768 4769 WARN_ON_ONCE(size & (sizeof(u64)-1)); 4770 header->size += size; 4771 } 4772 4773 if (sample_type & PERF_SAMPLE_BRANCH_STACK) { 4774 int size = sizeof(u64); /* nr */ 4775 if (data->br_stack) { 4776 size += data->br_stack->nr 4777 * sizeof(struct perf_branch_entry); 4778 } 4779 header->size += size; 4780 } 4781 4782 if (sample_type & PERF_SAMPLE_REGS_USER) { 4783 /* regs dump ABI info */ 4784 int size = sizeof(u64); 4785 4786 perf_sample_regs_user(&data->regs_user, regs); 4787 4788 if (data->regs_user.regs) { 4789 u64 mask = event->attr.sample_regs_user; 4790 size += 
hweight64(mask) * sizeof(u64); 4791 } 4792 4793 header->size += size; 4794 } 4795 4796 if (sample_type & PERF_SAMPLE_STACK_USER) { 4797 /* 4798 * Either the PERF_SAMPLE_STACK_USER bit needs to always be 4799 * processed as the last one, or an additional check must be 4800 * added whenever a new sample type is introduced, because we 4801 * could eat up the rest of the sample size. 4802 */ 4803 struct perf_regs_user *uregs = &data->regs_user; 4804 u16 stack_size = event->attr.sample_stack_user; 4805 u16 size = sizeof(u64); 4806 4807 if (!uregs->abi) 4808 perf_sample_regs_user(uregs, regs); 4809 4810 stack_size = perf_sample_ustack_size(stack_size, header->size, 4811 uregs->regs); 4812 4813 /* 4814 * If there is something to dump, add space for the dump 4815 * itself and for the field that tells the dynamic size, 4816 * which is how many have been actually dumped. 4817 */ 4818 if (stack_size) 4819 size += sizeof(u64) + stack_size; 4820 4821 data->stack_user_size = stack_size; 4822 header->size += size; 4823 } 4824 } 4825 4826 static void perf_event_output(struct perf_event *event, 4827 struct perf_sample_data *data, 4828 struct pt_regs *regs) 4829 { 4830 struct perf_output_handle handle; 4831 struct perf_event_header header; 4832 4833 /* protect the callchain buffers */ 4834 rcu_read_lock(); 4835 4836 perf_prepare_sample(&header, data, event, regs); 4837 4838 if (perf_output_begin(&handle, event, header.size)) 4839 goto exit; 4840 4841 perf_output_sample(&handle, &header, data, event); 4842 4843 perf_output_end(&handle); 4844 4845 exit: 4846 rcu_read_unlock(); 4847 } 4848 4849 /* 4850 * read event_id 4851 */ 4852 4853 struct perf_read_event { 4854 struct perf_event_header header; 4855 4856 u32 pid; 4857 u32 tid; 4858 }; 4859 4860 static void 4861 perf_event_read_event(struct perf_event *event, 4862 struct task_struct *task) 4863 { 4864 struct perf_output_handle handle; 4865 struct perf_sample_data sample; 4866 struct perf_read_event read_event = { 4867 .header = { 4868 .type = PERF_RECORD_READ, 4869 .misc = 0, 4870 .size = sizeof(read_event) + event->read_size, 4871 }, 4872 .pid = perf_event_pid(event, task), 4873 .tid = perf_event_tid(event, task), 4874 }; 4875 int ret; 4876 4877 perf_event_header__init_id(&read_event.header, &sample, event); 4878 ret = perf_output_begin(&handle, event, read_event.header.size); 4879 if (ret) 4880 return; 4881 4882 perf_output_put(&handle, read_event); 4883 perf_output_read(&handle, event); 4884 perf_event__output_id_sample(event, &handle, &sample); 4885 4886 perf_output_end(&handle); 4887 } 4888 4889 typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); 4890 4891 static void 4892 perf_event_aux_ctx(struct perf_event_context *ctx, 4893 perf_event_aux_output_cb output, 4894 void *data) 4895 { 4896 struct perf_event *event; 4897 4898 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 4899 if (event->state < PERF_EVENT_STATE_INACTIVE) 4900 continue; 4901 if (!event_filter_match(event)) 4902 continue; 4903 output(event, data); 4904 } 4905 } 4906 4907 static void 4908 perf_event_aux(perf_event_aux_output_cb output, void *data, 4909 struct perf_event_context *task_ctx) 4910 { 4911 struct perf_cpu_context *cpuctx; 4912 struct perf_event_context *ctx; 4913 struct pmu *pmu; 4914 int ctxn; 4915 4916 rcu_read_lock(); 4917 list_for_each_entry_rcu(pmu, &pmus, entry) { 4918 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4919 if (cpuctx->unique_pmu != pmu) 4920 goto next; 4921 perf_event_aux_ctx(&cpuctx->ctx, output, data); 4922 if (task_ctx) 4923 goto
next; 4924 ctxn = pmu->task_ctx_nr; 4925 if (ctxn < 0) 4926 goto next; 4927 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); 4928 if (ctx) 4929 perf_event_aux_ctx(ctx, output, data); 4930 next: 4931 put_cpu_ptr(pmu->pmu_cpu_context); 4932 } 4933 4934 if (task_ctx) { 4935 preempt_disable(); 4936 perf_event_aux_ctx(task_ctx, output, data); 4937 preempt_enable(); 4938 } 4939 rcu_read_unlock(); 4940 } 4941 4942 /* 4943 * task tracking -- fork/exit 4944 * 4945 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task 4946 */ 4947 4948 struct perf_task_event { 4949 struct task_struct *task; 4950 struct perf_event_context *task_ctx; 4951 4952 struct { 4953 struct perf_event_header header; 4954 4955 u32 pid; 4956 u32 ppid; 4957 u32 tid; 4958 u32 ptid; 4959 u64 time; 4960 } event_id; 4961 }; 4962 4963 static int perf_event_task_match(struct perf_event *event) 4964 { 4965 return event->attr.comm || event->attr.mmap || 4966 event->attr.mmap2 || event->attr.mmap_data || 4967 event->attr.task; 4968 } 4969 4970 static void perf_event_task_output(struct perf_event *event, 4971 void *data) 4972 { 4973 struct perf_task_event *task_event = data; 4974 struct perf_output_handle handle; 4975 struct perf_sample_data sample; 4976 struct task_struct *task = task_event->task; 4977 int ret, size = task_event->event_id.header.size; 4978 4979 if (!perf_event_task_match(event)) 4980 return; 4981 4982 perf_event_header__init_id(&task_event->event_id.header, &sample, event); 4983 4984 ret = perf_output_begin(&handle, event, 4985 task_event->event_id.header.size); 4986 if (ret) 4987 goto out; 4988 4989 task_event->event_id.pid = perf_event_pid(event, task); 4990 task_event->event_id.ppid = perf_event_pid(event, current); 4991 4992 task_event->event_id.tid = perf_event_tid(event, task); 4993 task_event->event_id.ptid = perf_event_tid(event, current); 4994 4995 perf_output_put(&handle, task_event->event_id); 4996 4997 perf_event__output_id_sample(event, &handle, &sample); 4998 4999 perf_output_end(&handle); 5000 out: 5001 task_event->event_id.header.size = size; 5002 } 5003 5004 static void perf_event_task(struct task_struct *task, 5005 struct perf_event_context *task_ctx, 5006 int new) 5007 { 5008 struct perf_task_event task_event; 5009 5010 if (!atomic_read(&nr_comm_events) && 5011 !atomic_read(&nr_mmap_events) && 5012 !atomic_read(&nr_task_events)) 5013 return; 5014 5015 task_event = (struct perf_task_event){ 5016 .task = task, 5017 .task_ctx = task_ctx, 5018 .event_id = { 5019 .header = { 5020 .type = new ? 
PERF_RECORD_FORK : PERF_RECORD_EXIT, 5021 .misc = 0, 5022 .size = sizeof(task_event.event_id), 5023 }, 5024 /* .pid */ 5025 /* .ppid */ 5026 /* .tid */ 5027 /* .ptid */ 5028 .time = perf_clock(), 5029 }, 5030 }; 5031 5032 perf_event_aux(perf_event_task_output, 5033 &task_event, 5034 task_ctx); 5035 } 5036 5037 void perf_event_fork(struct task_struct *task) 5038 { 5039 perf_event_task(task, NULL, 1); 5040 } 5041 5042 /* 5043 * comm tracking 5044 */ 5045 5046 struct perf_comm_event { 5047 struct task_struct *task; 5048 char *comm; 5049 int comm_size; 5050 5051 struct { 5052 struct perf_event_header header; 5053 5054 u32 pid; 5055 u32 tid; 5056 } event_id; 5057 }; 5058 5059 static int perf_event_comm_match(struct perf_event *event) 5060 { 5061 return event->attr.comm; 5062 } 5063 5064 static void perf_event_comm_output(struct perf_event *event, 5065 void *data) 5066 { 5067 struct perf_comm_event *comm_event = data; 5068 struct perf_output_handle handle; 5069 struct perf_sample_data sample; 5070 int size = comm_event->event_id.header.size; 5071 int ret; 5072 5073 if (!perf_event_comm_match(event)) 5074 return; 5075 5076 perf_event_header__init_id(&comm_event->event_id.header, &sample, event); 5077 ret = perf_output_begin(&handle, event, 5078 comm_event->event_id.header.size); 5079 5080 if (ret) 5081 goto out; 5082 5083 comm_event->event_id.pid = perf_event_pid(event, comm_event->task); 5084 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 5085 5086 perf_output_put(&handle, comm_event->event_id); 5087 __output_copy(&handle, comm_event->comm, 5088 comm_event->comm_size); 5089 5090 perf_event__output_id_sample(event, &handle, &sample); 5091 5092 perf_output_end(&handle); 5093 out: 5094 comm_event->event_id.header.size = size; 5095 } 5096 5097 static void perf_event_comm_event(struct perf_comm_event *comm_event) 5098 { 5099 char comm[TASK_COMM_LEN]; 5100 unsigned int size; 5101 5102 memset(comm, 0, sizeof(comm)); 5103 strlcpy(comm, comm_event->task->comm, sizeof(comm)); 5104 size = ALIGN(strlen(comm)+1, sizeof(u64)); 5105 5106 comm_event->comm = comm; 5107 comm_event->comm_size = size; 5108 5109 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 5110 5111 perf_event_aux(perf_event_comm_output, 5112 comm_event, 5113 NULL); 5114 } 5115 5116 void perf_event_comm(struct task_struct *task, bool exec) 5117 { 5118 struct perf_comm_event comm_event; 5119 5120 if (!atomic_read(&nr_comm_events)) 5121 return; 5122 5123 comm_event = (struct perf_comm_event){ 5124 .task = task, 5125 /* .comm */ 5126 /* .comm_size */ 5127 .event_id = { 5128 .header = { 5129 .type = PERF_RECORD_COMM, 5130 .misc = exec ? 
PERF_RECORD_MISC_COMM_EXEC : 0, 5131 /* .size */ 5132 }, 5133 /* .pid */ 5134 /* .tid */ 5135 }, 5136 }; 5137 5138 perf_event_comm_event(&comm_event); 5139 } 5140 5141 /* 5142 * mmap tracking 5143 */ 5144 5145 struct perf_mmap_event { 5146 struct vm_area_struct *vma; 5147 5148 const char *file_name; 5149 int file_size; 5150 int maj, min; 5151 u64 ino; 5152 u64 ino_generation; 5153 u32 prot, flags; 5154 5155 struct { 5156 struct perf_event_header header; 5157 5158 u32 pid; 5159 u32 tid; 5160 u64 start; 5161 u64 len; 5162 u64 pgoff; 5163 } event_id; 5164 }; 5165 5166 static int perf_event_mmap_match(struct perf_event *event, 5167 void *data) 5168 { 5169 struct perf_mmap_event *mmap_event = data; 5170 struct vm_area_struct *vma = mmap_event->vma; 5171 int executable = vma->vm_flags & VM_EXEC; 5172 5173 return (!executable && event->attr.mmap_data) || 5174 (executable && (event->attr.mmap || event->attr.mmap2)); 5175 } 5176 5177 static void perf_event_mmap_output(struct perf_event *event, 5178 void *data) 5179 { 5180 struct perf_mmap_event *mmap_event = data; 5181 struct perf_output_handle handle; 5182 struct perf_sample_data sample; 5183 int size = mmap_event->event_id.header.size; 5184 int ret; 5185 5186 if (!perf_event_mmap_match(event, data)) 5187 return; 5188 5189 if (event->attr.mmap2) { 5190 mmap_event->event_id.header.type = PERF_RECORD_MMAP2; 5191 mmap_event->event_id.header.size += sizeof(mmap_event->maj); 5192 mmap_event->event_id.header.size += sizeof(mmap_event->min); 5193 mmap_event->event_id.header.size += sizeof(mmap_event->ino); 5194 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); 5195 mmap_event->event_id.header.size += sizeof(mmap_event->prot); 5196 mmap_event->event_id.header.size += sizeof(mmap_event->flags); 5197 } 5198 5199 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 5200 ret = perf_output_begin(&handle, event, 5201 mmap_event->event_id.header.size); 5202 if (ret) 5203 goto out; 5204 5205 mmap_event->event_id.pid = perf_event_pid(event, current); 5206 mmap_event->event_id.tid = perf_event_tid(event, current); 5207 5208 perf_output_put(&handle, mmap_event->event_id); 5209 5210 if (event->attr.mmap2) { 5211 perf_output_put(&handle, mmap_event->maj); 5212 perf_output_put(&handle, mmap_event->min); 5213 perf_output_put(&handle, mmap_event->ino); 5214 perf_output_put(&handle, mmap_event->ino_generation); 5215 perf_output_put(&handle, mmap_event->prot); 5216 perf_output_put(&handle, mmap_event->flags); 5217 } 5218 5219 __output_copy(&handle, mmap_event->file_name, 5220 mmap_event->file_size); 5221 5222 perf_event__output_id_sample(event, &handle, &sample); 5223 5224 perf_output_end(&handle); 5225 out: 5226 mmap_event->event_id.header.size = size; 5227 } 5228 5229 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 5230 { 5231 struct vm_area_struct *vma = mmap_event->vma; 5232 struct file *file = vma->vm_file; 5233 int maj = 0, min = 0; 5234 u64 ino = 0, gen = 0; 5235 u32 prot = 0, flags = 0; 5236 unsigned int size; 5237 char tmp[16]; 5238 char *buf = NULL; 5239 char *name; 5240 5241 if (file) { 5242 struct inode *inode; 5243 dev_t dev; 5244 5245 buf = kmalloc(PATH_MAX, GFP_KERNEL); 5246 if (!buf) { 5247 name = "//enomem"; 5248 goto cpy_name; 5249 } 5250 /* 5251 * d_path() works from the end of the rb backwards, so we 5252 * need to add enough zero bytes after the string to handle 5253 * the 64bit alignment we do later. 
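 *
 * That is also why the d_path() call below is given
 * PATH_MAX - sizeof(u64) rather than PATH_MAX: it leaves one u64 worth
 * of spare room at the end of @buf for the padding added after got_name.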
5254 */ 5255 name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64)); 5256 if (IS_ERR(name)) { 5257 name = "//toolong"; 5258 goto cpy_name; 5259 } 5260 inode = file_inode(vma->vm_file); 5261 dev = inode->i_sb->s_dev; 5262 ino = inode->i_ino; 5263 gen = inode->i_generation; 5264 maj = MAJOR(dev); 5265 min = MINOR(dev); 5266 5267 if (vma->vm_flags & VM_READ) 5268 prot |= PROT_READ; 5269 if (vma->vm_flags & VM_WRITE) 5270 prot |= PROT_WRITE; 5271 if (vma->vm_flags & VM_EXEC) 5272 prot |= PROT_EXEC; 5273 5274 if (vma->vm_flags & VM_MAYSHARE) 5275 flags = MAP_SHARED; 5276 else 5277 flags = MAP_PRIVATE; 5278 5279 if (vma->vm_flags & VM_DENYWRITE) 5280 flags |= MAP_DENYWRITE; 5281 if (vma->vm_flags & VM_MAYEXEC) 5282 flags |= MAP_EXECUTABLE; 5283 if (vma->vm_flags & VM_LOCKED) 5284 flags |= MAP_LOCKED; 5285 if (vma->vm_flags & VM_HUGETLB) 5286 flags |= MAP_HUGETLB; 5287 5288 goto got_name; 5289 } else { 5290 if (vma->vm_ops && vma->vm_ops->name) { 5291 name = (char *) vma->vm_ops->name(vma); 5292 if (name) 5293 goto cpy_name; 5294 } 5295 5296 name = (char *)arch_vma_name(vma); 5297 if (name) 5298 goto cpy_name; 5299 5300 if (vma->vm_start <= vma->vm_mm->start_brk && 5301 vma->vm_end >= vma->vm_mm->brk) { 5302 name = "[heap]"; 5303 goto cpy_name; 5304 } 5305 if (vma->vm_start <= vma->vm_mm->start_stack && 5306 vma->vm_end >= vma->vm_mm->start_stack) { 5307 name = "[stack]"; 5308 goto cpy_name; 5309 } 5310 5311 name = "//anon"; 5312 goto cpy_name; 5313 } 5314 5315 cpy_name: 5316 strlcpy(tmp, name, sizeof(tmp)); 5317 name = tmp; 5318 got_name: 5319 /* 5320 * Since our buffer works in 8 byte units we need to align our string 5321 * size to a multiple of 8. However, we must guarantee the tail end is 5322 * zero'd out to avoid leaking random bits to userspace. 
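 *
 * Example: for name = "/lib/libc.so", strlen(name)+1 is 13, so the loop
 * below appends three more '\0' bytes and file_size becomes 16.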
5323 */ 5324 size = strlen(name)+1; 5325 while (!IS_ALIGNED(size, sizeof(u64))) 5326 name[size++] = '\0'; 5327 5328 mmap_event->file_name = name; 5329 mmap_event->file_size = size; 5330 mmap_event->maj = maj; 5331 mmap_event->min = min; 5332 mmap_event->ino = ino; 5333 mmap_event->ino_generation = gen; 5334 mmap_event->prot = prot; 5335 mmap_event->flags = flags; 5336 5337 if (!(vma->vm_flags & VM_EXEC)) 5338 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; 5339 5340 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 5341 5342 perf_event_aux(perf_event_mmap_output, 5343 mmap_event, 5344 NULL); 5345 5346 kfree(buf); 5347 } 5348 5349 void perf_event_mmap(struct vm_area_struct *vma) 5350 { 5351 struct perf_mmap_event mmap_event; 5352 5353 if (!atomic_read(&nr_mmap_events)) 5354 return; 5355 5356 mmap_event = (struct perf_mmap_event){ 5357 .vma = vma, 5358 /* .file_name */ 5359 /* .file_size */ 5360 .event_id = { 5361 .header = { 5362 .type = PERF_RECORD_MMAP, 5363 .misc = PERF_RECORD_MISC_USER, 5364 /* .size */ 5365 }, 5366 /* .pid */ 5367 /* .tid */ 5368 .start = vma->vm_start, 5369 .len = vma->vm_end - vma->vm_start, 5370 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, 5371 }, 5372 /* .maj (attr_mmap2 only) */ 5373 /* .min (attr_mmap2 only) */ 5374 /* .ino (attr_mmap2 only) */ 5375 /* .ino_generation (attr_mmap2 only) */ 5376 /* .prot (attr_mmap2 only) */ 5377 /* .flags (attr_mmap2 only) */ 5378 }; 5379 5380 perf_event_mmap_event(&mmap_event); 5381 } 5382 5383 /* 5384 * IRQ throttle logging 5385 */ 5386 5387 static void perf_log_throttle(struct perf_event *event, int enable) 5388 { 5389 struct perf_output_handle handle; 5390 struct perf_sample_data sample; 5391 int ret; 5392 5393 struct { 5394 struct perf_event_header header; 5395 u64 time; 5396 u64 id; 5397 u64 stream_id; 5398 } throttle_event = { 5399 .header = { 5400 .type = PERF_RECORD_THROTTLE, 5401 .misc = 0, 5402 .size = sizeof(throttle_event), 5403 }, 5404 .time = perf_clock(), 5405 .id = primary_event_id(event), 5406 .stream_id = event->id, 5407 }; 5408 5409 if (enable) 5410 throttle_event.header.type = PERF_RECORD_UNTHROTTLE; 5411 5412 perf_event_header__init_id(&throttle_event.header, &sample, event); 5413 5414 ret = perf_output_begin(&handle, event, 5415 throttle_event.header.size); 5416 if (ret) 5417 return; 5418 5419 perf_output_put(&handle, throttle_event); 5420 perf_event__output_id_sample(event, &handle, &sample); 5421 perf_output_end(&handle); 5422 } 5423 5424 /* 5425 * Generic event overflow handling, sampling. 5426 */ 5427 5428 static int __perf_event_overflow(struct perf_event *event, 5429 int throttle, struct perf_sample_data *data, 5430 struct pt_regs *regs) 5431 { 5432 int events = atomic_read(&event->event_limit); 5433 struct hw_perf_event *hwc = &event->hw; 5434 u64 seq; 5435 int ret = 0; 5436 5437 /* 5438 * Non-sampling counters might still use the PMI to fold short 5439 * hardware counters, ignore those. 
5440 */ 5441 if (unlikely(!is_sampling_event(event))) 5442 return 0; 5443 5444 seq = __this_cpu_read(perf_throttled_seq); 5445 if (seq != hwc->interrupts_seq) { 5446 hwc->interrupts_seq = seq; 5447 hwc->interrupts = 1; 5448 } else { 5449 hwc->interrupts++; 5450 if (unlikely(throttle 5451 && hwc->interrupts >= max_samples_per_tick)) { 5452 __this_cpu_inc(perf_throttled_count); 5453 hwc->interrupts = MAX_INTERRUPTS; 5454 perf_log_throttle(event, 0); 5455 tick_nohz_full_kick(); 5456 ret = 1; 5457 } 5458 } 5459 5460 if (event->attr.freq) { 5461 u64 now = perf_clock(); 5462 s64 delta = now - hwc->freq_time_stamp; 5463 5464 hwc->freq_time_stamp = now; 5465 5466 if (delta > 0 && delta < 2*TICK_NSEC) 5467 perf_adjust_period(event, delta, hwc->last_period, true); 5468 } 5469 5470 /* 5471 * XXX event_limit might not quite work as expected on inherited 5472 * events 5473 */ 5474 5475 event->pending_kill = POLL_IN; 5476 if (events && atomic_dec_and_test(&event->event_limit)) { 5477 ret = 1; 5478 event->pending_kill = POLL_HUP; 5479 event->pending_disable = 1; 5480 irq_work_queue(&event->pending); 5481 } 5482 5483 if (event->overflow_handler) 5484 event->overflow_handler(event, data, regs); 5485 else 5486 perf_event_output(event, data, regs); 5487 5488 if (event->fasync && event->pending_kill) { 5489 event->pending_wakeup = 1; 5490 irq_work_queue(&event->pending); 5491 } 5492 5493 return ret; 5494 } 5495 5496 int perf_event_overflow(struct perf_event *event, 5497 struct perf_sample_data *data, 5498 struct pt_regs *regs) 5499 { 5500 return __perf_event_overflow(event, 1, data, regs); 5501 } 5502 5503 /* 5504 * Generic software event infrastructure 5505 */ 5506 5507 struct swevent_htable { 5508 struct swevent_hlist *swevent_hlist; 5509 struct mutex hlist_mutex; 5510 int hlist_refcount; 5511 5512 /* Recursion avoidance in each contexts */ 5513 int recursion[PERF_NR_CONTEXTS]; 5514 5515 /* Keeps track of cpu being initialized/exited */ 5516 bool online; 5517 }; 5518 5519 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); 5520 5521 /* 5522 * We directly increment event->count and keep a second value in 5523 * event->hw.period_left to count intervals. This period event 5524 * is kept in the range [-sample_period, 0] so that we can use the 5525 * sign as trigger. 5526 */ 5527 5528 u64 perf_swevent_set_period(struct perf_event *event) 5529 { 5530 struct hw_perf_event *hwc = &event->hw; 5531 u64 period = hwc->last_period; 5532 u64 nr, offset; 5533 s64 old, val; 5534 5535 hwc->last_period = hwc->sample_period; 5536 5537 again: 5538 old = val = local64_read(&hwc->period_left); 5539 if (val < 0) 5540 return 0; 5541 5542 nr = div64_u64(period + val, period); 5543 offset = nr * period; 5544 val -= offset; 5545 if (local64_cmpxchg(&hwc->period_left, old, val) != old) 5546 goto again; 5547 5548 return nr; 5549 } 5550 5551 static void perf_swevent_overflow(struct perf_event *event, u64 overflow, 5552 struct perf_sample_data *data, 5553 struct pt_regs *regs) 5554 { 5555 struct hw_perf_event *hwc = &event->hw; 5556 int throttle = 0; 5557 5558 if (!overflow) 5559 overflow = perf_swevent_set_period(event); 5560 5561 if (hwc->interrupts == MAX_INTERRUPTS) 5562 return; 5563 5564 for (; overflow; overflow--) { 5565 if (__perf_event_overflow(event, throttle, 5566 data, regs)) { 5567 /* 5568 * We inhibit the overflow from happening when 5569 * hwc->interrupts == MAX_INTERRUPTS. 
5570 */ 5571 break; 5572 } 5573 throttle = 1; 5574 } 5575 } 5576 5577 static void perf_swevent_event(struct perf_event *event, u64 nr, 5578 struct perf_sample_data *data, 5579 struct pt_regs *regs) 5580 { 5581 struct hw_perf_event *hwc = &event->hw; 5582 5583 local64_add(nr, &event->count); 5584 5585 if (!regs) 5586 return; 5587 5588 if (!is_sampling_event(event)) 5589 return; 5590 5591 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { 5592 data->period = nr; 5593 return perf_swevent_overflow(event, 1, data, regs); 5594 } else 5595 data->period = event->hw.last_period; 5596 5597 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 5598 return perf_swevent_overflow(event, 1, data, regs); 5599 5600 if (local64_add_negative(nr, &hwc->period_left)) 5601 return; 5602 5603 perf_swevent_overflow(event, 0, data, regs); 5604 } 5605 5606 static int perf_exclude_event(struct perf_event *event, 5607 struct pt_regs *regs) 5608 { 5609 if (event->hw.state & PERF_HES_STOPPED) 5610 return 1; 5611 5612 if (regs) { 5613 if (event->attr.exclude_user && user_mode(regs)) 5614 return 1; 5615 5616 if (event->attr.exclude_kernel && !user_mode(regs)) 5617 return 1; 5618 } 5619 5620 return 0; 5621 } 5622 5623 static int perf_swevent_match(struct perf_event *event, 5624 enum perf_type_id type, 5625 u32 event_id, 5626 struct perf_sample_data *data, 5627 struct pt_regs *regs) 5628 { 5629 if (event->attr.type != type) 5630 return 0; 5631 5632 if (event->attr.config != event_id) 5633 return 0; 5634 5635 if (perf_exclude_event(event, regs)) 5636 return 0; 5637 5638 return 1; 5639 } 5640 5641 static inline u64 swevent_hash(u64 type, u32 event_id) 5642 { 5643 u64 val = event_id | (type << 32); 5644 5645 return hash_64(val, SWEVENT_HLIST_BITS); 5646 } 5647 5648 static inline struct hlist_head * 5649 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) 5650 { 5651 u64 hash = swevent_hash(type, event_id); 5652 5653 return &hlist->heads[hash]; 5654 } 5655 5656 /* For the read side: events when they trigger */ 5657 static inline struct hlist_head * 5658 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) 5659 { 5660 struct swevent_hlist *hlist; 5661 5662 hlist = rcu_dereference(swhash->swevent_hlist); 5663 if (!hlist) 5664 return NULL; 5665 5666 return __find_swevent_head(hlist, type, event_id); 5667 } 5668 5669 /* For the event head insertion and removal in the hlist */ 5670 static inline struct hlist_head * 5671 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) 5672 { 5673 struct swevent_hlist *hlist; 5674 u32 event_id = event->attr.config; 5675 u64 type = event->attr.type; 5676 5677 /* 5678 * Event scheduling is always serialized against hlist allocation 5679 * and release. Which makes the protected version suitable here. 5680 * The context lock guarantees that. 
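 *
 * rcu_dereference_protected() below documents exactly that: we rely on
 * holding ctx->lock rather than rcu_read_lock() for this lookup.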
5681 */ 5682 hlist = rcu_dereference_protected(swhash->swevent_hlist, 5683 lockdep_is_held(&event->ctx->lock)); 5684 if (!hlist) 5685 return NULL; 5686 5687 return __find_swevent_head(hlist, type, event_id); 5688 } 5689 5690 static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 5691 u64 nr, 5692 struct perf_sample_data *data, 5693 struct pt_regs *regs) 5694 { 5695 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5696 struct perf_event *event; 5697 struct hlist_head *head; 5698 5699 rcu_read_lock(); 5700 head = find_swevent_head_rcu(swhash, type, event_id); 5701 if (!head) 5702 goto end; 5703 5704 hlist_for_each_entry_rcu(event, head, hlist_entry) { 5705 if (perf_swevent_match(event, type, event_id, data, regs)) 5706 perf_swevent_event(event, nr, data, regs); 5707 } 5708 end: 5709 rcu_read_unlock(); 5710 } 5711 5712 int perf_swevent_get_recursion_context(void) 5713 { 5714 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5715 5716 return get_recursion_context(swhash->recursion); 5717 } 5718 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 5719 5720 inline void perf_swevent_put_recursion_context(int rctx) 5721 { 5722 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5723 5724 put_recursion_context(swhash->recursion, rctx); 5725 } 5726 5727 void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) 5728 { 5729 struct perf_sample_data data; 5730 int rctx; 5731 5732 preempt_disable_notrace(); 5733 rctx = perf_swevent_get_recursion_context(); 5734 if (rctx < 0) 5735 return; 5736 5737 perf_sample_data_init(&data, addr, 0); 5738 5739 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); 5740 5741 perf_swevent_put_recursion_context(rctx); 5742 preempt_enable_notrace(); 5743 } 5744 5745 static void perf_swevent_read(struct perf_event *event) 5746 { 5747 } 5748 5749 static int perf_swevent_add(struct perf_event *event, int flags) 5750 { 5751 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5752 struct hw_perf_event *hwc = &event->hw; 5753 struct hlist_head *head; 5754 5755 if (is_sampling_event(event)) { 5756 hwc->last_period = hwc->sample_period; 5757 perf_swevent_set_period(event); 5758 } 5759 5760 hwc->state = !(flags & PERF_EF_START); 5761 5762 head = find_swevent_head(swhash, event); 5763 if (!head) { 5764 /* 5765 * We can race with cpu hotplug code. Do not 5766 * WARN if the cpu just got unplugged. 
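 *
 * In other words, only warn while swhash->online says the CPU is still
 * there; a missing hlist on an online CPU would be a real bug.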
5767 */ 5768 WARN_ON_ONCE(swhash->online); 5769 return -EINVAL; 5770 } 5771 5772 hlist_add_head_rcu(&event->hlist_entry, head); 5773 5774 return 0; 5775 } 5776 5777 static void perf_swevent_del(struct perf_event *event, int flags) 5778 { 5779 hlist_del_rcu(&event->hlist_entry); 5780 } 5781 5782 static void perf_swevent_start(struct perf_event *event, int flags) 5783 { 5784 event->hw.state = 0; 5785 } 5786 5787 static void perf_swevent_stop(struct perf_event *event, int flags) 5788 { 5789 event->hw.state = PERF_HES_STOPPED; 5790 } 5791 5792 /* Deref the hlist from the update side */ 5793 static inline struct swevent_hlist * 5794 swevent_hlist_deref(struct swevent_htable *swhash) 5795 { 5796 return rcu_dereference_protected(swhash->swevent_hlist, 5797 lockdep_is_held(&swhash->hlist_mutex)); 5798 } 5799 5800 static void swevent_hlist_release(struct swevent_htable *swhash) 5801 { 5802 struct swevent_hlist *hlist = swevent_hlist_deref(swhash); 5803 5804 if (!hlist) 5805 return; 5806 5807 rcu_assign_pointer(swhash->swevent_hlist, NULL); 5808 kfree_rcu(hlist, rcu_head); 5809 } 5810 5811 static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) 5812 { 5813 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 5814 5815 mutex_lock(&swhash->hlist_mutex); 5816 5817 if (!--swhash->hlist_refcount) 5818 swevent_hlist_release(swhash); 5819 5820 mutex_unlock(&swhash->hlist_mutex); 5821 } 5822 5823 static void swevent_hlist_put(struct perf_event *event) 5824 { 5825 int cpu; 5826 5827 for_each_possible_cpu(cpu) 5828 swevent_hlist_put_cpu(event, cpu); 5829 } 5830 5831 static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) 5832 { 5833 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 5834 int err = 0; 5835 5836 mutex_lock(&swhash->hlist_mutex); 5837 5838 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { 5839 struct swevent_hlist *hlist; 5840 5841 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 5842 if (!hlist) { 5843 err = -ENOMEM; 5844 goto exit; 5845 } 5846 rcu_assign_pointer(swhash->swevent_hlist, hlist); 5847 } 5848 swhash->hlist_refcount++; 5849 exit: 5850 mutex_unlock(&swhash->hlist_mutex); 5851 5852 return err; 5853 } 5854 5855 static int swevent_hlist_get(struct perf_event *event) 5856 { 5857 int err; 5858 int cpu, failed_cpu; 5859 5860 get_online_cpus(); 5861 for_each_possible_cpu(cpu) { 5862 err = swevent_hlist_get_cpu(event, cpu); 5863 if (err) { 5864 failed_cpu = cpu; 5865 goto fail; 5866 } 5867 } 5868 put_online_cpus(); 5869 5870 return 0; 5871 fail: 5872 for_each_possible_cpu(cpu) { 5873 if (cpu == failed_cpu) 5874 break; 5875 swevent_hlist_put_cpu(event, cpu); 5876 } 5877 5878 put_online_cpus(); 5879 return err; 5880 } 5881 5882 struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; 5883 5884 static void sw_perf_event_destroy(struct perf_event *event) 5885 { 5886 u64 event_id = event->attr.config; 5887 5888 WARN_ON(event->parent); 5889 5890 static_key_slow_dec(&perf_swevent_enabled[event_id]); 5891 swevent_hlist_put(event); 5892 } 5893 5894 static int perf_swevent_init(struct perf_event *event) 5895 { 5896 u64 event_id = event->attr.config; 5897 5898 if (event->attr.type != PERF_TYPE_SOFTWARE) 5899 return -ENOENT; 5900 5901 /* 5902 * no branch sampling for software events 5903 */ 5904 if (has_branch_stack(event)) 5905 return -EOPNOTSUPP; 5906 5907 switch (event_id) { 5908 case PERF_COUNT_SW_CPU_CLOCK: 5909 case PERF_COUNT_SW_TASK_CLOCK: 5910 return -ENOENT; 5911 5912 default: 5913 break; 5914 } 5915 5916 if (event_id >= PERF_COUNT_SW_MAX) 5917 
return -ENOENT; 5918 5919 if (!event->parent) { 5920 int err; 5921 5922 err = swevent_hlist_get(event); 5923 if (err) 5924 return err; 5925 5926 static_key_slow_inc(&perf_swevent_enabled[event_id]); 5927 event->destroy = sw_perf_event_destroy; 5928 } 5929 5930 return 0; 5931 } 5932 5933 static int perf_swevent_event_idx(struct perf_event *event) 5934 { 5935 return 0; 5936 } 5937 5938 static struct pmu perf_swevent = { 5939 .task_ctx_nr = perf_sw_context, 5940 5941 .event_init = perf_swevent_init, 5942 .add = perf_swevent_add, 5943 .del = perf_swevent_del, 5944 .start = perf_swevent_start, 5945 .stop = perf_swevent_stop, 5946 .read = perf_swevent_read, 5947 5948 .event_idx = perf_swevent_event_idx, 5949 }; 5950 5951 #ifdef CONFIG_EVENT_TRACING 5952 5953 static int perf_tp_filter_match(struct perf_event *event, 5954 struct perf_sample_data *data) 5955 { 5956 void *record = data->raw->data; 5957 5958 if (likely(!event->filter) || filter_match_preds(event->filter, record)) 5959 return 1; 5960 return 0; 5961 } 5962 5963 static int perf_tp_event_match(struct perf_event *event, 5964 struct perf_sample_data *data, 5965 struct pt_regs *regs) 5966 { 5967 if (event->hw.state & PERF_HES_STOPPED) 5968 return 0; 5969 /* 5970 * All tracepoints are from kernel-space. 5971 */ 5972 if (event->attr.exclude_kernel) 5973 return 0; 5974 5975 if (!perf_tp_filter_match(event, data)) 5976 return 0; 5977 5978 return 1; 5979 } 5980 5981 void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, 5982 struct pt_regs *regs, struct hlist_head *head, int rctx, 5983 struct task_struct *task) 5984 { 5985 struct perf_sample_data data; 5986 struct perf_event *event; 5987 5988 struct perf_raw_record raw = { 5989 .size = entry_size, 5990 .data = record, 5991 }; 5992 5993 perf_sample_data_init(&data, addr, 0); 5994 data.raw = &raw; 5995 5996 hlist_for_each_entry_rcu(event, head, hlist_entry) { 5997 if (perf_tp_event_match(event, &data, regs)) 5998 perf_swevent_event(event, count, &data, regs); 5999 } 6000 6001 /* 6002 * If we got specified a target task, also iterate its context and 6003 * deliver this event there too. 
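 *
 * (That is, events sitting in @task's software context rather than in
 * the per-cpu hlist we just walked above.)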
6004 */ 6005 if (task && task != current) { 6006 struct perf_event_context *ctx; 6007 struct trace_entry *entry = record; 6008 6009 rcu_read_lock(); 6010 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); 6011 if (!ctx) 6012 goto unlock; 6013 6014 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 6015 if (event->attr.type != PERF_TYPE_TRACEPOINT) 6016 continue; 6017 if (event->attr.config != entry->type) 6018 continue; 6019 if (perf_tp_event_match(event, &data, regs)) 6020 perf_swevent_event(event, count, &data, regs); 6021 } 6022 unlock: 6023 rcu_read_unlock(); 6024 } 6025 6026 perf_swevent_put_recursion_context(rctx); 6027 } 6028 EXPORT_SYMBOL_GPL(perf_tp_event); 6029 6030 static void tp_perf_event_destroy(struct perf_event *event) 6031 { 6032 perf_trace_destroy(event); 6033 } 6034 6035 static int perf_tp_event_init(struct perf_event *event) 6036 { 6037 int err; 6038 6039 if (event->attr.type != PERF_TYPE_TRACEPOINT) 6040 return -ENOENT; 6041 6042 /* 6043 * no branch sampling for tracepoint events 6044 */ 6045 if (has_branch_stack(event)) 6046 return -EOPNOTSUPP; 6047 6048 err = perf_trace_init(event); 6049 if (err) 6050 return err; 6051 6052 event->destroy = tp_perf_event_destroy; 6053 6054 return 0; 6055 } 6056 6057 static struct pmu perf_tracepoint = { 6058 .task_ctx_nr = perf_sw_context, 6059 6060 .event_init = perf_tp_event_init, 6061 .add = perf_trace_add, 6062 .del = perf_trace_del, 6063 .start = perf_swevent_start, 6064 .stop = perf_swevent_stop, 6065 .read = perf_swevent_read, 6066 6067 .event_idx = perf_swevent_event_idx, 6068 }; 6069 6070 static inline void perf_tp_register(void) 6071 { 6072 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); 6073 } 6074 6075 static int perf_event_set_filter(struct perf_event *event, void __user *arg) 6076 { 6077 char *filter_str; 6078 int ret; 6079 6080 if (event->attr.type != PERF_TYPE_TRACEPOINT) 6081 return -EINVAL; 6082 6083 filter_str = strndup_user(arg, PAGE_SIZE); 6084 if (IS_ERR(filter_str)) 6085 return PTR_ERR(filter_str); 6086 6087 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); 6088 6089 kfree(filter_str); 6090 return ret; 6091 } 6092 6093 static void perf_event_free_filter(struct perf_event *event) 6094 { 6095 ftrace_profile_free_filter(event); 6096 } 6097 6098 #else 6099 6100 static inline void perf_tp_register(void) 6101 { 6102 } 6103 6104 static int perf_event_set_filter(struct perf_event *event, void __user *arg) 6105 { 6106 return -ENOENT; 6107 } 6108 6109 static void perf_event_free_filter(struct perf_event *event) 6110 { 6111 } 6112 6113 #endif /* CONFIG_EVENT_TRACING */ 6114 6115 #ifdef CONFIG_HAVE_HW_BREAKPOINT 6116 void perf_bp_event(struct perf_event *bp, void *data) 6117 { 6118 struct perf_sample_data sample; 6119 struct pt_regs *regs = data; 6120 6121 perf_sample_data_init(&sample, bp->attr.bp_addr, 0); 6122 6123 if (!bp->hw.state && !perf_exclude_event(bp, regs)) 6124 perf_swevent_event(bp, 1, &sample, regs); 6125 } 6126 #endif 6127 6128 /* 6129 * hrtimer based swevent callback 6130 */ 6131 6132 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) 6133 { 6134 enum hrtimer_restart ret = HRTIMER_RESTART; 6135 struct perf_sample_data data; 6136 struct pt_regs *regs; 6137 struct perf_event *event; 6138 u64 period; 6139 6140 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 6141 6142 if (event->state != PERF_EVENT_STATE_ACTIVE) 6143 return HRTIMER_NORESTART; 6144 6145 event->pmu->read(event); 6146 6147 
perf_sample_data_init(&data, 0, event->hw.last_period); 6148 regs = get_irq_regs(); 6149 6150 if (regs && !perf_exclude_event(event, regs)) { 6151 if (!(event->attr.exclude_idle && is_idle_task(current))) 6152 if (__perf_event_overflow(event, 1, &data, regs)) 6153 ret = HRTIMER_NORESTART; 6154 } 6155 6156 period = max_t(u64, 10000, event->hw.sample_period); 6157 hrtimer_forward_now(hrtimer, ns_to_ktime(period)); 6158 6159 return ret; 6160 } 6161 6162 static void perf_swevent_start_hrtimer(struct perf_event *event) 6163 { 6164 struct hw_perf_event *hwc = &event->hw; 6165 s64 period; 6166 6167 if (!is_sampling_event(event)) 6168 return; 6169 6170 period = local64_read(&hwc->period_left); 6171 if (period) { 6172 if (period < 0) 6173 period = 10000; 6174 6175 local64_set(&hwc->period_left, 0); 6176 } else { 6177 period = max_t(u64, 10000, hwc->sample_period); 6178 } 6179 __hrtimer_start_range_ns(&hwc->hrtimer, 6180 ns_to_ktime(period), 0, 6181 HRTIMER_MODE_REL_PINNED, 0); 6182 } 6183 6184 static void perf_swevent_cancel_hrtimer(struct perf_event *event) 6185 { 6186 struct hw_perf_event *hwc = &event->hw; 6187 6188 if (is_sampling_event(event)) { 6189 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); 6190 local64_set(&hwc->period_left, ktime_to_ns(remaining)); 6191 6192 hrtimer_cancel(&hwc->hrtimer); 6193 } 6194 } 6195 6196 static void perf_swevent_init_hrtimer(struct perf_event *event) 6197 { 6198 struct hw_perf_event *hwc = &event->hw; 6199 6200 if (!is_sampling_event(event)) 6201 return; 6202 6203 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 6204 hwc->hrtimer.function = perf_swevent_hrtimer; 6205 6206 /* 6207 * Since hrtimers have a fixed rate, we can do a static freq->period 6208 * mapping and avoid the whole period adjust feedback stuff. 
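 *
 * Example: attr.sample_freq = 4000 turns into a sample_period of
 * NSEC_PER_SEC / 4000 = 250000ns, i.e. the hrtimer fires every 250us.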
6209 */ 6210 if (event->attr.freq) { 6211 long freq = event->attr.sample_freq; 6212 6213 event->attr.sample_period = NSEC_PER_SEC / freq; 6214 hwc->sample_period = event->attr.sample_period; 6215 local64_set(&hwc->period_left, hwc->sample_period); 6216 hwc->last_period = hwc->sample_period; 6217 event->attr.freq = 0; 6218 } 6219 } 6220 6221 /* 6222 * Software event: cpu wall time clock 6223 */ 6224 6225 static void cpu_clock_event_update(struct perf_event *event) 6226 { 6227 s64 prev; 6228 u64 now; 6229 6230 now = local_clock(); 6231 prev = local64_xchg(&event->hw.prev_count, now); 6232 local64_add(now - prev, &event->count); 6233 } 6234 6235 static void cpu_clock_event_start(struct perf_event *event, int flags) 6236 { 6237 local64_set(&event->hw.prev_count, local_clock()); 6238 perf_swevent_start_hrtimer(event); 6239 } 6240 6241 static void cpu_clock_event_stop(struct perf_event *event, int flags) 6242 { 6243 perf_swevent_cancel_hrtimer(event); 6244 cpu_clock_event_update(event); 6245 } 6246 6247 static int cpu_clock_event_add(struct perf_event *event, int flags) 6248 { 6249 if (flags & PERF_EF_START) 6250 cpu_clock_event_start(event, flags); 6251 6252 return 0; 6253 } 6254 6255 static void cpu_clock_event_del(struct perf_event *event, int flags) 6256 { 6257 cpu_clock_event_stop(event, flags); 6258 } 6259 6260 static void cpu_clock_event_read(struct perf_event *event) 6261 { 6262 cpu_clock_event_update(event); 6263 } 6264 6265 static int cpu_clock_event_init(struct perf_event *event) 6266 { 6267 if (event->attr.type != PERF_TYPE_SOFTWARE) 6268 return -ENOENT; 6269 6270 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) 6271 return -ENOENT; 6272 6273 /* 6274 * no branch sampling for software events 6275 */ 6276 if (has_branch_stack(event)) 6277 return -EOPNOTSUPP; 6278 6279 perf_swevent_init_hrtimer(event); 6280 6281 return 0; 6282 } 6283 6284 static struct pmu perf_cpu_clock = { 6285 .task_ctx_nr = perf_sw_context, 6286 6287 .event_init = cpu_clock_event_init, 6288 .add = cpu_clock_event_add, 6289 .del = cpu_clock_event_del, 6290 .start = cpu_clock_event_start, 6291 .stop = cpu_clock_event_stop, 6292 .read = cpu_clock_event_read, 6293 6294 .event_idx = perf_swevent_event_idx, 6295 }; 6296 6297 /* 6298 * Software event: task time clock 6299 */ 6300 6301 static void task_clock_event_update(struct perf_event *event, u64 now) 6302 { 6303 u64 prev; 6304 s64 delta; 6305 6306 prev = local64_xchg(&event->hw.prev_count, now); 6307 delta = now - prev; 6308 local64_add(delta, &event->count); 6309 } 6310 6311 static void task_clock_event_start(struct perf_event *event, int flags) 6312 { 6313 local64_set(&event->hw.prev_count, event->ctx->time); 6314 perf_swevent_start_hrtimer(event); 6315 } 6316 6317 static void task_clock_event_stop(struct perf_event *event, int flags) 6318 { 6319 perf_swevent_cancel_hrtimer(event); 6320 task_clock_event_update(event, event->ctx->time); 6321 } 6322 6323 static int task_clock_event_add(struct perf_event *event, int flags) 6324 { 6325 if (flags & PERF_EF_START) 6326 task_clock_event_start(event, flags); 6327 6328 return 0; 6329 } 6330 6331 static void task_clock_event_del(struct perf_event *event, int flags) 6332 { 6333 task_clock_event_stop(event, PERF_EF_UPDATE); 6334 } 6335 6336 static void task_clock_event_read(struct perf_event *event) 6337 { 6338 u64 now = perf_clock(); 6339 u64 delta = now - event->ctx->timestamp; 6340 u64 time = event->ctx->time + delta; 6341 6342 task_clock_event_update(event, time); 6343 } 6344 6345 static int task_clock_event_init(struct 
perf_event *event) 6346 { 6347 if (event->attr.type != PERF_TYPE_SOFTWARE) 6348 return -ENOENT; 6349 6350 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) 6351 return -ENOENT; 6352 6353 /* 6354 * no branch sampling for software events 6355 */ 6356 if (has_branch_stack(event)) 6357 return -EOPNOTSUPP; 6358 6359 perf_swevent_init_hrtimer(event); 6360 6361 return 0; 6362 } 6363 6364 static struct pmu perf_task_clock = { 6365 .task_ctx_nr = perf_sw_context, 6366 6367 .event_init = task_clock_event_init, 6368 .add = task_clock_event_add, 6369 .del = task_clock_event_del, 6370 .start = task_clock_event_start, 6371 .stop = task_clock_event_stop, 6372 .read = task_clock_event_read, 6373 6374 .event_idx = perf_swevent_event_idx, 6375 }; 6376 6377 static void perf_pmu_nop_void(struct pmu *pmu) 6378 { 6379 } 6380 6381 static int perf_pmu_nop_int(struct pmu *pmu) 6382 { 6383 return 0; 6384 } 6385 6386 static void perf_pmu_start_txn(struct pmu *pmu) 6387 { 6388 perf_pmu_disable(pmu); 6389 } 6390 6391 static int perf_pmu_commit_txn(struct pmu *pmu) 6392 { 6393 perf_pmu_enable(pmu); 6394 return 0; 6395 } 6396 6397 static void perf_pmu_cancel_txn(struct pmu *pmu) 6398 { 6399 perf_pmu_enable(pmu); 6400 } 6401 6402 static int perf_event_idx_default(struct perf_event *event) 6403 { 6404 return event->hw.idx + 1; 6405 } 6406 6407 /* 6408 * Ensures all contexts with the same task_ctx_nr have the same 6409 * pmu_cpu_context too. 6410 */ 6411 static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) 6412 { 6413 struct pmu *pmu; 6414 6415 if (ctxn < 0) 6416 return NULL; 6417 6418 list_for_each_entry(pmu, &pmus, entry) { 6419 if (pmu->task_ctx_nr == ctxn) 6420 return pmu->pmu_cpu_context; 6421 } 6422 6423 return NULL; 6424 } 6425 6426 static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) 6427 { 6428 int cpu; 6429 6430 for_each_possible_cpu(cpu) { 6431 struct perf_cpu_context *cpuctx; 6432 6433 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 6434 6435 if (cpuctx->unique_pmu == old_pmu) 6436 cpuctx->unique_pmu = pmu; 6437 } 6438 } 6439 6440 static void free_pmu_context(struct pmu *pmu) 6441 { 6442 struct pmu *i; 6443 6444 mutex_lock(&pmus_lock); 6445 /* 6446 * Like a real lame refcount. 
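 *
 * Walk the remaining pmus and only free the percpu context when no
 * other pmu still shares it; otherwise just pass the unique_pmu role
 * along via update_pmu_context().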
6447 */ 6448 list_for_each_entry(i, &pmus, entry) { 6449 if (i->pmu_cpu_context == pmu->pmu_cpu_context) { 6450 update_pmu_context(i, pmu); 6451 goto out; 6452 } 6453 } 6454 6455 free_percpu(pmu->pmu_cpu_context); 6456 out: 6457 mutex_unlock(&pmus_lock); 6458 } 6459 static struct idr pmu_idr; 6460 6461 static ssize_t 6462 type_show(struct device *dev, struct device_attribute *attr, char *page) 6463 { 6464 struct pmu *pmu = dev_get_drvdata(dev); 6465 6466 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); 6467 } 6468 static DEVICE_ATTR_RO(type); 6469 6470 static ssize_t 6471 perf_event_mux_interval_ms_show(struct device *dev, 6472 struct device_attribute *attr, 6473 char *page) 6474 { 6475 struct pmu *pmu = dev_get_drvdata(dev); 6476 6477 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); 6478 } 6479 6480 static ssize_t 6481 perf_event_mux_interval_ms_store(struct device *dev, 6482 struct device_attribute *attr, 6483 const char *buf, size_t count) 6484 { 6485 struct pmu *pmu = dev_get_drvdata(dev); 6486 int timer, cpu, ret; 6487 6488 ret = kstrtoint(buf, 0, &timer); 6489 if (ret) 6490 return ret; 6491 6492 if (timer < 1) 6493 return -EINVAL; 6494 6495 /* same value, nothing to do */ 6496 if (timer == pmu->hrtimer_interval_ms) 6497 return count; 6498 6499 pmu->hrtimer_interval_ms = timer; 6500 6501 /* update all cpuctx for this PMU */ 6502 for_each_possible_cpu(cpu) { 6503 struct perf_cpu_context *cpuctx; 6504 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 6505 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); 6506 6507 if (hrtimer_active(&cpuctx->hrtimer)) 6508 hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval); 6509 } 6510 6511 return count; 6512 } 6513 static DEVICE_ATTR_RW(perf_event_mux_interval_ms); 6514 6515 static struct attribute *pmu_dev_attrs[] = { 6516 &dev_attr_type.attr, 6517 &dev_attr_perf_event_mux_interval_ms.attr, 6518 NULL, 6519 }; 6520 ATTRIBUTE_GROUPS(pmu_dev); 6521 6522 static int pmu_bus_running; 6523 static struct bus_type pmu_bus = { 6524 .name = "event_source", 6525 .dev_groups = pmu_dev_groups, 6526 }; 6527 6528 static void pmu_dev_release(struct device *dev) 6529 { 6530 kfree(dev); 6531 } 6532 6533 static int pmu_dev_alloc(struct pmu *pmu) 6534 { 6535 int ret = -ENOMEM; 6536 6537 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); 6538 if (!pmu->dev) 6539 goto out; 6540 6541 pmu->dev->groups = pmu->attr_groups; 6542 device_initialize(pmu->dev); 6543 ret = dev_set_name(pmu->dev, "%s", pmu->name); 6544 if (ret) 6545 goto free_dev; 6546 6547 dev_set_drvdata(pmu->dev, pmu); 6548 pmu->dev->bus = &pmu_bus; 6549 pmu->dev->release = pmu_dev_release; 6550 ret = device_add(pmu->dev); 6551 if (ret) 6552 goto free_dev; 6553 6554 out: 6555 return ret; 6556 6557 free_dev: 6558 put_device(pmu->dev); 6559 goto out; 6560 } 6561 6562 static struct lock_class_key cpuctx_mutex; 6563 static struct lock_class_key cpuctx_lock; 6564 6565 int perf_pmu_register(struct pmu *pmu, const char *name, int type) 6566 { 6567 int cpu, ret; 6568 6569 mutex_lock(&pmus_lock); 6570 ret = -ENOMEM; 6571 pmu->pmu_disable_count = alloc_percpu(int); 6572 if (!pmu->pmu_disable_count) 6573 goto unlock; 6574 6575 pmu->type = -1; 6576 if (!name) 6577 goto skip_type; 6578 pmu->name = name; 6579 6580 if (type < 0) { 6581 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL); 6582 if (type < 0) { 6583 ret = type; 6584 goto free_pdc; 6585 } 6586 } 6587 pmu->type = type; 6588 6589 if (pmu_bus_running) { 6590 ret = pmu_dev_alloc(pmu); 6591 if (ret) 6592
goto free_idr; 6593 } 6594 6595 skip_type: 6596 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); 6597 if (pmu->pmu_cpu_context) 6598 goto got_cpu_context; 6599 6600 ret = -ENOMEM; 6601 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); 6602 if (!pmu->pmu_cpu_context) 6603 goto free_dev; 6604 6605 for_each_possible_cpu(cpu) { 6606 struct perf_cpu_context *cpuctx; 6607 6608 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 6609 __perf_event_init_context(&cpuctx->ctx); 6610 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); 6611 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); 6612 cpuctx->ctx.type = cpu_context; 6613 cpuctx->ctx.pmu = pmu; 6614 6615 __perf_cpu_hrtimer_init(cpuctx, cpu); 6616 6617 INIT_LIST_HEAD(&cpuctx->rotation_list); 6618 cpuctx->unique_pmu = pmu; 6619 } 6620 6621 got_cpu_context: 6622 if (!pmu->start_txn) { 6623 if (pmu->pmu_enable) { 6624 /* 6625 * If we have pmu_enable/pmu_disable calls, install 6626 * transaction stubs that use that to try and batch 6627 * hardware accesses. 6628 */ 6629 pmu->start_txn = perf_pmu_start_txn; 6630 pmu->commit_txn = perf_pmu_commit_txn; 6631 pmu->cancel_txn = perf_pmu_cancel_txn; 6632 } else { 6633 pmu->start_txn = perf_pmu_nop_void; 6634 pmu->commit_txn = perf_pmu_nop_int; 6635 pmu->cancel_txn = perf_pmu_nop_void; 6636 } 6637 } 6638 6639 if (!pmu->pmu_enable) { 6640 pmu->pmu_enable = perf_pmu_nop_void; 6641 pmu->pmu_disable = perf_pmu_nop_void; 6642 } 6643 6644 if (!pmu->event_idx) 6645 pmu->event_idx = perf_event_idx_default; 6646 6647 list_add_rcu(&pmu->entry, &pmus); 6648 ret = 0; 6649 unlock: 6650 mutex_unlock(&pmus_lock); 6651 6652 return ret; 6653 6654 free_dev: 6655 device_del(pmu->dev); 6656 put_device(pmu->dev); 6657 6658 free_idr: 6659 if (pmu->type >= PERF_TYPE_MAX) 6660 idr_remove(&pmu_idr, pmu->type); 6661 6662 free_pdc: 6663 free_percpu(pmu->pmu_disable_count); 6664 goto unlock; 6665 } 6666 EXPORT_SYMBOL_GPL(perf_pmu_register); 6667 6668 void perf_pmu_unregister(struct pmu *pmu) 6669 { 6670 mutex_lock(&pmus_lock); 6671 list_del_rcu(&pmu->entry); 6672 mutex_unlock(&pmus_lock); 6673 6674 /* 6675 * We dereference the pmu list under both SRCU and regular RCU, so 6676 * synchronize against both of those. 
6677 */ 6678 synchronize_srcu(&pmus_srcu); 6679 synchronize_rcu(); 6680 6681 free_percpu(pmu->pmu_disable_count); 6682 if (pmu->type >= PERF_TYPE_MAX) 6683 idr_remove(&pmu_idr, pmu->type); 6684 device_del(pmu->dev); 6685 put_device(pmu->dev); 6686 free_pmu_context(pmu); 6687 } 6688 EXPORT_SYMBOL_GPL(perf_pmu_unregister); 6689 6690 struct pmu *perf_init_event(struct perf_event *event) 6691 { 6692 struct pmu *pmu = NULL; 6693 int idx; 6694 int ret; 6695 6696 idx = srcu_read_lock(&pmus_srcu); 6697 6698 rcu_read_lock(); 6699 pmu = idr_find(&pmu_idr, event->attr.type); 6700 rcu_read_unlock(); 6701 if (pmu) { 6702 if (!try_module_get(pmu->module)) { 6703 pmu = ERR_PTR(-ENODEV); 6704 goto unlock; 6705 } 6706 event->pmu = pmu; 6707 ret = pmu->event_init(event); 6708 if (ret) 6709 pmu = ERR_PTR(ret); 6710 goto unlock; 6711 } 6712 6713 list_for_each_entry_rcu(pmu, &pmus, entry) { 6714 if (!try_module_get(pmu->module)) { 6715 pmu = ERR_PTR(-ENODEV); 6716 goto unlock; 6717 } 6718 event->pmu = pmu; 6719 ret = pmu->event_init(event); 6720 if (!ret) 6721 goto unlock; 6722 6723 if (ret != -ENOENT) { 6724 pmu = ERR_PTR(ret); 6725 goto unlock; 6726 } 6727 } 6728 pmu = ERR_PTR(-ENOENT); 6729 unlock: 6730 srcu_read_unlock(&pmus_srcu, idx); 6731 6732 return pmu; 6733 } 6734 6735 static void account_event_cpu(struct perf_event *event, int cpu) 6736 { 6737 if (event->parent) 6738 return; 6739 6740 if (has_branch_stack(event)) { 6741 if (!(event->attach_state & PERF_ATTACH_TASK)) 6742 atomic_inc(&per_cpu(perf_branch_stack_events, cpu)); 6743 } 6744 if (is_cgroup_event(event)) 6745 atomic_inc(&per_cpu(perf_cgroup_events, cpu)); 6746 } 6747 6748 static void account_event(struct perf_event *event) 6749 { 6750 if (event->parent) 6751 return; 6752 6753 if (event->attach_state & PERF_ATTACH_TASK) 6754 static_key_slow_inc(&perf_sched_events.key); 6755 if (event->attr.mmap || event->attr.mmap_data) 6756 atomic_inc(&nr_mmap_events); 6757 if (event->attr.comm) 6758 atomic_inc(&nr_comm_events); 6759 if (event->attr.task) 6760 atomic_inc(&nr_task_events); 6761 if (event->attr.freq) { 6762 if (atomic_inc_return(&nr_freq_events) == 1) 6763 tick_nohz_full_kick_all(); 6764 } 6765 if (has_branch_stack(event)) 6766 static_key_slow_inc(&perf_sched_events.key); 6767 if (is_cgroup_event(event)) 6768 static_key_slow_inc(&perf_sched_events.key); 6769 6770 account_event_cpu(event, event->cpu); 6771 } 6772 6773 /* 6774 * Allocate and initialize a event structure 6775 */ 6776 static struct perf_event * 6777 perf_event_alloc(struct perf_event_attr *attr, int cpu, 6778 struct task_struct *task, 6779 struct perf_event *group_leader, 6780 struct perf_event *parent_event, 6781 perf_overflow_handler_t overflow_handler, 6782 void *context) 6783 { 6784 struct pmu *pmu; 6785 struct perf_event *event; 6786 struct hw_perf_event *hwc; 6787 long err = -EINVAL; 6788 6789 if ((unsigned)cpu >= nr_cpu_ids) { 6790 if (!task || cpu != -1) 6791 return ERR_PTR(-EINVAL); 6792 } 6793 6794 event = kzalloc(sizeof(*event), GFP_KERNEL); 6795 if (!event) 6796 return ERR_PTR(-ENOMEM); 6797 6798 /* 6799 * Single events are their own group leaders, with an 6800 * empty sibling list: 6801 */ 6802 if (!group_leader) 6803 group_leader = event; 6804 6805 mutex_init(&event->child_mutex); 6806 INIT_LIST_HEAD(&event->child_list); 6807 6808 INIT_LIST_HEAD(&event->group_entry); 6809 INIT_LIST_HEAD(&event->event_entry); 6810 INIT_LIST_HEAD(&event->sibling_list); 6811 INIT_LIST_HEAD(&event->rb_entry); 6812 INIT_LIST_HEAD(&event->active_entry); 6813 
INIT_HLIST_NODE(&event->hlist_entry); 6814 6815 6816 init_waitqueue_head(&event->waitq); 6817 init_irq_work(&event->pending, perf_pending_event); 6818 6819 mutex_init(&event->mmap_mutex); 6820 6821 atomic_long_set(&event->refcount, 1); 6822 event->cpu = cpu; 6823 event->attr = *attr; 6824 event->group_leader = group_leader; 6825 event->pmu = NULL; 6826 event->oncpu = -1; 6827 6828 event->parent = parent_event; 6829 6830 event->ns = get_pid_ns(task_active_pid_ns(current)); 6831 event->id = atomic64_inc_return(&perf_event_id); 6832 6833 event->state = PERF_EVENT_STATE_INACTIVE; 6834 6835 if (task) { 6836 event->attach_state = PERF_ATTACH_TASK; 6837 6838 if (attr->type == PERF_TYPE_TRACEPOINT) 6839 event->hw.tp_target = task; 6840 #ifdef CONFIG_HAVE_HW_BREAKPOINT 6841 /* 6842 * hw_breakpoint is a bit difficult here.. 6843 */ 6844 else if (attr->type == PERF_TYPE_BREAKPOINT) 6845 event->hw.bp_target = task; 6846 #endif 6847 } 6848 6849 if (!overflow_handler && parent_event) { 6850 overflow_handler = parent_event->overflow_handler; 6851 context = parent_event->overflow_handler_context; 6852 } 6853 6854 event->overflow_handler = overflow_handler; 6855 event->overflow_handler_context = context; 6856 6857 perf_event__state_init(event); 6858 6859 pmu = NULL; 6860 6861 hwc = &event->hw; 6862 hwc->sample_period = attr->sample_period; 6863 if (attr->freq && attr->sample_freq) 6864 hwc->sample_period = 1; 6865 hwc->last_period = hwc->sample_period; 6866 6867 local64_set(&hwc->period_left, hwc->sample_period); 6868 6869 /* 6870 * we currently do not support PERF_FORMAT_GROUP on inherited events 6871 */ 6872 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 6873 goto err_ns; 6874 6875 pmu = perf_init_event(event); 6876 if (!pmu) 6877 goto err_ns; 6878 else if (IS_ERR(pmu)) { 6879 err = PTR_ERR(pmu); 6880 goto err_ns; 6881 } 6882 6883 if (!event->parent) { 6884 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { 6885 err = get_callchain_buffers(); 6886 if (err) 6887 goto err_pmu; 6888 } 6889 } 6890 6891 return event; 6892 6893 err_pmu: 6894 if (event->destroy) 6895 event->destroy(event); 6896 module_put(pmu->module); 6897 err_ns: 6898 if (event->ns) 6899 put_pid_ns(event->ns); 6900 kfree(event); 6901 6902 return ERR_PTR(err); 6903 } 6904 6905 static int perf_copy_attr(struct perf_event_attr __user *uattr, 6906 struct perf_event_attr *attr) 6907 { 6908 u32 size; 6909 int ret; 6910 6911 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) 6912 return -EFAULT; 6913 6914 /* 6915 * zero the full structure, so that a short copy will be nice. 6916 */ 6917 memset(attr, 0, sizeof(*attr)); 6918 6919 ret = get_user(size, &uattr->size); 6920 if (ret) 6921 return ret; 6922 6923 if (size > PAGE_SIZE) /* silly large */ 6924 goto err_size; 6925 6926 if (!size) /* abi compat */ 6927 size = PERF_ATTR_SIZE_VER0; 6928 6929 if (size < PERF_ATTR_SIZE_VER0) 6930 goto err_size; 6931 6932 /* 6933 * If we're handed a bigger struct than we know of, 6934 * ensure all the unknown bits are 0 - i.e. new 6935 * user-space does not rely on any kernel feature 6936 * extensions we dont know about yet. 
6937 */ 6938 if (size > sizeof(*attr)) { 6939 unsigned char __user *addr; 6940 unsigned char __user *end; 6941 unsigned char val; 6942 6943 addr = (void __user *)uattr + sizeof(*attr); 6944 end = (void __user *)uattr + size; 6945 6946 for (; addr < end; addr++) { 6947 ret = get_user(val, addr); 6948 if (ret) 6949 return ret; 6950 if (val) 6951 goto err_size; 6952 } 6953 size = sizeof(*attr); 6954 } 6955 6956 ret = copy_from_user(attr, uattr, size); 6957 if (ret) 6958 return -EFAULT; 6959 6960 if (attr->__reserved_1) 6961 return -EINVAL; 6962 6963 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) 6964 return -EINVAL; 6965 6966 if (attr->read_format & ~(PERF_FORMAT_MAX-1)) 6967 return -EINVAL; 6968 6969 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) { 6970 u64 mask = attr->branch_sample_type; 6971 6972 /* only using defined bits */ 6973 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) 6974 return -EINVAL; 6975 6976 /* at least one branch bit must be set */ 6977 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) 6978 return -EINVAL; 6979 6980 /* propagate priv level, when not set for branch */ 6981 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { 6982 6983 /* exclude_kernel checked on syscall entry */ 6984 if (!attr->exclude_kernel) 6985 mask |= PERF_SAMPLE_BRANCH_KERNEL; 6986 6987 if (!attr->exclude_user) 6988 mask |= PERF_SAMPLE_BRANCH_USER; 6989 6990 if (!attr->exclude_hv) 6991 mask |= PERF_SAMPLE_BRANCH_HV; 6992 /* 6993 * adjust user setting (for HW filter setup) 6994 */ 6995 attr->branch_sample_type = mask; 6996 } 6997 /* privileged levels capture (kernel, hv): check permissions */ 6998 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) 6999 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 7000 return -EACCES; 7001 } 7002 7003 if (attr->sample_type & PERF_SAMPLE_REGS_USER) { 7004 ret = perf_reg_validate(attr->sample_regs_user); 7005 if (ret) 7006 return ret; 7007 } 7008 7009 if (attr->sample_type & PERF_SAMPLE_STACK_USER) { 7010 if (!arch_perf_have_user_stack_dump()) 7011 return -ENOSYS; 7012 7013 /* 7014 * We have __u32 type for the size, but so far 7015 * we can only use __u16 as maximum due to the 7016 * __u16 sample size limit. 7017 */ 7018 if (attr->sample_stack_user >= USHRT_MAX) 7019 ret = -EINVAL; 7020 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) 7021 ret = -EINVAL; 7022 } 7023 7024 out: 7025 return ret; 7026 7027 err_size: 7028 put_user(sizeof(*attr), &uattr->size); 7029 ret = -E2BIG; 7030 goto out; 7031 } 7032 7033 static int 7034 perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 7035 { 7036 struct ring_buffer *rb = NULL; 7037 int ret = -EINVAL; 7038 7039 if (!output_event) 7040 goto set; 7041 7042 /* don't allow circular references */ 7043 if (event == output_event) 7044 goto out; 7045 7046 /* 7047 * Don't allow cross-cpu buffers 7048 */ 7049 if (output_event->cpu != event->cpu) 7050 goto out; 7051 7052 /* 7053 * If its not a per-cpu rb, it must be the same task. 
7054 */ 7055 if (output_event->cpu == -1 && output_event->ctx != event->ctx) 7056 goto out; 7057 7058 set: 7059 mutex_lock(&event->mmap_mutex); 7060 /* Can't redirect output if we've got an active mmap() */ 7061 if (atomic_read(&event->mmap_count)) 7062 goto unlock; 7063 7064 if (output_event) { 7065 /* get the rb we want to redirect to */ 7066 rb = ring_buffer_get(output_event); 7067 if (!rb) 7068 goto unlock; 7069 } 7070 7071 ring_buffer_attach(event, rb); 7072 7073 ret = 0; 7074 unlock: 7075 mutex_unlock(&event->mmap_mutex); 7076 7077 out: 7078 return ret; 7079 } 7080 7081 /** 7082 * sys_perf_event_open - open a performance event, associate it to a task/cpu 7083 * 7084 * @attr_uptr: event_id type attributes for monitoring/sampling 7085 * @pid: target pid 7086 * @cpu: target cpu 7087 * @group_fd: group leader event fd 7088 */ 7089 SYSCALL_DEFINE5(perf_event_open, 7090 struct perf_event_attr __user *, attr_uptr, 7091 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 7092 { 7093 struct perf_event *group_leader = NULL, *output_event = NULL; 7094 struct perf_event *event, *sibling; 7095 struct perf_event_attr attr; 7096 struct perf_event_context *ctx; 7097 struct file *event_file = NULL; 7098 struct fd group = {NULL, 0}; 7099 struct task_struct *task = NULL; 7100 struct pmu *pmu; 7101 int event_fd; 7102 int move_group = 0; 7103 int err; 7104 int f_flags = O_RDWR; 7105 7106 /* for future expandability... */ 7107 if (flags & ~PERF_FLAG_ALL) 7108 return -EINVAL; 7109 7110 err = perf_copy_attr(attr_uptr, &attr); 7111 if (err) 7112 return err; 7113 7114 if (!attr.exclude_kernel) { 7115 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 7116 return -EACCES; 7117 } 7118 7119 if (attr.freq) { 7120 if (attr.sample_freq > sysctl_perf_event_sample_rate) 7121 return -EINVAL; 7122 } else { 7123 if (attr.sample_period & (1ULL << 63)) 7124 return -EINVAL; 7125 } 7126 7127 /* 7128 * In cgroup mode, the pid argument is used to pass the fd 7129 * opened to the cgroup directory in cgroupfs. The cpu argument 7130 * designates the cpu on which to monitor threads from that 7131 * cgroup. 
7132 */ 7133 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) 7134 return -EINVAL; 7135 7136 if (flags & PERF_FLAG_FD_CLOEXEC) 7137 f_flags |= O_CLOEXEC; 7138 7139 event_fd = get_unused_fd_flags(f_flags); 7140 if (event_fd < 0) 7141 return event_fd; 7142 7143 if (group_fd != -1) { 7144 err = perf_fget_light(group_fd, &group); 7145 if (err) 7146 goto err_fd; 7147 group_leader = group.file->private_data; 7148 if (flags & PERF_FLAG_FD_OUTPUT) 7149 output_event = group_leader; 7150 if (flags & PERF_FLAG_FD_NO_GROUP) 7151 group_leader = NULL; 7152 } 7153 7154 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { 7155 task = find_lively_task_by_vpid(pid); 7156 if (IS_ERR(task)) { 7157 err = PTR_ERR(task); 7158 goto err_group_fd; 7159 } 7160 } 7161 7162 if (task && group_leader && 7163 group_leader->attr.inherit != attr.inherit) { 7164 err = -EINVAL; 7165 goto err_task; 7166 } 7167 7168 get_online_cpus(); 7169 7170 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, 7171 NULL, NULL); 7172 if (IS_ERR(event)) { 7173 err = PTR_ERR(event); 7174 goto err_cpus; 7175 } 7176 7177 if (flags & PERF_FLAG_PID_CGROUP) { 7178 err = perf_cgroup_connect(pid, event, &attr, group_leader); 7179 if (err) { 7180 __free_event(event); 7181 goto err_cpus; 7182 } 7183 } 7184 7185 if (is_sampling_event(event)) { 7186 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { 7187 err = -ENOTSUPP; 7188 goto err_alloc; 7189 } 7190 } 7191 7192 account_event(event); 7193 7194 /* 7195 * Special case software events and allow them to be part of 7196 * any hardware group. 7197 */ 7198 pmu = event->pmu; 7199 7200 if (group_leader && 7201 (is_software_event(event) != is_software_event(group_leader))) { 7202 if (is_software_event(event)) { 7203 /* 7204 * If event and group_leader are not both a software 7205 * event, and event is, then group leader is not. 7206 * 7207 * Allow the addition of software events to !software 7208 * groups, this is safe because software events never 7209 * fail to schedule. 7210 */ 7211 pmu = group_leader->pmu; 7212 } else if (is_software_event(group_leader) && 7213 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { 7214 /* 7215 * In case the group is a pure software group, and we 7216 * try to add a hardware event, move the whole group to 7217 * the hardware context. 
7218 */ 7219 move_group = 1; 7220 } 7221 } 7222 7223 /* 7224 * Get the target context (task or percpu): 7225 */ 7226 ctx = find_get_context(pmu, task, event->cpu); 7227 if (IS_ERR(ctx)) { 7228 err = PTR_ERR(ctx); 7229 goto err_alloc; 7230 } 7231 7232 if (task) { 7233 put_task_struct(task); 7234 task = NULL; 7235 } 7236 7237 /* 7238 * Look up the group leader (we will attach this event to it): 7239 */ 7240 if (group_leader) { 7241 err = -EINVAL; 7242 7243 /* 7244 * Do not allow a recursive hierarchy (this new sibling 7245 * becoming part of another group-sibling): 7246 */ 7247 if (group_leader->group_leader != group_leader) 7248 goto err_context; 7249 /* 7250 * Do not allow attaching to a group in a different 7251 * task or CPU context: 7252 */ 7253 if (move_group) { 7254 if (group_leader->ctx->type != ctx->type) 7255 goto err_context; 7256 } else { 7257 if (group_leader->ctx != ctx) 7258 goto err_context; 7259 } 7260 7261 /* 7262 * Only a group leader can be exclusive or pinned 7263 */ 7264 if (attr.exclusive || attr.pinned) 7265 goto err_context; 7266 } 7267 7268 if (output_event) { 7269 err = perf_event_set_output(event, output_event); 7270 if (err) 7271 goto err_context; 7272 } 7273 7274 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, 7275 f_flags); 7276 if (IS_ERR(event_file)) { 7277 err = PTR_ERR(event_file); 7278 goto err_context; 7279 } 7280 7281 if (move_group) { 7282 struct perf_event_context *gctx = group_leader->ctx; 7283 7284 mutex_lock(&gctx->mutex); 7285 perf_remove_from_context(group_leader, false); 7286 7287 /* 7288 * Removing from the context ends up with a disabled 7289 * event. What we want here is the event in its initial 7290 * startup state, ready to be added into the new context. 7291 */ 7292 perf_event__state_init(group_leader); 7293 list_for_each_entry(sibling, &group_leader->sibling_list, 7294 group_entry) { 7295 perf_remove_from_context(sibling, false); 7296 perf_event__state_init(sibling); 7297 put_ctx(gctx); 7298 } 7299 mutex_unlock(&gctx->mutex); 7300 put_ctx(gctx); 7301 } 7302 7303 WARN_ON_ONCE(ctx->parent_ctx); 7304 mutex_lock(&ctx->mutex); 7305 7306 if (move_group) { 7307 synchronize_rcu(); 7308 perf_install_in_context(ctx, group_leader, event->cpu); 7309 get_ctx(ctx); 7310 list_for_each_entry(sibling, &group_leader->sibling_list, 7311 group_entry) { 7312 perf_install_in_context(ctx, sibling, event->cpu); 7313 get_ctx(ctx); 7314 } 7315 } 7316 7317 perf_install_in_context(ctx, event, event->cpu); 7318 perf_unpin_context(ctx); 7319 mutex_unlock(&ctx->mutex); 7320 7321 put_online_cpus(); 7322 7323 event->owner = current; 7324 7325 mutex_lock(&current->perf_event_mutex); 7326 list_add_tail(&event->owner_entry, &current->perf_event_list); 7327 mutex_unlock(&current->perf_event_mutex); 7328 7329 /* 7330 * Precalculate sample_data sizes 7331 */ 7332 perf_event__header_size(event); 7333 perf_event__id_header_size(event); 7334 7335 /* 7336 * Drop the reference on the group_event after placing the 7337 * new event on the sibling_list. This ensures destruction 7338 * of the group leader will find the pointer to itself in 7339 * perf_group_detach(). 
7340 */ 7341 fdput(group); 7342 fd_install(event_fd, event_file); 7343 return event_fd; 7344 7345 err_context: 7346 perf_unpin_context(ctx); 7347 put_ctx(ctx); 7348 err_alloc: 7349 free_event(event); 7350 err_cpus: 7351 put_online_cpus(); 7352 err_task: 7353 if (task) 7354 put_task_struct(task); 7355 err_group_fd: 7356 fdput(group); 7357 err_fd: 7358 put_unused_fd(event_fd); 7359 return err; 7360 } 7361 7362 /** 7363 * perf_event_create_kernel_counter 7364 * 7365 * @attr: attributes of the counter to create 7366 * @cpu: cpu in which the counter is bound 7367 * @task: task to profile (NULL for percpu) 7368 */ 7369 struct perf_event * 7370 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 7371 struct task_struct *task, 7372 perf_overflow_handler_t overflow_handler, 7373 void *context) 7374 { 7375 struct perf_event_context *ctx; 7376 struct perf_event *event; 7377 int err; 7378 7379 /* 7380 * Get the target context (task or percpu): 7381 */ 7382 7383 event = perf_event_alloc(attr, cpu, task, NULL, NULL, 7384 overflow_handler, context); 7385 if (IS_ERR(event)) { 7386 err = PTR_ERR(event); 7387 goto err; 7388 } 7389 7390 account_event(event); 7391 7392 ctx = find_get_context(event->pmu, task, cpu); 7393 if (IS_ERR(ctx)) { 7394 err = PTR_ERR(ctx); 7395 goto err_free; 7396 } 7397 7398 WARN_ON_ONCE(ctx->parent_ctx); 7399 mutex_lock(&ctx->mutex); 7400 perf_install_in_context(ctx, event, cpu); 7401 perf_unpin_context(ctx); 7402 mutex_unlock(&ctx->mutex); 7403 7404 return event; 7405 7406 err_free: 7407 free_event(event); 7408 err: 7409 return ERR_PTR(err); 7410 } 7411 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 7412 7413 void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) 7414 { 7415 struct perf_event_context *src_ctx; 7416 struct perf_event_context *dst_ctx; 7417 struct perf_event *event, *tmp; 7418 LIST_HEAD(events); 7419 7420 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; 7421 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; 7422 7423 mutex_lock(&src_ctx->mutex); 7424 list_for_each_entry_safe(event, tmp, &src_ctx->event_list, 7425 event_entry) { 7426 perf_remove_from_context(event, false); 7427 unaccount_event_cpu(event, src_cpu); 7428 put_ctx(src_ctx); 7429 list_add(&event->migrate_entry, &events); 7430 } 7431 mutex_unlock(&src_ctx->mutex); 7432 7433 synchronize_rcu(); 7434 7435 mutex_lock(&dst_ctx->mutex); 7436 list_for_each_entry_safe(event, tmp, &events, migrate_entry) { 7437 list_del(&event->migrate_entry); 7438 if (event->state >= PERF_EVENT_STATE_OFF) 7439 event->state = PERF_EVENT_STATE_INACTIVE; 7440 account_event_cpu(event, dst_cpu); 7441 perf_install_in_context(dst_ctx, event, dst_cpu); 7442 get_ctx(dst_ctx); 7443 } 7444 mutex_unlock(&dst_ctx->mutex); 7445 } 7446 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); 7447 7448 static void sync_child_event(struct perf_event *child_event, 7449 struct task_struct *child) 7450 { 7451 struct perf_event *parent_event = child_event->parent; 7452 u64 child_val; 7453 7454 if (child_event->attr.inherit_stat) 7455 perf_event_read_event(child_event, child); 7456 7457 child_val = perf_event_count(child_event); 7458 7459 /* 7460 * Add back the child's count to the parent's count: 7461 */ 7462 atomic64_add(child_val, &parent_event->child_count); 7463 atomic64_add(child_event->total_time_enabled, 7464 &parent_event->child_total_time_enabled); 7465 atomic64_add(child_event->total_time_running, 7466 &parent_event->child_total_time_running); 7467 7468 /* 7469 * Remove this event from the 
parent's list 7470 */ 7471 WARN_ON_ONCE(parent_event->ctx->parent_ctx); 7472 mutex_lock(&parent_event->child_mutex); 7473 list_del_init(&child_event->child_list); 7474 mutex_unlock(&parent_event->child_mutex); 7475 7476 /* 7477 * Release the parent event, if this was the last 7478 * reference to it. 7479 */ 7480 put_event(parent_event); 7481 } 7482 7483 static void 7484 __perf_event_exit_task(struct perf_event *child_event, 7485 struct perf_event_context *child_ctx, 7486 struct task_struct *child) 7487 { 7488 /* 7489 * Do not destroy the 'original' grouping; because of the context 7490 * switch optimization the original events could've ended up in a 7491 * random child task. 7492 * 7493 * If we were to destroy the original group, all group related 7494 * operations would cease to function properly after this random 7495 * child dies. 7496 * 7497 * Do destroy all inherited groups, we don't care about those 7498 * and being thorough is better. 7499 */ 7500 perf_remove_from_context(child_event, !!child_event->parent); 7501 7502 /* 7503 * It can happen that the parent exits first, and has events 7504 * that are still around due to the child reference. These 7505 * events need to be zapped. 7506 */ 7507 if (child_event->parent) { 7508 sync_child_event(child_event, child); 7509 free_event(child_event); 7510 } 7511 } 7512 7513 static void perf_event_exit_task_context(struct task_struct *child, int ctxn) 7514 { 7515 struct perf_event *child_event, *next; 7516 struct perf_event_context *child_ctx, *parent_ctx; 7517 unsigned long flags; 7518 7519 if (likely(!child->perf_event_ctxp[ctxn])) { 7520 perf_event_task(child, NULL, 0); 7521 return; 7522 } 7523 7524 local_irq_save(flags); 7525 /* 7526 * We can't reschedule here because interrupts are disabled, 7527 * and either child is current or it is a task that can't be 7528 * scheduled, so we are now safe from rescheduling changing 7529 * our context. 7530 */ 7531 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); 7532 7533 /* 7534 * Take the context lock here so that if find_get_context is 7535 * reading child->perf_event_ctxp, we wait until it has 7536 * incremented the context's refcount before we do put_ctx below. 7537 */ 7538 raw_spin_lock(&child_ctx->lock); 7539 task_ctx_sched_out(child_ctx); 7540 child->perf_event_ctxp[ctxn] = NULL; 7541 7542 /* 7543 * In order to avoid freeing: child_ctx->parent_ctx->task 7544 * under perf_event_context::lock, grab another reference. 7545 */ 7546 parent_ctx = child_ctx->parent_ctx; 7547 if (parent_ctx) 7548 get_ctx(parent_ctx); 7549 7550 /* 7551 * If this context is a clone; unclone it so it can't get 7552 * swapped to another process while we're removing all 7553 * the events from it. 7554 */ 7555 unclone_ctx(child_ctx); 7556 update_context_time(child_ctx); 7557 raw_spin_unlock_irqrestore(&child_ctx->lock, flags); 7558 7559 /* 7560 * Now that we no longer hold perf_event_context::lock, drop 7561 * our extra child_ctx->parent_ctx reference. 7562 */ 7563 if (parent_ctx) 7564 put_ctx(parent_ctx); 7565 7566 /* 7567 * Report the task dead after unscheduling the events so that we 7568 * won't get any samples after PERF_RECORD_EXIT. We can however still 7569 * get a few PERF_RECORD_READ events. 7570 */ 7571 perf_event_task(child, child_ctx, 0); 7572 7573 /* 7574 * We can recurse on the same lock type through: 7575 * 7576 * __perf_event_exit_task() 7577 * sync_child_event() 7578 * put_event() 7579 * mutex_lock(&ctx->mutex) 7580 * 7581 * But since its the parent context it won't be the same instance. 
7582 */ 7583 mutex_lock(&child_ctx->mutex); 7584 7585 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) 7586 __perf_event_exit_task(child_event, child_ctx, child); 7587 7588 mutex_unlock(&child_ctx->mutex); 7589 7590 put_ctx(child_ctx); 7591 } 7592 7593 /* 7594 * When a child task exits, feed back event values to parent events. 7595 */ 7596 void perf_event_exit_task(struct task_struct *child) 7597 { 7598 struct perf_event *event, *tmp; 7599 int ctxn; 7600 7601 mutex_lock(&child->perf_event_mutex); 7602 list_for_each_entry_safe(event, tmp, &child->perf_event_list, 7603 owner_entry) { 7604 list_del_init(&event->owner_entry); 7605 7606 /* 7607 * Ensure the list deletion is visible before we clear 7608 * the owner, closes a race against perf_release() where 7609 * we need to serialize on the owner->perf_event_mutex. 7610 */ 7611 smp_wmb(); 7612 event->owner = NULL; 7613 } 7614 mutex_unlock(&child->perf_event_mutex); 7615 7616 for_each_task_context_nr(ctxn) 7617 perf_event_exit_task_context(child, ctxn); 7618 } 7619 7620 static void perf_free_event(struct perf_event *event, 7621 struct perf_event_context *ctx) 7622 { 7623 struct perf_event *parent = event->parent; 7624 7625 if (WARN_ON_ONCE(!parent)) 7626 return; 7627 7628 mutex_lock(&parent->child_mutex); 7629 list_del_init(&event->child_list); 7630 mutex_unlock(&parent->child_mutex); 7631 7632 put_event(parent); 7633 7634 perf_group_detach(event); 7635 list_del_event(event, ctx); 7636 free_event(event); 7637 } 7638 7639 /* 7640 * free an unexposed, unused context as created by inheritance by 7641 * perf_event_init_task below, used by fork() in case of fail. 7642 */ 7643 void perf_event_free_task(struct task_struct *task) 7644 { 7645 struct perf_event_context *ctx; 7646 struct perf_event *event, *tmp; 7647 int ctxn; 7648 7649 for_each_task_context_nr(ctxn) { 7650 ctx = task->perf_event_ctxp[ctxn]; 7651 if (!ctx) 7652 continue; 7653 7654 mutex_lock(&ctx->mutex); 7655 again: 7656 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, 7657 group_entry) 7658 perf_free_event(event, ctx); 7659 7660 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, 7661 group_entry) 7662 perf_free_event(event, ctx); 7663 7664 if (!list_empty(&ctx->pinned_groups) || 7665 !list_empty(&ctx->flexible_groups)) 7666 goto again; 7667 7668 mutex_unlock(&ctx->mutex); 7669 7670 put_ctx(ctx); 7671 } 7672 } 7673 7674 void perf_event_delayed_put(struct task_struct *task) 7675 { 7676 int ctxn; 7677 7678 for_each_task_context_nr(ctxn) 7679 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); 7680 } 7681 7682 /* 7683 * inherit a event from parent task to child task: 7684 */ 7685 static struct perf_event * 7686 inherit_event(struct perf_event *parent_event, 7687 struct task_struct *parent, 7688 struct perf_event_context *parent_ctx, 7689 struct task_struct *child, 7690 struct perf_event *group_leader, 7691 struct perf_event_context *child_ctx) 7692 { 7693 struct perf_event *child_event; 7694 unsigned long flags; 7695 7696 /* 7697 * Instead of creating recursive hierarchies of events, 7698 * we link inherited events back to the original parent, 7699 * which has a filp for sure, which we use as the reference 7700 * count: 7701 */ 7702 if (parent_event->parent) 7703 parent_event = parent_event->parent; 7704 7705 child_event = perf_event_alloc(&parent_event->attr, 7706 parent_event->cpu, 7707 child, 7708 group_leader, parent_event, 7709 NULL, NULL); 7710 if (IS_ERR(child_event)) 7711 return child_event; 7712 7713 if 
(!atomic_long_inc_not_zero(&parent_event->refcount)) { 7714 free_event(child_event); 7715 return NULL; 7716 } 7717 7718 get_ctx(child_ctx); 7719 7720 /* 7721 * Make the child state follow the state of the parent event, 7722 * not its attr.disabled bit. We hold the parent's mutex, 7723 * so we won't race with perf_event_{en, dis}able_family. 7724 */ 7725 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) 7726 child_event->state = PERF_EVENT_STATE_INACTIVE; 7727 else 7728 child_event->state = PERF_EVENT_STATE_OFF; 7729 7730 if (parent_event->attr.freq) { 7731 u64 sample_period = parent_event->hw.sample_period; 7732 struct hw_perf_event *hwc = &child_event->hw; 7733 7734 hwc->sample_period = sample_period; 7735 hwc->last_period = sample_period; 7736 7737 local64_set(&hwc->period_left, sample_period); 7738 } 7739 7740 child_event->ctx = child_ctx; 7741 child_event->overflow_handler = parent_event->overflow_handler; 7742 child_event->overflow_handler_context 7743 = parent_event->overflow_handler_context; 7744 7745 /* 7746 * Precalculate sample_data sizes 7747 */ 7748 perf_event__header_size(child_event); 7749 perf_event__id_header_size(child_event); 7750 7751 /* 7752 * Link it up in the child's context: 7753 */ 7754 raw_spin_lock_irqsave(&child_ctx->lock, flags); 7755 add_event_to_ctx(child_event, child_ctx); 7756 raw_spin_unlock_irqrestore(&child_ctx->lock, flags); 7757 7758 /* 7759 * Link this into the parent event's child list 7760 */ 7761 WARN_ON_ONCE(parent_event->ctx->parent_ctx); 7762 mutex_lock(&parent_event->child_mutex); 7763 list_add_tail(&child_event->child_list, &parent_event->child_list); 7764 mutex_unlock(&parent_event->child_mutex); 7765 7766 return child_event; 7767 } 7768 7769 static int inherit_group(struct perf_event *parent_event, 7770 struct task_struct *parent, 7771 struct perf_event_context *parent_ctx, 7772 struct task_struct *child, 7773 struct perf_event_context *child_ctx) 7774 { 7775 struct perf_event *leader; 7776 struct perf_event *sub; 7777 struct perf_event *child_ctr; 7778 7779 leader = inherit_event(parent_event, parent, parent_ctx, 7780 child, NULL, child_ctx); 7781 if (IS_ERR(leader)) 7782 return PTR_ERR(leader); 7783 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { 7784 child_ctr = inherit_event(sub, parent, parent_ctx, 7785 child, leader, child_ctx); 7786 if (IS_ERR(child_ctr)) 7787 return PTR_ERR(child_ctr); 7788 } 7789 return 0; 7790 } 7791 7792 static int 7793 inherit_task_group(struct perf_event *event, struct task_struct *parent, 7794 struct perf_event_context *parent_ctx, 7795 struct task_struct *child, int ctxn, 7796 int *inherited_all) 7797 { 7798 int ret; 7799 struct perf_event_context *child_ctx; 7800 7801 if (!event->attr.inherit) { 7802 *inherited_all = 0; 7803 return 0; 7804 } 7805 7806 child_ctx = child->perf_event_ctxp[ctxn]; 7807 if (!child_ctx) { 7808 /* 7809 * This is executed from the parent task context, so 7810 * inherit events that have been marked for cloning. 7811 * First allocate and initialize a context for the 7812 * child. 
7813 */ 7814 7815 child_ctx = alloc_perf_context(parent_ctx->pmu, child); 7816 if (!child_ctx) 7817 return -ENOMEM; 7818 7819 child->perf_event_ctxp[ctxn] = child_ctx; 7820 } 7821 7822 ret = inherit_group(event, parent, parent_ctx, 7823 child, child_ctx); 7824 7825 if (ret) 7826 *inherited_all = 0; 7827 7828 return ret; 7829 } 7830 7831 /* 7832 * Initialize the perf_event context in task_struct 7833 */ 7834 static int perf_event_init_context(struct task_struct *child, int ctxn) 7835 { 7836 struct perf_event_context *child_ctx, *parent_ctx; 7837 struct perf_event_context *cloned_ctx; 7838 struct perf_event *event; 7839 struct task_struct *parent = current; 7840 int inherited_all = 1; 7841 unsigned long flags; 7842 int ret = 0; 7843 7844 if (likely(!parent->perf_event_ctxp[ctxn])) 7845 return 0; 7846 7847 /* 7848 * If the parent's context is a clone, pin it so it won't get 7849 * swapped under us. 7850 */ 7851 parent_ctx = perf_pin_task_context(parent, ctxn); 7852 if (!parent_ctx) 7853 return 0; 7854 7855 /* 7856 * No need to check if parent_ctx != NULL here; since we saw 7857 * it non-NULL earlier, the only reason for it to become NULL 7858 * is if we exit, and since we're currently in the middle of 7859 * a fork we can't be exiting at the same time. 7860 */ 7861 7862 /* 7863 * Lock the parent list. No need to lock the child - not PID 7864 * hashed yet and not running, so nobody can access it. 7865 */ 7866 mutex_lock(&parent_ctx->mutex); 7867 7868 /* 7869 * We dont have to disable NMIs - we are only looking at 7870 * the list, not manipulating it: 7871 */ 7872 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { 7873 ret = inherit_task_group(event, parent, parent_ctx, 7874 child, ctxn, &inherited_all); 7875 if (ret) 7876 break; 7877 } 7878 7879 /* 7880 * We can't hold ctx->lock when iterating the ->flexible_group list due 7881 * to allocations, but we need to prevent rotation because 7882 * rotate_ctx() will change the list from interrupt context. 7883 */ 7884 raw_spin_lock_irqsave(&parent_ctx->lock, flags); 7885 parent_ctx->rotate_disable = 1; 7886 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); 7887 7888 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 7889 ret = inherit_task_group(event, parent, parent_ctx, 7890 child, ctxn, &inherited_all); 7891 if (ret) 7892 break; 7893 } 7894 7895 raw_spin_lock_irqsave(&parent_ctx->lock, flags); 7896 parent_ctx->rotate_disable = 0; 7897 7898 child_ctx = child->perf_event_ctxp[ctxn]; 7899 7900 if (child_ctx && inherited_all) { 7901 /* 7902 * Mark the child context as a clone of the parent 7903 * context, or of whatever the parent is a clone of. 7904 * 7905 * Note that if the parent is a clone, the holding of 7906 * parent_ctx->lock avoids it from being uncloned. 
7907 */ 7908 cloned_ctx = parent_ctx->parent_ctx; 7909 if (cloned_ctx) { 7910 child_ctx->parent_ctx = cloned_ctx; 7911 child_ctx->parent_gen = parent_ctx->parent_gen; 7912 } else { 7913 child_ctx->parent_ctx = parent_ctx; 7914 child_ctx->parent_gen = parent_ctx->generation; 7915 } 7916 get_ctx(child_ctx->parent_ctx); 7917 } 7918 7919 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); 7920 mutex_unlock(&parent_ctx->mutex); 7921 7922 perf_unpin_context(parent_ctx); 7923 put_ctx(parent_ctx); 7924 7925 return ret; 7926 } 7927 7928 /* 7929 * Initialize the perf_event context in task_struct 7930 */ 7931 int perf_event_init_task(struct task_struct *child) 7932 { 7933 int ctxn, ret; 7934 7935 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); 7936 mutex_init(&child->perf_event_mutex); 7937 INIT_LIST_HEAD(&child->perf_event_list); 7938 7939 for_each_task_context_nr(ctxn) { 7940 ret = perf_event_init_context(child, ctxn); 7941 if (ret) 7942 return ret; 7943 } 7944 7945 return 0; 7946 } 7947 7948 static void __init perf_event_init_all_cpus(void) 7949 { 7950 struct swevent_htable *swhash; 7951 int cpu; 7952 7953 for_each_possible_cpu(cpu) { 7954 swhash = &per_cpu(swevent_htable, cpu); 7955 mutex_init(&swhash->hlist_mutex); 7956 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); 7957 } 7958 } 7959 7960 static void perf_event_init_cpu(int cpu) 7961 { 7962 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 7963 7964 mutex_lock(&swhash->hlist_mutex); 7965 swhash->online = true; 7966 if (swhash->hlist_refcount > 0) { 7967 struct swevent_hlist *hlist; 7968 7969 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); 7970 WARN_ON(!hlist); 7971 rcu_assign_pointer(swhash->swevent_hlist, hlist); 7972 } 7973 mutex_unlock(&swhash->hlist_mutex); 7974 } 7975 7976 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC 7977 static void perf_pmu_rotate_stop(struct pmu *pmu) 7978 { 7979 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 7980 7981 WARN_ON(!irqs_disabled()); 7982 7983 list_del_init(&cpuctx->rotation_list); 7984 } 7985 7986 static void __perf_event_exit_context(void *__info) 7987 { 7988 struct remove_event re = { .detach_group = false }; 7989 struct perf_event_context *ctx = __info; 7990 7991 perf_pmu_rotate_stop(ctx->pmu); 7992 7993 rcu_read_lock(); 7994 list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) 7995 __perf_remove_from_context(&re); 7996 rcu_read_unlock(); 7997 } 7998 7999 static void perf_event_exit_cpu_context(int cpu) 8000 { 8001 struct perf_event_context *ctx; 8002 struct pmu *pmu; 8003 int idx; 8004 8005 idx = srcu_read_lock(&pmus_srcu); 8006 list_for_each_entry_rcu(pmu, &pmus, entry) { 8007 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; 8008 8009 mutex_lock(&ctx->mutex); 8010 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); 8011 mutex_unlock(&ctx->mutex); 8012 } 8013 srcu_read_unlock(&pmus_srcu, idx); 8014 } 8015 8016 static void perf_event_exit_cpu(int cpu) 8017 { 8018 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 8019 8020 perf_event_exit_cpu_context(cpu); 8021 8022 mutex_lock(&swhash->hlist_mutex); 8023 swhash->online = false; 8024 swevent_hlist_release(swhash); 8025 mutex_unlock(&swhash->hlist_mutex); 8026 } 8027 #else 8028 static inline void perf_event_exit_cpu(int cpu) { } 8029 #endif 8030 8031 static int 8032 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) 8033 { 8034 int cpu; 8035 8036 for_each_online_cpu(cpu) 8037 perf_event_exit_cpu(cpu); 8038 
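	/*
	 * By this point perf_event_exit_cpu() has run on every online CPU,
	 * so the remaining reboot notifiers execute with the per-CPU perf
	 * contexts already emptied.
	 */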
8039 return NOTIFY_OK; 8040 } 8041 8042 /* 8043 * Run the perf reboot notifier at the very last possible moment so that 8044 * the generic watchdog code runs as long as possible. 8045 */ 8046 static struct notifier_block perf_reboot_notifier = { 8047 .notifier_call = perf_reboot, 8048 .priority = INT_MIN, 8049 }; 8050 8051 static int 8052 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) 8053 { 8054 unsigned int cpu = (long)hcpu; 8055 8056 switch (action & ~CPU_TASKS_FROZEN) { 8057 8058 case CPU_UP_PREPARE: 8059 case CPU_DOWN_FAILED: 8060 perf_event_init_cpu(cpu); 8061 break; 8062 8063 case CPU_UP_CANCELED: 8064 case CPU_DOWN_PREPARE: 8065 perf_event_exit_cpu(cpu); 8066 break; 8067 default: 8068 break; 8069 } 8070 8071 return NOTIFY_OK; 8072 } 8073 8074 void __init perf_event_init(void) 8075 { 8076 int ret; 8077 8078 idr_init(&pmu_idr); 8079 8080 perf_event_init_all_cpus(); 8081 init_srcu_struct(&pmus_srcu); 8082 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); 8083 perf_pmu_register(&perf_cpu_clock, NULL, -1); 8084 perf_pmu_register(&perf_task_clock, NULL, -1); 8085 perf_tp_register(); 8086 perf_cpu_notifier(perf_cpu_notify); 8087 register_reboot_notifier(&perf_reboot_notifier); 8088 8089 ret = init_hw_breakpoint(); 8090 WARN(ret, "hw_breakpoint initialization failed with: %d", ret); 8091 8092 /* do not patch jump label more than once per second */ 8093 jump_label_rate_limit(&perf_sched_events, HZ); 8094 8095 /* 8096 * Build time assertion that we keep the data_head at the intended 8097 * location. IOW, validation we got the __reserved[] size right. 8098 */ 8099 BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head)) 8100 != 1024); 8101 } 8102 8103 static int __init perf_event_sysfs_init(void) 8104 { 8105 struct pmu *pmu; 8106 int ret; 8107 8108 mutex_lock(&pmus_lock); 8109 8110 ret = bus_register(&pmu_bus); 8111 if (ret) 8112 goto unlock; 8113 8114 list_for_each_entry(pmu, &pmus, entry) { 8115 if (!pmu->name || pmu->type < 0) 8116 continue; 8117 8118 ret = pmu_dev_alloc(pmu); 8119 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); 8120 } 8121 pmu_bus_running = 1; 8122 ret = 0; 8123 8124 unlock: 8125 mutex_unlock(&pmus_lock); 8126 8127 return ret; 8128 } 8129 device_initcall(perf_event_sysfs_init); 8130 8131 #ifdef CONFIG_CGROUP_PERF 8132 static struct cgroup_subsys_state * 8133 perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 8134 { 8135 struct perf_cgroup *jc; 8136 8137 jc = kzalloc(sizeof(*jc), GFP_KERNEL); 8138 if (!jc) 8139 return ERR_PTR(-ENOMEM); 8140 8141 jc->info = alloc_percpu(struct perf_cgroup_info); 8142 if (!jc->info) { 8143 kfree(jc); 8144 return ERR_PTR(-ENOMEM); 8145 } 8146 8147 return &jc->css; 8148 } 8149 8150 static void perf_cgroup_css_free(struct cgroup_subsys_state *css) 8151 { 8152 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css); 8153 8154 free_percpu(jc->info); 8155 kfree(jc); 8156 } 8157 8158 static int __perf_cgroup_move(void *info) 8159 { 8160 struct task_struct *task = info; 8161 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); 8162 return 0; 8163 } 8164 8165 static void perf_cgroup_attach(struct cgroup_subsys_state *css, 8166 struct cgroup_taskset *tset) 8167 { 8168 struct task_struct *task; 8169 8170 cgroup_taskset_for_each(task, tset) 8171 task_function_call(task, __perf_cgroup_move, task); 8172 } 8173 8174 static void perf_cgroup_exit(struct cgroup_subsys_state *css, 8175 struct cgroup_subsys_state *old_css, 8176 struct task_struct 
*task) 8177 { 8178 /* 8179 * cgroup_exit() is called in the copy_process() failure path. 8180 * Ignore this case since the task hasn't run yet; this avoids 8181 * trying to poke a half-freed task state from generic code. 8182 */ 8183 if (!(task->flags & PF_EXITING)) 8184 return; 8185 8186 task_function_call(task, __perf_cgroup_move, task); 8187 } 8188 8189 struct cgroup_subsys perf_event_cgrp_subsys = { 8190 .css_alloc = perf_cgroup_css_alloc, 8191 .css_free = perf_cgroup_css_free, 8192 .exit = perf_cgroup_exit, 8193 .attach = perf_cgroup_attach, 8194 }; 8195 #endif /* CONFIG_CGROUP_PERF */ 8196
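/*
 * Minimal sketch of how an out-of-tree driver might use the
 * perf_pmu_register()/perf_pmu_unregister() interface exported above.
 * All example_* names below are hypothetical and the callbacks are
 * empty stubs; a real driver would program its hardware counters in
 * the add/start/stop/read methods.
 */
static int example_event_init(struct perf_event *event)
{
	/* only claim events created against our dynamically allocated type */
	if (event->attr.type != event->pmu->type)
		return -ENOENT;
	return 0;
}

static int example_event_add(struct perf_event *event, int flags)
{
	return 0;
}

static void example_event_del(struct perf_event *event, int flags)
{
}

static void example_event_start(struct perf_event *event, int flags)
{
}

static void example_event_stop(struct perf_event *event, int flags)
{
}

static void example_event_read(struct perf_event *event)
{
}

static struct pmu example_pmu = {
	.task_ctx_nr	= perf_sw_context,
	.event_init	= example_event_init,
	.add		= example_event_add,
	.del		= example_event_del,
	.start		= example_event_start,
	.stop		= example_event_stop,
	.read		= example_event_read,
};

/*
 * Registration would typically happen from a module init path, with the
 * matching unregister on module exit:
 *
 *	ret = perf_pmu_register(&example_pmu, "example", -1);
 *	...
 *	perf_pmu_unregister(&example_pmu);
 *
 * Passing -1 as the type asks perf_pmu_register() to allocate a dynamic
 * type id from pmu_idr; user space can then read that id back from
 * /sys/bus/event_source/devices/example/type.
 */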