1 /* 2 * Performance events x86 architecture code 3 * 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar 6 * Copyright (C) 2009 Jaswinder Singh Rajput 7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter 8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra 9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> 10 * Copyright (C) 2009 Google, Inc., Stephane Eranian 11 * 12 * For licencing details see kernel-base/COPYING 13 */ 14 15 #include <linux/perf_event.h> 16 #include <linux/capability.h> 17 #include <linux/notifier.h> 18 #include <linux/hardirq.h> 19 #include <linux/kprobes.h> 20 #include <linux/export.h> 21 #include <linux/init.h> 22 #include <linux/kdebug.h> 23 #include <linux/sched/mm.h> 24 #include <linux/sched/clock.h> 25 #include <linux/uaccess.h> 26 #include <linux/slab.h> 27 #include <linux/cpu.h> 28 #include <linux/bitops.h> 29 #include <linux/device.h> 30 31 #include <asm/apic.h> 32 #include <asm/stacktrace.h> 33 #include <asm/nmi.h> 34 #include <asm/smp.h> 35 #include <asm/alternative.h> 36 #include <asm/mmu_context.h> 37 #include <asm/tlbflush.h> 38 #include <asm/timer.h> 39 #include <asm/desc.h> 40 #include <asm/ldt.h> 41 #include <asm/unwind.h> 42 43 #include "perf_event.h" 44 45 struct x86_pmu x86_pmu __read_mostly; 46 47 DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { 48 .enabled = 1, 49 }; 50 51 struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE; 52 53 u64 __read_mostly hw_cache_event_ids 54 [PERF_COUNT_HW_CACHE_MAX] 55 [PERF_COUNT_HW_CACHE_OP_MAX] 56 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 57 u64 __read_mostly hw_cache_extra_regs 58 [PERF_COUNT_HW_CACHE_MAX] 59 [PERF_COUNT_HW_CACHE_OP_MAX] 60 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 61 62 /* 63 * Propagate event elapsed time into the generic event. 64 * Can only be executed on the CPU where the event is active. 65 * Returns the delta events processed. 66 */ 67 u64 x86_perf_event_update(struct perf_event *event) 68 { 69 struct hw_perf_event *hwc = &event->hw; 70 int shift = 64 - x86_pmu.cntval_bits; 71 u64 prev_raw_count, new_raw_count; 72 int idx = hwc->idx; 73 u64 delta; 74 75 if (idx == INTEL_PMC_IDX_FIXED_BTS) 76 return 0; 77 78 /* 79 * Careful: an NMI might modify the previous event value. 80 * 81 * Our tactic to handle this is to first atomically read and 82 * exchange a new raw count - then add that new-prev delta 83 * count to the generic event atomically: 84 */ 85 again: 86 prev_raw_count = local64_read(&hwc->prev_count); 87 rdpmcl(hwc->event_base_rdpmc, new_raw_count); 88 89 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, 90 new_raw_count) != prev_raw_count) 91 goto again; 92 93 /* 94 * Now we have the new raw value and have updated the prev 95 * timestamp already. We can now calculate the elapsed delta 96 * (event-)time and add that to the generic event. 97 * 98 * Careful, not all hw sign-extends above the physical width 99 * of the count. 100 */ 101 delta = (new_raw_count << shift) - (prev_raw_count << shift); 102 delta >>= shift; 103 104 local64_add(delta, &event->count); 105 local64_sub(delta, &hwc->period_left); 106 107 return new_raw_count; 108 } 109 110 /* 111 * Find and validate any extra registers to set up. 112 */ 113 static int x86_pmu_extra_regs(u64 config, struct perf_event *event) 114 { 115 struct hw_perf_event_extra *reg; 116 struct extra_reg *er; 117 118 reg = &event->hw.extra_reg; 119 120 if (!x86_pmu.extra_regs) 121 return 0; 122 123 for (er = x86_pmu.extra_regs; er->msr; er++) { 124 if (er->event != (config & er->config_mask)) 125 continue; 126 if (event->attr.config1 & ~er->valid_mask) 127 return -EINVAL; 128 /* Check if the extra msrs can be safely accessed*/ 129 if (!er->extra_msr_access) 130 return -ENXIO; 131 132 reg->idx = er->idx; 133 reg->config = event->attr.config1; 134 reg->reg = er->msr; 135 break; 136 } 137 return 0; 138 } 139 140 static atomic_t active_events; 141 static atomic_t pmc_refcount; 142 static DEFINE_MUTEX(pmc_reserve_mutex); 143 144 #ifdef CONFIG_X86_LOCAL_APIC 145 146 static bool reserve_pmc_hardware(void) 147 { 148 int i; 149 150 for (i = 0; i < x86_pmu.num_counters; i++) { 151 if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) 152 goto perfctr_fail; 153 } 154 155 for (i = 0; i < x86_pmu.num_counters; i++) { 156 if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) 157 goto eventsel_fail; 158 } 159 160 return true; 161 162 eventsel_fail: 163 for (i--; i >= 0; i--) 164 release_evntsel_nmi(x86_pmu_config_addr(i)); 165 166 i = x86_pmu.num_counters; 167 168 perfctr_fail: 169 for (i--; i >= 0; i--) 170 release_perfctr_nmi(x86_pmu_event_addr(i)); 171 172 return false; 173 } 174 175 static void release_pmc_hardware(void) 176 { 177 int i; 178 179 for (i = 0; i < x86_pmu.num_counters; i++) { 180 release_perfctr_nmi(x86_pmu_event_addr(i)); 181 release_evntsel_nmi(x86_pmu_config_addr(i)); 182 } 183 } 184 185 #else 186 187 static bool reserve_pmc_hardware(void) { return true; } 188 static void release_pmc_hardware(void) {} 189 190 #endif 191 192 static bool check_hw_exists(void) 193 { 194 u64 val, val_fail, val_new= ~0; 195 int i, reg, reg_fail, ret = 0; 196 int bios_fail = 0; 197 int reg_safe = -1; 198 199 /* 200 * Check to see if the BIOS enabled any of the counters, if so 201 * complain and bail. 202 */ 203 for (i = 0; i < x86_pmu.num_counters; i++) { 204 reg = x86_pmu_config_addr(i); 205 ret = rdmsrl_safe(reg, &val); 206 if (ret) 207 goto msr_fail; 208 if (val & ARCH_PERFMON_EVENTSEL_ENABLE) { 209 bios_fail = 1; 210 val_fail = val; 211 reg_fail = reg; 212 } else { 213 reg_safe = i; 214 } 215 } 216 217 if (x86_pmu.num_counters_fixed) { 218 reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 219 ret = rdmsrl_safe(reg, &val); 220 if (ret) 221 goto msr_fail; 222 for (i = 0; i < x86_pmu.num_counters_fixed; i++) { 223 if (val & (0x03 << i*4)) { 224 bios_fail = 1; 225 val_fail = val; 226 reg_fail = reg; 227 } 228 } 229 } 230 231 /* 232 * If all the counters are enabled, the below test will always 233 * fail. The tools will also become useless in this scenario. 234 * Just fail and disable the hardware counters. 235 */ 236 237 if (reg_safe == -1) { 238 reg = reg_safe; 239 goto msr_fail; 240 } 241 242 /* 243 * Read the current value, change it and read it back to see if it 244 * matches, this is needed to detect certain hardware emulators 245 * (qemu/kvm) that don't trap on the MSR access and always return 0s. 246 */ 247 reg = x86_pmu_event_addr(reg_safe); 248 if (rdmsrl_safe(reg, &val)) 249 goto msr_fail; 250 val ^= 0xffffUL; 251 ret = wrmsrl_safe(reg, val); 252 ret |= rdmsrl_safe(reg, &val_new); 253 if (ret || val != val_new) 254 goto msr_fail; 255 256 /* 257 * We still allow the PMU driver to operate: 258 */ 259 if (bios_fail) { 260 pr_cont("Broken BIOS detected, complain to your hardware vendor.\n"); 261 pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", 262 reg_fail, val_fail); 263 } 264 265 return true; 266 267 msr_fail: 268 if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { 269 pr_cont("PMU not available due to virtualization, using software events only.\n"); 270 } else { 271 pr_cont("Broken PMU hardware detected, using software events only.\n"); 272 pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n", 273 reg, val_new); 274 } 275 276 return false; 277 } 278 279 static void hw_perf_event_destroy(struct perf_event *event) 280 { 281 x86_release_hardware(); 282 atomic_dec(&active_events); 283 } 284 285 void hw_perf_lbr_event_destroy(struct perf_event *event) 286 { 287 hw_perf_event_destroy(event); 288 289 /* undo the lbr/bts event accounting */ 290 x86_del_exclusive(x86_lbr_exclusive_lbr); 291 } 292 293 static inline int x86_pmu_initialized(void) 294 { 295 return x86_pmu.handle_irq != NULL; 296 } 297 298 static inline int 299 set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) 300 { 301 struct perf_event_attr *attr = &event->attr; 302 unsigned int cache_type, cache_op, cache_result; 303 u64 config, val; 304 305 config = attr->config; 306 307 cache_type = (config >> 0) & 0xff; 308 if (cache_type >= PERF_COUNT_HW_CACHE_MAX) 309 return -EINVAL; 310 311 cache_op = (config >> 8) & 0xff; 312 if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) 313 return -EINVAL; 314 315 cache_result = (config >> 16) & 0xff; 316 if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) 317 return -EINVAL; 318 319 val = hw_cache_event_ids[cache_type][cache_op][cache_result]; 320 321 if (val == 0) 322 return -ENOENT; 323 324 if (val == -1) 325 return -EINVAL; 326 327 hwc->config |= val; 328 attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result]; 329 return x86_pmu_extra_regs(val, event); 330 } 331 332 int x86_reserve_hardware(void) 333 { 334 int err = 0; 335 336 if (!atomic_inc_not_zero(&pmc_refcount)) { 337 mutex_lock(&pmc_reserve_mutex); 338 if (atomic_read(&pmc_refcount) == 0) { 339 if (!reserve_pmc_hardware()) 340 err = -EBUSY; 341 else 342 reserve_ds_buffers(); 343 } 344 if (!err) 345 atomic_inc(&pmc_refcount); 346 mutex_unlock(&pmc_reserve_mutex); 347 } 348 349 return err; 350 } 351 352 void x86_release_hardware(void) 353 { 354 if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { 355 release_pmc_hardware(); 356 release_ds_buffers(); 357 mutex_unlock(&pmc_reserve_mutex); 358 } 359 } 360 361 /* 362 * Check if we can create event of a certain type (that no conflicting events 363 * are present). 364 */ 365 int x86_add_exclusive(unsigned int what) 366 { 367 int i; 368 369 /* 370 * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS. 371 * LBR and BTS are still mutually exclusive. 372 */ 373 if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) 374 return 0; 375 376 if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { 377 mutex_lock(&pmc_reserve_mutex); 378 for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { 379 if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) 380 goto fail_unlock; 381 } 382 atomic_inc(&x86_pmu.lbr_exclusive[what]); 383 mutex_unlock(&pmc_reserve_mutex); 384 } 385 386 atomic_inc(&active_events); 387 return 0; 388 389 fail_unlock: 390 mutex_unlock(&pmc_reserve_mutex); 391 return -EBUSY; 392 } 393 394 void x86_del_exclusive(unsigned int what) 395 { 396 if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) 397 return; 398 399 atomic_dec(&x86_pmu.lbr_exclusive[what]); 400 atomic_dec(&active_events); 401 } 402 403 int x86_setup_perfctr(struct perf_event *event) 404 { 405 struct perf_event_attr *attr = &event->attr; 406 struct hw_perf_event *hwc = &event->hw; 407 u64 config; 408 409 if (!is_sampling_event(event)) { 410 hwc->sample_period = x86_pmu.max_period; 411 hwc->last_period = hwc->sample_period; 412 local64_set(&hwc->period_left, hwc->sample_period); 413 } 414 415 if (attr->type == PERF_TYPE_RAW) 416 return x86_pmu_extra_regs(event->attr.config, event); 417 418 if (attr->type == PERF_TYPE_HW_CACHE) 419 return set_ext_hw_attr(hwc, event); 420 421 if (attr->config >= x86_pmu.max_events) 422 return -EINVAL; 423 424 /* 425 * The generic map: 426 */ 427 config = x86_pmu.event_map(attr->config); 428 429 if (config == 0) 430 return -ENOENT; 431 432 if (config == -1LL) 433 return -EINVAL; 434 435 /* 436 * Branch tracing: 437 */ 438 if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && 439 !attr->freq && hwc->sample_period == 1) { 440 /* BTS is not supported by this architecture. */ 441 if (!x86_pmu.bts_active) 442 return -EOPNOTSUPP; 443 444 /* BTS is currently only allowed for user-mode. */ 445 if (!attr->exclude_kernel) 446 return -EOPNOTSUPP; 447 448 /* disallow bts if conflicting events are present */ 449 if (x86_add_exclusive(x86_lbr_exclusive_lbr)) 450 return -EBUSY; 451 452 event->destroy = hw_perf_lbr_event_destroy; 453 } 454 455 hwc->config |= config; 456 457 return 0; 458 } 459 460 /* 461 * check that branch_sample_type is compatible with 462 * settings needed for precise_ip > 1 which implies 463 * using the LBR to capture ALL taken branches at the 464 * priv levels of the measurement 465 */ 466 static inline int precise_br_compat(struct perf_event *event) 467 { 468 u64 m = event->attr.branch_sample_type; 469 u64 b = 0; 470 471 /* must capture all branches */ 472 if (!(m & PERF_SAMPLE_BRANCH_ANY)) 473 return 0; 474 475 m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; 476 477 if (!event->attr.exclude_user) 478 b |= PERF_SAMPLE_BRANCH_USER; 479 480 if (!event->attr.exclude_kernel) 481 b |= PERF_SAMPLE_BRANCH_KERNEL; 482 483 /* 484 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 485 */ 486 487 return m == b; 488 } 489 490 int x86_pmu_hw_config(struct perf_event *event) 491 { 492 if (event->attr.precise_ip) { 493 int precise = 0; 494 495 /* Support for constant skid */ 496 if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { 497 precise++; 498 499 /* Support for IP fixup */ 500 if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) 501 precise++; 502 503 if (x86_pmu.pebs_prec_dist) 504 precise++; 505 } 506 507 if (event->attr.precise_ip > precise) 508 return -EOPNOTSUPP; 509 510 /* There's no sense in having PEBS for non sampling events: */ 511 if (!is_sampling_event(event)) 512 return -EINVAL; 513 } 514 /* 515 * check that PEBS LBR correction does not conflict with 516 * whatever the user is asking with attr->branch_sample_type 517 */ 518 if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) { 519 u64 *br_type = &event->attr.branch_sample_type; 520 521 if (has_branch_stack(event)) { 522 if (!precise_br_compat(event)) 523 return -EOPNOTSUPP; 524 525 /* branch_sample_type is compatible */ 526 527 } else { 528 /* 529 * user did not specify branch_sample_type 530 * 531 * For PEBS fixups, we capture all 532 * the branches at the priv level of the 533 * event. 534 */ 535 *br_type = PERF_SAMPLE_BRANCH_ANY; 536 537 if (!event->attr.exclude_user) 538 *br_type |= PERF_SAMPLE_BRANCH_USER; 539 540 if (!event->attr.exclude_kernel) 541 *br_type |= PERF_SAMPLE_BRANCH_KERNEL; 542 } 543 } 544 545 if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK) 546 event->attach_state |= PERF_ATTACH_TASK_DATA; 547 548 /* 549 * Generate PMC IRQs: 550 * (keep 'enabled' bit clear for now) 551 */ 552 event->hw.config = ARCH_PERFMON_EVENTSEL_INT; 553 554 /* 555 * Count user and OS events unless requested not to 556 */ 557 if (!event->attr.exclude_user) 558 event->hw.config |= ARCH_PERFMON_EVENTSEL_USR; 559 if (!event->attr.exclude_kernel) 560 event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; 561 562 if (event->attr.type == PERF_TYPE_RAW) 563 event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; 564 565 if (event->attr.sample_period && x86_pmu.limit_period) { 566 if (x86_pmu.limit_period(event, event->attr.sample_period) > 567 event->attr.sample_period) 568 return -EINVAL; 569 } 570 571 return x86_setup_perfctr(event); 572 } 573 574 /* 575 * Setup the hardware configuration for a given attr_type 576 */ 577 static int __x86_pmu_event_init(struct perf_event *event) 578 { 579 int err; 580 581 if (!x86_pmu_initialized()) 582 return -ENODEV; 583 584 err = x86_reserve_hardware(); 585 if (err) 586 return err; 587 588 atomic_inc(&active_events); 589 event->destroy = hw_perf_event_destroy; 590 591 event->hw.idx = -1; 592 event->hw.last_cpu = -1; 593 event->hw.last_tag = ~0ULL; 594 595 /* mark unused */ 596 event->hw.extra_reg.idx = EXTRA_REG_NONE; 597 event->hw.branch_reg.idx = EXTRA_REG_NONE; 598 599 return x86_pmu.hw_config(event); 600 } 601 602 void x86_pmu_disable_all(void) 603 { 604 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 605 int idx; 606 607 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 608 u64 val; 609 610 if (!test_bit(idx, cpuc->active_mask)) 611 continue; 612 rdmsrl(x86_pmu_config_addr(idx), val); 613 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) 614 continue; 615 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; 616 wrmsrl(x86_pmu_config_addr(idx), val); 617 } 618 } 619 620 /* 621 * There may be PMI landing after enabled=0. The PMI hitting could be before or 622 * after disable_all. 623 * 624 * If PMI hits before disable_all, the PMU will be disabled in the NMI handler. 625 * It will not be re-enabled in the NMI handler again, because enabled=0. After 626 * handling the NMI, disable_all will be called, which will not change the 627 * state either. If PMI hits after disable_all, the PMU is already disabled 628 * before entering NMI handler. The NMI handler will not change the state 629 * either. 630 * 631 * So either situation is harmless. 632 */ 633 static void x86_pmu_disable(struct pmu *pmu) 634 { 635 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 636 637 if (!x86_pmu_initialized()) 638 return; 639 640 if (!cpuc->enabled) 641 return; 642 643 cpuc->n_added = 0; 644 cpuc->enabled = 0; 645 barrier(); 646 647 x86_pmu.disable_all(); 648 } 649 650 void x86_pmu_enable_all(int added) 651 { 652 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 653 int idx; 654 655 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 656 struct hw_perf_event *hwc = &cpuc->events[idx]->hw; 657 658 if (!test_bit(idx, cpuc->active_mask)) 659 continue; 660 661 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); 662 } 663 } 664 665 static struct pmu pmu; 666 667 static inline int is_x86_event(struct perf_event *event) 668 { 669 return event->pmu == &pmu; 670 } 671 672 /* 673 * Event scheduler state: 674 * 675 * Assign events iterating over all events and counters, beginning 676 * with events with least weights first. Keep the current iterator 677 * state in struct sched_state. 678 */ 679 struct sched_state { 680 int weight; 681 int event; /* event index */ 682 int counter; /* counter index */ 683 int unassigned; /* number of events to be assigned left */ 684 int nr_gp; /* number of GP counters used */ 685 unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 686 }; 687 688 /* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */ 689 #define SCHED_STATES_MAX 2 690 691 struct perf_sched { 692 int max_weight; 693 int max_events; 694 int max_gp; 695 int saved_states; 696 struct event_constraint **constraints; 697 struct sched_state state; 698 struct sched_state saved[SCHED_STATES_MAX]; 699 }; 700 701 /* 702 * Initialize interator that runs through all events and counters. 703 */ 704 static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, 705 int num, int wmin, int wmax, int gpmax) 706 { 707 int idx; 708 709 memset(sched, 0, sizeof(*sched)); 710 sched->max_events = num; 711 sched->max_weight = wmax; 712 sched->max_gp = gpmax; 713 sched->constraints = constraints; 714 715 for (idx = 0; idx < num; idx++) { 716 if (constraints[idx]->weight == wmin) 717 break; 718 } 719 720 sched->state.event = idx; /* start with min weight */ 721 sched->state.weight = wmin; 722 sched->state.unassigned = num; 723 } 724 725 static void perf_sched_save_state(struct perf_sched *sched) 726 { 727 if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) 728 return; 729 730 sched->saved[sched->saved_states] = sched->state; 731 sched->saved_states++; 732 } 733 734 static bool perf_sched_restore_state(struct perf_sched *sched) 735 { 736 if (!sched->saved_states) 737 return false; 738 739 sched->saved_states--; 740 sched->state = sched->saved[sched->saved_states]; 741 742 /* continue with next counter: */ 743 clear_bit(sched->state.counter++, sched->state.used); 744 745 return true; 746 } 747 748 /* 749 * Select a counter for the current event to schedule. Return true on 750 * success. 751 */ 752 static bool __perf_sched_find_counter(struct perf_sched *sched) 753 { 754 struct event_constraint *c; 755 int idx; 756 757 if (!sched->state.unassigned) 758 return false; 759 760 if (sched->state.event >= sched->max_events) 761 return false; 762 763 c = sched->constraints[sched->state.event]; 764 /* Prefer fixed purpose counters */ 765 if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { 766 idx = INTEL_PMC_IDX_FIXED; 767 for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { 768 if (!__test_and_set_bit(idx, sched->state.used)) 769 goto done; 770 } 771 } 772 773 /* Grab the first unused counter starting with idx */ 774 idx = sched->state.counter; 775 for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { 776 if (!__test_and_set_bit(idx, sched->state.used)) { 777 if (sched->state.nr_gp++ >= sched->max_gp) 778 return false; 779 780 goto done; 781 } 782 } 783 784 return false; 785 786 done: 787 sched->state.counter = idx; 788 789 if (c->overlap) 790 perf_sched_save_state(sched); 791 792 return true; 793 } 794 795 static bool perf_sched_find_counter(struct perf_sched *sched) 796 { 797 while (!__perf_sched_find_counter(sched)) { 798 if (!perf_sched_restore_state(sched)) 799 return false; 800 } 801 802 return true; 803 } 804 805 /* 806 * Go through all unassigned events and find the next one to schedule. 807 * Take events with the least weight first. Return true on success. 808 */ 809 static bool perf_sched_next_event(struct perf_sched *sched) 810 { 811 struct event_constraint *c; 812 813 if (!sched->state.unassigned || !--sched->state.unassigned) 814 return false; 815 816 do { 817 /* next event */ 818 sched->state.event++; 819 if (sched->state.event >= sched->max_events) { 820 /* next weight */ 821 sched->state.event = 0; 822 sched->state.weight++; 823 if (sched->state.weight > sched->max_weight) 824 return false; 825 } 826 c = sched->constraints[sched->state.event]; 827 } while (c->weight != sched->state.weight); 828 829 sched->state.counter = 0; /* start with first counter */ 830 831 return true; 832 } 833 834 /* 835 * Assign a counter for each event. 836 */ 837 int perf_assign_events(struct event_constraint **constraints, int n, 838 int wmin, int wmax, int gpmax, int *assign) 839 { 840 struct perf_sched sched; 841 842 perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax); 843 844 do { 845 if (!perf_sched_find_counter(&sched)) 846 break; /* failed */ 847 if (assign) 848 assign[sched.state.event] = sched.state.counter; 849 } while (perf_sched_next_event(&sched)); 850 851 return sched.state.unassigned; 852 } 853 EXPORT_SYMBOL_GPL(perf_assign_events); 854 855 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) 856 { 857 struct event_constraint *c; 858 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 859 struct perf_event *e; 860 int i, wmin, wmax, unsched = 0; 861 struct hw_perf_event *hwc; 862 863 bitmap_zero(used_mask, X86_PMC_IDX_MAX); 864 865 if (x86_pmu.start_scheduling) 866 x86_pmu.start_scheduling(cpuc); 867 868 for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { 869 cpuc->event_constraint[i] = NULL; 870 c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); 871 cpuc->event_constraint[i] = c; 872 873 wmin = min(wmin, c->weight); 874 wmax = max(wmax, c->weight); 875 } 876 877 /* 878 * fastpath, try to reuse previous register 879 */ 880 for (i = 0; i < n; i++) { 881 hwc = &cpuc->event_list[i]->hw; 882 c = cpuc->event_constraint[i]; 883 884 /* never assigned */ 885 if (hwc->idx == -1) 886 break; 887 888 /* constraint still honored */ 889 if (!test_bit(hwc->idx, c->idxmsk)) 890 break; 891 892 /* not already used */ 893 if (test_bit(hwc->idx, used_mask)) 894 break; 895 896 __set_bit(hwc->idx, used_mask); 897 if (assign) 898 assign[i] = hwc->idx; 899 } 900 901 /* slow path */ 902 if (i != n) { 903 int gpmax = x86_pmu.num_counters; 904 905 /* 906 * Do not allow scheduling of more than half the available 907 * generic counters. 908 * 909 * This helps avoid counter starvation of sibling thread by 910 * ensuring at most half the counters cannot be in exclusive 911 * mode. There is no designated counters for the limits. Any 912 * N/2 counters can be used. This helps with events with 913 * specific counter constraints. 914 */ 915 if (is_ht_workaround_enabled() && !cpuc->is_fake && 916 READ_ONCE(cpuc->excl_cntrs->exclusive_present)) 917 gpmax /= 2; 918 919 unsched = perf_assign_events(cpuc->event_constraint, n, wmin, 920 wmax, gpmax, assign); 921 } 922 923 /* 924 * In case of success (unsched = 0), mark events as committed, 925 * so we do not put_constraint() in case new events are added 926 * and fail to be scheduled 927 * 928 * We invoke the lower level commit callback to lock the resource 929 * 930 * We do not need to do all of this in case we are called to 931 * validate an event group (assign == NULL) 932 */ 933 if (!unsched && assign) { 934 for (i = 0; i < n; i++) { 935 e = cpuc->event_list[i]; 936 e->hw.flags |= PERF_X86_EVENT_COMMITTED; 937 if (x86_pmu.commit_scheduling) 938 x86_pmu.commit_scheduling(cpuc, i, assign[i]); 939 } 940 } else { 941 for (i = 0; i < n; i++) { 942 e = cpuc->event_list[i]; 943 /* 944 * do not put_constraint() on comitted events, 945 * because they are good to go 946 */ 947 if ((e->hw.flags & PERF_X86_EVENT_COMMITTED)) 948 continue; 949 950 /* 951 * release events that failed scheduling 952 */ 953 if (x86_pmu.put_event_constraints) 954 x86_pmu.put_event_constraints(cpuc, e); 955 } 956 } 957 958 if (x86_pmu.stop_scheduling) 959 x86_pmu.stop_scheduling(cpuc); 960 961 return unsched ? -EINVAL : 0; 962 } 963 964 /* 965 * dogrp: true if must collect siblings events (group) 966 * returns total number of events and error code 967 */ 968 static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) 969 { 970 struct perf_event *event; 971 int n, max_count; 972 973 max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed; 974 975 /* current number of events already accepted */ 976 n = cpuc->n_events; 977 978 if (is_x86_event(leader)) { 979 if (n >= max_count) 980 return -EINVAL; 981 cpuc->event_list[n] = leader; 982 n++; 983 } 984 if (!dogrp) 985 return n; 986 987 list_for_each_entry(event, &leader->sibling_list, group_entry) { 988 if (!is_x86_event(event) || 989 event->state <= PERF_EVENT_STATE_OFF) 990 continue; 991 992 if (n >= max_count) 993 return -EINVAL; 994 995 cpuc->event_list[n] = event; 996 n++; 997 } 998 return n; 999 } 1000 1001 static inline void x86_assign_hw_event(struct perf_event *event, 1002 struct cpu_hw_events *cpuc, int i) 1003 { 1004 struct hw_perf_event *hwc = &event->hw; 1005 1006 hwc->idx = cpuc->assign[i]; 1007 hwc->last_cpu = smp_processor_id(); 1008 hwc->last_tag = ++cpuc->tags[i]; 1009 1010 if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) { 1011 hwc->config_base = 0; 1012 hwc->event_base = 0; 1013 } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) { 1014 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 1015 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); 1016 hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30; 1017 } else { 1018 hwc->config_base = x86_pmu_config_addr(hwc->idx); 1019 hwc->event_base = x86_pmu_event_addr(hwc->idx); 1020 hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); 1021 } 1022 } 1023 1024 static inline int match_prev_assignment(struct hw_perf_event *hwc, 1025 struct cpu_hw_events *cpuc, 1026 int i) 1027 { 1028 return hwc->idx == cpuc->assign[i] && 1029 hwc->last_cpu == smp_processor_id() && 1030 hwc->last_tag == cpuc->tags[i]; 1031 } 1032 1033 static void x86_pmu_start(struct perf_event *event, int flags); 1034 1035 static void x86_pmu_enable(struct pmu *pmu) 1036 { 1037 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1038 struct perf_event *event; 1039 struct hw_perf_event *hwc; 1040 int i, added = cpuc->n_added; 1041 1042 if (!x86_pmu_initialized()) 1043 return; 1044 1045 if (cpuc->enabled) 1046 return; 1047 1048 if (cpuc->n_added) { 1049 int n_running = cpuc->n_events - cpuc->n_added; 1050 /* 1051 * apply assignment obtained either from 1052 * hw_perf_group_sched_in() or x86_pmu_enable() 1053 * 1054 * step1: save events moving to new counters 1055 */ 1056 for (i = 0; i < n_running; i++) { 1057 event = cpuc->event_list[i]; 1058 hwc = &event->hw; 1059 1060 /* 1061 * we can avoid reprogramming counter if: 1062 * - assigned same counter as last time 1063 * - running on same CPU as last time 1064 * - no other event has used the counter since 1065 */ 1066 if (hwc->idx == -1 || 1067 match_prev_assignment(hwc, cpuc, i)) 1068 continue; 1069 1070 /* 1071 * Ensure we don't accidentally enable a stopped 1072 * counter simply because we rescheduled. 1073 */ 1074 if (hwc->state & PERF_HES_STOPPED) 1075 hwc->state |= PERF_HES_ARCH; 1076 1077 x86_pmu_stop(event, PERF_EF_UPDATE); 1078 } 1079 1080 /* 1081 * step2: reprogram moved events into new counters 1082 */ 1083 for (i = 0; i < cpuc->n_events; i++) { 1084 event = cpuc->event_list[i]; 1085 hwc = &event->hw; 1086 1087 if (!match_prev_assignment(hwc, cpuc, i)) 1088 x86_assign_hw_event(event, cpuc, i); 1089 else if (i < n_running) 1090 continue; 1091 1092 if (hwc->state & PERF_HES_ARCH) 1093 continue; 1094 1095 x86_pmu_start(event, PERF_EF_RELOAD); 1096 } 1097 cpuc->n_added = 0; 1098 perf_events_lapic_init(); 1099 } 1100 1101 cpuc->enabled = 1; 1102 barrier(); 1103 1104 x86_pmu.enable_all(added); 1105 } 1106 1107 static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 1108 1109 /* 1110 * Set the next IRQ period, based on the hwc->period_left value. 1111 * To be called with the event disabled in hw: 1112 */ 1113 int x86_perf_event_set_period(struct perf_event *event) 1114 { 1115 struct hw_perf_event *hwc = &event->hw; 1116 s64 left = local64_read(&hwc->period_left); 1117 s64 period = hwc->sample_period; 1118 int ret = 0, idx = hwc->idx; 1119 1120 if (idx == INTEL_PMC_IDX_FIXED_BTS) 1121 return 0; 1122 1123 /* 1124 * If we are way outside a reasonable range then just skip forward: 1125 */ 1126 if (unlikely(left <= -period)) { 1127 left = period; 1128 local64_set(&hwc->period_left, left); 1129 hwc->last_period = period; 1130 ret = 1; 1131 } 1132 1133 if (unlikely(left <= 0)) { 1134 left += period; 1135 local64_set(&hwc->period_left, left); 1136 hwc->last_period = period; 1137 ret = 1; 1138 } 1139 /* 1140 * Quirk: certain CPUs dont like it if just 1 hw_event is left: 1141 */ 1142 if (unlikely(left < 2)) 1143 left = 2; 1144 1145 if (left > x86_pmu.max_period) 1146 left = x86_pmu.max_period; 1147 1148 if (x86_pmu.limit_period) 1149 left = x86_pmu.limit_period(event, left); 1150 1151 per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; 1152 1153 if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) || 1154 local64_read(&hwc->prev_count) != (u64)-left) { 1155 /* 1156 * The hw event starts counting from this event offset, 1157 * mark it to be able to extra future deltas: 1158 */ 1159 local64_set(&hwc->prev_count, (u64)-left); 1160 1161 wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); 1162 } 1163 1164 /* 1165 * Due to erratum on certan cpu we need 1166 * a second write to be sure the register 1167 * is updated properly 1168 */ 1169 if (x86_pmu.perfctr_second_write) { 1170 wrmsrl(hwc->event_base, 1171 (u64)(-left) & x86_pmu.cntval_mask); 1172 } 1173 1174 perf_event_update_userpage(event); 1175 1176 return ret; 1177 } 1178 1179 void x86_pmu_enable_event(struct perf_event *event) 1180 { 1181 if (__this_cpu_read(cpu_hw_events.enabled)) 1182 __x86_pmu_enable_event(&event->hw, 1183 ARCH_PERFMON_EVENTSEL_ENABLE); 1184 } 1185 1186 /* 1187 * Add a single event to the PMU. 1188 * 1189 * The event is added to the group of enabled events 1190 * but only if it can be scehduled with existing events. 1191 */ 1192 static int x86_pmu_add(struct perf_event *event, int flags) 1193 { 1194 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1195 struct hw_perf_event *hwc; 1196 int assign[X86_PMC_IDX_MAX]; 1197 int n, n0, ret; 1198 1199 hwc = &event->hw; 1200 1201 n0 = cpuc->n_events; 1202 ret = n = collect_events(cpuc, event, false); 1203 if (ret < 0) 1204 goto out; 1205 1206 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; 1207 if (!(flags & PERF_EF_START)) 1208 hwc->state |= PERF_HES_ARCH; 1209 1210 /* 1211 * If group events scheduling transaction was started, 1212 * skip the schedulability test here, it will be performed 1213 * at commit time (->commit_txn) as a whole. 1214 * 1215 * If commit fails, we'll call ->del() on all events 1216 * for which ->add() was called. 1217 */ 1218 if (cpuc->txn_flags & PERF_PMU_TXN_ADD) 1219 goto done_collect; 1220 1221 ret = x86_pmu.schedule_events(cpuc, n, assign); 1222 if (ret) 1223 goto out; 1224 /* 1225 * copy new assignment, now we know it is possible 1226 * will be used by hw_perf_enable() 1227 */ 1228 memcpy(cpuc->assign, assign, n*sizeof(int)); 1229 1230 done_collect: 1231 /* 1232 * Commit the collect_events() state. See x86_pmu_del() and 1233 * x86_pmu_*_txn(). 1234 */ 1235 cpuc->n_events = n; 1236 cpuc->n_added += n - n0; 1237 cpuc->n_txn += n - n0; 1238 1239 if (x86_pmu.add) { 1240 /* 1241 * This is before x86_pmu_enable() will call x86_pmu_start(), 1242 * so we enable LBRs before an event needs them etc.. 1243 */ 1244 x86_pmu.add(event); 1245 } 1246 1247 ret = 0; 1248 out: 1249 return ret; 1250 } 1251 1252 static void x86_pmu_start(struct perf_event *event, int flags) 1253 { 1254 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1255 int idx = event->hw.idx; 1256 1257 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) 1258 return; 1259 1260 if (WARN_ON_ONCE(idx == -1)) 1261 return; 1262 1263 if (flags & PERF_EF_RELOAD) { 1264 WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); 1265 x86_perf_event_set_period(event); 1266 } 1267 1268 event->hw.state = 0; 1269 1270 cpuc->events[idx] = event; 1271 __set_bit(idx, cpuc->active_mask); 1272 __set_bit(idx, cpuc->running); 1273 x86_pmu.enable(event); 1274 perf_event_update_userpage(event); 1275 } 1276 1277 void perf_event_print_debug(void) 1278 { 1279 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; 1280 u64 pebs, debugctl; 1281 struct cpu_hw_events *cpuc; 1282 unsigned long flags; 1283 int cpu, idx; 1284 1285 if (!x86_pmu.num_counters) 1286 return; 1287 1288 local_irq_save(flags); 1289 1290 cpu = smp_processor_id(); 1291 cpuc = &per_cpu(cpu_hw_events, cpu); 1292 1293 if (x86_pmu.version >= 2) { 1294 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); 1295 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); 1296 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); 1297 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); 1298 1299 pr_info("\n"); 1300 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); 1301 pr_info("CPU#%d: status: %016llx\n", cpu, status); 1302 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); 1303 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); 1304 if (x86_pmu.pebs_constraints) { 1305 rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); 1306 pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); 1307 } 1308 if (x86_pmu.lbr_nr) { 1309 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); 1310 pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); 1311 } 1312 } 1313 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); 1314 1315 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1316 rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); 1317 rdmsrl(x86_pmu_event_addr(idx), pmc_count); 1318 1319 prev_left = per_cpu(pmc_prev_left[idx], cpu); 1320 1321 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", 1322 cpu, idx, pmc_ctrl); 1323 pr_info("CPU#%d: gen-PMC%d count: %016llx\n", 1324 cpu, idx, pmc_count); 1325 pr_info("CPU#%d: gen-PMC%d left: %016llx\n", 1326 cpu, idx, prev_left); 1327 } 1328 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { 1329 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); 1330 1331 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", 1332 cpu, idx, pmc_count); 1333 } 1334 local_irq_restore(flags); 1335 } 1336 1337 void x86_pmu_stop(struct perf_event *event, int flags) 1338 { 1339 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1340 struct hw_perf_event *hwc = &event->hw; 1341 1342 if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) { 1343 x86_pmu.disable(event); 1344 cpuc->events[hwc->idx] = NULL; 1345 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); 1346 hwc->state |= PERF_HES_STOPPED; 1347 } 1348 1349 if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { 1350 /* 1351 * Drain the remaining delta count out of a event 1352 * that we are disabling: 1353 */ 1354 x86_perf_event_update(event); 1355 hwc->state |= PERF_HES_UPTODATE; 1356 } 1357 } 1358 1359 static void x86_pmu_del(struct perf_event *event, int flags) 1360 { 1361 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1362 int i; 1363 1364 /* 1365 * event is descheduled 1366 */ 1367 event->hw.flags &= ~PERF_X86_EVENT_COMMITTED; 1368 1369 /* 1370 * If we're called during a txn, we only need to undo x86_pmu.add. 1371 * The events never got scheduled and ->cancel_txn will truncate 1372 * the event_list. 1373 * 1374 * XXX assumes any ->del() called during a TXN will only be on 1375 * an event added during that same TXN. 1376 */ 1377 if (cpuc->txn_flags & PERF_PMU_TXN_ADD) 1378 goto do_del; 1379 1380 /* 1381 * Not a TXN, therefore cleanup properly. 1382 */ 1383 x86_pmu_stop(event, PERF_EF_UPDATE); 1384 1385 for (i = 0; i < cpuc->n_events; i++) { 1386 if (event == cpuc->event_list[i]) 1387 break; 1388 } 1389 1390 if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */ 1391 return; 1392 1393 /* If we have a newly added event; make sure to decrease n_added. */ 1394 if (i >= cpuc->n_events - cpuc->n_added) 1395 --cpuc->n_added; 1396 1397 if (x86_pmu.put_event_constraints) 1398 x86_pmu.put_event_constraints(cpuc, event); 1399 1400 /* Delete the array entry. */ 1401 while (++i < cpuc->n_events) { 1402 cpuc->event_list[i-1] = cpuc->event_list[i]; 1403 cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; 1404 } 1405 --cpuc->n_events; 1406 1407 perf_event_update_userpage(event); 1408 1409 do_del: 1410 if (x86_pmu.del) { 1411 /* 1412 * This is after x86_pmu_stop(); so we disable LBRs after any 1413 * event can need them etc.. 1414 */ 1415 x86_pmu.del(event); 1416 } 1417 } 1418 1419 int x86_pmu_handle_irq(struct pt_regs *regs) 1420 { 1421 struct perf_sample_data data; 1422 struct cpu_hw_events *cpuc; 1423 struct perf_event *event; 1424 int idx, handled = 0; 1425 u64 val; 1426 1427 cpuc = this_cpu_ptr(&cpu_hw_events); 1428 1429 /* 1430 * Some chipsets need to unmask the LVTPC in a particular spot 1431 * inside the nmi handler. As a result, the unmasking was pushed 1432 * into all the nmi handlers. 1433 * 1434 * This generic handler doesn't seem to have any issues where the 1435 * unmasking occurs so it was left at the top. 1436 */ 1437 apic_write(APIC_LVTPC, APIC_DM_NMI); 1438 1439 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1440 if (!test_bit(idx, cpuc->active_mask)) { 1441 /* 1442 * Though we deactivated the counter some cpus 1443 * might still deliver spurious interrupts still 1444 * in flight. Catch them: 1445 */ 1446 if (__test_and_clear_bit(idx, cpuc->running)) 1447 handled++; 1448 continue; 1449 } 1450 1451 event = cpuc->events[idx]; 1452 1453 val = x86_perf_event_update(event); 1454 if (val & (1ULL << (x86_pmu.cntval_bits - 1))) 1455 continue; 1456 1457 /* 1458 * event overflow 1459 */ 1460 handled++; 1461 perf_sample_data_init(&data, 0, event->hw.last_period); 1462 1463 if (!x86_perf_event_set_period(event)) 1464 continue; 1465 1466 if (perf_event_overflow(event, &data, regs)) 1467 x86_pmu_stop(event, 0); 1468 } 1469 1470 if (handled) 1471 inc_irq_stat(apic_perf_irqs); 1472 1473 return handled; 1474 } 1475 1476 void perf_events_lapic_init(void) 1477 { 1478 if (!x86_pmu.apic || !x86_pmu_initialized()) 1479 return; 1480 1481 /* 1482 * Always use NMI for PMU 1483 */ 1484 apic_write(APIC_LVTPC, APIC_DM_NMI); 1485 } 1486 1487 static int 1488 perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) 1489 { 1490 u64 start_clock; 1491 u64 finish_clock; 1492 int ret; 1493 1494 /* 1495 * All PMUs/events that share this PMI handler should make sure to 1496 * increment active_events for their events. 1497 */ 1498 if (!atomic_read(&active_events)) 1499 return NMI_DONE; 1500 1501 start_clock = sched_clock(); 1502 ret = x86_pmu.handle_irq(regs); 1503 finish_clock = sched_clock(); 1504 1505 perf_sample_event_took(finish_clock - start_clock); 1506 1507 return ret; 1508 } 1509 NOKPROBE_SYMBOL(perf_event_nmi_handler); 1510 1511 struct event_constraint emptyconstraint; 1512 struct event_constraint unconstrained; 1513 1514 static int x86_pmu_prepare_cpu(unsigned int cpu) 1515 { 1516 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 1517 int i; 1518 1519 for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) 1520 cpuc->kfree_on_online[i] = NULL; 1521 if (x86_pmu.cpu_prepare) 1522 return x86_pmu.cpu_prepare(cpu); 1523 return 0; 1524 } 1525 1526 static int x86_pmu_dead_cpu(unsigned int cpu) 1527 { 1528 if (x86_pmu.cpu_dead) 1529 x86_pmu.cpu_dead(cpu); 1530 return 0; 1531 } 1532 1533 static int x86_pmu_online_cpu(unsigned int cpu) 1534 { 1535 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 1536 int i; 1537 1538 for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) { 1539 kfree(cpuc->kfree_on_online[i]); 1540 cpuc->kfree_on_online[i] = NULL; 1541 } 1542 return 0; 1543 } 1544 1545 static int x86_pmu_starting_cpu(unsigned int cpu) 1546 { 1547 if (x86_pmu.cpu_starting) 1548 x86_pmu.cpu_starting(cpu); 1549 return 0; 1550 } 1551 1552 static int x86_pmu_dying_cpu(unsigned int cpu) 1553 { 1554 if (x86_pmu.cpu_dying) 1555 x86_pmu.cpu_dying(cpu); 1556 return 0; 1557 } 1558 1559 static void __init pmu_check_apic(void) 1560 { 1561 if (boot_cpu_has(X86_FEATURE_APIC)) 1562 return; 1563 1564 x86_pmu.apic = 0; 1565 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); 1566 pr_info("no hardware sampling interrupt available.\n"); 1567 1568 /* 1569 * If we have a PMU initialized but no APIC 1570 * interrupts, we cannot sample hardware 1571 * events (user-space has to fall back and 1572 * sample via a hrtimer based software event): 1573 */ 1574 pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; 1575 1576 } 1577 1578 static struct attribute_group x86_pmu_format_group = { 1579 .name = "format", 1580 .attrs = NULL, 1581 }; 1582 1583 /* 1584 * Remove all undefined events (x86_pmu.event_map(id) == 0) 1585 * out of events_attr attributes. 1586 */ 1587 static void __init filter_events(struct attribute **attrs) 1588 { 1589 struct device_attribute *d; 1590 struct perf_pmu_events_attr *pmu_attr; 1591 int offset = 0; 1592 int i, j; 1593 1594 for (i = 0; attrs[i]; i++) { 1595 d = (struct device_attribute *)attrs[i]; 1596 pmu_attr = container_of(d, struct perf_pmu_events_attr, attr); 1597 /* str trumps id */ 1598 if (pmu_attr->event_str) 1599 continue; 1600 if (x86_pmu.event_map(i + offset)) 1601 continue; 1602 1603 for (j = i; attrs[j]; j++) 1604 attrs[j] = attrs[j + 1]; 1605 1606 /* Check the shifted attr. */ 1607 i--; 1608 1609 /* 1610 * event_map() is index based, the attrs array is organized 1611 * by increasing event index. If we shift the events, then 1612 * we need to compensate for the event_map(), otherwise 1613 * we are looking up the wrong event in the map 1614 */ 1615 offset++; 1616 } 1617 } 1618 1619 /* Merge two pointer arrays */ 1620 __init struct attribute **merge_attr(struct attribute **a, struct attribute **b) 1621 { 1622 struct attribute **new; 1623 int j, i; 1624 1625 for (j = 0; a[j]; j++) 1626 ; 1627 for (i = 0; b[i]; i++) 1628 j++; 1629 j++; 1630 1631 new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL); 1632 if (!new) 1633 return NULL; 1634 1635 j = 0; 1636 for (i = 0; a[i]; i++) 1637 new[j++] = a[i]; 1638 for (i = 0; b[i]; i++) 1639 new[j++] = b[i]; 1640 new[j] = NULL; 1641 1642 return new; 1643 } 1644 1645 ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) 1646 { 1647 struct perf_pmu_events_attr *pmu_attr = \ 1648 container_of(attr, struct perf_pmu_events_attr, attr); 1649 u64 config = x86_pmu.event_map(pmu_attr->id); 1650 1651 /* string trumps id */ 1652 if (pmu_attr->event_str) 1653 return sprintf(page, "%s", pmu_attr->event_str); 1654 1655 return x86_pmu.events_sysfs_show(page, config); 1656 } 1657 EXPORT_SYMBOL_GPL(events_sysfs_show); 1658 1659 ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, 1660 char *page) 1661 { 1662 struct perf_pmu_events_ht_attr *pmu_attr = 1663 container_of(attr, struct perf_pmu_events_ht_attr, attr); 1664 1665 /* 1666 * Report conditional events depending on Hyper-Threading. 1667 * 1668 * This is overly conservative as usually the HT special 1669 * handling is not needed if the other CPU thread is idle. 1670 * 1671 * Note this does not (and cannot) handle the case when thread 1672 * siblings are invisible, for example with virtualization 1673 * if they are owned by some other guest. The user tool 1674 * has to re-read when a thread sibling gets onlined later. 1675 */ 1676 return sprintf(page, "%s", 1677 topology_max_smt_threads() > 1 ? 1678 pmu_attr->event_str_ht : 1679 pmu_attr->event_str_noht); 1680 } 1681 1682 EVENT_ATTR(cpu-cycles, CPU_CYCLES ); 1683 EVENT_ATTR(instructions, INSTRUCTIONS ); 1684 EVENT_ATTR(cache-references, CACHE_REFERENCES ); 1685 EVENT_ATTR(cache-misses, CACHE_MISSES ); 1686 EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); 1687 EVENT_ATTR(branch-misses, BRANCH_MISSES ); 1688 EVENT_ATTR(bus-cycles, BUS_CYCLES ); 1689 EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); 1690 EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); 1691 EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); 1692 1693 static struct attribute *empty_attrs; 1694 1695 static struct attribute *events_attr[] = { 1696 EVENT_PTR(CPU_CYCLES), 1697 EVENT_PTR(INSTRUCTIONS), 1698 EVENT_PTR(CACHE_REFERENCES), 1699 EVENT_PTR(CACHE_MISSES), 1700 EVENT_PTR(BRANCH_INSTRUCTIONS), 1701 EVENT_PTR(BRANCH_MISSES), 1702 EVENT_PTR(BUS_CYCLES), 1703 EVENT_PTR(STALLED_CYCLES_FRONTEND), 1704 EVENT_PTR(STALLED_CYCLES_BACKEND), 1705 EVENT_PTR(REF_CPU_CYCLES), 1706 NULL, 1707 }; 1708 1709 static struct attribute_group x86_pmu_events_group = { 1710 .name = "events", 1711 .attrs = events_attr, 1712 }; 1713 1714 ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) 1715 { 1716 u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; 1717 u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; 1718 bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); 1719 bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); 1720 bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); 1721 bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); 1722 ssize_t ret; 1723 1724 /* 1725 * We have whole page size to spend and just little data 1726 * to write, so we can safely use sprintf. 1727 */ 1728 ret = sprintf(page, "event=0x%02llx", event); 1729 1730 if (umask) 1731 ret += sprintf(page + ret, ",umask=0x%02llx", umask); 1732 1733 if (edge) 1734 ret += sprintf(page + ret, ",edge"); 1735 1736 if (pc) 1737 ret += sprintf(page + ret, ",pc"); 1738 1739 if (any) 1740 ret += sprintf(page + ret, ",any"); 1741 1742 if (inv) 1743 ret += sprintf(page + ret, ",inv"); 1744 1745 if (cmask) 1746 ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); 1747 1748 ret += sprintf(page + ret, "\n"); 1749 1750 return ret; 1751 } 1752 1753 static int __init init_hw_perf_events(void) 1754 { 1755 struct x86_pmu_quirk *quirk; 1756 int err; 1757 1758 pr_info("Performance Events: "); 1759 1760 switch (boot_cpu_data.x86_vendor) { 1761 case X86_VENDOR_INTEL: 1762 err = intel_pmu_init(); 1763 break; 1764 case X86_VENDOR_AMD: 1765 err = amd_pmu_init(); 1766 break; 1767 default: 1768 err = -ENOTSUPP; 1769 } 1770 if (err != 0) { 1771 pr_cont("no PMU driver, software events only.\n"); 1772 return 0; 1773 } 1774 1775 pmu_check_apic(); 1776 1777 /* sanity check that the hardware exists or is emulated */ 1778 if (!check_hw_exists()) 1779 return 0; 1780 1781 pr_cont("%s PMU driver.\n", x86_pmu.name); 1782 1783 x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ 1784 1785 for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) 1786 quirk->func(); 1787 1788 if (!x86_pmu.intel_ctrl) 1789 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; 1790 1791 perf_events_lapic_init(); 1792 register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); 1793 1794 unconstrained = (struct event_constraint) 1795 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, 1796 0, x86_pmu.num_counters, 0, 0); 1797 1798 x86_pmu_format_group.attrs = x86_pmu.format_attrs; 1799 1800 if (x86_pmu.event_attrs) 1801 x86_pmu_events_group.attrs = x86_pmu.event_attrs; 1802 1803 if (!x86_pmu.events_sysfs_show) 1804 x86_pmu_events_group.attrs = &empty_attrs; 1805 else 1806 filter_events(x86_pmu_events_group.attrs); 1807 1808 if (x86_pmu.cpu_events) { 1809 struct attribute **tmp; 1810 1811 tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events); 1812 if (!WARN_ON(!tmp)) 1813 x86_pmu_events_group.attrs = tmp; 1814 } 1815 1816 pr_info("... version: %d\n", x86_pmu.version); 1817 pr_info("... bit width: %d\n", x86_pmu.cntval_bits); 1818 pr_info("... generic registers: %d\n", x86_pmu.num_counters); 1819 pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); 1820 pr_info("... max period: %016Lx\n", x86_pmu.max_period); 1821 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); 1822 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); 1823 1824 /* 1825 * Install callbacks. Core will call them for each online 1826 * cpu. 1827 */ 1828 err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare", 1829 x86_pmu_prepare_cpu, x86_pmu_dead_cpu); 1830 if (err) 1831 return err; 1832 1833 err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING, 1834 "perf/x86:starting", x86_pmu_starting_cpu, 1835 x86_pmu_dying_cpu); 1836 if (err) 1837 goto out; 1838 1839 err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online", 1840 x86_pmu_online_cpu, NULL); 1841 if (err) 1842 goto out1; 1843 1844 err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); 1845 if (err) 1846 goto out2; 1847 1848 return 0; 1849 1850 out2: 1851 cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE); 1852 out1: 1853 cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING); 1854 out: 1855 cpuhp_remove_state(CPUHP_PERF_X86_PREPARE); 1856 return err; 1857 } 1858 early_initcall(init_hw_perf_events); 1859 1860 static inline void x86_pmu_read(struct perf_event *event) 1861 { 1862 x86_perf_event_update(event); 1863 } 1864 1865 /* 1866 * Start group events scheduling transaction 1867 * Set the flag to make pmu::enable() not perform the 1868 * schedulability test, it will be performed at commit time 1869 * 1870 * We only support PERF_PMU_TXN_ADD transactions. Save the 1871 * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD 1872 * transactions. 1873 */ 1874 static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) 1875 { 1876 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1877 1878 WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */ 1879 1880 cpuc->txn_flags = txn_flags; 1881 if (txn_flags & ~PERF_PMU_TXN_ADD) 1882 return; 1883 1884 perf_pmu_disable(pmu); 1885 __this_cpu_write(cpu_hw_events.n_txn, 0); 1886 } 1887 1888 /* 1889 * Stop group events scheduling transaction 1890 * Clear the flag and pmu::enable() will perform the 1891 * schedulability test. 1892 */ 1893 static void x86_pmu_cancel_txn(struct pmu *pmu) 1894 { 1895 unsigned int txn_flags; 1896 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1897 1898 WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ 1899 1900 txn_flags = cpuc->txn_flags; 1901 cpuc->txn_flags = 0; 1902 if (txn_flags & ~PERF_PMU_TXN_ADD) 1903 return; 1904 1905 /* 1906 * Truncate collected array by the number of events added in this 1907 * transaction. See x86_pmu_add() and x86_pmu_*_txn(). 1908 */ 1909 __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); 1910 __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); 1911 perf_pmu_enable(pmu); 1912 } 1913 1914 /* 1915 * Commit group events scheduling transaction 1916 * Perform the group schedulability test as a whole 1917 * Return 0 if success 1918 * 1919 * Does not cancel the transaction on failure; expects the caller to do this. 1920 */ 1921 static int x86_pmu_commit_txn(struct pmu *pmu) 1922 { 1923 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1924 int assign[X86_PMC_IDX_MAX]; 1925 int n, ret; 1926 1927 WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ 1928 1929 if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) { 1930 cpuc->txn_flags = 0; 1931 return 0; 1932 } 1933 1934 n = cpuc->n_events; 1935 1936 if (!x86_pmu_initialized()) 1937 return -EAGAIN; 1938 1939 ret = x86_pmu.schedule_events(cpuc, n, assign); 1940 if (ret) 1941 return ret; 1942 1943 /* 1944 * copy new assignment, now we know it is possible 1945 * will be used by hw_perf_enable() 1946 */ 1947 memcpy(cpuc->assign, assign, n*sizeof(int)); 1948 1949 cpuc->txn_flags = 0; 1950 perf_pmu_enable(pmu); 1951 return 0; 1952 } 1953 /* 1954 * a fake_cpuc is used to validate event groups. Due to 1955 * the extra reg logic, we need to also allocate a fake 1956 * per_core and per_cpu structure. Otherwise, group events 1957 * using extra reg may conflict without the kernel being 1958 * able to catch this when the last event gets added to 1959 * the group. 1960 */ 1961 static void free_fake_cpuc(struct cpu_hw_events *cpuc) 1962 { 1963 kfree(cpuc->shared_regs); 1964 kfree(cpuc); 1965 } 1966 1967 static struct cpu_hw_events *allocate_fake_cpuc(void) 1968 { 1969 struct cpu_hw_events *cpuc; 1970 int cpu = raw_smp_processor_id(); 1971 1972 cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL); 1973 if (!cpuc) 1974 return ERR_PTR(-ENOMEM); 1975 1976 /* only needed, if we have extra_regs */ 1977 if (x86_pmu.extra_regs) { 1978 cpuc->shared_regs = allocate_shared_regs(cpu); 1979 if (!cpuc->shared_regs) 1980 goto error; 1981 } 1982 cpuc->is_fake = 1; 1983 return cpuc; 1984 error: 1985 free_fake_cpuc(cpuc); 1986 return ERR_PTR(-ENOMEM); 1987 } 1988 1989 /* 1990 * validate that we can schedule this event 1991 */ 1992 static int validate_event(struct perf_event *event) 1993 { 1994 struct cpu_hw_events *fake_cpuc; 1995 struct event_constraint *c; 1996 int ret = 0; 1997 1998 fake_cpuc = allocate_fake_cpuc(); 1999 if (IS_ERR(fake_cpuc)) 2000 return PTR_ERR(fake_cpuc); 2001 2002 c = x86_pmu.get_event_constraints(fake_cpuc, -1, event); 2003 2004 if (!c || !c->weight) 2005 ret = -EINVAL; 2006 2007 if (x86_pmu.put_event_constraints) 2008 x86_pmu.put_event_constraints(fake_cpuc, event); 2009 2010 free_fake_cpuc(fake_cpuc); 2011 2012 return ret; 2013 } 2014 2015 /* 2016 * validate a single event group 2017 * 2018 * validation include: 2019 * - check events are compatible which each other 2020 * - events do not compete for the same counter 2021 * - number of events <= number of counters 2022 * 2023 * validation ensures the group can be loaded onto the 2024 * PMU if it was the only group available. 2025 */ 2026 static int validate_group(struct perf_event *event) 2027 { 2028 struct perf_event *leader = event->group_leader; 2029 struct cpu_hw_events *fake_cpuc; 2030 int ret = -EINVAL, n; 2031 2032 fake_cpuc = allocate_fake_cpuc(); 2033 if (IS_ERR(fake_cpuc)) 2034 return PTR_ERR(fake_cpuc); 2035 /* 2036 * the event is not yet connected with its 2037 * siblings therefore we must first collect 2038 * existing siblings, then add the new event 2039 * before we can simulate the scheduling 2040 */ 2041 n = collect_events(fake_cpuc, leader, true); 2042 if (n < 0) 2043 goto out; 2044 2045 fake_cpuc->n_events = n; 2046 n = collect_events(fake_cpuc, event, false); 2047 if (n < 0) 2048 goto out; 2049 2050 fake_cpuc->n_events = n; 2051 2052 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); 2053 2054 out: 2055 free_fake_cpuc(fake_cpuc); 2056 return ret; 2057 } 2058 2059 static int x86_pmu_event_init(struct perf_event *event) 2060 { 2061 struct pmu *tmp; 2062 int err; 2063 2064 switch (event->attr.type) { 2065 case PERF_TYPE_RAW: 2066 case PERF_TYPE_HARDWARE: 2067 case PERF_TYPE_HW_CACHE: 2068 break; 2069 2070 default: 2071 return -ENOENT; 2072 } 2073 2074 err = __x86_pmu_event_init(event); 2075 if (!err) { 2076 /* 2077 * we temporarily connect event to its pmu 2078 * such that validate_group() can classify 2079 * it as an x86 event using is_x86_event() 2080 */ 2081 tmp = event->pmu; 2082 event->pmu = &pmu; 2083 2084 if (event->group_leader != event) 2085 err = validate_group(event); 2086 else 2087 err = validate_event(event); 2088 2089 event->pmu = tmp; 2090 } 2091 if (err) { 2092 if (event->destroy) 2093 event->destroy(event); 2094 } 2095 2096 if (ACCESS_ONCE(x86_pmu.attr_rdpmc)) 2097 event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; 2098 2099 return err; 2100 } 2101 2102 static void refresh_pce(void *ignored) 2103 { 2104 if (current->active_mm) 2105 load_mm_cr4(current->active_mm); 2106 } 2107 2108 static void x86_pmu_event_mapped(struct perf_event *event) 2109 { 2110 if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) 2111 return; 2112 2113 /* 2114 * This function relies on not being called concurrently in two 2115 * tasks in the same mm. Otherwise one task could observe 2116 * perf_rdpmc_allowed > 1 and return all the way back to 2117 * userspace with CR4.PCE clear while another task is still 2118 * doing on_each_cpu_mask() to propagate CR4.PCE. 2119 * 2120 * For now, this can't happen because all callers hold mmap_sem 2121 * for write. If this changes, we'll need a different solution. 2122 */ 2123 lockdep_assert_held_exclusive(¤t->mm->mmap_sem); 2124 2125 if (atomic_inc_return(¤t->mm->context.perf_rdpmc_allowed) == 1) 2126 on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); 2127 } 2128 2129 static void x86_pmu_event_unmapped(struct perf_event *event) 2130 { 2131 if (!current->mm) 2132 return; 2133 2134 if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) 2135 return; 2136 2137 if (atomic_dec_and_test(¤t->mm->context.perf_rdpmc_allowed)) 2138 on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); 2139 } 2140 2141 static int x86_pmu_event_idx(struct perf_event *event) 2142 { 2143 int idx = event->hw.idx; 2144 2145 if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) 2146 return 0; 2147 2148 if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { 2149 idx -= INTEL_PMC_IDX_FIXED; 2150 idx |= 1 << 30; 2151 } 2152 2153 return idx + 1; 2154 } 2155 2156 static ssize_t get_attr_rdpmc(struct device *cdev, 2157 struct device_attribute *attr, 2158 char *buf) 2159 { 2160 return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); 2161 } 2162 2163 static ssize_t set_attr_rdpmc(struct device *cdev, 2164 struct device_attribute *attr, 2165 const char *buf, size_t count) 2166 { 2167 unsigned long val; 2168 ssize_t ret; 2169 2170 ret = kstrtoul(buf, 0, &val); 2171 if (ret) 2172 return ret; 2173 2174 if (val > 2) 2175 return -EINVAL; 2176 2177 if (x86_pmu.attr_rdpmc_broken) 2178 return -ENOTSUPP; 2179 2180 if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) { 2181 /* 2182 * Changing into or out of always available, aka 2183 * perf-event-bypassing mode. This path is extremely slow, 2184 * but only root can trigger it, so it's okay. 2185 */ 2186 if (val == 2) 2187 static_key_slow_inc(&rdpmc_always_available); 2188 else 2189 static_key_slow_dec(&rdpmc_always_available); 2190 on_each_cpu(refresh_pce, NULL, 1); 2191 } 2192 2193 x86_pmu.attr_rdpmc = val; 2194 2195 return count; 2196 } 2197 2198 static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); 2199 2200 static struct attribute *x86_pmu_attrs[] = { 2201 &dev_attr_rdpmc.attr, 2202 NULL, 2203 }; 2204 2205 static struct attribute_group x86_pmu_attr_group = { 2206 .attrs = x86_pmu_attrs, 2207 }; 2208 2209 static const struct attribute_group *x86_pmu_attr_groups[] = { 2210 &x86_pmu_attr_group, 2211 &x86_pmu_format_group, 2212 &x86_pmu_events_group, 2213 NULL, 2214 }; 2215 2216 static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) 2217 { 2218 if (x86_pmu.sched_task) 2219 x86_pmu.sched_task(ctx, sched_in); 2220 } 2221 2222 void perf_check_microcode(void) 2223 { 2224 if (x86_pmu.check_microcode) 2225 x86_pmu.check_microcode(); 2226 } 2227 EXPORT_SYMBOL_GPL(perf_check_microcode); 2228 2229 static struct pmu pmu = { 2230 .pmu_enable = x86_pmu_enable, 2231 .pmu_disable = x86_pmu_disable, 2232 2233 .attr_groups = x86_pmu_attr_groups, 2234 2235 .event_init = x86_pmu_event_init, 2236 2237 .event_mapped = x86_pmu_event_mapped, 2238 .event_unmapped = x86_pmu_event_unmapped, 2239 2240 .add = x86_pmu_add, 2241 .del = x86_pmu_del, 2242 .start = x86_pmu_start, 2243 .stop = x86_pmu_stop, 2244 .read = x86_pmu_read, 2245 2246 .start_txn = x86_pmu_start_txn, 2247 .cancel_txn = x86_pmu_cancel_txn, 2248 .commit_txn = x86_pmu_commit_txn, 2249 2250 .event_idx = x86_pmu_event_idx, 2251 .sched_task = x86_pmu_sched_task, 2252 .task_ctx_size = sizeof(struct x86_perf_task_context), 2253 }; 2254 2255 void arch_perf_update_userpage(struct perf_event *event, 2256 struct perf_event_mmap_page *userpg, u64 now) 2257 { 2258 struct cyc2ns_data *data; 2259 u64 offset; 2260 2261 userpg->cap_user_time = 0; 2262 userpg->cap_user_time_zero = 0; 2263 userpg->cap_user_rdpmc = 2264 !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED); 2265 userpg->pmc_width = x86_pmu.cntval_bits; 2266 2267 if (!using_native_sched_clock() || !sched_clock_stable()) 2268 return; 2269 2270 data = cyc2ns_read_begin(); 2271 2272 offset = data->cyc2ns_offset + __sched_clock_offset; 2273 2274 /* 2275 * Internal timekeeping for enabled/running/stopped times 2276 * is always in the local_clock domain. 2277 */ 2278 userpg->cap_user_time = 1; 2279 userpg->time_mult = data->cyc2ns_mul; 2280 userpg->time_shift = data->cyc2ns_shift; 2281 userpg->time_offset = offset - now; 2282 2283 /* 2284 * cap_user_time_zero doesn't make sense when we're using a different 2285 * time base for the records. 2286 */ 2287 if (!event->attr.use_clockid) { 2288 userpg->cap_user_time_zero = 1; 2289 userpg->time_zero = offset; 2290 } 2291 2292 cyc2ns_read_end(data); 2293 } 2294 2295 void 2296 perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) 2297 { 2298 struct unwind_state state; 2299 unsigned long addr; 2300 2301 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { 2302 /* TODO: We don't support guest os callchain now */ 2303 return; 2304 } 2305 2306 if (perf_callchain_store(entry, regs->ip)) 2307 return; 2308 2309 for (unwind_start(&state, current, regs, NULL); !unwind_done(&state); 2310 unwind_next_frame(&state)) { 2311 addr = unwind_get_return_address(&state); 2312 if (!addr || perf_callchain_store(entry, addr)) 2313 return; 2314 } 2315 } 2316 2317 static inline int 2318 valid_user_frame(const void __user *fp, unsigned long size) 2319 { 2320 return (__range_not_ok(fp, size, TASK_SIZE) == 0); 2321 } 2322 2323 static unsigned long get_segment_base(unsigned int segment) 2324 { 2325 struct desc_struct *desc; 2326 unsigned int idx = segment >> 3; 2327 2328 if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { 2329 #ifdef CONFIG_MODIFY_LDT_SYSCALL 2330 struct ldt_struct *ldt; 2331 2332 if (idx > LDT_ENTRIES) 2333 return 0; 2334 2335 /* IRQs are off, so this synchronizes with smp_store_release */ 2336 ldt = lockless_dereference(current->active_mm->context.ldt); 2337 if (!ldt || idx > ldt->size) 2338 return 0; 2339 2340 desc = &ldt->entries[idx]; 2341 #else 2342 return 0; 2343 #endif 2344 } else { 2345 if (idx > GDT_ENTRIES) 2346 return 0; 2347 2348 desc = raw_cpu_ptr(gdt_page.gdt) + idx; 2349 } 2350 2351 return get_desc_base(desc); 2352 } 2353 2354 #ifdef CONFIG_IA32_EMULATION 2355 2356 #include <asm/compat.h> 2357 2358 static inline int 2359 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) 2360 { 2361 /* 32-bit process in 64-bit kernel. */ 2362 unsigned long ss_base, cs_base; 2363 struct stack_frame_ia32 frame; 2364 const void __user *fp; 2365 2366 if (!test_thread_flag(TIF_IA32)) 2367 return 0; 2368 2369 cs_base = get_segment_base(regs->cs); 2370 ss_base = get_segment_base(regs->ss); 2371 2372 fp = compat_ptr(ss_base + regs->bp); 2373 pagefault_disable(); 2374 while (entry->nr < entry->max_stack) { 2375 unsigned long bytes; 2376 frame.next_frame = 0; 2377 frame.return_address = 0; 2378 2379 if (!valid_user_frame(fp, sizeof(frame))) 2380 break; 2381 2382 bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4); 2383 if (bytes != 0) 2384 break; 2385 bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4); 2386 if (bytes != 0) 2387 break; 2388 2389 perf_callchain_store(entry, cs_base + frame.return_address); 2390 fp = compat_ptr(ss_base + frame.next_frame); 2391 } 2392 pagefault_enable(); 2393 return 1; 2394 } 2395 #else 2396 static inline int 2397 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) 2398 { 2399 return 0; 2400 } 2401 #endif 2402 2403 void 2404 perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) 2405 { 2406 struct stack_frame frame; 2407 const unsigned long __user *fp; 2408 2409 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { 2410 /* TODO: We don't support guest os callchain now */ 2411 return; 2412 } 2413 2414 /* 2415 * We don't know what to do with VM86 stacks.. ignore them for now. 2416 */ 2417 if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) 2418 return; 2419 2420 fp = (unsigned long __user *)regs->bp; 2421 2422 perf_callchain_store(entry, regs->ip); 2423 2424 if (!current->mm) 2425 return; 2426 2427 if (perf_callchain_user32(regs, entry)) 2428 return; 2429 2430 pagefault_disable(); 2431 while (entry->nr < entry->max_stack) { 2432 unsigned long bytes; 2433 2434 frame.next_frame = NULL; 2435 frame.return_address = 0; 2436 2437 if (!valid_user_frame(fp, sizeof(frame))) 2438 break; 2439 2440 bytes = __copy_from_user_nmi(&frame.next_frame, fp, sizeof(*fp)); 2441 if (bytes != 0) 2442 break; 2443 bytes = __copy_from_user_nmi(&frame.return_address, fp + 1, sizeof(*fp)); 2444 if (bytes != 0) 2445 break; 2446 2447 perf_callchain_store(entry, frame.return_address); 2448 fp = (void __user *)frame.next_frame; 2449 } 2450 pagefault_enable(); 2451 } 2452 2453 /* 2454 * Deal with code segment offsets for the various execution modes: 2455 * 2456 * VM86 - the good olde 16 bit days, where the linear address is 2457 * 20 bits and we use regs->ip + 0x10 * regs->cs. 2458 * 2459 * IA32 - Where we need to look at GDT/LDT segment descriptor tables 2460 * to figure out what the 32bit base address is. 2461 * 2462 * X32 - has TIF_X32 set, but is running in x86_64 2463 * 2464 * X86_64 - CS,DS,SS,ES are all zero based. 2465 */ 2466 static unsigned long code_segment_base(struct pt_regs *regs) 2467 { 2468 /* 2469 * For IA32 we look at the GDT/LDT segment base to convert the 2470 * effective IP to a linear address. 2471 */ 2472 2473 #ifdef CONFIG_X86_32 2474 /* 2475 * If we are in VM86 mode, add the segment offset to convert to a 2476 * linear address. 2477 */ 2478 if (regs->flags & X86_VM_MASK) 2479 return 0x10 * regs->cs; 2480 2481 if (user_mode(regs) && regs->cs != __USER_CS) 2482 return get_segment_base(regs->cs); 2483 #else 2484 if (user_mode(regs) && !user_64bit_mode(regs) && 2485 regs->cs != __USER32_CS) 2486 return get_segment_base(regs->cs); 2487 #endif 2488 return 0; 2489 } 2490 2491 unsigned long perf_instruction_pointer(struct pt_regs *regs) 2492 { 2493 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) 2494 return perf_guest_cbs->get_guest_ip(); 2495 2496 return regs->ip + code_segment_base(regs); 2497 } 2498 2499 unsigned long perf_misc_flags(struct pt_regs *regs) 2500 { 2501 int misc = 0; 2502 2503 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { 2504 if (perf_guest_cbs->is_user_mode()) 2505 misc |= PERF_RECORD_MISC_GUEST_USER; 2506 else 2507 misc |= PERF_RECORD_MISC_GUEST_KERNEL; 2508 } else { 2509 if (user_mode(regs)) 2510 misc |= PERF_RECORD_MISC_USER; 2511 else 2512 misc |= PERF_RECORD_MISC_KERNEL; 2513 } 2514 2515 if (regs->flags & PERF_EFLAGS_EXACT) 2516 misc |= PERF_RECORD_MISC_EXACT_IP; 2517 2518 return misc; 2519 } 2520 2521 void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) 2522 { 2523 cap->version = x86_pmu.version; 2524 cap->num_counters_gp = x86_pmu.num_counters; 2525 cap->num_counters_fixed = x86_pmu.num_counters_fixed; 2526 cap->bit_width_gp = x86_pmu.cntval_bits; 2527 cap->bit_width_fixed = x86_pmu.cntval_bits; 2528 cap->events_mask = (unsigned int)x86_pmu.events_maskl; 2529 cap->events_mask_len = x86_pmu.events_mask_len; 2530 } 2531 EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); 2532