// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
 *
 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@redhat.com>
 *   Gleb Natapov <gleb@redhat.com>
 *   Wei Huang    <wei@redhat.com>
 */

#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <asm/perf_event.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"

/* This is enough to filter the vast majority of currently defined events. */
#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300

/* NOTE:
 * - Each perf counter is defined as "struct kvm_pmc";
 * - There are two types of perf counters: general purpose (gp) and fixed.
 *   gp counters are stored in gp_counters[] and fixed counters are stored
 *   in fixed_counters[] respectively. Both of them are part of "struct
 *   kvm_pmu";
 * - pmu.c understands the difference between gp counters and fixed counters.
 *   However AMD doesn't support fixed counters;
 * - There are three types of index to access perf counters (PMC):
 *   1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
 *      has MSR_K7_PERFCTRn.
 *   2. MSR Index (named idx): This is normally used by the RDPMC instruction.
 *      For instance, the AMD RDPMC instruction uses 0000_0003h in ECX to
 *      access C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism,
 *      except that it also supports fixed counters. idx can be used as an
 *      index to gp and fixed counters.
 *   3. Global PMC Index (named pmc): pmc is an index specific to PMU
 *      code. Each pmc, stored in the kvm_pmc.idx field, is unique across
 *      all perf counters (both gp and fixed). The mapping relationship
 *      between pmc and perf counters is as follows:
 *      * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters
 *               [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
 *      * AMD:   [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
 */

static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
{
	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);

	kvm_pmu_deliver_pmi(vcpu);
}

static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);

	/* Ignore counters that have been reprogrammed already. */
	if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
		return;

	__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
	kvm_make_request(KVM_REQ_PMU, pmc->vcpu);

	if (!pmc->intr)
		return;

	/*
	 * Inject PMI. If the vcpu was in guest mode during the NMI, the PMI
	 * can be injected on the next guest mode re-entry. Otherwise we can't
	 * be sure that the vcpu wasn't executing a hlt instruction at the
	 * time of the vmexit and is not going to re-enter guest mode until
	 * woken up. So we should wake it, but this is impossible from
	 * NMI context. Do it from irq work instead.
	 */
	if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu))
		irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
	else
		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}
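
/*
 * Overflow callback registered with the perf subsystem in
 * pmc_reprogram_counter(). It may run in NMI context, which is why PMI
 * injection is handled (and possibly deferred to irq_work) by
 * __kvm_perf_overflow() above.
 */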
static void kvm_perf_overflow(struct perf_event *perf_event,
			      struct perf_sample_data *data,
			      struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;

	__kvm_perf_overflow(pmc, true);
}

static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
				  unsigned config, bool exclude_user,
				  bool exclude_kernel, bool intr,
				  bool in_tx, bool in_tx_cp)
{
	struct perf_event *event;
	struct perf_event_attr attr = {
		.type = type,
		.size = sizeof(attr),
		.pinned = true,
		.exclude_idle = true,
		.exclude_host = 1,
		.exclude_user = exclude_user,
		.exclude_kernel = exclude_kernel,
		.config = config,
	};

	attr.sample_period = get_sample_period(pmc, pmc->counter);

	if (in_tx)
		attr.config |= HSW_IN_TX;
	if (in_tx_cp) {
		/*
		 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
		 * period. Just clear the sample period so at least
		 * allocating the counter doesn't fail.
		 */
		attr.sample_period = 0;
		attr.config |= HSW_IN_TX_CHECKPOINTED;
	}

	event = perf_event_create_kernel_counter(&attr, -1, current,
						 kvm_perf_overflow, pmc);
	if (IS_ERR(event)) {
		pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
				     PTR_ERR(event), pmc->idx);
		return;
	}

	pmc->perf_event = event;
	pmc_to_pmu(pmc)->event_count++;
	clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
	pmc->is_paused = false;
	pmc->intr = intr;
}

static void pmc_pause_counter(struct kvm_pmc *pmc)
{
	u64 counter = pmc->counter;

	if (!pmc->perf_event || pmc->is_paused)
		return;

	/* update counter, reset event value to avoid redundant accumulation */
	counter += perf_event_pause(pmc->perf_event, true);
	pmc->counter = counter & pmc_bitmask(pmc);
	pmc->is_paused = true;
}

static bool pmc_resume_counter(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event)
		return false;

	/* recalibrate sample period and check if it's accepted by perf core */
	if (perf_event_period(pmc->perf_event,
			      get_sample_period(pmc, pmc->counter)))
		return false;

	/* reuse perf_event to serve as pmc_reprogram_counter() does */
	perf_event_enable(pmc->perf_event);
	pmc->is_paused = false;

	clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
	return true;
}
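
/*
 * Program the perf_event backing a general-purpose counter from the guest's
 * event selector. The request is dropped if the counter is disabled or the
 * event is rejected by the PMU event filter; an existing perf_event is
 * resumed instead of recreated when the selector has not changed.
 */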
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
	unsigned config, type = PERF_TYPE_RAW;
	struct kvm *kvm = pmc->vcpu->kvm;
	struct kvm_pmu_event_filter *filter;
	int i;
	bool allow_event = true;

	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
		printk_once("kvm pmu: pin control bit is ignored\n");

	pmc->eventsel = eventsel;

	pmc_pause_counter(pmc);

	if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
		return;

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (filter) {
		for (i = 0; i < filter->nevents; i++)
			if (filter->events[i] ==
			    (eventsel & AMD64_RAW_EVENT_MASK_NB))
				break;
		if (filter->action == KVM_PMU_EVENT_ALLOW &&
		    i == filter->nevents)
			allow_event = false;
		if (filter->action == KVM_PMU_EVENT_DENY &&
		    i < filter->nevents)
			allow_event = false;
	}
	if (!allow_event)
		return;

	if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
			  ARCH_PERFMON_EVENTSEL_INV |
			  ARCH_PERFMON_EVENTSEL_CMASK |
			  HSW_IN_TX |
			  HSW_IN_TX_CHECKPOINTED))) {
		config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc);
		if (config != PERF_COUNT_HW_MAX)
			type = PERF_TYPE_HARDWARE;
	}

	if (type == PERF_TYPE_RAW)
		config = eventsel & X86_RAW_EVENT_MASK;

	if (pmc->current_config == eventsel && pmc_resume_counter(pmc))
		return;

	pmc_release_perf_event(pmc);

	pmc->current_config = eventsel;
	pmc_reprogram_counter(pmc, type, config,
			      !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
			      !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
			      eventsel & ARCH_PERFMON_EVENTSEL_INT,
			      (eventsel & HSW_IN_TX),
			      (eventsel & HSW_IN_TX_CHECKPOINTED));
}
EXPORT_SYMBOL_GPL(reprogram_gp_counter);

void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
{
	unsigned en_field = ctrl & 0x3;
	bool pmi = ctrl & 0x8;
	struct kvm_pmu_event_filter *filter;
	struct kvm *kvm = pmc->vcpu->kvm;

	pmc_pause_counter(pmc);

	if (!en_field || !pmc_is_enabled(pmc))
		return;

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (filter) {
		if (filter->action == KVM_PMU_EVENT_DENY &&
		    test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
			return;
		if (filter->action == KVM_PMU_EVENT_ALLOW &&
		    !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
			return;
	}

	if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc))
		return;

	pmc_release_perf_event(pmc);

	pmc->current_config = (u64)ctrl;
	pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
			      kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc),
			      !(en_field & 0x2), /* exclude user */
			      !(en_field & 0x1), /* exclude kernel */
			      pmi, false, false);
}
EXPORT_SYMBOL_GPL(reprogram_fixed_counter);

void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
{
	struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);

	if (!pmc)
		return;

	if (pmc_is_gp(pmc))
		reprogram_gp_counter(pmc, pmc->eventsel);
	else {
		int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
		u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);

		reprogram_fixed_counter(pmc, ctrl, idx);
	}
}
EXPORT_SYMBOL_GPL(reprogram_counter);
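
/*
 * Process a pending KVM_REQ_PMU: reprogram every counter flagged in
 * reprogram_pmi and, if the previous time slice marked the PMU for cleanup,
 * release perf_events that are no longer in use.
 */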
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	int bit;

	for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
		struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, bit);

		if (unlikely(!pmc || !pmc->perf_event)) {
			clear_bit(bit, pmu->reprogram_pmi);
			continue;
		}

		reprogram_counter(pmu, bit);
	}

	/*
	 * Unused perf_events are only released if the corresponding MSRs
	 * weren't accessed during the last vCPU time slice. kvm_arch_sched_in
	 * triggers KVM_REQ_PMU if cleanup is needed.
	 */
	if (unlikely(pmu->need_cleanup))
		kvm_pmu_cleanup(vcpu);
}

/* check if idx is a valid index to access PMU */
bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
	return kvm_x86_ops.pmu_ops->is_valid_rdpmc_ecx(vcpu, idx);
}

bool is_vmware_backdoor_pmc(u32 pmc_idx)
{
	switch (pmc_idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		return true;
	}
	return false;
}

static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	u64 ctr_val;

	switch (idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
		ctr_val = rdtsc();
		break;
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
		ctr_val = ktime_get_boottime_ns();
		break;
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		ctr_val = ktime_get_boottime_ns() +
			vcpu->kvm->arch.kvmclock_offset;
		break;
	default:
		return 1;
	}

	*data = ctr_val;
	return 0;
}

int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	bool fast_mode = idx & (1u << 31);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	u64 mask = fast_mode ? ~0u : ~0ull;

	if (!pmu->version)
		return 1;

	if (is_vmware_backdoor_pmc(idx))
		return kvm_pmu_rdpmc_vmware(vcpu, idx, data);

	pmc = kvm_x86_ops.pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask);
	if (!pmc)
		return 1;

	if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) &&
	    (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
	    (kvm_read_cr0(vcpu) & X86_CR0_PE))
		return 1;

	*data = pmc_read_counter(pmc) & mask;
	return 0;
}

void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu)) {
		if (kvm_x86_ops.pmu_ops->deliver_pmi)
			kvm_x86_ops.pmu_ops->deliver_pmi(vcpu);
		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
	}
}

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
	return kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr) ||
		kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, msr);
}

static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr);

	if (pmc)
		__set_bit(pmc->idx, pmu->pmc_in_use);
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr_info);
}

int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
	return kvm_x86_ops.pmu_ops->set_msr(vcpu, msr_info);
}

/*
 * Refresh PMU settings. This function is generally called when the underlying
 * settings change (such as changes to the guest's PMU-related CPUID), which
 * should rarely happen.
 */
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops.pmu_ops->refresh(vcpu);
}

void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	irq_work_sync(&pmu->irq_work);
	kvm_x86_ops.pmu_ops->reset(vcpu);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	memset(pmu, 0, sizeof(*pmu));
	kvm_x86_ops.pmu_ops->init(vcpu);
	init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
	pmu->event_count = 0;
	pmu->need_cleanup = false;
	kvm_pmu_refresh(vcpu);
}
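
/*
 * Check whether the guest has enabled the counter in the relevant control
 * register (FIXED_CTR_CTRL for fixed counters, the enable bit of the event
 * selector for gp counters), regardless of whether a host perf_event is
 * currently backing it.
 */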
static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);

	if (pmc_is_fixed(pmc))
		return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
					pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;

	return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
}

/* Release perf_events for vPMCs that have been unused for a full time slice. */
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = NULL;
	DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
	int i;

	pmu->need_cleanup = false;

	bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
		      pmu->pmc_in_use, X86_PMC_IDX_MAX);

	for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
		pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i);

		if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
			pmc_stop_counter(pmc);
	}

	if (kvm_x86_ops.pmu_ops->cleanup)
		kvm_x86_ops.pmu_ops->cleanup(vcpu);

	bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_pmu_reset(vcpu);
}

static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	u64 prev_count;

	prev_count = pmc->counter;
	pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);

	reprogram_counter(pmu, pmc->idx);
	if (pmc->counter < prev_count)
		__kvm_perf_overflow(pmc, false);
}

static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
					     unsigned int perf_hw_id)
{
	u64 old_eventsel = pmc->eventsel;
	unsigned int config;

	pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK);
	config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc);
	pmc->eventsel = old_eventsel;
	return config == perf_hw_id;
}

static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
	bool select_os, select_user;
	u64 config = pmc->current_config;

	if (pmc_is_gp(pmc)) {
		select_os = config & ARCH_PERFMON_EVENTSEL_OS;
		select_user = config & ARCH_PERFMON_EVENTSEL_USR;
	} else {
		select_os = config & 0x1;
		select_user = config & 0x2;
	}

	return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
}
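
/*
 * Emulate the occurrence of a hardware event (e.g. when KVM emulates an
 * instruction that should count as retired): increment every enabled counter
 * whose event selector maps to @perf_hw_id and whose OS/USR filtering matches
 * the current guest CPL.
 */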
void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
		pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i);

		if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc))
			continue;

		/* Ignore checks for edge detect, pin control, invert and CMASK bits */
		if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc))
			kvm_pmu_incr_counter(pmc);
	}
}
EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);

int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
	struct kvm_pmu_event_filter tmp, *filter;
	size_t size;
	int r;

	if (copy_from_user(&tmp, argp, sizeof(tmp)))
		return -EFAULT;

	if (tmp.action != KVM_PMU_EVENT_ALLOW &&
	    tmp.action != KVM_PMU_EVENT_DENY)
		return -EINVAL;

	if (tmp.flags != 0)
		return -EINVAL;

	if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
		return -E2BIG;

	size = struct_size(filter, events, tmp.nevents);
	filter = kmalloc(size, GFP_KERNEL_ACCOUNT);
	if (!filter)
		return -ENOMEM;

	r = -EFAULT;
	if (copy_from_user(filter, argp, size))
		goto cleanup;

	/* Ensure nevents can't be changed between the user copies. */
	*filter = tmp;

	mutex_lock(&kvm->lock);
	filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
				     mutex_is_locked(&kvm->lock));
	mutex_unlock(&kvm->lock);

	synchronize_srcu_expedited(&kvm->srcu);
	r = 0;
cleanup:
	kfree(filter);
	return r;
}