// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
 *
 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@redhat.com>
 *   Gleb Natapov <gleb@redhat.com>
 *   Wei Huang    <wei@redhat.com>
 */

#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <asm/perf_event.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"

/* This is enough to filter the vast majority of currently defined events. */
#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300

/* NOTE:
 * - Each perf counter is defined as "struct kvm_pmc";
 * - There are two types of perf counters: general purpose (gp) and fixed.
 *   gp counters are stored in gp_counters[] and fixed counters are stored
 *   in fixed_counters[] respectively. Both of them are part of "struct
 *   kvm_pmu";
 * - pmu.c understands the difference between gp counters and fixed counters.
 *   However AMD doesn't support fixed counters;
 * - There are three types of index to access perf counters (PMC):
 *   1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
 *      has MSR_K7_PERFCTRn.
 *   2. MSR Index (named idx): This normally is used by the RDPMC instruction.
 *      For instance the AMD RDPMC instruction uses 0000_0003h in ECX to
 *      access C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism,
 *      except that it also supports fixed counters. idx can be used as an
 *      index into the gp and fixed counters.
 *   3. Global PMC Index (named pmc): pmc is an index specific to PMU
 *      code. Each pmc, stored in the kvm_pmc.idx field, is unique across
 *      all perf counters (both gp and fixed). The mapping relationship
 *      between pmc and perf counters is as follows:
 *      * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters
 *               [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
 *      * AMD:   [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
 */
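
/*
 * Worked example of the three index spaces above (illustration only; the
 * exact values depend on the vPMU model userspace exposes): for Intel
 * fixed counter 1,
 *
 *	msr = MSR_CORE_PERF_FIXED_CTR1 (0x30a), the MSR the guest reads/writes;
 *	idx = 0x40000001, i.e. an RDPMC with ECX bit 30 set to select the
 *	      fixed range, plus index 1;
 *	pmc = INTEL_PMC_IDX_FIXED + 1, the value kept in kvm_pmc.idx.
 *
 * On AMD only the gp range exists, so e.g. MSR_K7_PERFCTR3 corresponds to
 * RDPMC index 3 and kvm_pmc.idx 3.
 */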

static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
{
	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);

	kvm_pmu_deliver_pmi(vcpu);
}

static void kvm_perf_overflow(struct perf_event *perf_event,
			      struct perf_sample_data *data,
			      struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);

	if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
	}
}

static void kvm_perf_overflow_intr(struct perf_event *perf_event,
				   struct perf_sample_data *data,
				   struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);

	if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);

		/*
		 * Inject PMI. If the vCPU was in guest mode when the NMI
		 * arrived, the PMI can be injected on the next guest-mode
		 * entry. Otherwise we can't be sure the vCPU wasn't
		 * executing a hlt instruction at the time of the vmexit,
		 * in which case it won't re-enter guest mode until it is
		 * woken up. So we should wake it, but this is impossible
		 * from NMI context. Do it from irq work instead.
		 */
		if (!kvm_is_in_guest())
			irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
		else
			kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
	}
}

static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
				  unsigned config, bool exclude_user,
				  bool exclude_kernel, bool intr,
				  bool in_tx, bool in_tx_cp)
{
	struct perf_event *event;
	struct perf_event_attr attr = {
		.type = type,
		.size = sizeof(attr),
		.pinned = true,
		.exclude_idle = true,
		.exclude_host = 1,
		.exclude_user = exclude_user,
		.exclude_kernel = exclude_kernel,
		.config = config,
	};

	attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);

	if (in_tx)
		attr.config |= HSW_IN_TX;
	if (in_tx_cp) {
		/*
		 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
		 * period. Just clear the sample period so at least
		 * allocating the counter doesn't fail.
		 */
		attr.sample_period = 0;
		attr.config |= HSW_IN_TX_CHECKPOINTED;
	}

	event = perf_event_create_kernel_counter(&attr, -1, current,
						 intr ? kvm_perf_overflow_intr :
						 kvm_perf_overflow, pmc);
	if (IS_ERR(event)) {
		pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
				     PTR_ERR(event), pmc->idx);
		return;
	}

	pmc->perf_event = event;
	pmc_to_pmu(pmc)->event_count++;
	clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
}
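
/*
 * A worked numeric example for the sample_period computation above
 * (illustration only, assuming a 48-bit wide counter): if the guest has
 * programmed the counter to 0xFFFFFFFFFFF0, then pmc_bitmask(pmc) is
 * 0xFFFFFFFFFFFF and
 *
 *	(-pmc->counter) & pmc_bitmask(pmc) == 0x10
 *
 * so the host perf event is asked to fire after 16 more events, exactly
 * when the emulated counter would wrap and signal an overflow.
 */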

static void pmc_pause_counter(struct kvm_pmc *pmc)
{
	u64 counter = pmc->counter;

	if (!pmc->perf_event)
		return;

	/* update counter, reset event value to avoid redundant accumulation */
	counter += perf_event_pause(pmc->perf_event, true);
	pmc->counter = counter & pmc_bitmask(pmc);
}

static bool pmc_resume_counter(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event)
		return false;

	/* recalibrate sample period and check if it's accepted by perf core */
	if (perf_event_period(pmc->perf_event,
			      (-pmc->counter) & pmc_bitmask(pmc)))
		return false;

	/* reuse the existing perf_event, as pmc_reprogram_counter() would do */
	perf_event_enable(pmc->perf_event);

	clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
	return true;
}

void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
	unsigned config, type = PERF_TYPE_RAW;
	u8 event_select, unit_mask;
	struct kvm *kvm = pmc->vcpu->kvm;
	struct kvm_pmu_event_filter *filter;
	int i;
	bool allow_event = true;

	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
		printk_once("kvm pmu: pin control bit is ignored\n");

	pmc->eventsel = eventsel;

	pmc_pause_counter(pmc);

	if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
		return;

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (filter) {
		for (i = 0; i < filter->nevents; i++)
			if (filter->events[i] ==
			    (eventsel & AMD64_RAW_EVENT_MASK_NB))
				break;
		if (filter->action == KVM_PMU_EVENT_ALLOW &&
		    i == filter->nevents)
			allow_event = false;
		if (filter->action == KVM_PMU_EVENT_DENY &&
		    i < filter->nevents)
			allow_event = false;
	}
	if (!allow_event)
		return;

	event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
	unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;

	if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
			  ARCH_PERFMON_EVENTSEL_INV |
			  ARCH_PERFMON_EVENTSEL_CMASK |
			  HSW_IN_TX |
			  HSW_IN_TX_CHECKPOINTED))) {
		config = kvm_x86_ops->pmu_ops->find_arch_event(pmc_to_pmu(pmc),
							       event_select,
							       unit_mask);
		if (config != PERF_COUNT_HW_MAX)
			type = PERF_TYPE_HARDWARE;
	}

	if (type == PERF_TYPE_RAW)
		config = eventsel & X86_RAW_EVENT_MASK;

	if (pmc->current_config == eventsel && pmc_resume_counter(pmc))
		return;

	pmc_release_perf_event(pmc);

	pmc->current_config = eventsel;
	pmc_reprogram_counter(pmc, type, config,
			      !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
			      !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
			      eventsel & ARCH_PERFMON_EVENTSEL_INT,
			      (eventsel & HSW_IN_TX),
			      (eventsel & HSW_IN_TX_CHECKPOINTED));
}
EXPORT_SYMBOL_GPL(reprogram_gp_counter);

void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
{
	unsigned en_field = ctrl & 0x3;
	bool pmi = ctrl & 0x8;
	struct kvm_pmu_event_filter *filter;
	struct kvm *kvm = pmc->vcpu->kvm;

	pmc_pause_counter(pmc);

	if (!en_field || !pmc_is_enabled(pmc))
		return;

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (filter) {
		if (filter->action == KVM_PMU_EVENT_DENY &&
		    test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
			return;
		if (filter->action == KVM_PMU_EVENT_ALLOW &&
		    !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
			return;
	}

	if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc))
		return;

	pmc_release_perf_event(pmc);

	pmc->current_config = (u64)ctrl;
	pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
			      kvm_x86_ops->pmu_ops->find_fixed_event(idx),
			      !(en_field & 0x2), /* exclude user */
			      !(en_field & 0x1), /* exclude kernel */
			      pmi, false, false);
}
EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
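
/*
 * For reference (illustration only): ctrl above is the 4-bit field for this
 * fixed counter taken from MSR_CORE_PERF_FIXED_CTR_CTRL, so e.g. a nibble
 * of 0xb (0b1011) means
 *
 *	bit 0 - count in ring 0  -> en_field & 0x1, i.e. don't exclude kernel
 *	bit 1 - count in ring 3  -> en_field & 0x2, i.e. don't exclude user
 *	bit 3 - PMI on overflow  -> pmi == true
 *
 * while bit 2 (AnyThread) is not consumed here.
 */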

void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
{
	struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);

	if (!pmc)
		return;

	if (pmc_is_gp(pmc))
		reprogram_gp_counter(pmc, pmc->eventsel);
	else {
		int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
		u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);

		reprogram_fixed_counter(pmc, ctrl, idx);
	}
}
EXPORT_SYMBOL_GPL(reprogram_counter);

void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	int bit;

	for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
		struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit);

		if (unlikely(!pmc || !pmc->perf_event)) {
			clear_bit(bit, pmu->reprogram_pmi);
			continue;
		}

		reprogram_counter(pmu, bit);
	}

	/*
	 * Unused perf_events are only released if the corresponding MSRs
	 * weren't accessed during the last vCPU time slice. kvm_arch_sched_in
	 * triggers KVM_REQ_PMU if cleanup is needed.
	 */
	if (unlikely(pmu->need_cleanup))
		kvm_pmu_cleanup(vcpu);
}

/* check if idx is a valid index to access PMU */
int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
	return kvm_x86_ops->pmu_ops->is_valid_rdpmc_ecx(vcpu, idx);
}

bool is_vmware_backdoor_pmc(u32 pmc_idx)
{
	switch (pmc_idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		return true;
	}
	return false;
}

static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	u64 ctr_val;

	switch (idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
		ctr_val = rdtsc();
		break;
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
		ctr_val = ktime_get_boottime_ns();
		break;
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		ctr_val = ktime_get_boottime_ns() +
			vcpu->kvm->arch.kvmclock_offset;
		break;
	default:
		return 1;
	}

	*data = ctr_val;
	return 0;
}

int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	bool fast_mode = idx & (1u << 31);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	u64 mask = fast_mode ? ~0u : ~0ull;

	if (!pmu->version)
		return 1;

	if (is_vmware_backdoor_pmc(idx))
		return kvm_pmu_rdpmc_vmware(vcpu, idx, data);

	pmc = kvm_x86_ops->pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask);
	if (!pmc)
		return 1;

	*data = pmc_read_counter(pmc) & mask;
	return 0;
}
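
/*
 * Note on "fast mode" above (illustration only): ECX bit 31 selects the
 * fast RDPMC variant, in which case mask starts out as ~0u and only the
 * low 32 bits of the counter are returned to the guest; otherwise the full
 * width-masked value is returned. rdpmc_ecx_to_pmc() may narrow the mask
 * further to the counter width of the vPMU model.
 */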

void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu))
		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
}

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
	return kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, msr) ||
		kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr);
}

static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, msr);

	if (pmc)
		__set_bit(pmc->idx, pmu->pmc_in_use);
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
{
	return kvm_x86_ops->pmu_ops->get_msr(vcpu, msr, data);
}

int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
	return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info);
}

/* Refresh PMU settings. This function is generally called when the
 * underlying settings change (such as an update to the vCPU's PMU-related
 * CPUID information), which should rarely happen.
 */
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->pmu_ops->refresh(vcpu);
}

void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	irq_work_sync(&pmu->irq_work);
	kvm_x86_ops->pmu_ops->reset(vcpu);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	memset(pmu, 0, sizeof(*pmu));
	kvm_x86_ops->pmu_ops->init(vcpu);
	init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
	pmu->event_count = 0;
	pmu->need_cleanup = false;
	kvm_pmu_refresh(vcpu);
}

static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);

	if (pmc_is_fixed(pmc))
		return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
					pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;

	return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
}

/* Release perf_events for vPMCs that have been unused for a full time slice. */
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = NULL;
	DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
	int i;

	pmu->need_cleanup = false;

	bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
		      pmu->pmc_in_use, X86_PMC_IDX_MAX);

	for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
		pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, i);

		if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
			pmc_stop_counter(pmc);
	}

	bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_pmu_reset(vcpu);
}

int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
	struct kvm_pmu_event_filter tmp, *filter;
	size_t size;
	int r;

	if (copy_from_user(&tmp, argp, sizeof(tmp)))
		return -EFAULT;

	if (tmp.action != KVM_PMU_EVENT_ALLOW &&
	    tmp.action != KVM_PMU_EVENT_DENY)
		return -EINVAL;

	if (tmp.flags != 0)
		return -EINVAL;

	if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
		return -E2BIG;

	size = struct_size(filter, events, tmp.nevents);
	filter = kmalloc(size, GFP_KERNEL_ACCOUNT);
	if (!filter)
		return -ENOMEM;

	r = -EFAULT;
	if (copy_from_user(filter, argp, size))
		goto cleanup;

	/* Ensure nevents can't be changed between the user copies. */
	*filter = tmp;

	mutex_lock(&kvm->lock);
	filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
				     mutex_is_locked(&kvm->lock));
	mutex_unlock(&kvm->lock);

	synchronize_srcu_expedited(&kvm->srcu);
	r = 0;
cleanup:
	kfree(filter);
	return r;
}
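
/*
 * Illustrative userspace usage of the ioctl above (a sketch, not part of
 * KVM itself; vm_fd is assumed to be an open VM file descriptor):
 *
 *	struct kvm_pmu_event_filter *f;
 *
 *	f = calloc(1, sizeof(*f) + sizeof(__u64));
 *	f->action = KVM_PMU_EVENT_DENY;
 *	f->nevents = 1;
 *	f->events[0] = 0x003c;	// event select 0x3c, unit mask 0x00
 *	ioctl(vm_fd, KVM_SET_PMU_EVENT_FILTER, f);
 *
 * The event encoding matches the eventsel comparison in
 * reprogram_gp_counter(): event select in bits 7:0 (plus 35:32 for AMD's
 * extended event select) and unit mask in bits 15:8.
 */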