// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
 *
 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@redhat.com>
 *   Gleb Natapov <gleb@redhat.com>
 *   Wei Huang    <wei@redhat.com>
 */

#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <asm/perf_event.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"

/* NOTE:
 * - Each perf counter is defined as "struct kvm_pmc";
 * - There are two types of perf counters: general purpose (gp) and fixed.
 *   gp counters are stored in gp_counters[] and fixed counters are stored
 *   in fixed_counters[] respectively. Both of them are part of "struct
 *   kvm_pmu";
 * - pmu.c understands the difference between gp counters and fixed counters.
 *   However AMD doesn't support fixed-counters;
 * - There are three types of index to access perf counters (PMC):
 *   1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
 *      has MSR_K7_PERFCTRn.
 *   2. MSR Index (named idx): This normally is used by the RDPMC
 *      instruction. For instance the AMD RDPMC instruction uses 0000_0003h
 *      in ECX to access C001_0007h (MSR_K7_PERFCTR3). Intel has a similar
 *      mechanism, except that it also supports fixed counters. idx can be
 *      used as an index into the gp and fixed counters.
 *   3. Global PMC Index (named pmc): pmc is an index specific to PMU
 *      code. Each pmc, stored in the kvm_pmc.idx field, is unique across
 *      all perf counters (both gp and fixed). The mapping relationship
 *      between pmc and perf counters is as follows:
 *      * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters
 *               [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
 *      * AMD:   [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
 */
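/*
 * Deliver a deferred PMI from irq_work context: kvm_perf_overflow_intr()
 * queues this work when the overflow NMI arrives while the vcpu is not in
 * guest mode, since waking the vcpu is impossible from NMI context.
 */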
static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
{
	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);

	kvm_pmu_deliver_pmi(vcpu);
}

static void kvm_perf_overflow(struct perf_event *perf_event,
			      struct perf_sample_data *data,
			      struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);

	if (!test_and_set_bit(pmc->idx,
			      (unsigned long *)&pmu->reprogram_pmi)) {
		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
	}
}

static void kvm_perf_overflow_intr(struct perf_event *perf_event,
				   struct perf_sample_data *data,
				   struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);

	if (!test_and_set_bit(pmc->idx,
			      (unsigned long *)&pmu->reprogram_pmi)) {
		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);

		/*
		 * Inject PMI. If the vcpu was in guest mode when the NMI
		 * arrived, the PMI can be injected on guest-mode re-entry.
		 * Otherwise we can't be sure the vcpu wasn't executing a hlt
		 * instruction at the time of the vmexit, and won't re-enter
		 * guest mode until woken up. So we should wake it, but that
		 * is impossible from NMI context. Do it from irq work
		 * instead.
		 */
		if (!kvm_is_in_guest())
			irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
		else
			kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
	}
}
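/*
 * (Re)create the host perf_event backing a guest counter. The sample
 * period is the guest counter's distance from overflow: for example, a
 * 48-bit counter programmed to 0xfffffffffff0 gives
 * (-pmc->counter) & pmc_bitmask(pmc) == 0x10, so the host event overflows
 * exactly when the guest counter would wrap.
 */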
static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
				  unsigned config, bool exclude_user,
				  bool exclude_kernel, bool intr,
				  bool in_tx, bool in_tx_cp)
{
	struct perf_event *event;
	struct perf_event_attr attr = {
		.type = type,
		.size = sizeof(attr),
		.pinned = true,
		.exclude_idle = true,
		.exclude_host = 1,
		.exclude_user = exclude_user,
		.exclude_kernel = exclude_kernel,
		.config = config,
	};

	attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);

	if (in_tx)
		attr.config |= HSW_IN_TX;
	if (in_tx_cp) {
		/*
		 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
		 * period. Just clear the sample period so at least
		 * allocating the counter doesn't fail.
		 */
		attr.sample_period = 0;
		attr.config |= HSW_IN_TX_CHECKPOINTED;
	}

	event = perf_event_create_kernel_counter(&attr, -1, current,
						 intr ? kvm_perf_overflow_intr :
						 kvm_perf_overflow, pmc);
	if (IS_ERR(event)) {
		printk_once("kvm_pmu: event creation failed %ld\n",
			    PTR_ERR(event));
		return;
	}

	pmc->perf_event = event;
	clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
}
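/*
 * Program a gp counter from the guest's eventsel MSR value. A plain
 * event-select/unit-mask pair (no edge/inv/cmask/TSX modifiers) is mapped
 * to a generic perf hardware event via find_arch_event(); everything else
 * is passed to perf as a raw event.
 */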
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
	unsigned config, type = PERF_TYPE_RAW;
	u8 event_select, unit_mask;

	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
		printk_once("kvm pmu: pin control bit is ignored\n");

	pmc->eventsel = eventsel;

	pmc_stop_counter(pmc);

	if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
		return;

	event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
	unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;

	if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
			  ARCH_PERFMON_EVENTSEL_INV |
			  ARCH_PERFMON_EVENTSEL_CMASK |
			  HSW_IN_TX |
			  HSW_IN_TX_CHECKPOINTED))) {
		config = kvm_x86_ops->pmu_ops->find_arch_event(pmc_to_pmu(pmc),
							       event_select,
							       unit_mask);
		if (config != PERF_COUNT_HW_MAX)
			type = PERF_TYPE_HARDWARE;
	}

	if (type == PERF_TYPE_RAW)
		config = eventsel & X86_RAW_EVENT_MASK;

	pmc_reprogram_counter(pmc, type, config,
			      !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
			      !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
			      eventsel & ARCH_PERFMON_EVENTSEL_INT,
			      (eventsel & HSW_IN_TX),
			      (eventsel & HSW_IN_TX_CHECKPOINTED));
}
EXPORT_SYMBOL_GPL(reprogram_gp_counter);

void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
{
	unsigned en_field = ctrl & 0x3;
	bool pmi = ctrl & 0x8;

	pmc_stop_counter(pmc);

	if (!en_field || !pmc_is_enabled(pmc))
		return;

	pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
			      kvm_x86_ops->pmu_ops->find_fixed_event(idx),
			      !(en_field & 0x2), /* exclude user */
			      !(en_field & 0x1), /* exclude kernel */
			      pmi, false, false);
}
EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
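/*
 * Reprogram a counter by its global PMC index (see the NOTE at the top of
 * this file): gp counters are reprogrammed from their saved eventsel,
 * fixed counters from their field in the cached fixed_ctr_ctrl MSR value.
 */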
void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
{
	struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);

	if (!pmc)
		return;

	if (pmc_is_gp(pmc))
		reprogram_gp_counter(pmc, pmc->eventsel);
	else {
		int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
		u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);

		reprogram_fixed_counter(pmc, ctrl, idx);
	}
}
EXPORT_SYMBOL_GPL(reprogram_counter);

void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	u64 bitmask;
	int bit;

	bitmask = pmu->reprogram_pmi;

	for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
		struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit);

		if (unlikely(!pmc || !pmc->perf_event)) {
			clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
			continue;
		}

		reprogram_counter(pmu, bit);
	}
}

/* check if idx is a valid index to access PMU */
int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
{
	return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx);
}

bool is_vmware_backdoor_pmc(u32 pmc_idx)
{
	switch (pmc_idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		return true;
	}
	return false;
}

static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	u64 ctr_val;

	switch (idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
		ctr_val = rdtsc();
		break;
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
		ctr_val = ktime_get_boot_ns();
		break;
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		ctr_val = ktime_get_boot_ns() +
			vcpu->kvm->arch.kvmclock_offset;
		break;
	default:
		return 1;
	}

	*data = ctr_val;
	return 0;
}
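/*
 * Emulate guest RDPMC. Bit 31 of the guest's ECX selects "fast" mode,
 * which truncates the result to 32 bits; the VMware backdoor PMC indices
 * are serviced without touching the regular counters.
 */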
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	bool fast_mode = idx & (1u << 31);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	u64 mask = fast_mode ? ~0u : ~0ull;

	if (!pmu->version)
		return 1;

	if (is_vmware_backdoor_pmc(idx))
		return kvm_pmu_rdpmc_vmware(vcpu, idx, data);

	pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx, &mask);
	if (!pmc)
		return 1;

	*data = pmc_read_counter(pmc) & mask;
	return 0;
}

void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu))
		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
}

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
	return kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr);
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
{
	return kvm_x86_ops->pmu_ops->get_msr(vcpu, msr, data);
}

int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info);
}

/* Refresh PMU settings. This function is generally called when underlying
 * settings change (such as a change of the guest's PMU CPUID), which
 * should rarely happen.
 */
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->pmu_ops->refresh(vcpu);
}

void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	irq_work_sync(&pmu->irq_work);
	kvm_x86_ops->pmu_ops->reset(vcpu);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	memset(pmu, 0, sizeof(*pmu));
	kvm_x86_ops->pmu_ops->init(vcpu);
	init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
	kvm_pmu_refresh(vcpu);
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_pmu_reset(vcpu);
}