xref: /openbmc/linux/arch/x86/kvm/pmu.c (revision e644896f)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
 *
 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@redhat.com>
 *   Gleb Natapov <gleb@redhat.com>
 *   Wei Huang    <wei@redhat.com>
 */

#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <asm/perf_event.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"

/* This is enough to filter the vast majority of currently defined events. */
#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300

/* NOTE:
 * - Each perf counter is defined as "struct kvm_pmc";
 * - There are two types of perf counters: general purpose (gp) and fixed.
 *   gp counters are stored in gp_counters[] and fixed counters are stored
 *   in fixed_counters[] respectively. Both of them are part of "struct
 *   kvm_pmu";
 * - pmu.c understands the difference between gp counters and fixed counters.
 *   However, AMD doesn't support fixed counters;
 * - There are three types of index used to access perf counters (PMC):
 *     1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
 *        has MSR_K7_PERFCTRn.
 *     2. MSR Index (named idx): This is normally used by the RDPMC
 *        instruction. For instance AMD RDPMC uses 0000_0003h in ECX to
 *        access C001_0007h (MSR_K7_PERFCTR3). Intel has a similar
 *        mechanism, except that it also supports fixed counters. idx can
 *        be used as an index into the gp and fixed counters.
 *     3. Global PMC Index (named pmc): pmc is an index specific to PMU
 *        code. Each pmc, stored in the kvm_pmc.idx field, is unique across
 *        all perf counters (both gp and fixed). The mapping between pmc
 *        and perf counters is as follows:
 *        * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters
 *                 [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
 *        * AMD:   [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
 */
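
/*
 * Illustrative example of the three index types above (assumes the usual
 * Intel definitions, e.g. INTEL_PMC_IDX_FIXED == 32): Intel gp counter 0
 * is reached via the MSR MSR_IA32_PERFCTR0, via RDPMC with idx == 0, and
 * via global pmc index 0; fixed counter 0 is reached via
 * MSR_CORE_PERF_FIXED_CTR0, via RDPMC with idx == 0x4000_0000 (bit 30
 * selects the fixed range), and via global pmc index INTEL_PMC_IDX_FIXED.
 */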

static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
{
	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);

	kvm_pmu_deliver_pmi(vcpu);
}

static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);

	/* Ignore counters that have been reprogrammed already. */
	if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
		return;

	__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
	kvm_make_request(KVM_REQ_PMU, pmc->vcpu);

	if (!pmc->intr)
		return;

	/*
	 * Inject PMI. If the vCPU was in guest mode when the NMI arrived,
	 * the PMI can be injected on the next guest-mode re-entry.
	 * Otherwise we can't be sure the vCPU wasn't executing a hlt
	 * instruction at the time of the vmexit, in which case it won't
	 * re-enter guest mode until it is woken up. So we should wake it,
	 * but that is impossible from NMI context. Do it from irq work
	 * instead.
	 */
	if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu))
		irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
	else
		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}

static void kvm_perf_overflow(struct perf_event *perf_event,
			      struct perf_sample_data *data,
			      struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;

	__kvm_perf_overflow(pmc, true);
}

static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
				  u64 config, bool exclude_user,
				  bool exclude_kernel, bool intr)
{
	struct perf_event *event;
	struct perf_event_attr attr = {
		.type = type,
		.size = sizeof(attr),
		.pinned = true,
		.exclude_idle = true,
		.exclude_host = 1,
		.exclude_user = exclude_user,
		.exclude_kernel = exclude_kernel,
		.config = config,
	};

	if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX)
		return;

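	/*
	 * Note: get_sample_period() (defined in pmu.h at this revision)
	 * derives the sample period from the guest-visible counter value,
	 * roughly (-counter) truncated to the counter's width, i.e. the
	 * number of events remaining until the counter overflows.
	 */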
	attr.sample_period = get_sample_period(pmc, pmc->counter);

	if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
	    guest_cpuid_is_intel(pmc->vcpu)) {
		/*
		 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
		 * period. Just clear the sample period so at least
		 * allocating the counter doesn't fail.
		 */
		attr.sample_period = 0;
	}

	event = perf_event_create_kernel_counter(&attr, -1, current,
						 kvm_perf_overflow, pmc);
	if (IS_ERR(event)) {
		pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
			    PTR_ERR(event), pmc->idx);
		return;
	}

	pmc->perf_event = event;
	pmc_to_pmu(pmc)->event_count++;
	clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
	pmc->is_paused = false;
	pmc->intr = intr;
}

static void pmc_pause_counter(struct kvm_pmc *pmc)
{
	u64 counter = pmc->counter;

	if (!pmc->perf_event || pmc->is_paused)
		return;

	/* update counter, reset event value to avoid redundant accumulation */
	counter += perf_event_pause(pmc->perf_event, true);
	pmc->counter = counter & pmc_bitmask(pmc);
	pmc->is_paused = true;
}

static bool pmc_resume_counter(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event)
		return false;

	/* recalibrate sample period and check if it's accepted by perf core */
	if (perf_event_period(pmc->perf_event,
			      get_sample_period(pmc, pmc->counter)))
		return false;

	/* reuse perf_event to serve as pmc_reprogram_counter() does */
	perf_event_enable(pmc->perf_event);
	pmc->is_paused = false;

	clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
	return true;
}

static int cmp_u64(const void *a, const void *b)
{
	/* Compare rather than subtract: the difference truncated to int
	 * can misorder values that differ only in their upper bits. */
	return (*(__u64 *)a > *(__u64 *)b) - (*(__u64 *)a < *(__u64 *)b);
}

void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
	u64 config;
	u32 type = PERF_TYPE_RAW;
	struct kvm *kvm = pmc->vcpu->kvm;
	struct kvm_pmu_event_filter *filter;
	struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu);
	bool allow_event = true;

	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
		printk_once("kvm pmu: pin control bit is ignored\n");

	pmc->eventsel = eventsel;

	pmc_pause_counter(pmc);

	if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
		return;

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (filter) {
		__u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;

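		/*
		 * The filter key keeps only the event select and unit mask
		 * (AMD64_RAW_EVENT_MASK_NB drops the counter-control bits),
		 * so edge/invert/cmask settings don't affect the lookup.
		 */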
		if (bsearch(&key, filter->events, filter->nevents,
			    sizeof(__u64), cmp_u64))
			allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
		else
			allow_event = filter->action == KVM_PMU_EVENT_DENY;
	}
	if (!allow_event)
		return;

	if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
			  ARCH_PERFMON_EVENTSEL_INV |
			  ARCH_PERFMON_EVENTSEL_CMASK |
			  HSW_IN_TX |
			  HSW_IN_TX_CHECKPOINTED))) {
		config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc);
		if (config != PERF_COUNT_HW_MAX)
			type = PERF_TYPE_HARDWARE;
	}

	if (type == PERF_TYPE_RAW)
		config = eventsel & pmu->raw_event_mask;

	if (pmc->current_config == eventsel && pmc_resume_counter(pmc))
		return;

	pmc_release_perf_event(pmc);

	pmc->current_config = eventsel;
	pmc_reprogram_counter(pmc, type, config,
			      !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
			      !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
			      eventsel & ARCH_PERFMON_EVENTSEL_INT);
}
EXPORT_SYMBOL_GPL(reprogram_gp_counter);

void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
{
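	/*
	 * ctrl is this counter's 4-bit field from IA32_FIXED_CTR_CTRL:
	 * bits 0-1 enable counting at CPL0 and CPL>0 respectively, and
	 * bit 3 requests a PMI on overflow (bit 2, AnyThread, is not
	 * handled here).
	 */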
	unsigned en_field = ctrl & 0x3;
	bool pmi = ctrl & 0x8;
	struct kvm_pmu_event_filter *filter;
	struct kvm *kvm = pmc->vcpu->kvm;

	pmc_pause_counter(pmc);

	if (!en_field || !pmc_is_enabled(pmc))
		return;

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (filter) {
		if (filter->action == KVM_PMU_EVENT_DENY &&
		    test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
			return;
		if (filter->action == KVM_PMU_EVENT_ALLOW &&
		    !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
			return;
	}

	if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc))
		return;

	pmc_release_perf_event(pmc);

	pmc->current_config = (u64)ctrl;
	pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
			      kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc),
			      !(en_field & 0x2), /* exclude user */
			      !(en_field & 0x1), /* exclude kernel */
			      pmi);
}
EXPORT_SYMBOL_GPL(reprogram_fixed_counter);

void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
{
	struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);

	if (!pmc)
		return;

	if (pmc_is_gp(pmc))
		reprogram_gp_counter(pmc, pmc->eventsel);
	else {
		int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
		u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);

		reprogram_fixed_counter(pmc, ctrl, idx);
	}
}
EXPORT_SYMBOL_GPL(reprogram_counter);

void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	int bit;

	for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
		struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, bit);

		if (unlikely(!pmc || !pmc->perf_event)) {
			clear_bit(bit, pmu->reprogram_pmi);
			continue;
		}

		reprogram_counter(pmu, bit);
	}

	/*
	 * Unused perf_events are only released if the corresponding MSRs
	 * weren't accessed during the last vCPU time slice. kvm_arch_sched_in
	 * triggers KVM_REQ_PMU if cleanup is needed.
	 */
	if (unlikely(pmu->need_cleanup))
		kvm_pmu_cleanup(vcpu);
}

/* check if idx is a valid index to access PMU */
bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
	return kvm_x86_ops.pmu_ops->is_valid_rdpmc_ecx(vcpu, idx);
}

bool is_vmware_backdoor_pmc(u32 pmc_idx)
{
	switch (pmc_idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		return true;
	}
	return false;
}

static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	u64 ctr_val;

	switch (idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
		ctr_val = rdtsc();
		break;
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
		ctr_val = ktime_get_boottime_ns();
		break;
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		ctr_val = ktime_get_boottime_ns() +
			vcpu->kvm->arch.kvmclock_offset;
		break;
	default:
		return 1;
	}

	*data = ctr_val;
	return 0;
}

int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	bool fast_mode = idx & (1u << 31);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	u64 mask = fast_mode ? ~0u : ~0ull;
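	/*
	 * Bit 31 of the RDPMC index requests a "fast" read; in that case
	 * only the low 32 bits of the counter are returned (hence the
	 * narrower mask).
	 */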

	if (!pmu->version)
		return 1;

	if (is_vmware_backdoor_pmc(idx))
		return kvm_pmu_rdpmc_vmware(vcpu, idx, data);

	pmc = kvm_x86_ops.pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask);
	if (!pmc)
		return 1;

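	/*
	 * RDPMC at CPL > 0 in protected mode is only allowed when CR4.PCE
	 * is set; returning non-zero here is expected to make the caller
	 * inject #GP into the guest.
	 */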
	if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) &&
	    (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
	    (kvm_read_cr0(vcpu) & X86_CR0_PE))
		return 1;

	*data = pmc_read_counter(pmc) & mask;
	return 0;
}

void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu)) {
		if (kvm_x86_ops.pmu_ops->deliver_pmi)
			kvm_x86_ops.pmu_ops->deliver_pmi(vcpu);
		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
	}
}

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
	return kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr) ||
		kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, msr);
}

static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr);

	if (pmc)
		__set_bit(pmc->idx, pmu->pmc_in_use);
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr_info);
}

int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
	return kvm_x86_ops.pmu_ops->set_msr(vcpu, msr_info);
}

/* Refresh PMU settings. This is generally called when the underlying
 * settings change (such as the guest's PMU CPUID being updated), which
 * should rarely happen.
 */
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops.pmu_ops->refresh(vcpu);
}

void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	irq_work_sync(&pmu->irq_work);
	kvm_x86_ops.pmu_ops->reset(vcpu);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	memset(pmu, 0, sizeof(*pmu));
	kvm_x86_ops.pmu_ops->init(vcpu);
	init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
	pmu->event_count = 0;
	pmu->need_cleanup = false;
	kvm_pmu_refresh(vcpu);
}

static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);

	if (pmc_is_fixed(pmc))
		return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
			pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;

	return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
}

/* Release perf_events for vPMCs that have been unused for a full time slice. */
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = NULL;
	DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
	int i;

	pmu->need_cleanup = false;

	bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
		      pmu->pmc_in_use, X86_PMC_IDX_MAX);

	for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
		pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i);

		if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
			pmc_stop_counter(pmc);
	}

	if (kvm_x86_ops.pmu_ops->cleanup)
		kvm_x86_ops.pmu_ops->cleanup(vcpu);

	bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_pmu_reset(vcpu);
}

static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	u64 prev_count;

	prev_count = pmc->counter;
	pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);

	reprogram_counter(pmu, pmc->idx);
	if (pmc->counter < prev_count)
		__kvm_perf_overflow(pmc, false);
}

static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
	unsigned int perf_hw_id)
{
	u64 old_eventsel = pmc->eventsel;
	unsigned int config;

	pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK);
	config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc);
	pmc->eventsel = old_eventsel;
	return config == perf_hw_id;
}

static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
	bool select_os, select_user;
	u64 config = pmc->current_config;

	if (pmc_is_gp(pmc)) {
		select_os = config & ARCH_PERFMON_EVENTSEL_OS;
		select_user = config & ARCH_PERFMON_EVENTSEL_USR;
	} else {
		select_os = config & 0x1;
		select_user = config & 0x2;
	}

	return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
}

void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
		pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i);

		if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc))
			continue;

		/* Ignore checks for edge detect, pin control, invert and CMASK bits */
		if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc))
			kvm_pmu_incr_counter(pmc);
	}
}
EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);

int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
	struct kvm_pmu_event_filter tmp, *filter;
	size_t size;
	int r;

	if (copy_from_user(&tmp, argp, sizeof(tmp)))
		return -EFAULT;

	if (tmp.action != KVM_PMU_EVENT_ALLOW &&
	    tmp.action != KVM_PMU_EVENT_DENY)
		return -EINVAL;

	if (tmp.flags != 0)
		return -EINVAL;

	if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
		return -E2BIG;

	size = struct_size(filter, events, tmp.nevents);
	filter = kmalloc(size, GFP_KERNEL_ACCOUNT);
	if (!filter)
		return -ENOMEM;

	r = -EFAULT;
	if (copy_from_user(filter, argp, size))
		goto cleanup;

	/* Ensure nevents can't be changed between the user copies. */
	*filter = tmp;

	/*
	 * Sort the in-kernel list so that we can search it with bsearch.
	 */
	sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);

	mutex_lock(&kvm->lock);
	filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
				     mutex_is_locked(&kvm->lock));
	mutex_unlock(&kvm->lock);

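	/*
	 * Readers (reprogram_gp_counter()/reprogram_fixed_counter()) access
	 * the filter under kvm->srcu, so wait for them to finish before the
	 * old filter is freed at the cleanup label below.
	 */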
	synchronize_srcu_expedited(&kvm->srcu);
	r = 0;
cleanup:
	kfree(filter);
	return r;
}