xref: /openbmc/linux/arch/x86/kvm/pmu.c (revision 1cb8f3e2d8fe7533c26df9925a83bd3d185b312e)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * Kernel-based Virtual Machine -- Performance Monitoring Unit support
4   *
5   * Copyright 2015 Red Hat, Inc. and/or its affiliates.
6   *
7   * Authors:
8   *   Avi Kivity   <avi@redhat.com>
9   *   Gleb Natapov <gleb@redhat.com>
10   *   Wei Huang    <wei@redhat.com>
11   */
12  
13  #include <linux/types.h>
14  #include <linux/kvm_host.h>
15  #include <linux/perf_event.h>
16  #include <asm/perf_event.h>
17  #include "x86.h"
18  #include "cpuid.h"
19  #include "lapic.h"
20  #include "pmu.h"
21  
22  /* This is enough to filter the vast majority of currently defined events. */
23  #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300
24  
25  /* NOTE:
26   * - Each perf counter is defined as "struct kvm_pmc";
27   * - There are two types of perf counters: general purpose (gp) and fixed.
28   *   gp counters are stored in gp_counters[] and fixed counters are stored
29   *   in fixed_counters[]. Both arrays are part of "struct kvm_pmu";
30   * - pmu.c understands the difference between gp counters and fixed counters;
31   *   however, AMD doesn't support fixed counters;
32   * - There are three kinds of index used to access a perf counter (PMC):
33   *     1. MSR (named msr): For example, Intel has MSR_IA32_PERFCTRn and AMD
34   *        has MSR_K7_PERFCTRn.
35   *     2. MSR Index (named idx): This is normally used by the RDPMC
36   *        instruction. For instance, the AMD RDPMC instruction uses
37   *        0000_0003h in ECX to access C001_0007h (MSR_K7_PERFCTR3). Intel
38   *        has a similar mechanism, except that it also supports fixed
39   *        counters. idx can be used as an index into the gp and fixed
40   *        counters.
41   *     3. Global PMC Index (named pmc): pmc is an index specific to the PMU
42   *        code. Each pmc, stored in the kvm_pmc.idx field, is unique across
43   *        all perf counters (both gp and fixed). The mapping between pmc
44   *        and perf counters is as follows:
45   *        * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters
46   *                 [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
47   *        * AMD:   [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
48   */
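
/*
 * Illustration, assuming the usual Intel definitions (INTEL_PMC_IDX_FIXED ==
 * 32, MSR_CORE_PERF_FIXED_CTR0 == 0x309): fixed counter 0 is reachable via
 * all three kinds of index:
 *   - msr: MSR_CORE_PERF_FIXED_CTR0, accessed with RDMSR/WRMSR;
 *   - idx: 0x40000000, i.e. ECX with bit 30 set when executing RDPMC;
 *   - pmc: kvm_pmc.idx == 32, the global index used internally by pmu.c.
 */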
49  
50  static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
51  {
52  	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
53  	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
54  
55  	kvm_pmu_deliver_pmi(vcpu);
56  }
57  
58  static void kvm_perf_overflow(struct perf_event *perf_event,
59  			      struct perf_sample_data *data,
60  			      struct pt_regs *regs)
61  {
62  	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
63  	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
64  
65  	if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
66  		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
67  		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
68  	}
69  }
70  
71  static void kvm_perf_overflow_intr(struct perf_event *perf_event,
72  				   struct perf_sample_data *data,
73  				   struct pt_regs *regs)
74  {
75  	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
76  	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
77  
78  	if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
79  		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
80  		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
81  
82  		/*
83  		 * Inject PMI. If the vcpu was in guest mode during the NMI, the
84  		 * PMI can be injected on the next guest mode entry. Otherwise we
85  		 * can't be sure that the vcpu wasn't executing a hlt instruction
86  		 * at the time of the vmexit and thus won't re-enter guest mode
87  		 * until it is woken up. We should wake it, but that is impossible
88  		 * from NMI context. Do it from irq work instead.
89  		 */
90  		if (!kvm_is_in_guest())
91  			irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
92  		else
93  			kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
94  	}
95  }
96  
97  static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
98  				  unsigned config, bool exclude_user,
99  				  bool exclude_kernel, bool intr,
100  				  bool in_tx, bool in_tx_cp)
101  {
102  	struct perf_event *event;
103  	struct perf_event_attr attr = {
104  		.type = type,
105  		.size = sizeof(attr),
106  		.pinned = true,
107  		.exclude_idle = true,
108  		.exclude_host = 1,
109  		.exclude_user = exclude_user,
110  		.exclude_kernel = exclude_kernel,
111  		.config = config,
112  	};
113  
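	/*
	 * get_sample_period() (see pmu.h) derives the period from the current
	 * counter value, essentially (-counter) & pmc_bitmask(pmc): a guest
	 * that programs a 48-bit counter to -1000 gets a sample period of
	 * 1000, so the backing perf event overflows exactly when the virtual
	 * counter would wrap.
	 */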
114  	attr.sample_period = get_sample_period(pmc, pmc->counter);
115  
116  	if (in_tx)
117  		attr.config |= HSW_IN_TX;
118  	if (in_tx_cp) {
119  		/*
120  		 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
121  		 * period. Just clear the sample period so at least
122  		 * allocating the counter doesn't fail.
123  		 */
124  		attr.sample_period = 0;
125  		attr.config |= HSW_IN_TX_CHECKPOINTED;
126  	}
127  
128  	event = perf_event_create_kernel_counter(&attr, -1, current,
129  						 intr ? kvm_perf_overflow_intr :
130  						 kvm_perf_overflow, pmc);
131  	if (IS_ERR(event)) {
132  		pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
133  			    PTR_ERR(event), pmc->idx);
134  		return;
135  	}
136  
137  	pmc->perf_event = event;
138  	pmc_to_pmu(pmc)->event_count++;
139  	clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
140  }
141  
142  static void pmc_pause_counter(struct kvm_pmc *pmc)
143  {
144  	u64 counter = pmc->counter;
145  
146  	if (!pmc->perf_event)
147  		return;
148  
149  	/* update counter, reset event value to avoid redundant accumulation */
150  	counter += perf_event_pause(pmc->perf_event, true);
151  	pmc->counter = counter & pmc_bitmask(pmc);
152  }
153  
154  static bool pmc_resume_counter(struct kvm_pmc *pmc)
155  {
156  	if (!pmc->perf_event)
157  		return false;
158  
159  	/* recalibrate sample period and check if it's accepted by perf core */
160  	if (perf_event_period(pmc->perf_event,
161  			      get_sample_period(pmc, pmc->counter)))
162  		return false;
163  
164  	/* Reuse the perf_event; it serves the same purpose as in pmc_reprogram_counter(). */
165  	perf_event_enable(pmc->perf_event);
166  
167  	clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
168  	return true;
169  }
170  
171  void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
172  {
173  	unsigned config, type = PERF_TYPE_RAW;
174  	u8 event_select, unit_mask;
175  	struct kvm *kvm = pmc->vcpu->kvm;
176  	struct kvm_pmu_event_filter *filter;
177  	int i;
178  	bool allow_event = true;
179  
180  	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
181  		printk_once("kvm pmu: pin control bit is ignored\n");
182  
183  	pmc->eventsel = eventsel;
184  
185  	pmc_pause_counter(pmc);
186  
187  	if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
188  		return;
189  
190  	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
191  	if (filter) {
192  		for (i = 0; i < filter->nevents; i++)
193  			if (filter->events[i] ==
194  			    (eventsel & AMD64_RAW_EVENT_MASK_NB))
195  				break;
196  		if (filter->action == KVM_PMU_EVENT_ALLOW &&
197  		    i == filter->nevents)
198  			allow_event = false;
199  		if (filter->action == KVM_PMU_EVENT_DENY &&
200  		    i < filter->nevents)
201  			allow_event = false;
202  	}
203  	if (!allow_event)
204  		return;
205  
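	/*
	 * Worked example: a guest write of 0x004300c0 to an event select MSR
	 * decomposes below as event_select = 0xc0 and unit_mask = 0x00 (the
	 * architectural "instructions retired" event), with USR (bit 16),
	 * OS (bit 17) and EN (bit 22) set and INT (bit 20) clear, i.e. count
	 * in both guest user and guest kernel mode and raise no PMI.
	 */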
206  	event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
207  	unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
208  
209  	if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
210  			  ARCH_PERFMON_EVENTSEL_INV |
211  			  ARCH_PERFMON_EVENTSEL_CMASK |
212  			  HSW_IN_TX |
213  			  HSW_IN_TX_CHECKPOINTED))) {
214  		config = kvm_x86_ops.pmu_ops->find_arch_event(pmc_to_pmu(pmc),
215  						      event_select,
216  						      unit_mask);
217  		if (config != PERF_COUNT_HW_MAX)
218  			type = PERF_TYPE_HARDWARE;
219  	}
220  
221  	if (type == PERF_TYPE_RAW)
222  		config = eventsel & X86_RAW_EVENT_MASK;
223  
224  	if (pmc->current_config == eventsel && pmc_resume_counter(pmc))
225  		return;
226  
227  	pmc_release_perf_event(pmc);
228  
229  	pmc->current_config = eventsel;
230  	pmc_reprogram_counter(pmc, type, config,
231  			      !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
232  			      !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
233  			      eventsel & ARCH_PERFMON_EVENTSEL_INT,
234  			      (eventsel & HSW_IN_TX),
235  			      (eventsel & HSW_IN_TX_CHECKPOINTED));
236  }
237  EXPORT_SYMBOL_GPL(reprogram_gp_counter);
238  
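/*
 * The "ctrl" argument below is the 4-bit per-counter field taken from the
 * guest's fixed counter control MSR: bit 0 enables counting in ring 0,
 * bit 1 enables counting in ring 3, bit 3 requests a PMI on overflow
 * (bit 2, AnyThread, is not consumed here).  For example, ctrl == 0xb yields
 * en_field == 3 (count user and kernel) with pmi == true, while ctrl == 0x1
 * counts kernel only and raises no interrupt.
 */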
239  void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
240  {
241  	unsigned en_field = ctrl & 0x3;
242  	bool pmi = ctrl & 0x8;
243  	struct kvm_pmu_event_filter *filter;
244  	struct kvm *kvm = pmc->vcpu->kvm;
245  
246  	pmc_pause_counter(pmc);
247  
248  	if (!en_field || !pmc_is_enabled(pmc))
249  		return;
250  
251  	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
252  	if (filter) {
253  		if (filter->action == KVM_PMU_EVENT_DENY &&
254  		    test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
255  			return;
256  		if (filter->action == KVM_PMU_EVENT_ALLOW &&
257  		    !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
258  			return;
259  	}
260  
261  	if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc))
262  		return;
263  
264  	pmc_release_perf_event(pmc);
265  
266  	pmc->current_config = (u64)ctrl;
267  	pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
268  			      kvm_x86_ops.pmu_ops->find_fixed_event(idx),
269  			      !(en_field & 0x2), /* exclude user */
270  			      !(en_field & 0x1), /* exclude kernel */
271  			      pmi, false, false);
272  }
273  EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
274  
275  void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
276  {
277  	struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);
278  
279  	if (!pmc)
280  		return;
281  
282  	if (pmc_is_gp(pmc))
283  		reprogram_gp_counter(pmc, pmc->eventsel);
284  	else {
285  		int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
286  		u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);
287  
288  		reprogram_fixed_counter(pmc, ctrl, idx);
289  	}
290  }
291  EXPORT_SYMBOL_GPL(reprogram_counter);
292  
293  void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
294  {
295  	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
296  	int bit;
297  
298  	for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
299  		struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, bit);
300  
301  		if (unlikely(!pmc || !pmc->perf_event)) {
302  			clear_bit(bit, pmu->reprogram_pmi);
303  			continue;
304  		}
305  
306  		reprogram_counter(pmu, bit);
307  	}
308  
309  	/*
310  	 * Unused perf_events are only released if the corresponding MSRs
311  	 * weren't accessed during the last vCPU time slice. kvm_arch_sched_in
312  	 * triggers KVM_REQ_PMU if cleanup is needed.
313  	 */
314  	if (unlikely(pmu->need_cleanup))
315  		kvm_pmu_cleanup(vcpu);
316  }
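
/*
 * The need_cleanup flag consumed above is set on vCPU sched-in (see
 * kvm_arch_sched_in() in x86.c), which at this revision does, in outline:
 *
 *	if (pmu->version && unlikely(pmu->event_count)) {
 *		pmu->need_cleanup = true;
 *		kvm_make_request(KVM_REQ_PMU, vcpu);
 *	}
 *
 * so stale perf_events are torn down on the first KVM_REQ_PMU after a
 * reschedule rather than immediately.
 */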
317  
318  /* Check whether idx (the raw RDPMC ECX value) is a valid index into this vCPU's PMU. */
319  int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
320  {
321  	return kvm_x86_ops.pmu_ops->is_valid_rdpmc_ecx(vcpu, idx);
322  }
323  
324  bool is_vmware_backdoor_pmc(u32 pmc_idx)
325  {
326  	switch (pmc_idx) {
327  	case VMWARE_BACKDOOR_PMC_HOST_TSC:
328  	case VMWARE_BACKDOOR_PMC_REAL_TIME:
329  	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
330  		return true;
331  	}
332  	return false;
333  }
334  
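/*
 * Emulate RDPMC for the VMware backdoor pseudo-counters.  For example, a
 * guest executing RDPMC with ECX == VMWARE_BACKDOOR_PMC_REAL_TIME (0x10001,
 * defined in pmu.h) receives nanoseconds since host boot, and APPARENT_TIME
 * additionally applies the per-VM kvmclock_offset so the value tracks guest
 * time.
 */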
335  static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
336  {
337  	u64 ctr_val;
338  
339  	switch (idx) {
340  	case VMWARE_BACKDOOR_PMC_HOST_TSC:
341  		ctr_val = rdtsc();
342  		break;
343  	case VMWARE_BACKDOOR_PMC_REAL_TIME:
344  		ctr_val = ktime_get_boottime_ns();
345  		break;
346  	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
347  		ctr_val = ktime_get_boottime_ns() +
348  			vcpu->kvm->arch.kvmclock_offset;
349  		break;
350  	default:
351  		return 1;
352  	}
353  
354  	*data = ctr_val;
355  	return 0;
356  }
357  
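/*
 * Emulate RDPMC.  Bit 31 of the guest's ECX requests a "fast" read: e.g.
 * ECX == 0x80000001 selects counter 1 but the mask below truncates the
 * result to the low 32 bits, whereas ECX == 1 returns the full counter
 * value (subject to the width mask supplied by rdpmc_ecx_to_pmc).
 */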
358  int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
359  {
360  	bool fast_mode = idx & (1u << 31);
361  	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
362  	struct kvm_pmc *pmc;
363  	u64 mask = fast_mode ? ~0u : ~0ull;
364  
365  	if (!pmu->version)
366  		return 1;
367  
368  	if (is_vmware_backdoor_pmc(idx))
369  		return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
370  
371  	pmc = kvm_x86_ops.pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask);
372  	if (!pmc)
373  		return 1;
374  
375  	if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) &&
376  	    (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
377  	    (kvm_read_cr0(vcpu) & X86_CR0_PE))
378  		return 1;
379  
380  	*data = pmc_read_counter(pmc) & mask;
381  	return 0;
382  }
383  
384  void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
385  {
386  	if (lapic_in_kernel(vcpu)) {
387  		if (kvm_x86_ops.pmu_ops->deliver_pmi)
388  			kvm_x86_ops.pmu_ops->deliver_pmi(vcpu);
389  		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
390  	}
391  }
392  
393  bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
394  {
395  	return kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr) ||
396  		kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, msr);
397  }
398  
399  static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
400  {
401  	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
402  	struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr);
403  
404  	if (pmc)
405  		__set_bit(pmc->idx, pmu->pmc_in_use);
406  }
407  
408  int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
409  {
410  	return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr_info);
411  }
412  
413  int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
414  {
415  	kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
416  	return kvm_x86_ops.pmu_ops->set_msr(vcpu, msr_info);
417  }
418  
419  /* Refresh PMU settings. This function is generally called when the
420   * underlying settings change (such as the guest's PMU-related CPUID
421   * being updated), which should happen rarely.
422   */
423  void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
424  {
425  	kvm_x86_ops.pmu_ops->refresh(vcpu);
426  }
427  
428  void kvm_pmu_reset(struct kvm_vcpu *vcpu)
429  {
430  	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
431  
432  	irq_work_sync(&pmu->irq_work);
433  	kvm_x86_ops.pmu_ops->reset(vcpu);
434  }
435  
436  void kvm_pmu_init(struct kvm_vcpu *vcpu)
437  {
438  	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
439  
440  	memset(pmu, 0, sizeof(*pmu));
441  	kvm_x86_ops.pmu_ops->init(vcpu);
442  	init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
443  	pmu->event_count = 0;
444  	pmu->need_cleanup = false;
445  	kvm_pmu_refresh(vcpu);
446  }
447  
448  static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
449  {
450  	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
451  
452  	if (pmc_is_fixed(pmc))
453  		return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
454  			pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;
455  
456  	return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
457  }
458  
459  /* Release perf_events for vPMCs that have been unused for a full time slice. */
460  void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
461  {
462  	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
463  	struct kvm_pmc *pmc = NULL;
464  	DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
465  	int i;
466  
467  	pmu->need_cleanup = false;
468  
469  	bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
470  		      pmu->pmc_in_use, X86_PMC_IDX_MAX);
471  
472  	for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
473  		pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i);
474  
475  		if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
476  			pmc_stop_counter(pmc);
477  	}
478  
479  	if (kvm_x86_ops.pmu_ops->cleanup)
480  		kvm_x86_ops.pmu_ops->cleanup(vcpu);
481  
482  	bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
483  }
484  
485  void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
486  {
487  	kvm_pmu_reset(vcpu);
488  }
489  
490  int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
491  {
492  	struct kvm_pmu_event_filter tmp, *filter;
493  	size_t size;
494  	int r;
495  
496  	if (copy_from_user(&tmp, argp, sizeof(tmp)))
497  		return -EFAULT;
498  
499  	if (tmp.action != KVM_PMU_EVENT_ALLOW &&
500  	    tmp.action != KVM_PMU_EVENT_DENY)
501  		return -EINVAL;
502  
503  	if (tmp.flags != 0)
504  		return -EINVAL;
505  
506  	if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
507  		return -E2BIG;
508  
509  	size = struct_size(filter, events, tmp.nevents);
510  	filter = kmalloc(size, GFP_KERNEL_ACCOUNT);
511  	if (!filter)
512  		return -ENOMEM;
513  
514  	r = -EFAULT;
515  	if (copy_from_user(filter, argp, size))
516  		goto cleanup;
517  
518  	/* Ensure nevents can't be changed between the user copies. */
519  	*filter = tmp;
520  
521  	mutex_lock(&kvm->lock);
522  	filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
523  				     mutex_is_locked(&kvm->lock));
524  	mutex_unlock(&kvm->lock);
525  
526  	synchronize_srcu_expedited(&kvm->srcu);
527  	r = 0;
528  cleanup:
529  	kfree(filter);
530  	return r;
531  }
532
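
/*
 * Userspace usage sketch for the filter ioctl handled above (illustrative;
 * KVM_SET_PMU_EVENT_FILTER and struct kvm_pmu_event_filter come from the KVM
 * uapi headers, and vm_fd is assumed to be an open VM file descriptor).  It
 * installs an allow-list permitting only the "instructions retired" event
 * (event select 0xc0, unit mask 0x00); entries are matched against
 * eventsel & AMD64_RAW_EVENT_MASK_NB, i.e. event select plus unit mask.
 *
 *	struct kvm_pmu_event_filter *filter;
 *	size_t sz = sizeof(*filter) + 1 * sizeof(filter->events[0]);
 *
 *	filter = calloc(1, sz);			(zeroes flags and padding)
 *	filter->action = KVM_PMU_EVENT_ALLOW;
 *	filter->nevents = 1;
 *	filter->events[0] = 0xc0;
 *	filter->fixed_counter_bitmap = 0x7;	(keep the fixed counters allowed)
 *	if (ioctl(vm_fd, KVM_SET_PMU_EVENT_FILTER, filter))
 *		err(1, "KVM_SET_PMU_EVENT_FILTER");
 *	free(filter);
 */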