xref: /openbmc/linux/virt/kvm/kvm_main.c (revision 34d6f206a88c2651d216bd3487ac956a40b2ba8e)
120c8ccb1SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
20fce5623SAvi Kivity /*
30fce5623SAvi Kivity  * Kernel-based Virtual Machine driver for Linux
40fce5623SAvi Kivity  *
50fce5623SAvi Kivity  * This module enables machines with Intel VT-x extensions to run virtual
60fce5623SAvi Kivity  * machines without emulation or binary translation.
70fce5623SAvi Kivity  *
80fce5623SAvi Kivity  * Copyright (C) 2006 Qumranet, Inc.
99611c187SNicolas Kaiser  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
100fce5623SAvi Kivity  *
110fce5623SAvi Kivity  * Authors:
120fce5623SAvi Kivity  *   Avi Kivity   <avi@qumranet.com>
130fce5623SAvi Kivity  *   Yaniv Kamay  <yaniv@qumranet.com>
140fce5623SAvi Kivity  */
150fce5623SAvi Kivity 
16af669ac6SAndre Przywara #include <kvm/iodev.h>
170fce5623SAvi Kivity 
180fce5623SAvi Kivity #include <linux/kvm_host.h>
190fce5623SAvi Kivity #include <linux/kvm.h>
200fce5623SAvi Kivity #include <linux/module.h>
210fce5623SAvi Kivity #include <linux/errno.h>
220fce5623SAvi Kivity #include <linux/percpu.h>
230fce5623SAvi Kivity #include <linux/mm.h>
240fce5623SAvi Kivity #include <linux/miscdevice.h>
250fce5623SAvi Kivity #include <linux/vmalloc.h>
260fce5623SAvi Kivity #include <linux/reboot.h>
270fce5623SAvi Kivity #include <linux/debugfs.h>
280fce5623SAvi Kivity #include <linux/highmem.h>
290fce5623SAvi Kivity #include <linux/file.h>
30fb3600ccSRafael J. Wysocki #include <linux/syscore_ops.h>
310fce5623SAvi Kivity #include <linux/cpu.h>
32174cd4b1SIngo Molnar #include <linux/sched/signal.h>
336e84f315SIngo Molnar #include <linux/sched/mm.h>
3403441a34SIngo Molnar #include <linux/sched/stat.h>
350fce5623SAvi Kivity #include <linux/cpumask.h>
360fce5623SAvi Kivity #include <linux/smp.h>
370fce5623SAvi Kivity #include <linux/anon_inodes.h>
380fce5623SAvi Kivity #include <linux/profile.h>
390fce5623SAvi Kivity #include <linux/kvm_para.h>
400fce5623SAvi Kivity #include <linux/pagemap.h>
410fce5623SAvi Kivity #include <linux/mman.h>
4235149e21SAnthony Liguori #include <linux/swap.h>
43e56d532fSSheng Yang #include <linux/bitops.h>
44547de29eSMarcelo Tosatti #include <linux/spinlock.h>
456ff5894cSArnd Bergmann #include <linux/compat.h>
46bc6678a3SMarcelo Tosatti #include <linux/srcu.h>
478f0b1ab6SJoerg Roedel #include <linux/hugetlb.h>
485a0e3ad6STejun Heo #include <linux/slab.h>
49743eeb0bSSasha Levin #include <linux/sort.h>
50743eeb0bSSasha Levin #include <linux/bsearch.h>
51c011d23bSPaolo Bonzini #include <linux/io.h>
522eb06c30SWanpeng Li #include <linux/lockdep.h>
53c57c8046SJunaid Shahid #include <linux/kthread.h>
542fdef3a2SSergey Senozhatsky #include <linux/suspend.h>
550fce5623SAvi Kivity 
560fce5623SAvi Kivity #include <asm/processor.h>
572ea75be3SDavid Matlack #include <asm/ioctl.h>
587c0f6ba6SLinus Torvalds #include <linux/uaccess.h>
590fce5623SAvi Kivity 
605f94c174SLaurent Vivier #include "coalesced_mmio.h"
61af585b92SGleb Natapov #include "async_pf.h"
62982ed0deSDavid Woodhouse #include "kvm_mm.h"
633c3c29fdSPaolo Bonzini #include "vfio.h"
645f94c174SLaurent Vivier 
654c8c3c7fSValentin Schneider #include <trace/events/ipi.h>
664c8c3c7fSValentin Schneider 
67229456fcSMarcelo Tosatti #define CREATE_TRACE_POINTS
68229456fcSMarcelo Tosatti #include <trace/events/kvm.h>
69229456fcSMarcelo Tosatti 
70fb04a1edSPeter Xu #include <linux/kvm_dirty_ring.h>
71fb04a1edSPeter Xu 
724c8c3c7fSValentin Schneider 
73536a6f88SJanosch Frank /* Worst case buffer size needed for holding an integer. */
74536a6f88SJanosch Frank #define ITOA_MAX_LEN 12
75536a6f88SJanosch Frank 
760fce5623SAvi Kivity MODULE_AUTHOR("Qumranet");
770fce5623SAvi Kivity MODULE_LICENSE("GPL");
780fce5623SAvi Kivity 
79920552b2SDavid Hildenbrand /* Architectures should define their poll value according to the halt latency */
80ec76d819SSuraj Jitindar Singh unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
81039c5d1bSRoman Storozhenko module_param(halt_poll_ns, uint, 0644);
82ec76d819SSuraj Jitindar Singh EXPORT_SYMBOL_GPL(halt_poll_ns);
83f7819512SPaolo Bonzini 
84aca6ff29SWanpeng Li /* Default doubles per-vcpu halt_poll_ns. */
85ec76d819SSuraj Jitindar Singh unsigned int halt_poll_ns_grow = 2;
86039c5d1bSRoman Storozhenko module_param(halt_poll_ns_grow, uint, 0644);
87ec76d819SSuraj Jitindar Singh EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
88aca6ff29SWanpeng Li 
8949113d36SNir Weiner /* The start value to grow halt_poll_ns from */
9049113d36SNir Weiner unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
9149113d36SNir Weiner module_param(halt_poll_ns_grow_start, uint, 0644);
9249113d36SNir Weiner EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
9349113d36SNir Weiner 
94aca6ff29SWanpeng Li /* Default resets per-vcpu halt_poll_ns . */
95ec76d819SSuraj Jitindar Singh unsigned int halt_poll_ns_shrink;
96039c5d1bSRoman Storozhenko module_param(halt_poll_ns_shrink, uint, 0644);
97ec76d819SSuraj Jitindar Singh EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
98aca6ff29SWanpeng Li 
99fa40a821SMarcelo Tosatti /*
100fa40a821SMarcelo Tosatti  * Ordering of locks:
101fa40a821SMarcelo Tosatti  *
102fae3a353SSheng Yang  *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
103fa40a821SMarcelo Tosatti  */
104fa40a821SMarcelo Tosatti 
1050d9ce162SJunaid Shahid DEFINE_MUTEX(kvm_lock);
1060fce5623SAvi Kivity LIST_HEAD(vm_list);
1070fce5623SAvi Kivity 
108aaba298cSSean Christopherson static struct kmem_cache *kvm_vcpu_cache;
1090fce5623SAvi Kivity 
1100fce5623SAvi Kivity static __read_mostly struct preempt_ops kvm_preempt_ops;
1117495e22bSPaolo Bonzini static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
1120fce5623SAvi Kivity 
11376f7c879SHollis Blanchard struct dentry *kvm_debugfs_dir;
114e23a808bSPaul Mackerras EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
1150fce5623SAvi Kivity 
11609cbcef6SMilan Pandurov static const struct file_operations stat_fops_per_vm;
117536a6f88SJanosch Frank 
1185f6de5cbSDavid Matlack static struct file_operations kvm_chardev_ops;
1195f6de5cbSDavid Matlack 
1200fce5623SAvi Kivity static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
1210fce5623SAvi Kivity 			   unsigned long arg);
122de8e5d74SChristian Borntraeger #ifdef CONFIG_KVM_COMPAT
1231dda606cSAlexander Graf static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
1241dda606cSAlexander Graf 				  unsigned long arg);
1257ddfd3e0SMarc Zyngier #define KVM_COMPAT(c)	.compat_ioctl	= (c)
1267ddfd3e0SMarc Zyngier #else
1279cb09e7cSMarc Zyngier /*
1289cb09e7cSMarc Zyngier  * For architectures that don't implement a compat infrastructure,
1299cb09e7cSMarc Zyngier  * adopt a double line of defense:
1309cb09e7cSMarc Zyngier  * - Prevent a compat task from opening /dev/kvm
1319cb09e7cSMarc Zyngier  * - If the open has been done by a 64bit task, and the KVM fd
1329cb09e7cSMarc Zyngier  *   passed to a compat task, let the ioctls fail.
1339cb09e7cSMarc Zyngier  */
kvm_no_compat_ioctl(struct file * file,unsigned int ioctl,unsigned long arg)1347ddfd3e0SMarc Zyngier static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
1357ddfd3e0SMarc Zyngier 				unsigned long arg) { return -EINVAL; }
136b9876e6dSMarc Zyngier 
kvm_no_compat_open(struct inode * inode,struct file * file)137b9876e6dSMarc Zyngier static int kvm_no_compat_open(struct inode *inode, struct file *file)
138b9876e6dSMarc Zyngier {
139b9876e6dSMarc Zyngier 	return is_compat_task() ? -ENODEV : 0;
140b9876e6dSMarc Zyngier }
141b9876e6dSMarc Zyngier #define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
142b9876e6dSMarc Zyngier 			.open		= kvm_no_compat_open
1431dda606cSAlexander Graf #endif
14410474ae8SAlexander Graf static int hardware_enable_all(void);
14510474ae8SAlexander Graf static void hardware_disable_all(void);
1460fce5623SAvi Kivity 
147e93f8a0fSMarcelo Tosatti static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
1487940876eSStephen Hemminger 
149286de8f6SClaudio Imbrenda #define KVM_EVENT_CREATE_VM 0
150286de8f6SClaudio Imbrenda #define KVM_EVENT_DESTROY_VM 1
151286de8f6SClaudio Imbrenda static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
152286de8f6SClaudio Imbrenda static unsigned long long kvm_createvm_count;
153286de8f6SClaudio Imbrenda static unsigned long long kvm_active_vms;
154286de8f6SClaudio Imbrenda 
155baff59ccSVitaly Kuznetsov static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
156baff59ccSVitaly Kuznetsov 
kvm_arch_guest_memory_reclaimed(struct kvm * kvm)157683412ccSMingwei Zhang __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
158683412ccSMingwei Zhang {
159683412ccSMingwei Zhang }
160683412ccSMingwei Zhang 
kvm_is_zone_device_page(struct page * page)161284dc493SSean Christopherson bool kvm_is_zone_device_page(struct page *page)
162a78986aaSSean Christopherson {
163a78986aaSSean Christopherson 	/*
164a78986aaSSean Christopherson 	 * The metadata used by is_zone_device_page() to determine whether or
165a78986aaSSean Christopherson 	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
166a78986aaSSean Christopherson 	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
167a78986aaSSean Christopherson 	 * page_count() is zero to help detect bad usage of this helper.
168a78986aaSSean Christopherson 	 */
169284dc493SSean Christopherson 	if (WARN_ON_ONCE(!page_count(page)))
170a78986aaSSean Christopherson 		return false;
171a78986aaSSean Christopherson 
172284dc493SSean Christopherson 	return is_zone_device_page(page);
173a78986aaSSean Christopherson }
174a78986aaSSean Christopherson 
175b14b2690SSean Christopherson /*
176b14b2690SSean Christopherson  * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
177b14b2690SSean Christopherson  * page, NULL otherwise.  Note, the list of refcounted PG_reserved page types
178b14b2690SSean Christopherson  * is likely incomplete, it has been compiled purely through people wanting to
179b14b2690SSean Christopherson  * back guest with a certain type of memory and encountering issues.
180b14b2690SSean Christopherson  */
kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)181b14b2690SSean Christopherson struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)
182cbff90a7SBen-Ami Yassour {
183b14b2690SSean Christopherson 	struct page *page;
184b14b2690SSean Christopherson 
185b14b2690SSean Christopherson 	if (!pfn_valid(pfn))
186b14b2690SSean Christopherson 		return NULL;
187b14b2690SSean Christopherson 
188b14b2690SSean Christopherson 	page = pfn_to_page(pfn);
189b14b2690SSean Christopherson 	if (!PageReserved(page))
190b14b2690SSean Christopherson 		return page;
191b14b2690SSean Christopherson 
192b14b2690SSean Christopherson 	/* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */
193b14b2690SSean Christopherson 	if (is_zero_pfn(pfn))
194b14b2690SSean Christopherson 		return page;
195b14b2690SSean Christopherson 
196a78986aaSSean Christopherson 	/*
197a78986aaSSean Christopherson 	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
198a78986aaSSean Christopherson 	 * perspective they are "normal" pages, albeit with slightly different
199a78986aaSSean Christopherson 	 * usage rules.
200a78986aaSSean Christopherson 	 */
201b14b2690SSean Christopherson 	if (kvm_is_zone_device_page(page))
202b14b2690SSean Christopherson 		return page;
203cbff90a7SBen-Ami Yassour 
204b14b2690SSean Christopherson 	return NULL;
205cbff90a7SBen-Ami Yassour }
206cbff90a7SBen-Ami Yassour 
2070fce5623SAvi Kivity /*
2080fce5623SAvi Kivity  * Switches to specified vcpu, until a matching vcpu_put()
2090fce5623SAvi Kivity  */
vcpu_load(struct kvm_vcpu * vcpu)210ec7660ccSChristoffer Dall void vcpu_load(struct kvm_vcpu *vcpu)
2110fce5623SAvi Kivity {
212ec7660ccSChristoffer Dall 	int cpu = get_cpu();
2137495e22bSPaolo Bonzini 
2147495e22bSPaolo Bonzini 	__this_cpu_write(kvm_running_vcpu, vcpu);
2150fce5623SAvi Kivity 	preempt_notifier_register(&vcpu->preempt_notifier);
2160fce5623SAvi Kivity 	kvm_arch_vcpu_load(vcpu, cpu);
2170fce5623SAvi Kivity 	put_cpu();
2180fce5623SAvi Kivity }
2192f1fe811SJim Mattson EXPORT_SYMBOL_GPL(vcpu_load);
2200fce5623SAvi Kivity 
vcpu_put(struct kvm_vcpu * vcpu)2210fce5623SAvi Kivity void vcpu_put(struct kvm_vcpu *vcpu)
2220fce5623SAvi Kivity {
2230fce5623SAvi Kivity 	preempt_disable();
2240fce5623SAvi Kivity 	kvm_arch_vcpu_put(vcpu);
2250fce5623SAvi Kivity 	preempt_notifier_unregister(&vcpu->preempt_notifier);
2267495e22bSPaolo Bonzini 	__this_cpu_write(kvm_running_vcpu, NULL);
2270fce5623SAvi Kivity 	preempt_enable();
2280fce5623SAvi Kivity }
2292f1fe811SJim Mattson EXPORT_SYMBOL_GPL(vcpu_put);
2300fce5623SAvi Kivity 
2317a97cec2SPaolo Bonzini /* TODO: merge with kvm_arch_vcpu_should_kick */
kvm_request_needs_ipi(struct kvm_vcpu * vcpu,unsigned req)2327a97cec2SPaolo Bonzini static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
2337a97cec2SPaolo Bonzini {
2347a97cec2SPaolo Bonzini 	int mode = kvm_vcpu_exiting_guest_mode(vcpu);
2357a97cec2SPaolo Bonzini 
2367a97cec2SPaolo Bonzini 	/*
2377a97cec2SPaolo Bonzini 	 * We need to wait for the VCPU to reenable interrupts and get out of
2387a97cec2SPaolo Bonzini 	 * READING_SHADOW_PAGE_TABLES mode.
2397a97cec2SPaolo Bonzini 	 */
2407a97cec2SPaolo Bonzini 	if (req & KVM_REQUEST_WAIT)
2417a97cec2SPaolo Bonzini 		return mode != OUTSIDE_GUEST_MODE;
2427a97cec2SPaolo Bonzini 
2437a97cec2SPaolo Bonzini 	/*
2447a97cec2SPaolo Bonzini 	 * Need to kick a running VCPU, but otherwise there is nothing to do.
2457a97cec2SPaolo Bonzini 	 */
2467a97cec2SPaolo Bonzini 	return mode == IN_GUEST_MODE;
2477a97cec2SPaolo Bonzini }
2487a97cec2SPaolo Bonzini 
ack_kick(void * _completed)249f24b44e4SLai Jiangshan static void ack_kick(void *_completed)
2500fce5623SAvi Kivity {
2510fce5623SAvi Kivity }
2520fce5623SAvi Kivity 
kvm_kick_many_cpus(struct cpumask * cpus,bool wait)253620b2438SVitaly Kuznetsov static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
254b49defe8SPaolo Bonzini {
255b49defe8SPaolo Bonzini 	if (cpumask_empty(cpus))
256b49defe8SPaolo Bonzini 		return false;
257b49defe8SPaolo Bonzini 
258f24b44e4SLai Jiangshan 	smp_call_function_many(cpus, ack_kick, NULL, wait);
259b49defe8SPaolo Bonzini 	return true;
260b49defe8SPaolo Bonzini }
261b49defe8SPaolo Bonzini 
kvm_make_vcpu_request(struct kvm_vcpu * vcpu,unsigned int req,struct cpumask * tmp,int current_cpu)262b56bd8e0SJinrong Liang static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
263b56bd8e0SJinrong Liang 				  struct cpumask *tmp, int current_cpu)
2640fce5623SAvi Kivity {
265ae0946cdSVitaly Kuznetsov 	int cpu;
2667053df4eSVitaly Kuznetsov 
267df06dae3SSean Christopherson 	if (likely(!(req & KVM_REQUEST_NO_ACTION)))
268df06dae3SSean Christopherson 		__kvm_make_request(req, vcpu);
2696b7e2d09SXiao Guangrong 
270178f02ffSRadim Krčmář 	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
271ae0946cdSVitaly Kuznetsov 		return;
2726c6e8360SRadim Krčmář 
27385b64045SSean Christopherson 	/*
274ae0946cdSVitaly Kuznetsov 	 * Note, the vCPU could get migrated to a different pCPU at any point
275ae0946cdSVitaly Kuznetsov 	 * after kvm_request_needs_ipi(), which could result in sending an IPI
276ae0946cdSVitaly Kuznetsov 	 * to the previous pCPU.  But, that's OK because the purpose of the IPI
277ae0946cdSVitaly Kuznetsov 	 * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
278ae0946cdSVitaly Kuznetsov 	 * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
279ae0946cdSVitaly Kuznetsov 	 * after this point is also OK, as the requirement is only that KVM wait
280ae0946cdSVitaly Kuznetsov 	 * for vCPUs that were reading SPTEs _before_ any changes were
281ae0946cdSVitaly Kuznetsov 	 * finalized. See kvm_vcpu_kick() for more details on handling requests.
28285b64045SSean Christopherson 	 */
2830bbc2ca8SSean Christopherson 	if (kvm_request_needs_ipi(vcpu, req)) {
28485b64045SSean Christopherson 		cpu = READ_ONCE(vcpu->cpu);
285ae0946cdSVitaly Kuznetsov 		if (cpu != -1 && cpu != current_cpu)
2867053df4eSVitaly Kuznetsov 			__cpumask_set_cpu(cpu, tmp);
2870fce5623SAvi Kivity 	}
28885b64045SSean Christopherson }
2897053df4eSVitaly Kuznetsov 
kvm_make_vcpus_request_mask(struct kvm * kvm,unsigned int req,unsigned long * vcpu_bitmap)290ae0946cdSVitaly Kuznetsov bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
291620b2438SVitaly Kuznetsov 				 unsigned long *vcpu_bitmap)
292ae0946cdSVitaly Kuznetsov {
293ae0946cdSVitaly Kuznetsov 	struct kvm_vcpu *vcpu;
294620b2438SVitaly Kuznetsov 	struct cpumask *cpus;
295ae0946cdSVitaly Kuznetsov 	int i, me;
296ae0946cdSVitaly Kuznetsov 	bool called;
297ae0946cdSVitaly Kuznetsov 
298ae0946cdSVitaly Kuznetsov 	me = get_cpu();
299ae0946cdSVitaly Kuznetsov 
300620b2438SVitaly Kuznetsov 	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
301620b2438SVitaly Kuznetsov 	cpumask_clear(cpus);
302620b2438SVitaly Kuznetsov 
303ae0946cdSVitaly Kuznetsov 	for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
304ae0946cdSVitaly Kuznetsov 		vcpu = kvm_get_vcpu(kvm, i);
305381cecc5SVitaly Kuznetsov 		if (!vcpu)
306ae0946cdSVitaly Kuznetsov 			continue;
307b56bd8e0SJinrong Liang 		kvm_make_vcpu_request(vcpu, req, cpus, me);
308ae0946cdSVitaly Kuznetsov 	}
309ae0946cdSVitaly Kuznetsov 
310620b2438SVitaly Kuznetsov 	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
3113cba4130SXiao Guangrong 	put_cpu();
3127053df4eSVitaly Kuznetsov 
3137053df4eSVitaly Kuznetsov 	return called;
3147053df4eSVitaly Kuznetsov }
3157053df4eSVitaly Kuznetsov 
kvm_make_all_cpus_request_except(struct kvm * kvm,unsigned int req,struct kvm_vcpu * except)31654163a34SSuravee Suthikulpanit bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
31754163a34SSuravee Suthikulpanit 				      struct kvm_vcpu *except)
3187053df4eSVitaly Kuznetsov {
319ae0946cdSVitaly Kuznetsov 	struct kvm_vcpu *vcpu;
320baff59ccSVitaly Kuznetsov 	struct cpumask *cpus;
32146808a4cSMarc Zyngier 	unsigned long i;
3227053df4eSVitaly Kuznetsov 	bool called;
32346808a4cSMarc Zyngier 	int me;
3247053df4eSVitaly Kuznetsov 
325ae0946cdSVitaly Kuznetsov 	me = get_cpu();
326ae0946cdSVitaly Kuznetsov 
327baff59ccSVitaly Kuznetsov 	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
328baff59ccSVitaly Kuznetsov 	cpumask_clear(cpus);
329baff59ccSVitaly Kuznetsov 
330ae0946cdSVitaly Kuznetsov 	kvm_for_each_vcpu(i, vcpu, kvm) {
331ae0946cdSVitaly Kuznetsov 		if (vcpu == except)
332ae0946cdSVitaly Kuznetsov 			continue;
333b56bd8e0SJinrong Liang 		kvm_make_vcpu_request(vcpu, req, cpus, me);
334ae0946cdSVitaly Kuznetsov 	}
335ae0946cdSVitaly Kuznetsov 
336ae0946cdSVitaly Kuznetsov 	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
337ae0946cdSVitaly Kuznetsov 	put_cpu();
3387053df4eSVitaly Kuznetsov 
33949846896SRusty Russell 	return called;
34049846896SRusty Russell }
34149846896SRusty Russell 
kvm_make_all_cpus_request(struct kvm * kvm,unsigned int req)34254163a34SSuravee Suthikulpanit bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
34354163a34SSuravee Suthikulpanit {
34454163a34SSuravee Suthikulpanit 	return kvm_make_all_cpus_request_except(kvm, req, NULL);
34554163a34SSuravee Suthikulpanit }
346a2486020SMarcelo Tosatti EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
34754163a34SSuravee Suthikulpanit 
kvm_flush_remote_tlbs(struct kvm * kvm)34849846896SRusty Russell void kvm_flush_remote_tlbs(struct kvm *kvm)
34949846896SRusty Russell {
3503cc4e148SJing Zhang 	++kvm->stat.generic.remote_tlb_flush_requests;
3516bc6db00SLai Jiangshan 
3524ae3cb3aSLan Tianyu 	/*
3534ae3cb3aSLan Tianyu 	 * We want to publish modifications to the page tables before reading
3544ae3cb3aSLan Tianyu 	 * mode. Pairs with a memory barrier in arch-specific code.
3554ae3cb3aSLan Tianyu 	 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
3564ae3cb3aSLan Tianyu 	 * and smp_mb in walk_shadow_page_lockless_begin/end.
3574ae3cb3aSLan Tianyu 	 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
3584ae3cb3aSLan Tianyu 	 *
3594ae3cb3aSLan Tianyu 	 * There is already an smp_mb__after_atomic() before
3604ae3cb3aSLan Tianyu 	 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
3614ae3cb3aSLan Tianyu 	 * barrier here.
3624ae3cb3aSLan Tianyu 	 */
363a1342c80SDavid Matlack 	if (!kvm_arch_flush_remote_tlbs(kvm)
364b08660e5STianyu Lan 	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
3650193cc90SJing Zhang 		++kvm->stat.generic.remote_tlb_flush;
3660fce5623SAvi Kivity }
3672ba9f0d8SAneesh Kumar K.V EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
3680fce5623SAvi Kivity 
kvm_flush_remote_tlbs_range(struct kvm * kvm,gfn_t gfn,u64 nr_pages)369d4788996SDavid Matlack void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
370d4788996SDavid Matlack {
371d4788996SDavid Matlack 	if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
372d4788996SDavid Matlack 		return;
373d4788996SDavid Matlack 
374d4788996SDavid Matlack 	/*
375d4788996SDavid Matlack 	 * Fall back to a flushing entire TLBs if the architecture range-based
376d4788996SDavid Matlack 	 * TLB invalidation is unsupported or can't be performed for whatever
377d4788996SDavid Matlack 	 * reason.
378d4788996SDavid Matlack 	 */
379d4788996SDavid Matlack 	kvm_flush_remote_tlbs(kvm);
380d4788996SDavid Matlack }
381d4788996SDavid Matlack 
kvm_flush_remote_tlbs_memslot(struct kvm * kvm,const struct kvm_memory_slot * memslot)382619b5072SDavid Matlack void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
383619b5072SDavid Matlack 				   const struct kvm_memory_slot *memslot)
384619b5072SDavid Matlack {
385619b5072SDavid Matlack 	/*
386619b5072SDavid Matlack 	 * All current use cases for flushing the TLBs for a specific memslot
387619b5072SDavid Matlack 	 * are related to dirty logging, and many do the TLB flush out of
388619b5072SDavid Matlack 	 * mmu_lock. The interaction between the various operations on memslot
389619b5072SDavid Matlack 	 * must be serialized by slots_locks to ensure the TLB flush from one
390619b5072SDavid Matlack 	 * operation is observed by any other operation on the same memslot.
391619b5072SDavid Matlack 	 */
392619b5072SDavid Matlack 	lockdep_assert_held(&kvm->slots_lock);
393619b5072SDavid Matlack 	kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
394619b5072SDavid Matlack }
3950fce5623SAvi Kivity 
kvm_flush_shadow_all(struct kvm * kvm)396683412ccSMingwei Zhang static void kvm_flush_shadow_all(struct kvm *kvm)
397683412ccSMingwei Zhang {
398683412ccSMingwei Zhang 	kvm_arch_flush_shadow_all(kvm);
399683412ccSMingwei Zhang 	kvm_arch_guest_memory_reclaimed(kvm);
400683412ccSMingwei Zhang }
401683412ccSMingwei Zhang 
4026926f95aSSean Christopherson #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache * mc,gfp_t gfp_flags)4036926f95aSSean Christopherson static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
4046926f95aSSean Christopherson 					       gfp_t gfp_flags)
4056926f95aSSean Christopherson {
4066926f95aSSean Christopherson 	gfp_flags |= mc->gfp_zero;
4076926f95aSSean Christopherson 
4086926f95aSSean Christopherson 	if (mc->kmem_cache)
4096926f95aSSean Christopherson 		return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
4106926f95aSSean Christopherson 	else
4116926f95aSSean Christopherson 		return (void *)__get_free_page(gfp_flags);
4126926f95aSSean Christopherson }
4136926f95aSSean Christopherson 
__kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache * mc,int capacity,int min)414837f66c7SDavid Matlack int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
4156926f95aSSean Christopherson {
41663f4b210SPaolo Bonzini 	gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
4176926f95aSSean Christopherson 	void *obj;
4186926f95aSSean Christopherson 
4196926f95aSSean Christopherson 	if (mc->nobjs >= min)
4206926f95aSSean Christopherson 		return 0;
421837f66c7SDavid Matlack 
422837f66c7SDavid Matlack 	if (unlikely(!mc->objects)) {
423837f66c7SDavid Matlack 		if (WARN_ON_ONCE(!capacity))
424837f66c7SDavid Matlack 			return -EIO;
425837f66c7SDavid Matlack 
426837f66c7SDavid Matlack 		mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
427837f66c7SDavid Matlack 		if (!mc->objects)
428837f66c7SDavid Matlack 			return -ENOMEM;
429837f66c7SDavid Matlack 
430837f66c7SDavid Matlack 		mc->capacity = capacity;
431837f66c7SDavid Matlack 	}
432837f66c7SDavid Matlack 
433837f66c7SDavid Matlack 	/* It is illegal to request a different capacity across topups. */
434837f66c7SDavid Matlack 	if (WARN_ON_ONCE(mc->capacity != capacity))
435837f66c7SDavid Matlack 		return -EIO;
436837f66c7SDavid Matlack 
437837f66c7SDavid Matlack 	while (mc->nobjs < mc->capacity) {
438837f66c7SDavid Matlack 		obj = mmu_memory_cache_alloc_obj(mc, gfp);
4396926f95aSSean Christopherson 		if (!obj)
4406926f95aSSean Christopherson 			return mc->nobjs >= min ? 0 : -ENOMEM;
4416926f95aSSean Christopherson 		mc->objects[mc->nobjs++] = obj;
4426926f95aSSean Christopherson 	}
4436926f95aSSean Christopherson 	return 0;
4446926f95aSSean Christopherson }
4456926f95aSSean Christopherson 
kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache * mc,int min)446837f66c7SDavid Matlack int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
447837f66c7SDavid Matlack {
448837f66c7SDavid Matlack 	return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
449837f66c7SDavid Matlack }
450837f66c7SDavid Matlack 
kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache * mc)4516926f95aSSean Christopherson int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
4526926f95aSSean Christopherson {
4536926f95aSSean Christopherson 	return mc->nobjs;
4546926f95aSSean Christopherson }
4556926f95aSSean Christopherson 
kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache * mc)4566926f95aSSean Christopherson void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
4576926f95aSSean Christopherson {
4586926f95aSSean Christopherson 	while (mc->nobjs) {
4596926f95aSSean Christopherson 		if (mc->kmem_cache)
4606926f95aSSean Christopherson 			kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
4616926f95aSSean Christopherson 		else
4626926f95aSSean Christopherson 			free_page((unsigned long)mc->objects[--mc->nobjs]);
4636926f95aSSean Christopherson 	}
464837f66c7SDavid Matlack 
465837f66c7SDavid Matlack 	kvfree(mc->objects);
466837f66c7SDavid Matlack 
467837f66c7SDavid Matlack 	mc->objects = NULL;
468837f66c7SDavid Matlack 	mc->capacity = 0;
4696926f95aSSean Christopherson }
4706926f95aSSean Christopherson 
kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache * mc)4716926f95aSSean Christopherson void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
4726926f95aSSean Christopherson {
4736926f95aSSean Christopherson 	void *p;
4746926f95aSSean Christopherson 
4756926f95aSSean Christopherson 	if (WARN_ON(!mc->nobjs))
4766926f95aSSean Christopherson 		p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
4776926f95aSSean Christopherson 	else
4786926f95aSSean Christopherson 		p = mc->objects[--mc->nobjs];
4796926f95aSSean Christopherson 	BUG_ON(!p);
4806926f95aSSean Christopherson 	return p;
4816926f95aSSean Christopherson }
4826926f95aSSean Christopherson #endif
4836926f95aSSean Christopherson 
kvm_vcpu_init(struct kvm_vcpu * vcpu,struct kvm * kvm,unsigned id)4848bd826d6SSean Christopherson static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
4850fce5623SAvi Kivity {
4860fce5623SAvi Kivity 	mutex_init(&vcpu->mutex);
4870fce5623SAvi Kivity 	vcpu->cpu = -1;
4880fce5623SAvi Kivity 	vcpu->kvm = kvm;
4890fce5623SAvi Kivity 	vcpu->vcpu_id = id;
49034bb10b7SRik van Riel 	vcpu->pid = NULL;
491510958e9SSean Christopherson #ifndef __KVM_HAVE_ARCH_WQP
492da4ad88cSDavidlohr Bueso 	rcuwait_init(&vcpu->wait);
493510958e9SSean Christopherson #endif
494af585b92SGleb Natapov 	kvm_async_pf_vcpu_init(vcpu);
4950fce5623SAvi Kivity 
4964c088493SRaghavendra K T 	kvm_vcpu_set_in_spin_loop(vcpu, false);
4974c088493SRaghavendra K T 	kvm_vcpu_set_dy_eligible(vcpu, false);
4983a08a8f9SRaghavendra K T 	vcpu->preempted = false;
499d73eb57bSWanpeng Li 	vcpu->ready = false;
500d5c48debSSean Christopherson 	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
501a54d8066SMaciej S. Szmigiero 	vcpu->last_used_slot = NULL;
50258fc1166SOliver Upton 
50358fc1166SOliver Upton 	/* Fill the stats id string for the vcpu */
50458fc1166SOliver Upton 	snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
50558fc1166SOliver Upton 		 task_pid_nr(current), id);
5060fce5623SAvi Kivity }
5070fce5623SAvi Kivity 
kvm_vcpu_destroy(struct kvm_vcpu * vcpu)50827592ae8SMarc Zyngier static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
5094543bdc0SSean Christopherson {
5104543bdc0SSean Christopherson 	kvm_arch_vcpu_destroy(vcpu);
5115593473aSPaolo Bonzini 	kvm_dirty_ring_free(&vcpu->dirty_ring);
512e529ef66SSean Christopherson 
5139941d224SSean Christopherson 	/*
5149941d224SSean Christopherson 	 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
5159941d224SSean Christopherson 	 * the vcpu->pid pointer, and at destruction time all file descriptors
5169941d224SSean Christopherson 	 * are already gone.
5179941d224SSean Christopherson 	 */
5189941d224SSean Christopherson 	put_pid(rcu_dereference_protected(vcpu->pid, 1));
5199941d224SSean Christopherson 
5208bd826d6SSean Christopherson 	free_page((unsigned long)vcpu->run);
521e529ef66SSean Christopherson 	kmem_cache_free(kvm_vcpu_cache, vcpu);
5224543bdc0SSean Christopherson }
52327592ae8SMarc Zyngier 
kvm_destroy_vcpus(struct kvm * kvm)52427592ae8SMarc Zyngier void kvm_destroy_vcpus(struct kvm *kvm)
52527592ae8SMarc Zyngier {
52646808a4cSMarc Zyngier 	unsigned long i;
52727592ae8SMarc Zyngier 	struct kvm_vcpu *vcpu;
52827592ae8SMarc Zyngier 
52927592ae8SMarc Zyngier 	kvm_for_each_vcpu(i, vcpu, kvm) {
53027592ae8SMarc Zyngier 		kvm_vcpu_destroy(vcpu);
531c5b07754SMarc Zyngier 		xa_erase(&kvm->vcpu_array, i);
53227592ae8SMarc Zyngier 	}
53327592ae8SMarc Zyngier 
53427592ae8SMarc Zyngier 	atomic_set(&kvm->online_vcpus, 0);
53527592ae8SMarc Zyngier }
53627592ae8SMarc Zyngier EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
5374543bdc0SSean Christopherson 
538e930bffeSAndrea Arcangeli #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
mmu_notifier_to_kvm(struct mmu_notifier * mn)539e930bffeSAndrea Arcangeli static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
540e930bffeSAndrea Arcangeli {
541e930bffeSAndrea Arcangeli 	return container_of(mn, struct kvm, mmu_notifier);
542e930bffeSAndrea Arcangeli }
543e930bffeSAndrea Arcangeli 
5443039bcc7SSean Christopherson typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
5453039bcc7SSean Christopherson 
546f922bd9bSSean Christopherson typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
547f922bd9bSSean Christopherson 			     unsigned long end);
548f922bd9bSSean Christopherson 
549683412ccSMingwei Zhang typedef void (*on_unlock_fn_t)(struct kvm *kvm);
550683412ccSMingwei Zhang 
5513039bcc7SSean Christopherson struct kvm_hva_range {
5523039bcc7SSean Christopherson 	unsigned long start;
5533039bcc7SSean Christopherson 	unsigned long end;
5543e1efe2bSSean Christopherson 	union kvm_mmu_notifier_arg arg;
5553039bcc7SSean Christopherson 	hva_handler_t handler;
556f922bd9bSSean Christopherson 	on_lock_fn_t on_lock;
557683412ccSMingwei Zhang 	on_unlock_fn_t on_unlock;
5583039bcc7SSean Christopherson 	bool flush_on_ret;
5593039bcc7SSean Christopherson 	bool may_block;
5603039bcc7SSean Christopherson };
5613039bcc7SSean Christopherson 
562f922bd9bSSean Christopherson /*
563f922bd9bSSean Christopherson  * Use a dedicated stub instead of NULL to indicate that there is no callback
564f922bd9bSSean Christopherson  * function/handler.  The compiler technically can't guarantee that a real
565f922bd9bSSean Christopherson  * function will have a non-zero address, and so it will generate code to
566f922bd9bSSean Christopherson  * check for !NULL, whereas comparing against a stub will be elided at compile
567f922bd9bSSean Christopherson  * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
568f922bd9bSSean Christopherson  */
kvm_null_fn(void)569f922bd9bSSean Christopherson static void kvm_null_fn(void)
570f922bd9bSSean Christopherson {
571f922bd9bSSean Christopherson 
572f922bd9bSSean Christopherson }
573f922bd9bSSean Christopherson #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
574f922bd9bSSean Christopherson 
5753e1efe2bSSean Christopherson static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG;
5763e1efe2bSSean Christopherson 
577ed922739SMaciej S. Szmigiero /* Iterate over each memslot intersecting [start, last] (inclusive) range */
578ed922739SMaciej S. Szmigiero #define kvm_for_each_memslot_in_hva_range(node, slots, start, last)	     \
579ed922739SMaciej S. Szmigiero 	for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
580ed922739SMaciej S. Szmigiero 	     node;							     \
581ed922739SMaciej S. Szmigiero 	     node = interval_tree_iter_next(node, start, last))	     \
582ed922739SMaciej S. Szmigiero 
__kvm_handle_hva_range(struct kvm * kvm,const struct kvm_hva_range * range)5833039bcc7SSean Christopherson static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
5843039bcc7SSean Christopherson 						  const struct kvm_hva_range *range)
5853039bcc7SSean Christopherson {
5868931a454SSean Christopherson 	bool ret = false, locked = false;
587f922bd9bSSean Christopherson 	struct kvm_gfn_range gfn_range;
5883039bcc7SSean Christopherson 	struct kvm_memory_slot *slot;
5893039bcc7SSean Christopherson 	struct kvm_memslots *slots;
5903039bcc7SSean Christopherson 	int i, idx;
5913039bcc7SSean Christopherson 
592ed922739SMaciej S. Szmigiero 	if (WARN_ON_ONCE(range->end <= range->start))
593ed922739SMaciej S. Szmigiero 		return 0;
594ed922739SMaciej S. Szmigiero 
595f922bd9bSSean Christopherson 	/* A null handler is allowed if and only if on_lock() is provided. */
596f922bd9bSSean Christopherson 	if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
597f922bd9bSSean Christopherson 			 IS_KVM_NULL_FN(range->handler)))
598f922bd9bSSean Christopherson 		return 0;
599f922bd9bSSean Christopherson 
6003039bcc7SSean Christopherson 	idx = srcu_read_lock(&kvm->srcu);
6013039bcc7SSean Christopherson 
6023039bcc7SSean Christopherson 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
603ed922739SMaciej S. Szmigiero 		struct interval_tree_node *node;
604ed922739SMaciej S. Szmigiero 
6053039bcc7SSean Christopherson 		slots = __kvm_memslots(kvm, i);
606ed922739SMaciej S. Szmigiero 		kvm_for_each_memslot_in_hva_range(node, slots,
607ed922739SMaciej S. Szmigiero 						  range->start, range->end - 1) {
6083039bcc7SSean Christopherson 			unsigned long hva_start, hva_end;
6093039bcc7SSean Christopherson 
610a54d8066SMaciej S. Szmigiero 			slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
6113039bcc7SSean Christopherson 			hva_start = max(range->start, slot->userspace_addr);
6123039bcc7SSean Christopherson 			hva_end = min(range->end, slot->userspace_addr +
6133039bcc7SSean Christopherson 						  (slot->npages << PAGE_SHIFT));
6143039bcc7SSean Christopherson 
6153039bcc7SSean Christopherson 			/*
6163039bcc7SSean Christopherson 			 * To optimize for the likely case where the address
6173039bcc7SSean Christopherson 			 * range is covered by zero or one memslots, don't
6183039bcc7SSean Christopherson 			 * bother making these conditional (to avoid writes on
6193039bcc7SSean Christopherson 			 * the second or later invocation of the handler).
6203039bcc7SSean Christopherson 			 */
6213e1efe2bSSean Christopherson 			gfn_range.arg = range->arg;
6223039bcc7SSean Christopherson 			gfn_range.may_block = range->may_block;
6233039bcc7SSean Christopherson 
6243039bcc7SSean Christopherson 			/*
6253039bcc7SSean Christopherson 			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
6263039bcc7SSean Christopherson 			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
6273039bcc7SSean Christopherson 			 */
6283039bcc7SSean Christopherson 			gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
6293039bcc7SSean Christopherson 			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
6303039bcc7SSean Christopherson 			gfn_range.slot = slot;
6313039bcc7SSean Christopherson 
6328931a454SSean Christopherson 			if (!locked) {
6338931a454SSean Christopherson 				locked = true;
6348931a454SSean Christopherson 				KVM_MMU_LOCK(kvm);
635071064f1SPaolo Bonzini 				if (!IS_KVM_NULL_FN(range->on_lock))
636071064f1SPaolo Bonzini 					range->on_lock(kvm, range->start, range->end);
637071064f1SPaolo Bonzini 				if (IS_KVM_NULL_FN(range->handler))
638071064f1SPaolo Bonzini 					break;
6398931a454SSean Christopherson 			}
6403039bcc7SSean Christopherson 			ret |= range->handler(kvm, &gfn_range);
6413039bcc7SSean Christopherson 		}
6423039bcc7SSean Christopherson 	}
6433039bcc7SSean Christopherson 
6446bc6db00SLai Jiangshan 	if (range->flush_on_ret && ret)
6453039bcc7SSean Christopherson 		kvm_flush_remote_tlbs(kvm);
6463039bcc7SSean Christopherson 
647683412ccSMingwei Zhang 	if (locked) {
648f922bd9bSSean Christopherson 		KVM_MMU_UNLOCK(kvm);
649683412ccSMingwei Zhang 		if (!IS_KVM_NULL_FN(range->on_unlock))
650683412ccSMingwei Zhang 			range->on_unlock(kvm);
651683412ccSMingwei Zhang 	}
652f922bd9bSSean Christopherson 
6533039bcc7SSean Christopherson 	srcu_read_unlock(&kvm->srcu, idx);
6543039bcc7SSean Christopherson 
6553039bcc7SSean Christopherson 	/* The notifiers are averse to booleans. :-( */
6563039bcc7SSean Christopherson 	return (int)ret;
6573039bcc7SSean Christopherson }
6583039bcc7SSean Christopherson 
kvm_handle_hva_range(struct mmu_notifier * mn,unsigned long start,unsigned long end,union kvm_mmu_notifier_arg arg,hva_handler_t handler)6593039bcc7SSean Christopherson static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
6603039bcc7SSean Christopherson 						unsigned long start,
6613039bcc7SSean Christopherson 						unsigned long end,
6623e1efe2bSSean Christopherson 						union kvm_mmu_notifier_arg arg,
6633039bcc7SSean Christopherson 						hva_handler_t handler)
6643039bcc7SSean Christopherson {
6653039bcc7SSean Christopherson 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
6663039bcc7SSean Christopherson 	const struct kvm_hva_range range = {
6673039bcc7SSean Christopherson 		.start		= start,
6683039bcc7SSean Christopherson 		.end		= end,
6693e1efe2bSSean Christopherson 		.arg		= arg,
6703039bcc7SSean Christopherson 		.handler	= handler,
671f922bd9bSSean Christopherson 		.on_lock	= (void *)kvm_null_fn,
672683412ccSMingwei Zhang 		.on_unlock	= (void *)kvm_null_fn,
6733039bcc7SSean Christopherson 		.flush_on_ret	= true,
6743039bcc7SSean Christopherson 		.may_block	= false,
6753039bcc7SSean Christopherson 	};
6763039bcc7SSean Christopherson 
677f922bd9bSSean Christopherson 	return __kvm_handle_hva_range(kvm, &range);
6783039bcc7SSean Christopherson }
6793039bcc7SSean Christopherson 
kvm_handle_hva_range_no_flush(struct mmu_notifier * mn,unsigned long start,unsigned long end,hva_handler_t handler)6803039bcc7SSean Christopherson static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
6813039bcc7SSean Christopherson 							 unsigned long start,
6823039bcc7SSean Christopherson 							 unsigned long end,
6833039bcc7SSean Christopherson 							 hva_handler_t handler)
6843039bcc7SSean Christopherson {
6853039bcc7SSean Christopherson 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
6863039bcc7SSean Christopherson 	const struct kvm_hva_range range = {
6873039bcc7SSean Christopherson 		.start		= start,
6883039bcc7SSean Christopherson 		.end		= end,
6893039bcc7SSean Christopherson 		.handler	= handler,
690f922bd9bSSean Christopherson 		.on_lock	= (void *)kvm_null_fn,
691683412ccSMingwei Zhang 		.on_unlock	= (void *)kvm_null_fn,
6923039bcc7SSean Christopherson 		.flush_on_ret	= false,
6933039bcc7SSean Christopherson 		.may_block	= false,
6943039bcc7SSean Christopherson 	};
6953039bcc7SSean Christopherson 
696f922bd9bSSean Christopherson 	return __kvm_handle_hva_range(kvm, &range);
6973039bcc7SSean Christopherson }
6982230f9e1SGavin Shan 
kvm_change_spte_gfn(struct kvm * kvm,struct kvm_gfn_range * range)6992230f9e1SGavin Shan static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
7002230f9e1SGavin Shan {
7012230f9e1SGavin Shan 	/*
7022230f9e1SGavin Shan 	 * Skipping invalid memslots is correct if and only change_pte() is
7032230f9e1SGavin Shan 	 * surrounded by invalidate_range_{start,end}(), which is currently
7042230f9e1SGavin Shan 	 * guaranteed by the primary MMU.  If that ever changes, KVM needs to
7052230f9e1SGavin Shan 	 * unmap the memslot instead of skipping the memslot to ensure that KVM
7062230f9e1SGavin Shan 	 * doesn't hold references to the old PFN.
7072230f9e1SGavin Shan 	 */
7082230f9e1SGavin Shan 	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
7092230f9e1SGavin Shan 
7102230f9e1SGavin Shan 	if (range->slot->flags & KVM_MEMSLOT_INVALID)
7112230f9e1SGavin Shan 		return false;
7122230f9e1SGavin Shan 
7132230f9e1SGavin Shan 	return kvm_set_spte_gfn(kvm, range);
7142230f9e1SGavin Shan }
7152230f9e1SGavin Shan 
kvm_mmu_notifier_change_pte(struct mmu_notifier * mn,struct mm_struct * mm,unsigned long address,pte_t pte)7163da0dd43SIzik Eidus static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
7173da0dd43SIzik Eidus 					struct mm_struct *mm,
7183da0dd43SIzik Eidus 					unsigned long address,
7193da0dd43SIzik Eidus 					pte_t pte)
7203da0dd43SIzik Eidus {
7213da0dd43SIzik Eidus 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
7223e1efe2bSSean Christopherson 	const union kvm_mmu_notifier_arg arg = { .pte = pte };
7233da0dd43SIzik Eidus 
724501b9185SSean Christopherson 	trace_kvm_set_spte_hva(address);
725501b9185SSean Christopherson 
726c13fda23SSean Christopherson 	/*
72752ac8b35SPaolo Bonzini 	 * .change_pte() must be surrounded by .invalidate_range_{start,end}().
72820ec3ebdSChao Peng 	 * If mmu_invalidate_in_progress is zero, then no in-progress
72920ec3ebdSChao Peng 	 * invalidations, including this one, found a relevant memslot at
73020ec3ebdSChao Peng 	 * start(); rechecking memslots here is unnecessary.  Note, a false
73120ec3ebdSChao Peng 	 * positive (count elevated by a different invalidation) is sub-optimal
73220ec3ebdSChao Peng 	 * but functionally ok.
733c13fda23SSean Christopherson 	 */
73452ac8b35SPaolo Bonzini 	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
73520ec3ebdSChao Peng 	if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
736071064f1SPaolo Bonzini 		return;
737c13fda23SSean Christopherson 
7383e1efe2bSSean Christopherson 	kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn);
7393da0dd43SIzik Eidus }
7403da0dd43SIzik Eidus 
kvm_mmu_invalidate_begin(struct kvm * kvm,unsigned long start,unsigned long end)74120ec3ebdSChao Peng void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
742f922bd9bSSean Christopherson 			      unsigned long end)
743e930bffeSAndrea Arcangeli {
744e930bffeSAndrea Arcangeli 	/*
745e930bffeSAndrea Arcangeli 	 * The count increase must become visible at unlock time as no
746e930bffeSAndrea Arcangeli 	 * spte can be established without taking the mmu_lock and
747e930bffeSAndrea Arcangeli 	 * count is also read inside the mmu_lock critical section.
748e930bffeSAndrea Arcangeli 	 */
74920ec3ebdSChao Peng 	kvm->mmu_invalidate_in_progress++;
75020ec3ebdSChao Peng 	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
75120ec3ebdSChao Peng 		kvm->mmu_invalidate_range_start = start;
75220ec3ebdSChao Peng 		kvm->mmu_invalidate_range_end = end;
7534a42d848SDavid Stevens 	} else {
7544a42d848SDavid Stevens 		/*
755a413a625STom Rix 		 * Fully tracking multiple concurrent ranges has diminishing
7564a42d848SDavid Stevens 		 * returns. Keep things simple and just find the minimal range
7574a42d848SDavid Stevens 		 * which includes the current and new ranges. As there won't be
7584a42d848SDavid Stevens 		 * enough information to subtract a range after its invalidate
7594a42d848SDavid Stevens 		 * completes, any ranges invalidated concurrently will
7604a42d848SDavid Stevens 		 * accumulate and persist until all outstanding invalidates
7614a42d848SDavid Stevens 		 * complete.
7624a42d848SDavid Stevens 		 */
76320ec3ebdSChao Peng 		kvm->mmu_invalidate_range_start =
76420ec3ebdSChao Peng 			min(kvm->mmu_invalidate_range_start, start);
76520ec3ebdSChao Peng 		kvm->mmu_invalidate_range_end =
76620ec3ebdSChao Peng 			max(kvm->mmu_invalidate_range_end, end);
767f922bd9bSSean Christopherson 	}
7684a42d848SDavid Stevens }
7693039bcc7SSean Christopherson 
kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier * mn,const struct mmu_notifier_range * range)770f922bd9bSSean Christopherson static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
771f922bd9bSSean Christopherson 					const struct mmu_notifier_range *range)
772f922bd9bSSean Christopherson {
773f922bd9bSSean Christopherson 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
774f922bd9bSSean Christopherson 	const struct kvm_hva_range hva_range = {
775f922bd9bSSean Christopherson 		.start		= range->start,
776f922bd9bSSean Christopherson 		.end		= range->end,
777f922bd9bSSean Christopherson 		.handler	= kvm_unmap_gfn_range,
77820ec3ebdSChao Peng 		.on_lock	= kvm_mmu_invalidate_begin,
779683412ccSMingwei Zhang 		.on_unlock	= kvm_arch_guest_memory_reclaimed,
780f922bd9bSSean Christopherson 		.flush_on_ret	= true,
781f922bd9bSSean Christopherson 		.may_block	= mmu_notifier_range_blockable(range),
782f922bd9bSSean Christopherson 	};
783565f3be2STakuya Yoshikawa 
784f922bd9bSSean Christopherson 	trace_kvm_unmap_hva_range(range->start, range->end);
785f922bd9bSSean Christopherson 
78652ac8b35SPaolo Bonzini 	/*
78752ac8b35SPaolo Bonzini 	 * Prevent memslot modification between range_start() and range_end()
78852ac8b35SPaolo Bonzini 	 * so that conditionally locking provides the same result in both
78920ec3ebdSChao Peng 	 * functions.  Without that guarantee, the mmu_invalidate_in_progress
79052ac8b35SPaolo Bonzini 	 * adjustments will be imbalanced.
79152ac8b35SPaolo Bonzini 	 *
79252ac8b35SPaolo Bonzini 	 * Pairs with the decrement in range_end().
79352ac8b35SPaolo Bonzini 	 */
79452ac8b35SPaolo Bonzini 	spin_lock(&kvm->mn_invalidate_lock);
79552ac8b35SPaolo Bonzini 	kvm->mn_active_invalidate_count++;
79652ac8b35SPaolo Bonzini 	spin_unlock(&kvm->mn_invalidate_lock);
79752ac8b35SPaolo Bonzini 
79858cd407cSSean Christopherson 	/*
79958cd407cSSean Christopherson 	 * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
80058cd407cSSean Christopherson 	 * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
80158cd407cSSean Christopherson 	 * each cache's lock.  There are relatively few caches in existence at
80258cd407cSSean Christopherson 	 * any given time, and the caches themselves can check for hva overlap,
80358cd407cSSean Christopherson 	 * i.e. don't need to rely on memslot overlap checks for performance.
80458cd407cSSean Christopherson 	 * Because this runs without holding mmu_lock, the pfn caches must use
80520ec3ebdSChao Peng 	 * mn_active_invalidate_count (see above) instead of
80620ec3ebdSChao Peng 	 * mmu_invalidate_in_progress.
80758cd407cSSean Christopherson 	 */
808982ed0deSDavid Woodhouse 	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
809982ed0deSDavid Woodhouse 					  hva_range.may_block);
810982ed0deSDavid Woodhouse 
811f922bd9bSSean Christopherson 	__kvm_handle_hva_range(kvm, &hva_range);
81293065ac7SMichal Hocko 
813e649b3f0SEiichi Tsukata 	return 0;
814e930bffeSAndrea Arcangeli }
815e930bffeSAndrea Arcangeli 
kvm_mmu_invalidate_end(struct kvm * kvm,unsigned long start,unsigned long end)81620ec3ebdSChao Peng void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
817f922bd9bSSean Christopherson 			    unsigned long end)
818e930bffeSAndrea Arcangeli {
819e930bffeSAndrea Arcangeli 	/*
820e930bffeSAndrea Arcangeli 	 * This sequence increase will notify the kvm page fault that
821e930bffeSAndrea Arcangeli 	 * the page that is going to be mapped in the spte could have
822e930bffeSAndrea Arcangeli 	 * been freed.
823e930bffeSAndrea Arcangeli 	 */
82420ec3ebdSChao Peng 	kvm->mmu_invalidate_seq++;
825a355aa54SPaul Mackerras 	smp_wmb();
826e930bffeSAndrea Arcangeli 	/*
827e930bffeSAndrea Arcangeli 	 * The above sequence increase must be visible before the
828a355aa54SPaul Mackerras 	 * below count decrease, which is ensured by the smp_wmb above
82920ec3ebdSChao Peng 	 * in conjunction with the smp_rmb in mmu_invalidate_retry().
830e930bffeSAndrea Arcangeli 	 */
83120ec3ebdSChao Peng 	kvm->mmu_invalidate_in_progress--;
832f922bd9bSSean Christopherson }
833f922bd9bSSean Christopherson 
kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier * mn,const struct mmu_notifier_range * range)834f922bd9bSSean Christopherson static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
835f922bd9bSSean Christopherson 					const struct mmu_notifier_range *range)
836f922bd9bSSean Christopherson {
837f922bd9bSSean Christopherson 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
838f922bd9bSSean Christopherson 	const struct kvm_hva_range hva_range = {
839f922bd9bSSean Christopherson 		.start		= range->start,
840f922bd9bSSean Christopherson 		.end		= range->end,
841f922bd9bSSean Christopherson 		.handler	= (void *)kvm_null_fn,
84220ec3ebdSChao Peng 		.on_lock	= kvm_mmu_invalidate_end,
843683412ccSMingwei Zhang 		.on_unlock	= (void *)kvm_null_fn,
844f922bd9bSSean Christopherson 		.flush_on_ret	= false,
845f922bd9bSSean Christopherson 		.may_block	= mmu_notifier_range_blockable(range),
846f922bd9bSSean Christopherson 	};
84752ac8b35SPaolo Bonzini 	bool wake;
848f922bd9bSSean Christopherson 
849f922bd9bSSean Christopherson 	__kvm_handle_hva_range(kvm, &hva_range);
850e930bffeSAndrea Arcangeli 
85152ac8b35SPaolo Bonzini 	/* Pairs with the increment in range_start(). */
85252ac8b35SPaolo Bonzini 	spin_lock(&kvm->mn_invalidate_lock);
85352ac8b35SPaolo Bonzini 	wake = (--kvm->mn_active_invalidate_count == 0);
85452ac8b35SPaolo Bonzini 	spin_unlock(&kvm->mn_invalidate_lock);
85552ac8b35SPaolo Bonzini 
85652ac8b35SPaolo Bonzini 	/*
85752ac8b35SPaolo Bonzini 	 * There can only be one waiter, since the wait happens under
85852ac8b35SPaolo Bonzini 	 * slots_lock.
85952ac8b35SPaolo Bonzini 	 */
86052ac8b35SPaolo Bonzini 	if (wake)
86152ac8b35SPaolo Bonzini 		rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
86252ac8b35SPaolo Bonzini 
86320ec3ebdSChao Peng 	BUG_ON(kvm->mmu_invalidate_in_progress < 0);
864e930bffeSAndrea Arcangeli }
865e930bffeSAndrea Arcangeli 
kvm_mmu_notifier_clear_flush_young(struct mmu_notifier * mn,struct mm_struct * mm,unsigned long start,unsigned long end)866e930bffeSAndrea Arcangeli static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
867e930bffeSAndrea Arcangeli 					      struct mm_struct *mm,
86857128468SAndres Lagar-Cavilla 					      unsigned long start,
86957128468SAndres Lagar-Cavilla 					      unsigned long end)
870e930bffeSAndrea Arcangeli {
871501b9185SSean Christopherson 	trace_kvm_age_hva(start, end);
872501b9185SSean Christopherson 
8733e1efe2bSSean Christopherson 	return kvm_handle_hva_range(mn, start, end, KVM_MMU_NOTIFIER_NO_ARG,
8743e1efe2bSSean Christopherson 				    kvm_age_gfn);
875e930bffeSAndrea Arcangeli }
876e930bffeSAndrea Arcangeli 
kvm_mmu_notifier_clear_young(struct mmu_notifier * mn,struct mm_struct * mm,unsigned long start,unsigned long end)8771d7715c6SVladimir Davydov static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
8781d7715c6SVladimir Davydov 					struct mm_struct *mm,
8791d7715c6SVladimir Davydov 					unsigned long start,
8801d7715c6SVladimir Davydov 					unsigned long end)
8811d7715c6SVladimir Davydov {
882501b9185SSean Christopherson 	trace_kvm_age_hva(start, end);
883501b9185SSean Christopherson 
8841d7715c6SVladimir Davydov 	/*
8851d7715c6SVladimir Davydov 	 * Even though we do not flush TLB, this will still adversely
8861d7715c6SVladimir Davydov 	 * affect performance on pre-Haswell Intel EPT, where there is
8871d7715c6SVladimir Davydov 	 * no EPT Access Bit to clear so that we have to tear down EPT
8881d7715c6SVladimir Davydov 	 * tables instead. If we find this unacceptable, we can always
8891d7715c6SVladimir Davydov 	 * add a parameter to kvm_age_hva so that it effectively doesn't
8901d7715c6SVladimir Davydov 	 * do anything on clear_young.
8911d7715c6SVladimir Davydov 	 *
8921d7715c6SVladimir Davydov 	 * Also note that currently we never issue secondary TLB flushes
8931d7715c6SVladimir Davydov 	 * from clear_young, leaving this job up to the regular system
8941d7715c6SVladimir Davydov 	 * cadence. If we find this inaccurate, we might come up with a
8951d7715c6SVladimir Davydov 	 * more sophisticated heuristic later.
8961d7715c6SVladimir Davydov 	 */
8973039bcc7SSean Christopherson 	return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
8981d7715c6SVladimir Davydov }
8991d7715c6SVladimir Davydov 
kvm_mmu_notifier_test_young(struct mmu_notifier * mn,struct mm_struct * mm,unsigned long address)9008ee53820SAndrea Arcangeli static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
9018ee53820SAndrea Arcangeli 				       struct mm_struct *mm,
9028ee53820SAndrea Arcangeli 				       unsigned long address)
9038ee53820SAndrea Arcangeli {
904501b9185SSean Christopherson 	trace_kvm_test_age_hva(address);
905501b9185SSean Christopherson 
9063039bcc7SSean Christopherson 	return kvm_handle_hva_range_no_flush(mn, address, address + 1,
9073039bcc7SSean Christopherson 					     kvm_test_age_gfn);
9088ee53820SAndrea Arcangeli }
9098ee53820SAndrea Arcangeli 
kvm_mmu_notifier_release(struct mmu_notifier * mn,struct mm_struct * mm)91085db06e5SMarcelo Tosatti static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
91185db06e5SMarcelo Tosatti 				     struct mm_struct *mm)
91285db06e5SMarcelo Tosatti {
91385db06e5SMarcelo Tosatti 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
914eda2bedaSLai Jiangshan 	int idx;
915eda2bedaSLai Jiangshan 
916eda2bedaSLai Jiangshan 	idx = srcu_read_lock(&kvm->srcu);
917683412ccSMingwei Zhang 	kvm_flush_shadow_all(kvm);
918eda2bedaSLai Jiangshan 	srcu_read_unlock(&kvm->srcu, idx);
91985db06e5SMarcelo Tosatti }
92085db06e5SMarcelo Tosatti 
921e930bffeSAndrea Arcangeli static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
922e930bffeSAndrea Arcangeli 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
923e930bffeSAndrea Arcangeli 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
924e930bffeSAndrea Arcangeli 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
9251d7715c6SVladimir Davydov 	.clear_young		= kvm_mmu_notifier_clear_young,
9268ee53820SAndrea Arcangeli 	.test_young		= kvm_mmu_notifier_test_young,
9273da0dd43SIzik Eidus 	.change_pte		= kvm_mmu_notifier_change_pte,
92885db06e5SMarcelo Tosatti 	.release		= kvm_mmu_notifier_release,
929e930bffeSAndrea Arcangeli };
9304c07b0a4SAvi Kivity 
kvm_init_mmu_notifier(struct kvm * kvm)9314c07b0a4SAvi Kivity static int kvm_init_mmu_notifier(struct kvm *kvm)
9324c07b0a4SAvi Kivity {
9334c07b0a4SAvi Kivity 	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
9344c07b0a4SAvi Kivity 	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
9354c07b0a4SAvi Kivity }
9364c07b0a4SAvi Kivity 
9374c07b0a4SAvi Kivity #else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
9384c07b0a4SAvi Kivity 
kvm_init_mmu_notifier(struct kvm * kvm)9394c07b0a4SAvi Kivity static int kvm_init_mmu_notifier(struct kvm *kvm)
9404c07b0a4SAvi Kivity {
9414c07b0a4SAvi Kivity 	return 0;
9424c07b0a4SAvi Kivity }
9434c07b0a4SAvi Kivity 
944e930bffeSAndrea Arcangeli #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
945e930bffeSAndrea Arcangeli 
9462fdef3a2SSergey Senozhatsky #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
kvm_pm_notifier_call(struct notifier_block * bl,unsigned long state,void * unused)9472fdef3a2SSergey Senozhatsky static int kvm_pm_notifier_call(struct notifier_block *bl,
9482fdef3a2SSergey Senozhatsky 				unsigned long state,
9492fdef3a2SSergey Senozhatsky 				void *unused)
9502fdef3a2SSergey Senozhatsky {
9512fdef3a2SSergey Senozhatsky 	struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
9522fdef3a2SSergey Senozhatsky 
9532fdef3a2SSergey Senozhatsky 	return kvm_arch_pm_notifier(kvm, state);
9542fdef3a2SSergey Senozhatsky }
9552fdef3a2SSergey Senozhatsky 
kvm_init_pm_notifier(struct kvm * kvm)9562fdef3a2SSergey Senozhatsky static void kvm_init_pm_notifier(struct kvm *kvm)
9572fdef3a2SSergey Senozhatsky {
9582fdef3a2SSergey Senozhatsky 	kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
9592fdef3a2SSergey Senozhatsky 	/* Suspend KVM before we suspend ftrace, RCU, etc. */
9602fdef3a2SSergey Senozhatsky 	kvm->pm_notifier.priority = INT_MAX;
9612fdef3a2SSergey Senozhatsky 	register_pm_notifier(&kvm->pm_notifier);
9622fdef3a2SSergey Senozhatsky }
9632fdef3a2SSergey Senozhatsky 
kvm_destroy_pm_notifier(struct kvm * kvm)9642fdef3a2SSergey Senozhatsky static void kvm_destroy_pm_notifier(struct kvm *kvm)
9652fdef3a2SSergey Senozhatsky {
9662fdef3a2SSergey Senozhatsky 	unregister_pm_notifier(&kvm->pm_notifier);
9672fdef3a2SSergey Senozhatsky }
9682fdef3a2SSergey Senozhatsky #else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
kvm_init_pm_notifier(struct kvm * kvm)9692fdef3a2SSergey Senozhatsky static void kvm_init_pm_notifier(struct kvm *kvm)
9702fdef3a2SSergey Senozhatsky {
9712fdef3a2SSergey Senozhatsky }
9722fdef3a2SSergey Senozhatsky 
kvm_destroy_pm_notifier(struct kvm * kvm)9732fdef3a2SSergey Senozhatsky static void kvm_destroy_pm_notifier(struct kvm *kvm)
9742fdef3a2SSergey Senozhatsky {
9752fdef3a2SSergey Senozhatsky }
9762fdef3a2SSergey Senozhatsky #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
9772fdef3a2SSergey Senozhatsky 
kvm_destroy_dirty_bitmap(struct kvm_memory_slot * memslot)978a47d2b07SPaolo Bonzini static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
979a47d2b07SPaolo Bonzini {
980a47d2b07SPaolo Bonzini 	if (!memslot->dirty_bitmap)
981a47d2b07SPaolo Bonzini 		return;
982a47d2b07SPaolo Bonzini 
983a47d2b07SPaolo Bonzini 	kvfree(memslot->dirty_bitmap);
984a47d2b07SPaolo Bonzini 	memslot->dirty_bitmap = NULL;
985a47d2b07SPaolo Bonzini }
986a47d2b07SPaolo Bonzini 
987a54d8066SMaciej S. Szmigiero /* This does not remove the slot from struct kvm_memslots data structures */
kvm_free_memslot(struct kvm * kvm,struct kvm_memory_slot * slot)988e96c81eeSSean Christopherson static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
989a47d2b07SPaolo Bonzini {
990e96c81eeSSean Christopherson 	kvm_destroy_dirty_bitmap(slot);
991a47d2b07SPaolo Bonzini 
992e96c81eeSSean Christopherson 	kvm_arch_free_memslot(kvm, slot);
993a47d2b07SPaolo Bonzini 
994a54d8066SMaciej S. Szmigiero 	kfree(slot);
995a47d2b07SPaolo Bonzini }
996a47d2b07SPaolo Bonzini 
kvm_free_memslots(struct kvm * kvm,struct kvm_memslots * slots)997a47d2b07SPaolo Bonzini static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
998a47d2b07SPaolo Bonzini {
999a54d8066SMaciej S. Szmigiero 	struct hlist_node *idnode;
1000a47d2b07SPaolo Bonzini 	struct kvm_memory_slot *memslot;
1001a54d8066SMaciej S. Szmigiero 	int bkt;
1002a47d2b07SPaolo Bonzini 
1003a54d8066SMaciej S. Szmigiero 	/*
1004a54d8066SMaciej S. Szmigiero 	 * The same memslot objects live in both active and inactive sets,
1005a54d8066SMaciej S. Szmigiero 	 * arbitrarily free using index '1' so the second invocation of this
1006a54d8066SMaciej S. Szmigiero 	 * function isn't operating over a structure with dangling pointers
1007a54d8066SMaciej S. Szmigiero 	 * (even though this function isn't actually touching them).
1008a54d8066SMaciej S. Szmigiero 	 */
1009a54d8066SMaciej S. Szmigiero 	if (!slots->node_idx)
1010a47d2b07SPaolo Bonzini 		return;
1011a47d2b07SPaolo Bonzini 
1012a54d8066SMaciej S. Szmigiero 	hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
1013e96c81eeSSean Christopherson 		kvm_free_memslot(kvm, memslot);
1014bf3e05bcSXiao Guangrong }
1015bf3e05bcSXiao Guangrong 
kvm_stats_debugfs_mode(const struct _kvm_stats_desc * pdesc)1016bc9e9e67SJing Zhang static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
1017bc9e9e67SJing Zhang {
1018bc9e9e67SJing Zhang 	switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
1019bc9e9e67SJing Zhang 	case KVM_STATS_TYPE_INSTANT:
1020bc9e9e67SJing Zhang 		return 0444;
1021bc9e9e67SJing Zhang 	case KVM_STATS_TYPE_CUMULATIVE:
1022bc9e9e67SJing Zhang 	case KVM_STATS_TYPE_PEAK:
1023bc9e9e67SJing Zhang 	default:
1024bc9e9e67SJing Zhang 		return 0644;
1025bc9e9e67SJing Zhang 	}
1026bc9e9e67SJing Zhang }
1027bc9e9e67SJing Zhang 
1028bc9e9e67SJing Zhang 
kvm_destroy_vm_debugfs(struct kvm * kvm)1029536a6f88SJanosch Frank static void kvm_destroy_vm_debugfs(struct kvm *kvm)
1030536a6f88SJanosch Frank {
1031536a6f88SJanosch Frank 	int i;
1032bc9e9e67SJing Zhang 	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1033bc9e9e67SJing Zhang 				      kvm_vcpu_stats_header.num_desc;
1034536a6f88SJanosch Frank 
1035a44a4cc1SOliver Upton 	if (IS_ERR(kvm->debugfs_dentry))
1036536a6f88SJanosch Frank 		return;
1037536a6f88SJanosch Frank 
1038536a6f88SJanosch Frank 	debugfs_remove_recursive(kvm->debugfs_dentry);
1039536a6f88SJanosch Frank 
10409d5a1dceSLuiz Capitulino 	if (kvm->debugfs_stat_data) {
1041536a6f88SJanosch Frank 		for (i = 0; i < kvm_debugfs_num_entries; i++)
1042536a6f88SJanosch Frank 			kfree(kvm->debugfs_stat_data[i]);
1043536a6f88SJanosch Frank 		kfree(kvm->debugfs_stat_data);
1044536a6f88SJanosch Frank 	}
10459d5a1dceSLuiz Capitulino }
1046536a6f88SJanosch Frank 
kvm_create_vm_debugfs(struct kvm * kvm,const char * fdname)104759f82aadSOliver Upton static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
1048536a6f88SJanosch Frank {
104985cd39afSPaolo Bonzini 	static DEFINE_MUTEX(kvm_debugfs_lock);
105085cd39afSPaolo Bonzini 	struct dentry *dent;
1051536a6f88SJanosch Frank 	char dir_name[ITOA_MAX_LEN * 2];
1052536a6f88SJanosch Frank 	struct kvm_stat_data *stat_data;
1053bc9e9e67SJing Zhang 	const struct _kvm_stats_desc *pdesc;
1054b74ed7a6SOliver Upton 	int i, ret = -ENOMEM;
1055bc9e9e67SJing Zhang 	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1056bc9e9e67SJing Zhang 				      kvm_vcpu_stats_header.num_desc;
1057536a6f88SJanosch Frank 
1058536a6f88SJanosch Frank 	if (!debugfs_initialized())
1059536a6f88SJanosch Frank 		return 0;
1060536a6f88SJanosch Frank 
106159f82aadSOliver Upton 	snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname);
106285cd39afSPaolo Bonzini 	mutex_lock(&kvm_debugfs_lock);
106385cd39afSPaolo Bonzini 	dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
106485cd39afSPaolo Bonzini 	if (dent) {
106585cd39afSPaolo Bonzini 		pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
106685cd39afSPaolo Bonzini 		dput(dent);
106785cd39afSPaolo Bonzini 		mutex_unlock(&kvm_debugfs_lock);
106885cd39afSPaolo Bonzini 		return 0;
106985cd39afSPaolo Bonzini 	}
107085cd39afSPaolo Bonzini 	dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
107185cd39afSPaolo Bonzini 	mutex_unlock(&kvm_debugfs_lock);
107285cd39afSPaolo Bonzini 	if (IS_ERR(dent))
107385cd39afSPaolo Bonzini 		return 0;
1074536a6f88SJanosch Frank 
107585cd39afSPaolo Bonzini 	kvm->debugfs_dentry = dent;
1076536a6f88SJanosch Frank 	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
1077536a6f88SJanosch Frank 					 sizeof(*kvm->debugfs_stat_data),
1078b12ce36aSBen Gardon 					 GFP_KERNEL_ACCOUNT);
1079536a6f88SJanosch Frank 	if (!kvm->debugfs_stat_data)
1080b74ed7a6SOliver Upton 		goto out_err;
1081536a6f88SJanosch Frank 
1082bc9e9e67SJing Zhang 	for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
1083bc9e9e67SJing Zhang 		pdesc = &kvm_vm_stats_desc[i];
1084b12ce36aSBen Gardon 		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1085536a6f88SJanosch Frank 		if (!stat_data)
1086b74ed7a6SOliver Upton 			goto out_err;
1087536a6f88SJanosch Frank 
1088536a6f88SJanosch Frank 		stat_data->kvm = kvm;
1089bc9e9e67SJing Zhang 		stat_data->desc = pdesc;
1090bc9e9e67SJing Zhang 		stat_data->kind = KVM_STAT_VM;
1091bc9e9e67SJing Zhang 		kvm->debugfs_stat_data[i] = stat_data;
1092bc9e9e67SJing Zhang 		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1093bc9e9e67SJing Zhang 				    kvm->debugfs_dentry, stat_data,
1094bc9e9e67SJing Zhang 				    &stat_fops_per_vm);
1095bc9e9e67SJing Zhang 	}
1096bc9e9e67SJing Zhang 
1097bc9e9e67SJing Zhang 	for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
1098bc9e9e67SJing Zhang 		pdesc = &kvm_vcpu_stats_desc[i];
1099bc9e9e67SJing Zhang 		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1100bc9e9e67SJing Zhang 		if (!stat_data)
1101b74ed7a6SOliver Upton 			goto out_err;
1102bc9e9e67SJing Zhang 
1103bc9e9e67SJing Zhang 		stat_data->kvm = kvm;
1104bc9e9e67SJing Zhang 		stat_data->desc = pdesc;
1105bc9e9e67SJing Zhang 		stat_data->kind = KVM_STAT_VCPU;
1106004d62ebSPavel Skripkin 		kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
1107bc9e9e67SJing Zhang 		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
110809cbcef6SMilan Pandurov 				    kvm->debugfs_dentry, stat_data,
110909cbcef6SMilan Pandurov 				    &stat_fops_per_vm);
1110536a6f88SJanosch Frank 	}
11113165af73SPeter Xu 
11123165af73SPeter Xu 	ret = kvm_arch_create_vm_debugfs(kvm);
1113b74ed7a6SOliver Upton 	if (ret)
1114b74ed7a6SOliver Upton 		goto out_err;
11153165af73SPeter Xu 
1116536a6f88SJanosch Frank 	return 0;
1117b74ed7a6SOliver Upton out_err:
1118b74ed7a6SOliver Upton 	kvm_destroy_vm_debugfs(kvm);
1119b74ed7a6SOliver Upton 	return ret;
1120536a6f88SJanosch Frank }
1121536a6f88SJanosch Frank 
11221aa9b957SJunaid Shahid /*
11231aa9b957SJunaid Shahid  * Called after the VM is otherwise initialized, but just before adding it to
11241aa9b957SJunaid Shahid  * the vm_list.
11251aa9b957SJunaid Shahid  */
kvm_arch_post_init_vm(struct kvm * kvm)11261aa9b957SJunaid Shahid int __weak kvm_arch_post_init_vm(struct kvm *kvm)
11271aa9b957SJunaid Shahid {
11281aa9b957SJunaid Shahid 	return 0;
11291aa9b957SJunaid Shahid }
11301aa9b957SJunaid Shahid 
11311aa9b957SJunaid Shahid /*
11321aa9b957SJunaid Shahid  * Called just after removing the VM from the vm_list, but before doing any
11331aa9b957SJunaid Shahid  * other destruction.
11341aa9b957SJunaid Shahid  */
kvm_arch_pre_destroy_vm(struct kvm * kvm)11351aa9b957SJunaid Shahid void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
11361aa9b957SJunaid Shahid {
11371aa9b957SJunaid Shahid }
11381aa9b957SJunaid Shahid 
11393165af73SPeter Xu /*
11403165af73SPeter Xu  * Called after per-vm debugfs created.  When called kvm->debugfs_dentry should
11413165af73SPeter Xu  * be setup already, so we can create arch-specific debugfs entries under it.
11423165af73SPeter Xu  * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so
11433165af73SPeter Xu  * a per-arch destroy interface is not needed.
11443165af73SPeter Xu  */
kvm_arch_create_vm_debugfs(struct kvm * kvm)11453165af73SPeter Xu int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
11463165af73SPeter Xu {
11473165af73SPeter Xu 	return 0;
11483165af73SPeter Xu }
11493165af73SPeter Xu 
kvm_create_vm(unsigned long type,const char * fdname)1150b74ed7a6SOliver Upton static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
11510fce5623SAvi Kivity {
1152d89f5effSJan Kiszka 	struct kvm *kvm = kvm_arch_alloc_vm();
1153a54d8066SMaciej S. Szmigiero 	struct kvm_memslots *slots;
11549121923cSJim Mattson 	int r = -ENOMEM;
1155a54d8066SMaciej S. Szmigiero 	int i, j;
11560fce5623SAvi Kivity 
1157d89f5effSJan Kiszka 	if (!kvm)
1158d89f5effSJan Kiszka 		return ERR_PTR(-ENOMEM);
1159d89f5effSJan Kiszka 
1160405294f2SSean Christopherson 	/* KVM is pinned via open("/dev/kvm"), the fd passed to this ioctl(). */
1161405294f2SSean Christopherson 	__module_get(kvm_chardev_ops.owner);
1162405294f2SSean Christopherson 
1163531810caSBen Gardon 	KVM_MMU_LOCK_INIT(kvm);
1164f1f10076SVegard Nossum 	mmgrab(current->mm);
1165e9ad4ec8SPaolo Bonzini 	kvm->mm = current->mm;
1166e9ad4ec8SPaolo Bonzini 	kvm_eventfd_init(kvm);
1167e9ad4ec8SPaolo Bonzini 	mutex_init(&kvm->lock);
1168e9ad4ec8SPaolo Bonzini 	mutex_init(&kvm->irq_lock);
1169e9ad4ec8SPaolo Bonzini 	mutex_init(&kvm->slots_lock);
1170b10a038eSBen Gardon 	mutex_init(&kvm->slots_arch_lock);
117152ac8b35SPaolo Bonzini 	spin_lock_init(&kvm->mn_invalidate_lock);
117252ac8b35SPaolo Bonzini 	rcuwait_init(&kvm->mn_memslots_update_rcuwait);
1173c5b07754SMarc Zyngier 	xa_init(&kvm->vcpu_array);
117452ac8b35SPaolo Bonzini 
1175982ed0deSDavid Woodhouse 	INIT_LIST_HEAD(&kvm->gpc_list);
1176982ed0deSDavid Woodhouse 	spin_lock_init(&kvm->gpc_lock);
1177e9ad4ec8SPaolo Bonzini 
1178e9ad4ec8SPaolo Bonzini 	INIT_LIST_HEAD(&kvm->devices);
1179f502cc56SSean Christopherson 	kvm->max_vcpus = KVM_MAX_VCPUS;
1180e9ad4ec8SPaolo Bonzini 
11819121923cSJim Mattson 	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
11829121923cSJim Mattson 
11835c697c36SSean Christopherson 	/*
11845c697c36SSean Christopherson 	 * Force subsequent debugfs file creations to fail if the VM directory
11855c697c36SSean Christopherson 	 * is not created (by kvm_create_vm_debugfs()).
11865c697c36SSean Christopherson 	 */
11875c697c36SSean Christopherson 	kvm->debugfs_dentry = ERR_PTR(-ENOENT);
11885c697c36SSean Christopherson 
1189f2759c08SOliver Upton 	snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
1190f2759c08SOliver Upton 		 task_pid_nr(current));
1191f2759c08SOliver Upton 
11928a44119aSPaolo Bonzini 	if (init_srcu_struct(&kvm->srcu))
11938a44119aSPaolo Bonzini 		goto out_err_no_srcu;
11948a44119aSPaolo Bonzini 	if (init_srcu_struct(&kvm->irq_srcu))
11958a44119aSPaolo Bonzini 		goto out_err_no_irq_srcu;
11968a44119aSPaolo Bonzini 
1197e2d3fcafSPaolo Bonzini 	refcount_set(&kvm->users_count, 1);
11989121923cSJim Mattson 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1199a54d8066SMaciej S. Szmigiero 		for (j = 0; j < 2; j++) {
1200a54d8066SMaciej S. Szmigiero 			slots = &kvm->__memslots[i][j];
12019121923cSJim Mattson 
1202a54d8066SMaciej S. Szmigiero 			atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
1203a54d8066SMaciej S. Szmigiero 			slots->hva_tree = RB_ROOT_CACHED;
1204a54d8066SMaciej S. Szmigiero 			slots->gfn_tree = RB_ROOT;
1205a54d8066SMaciej S. Szmigiero 			hash_init(slots->id_hash);
1206a54d8066SMaciej S. Szmigiero 			slots->node_idx = j;
1207a54d8066SMaciej S. Szmigiero 
12089121923cSJim Mattson 			/* Generations must be different for each address space. */
12099121923cSJim Mattson 			slots->generation = i;
1210a54d8066SMaciej S. Szmigiero 		}
1211a54d8066SMaciej S. Szmigiero 
1212a54d8066SMaciej S. Szmigiero 		rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
12139121923cSJim Mattson 	}
12149121923cSJim Mattson 
12159121923cSJim Mattson 	for (i = 0; i < KVM_NR_BUSES; i++) {
12169121923cSJim Mattson 		rcu_assign_pointer(kvm->buses[i],
12179121923cSJim Mattson 			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
12189121923cSJim Mattson 		if (!kvm->buses[i])
1219a97b0e77SJim Mattson 			goto out_err_no_arch_destroy_vm;
12209121923cSJim Mattson 	}
12219121923cSJim Mattson 
1222e08b9637SCarsten Otte 	r = kvm_arch_init_vm(kvm, type);
1223d89f5effSJan Kiszka 	if (r)
1224a97b0e77SJim Mattson 		goto out_err_no_arch_destroy_vm;
122510474ae8SAlexander Graf 
122610474ae8SAlexander Graf 	r = hardware_enable_all();
122710474ae8SAlexander Graf 	if (r)
1228719d93cdSChristian Borntraeger 		goto out_err_no_disable;
122910474ae8SAlexander Graf 
1230c77dcacbSPaolo Bonzini #ifdef CONFIG_HAVE_KVM_IRQFD
1231136bdfeeSGleb Natapov 	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
123275858a84SAvi Kivity #endif
12330fce5623SAvi Kivity 
123474b5c5bfSMike Waychison 	r = kvm_init_mmu_notifier(kvm);
123574b5c5bfSMike Waychison 	if (r)
12361aa9b957SJunaid Shahid 		goto out_err_no_mmu_notifier;
12371aa9b957SJunaid Shahid 
1238c2b82397SSean Christopherson 	r = kvm_coalesced_mmio_init(kvm);
1239c2b82397SSean Christopherson 	if (r < 0)
1240c2b82397SSean Christopherson 		goto out_no_coalesced_mmio;
1241c2b82397SSean Christopherson 
12424ba4f419SSean Christopherson 	r = kvm_create_vm_debugfs(kvm, fdname);
12434ba4f419SSean Christopherson 	if (r)
12444ba4f419SSean Christopherson 		goto out_err_no_debugfs;
12454ba4f419SSean Christopherson 
12461aa9b957SJunaid Shahid 	r = kvm_arch_post_init_vm(kvm);
12471aa9b957SJunaid Shahid 	if (r)
12484ba4f419SSean Christopherson 		goto out_err;
124974b5c5bfSMike Waychison 
12500d9ce162SJunaid Shahid 	mutex_lock(&kvm_lock);
12510fce5623SAvi Kivity 	list_add(&kvm->vm_list, &vm_list);
12520d9ce162SJunaid Shahid 	mutex_unlock(&kvm_lock);
1253d89f5effSJan Kiszka 
12542ecd9d29SPeter Zijlstra 	preempt_notifier_inc();
12552fdef3a2SSergey Senozhatsky 	kvm_init_pm_notifier(kvm);
12562ecd9d29SPeter Zijlstra 
12570fce5623SAvi Kivity 	return kvm;
125810474ae8SAlexander Graf 
125910474ae8SAlexander Graf out_err:
12604ba4f419SSean Christopherson 	kvm_destroy_vm_debugfs(kvm);
12614ba4f419SSean Christopherson out_err_no_debugfs:
1262c2b82397SSean Christopherson 	kvm_coalesced_mmio_free(kvm);
1263c2b82397SSean Christopherson out_no_coalesced_mmio:
12641aa9b957SJunaid Shahid #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
12651aa9b957SJunaid Shahid 	if (kvm->mmu_notifier.ops)
12661aa9b957SJunaid Shahid 		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
12671aa9b957SJunaid Shahid #endif
12681aa9b957SJunaid Shahid out_err_no_mmu_notifier:
126910474ae8SAlexander Graf 	hardware_disable_all();
1270719d93cdSChristian Borntraeger out_err_no_disable:
1271a97b0e77SJim Mattson 	kvm_arch_destroy_vm(kvm);
1272a97b0e77SJim Mattson out_err_no_arch_destroy_vm:
1273e2d3fcafSPaolo Bonzini 	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
1274e93f8a0fSMarcelo Tosatti 	for (i = 0; i < KVM_NR_BUSES; i++)
12753898da94SPaolo Bonzini 		kfree(kvm_get_bus(kvm, i));
12768a44119aSPaolo Bonzini 	cleanup_srcu_struct(&kvm->irq_srcu);
12778a44119aSPaolo Bonzini out_err_no_irq_srcu:
12788a44119aSPaolo Bonzini 	cleanup_srcu_struct(&kvm->srcu);
12798a44119aSPaolo Bonzini out_err_no_srcu:
1280d89f5effSJan Kiszka 	kvm_arch_free_vm(kvm);
1281e9ad4ec8SPaolo Bonzini 	mmdrop(current->mm);
1282405294f2SSean Christopherson 	module_put(kvm_chardev_ops.owner);
128310474ae8SAlexander Graf 	return ERR_PTR(r);
12840fce5623SAvi Kivity }
12850fce5623SAvi Kivity 
kvm_destroy_devices(struct kvm * kvm)128607f0a7bdSScott Wood static void kvm_destroy_devices(struct kvm *kvm)
128707f0a7bdSScott Wood {
1288e6e3b5a6SGeliang Tang 	struct kvm_device *dev, *tmp;
128907f0a7bdSScott Wood 
1290a28ebea2SChristoffer Dall 	/*
1291a28ebea2SChristoffer Dall 	 * We do not need to take the kvm->lock here, because nobody else
1292a28ebea2SChristoffer Dall 	 * has a reference to the struct kvm at this point and therefore
1293a28ebea2SChristoffer Dall 	 * cannot access the devices list anyhow.
1294a28ebea2SChristoffer Dall 	 */
1295e6e3b5a6SGeliang Tang 	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1296e6e3b5a6SGeliang Tang 		list_del(&dev->vm_node);
129707f0a7bdSScott Wood 		dev->ops->destroy(dev);
129807f0a7bdSScott Wood 	}
129907f0a7bdSScott Wood }
130007f0a7bdSScott Wood 
kvm_destroy_vm(struct kvm * kvm)13010fce5623SAvi Kivity static void kvm_destroy_vm(struct kvm *kvm)
13020fce5623SAvi Kivity {
1303e93f8a0fSMarcelo Tosatti 	int i;
13040fce5623SAvi Kivity 	struct mm_struct *mm = kvm->mm;
13050fce5623SAvi Kivity 
13062fdef3a2SSergey Senozhatsky 	kvm_destroy_pm_notifier(kvm);
1307286de8f6SClaudio Imbrenda 	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
1308536a6f88SJanosch Frank 	kvm_destroy_vm_debugfs(kvm);
1309ad8ba2cdSSheng Yang 	kvm_arch_sync_events(kvm);
13100d9ce162SJunaid Shahid 	mutex_lock(&kvm_lock);
13110fce5623SAvi Kivity 	list_del(&kvm->vm_list);
13120d9ce162SJunaid Shahid 	mutex_unlock(&kvm_lock);
13131aa9b957SJunaid Shahid 	kvm_arch_pre_destroy_vm(kvm);
13141aa9b957SJunaid Shahid 
1315399ec807SAvi Kivity 	kvm_free_irq_routing(kvm);
1316df630b8cSPeter Xu 	for (i = 0; i < KVM_NR_BUSES; i++) {
13173898da94SPaolo Bonzini 		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
13184a12f951SChristian Borntraeger 
13194a12f951SChristian Borntraeger 		if (bus)
13204a12f951SChristian Borntraeger 			kvm_io_bus_destroy(bus);
1321df630b8cSPeter Xu 		kvm->buses[i] = NULL;
1322df630b8cSPeter Xu 	}
1323980da6ceSAvi Kivity 	kvm_coalesced_mmio_free(kvm);
1324e930bffeSAndrea Arcangeli #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1325e930bffeSAndrea Arcangeli 	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
132652ac8b35SPaolo Bonzini 	/*
132752ac8b35SPaolo Bonzini 	 * At this point, pending calls to invalidate_range_start()
132852ac8b35SPaolo Bonzini 	 * have completed but no more MMU notifiers will run, so
132952ac8b35SPaolo Bonzini 	 * mn_active_invalidate_count may remain unbalanced.
1330b0d23708SJun Miao 	 * No threads can be waiting in kvm_swap_active_memslots() as the
133152ac8b35SPaolo Bonzini 	 * last reference on KVM has been dropped, but freeing
133252ac8b35SPaolo Bonzini 	 * memslots would deadlock without this manual intervention.
133352ac8b35SPaolo Bonzini 	 */
133452ac8b35SPaolo Bonzini 	WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
133552ac8b35SPaolo Bonzini 	kvm->mn_active_invalidate_count = 0;
1336f00be0caSGleb Natapov #else
1337683412ccSMingwei Zhang 	kvm_flush_shadow_all(kvm);
1338e930bffeSAndrea Arcangeli #endif
13390fce5623SAvi Kivity 	kvm_arch_destroy_vm(kvm);
134007f0a7bdSScott Wood 	kvm_destroy_devices(kvm);
1341a54d8066SMaciej S. Szmigiero 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1342a54d8066SMaciej S. Szmigiero 		kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
1343a54d8066SMaciej S. Szmigiero 		kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
1344a54d8066SMaciej S. Szmigiero 	}
1345820b3fcdSPaolo Bonzini 	cleanup_srcu_struct(&kvm->irq_srcu);
1346d89f5effSJan Kiszka 	cleanup_srcu_struct(&kvm->srcu);
1347d89f5effSJan Kiszka 	kvm_arch_free_vm(kvm);
13482ecd9d29SPeter Zijlstra 	preempt_notifier_dec();
134910474ae8SAlexander Graf 	hardware_disable_all();
13500fce5623SAvi Kivity 	mmdrop(mm);
13515f6de5cbSDavid Matlack 	module_put(kvm_chardev_ops.owner);
13520fce5623SAvi Kivity }
13530fce5623SAvi Kivity 
kvm_get_kvm(struct kvm * kvm)1354d39f13b0SIzik Eidus void kvm_get_kvm(struct kvm *kvm)
1355d39f13b0SIzik Eidus {
1356e3736c3eSElena Reshetova 	refcount_inc(&kvm->users_count);
1357d39f13b0SIzik Eidus }
1358d39f13b0SIzik Eidus EXPORT_SYMBOL_GPL(kvm_get_kvm);
1359d39f13b0SIzik Eidus 
1360605c7130SPeter Xu /*
1361605c7130SPeter Xu  * Make sure the vm is not during destruction, which is a safe version of
1362605c7130SPeter Xu  * kvm_get_kvm().  Return true if kvm referenced successfully, false otherwise.
1363605c7130SPeter Xu  */
kvm_get_kvm_safe(struct kvm * kvm)1364605c7130SPeter Xu bool kvm_get_kvm_safe(struct kvm *kvm)
1365605c7130SPeter Xu {
1366605c7130SPeter Xu 	return refcount_inc_not_zero(&kvm->users_count);
1367605c7130SPeter Xu }
1368605c7130SPeter Xu EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
1369605c7130SPeter Xu 
kvm_put_kvm(struct kvm * kvm)1370d39f13b0SIzik Eidus void kvm_put_kvm(struct kvm *kvm)
1371d39f13b0SIzik Eidus {
1372e3736c3eSElena Reshetova 	if (refcount_dec_and_test(&kvm->users_count))
1373d39f13b0SIzik Eidus 		kvm_destroy_vm(kvm);
1374d39f13b0SIzik Eidus }
1375d39f13b0SIzik Eidus EXPORT_SYMBOL_GPL(kvm_put_kvm);
1376d39f13b0SIzik Eidus 
1377149487bdSSean Christopherson /*
1378149487bdSSean Christopherson  * Used to put a reference that was taken on behalf of an object associated
1379149487bdSSean Christopherson  * with a user-visible file descriptor, e.g. a vcpu or device, if installation
1380149487bdSSean Christopherson  * of the new file descriptor fails and the reference cannot be transferred to
1381149487bdSSean Christopherson  * its final owner.  In such cases, the caller is still actively using @kvm and
1382149487bdSSean Christopherson  * will fail miserably if the refcount unexpectedly hits zero.
1383149487bdSSean Christopherson  */
kvm_put_kvm_no_destroy(struct kvm * kvm)1384149487bdSSean Christopherson void kvm_put_kvm_no_destroy(struct kvm *kvm)
1385149487bdSSean Christopherson {
1386149487bdSSean Christopherson 	WARN_ON(refcount_dec_and_test(&kvm->users_count));
1387149487bdSSean Christopherson }
1388149487bdSSean Christopherson EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
1389d39f13b0SIzik Eidus 
kvm_vm_release(struct inode * inode,struct file * filp)13900fce5623SAvi Kivity static int kvm_vm_release(struct inode *inode, struct file *filp)
13910fce5623SAvi Kivity {
13920fce5623SAvi Kivity 	struct kvm *kvm = filp->private_data;
13930fce5623SAvi Kivity 
1394721eecbfSGregory Haskins 	kvm_irqfd_release(kvm);
1395721eecbfSGregory Haskins 
1396d39f13b0SIzik Eidus 	kvm_put_kvm(kvm);
13970fce5623SAvi Kivity 	return 0;
13980fce5623SAvi Kivity }
13990fce5623SAvi Kivity 
1400515a0127STakuya Yoshikawa /*
1401515a0127STakuya Yoshikawa  * Allocation size is twice as large as the actual dirty bitmap size.
14020dff0846SSean Christopherson  * See kvm_vm_ioctl_get_dirty_log() why this is needed.
1403515a0127STakuya Yoshikawa  */
kvm_alloc_dirty_bitmap(struct kvm_memory_slot * memslot)14043c9bd400SJay Zhou static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1405a36a57b1STakuya Yoshikawa {
140637b2a651SPaolo Bonzini 	unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);
1407a36a57b1STakuya Yoshikawa 
140837b2a651SPaolo Bonzini 	memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
1409a36a57b1STakuya Yoshikawa 	if (!memslot->dirty_bitmap)
1410a36a57b1STakuya Yoshikawa 		return -ENOMEM;
1411a36a57b1STakuya Yoshikawa 
1412a36a57b1STakuya Yoshikawa 	return 0;
1413a36a57b1STakuya Yoshikawa }
1414a36a57b1STakuya Yoshikawa 
kvm_get_inactive_memslots(struct kvm * kvm,int as_id)1415a54d8066SMaciej S. Szmigiero static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
1416bf3e05bcSXiao Guangrong {
1417a54d8066SMaciej S. Szmigiero 	struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
1418a54d8066SMaciej S. Szmigiero 	int node_idx_inactive = active->node_idx ^ 1;
1419bf3e05bcSXiao Guangrong 
1420a54d8066SMaciej S. Szmigiero 	return &kvm->__memslots[as_id][node_idx_inactive];
14218593176cSPaolo Bonzini }
1422efbeec70SPaolo Bonzini 
1423efbeec70SPaolo Bonzini /*
1424a54d8066SMaciej S. Szmigiero  * Helper to get the address space ID when one of memslot pointers may be NULL.
1425a54d8066SMaciej S. Szmigiero  * This also serves as a sanity that at least one of the pointers is non-NULL,
1426a54d8066SMaciej S. Szmigiero  * and that their address space IDs don't diverge.
1427efbeec70SPaolo Bonzini  */
kvm_memslots_get_as_id(struct kvm_memory_slot * a,struct kvm_memory_slot * b)1428a54d8066SMaciej S. Szmigiero static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
1429a54d8066SMaciej S. Szmigiero 				  struct kvm_memory_slot *b)
14300577d1abSSean Christopherson {
1431a54d8066SMaciej S. Szmigiero 	if (WARN_ON_ONCE(!a && !b))
1432a54d8066SMaciej S. Szmigiero 		return 0;
1433a54d8066SMaciej S. Szmigiero 
1434a54d8066SMaciej S. Szmigiero 	if (!a)
1435a54d8066SMaciej S. Szmigiero 		return b->as_id;
1436a54d8066SMaciej S. Szmigiero 	if (!b)
1437a54d8066SMaciej S. Szmigiero 		return a->as_id;
1438a54d8066SMaciej S. Szmigiero 
1439a54d8066SMaciej S. Szmigiero 	WARN_ON_ONCE(a->as_id != b->as_id);
1440a54d8066SMaciej S. Szmigiero 	return a->as_id;
14410577d1abSSean Christopherson }
14420577d1abSSean Christopherson 
kvm_insert_gfn_node(struct kvm_memslots * slots,struct kvm_memory_slot * slot)1443a54d8066SMaciej S. Szmigiero static void kvm_insert_gfn_node(struct kvm_memslots *slots,
1444a54d8066SMaciej S. Szmigiero 				struct kvm_memory_slot *slot)
14450577d1abSSean Christopherson {
1446a54d8066SMaciej S. Szmigiero 	struct rb_root *gfn_tree = &slots->gfn_tree;
1447a54d8066SMaciej S. Szmigiero 	struct rb_node **node, *parent;
1448a54d8066SMaciej S. Szmigiero 	int idx = slots->node_idx;
14490577d1abSSean Christopherson 
1450a54d8066SMaciej S. Szmigiero 	parent = NULL;
1451a54d8066SMaciej S. Szmigiero 	for (node = &gfn_tree->rb_node; *node; ) {
1452a54d8066SMaciej S. Szmigiero 		struct kvm_memory_slot *tmp;
14530577d1abSSean Christopherson 
1454a54d8066SMaciej S. Szmigiero 		tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
1455a54d8066SMaciej S. Szmigiero 		parent = *node;
1456a54d8066SMaciej S. Szmigiero 		if (slot->base_gfn < tmp->base_gfn)
1457a54d8066SMaciej S. Szmigiero 			node = &(*node)->rb_left;
1458a54d8066SMaciej S. Szmigiero 		else if (slot->base_gfn > tmp->base_gfn)
1459a54d8066SMaciej S. Szmigiero 			node = &(*node)->rb_right;
14600577d1abSSean Christopherson 		else
1461a54d8066SMaciej S. Szmigiero 			BUG();
1462a54d8066SMaciej S. Szmigiero 	}
1463a54d8066SMaciej S. Szmigiero 
1464a54d8066SMaciej S. Szmigiero 	rb_link_node(&slot->gfn_node[idx], parent, node);
1465a54d8066SMaciej S. Szmigiero 	rb_insert_color(&slot->gfn_node[idx], gfn_tree);
1466a54d8066SMaciej S. Szmigiero }
1467a54d8066SMaciej S. Szmigiero 
kvm_erase_gfn_node(struct kvm_memslots * slots,struct kvm_memory_slot * slot)1468a54d8066SMaciej S. Szmigiero static void kvm_erase_gfn_node(struct kvm_memslots *slots,
1469a54d8066SMaciej S. Szmigiero 			       struct kvm_memory_slot *slot)
1470a54d8066SMaciej S. Szmigiero {
1471a54d8066SMaciej S. Szmigiero 	rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
1472a54d8066SMaciej S. Szmigiero }
1473a54d8066SMaciej S. Szmigiero 
kvm_replace_gfn_node(struct kvm_memslots * slots,struct kvm_memory_slot * old,struct kvm_memory_slot * new)1474a54d8066SMaciej S. Szmigiero static void kvm_replace_gfn_node(struct kvm_memslots *slots,
147526b8345aSMaciej S. Szmigiero 				 struct kvm_memory_slot *old,
147626b8345aSMaciej S. Szmigiero 				 struct kvm_memory_slot *new)
147726b8345aSMaciej S. Szmigiero {
1478a54d8066SMaciej S. Szmigiero 	int idx = slots->node_idx;
1479a54d8066SMaciej S. Szmigiero 
1480a54d8066SMaciej S. Szmigiero 	WARN_ON_ONCE(old->base_gfn != new->base_gfn);
1481a54d8066SMaciej S. Szmigiero 
1482a54d8066SMaciej S. Szmigiero 	rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
1483a54d8066SMaciej S. Szmigiero 			&slots->gfn_tree);
1484a54d8066SMaciej S. Szmigiero }
14850577d1abSSean Christopherson 
14860577d1abSSean Christopherson /*
1487a54d8066SMaciej S. Szmigiero  * Replace @old with @new in the inactive memslots.
1488a54d8066SMaciej S. Szmigiero  *
1489a54d8066SMaciej S. Szmigiero  * With NULL @old this simply adds @new.
1490a54d8066SMaciej S. Szmigiero  * With NULL @new this simply removes @old.
1491a54d8066SMaciej S. Szmigiero  *
1492a54d8066SMaciej S. Szmigiero  * If @new is non-NULL its hva_node[slots_idx] range has to be set
1493a54d8066SMaciej S. Szmigiero  * appropriately.
14940577d1abSSean Christopherson  */
kvm_replace_memslot(struct kvm * kvm,struct kvm_memory_slot * old,struct kvm_memory_slot * new)1495a54d8066SMaciej S. Szmigiero static void kvm_replace_memslot(struct kvm *kvm,
1496a54d8066SMaciej S. Szmigiero 				struct kvm_memory_slot *old,
1497a54d8066SMaciej S. Szmigiero 				struct kvm_memory_slot *new)
1498a54d8066SMaciej S. Szmigiero {
1499a54d8066SMaciej S. Szmigiero 	int as_id = kvm_memslots_get_as_id(old, new);
1500a54d8066SMaciej S. Szmigiero 	struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1501a54d8066SMaciej S. Szmigiero 	int idx = slots->node_idx;
1502a54d8066SMaciej S. Szmigiero 
150326b8345aSMaciej S. Szmigiero 	if (old) {
1504a54d8066SMaciej S. Szmigiero 		hash_del(&old->id_node[idx]);
1505a54d8066SMaciej S. Szmigiero 		interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);
150626b8345aSMaciej S. Szmigiero 
1507a54d8066SMaciej S. Szmigiero 		if ((long)old == atomic_long_read(&slots->last_used_slot))
1508a54d8066SMaciej S. Szmigiero 			atomic_long_set(&slots->last_used_slot, (long)new);
1509a54d8066SMaciej S. Szmigiero 
1510a54d8066SMaciej S. Szmigiero 		if (!new) {
1511a54d8066SMaciej S. Szmigiero 			kvm_erase_gfn_node(slots, old);
151226b8345aSMaciej S. Szmigiero 			return;
1513a54d8066SMaciej S. Szmigiero 		}
1514a54d8066SMaciej S. Szmigiero 	}
151526b8345aSMaciej S. Szmigiero 
1516a54d8066SMaciej S. Szmigiero 	/*
1517a54d8066SMaciej S. Szmigiero 	 * Initialize @new's hva range.  Do this even when replacing an @old
1518a54d8066SMaciej S. Szmigiero 	 * slot, kvm_copy_memslot() deliberately does not touch node data.
1519a54d8066SMaciej S. Szmigiero 	 */
1520a54d8066SMaciej S. Szmigiero 	new->hva_node[idx].start = new->userspace_addr;
1521a54d8066SMaciej S. Szmigiero 	new->hva_node[idx].last = new->userspace_addr +
1522ed922739SMaciej S. Szmigiero 				  (new->npages << PAGE_SHIFT) - 1;
152326b8345aSMaciej S. Szmigiero 
15240fce5623SAvi Kivity 	/*
1525a54d8066SMaciej S. Szmigiero 	 * (Re)Add the new memslot.  There is no O(1) interval_tree_replace(),
1526a54d8066SMaciej S. Szmigiero 	 * hva_node needs to be swapped with remove+insert even though hva can't
1527a54d8066SMaciej S. Szmigiero 	 * change when replacing an existing slot.
15280fce5623SAvi Kivity 	 */
1529a54d8066SMaciej S. Szmigiero 	hash_add(slots->id_hash, &new->id_node[idx], new->id);
1530a54d8066SMaciej S. Szmigiero 	interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);
15310fce5623SAvi Kivity 
153226b8345aSMaciej S. Szmigiero 	/*
1533a54d8066SMaciej S. Szmigiero 	 * If the memslot gfn is unchanged, rb_replace_node() can be used to
1534a54d8066SMaciej S. Szmigiero 	 * switch the node in the gfn tree instead of removing the old and
1535a54d8066SMaciej S. Szmigiero 	 * inserting the new as two separate operations. Replacement is a
1536a54d8066SMaciej S. Szmigiero 	 * single O(1) operation versus two O(log(n)) operations for
1537a54d8066SMaciej S. Szmigiero 	 * remove+insert.
153826b8345aSMaciej S. Szmigiero 	 */
1539a54d8066SMaciej S. Szmigiero 	if (old && old->base_gfn == new->base_gfn) {
1540a54d8066SMaciej S. Szmigiero 		kvm_replace_gfn_node(slots, old, new);
15410577d1abSSean Christopherson 	} else {
1542a54d8066SMaciej S. Szmigiero 		if (old)
1543a54d8066SMaciej S. Szmigiero 			kvm_erase_gfn_node(slots, old);
1544a54d8066SMaciej S. Szmigiero 		kvm_insert_gfn_node(slots, new);
15450577d1abSSean Christopherson 	}
1546bf3e05bcSXiao Guangrong }
1547bf3e05bcSXiao Guangrong 
check_memory_region_flags(const struct kvm_userspace_memory_region * mem)154809170a49SPaolo Bonzini static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
1549a50d64d6SXiao Guangrong {
15504d8b81abSXiao Guangrong 	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
15514d8b81abSXiao Guangrong 
15520f8a4de3SChristoffer Dall #ifdef __KVM_HAVE_READONLY_MEM
15534d8b81abSXiao Guangrong 	valid_flags |= KVM_MEM_READONLY;
15544d8b81abSXiao Guangrong #endif
15554d8b81abSXiao Guangrong 
15564d8b81abSXiao Guangrong 	if (mem->flags & ~valid_flags)
1557a50d64d6SXiao Guangrong 		return -EINVAL;
1558a50d64d6SXiao Guangrong 
1559a50d64d6SXiao Guangrong 	return 0;
1560a50d64d6SXiao Guangrong }
1561a50d64d6SXiao Guangrong 
kvm_swap_active_memslots(struct kvm * kvm,int as_id)1562a54d8066SMaciej S. Szmigiero static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
15637ec4fb44SGleb Natapov {
1564a54d8066SMaciej S. Szmigiero 	struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1565a54d8066SMaciej S. Szmigiero 
1566a54d8066SMaciej S. Szmigiero 	/* Grab the generation from the activate memslots. */
1567a54d8066SMaciej S. Szmigiero 	u64 gen = __kvm_memslots(kvm, as_id)->generation;
15687ec4fb44SGleb Natapov 
1569361209e0SSean Christopherson 	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1570361209e0SSean Christopherson 	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1571ee3d1570SDavid Matlack 
157252ac8b35SPaolo Bonzini 	/*
157352ac8b35SPaolo Bonzini 	 * Do not store the new memslots while there are invalidations in
1574071064f1SPaolo Bonzini 	 * progress, otherwise the locking in invalidate_range_start and
1575071064f1SPaolo Bonzini 	 * invalidate_range_end will be unbalanced.
157652ac8b35SPaolo Bonzini 	 */
157752ac8b35SPaolo Bonzini 	spin_lock(&kvm->mn_invalidate_lock);
157852ac8b35SPaolo Bonzini 	prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
157952ac8b35SPaolo Bonzini 	while (kvm->mn_active_invalidate_count) {
158052ac8b35SPaolo Bonzini 		set_current_state(TASK_UNINTERRUPTIBLE);
158152ac8b35SPaolo Bonzini 		spin_unlock(&kvm->mn_invalidate_lock);
158252ac8b35SPaolo Bonzini 		schedule();
158352ac8b35SPaolo Bonzini 		spin_lock(&kvm->mn_invalidate_lock);
158452ac8b35SPaolo Bonzini 	}
158552ac8b35SPaolo Bonzini 	finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
1586f481b069SPaolo Bonzini 	rcu_assign_pointer(kvm->memslots[as_id], slots);
158752ac8b35SPaolo Bonzini 	spin_unlock(&kvm->mn_invalidate_lock);
1588b10a038eSBen Gardon 
1589b10a038eSBen Gardon 	/*
1590b10a038eSBen Gardon 	 * Acquired in kvm_set_memslot. Must be released before synchronize
1591b10a038eSBen Gardon 	 * SRCU below in order to avoid deadlock with another thread
1592b10a038eSBen Gardon 	 * acquiring the slots_arch_lock in an srcu critical section.
1593b10a038eSBen Gardon 	 */
1594b10a038eSBen Gardon 	mutex_unlock(&kvm->slots_arch_lock);
1595b10a038eSBen Gardon 
15967ec4fb44SGleb Natapov 	synchronize_srcu_expedited(&kvm->srcu);
1597e59dbe09STakuya Yoshikawa 
1598ee3d1570SDavid Matlack 	/*
1599361209e0SSean Christopherson 	 * Increment the new memslot generation a second time, dropping the
160000116795SMiaohe Lin 	 * update in-progress flag and incrementing the generation based on
1601361209e0SSean Christopherson 	 * the number of address spaces.  This provides a unique and easily
1602361209e0SSean Christopherson 	 * identifiable generation number while the memslots are in flux.
1603361209e0SSean Christopherson 	 */
1604361209e0SSean Christopherson 	gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1605361209e0SSean Christopherson 
1606361209e0SSean Christopherson 	/*
16074bd518f1SPaolo Bonzini 	 * Generations must be unique even across address spaces.  We do not need
16084bd518f1SPaolo Bonzini 	 * a global counter for that, instead the generation space is evenly split
16094bd518f1SPaolo Bonzini 	 * across address spaces.  For example, with two address spaces, address
1610164bf7e5SSean Christopherson 	 * space 0 will use generations 0, 2, 4, ... while address space 1 will
1611164bf7e5SSean Christopherson 	 * use generations 1, 3, 5, ...
1612ee3d1570SDavid Matlack 	 */
1613164bf7e5SSean Christopherson 	gen += KVM_ADDRESS_SPACE_NUM;
1614ee3d1570SDavid Matlack 
161515248258SSean Christopherson 	kvm_arch_memslots_updated(kvm, gen);
161615248258SSean Christopherson 
161715248258SSean Christopherson 	slots->generation = gen;
16187ec4fb44SGleb Natapov }
16197ec4fb44SGleb Natapov 
kvm_prepare_memory_region(struct kvm * kvm,const struct kvm_memory_slot * old,struct kvm_memory_slot * new,enum kvm_mr_change change)162007921665SSean Christopherson static int kvm_prepare_memory_region(struct kvm *kvm,
162107921665SSean Christopherson 				     const struct kvm_memory_slot *old,
162207921665SSean Christopherson 				     struct kvm_memory_slot *new,
162336947254SSean Christopherson 				     enum kvm_mr_change change)
162436947254SSean Christopherson {
1625cf47f50bSSean Christopherson 	int r;
1626cf47f50bSSean Christopherson 
1627b10a038eSBen Gardon 	/*
162807921665SSean Christopherson 	 * If dirty logging is disabled, nullify the bitmap; the old bitmap
162907921665SSean Christopherson 	 * will be freed on "commit".  If logging is enabled in both old and
163007921665SSean Christopherson 	 * new, reuse the existing bitmap.  If logging is enabled only in the
163107921665SSean Christopherson 	 * new and KVM isn't using a ring buffer, allocate and initialize a
163207921665SSean Christopherson 	 * new bitmap.
163307921665SSean Christopherson 	 */
1634244893faSSean Christopherson 	if (change != KVM_MR_DELETE) {
163507921665SSean Christopherson 		if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
163607921665SSean Christopherson 			new->dirty_bitmap = NULL;
1637244893faSSean Christopherson 		else if (old && old->dirty_bitmap)
163807921665SSean Christopherson 			new->dirty_bitmap = old->dirty_bitmap;
163986bdf3ebSGavin Shan 		else if (kvm_use_dirty_bitmap(kvm)) {
164007921665SSean Christopherson 			r = kvm_alloc_dirty_bitmap(new);
164107921665SSean Christopherson 			if (r)
164207921665SSean Christopherson 				return r;
164307921665SSean Christopherson 
164407921665SSean Christopherson 			if (kvm_dirty_log_manual_protect_and_init_set(kvm))
164507921665SSean Christopherson 				bitmap_set(new->dirty_bitmap, 0, new->npages);
164607921665SSean Christopherson 		}
1647244893faSSean Christopherson 	}
164807921665SSean Christopherson 
164907921665SSean Christopherson 	r = kvm_arch_prepare_memory_region(kvm, old, new, change);
165007921665SSean Christopherson 
165107921665SSean Christopherson 	/* Free the bitmap on failure if it was allocated above. */
1652c87661f8SSean Christopherson 	if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
165307921665SSean Christopherson 		kvm_destroy_dirty_bitmap(new);
165407921665SSean Christopherson 
165507921665SSean Christopherson 	return r;
165607921665SSean Christopherson }
165707921665SSean Christopherson 
kvm_commit_memory_region(struct kvm * kvm,struct kvm_memory_slot * old,const struct kvm_memory_slot * new,enum kvm_mr_change change)165807921665SSean Christopherson static void kvm_commit_memory_region(struct kvm *kvm,
165907921665SSean Christopherson 				     struct kvm_memory_slot *old,
166007921665SSean Christopherson 				     const struct kvm_memory_slot *new,
166107921665SSean Christopherson 				     enum kvm_mr_change change)
166207921665SSean Christopherson {
16636c7b2202SPaolo Bonzini 	int old_flags = old ? old->flags : 0;
16646c7b2202SPaolo Bonzini 	int new_flags = new ? new->flags : 0;
166507921665SSean Christopherson 	/*
166607921665SSean Christopherson 	 * Update the total number of memslot pages before calling the arch
166707921665SSean Christopherson 	 * hook so that architectures can consume the result directly.
166807921665SSean Christopherson 	 */
166907921665SSean Christopherson 	if (change == KVM_MR_DELETE)
167007921665SSean Christopherson 		kvm->nr_memslot_pages -= old->npages;
167107921665SSean Christopherson 	else if (change == KVM_MR_CREATE)
167207921665SSean Christopherson 		kvm->nr_memslot_pages += new->npages;
167307921665SSean Christopherson 
16746c7b2202SPaolo Bonzini 	if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
16756c7b2202SPaolo Bonzini 		int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;
16766c7b2202SPaolo Bonzini 		atomic_set(&kvm->nr_memslots_dirty_logging,
16776c7b2202SPaolo Bonzini 			   atomic_read(&kvm->nr_memslots_dirty_logging) + change);
16786c7b2202SPaolo Bonzini 	}
16796c7b2202SPaolo Bonzini 
168007921665SSean Christopherson 	kvm_arch_commit_memory_region(kvm, old, new, change);
168107921665SSean Christopherson 
1682a54d8066SMaciej S. Szmigiero 	switch (change) {
1683a54d8066SMaciej S. Szmigiero 	case KVM_MR_CREATE:
1684a54d8066SMaciej S. Szmigiero 		/* Nothing more to do. */
1685a54d8066SMaciej S. Szmigiero 		break;
1686a54d8066SMaciej S. Szmigiero 	case KVM_MR_DELETE:
1687a54d8066SMaciej S. Szmigiero 		/* Free the old memslot and all its metadata. */
168807921665SSean Christopherson 		kvm_free_memslot(kvm, old);
1689a54d8066SMaciej S. Szmigiero 		break;
1690a54d8066SMaciej S. Szmigiero 	case KVM_MR_MOVE:
1691a54d8066SMaciej S. Szmigiero 	case KVM_MR_FLAGS_ONLY:
1692a54d8066SMaciej S. Szmigiero 		/*
1693a54d8066SMaciej S. Szmigiero 		 * Free the dirty bitmap as needed; the below check encompasses
1694a54d8066SMaciej S. Szmigiero 		 * both the flags and whether a ring buffer is being used)
1695a54d8066SMaciej S. Szmigiero 		 */
1696a54d8066SMaciej S. Szmigiero 		if (old->dirty_bitmap && !new->dirty_bitmap)
169707921665SSean Christopherson 			kvm_destroy_dirty_bitmap(old);
1698a54d8066SMaciej S. Szmigiero 
1699a54d8066SMaciej S. Szmigiero 		/*
1700a54d8066SMaciej S. Szmigiero 		 * The final quirk.  Free the detached, old slot, but only its
1701a54d8066SMaciej S. Szmigiero 		 * memory, not any metadata.  Metadata, including arch specific
1702a54d8066SMaciej S. Szmigiero 		 * data, may be reused by @new.
1703a54d8066SMaciej S. Szmigiero 		 */
1704a54d8066SMaciej S. Szmigiero 		kfree(old);
1705a54d8066SMaciej S. Szmigiero 		break;
1706a54d8066SMaciej S. Szmigiero 	default:
1707a54d8066SMaciej S. Szmigiero 		BUG();
1708a54d8066SMaciej S. Szmigiero 	}
1709a54d8066SMaciej S. Szmigiero }
1710a54d8066SMaciej S. Szmigiero 
1711a54d8066SMaciej S. Szmigiero /*
1712a54d8066SMaciej S. Szmigiero  * Activate @new, which must be installed in the inactive slots by the caller,
1713a54d8066SMaciej S. Szmigiero  * by swapping the active slots and then propagating @new to @old once @old is
1714a54d8066SMaciej S. Szmigiero  * unreachable and can be safely modified.
1715a54d8066SMaciej S. Szmigiero  *
1716a54d8066SMaciej S. Szmigiero  * With NULL @old this simply adds @new to @active (while swapping the sets).
1717a54d8066SMaciej S. Szmigiero  * With NULL @new this simply removes @old from @active and frees it
1718a54d8066SMaciej S. Szmigiero  * (while also swapping the sets).
1719a54d8066SMaciej S. Szmigiero  */
kvm_activate_memslot(struct kvm * kvm,struct kvm_memory_slot * old,struct kvm_memory_slot * new)1720a54d8066SMaciej S. Szmigiero static void kvm_activate_memslot(struct kvm *kvm,
1721a54d8066SMaciej S. Szmigiero 				 struct kvm_memory_slot *old,
1722a54d8066SMaciej S. Szmigiero 				 struct kvm_memory_slot *new)
1723a54d8066SMaciej S. Szmigiero {
1724a54d8066SMaciej S. Szmigiero 	int as_id = kvm_memslots_get_as_id(old, new);
1725a54d8066SMaciej S. Szmigiero 
1726a54d8066SMaciej S. Szmigiero 	kvm_swap_active_memslots(kvm, as_id);
1727a54d8066SMaciej S. Szmigiero 
1728a54d8066SMaciej S. Szmigiero 	/* Propagate the new memslot to the now inactive memslots. */
1729a54d8066SMaciej S. Szmigiero 	kvm_replace_memslot(kvm, old, new);
1730a54d8066SMaciej S. Szmigiero }
1731a54d8066SMaciej S. Szmigiero 
kvm_copy_memslot(struct kvm_memory_slot * dest,const struct kvm_memory_slot * src)1732a54d8066SMaciej S. Szmigiero static void kvm_copy_memslot(struct kvm_memory_slot *dest,
1733a54d8066SMaciej S. Szmigiero 			     const struct kvm_memory_slot *src)
1734a54d8066SMaciej S. Szmigiero {
1735a54d8066SMaciej S. Szmigiero 	dest->base_gfn = src->base_gfn;
1736a54d8066SMaciej S. Szmigiero 	dest->npages = src->npages;
1737a54d8066SMaciej S. Szmigiero 	dest->dirty_bitmap = src->dirty_bitmap;
1738a54d8066SMaciej S. Szmigiero 	dest->arch = src->arch;
1739a54d8066SMaciej S. Szmigiero 	dest->userspace_addr = src->userspace_addr;
1740a54d8066SMaciej S. Szmigiero 	dest->flags = src->flags;
1741a54d8066SMaciej S. Szmigiero 	dest->id = src->id;
1742a54d8066SMaciej S. Szmigiero 	dest->as_id = src->as_id;
1743a54d8066SMaciej S. Szmigiero }
1744a54d8066SMaciej S. Szmigiero 
kvm_invalidate_memslot(struct kvm * kvm,struct kvm_memory_slot * old,struct kvm_memory_slot * invalid_slot)1745a54d8066SMaciej S. Szmigiero static void kvm_invalidate_memslot(struct kvm *kvm,
1746a54d8066SMaciej S. Szmigiero 				   struct kvm_memory_slot *old,
1747244893faSSean Christopherson 				   struct kvm_memory_slot *invalid_slot)
1748a54d8066SMaciej S. Szmigiero {
1749a54d8066SMaciej S. Szmigiero 	/*
1750a54d8066SMaciej S. Szmigiero 	 * Mark the current slot INVALID.  As with all memslot modifications,
1751a54d8066SMaciej S. Szmigiero 	 * this must be done on an unreachable slot to avoid modifying the
1752a54d8066SMaciej S. Szmigiero 	 * current slot in the active tree.
1753a54d8066SMaciej S. Szmigiero 	 */
1754244893faSSean Christopherson 	kvm_copy_memslot(invalid_slot, old);
1755244893faSSean Christopherson 	invalid_slot->flags |= KVM_MEMSLOT_INVALID;
1756244893faSSean Christopherson 	kvm_replace_memslot(kvm, old, invalid_slot);
1757a54d8066SMaciej S. Szmigiero 
1758a54d8066SMaciej S. Szmigiero 	/*
1759a54d8066SMaciej S. Szmigiero 	 * Activate the slot that is now marked INVALID, but don't propagate
1760a54d8066SMaciej S. Szmigiero 	 * the slot to the now inactive slots. The slot is either going to be
1761a54d8066SMaciej S. Szmigiero 	 * deleted or recreated as a new slot.
1762a54d8066SMaciej S. Szmigiero 	 */
1763a54d8066SMaciej S. Szmigiero 	kvm_swap_active_memslots(kvm, old->as_id);
1764a54d8066SMaciej S. Szmigiero 
1765a54d8066SMaciej S. Szmigiero 	/*
1766a54d8066SMaciej S. Szmigiero 	 * From this point no new shadow pages pointing to a deleted, or moved,
1767a54d8066SMaciej S. Szmigiero 	 * memslot will be created.  Validation of sp->gfn happens in:
1768a54d8066SMaciej S. Szmigiero 	 *	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1769a54d8066SMaciej S. Szmigiero 	 *	- kvm_is_visible_gfn (mmu_check_root)
1770a54d8066SMaciej S. Szmigiero 	 */
1771bcb63dcdSMaciej S. Szmigiero 	kvm_arch_flush_shadow_memslot(kvm, old);
1772683412ccSMingwei Zhang 	kvm_arch_guest_memory_reclaimed(kvm);
1773a54d8066SMaciej S. Szmigiero 
1774b0d23708SJun Miao 	/* Was released by kvm_swap_active_memslots(), reacquire. */
1775a54d8066SMaciej S. Szmigiero 	mutex_lock(&kvm->slots_arch_lock);
1776a54d8066SMaciej S. Szmigiero 
1777a54d8066SMaciej S. Szmigiero 	/*
1778a54d8066SMaciej S. Szmigiero 	 * Copy the arch-specific field of the newly-installed slot back to the
1779a54d8066SMaciej S. Szmigiero 	 * old slot as the arch data could have changed between releasing
1780b0d23708SJun Miao 	 * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
1781a54d8066SMaciej S. Szmigiero 	 * above.  Writers are required to retrieve memslots *after* acquiring
1782a54d8066SMaciej S. Szmigiero 	 * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
1783a54d8066SMaciej S. Szmigiero 	 */
1784244893faSSean Christopherson 	old->arch = invalid_slot->arch;
1785a54d8066SMaciej S. Szmigiero }
1786a54d8066SMaciej S. Szmigiero 
kvm_create_memslot(struct kvm * kvm,struct kvm_memory_slot * new)1787a54d8066SMaciej S. Szmigiero static void kvm_create_memslot(struct kvm *kvm,
1788244893faSSean Christopherson 			       struct kvm_memory_slot *new)
1789a54d8066SMaciej S. Szmigiero {
1790244893faSSean Christopherson 	/* Add the new memslot to the inactive set and activate. */
1791244893faSSean Christopherson 	kvm_replace_memslot(kvm, NULL, new);
1792244893faSSean Christopherson 	kvm_activate_memslot(kvm, NULL, new);
1793a54d8066SMaciej S. Szmigiero }
1794a54d8066SMaciej S. Szmigiero 
kvm_delete_memslot(struct kvm * kvm,struct kvm_memory_slot * old,struct kvm_memory_slot * invalid_slot)1795a54d8066SMaciej S. Szmigiero static void kvm_delete_memslot(struct kvm *kvm,
1796a54d8066SMaciej S. Szmigiero 			       struct kvm_memory_slot *old,
1797a54d8066SMaciej S. Szmigiero 			       struct kvm_memory_slot *invalid_slot)
1798a54d8066SMaciej S. Szmigiero {
1799a54d8066SMaciej S. Szmigiero 	/*
1800a54d8066SMaciej S. Szmigiero 	 * Remove the old memslot (in the inactive memslots) by passing NULL as
1801244893faSSean Christopherson 	 * the "new" slot, and for the invalid version in the active slots.
1802a54d8066SMaciej S. Szmigiero 	 */
1803a54d8066SMaciej S. Szmigiero 	kvm_replace_memslot(kvm, old, NULL);
1804a54d8066SMaciej S. Szmigiero 	kvm_activate_memslot(kvm, invalid_slot, NULL);
1805a54d8066SMaciej S. Szmigiero }
1806a54d8066SMaciej S. Szmigiero 
kvm_move_memslot(struct kvm * kvm,struct kvm_memory_slot * old,struct kvm_memory_slot * new,struct kvm_memory_slot * invalid_slot)1807244893faSSean Christopherson static void kvm_move_memslot(struct kvm *kvm,
1808a54d8066SMaciej S. Szmigiero 			     struct kvm_memory_slot *old,
1809244893faSSean Christopherson 			     struct kvm_memory_slot *new,
1810a54d8066SMaciej S. Szmigiero 			     struct kvm_memory_slot *invalid_slot)
1811a54d8066SMaciej S. Szmigiero {
1812a54d8066SMaciej S. Szmigiero 	/*
1813244893faSSean Christopherson 	 * Replace the old memslot in the inactive slots, and then swap slots
1814244893faSSean Christopherson 	 * and replace the current INVALID with the new as well.
1815a54d8066SMaciej S. Szmigiero 	 */
1816244893faSSean Christopherson 	kvm_replace_memslot(kvm, old, new);
1817244893faSSean Christopherson 	kvm_activate_memslot(kvm, invalid_slot, new);
1818a54d8066SMaciej S. Szmigiero }
1819a54d8066SMaciej S. Szmigiero 
kvm_update_flags_memslot(struct kvm * kvm,struct kvm_memory_slot * old,struct kvm_memory_slot * new)1820a54d8066SMaciej S. Szmigiero static void kvm_update_flags_memslot(struct kvm *kvm,
1821a54d8066SMaciej S. Szmigiero 				     struct kvm_memory_slot *old,
1822244893faSSean Christopherson 				     struct kvm_memory_slot *new)
1823a54d8066SMaciej S. Szmigiero {
1824a54d8066SMaciej S. Szmigiero 	/*
1825a54d8066SMaciej S. Szmigiero 	 * Similar to the MOVE case, but the slot doesn't need to be zapped as
1826a54d8066SMaciej S. Szmigiero 	 * an intermediate step. Instead, the old memslot is simply replaced
1827a54d8066SMaciej S. Szmigiero 	 * with a new, updated copy in both memslot sets.
1828a54d8066SMaciej S. Szmigiero 	 */
1829244893faSSean Christopherson 	kvm_replace_memslot(kvm, old, new);
1830244893faSSean Christopherson 	kvm_activate_memslot(kvm, old, new);
183107921665SSean Christopherson }
183207921665SSean Christopherson 
kvm_set_memslot(struct kvm * kvm,struct kvm_memory_slot * old,struct kvm_memory_slot * new,enum kvm_mr_change change)1833cf47f50bSSean Christopherson static int kvm_set_memslot(struct kvm *kvm,
1834a54d8066SMaciej S. Szmigiero 			   struct kvm_memory_slot *old,
1835ce5f0215SSean Christopherson 			   struct kvm_memory_slot *new,
1836cf47f50bSSean Christopherson 			   enum kvm_mr_change change)
1837cf47f50bSSean Christopherson {
1838244893faSSean Christopherson 	struct kvm_memory_slot *invalid_slot;
1839cf47f50bSSean Christopherson 	int r;
1840cf47f50bSSean Christopherson 
1841b10a038eSBen Gardon 	/*
1842b0d23708SJun Miao 	 * Released in kvm_swap_active_memslots().
1843b10a038eSBen Gardon 	 *
1844b0d23708SJun Miao 	 * Must be held from before the current memslots are copied until after
1845b0d23708SJun Miao 	 * the new memslots are installed with rcu_assign_pointer, then
1846b0d23708SJun Miao 	 * released before the synchronize srcu in kvm_swap_active_memslots().
1847b10a038eSBen Gardon 	 *
1848b10a038eSBen Gardon 	 * When modifying memslots outside of the slots_lock, must be held
1849b10a038eSBen Gardon 	 * before reading the pointer to the current memslots until after all
1850b10a038eSBen Gardon 	 * changes to those memslots are complete.
1851b10a038eSBen Gardon 	 *
1852b10a038eSBen Gardon 	 * These rules ensure that installing new memslots does not lose
1853b10a038eSBen Gardon 	 * changes made to the previous memslots.
1854b10a038eSBen Gardon 	 */
1855b10a038eSBen Gardon 	mutex_lock(&kvm->slots_arch_lock);
1856b10a038eSBen Gardon 
1857cf47f50bSSean Christopherson 	/*
1858a54d8066SMaciej S. Szmigiero 	 * Invalidate the old slot if it's being deleted or moved.  This is
1859a54d8066SMaciej S. Szmigiero 	 * done prior to actually deleting/moving the memslot to allow vCPUs to
1860a54d8066SMaciej S. Szmigiero 	 * continue running by ensuring there are no mappings or shadow pages
1861a54d8066SMaciej S. Szmigiero 	 * for the memslot when it is deleted/moved.  Without pre-invalidation
1862a54d8066SMaciej S. Szmigiero 	 * (and without a lock), a window would exist between effecting the
1863a54d8066SMaciej S. Szmigiero 	 * delete/move and committing the changes in arch code where KVM or a
1864a54d8066SMaciej S. Szmigiero 	 * guest could access a non-existent memslot.
1865244893faSSean Christopherson 	 *
1866244893faSSean Christopherson 	 * Modifications are done on a temporary, unreachable slot.  The old
1867244893faSSean Christopherson 	 * slot needs to be preserved in case a later step fails and the
1868244893faSSean Christopherson 	 * invalidation needs to be reverted.
1869cf47f50bSSean Christopherson 	 */
1870244893faSSean Christopherson 	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1871244893faSSean Christopherson 		invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
1872244893faSSean Christopherson 		if (!invalid_slot) {
1873cf47f50bSSean Christopherson 			mutex_unlock(&kvm->slots_arch_lock);
1874cf47f50bSSean Christopherson 			return -ENOMEM;
1875cf47f50bSSean Christopherson 		}
1876244893faSSean Christopherson 		kvm_invalidate_memslot(kvm, old, invalid_slot);
1877cf47f50bSSean Christopherson 	}
1878cf47f50bSSean Christopherson 
1879a54d8066SMaciej S. Szmigiero 	r = kvm_prepare_memory_region(kvm, old, new, change);
1880a54d8066SMaciej S. Szmigiero 	if (r) {
1881bda44d84SSean Christopherson 		/*
1882a54d8066SMaciej S. Szmigiero 		 * For DELETE/MOVE, revert the above INVALID change.  No
1883a54d8066SMaciej S. Szmigiero 		 * modifications required since the original slot was preserved
1884a54d8066SMaciej S. Szmigiero 		 * in the inactive slots.  Changing the active memslots also
1885a54d8066SMaciej S. Szmigiero 		 * release slots_arch_lock.
1886bda44d84SSean Christopherson 		 */
1887b10a038eSBen Gardon 		if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1888244893faSSean Christopherson 			kvm_activate_memslot(kvm, invalid_slot, old);
1889244893faSSean Christopherson 			kfree(invalid_slot);
1890b10a038eSBen Gardon 		} else {
1891b10a038eSBen Gardon 			mutex_unlock(&kvm->slots_arch_lock);
1892b10a038eSBen Gardon 		}
1893cf47f50bSSean Christopherson 		return r;
1894cf47f50bSSean Christopherson 	}
1895cf47f50bSSean Christopherson 
18969e9eb226SPeter Xu 	/*
1897a54d8066SMaciej S. Szmigiero 	 * For DELETE and MOVE, the working slot is now active as the INVALID
1898a54d8066SMaciej S. Szmigiero 	 * version of the old slot.  MOVE is particularly special as it reuses
1899a54d8066SMaciej S. Szmigiero 	 * the old slot and returns a copy of the old slot (in working_slot).
1900a54d8066SMaciej S. Szmigiero 	 * For CREATE, there is no old slot.  For DELETE and FLAGS_ONLY, the
1901a54d8066SMaciej S. Szmigiero 	 * old slot is detached but otherwise preserved.
19029e9eb226SPeter Xu 	 */
1903a54d8066SMaciej S. Szmigiero 	if (change == KVM_MR_CREATE)
1904244893faSSean Christopherson 		kvm_create_memslot(kvm, new);
1905a54d8066SMaciej S. Szmigiero 	else if (change == KVM_MR_DELETE)
1906244893faSSean Christopherson 		kvm_delete_memslot(kvm, old, invalid_slot);
1907a54d8066SMaciej S. Szmigiero 	else if (change == KVM_MR_MOVE)
1908244893faSSean Christopherson 		kvm_move_memslot(kvm, old, new, invalid_slot);
1909a54d8066SMaciej S. Szmigiero 	else if (change == KVM_MR_FLAGS_ONLY)
1910244893faSSean Christopherson 		kvm_update_flags_memslot(kvm, old, new);
1911a54d8066SMaciej S. Szmigiero 	else
1912a54d8066SMaciej S. Szmigiero 		BUG();
19135c0b4f3dSSean Christopherson 
1914244893faSSean Christopherson 	/* Free the temporary INVALID slot used for DELETE and MOVE. */
1915244893faSSean Christopherson 	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1916244893faSSean Christopherson 		kfree(invalid_slot);
1917244893faSSean Christopherson 
1918a54d8066SMaciej S. Szmigiero 	/*
1919a54d8066SMaciej S. Szmigiero 	 * No need to refresh new->arch, changes after dropping slots_arch_lock
1920a413a625STom Rix 	 * will directly hit the final, active memslot.  Architectures are
1921a54d8066SMaciej S. Szmigiero 	 * responsible for knowing that new->arch may be stale.
1922a54d8066SMaciej S. Szmigiero 	 */
1923a54d8066SMaciej S. Szmigiero 	kvm_commit_memory_region(kvm, old, new, change);
1924a54d8066SMaciej S. Szmigiero 
1925a54d8066SMaciej S. Szmigiero 	return 0;
1926a54d8066SMaciej S. Szmigiero }
1927a54d8066SMaciej S. Szmigiero 
kvm_check_memslot_overlap(struct kvm_memslots * slots,int id,gfn_t start,gfn_t end)192844401a20SMaciej S. Szmigiero static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
192944401a20SMaciej S. Szmigiero 				      gfn_t start, gfn_t end)
193044401a20SMaciej S. Szmigiero {
193144401a20SMaciej S. Szmigiero 	struct kvm_memslot_iter iter;
193244401a20SMaciej S. Szmigiero 
193344401a20SMaciej S. Szmigiero 	kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
193444401a20SMaciej S. Szmigiero 		if (iter.slot->id != id)
193544401a20SMaciej S. Szmigiero 			return true;
193644401a20SMaciej S. Szmigiero 	}
193744401a20SMaciej S. Szmigiero 
193844401a20SMaciej S. Szmigiero 	return false;
19395c0b4f3dSSean Christopherson }
19405c0b4f3dSSean Christopherson 
19410fce5623SAvi Kivity /*
19420fce5623SAvi Kivity  * Allocate some memory and give it an address in the guest physical address
19430fce5623SAvi Kivity  * space.
19440fce5623SAvi Kivity  *
19450fce5623SAvi Kivity  * Discontiguous memory is allowed, mostly for framebuffers.
19460fce5623SAvi Kivity  *
194702d5d55bSDominik Dingel  * Must be called holding kvm->slots_lock for write.
19480fce5623SAvi Kivity  */
__kvm_set_memory_region(struct kvm * kvm,const struct kvm_userspace_memory_region * mem)19490fce5623SAvi Kivity int __kvm_set_memory_region(struct kvm *kvm,
195009170a49SPaolo Bonzini 			    const struct kvm_userspace_memory_region *mem)
19510fce5623SAvi Kivity {
1952244893faSSean Christopherson 	struct kvm_memory_slot *old, *new;
195344401a20SMaciej S. Szmigiero 	struct kvm_memslots *slots;
1954f64c0398STakuya Yoshikawa 	enum kvm_mr_change change;
19550f9bdef3SSean Christopherson 	unsigned long npages;
19560f9bdef3SSean Christopherson 	gfn_t base_gfn;
1957163da372SSean Christopherson 	int as_id, id;
1958163da372SSean Christopherson 	int r;
19590fce5623SAvi Kivity 
1960a50d64d6SXiao Guangrong 	r = check_memory_region_flags(mem);
1961a50d64d6SXiao Guangrong 	if (r)
196271a4c30bSSean Christopherson 		return r;
1963a50d64d6SXiao Guangrong 
1964f481b069SPaolo Bonzini 	as_id = mem->slot >> 16;
1965f481b069SPaolo Bonzini 	id = (u16)mem->slot;
1966f481b069SPaolo Bonzini 
19670fce5623SAvi Kivity 	/* General sanity checks */
19686b285a55SSean Christopherson 	if ((mem->memory_size & (PAGE_SIZE - 1)) ||
19696b285a55SSean Christopherson 	    (mem->memory_size != (unsigned long)mem->memory_size))
197071a4c30bSSean Christopherson 		return -EINVAL;
19710fce5623SAvi Kivity 	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
197271a4c30bSSean Christopherson 		return -EINVAL;
1973fa3d315aSTakuya Yoshikawa 	/* We can read the guest memory with __xxx_user() later on. */
197409d952c9SPaolo Bonzini 	if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1975139bc8a6SMarc Zyngier 	    (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
197696d4f267SLinus Torvalds 	     !access_ok((void __user *)(unsigned long)mem->userspace_addr,
197709d952c9SPaolo Bonzini 			mem->memory_size))
197871a4c30bSSean Christopherson 		return -EINVAL;
1979f481b069SPaolo Bonzini 	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
198071a4c30bSSean Christopherson 		return -EINVAL;
19810fce5623SAvi Kivity 	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
198271a4c30bSSean Christopherson 		return -EINVAL;
19830f9bdef3SSean Christopherson 	if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
19840f9bdef3SSean Christopherson 		return -EINVAL;
19850fce5623SAvi Kivity 
198644401a20SMaciej S. Szmigiero 	slots = __kvm_memslots(kvm, as_id);
19870fce5623SAvi Kivity 
19885c0b4f3dSSean Christopherson 	/*
19897cd08553SSean Christopherson 	 * Note, the old memslot (and the pointer itself!) may be invalidated
19907cd08553SSean Christopherson 	 * and/or destroyed by kvm_set_memslot().
19915c0b4f3dSSean Christopherson 	 */
199244401a20SMaciej S. Szmigiero 	old = id_to_memslot(slots, id);
1993163da372SSean Christopherson 
199447ea7d90SSean Christopherson 	if (!mem->memory_size) {
19957cd08553SSean Christopherson 		if (!old || !old->npages)
199647ea7d90SSean Christopherson 			return -EINVAL;
199747ea7d90SSean Christopherson 
19987cd08553SSean Christopherson 		if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
199947ea7d90SSean Christopherson 			return -EIO;
200047ea7d90SSean Christopherson 
2001244893faSSean Christopherson 		return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
20025c0b4f3dSSean Christopherson 	}
20035c0b4f3dSSean Christopherson 
20040f9bdef3SSean Christopherson 	base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
20050f9bdef3SSean Christopherson 	npages = (mem->memory_size >> PAGE_SHIFT);
20065c0b4f3dSSean Christopherson 
20077cd08553SSean Christopherson 	if (!old || !old->npages) {
2008f64c0398STakuya Yoshikawa 		change = KVM_MR_CREATE;
2009afa319a5SSean Christopherson 
2010afa319a5SSean Christopherson 		/*
2011afa319a5SSean Christopherson 		 * To simplify KVM internals, the total number of pages across
2012afa319a5SSean Christopherson 		 * all memslots must fit in an unsigned long.
2013afa319a5SSean Christopherson 		 */
20140f9bdef3SSean Christopherson 		if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
2015afa319a5SSean Christopherson 			return -EINVAL;
20165c0b4f3dSSean Christopherson 	} else { /* Modify an existing slot. */
20170f9bdef3SSean Christopherson 		if ((mem->userspace_addr != old->userspace_addr) ||
20180f9bdef3SSean Christopherson 		    (npages != old->npages) ||
20190f9bdef3SSean Christopherson 		    ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
202071a4c30bSSean Christopherson 			return -EINVAL;
20210fce5623SAvi Kivity 
20220f9bdef3SSean Christopherson 		if (base_gfn != old->base_gfn)
2023f64c0398STakuya Yoshikawa 			change = KVM_MR_MOVE;
20240f9bdef3SSean Christopherson 		else if (mem->flags != old->flags)
2025f64c0398STakuya Yoshikawa 			change = KVM_MR_FLAGS_ONLY;
202671a4c30bSSean Christopherson 		else /* Nothing to change. */
202771a4c30bSSean Christopherson 			return 0;
2028f64c0398STakuya Yoshikawa 	}
202909170a49SPaolo Bonzini 
203044401a20SMaciej S. Szmigiero 	if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
20310f9bdef3SSean Christopherson 	    kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
203271a4c30bSSean Christopherson 		return -EEXIST;
20330fce5623SAvi Kivity 
2034244893faSSean Christopherson 	/* Allocate a slot that will persist in the memslot. */
2035244893faSSean Christopherson 	new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
2036244893faSSean Christopherson 	if (!new)
2037244893faSSean Christopherson 		return -ENOMEM;
20380f9bdef3SSean Christopherson 
2039244893faSSean Christopherson 	new->as_id = as_id;
2040244893faSSean Christopherson 	new->id = id;
2041244893faSSean Christopherson 	new->base_gfn = base_gfn;
2042244893faSSean Christopherson 	new->npages = npages;
2043244893faSSean Christopherson 	new->flags = mem->flags;
2044244893faSSean Christopherson 	new->userspace_addr = mem->userspace_addr;
2045244893faSSean Christopherson 
2046244893faSSean Christopherson 	r = kvm_set_memslot(kvm, old, new, change);
204771a4c30bSSean Christopherson 	if (r)
2048244893faSSean Christopherson 		kfree(new);
20490fce5623SAvi Kivity 	return r;
20500fce5623SAvi Kivity }
20510fce5623SAvi Kivity EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
20520fce5623SAvi Kivity 
kvm_set_memory_region(struct kvm * kvm,const struct kvm_userspace_memory_region * mem)20530fce5623SAvi Kivity int kvm_set_memory_region(struct kvm *kvm,
205409170a49SPaolo Bonzini 			  const struct kvm_userspace_memory_region *mem)
20550fce5623SAvi Kivity {
20560fce5623SAvi Kivity 	int r;
20570fce5623SAvi Kivity 
205879fac95eSMarcelo Tosatti 	mutex_lock(&kvm->slots_lock);
205947ae31e2STakuya Yoshikawa 	r = __kvm_set_memory_region(kvm, mem);
206079fac95eSMarcelo Tosatti 	mutex_unlock(&kvm->slots_lock);
20610fce5623SAvi Kivity 	return r;
20620fce5623SAvi Kivity }
20630fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_set_memory_region);
20640fce5623SAvi Kivity 
kvm_vm_ioctl_set_memory_region(struct kvm * kvm,struct kvm_userspace_memory_region * mem)20657940876eSStephen Hemminger static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
206647ae31e2STakuya Yoshikawa 					  struct kvm_userspace_memory_region *mem)
20670fce5623SAvi Kivity {
2068f481b069SPaolo Bonzini 	if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
20690fce5623SAvi Kivity 		return -EINVAL;
207009170a49SPaolo Bonzini 
207147ae31e2STakuya Yoshikawa 	return kvm_set_memory_region(kvm, mem);
20720fce5623SAvi Kivity }
20730fce5623SAvi Kivity 
20740dff0846SSean Christopherson #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
20752a49f61dSSean Christopherson /**
20762a49f61dSSean Christopherson  * kvm_get_dirty_log - get a snapshot of dirty pages
20772a49f61dSSean Christopherson  * @kvm:	pointer to kvm instance
20782a49f61dSSean Christopherson  * @log:	slot id and address to which we copy the log
20792a49f61dSSean Christopherson  * @is_dirty:	set to '1' if any dirty pages were found
20802a49f61dSSean Christopherson  * @memslot:	set to the associated memslot, always valid on success
20812a49f61dSSean Christopherson  */
kvm_get_dirty_log(struct kvm * kvm,struct kvm_dirty_log * log,int * is_dirty,struct kvm_memory_slot ** memslot)20822a49f61dSSean Christopherson int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
20832a49f61dSSean Christopherson 		      int *is_dirty, struct kvm_memory_slot **memslot)
20840fce5623SAvi Kivity {
20859f6b8029SPaolo Bonzini 	struct kvm_memslots *slots;
2086843574a3SMarkus Elfring 	int i, as_id, id;
208787bf6e7dSTakuya Yoshikawa 	unsigned long n;
20880fce5623SAvi Kivity 	unsigned long any = 0;
20890fce5623SAvi Kivity 
209086bdf3ebSGavin Shan 	/* Dirty ring tracking may be exclusive to dirty log tracking */
209186bdf3ebSGavin Shan 	if (!kvm_use_dirty_bitmap(kvm))
2092b2cc64c4SPeter Xu 		return -ENXIO;
2093b2cc64c4SPeter Xu 
20942a49f61dSSean Christopherson 	*memslot = NULL;
20952a49f61dSSean Christopherson 	*is_dirty = 0;
20962a49f61dSSean Christopherson 
2097f481b069SPaolo Bonzini 	as_id = log->slot >> 16;
2098f481b069SPaolo Bonzini 	id = (u16)log->slot;
2099f481b069SPaolo Bonzini 	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2100843574a3SMarkus Elfring 		return -EINVAL;
21010fce5623SAvi Kivity 
2102f481b069SPaolo Bonzini 	slots = __kvm_memslots(kvm, as_id);
21032a49f61dSSean Christopherson 	*memslot = id_to_memslot(slots, id);
21040577d1abSSean Christopherson 	if (!(*memslot) || !(*memslot)->dirty_bitmap)
2105843574a3SMarkus Elfring 		return -ENOENT;
21060fce5623SAvi Kivity 
21072a49f61dSSean Christopherson 	kvm_arch_sync_dirty_log(kvm, *memslot);
21082a49f61dSSean Christopherson 
21092a49f61dSSean Christopherson 	n = kvm_dirty_bitmap_bytes(*memslot);
21100fce5623SAvi Kivity 
21110fce5623SAvi Kivity 	for (i = 0; !any && i < n/sizeof(long); ++i)
21122a49f61dSSean Christopherson 		any = (*memslot)->dirty_bitmap[i];
21130fce5623SAvi Kivity 
21142a49f61dSSean Christopherson 	if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
2115843574a3SMarkus Elfring 		return -EFAULT;
21160fce5623SAvi Kivity 
21170fce5623SAvi Kivity 	if (any)
21180fce5623SAvi Kivity 		*is_dirty = 1;
2119843574a3SMarkus Elfring 	return 0;
21200fce5623SAvi Kivity }
21212ba9f0d8SAneesh Kumar K.V EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
21220fce5623SAvi Kivity 
21230dff0846SSean Christopherson #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2124ba0513b5SMario Smarduch /**
2125b8b00220SJiang Biao  * kvm_get_dirty_log_protect - get a snapshot of dirty pages
21262a31b9dbSPaolo Bonzini  *	and reenable dirty page tracking for the corresponding pages.
2127ba0513b5SMario Smarduch  * @kvm:	pointer to kvm instance
2128ba0513b5SMario Smarduch  * @log:	slot id and address to which we copy the log
2129ba0513b5SMario Smarduch  *
2130ba0513b5SMario Smarduch  * We need to keep it in mind that VCPU threads can write to the bitmap
2131ba0513b5SMario Smarduch  * concurrently. So, to avoid losing track of dirty pages we keep the
2132ba0513b5SMario Smarduch  * following order:
2133ba0513b5SMario Smarduch  *
2134ba0513b5SMario Smarduch  *    1. Take a snapshot of the bit and clear it if needed.
2135ba0513b5SMario Smarduch  *    2. Write protect the corresponding page.
2136ba0513b5SMario Smarduch  *    3. Copy the snapshot to the userspace.
2137ba0513b5SMario Smarduch  *    4. Upon return caller flushes TLB's if needed.
2138ba0513b5SMario Smarduch  *
2139ba0513b5SMario Smarduch  * Between 2 and 4, the guest may write to the page using the remaining TLB
2140ba0513b5SMario Smarduch  * entry.  This is not a problem because the page is reported dirty using
2141ba0513b5SMario Smarduch  * the snapshot taken before and step 4 ensures that writes done after
2142ba0513b5SMario Smarduch  * exiting to userspace will be logged for the next call.
2143ba0513b5SMario Smarduch  *
2144ba0513b5SMario Smarduch  */
kvm_get_dirty_log_protect(struct kvm * kvm,struct kvm_dirty_log * log)21450dff0846SSean Christopherson static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
2146ba0513b5SMario Smarduch {
21479f6b8029SPaolo Bonzini 	struct kvm_memslots *slots;
2148ba0513b5SMario Smarduch 	struct kvm_memory_slot *memslot;
214958d6db34SMarkus Elfring 	int i, as_id, id;
2150ba0513b5SMario Smarduch 	unsigned long n;
2151ba0513b5SMario Smarduch 	unsigned long *dirty_bitmap;
2152ba0513b5SMario Smarduch 	unsigned long *dirty_bitmap_buffer;
21530dff0846SSean Christopherson 	bool flush;
2154ba0513b5SMario Smarduch 
215586bdf3ebSGavin Shan 	/* Dirty ring tracking may be exclusive to dirty log tracking */
215686bdf3ebSGavin Shan 	if (!kvm_use_dirty_bitmap(kvm))
2157b2cc64c4SPeter Xu 		return -ENXIO;
2158b2cc64c4SPeter Xu 
2159f481b069SPaolo Bonzini 	as_id = log->slot >> 16;
2160f481b069SPaolo Bonzini 	id = (u16)log->slot;
2161f481b069SPaolo Bonzini 	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
216258d6db34SMarkus Elfring 		return -EINVAL;
2163ba0513b5SMario Smarduch 
2164f481b069SPaolo Bonzini 	slots = __kvm_memslots(kvm, as_id);
2165f481b069SPaolo Bonzini 	memslot = id_to_memslot(slots, id);
21660577d1abSSean Christopherson 	if (!memslot || !memslot->dirty_bitmap)
21670577d1abSSean Christopherson 		return -ENOENT;
2168ba0513b5SMario Smarduch 
2169ba0513b5SMario Smarduch 	dirty_bitmap = memslot->dirty_bitmap;
2170ba0513b5SMario Smarduch 
21710dff0846SSean Christopherson 	kvm_arch_sync_dirty_log(kvm, memslot);
21720dff0846SSean Christopherson 
2173ba0513b5SMario Smarduch 	n = kvm_dirty_bitmap_bytes(memslot);
21740dff0846SSean Christopherson 	flush = false;
21752a31b9dbSPaolo Bonzini 	if (kvm->manual_dirty_log_protect) {
21762a31b9dbSPaolo Bonzini 		/*
21772a31b9dbSPaolo Bonzini 		 * Unlike kvm_get_dirty_log, we always return false in *flush,
21782a31b9dbSPaolo Bonzini 		 * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
21792a31b9dbSPaolo Bonzini 		 * is some code duplication between this function and
21802a31b9dbSPaolo Bonzini 		 * kvm_get_dirty_log, but hopefully all architecture
21812a31b9dbSPaolo Bonzini 		 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
21822a31b9dbSPaolo Bonzini 		 * can be eliminated.
21832a31b9dbSPaolo Bonzini 		 */
21842a31b9dbSPaolo Bonzini 		dirty_bitmap_buffer = dirty_bitmap;
21852a31b9dbSPaolo Bonzini 	} else {
218603133347SClaudio Imbrenda 		dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2187ba0513b5SMario Smarduch 		memset(dirty_bitmap_buffer, 0, n);
2188ba0513b5SMario Smarduch 
2189531810caSBen Gardon 		KVM_MMU_LOCK(kvm);
2190ba0513b5SMario Smarduch 		for (i = 0; i < n / sizeof(long); i++) {
2191ba0513b5SMario Smarduch 			unsigned long mask;
2192ba0513b5SMario Smarduch 			gfn_t offset;
2193ba0513b5SMario Smarduch 
2194ba0513b5SMario Smarduch 			if (!dirty_bitmap[i])
2195ba0513b5SMario Smarduch 				continue;
2196ba0513b5SMario Smarduch 
21970dff0846SSean Christopherson 			flush = true;
2198ba0513b5SMario Smarduch 			mask = xchg(&dirty_bitmap[i], 0);
2199ba0513b5SMario Smarduch 			dirty_bitmap_buffer[i] = mask;
2200ba0513b5SMario Smarduch 
2201ba0513b5SMario Smarduch 			offset = i * BITS_PER_LONG;
220258d2930fSTakuya Yoshikawa 			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
220358d2930fSTakuya Yoshikawa 								offset, mask);
220458d2930fSTakuya Yoshikawa 		}
2205531810caSBen Gardon 		KVM_MMU_UNLOCK(kvm);
22062a31b9dbSPaolo Bonzini 	}
22072a31b9dbSPaolo Bonzini 
22080dff0846SSean Christopherson 	if (flush)
2209619b5072SDavid Matlack 		kvm_flush_remote_tlbs_memslot(kvm, memslot);
22100dff0846SSean Christopherson 
2211ba0513b5SMario Smarduch 	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
221258d6db34SMarkus Elfring 		return -EFAULT;
221358d6db34SMarkus Elfring 	return 0;
2214ba0513b5SMario Smarduch }
22150dff0846SSean Christopherson 
22160dff0846SSean Christopherson 
22170dff0846SSean Christopherson /**
22180dff0846SSean Christopherson  * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
22190dff0846SSean Christopherson  * @kvm: kvm instance
22200dff0846SSean Christopherson  * @log: slot id and address to which we copy the log
22210dff0846SSean Christopherson  *
22220dff0846SSean Christopherson  * Steps 1-4 below provide general overview of dirty page logging. See
22230dff0846SSean Christopherson  * kvm_get_dirty_log_protect() function description for additional details.
22240dff0846SSean Christopherson  *
22250dff0846SSean Christopherson  * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
22260dff0846SSean Christopherson  * always flush the TLB (step 4) even if previous step failed  and the dirty
22270dff0846SSean Christopherson  * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
22280dff0846SSean Christopherson  * does not preclude user space subsequent dirty log read. Flushing TLB ensures
22290dff0846SSean Christopherson  * writes will be marked dirty for next log read.
22300dff0846SSean Christopherson  *
22310dff0846SSean Christopherson  *   1. Take a snapshot of the bit and clear it if needed.
22320dff0846SSean Christopherson  *   2. Write protect the corresponding page.
22330dff0846SSean Christopherson  *   3. Copy the snapshot to the userspace.
22340dff0846SSean Christopherson  *   4. Flush TLB's if needed.
22350dff0846SSean Christopherson  */
kvm_vm_ioctl_get_dirty_log(struct kvm * kvm,struct kvm_dirty_log * log)22360dff0846SSean Christopherson static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
22370dff0846SSean Christopherson 				      struct kvm_dirty_log *log)
22380dff0846SSean Christopherson {
22390dff0846SSean Christopherson 	int r;
22400dff0846SSean Christopherson 
22410dff0846SSean Christopherson 	mutex_lock(&kvm->slots_lock);
22420dff0846SSean Christopherson 
22430dff0846SSean Christopherson 	r = kvm_get_dirty_log_protect(kvm, log);
22440dff0846SSean Christopherson 
22450dff0846SSean Christopherson 	mutex_unlock(&kvm->slots_lock);
22460dff0846SSean Christopherson 	return r;
22470dff0846SSean Christopherson }
22482a31b9dbSPaolo Bonzini 
22492a31b9dbSPaolo Bonzini /**
22502a31b9dbSPaolo Bonzini  * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
22512a31b9dbSPaolo Bonzini  *	and reenable dirty page tracking for the corresponding pages.
22522a31b9dbSPaolo Bonzini  * @kvm:	pointer to kvm instance
22532a31b9dbSPaolo Bonzini  * @log:	slot id and address from which to fetch the bitmap of dirty pages
22542a31b9dbSPaolo Bonzini  */
kvm_clear_dirty_log_protect(struct kvm * kvm,struct kvm_clear_dirty_log * log)22550dff0846SSean Christopherson static int kvm_clear_dirty_log_protect(struct kvm *kvm,
22560dff0846SSean Christopherson 				       struct kvm_clear_dirty_log *log)
22572a31b9dbSPaolo Bonzini {
22582a31b9dbSPaolo Bonzini 	struct kvm_memslots *slots;
22592a31b9dbSPaolo Bonzini 	struct kvm_memory_slot *memslot;
226098938aa8STomas Bortoli 	int as_id, id;
22612a31b9dbSPaolo Bonzini 	gfn_t offset;
226298938aa8STomas Bortoli 	unsigned long i, n;
22632a31b9dbSPaolo Bonzini 	unsigned long *dirty_bitmap;
22642a31b9dbSPaolo Bonzini 	unsigned long *dirty_bitmap_buffer;
22650dff0846SSean Christopherson 	bool flush;
22662a31b9dbSPaolo Bonzini 
226786bdf3ebSGavin Shan 	/* Dirty ring tracking may be exclusive to dirty log tracking */
226886bdf3ebSGavin Shan 	if (!kvm_use_dirty_bitmap(kvm))
2269b2cc64c4SPeter Xu 		return -ENXIO;
2270b2cc64c4SPeter Xu 
22712a31b9dbSPaolo Bonzini 	as_id = log->slot >> 16;
22722a31b9dbSPaolo Bonzini 	id = (u16)log->slot;
22732a31b9dbSPaolo Bonzini 	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
22742a31b9dbSPaolo Bonzini 		return -EINVAL;
22752a31b9dbSPaolo Bonzini 
227676d58e0fSPaolo Bonzini 	if (log->first_page & 63)
22772a31b9dbSPaolo Bonzini 		return -EINVAL;
22782a31b9dbSPaolo Bonzini 
22792a31b9dbSPaolo Bonzini 	slots = __kvm_memslots(kvm, as_id);
22802a31b9dbSPaolo Bonzini 	memslot = id_to_memslot(slots, id);
22810577d1abSSean Christopherson 	if (!memslot || !memslot->dirty_bitmap)
22820577d1abSSean Christopherson 		return -ENOENT;
22832a31b9dbSPaolo Bonzini 
22842a31b9dbSPaolo Bonzini 	dirty_bitmap = memslot->dirty_bitmap;
22852a31b9dbSPaolo Bonzini 
22864ddc9204SPeter Xu 	n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
228798938aa8STomas Bortoli 
228898938aa8STomas Bortoli 	if (log->first_page > memslot->npages ||
228976d58e0fSPaolo Bonzini 	    log->num_pages > memslot->npages - log->first_page ||
229076d58e0fSPaolo Bonzini 	    (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
229198938aa8STomas Bortoli 	    return -EINVAL;
229298938aa8STomas Bortoli 
22930dff0846SSean Christopherson 	kvm_arch_sync_dirty_log(kvm, memslot);
22940dff0846SSean Christopherson 
22950dff0846SSean Christopherson 	flush = false;
22962a31b9dbSPaolo Bonzini 	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
22972a31b9dbSPaolo Bonzini 	if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
22982a31b9dbSPaolo Bonzini 		return -EFAULT;
22992a31b9dbSPaolo Bonzini 
2300531810caSBen Gardon 	KVM_MMU_LOCK(kvm);
230153eac7a8SPeter Xu 	for (offset = log->first_page, i = offset / BITS_PER_LONG,
230253eac7a8SPeter Xu 		 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
23032a31b9dbSPaolo Bonzini 	     i++, offset += BITS_PER_LONG) {
23042a31b9dbSPaolo Bonzini 		unsigned long mask = *dirty_bitmap_buffer++;
23052a31b9dbSPaolo Bonzini 		atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
23062a31b9dbSPaolo Bonzini 		if (!mask)
23072a31b9dbSPaolo Bonzini 			continue;
23082a31b9dbSPaolo Bonzini 
23092a31b9dbSPaolo Bonzini 		mask &= atomic_long_fetch_andnot(mask, p);
23102a31b9dbSPaolo Bonzini 
23112a31b9dbSPaolo Bonzini 		/*
23122a31b9dbSPaolo Bonzini 		 * mask contains the bits that really have been cleared.  This
23132a31b9dbSPaolo Bonzini 		 * never includes any bits beyond the length of the memslot (if
23142a31b9dbSPaolo Bonzini 		 * the length is not aligned to 64 pages), therefore it is not
23152a31b9dbSPaolo Bonzini 		 * a problem if userspace sets them in log->dirty_bitmap.
23162a31b9dbSPaolo Bonzini 		*/
23172a31b9dbSPaolo Bonzini 		if (mask) {
23180dff0846SSean Christopherson 			flush = true;
23192a31b9dbSPaolo Bonzini 			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
23202a31b9dbSPaolo Bonzini 								offset, mask);
23212a31b9dbSPaolo Bonzini 		}
23222a31b9dbSPaolo Bonzini 	}
2323531810caSBen Gardon 	KVM_MMU_UNLOCK(kvm);
23242a31b9dbSPaolo Bonzini 
23250dff0846SSean Christopherson 	if (flush)
2326619b5072SDavid Matlack 		kvm_flush_remote_tlbs_memslot(kvm, memslot);
23270dff0846SSean Christopherson 
23282a31b9dbSPaolo Bonzini 	return 0;
23292a31b9dbSPaolo Bonzini }
23300dff0846SSean Christopherson 
kvm_vm_ioctl_clear_dirty_log(struct kvm * kvm,struct kvm_clear_dirty_log * log)23310dff0846SSean Christopherson static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
23320dff0846SSean Christopherson 					struct kvm_clear_dirty_log *log)
23330dff0846SSean Christopherson {
23340dff0846SSean Christopherson 	int r;
23350dff0846SSean Christopherson 
23360dff0846SSean Christopherson 	mutex_lock(&kvm->slots_lock);
23370dff0846SSean Christopherson 
23380dff0846SSean Christopherson 	r = kvm_clear_dirty_log_protect(kvm, log);
23390dff0846SSean Christopherson 
23400dff0846SSean Christopherson 	mutex_unlock(&kvm->slots_lock);
23410dff0846SSean Christopherson 	return r;
23420dff0846SSean Christopherson }
23430dff0846SSean Christopherson #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2344ba0513b5SMario Smarduch 
gfn_to_memslot(struct kvm * kvm,gfn_t gfn)234549c7754cSGleb Natapov struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
234649c7754cSGleb Natapov {
234749c7754cSGleb Natapov 	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
234849c7754cSGleb Natapov }
2349a1f4d395SAvi Kivity EXPORT_SYMBOL_GPL(gfn_to_memslot);
23500fce5623SAvi Kivity 
kvm_vcpu_gfn_to_memslot(struct kvm_vcpu * vcpu,gfn_t gfn)23518e73485cSPaolo Bonzini struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
23528e73485cSPaolo Bonzini {
2353fe22ed82SDavid Matlack 	struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2354a54d8066SMaciej S. Szmigiero 	u64 gen = slots->generation;
2355fe22ed82SDavid Matlack 	struct kvm_memory_slot *slot;
2356fe22ed82SDavid Matlack 
2357a54d8066SMaciej S. Szmigiero 	/*
2358a54d8066SMaciej S. Szmigiero 	 * This also protects against using a memslot from a different address space,
2359a54d8066SMaciej S. Szmigiero 	 * since different address spaces have different generation numbers.
2360a54d8066SMaciej S. Szmigiero 	 */
2361a54d8066SMaciej S. Szmigiero 	if (unlikely(gen != vcpu->last_used_slot_gen)) {
2362a54d8066SMaciej S. Szmigiero 		vcpu->last_used_slot = NULL;
2363a54d8066SMaciej S. Szmigiero 		vcpu->last_used_slot_gen = gen;
2364a54d8066SMaciej S. Szmigiero 	}
2365a54d8066SMaciej S. Szmigiero 
2366a54d8066SMaciej S. Szmigiero 	slot = try_get_memslot(vcpu->last_used_slot, gfn);
2367fe22ed82SDavid Matlack 	if (slot)
2368fe22ed82SDavid Matlack 		return slot;
2369fe22ed82SDavid Matlack 
2370fe22ed82SDavid Matlack 	/*
2371fe22ed82SDavid Matlack 	 * Fall back to searching all memslots. We purposely use
2372fe22ed82SDavid Matlack 	 * search_memslots() instead of __gfn_to_memslot() to avoid
2373a54d8066SMaciej S. Szmigiero 	 * thrashing the VM-wide last_used_slot in kvm_memslots.
2374fe22ed82SDavid Matlack 	 */
2375a54d8066SMaciej S. Szmigiero 	slot = search_memslots(slots, gfn, false);
2376fe22ed82SDavid Matlack 	if (slot) {
2377a54d8066SMaciej S. Szmigiero 		vcpu->last_used_slot = slot;
2378fe22ed82SDavid Matlack 		return slot;
2379fe22ed82SDavid Matlack 	}
2380fe22ed82SDavid Matlack 
2381fe22ed82SDavid Matlack 	return NULL;
23828e73485cSPaolo Bonzini }
23838e73485cSPaolo Bonzini 
kvm_is_visible_gfn(struct kvm * kvm,gfn_t gfn)238433e94154SYaowei Bai bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
23850fce5623SAvi Kivity {
2386bf3e05bcSXiao Guangrong 	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
23870fce5623SAvi Kivity 
2388c36b7150SPaolo Bonzini 	return kvm_is_visible_memslot(memslot);
23890fce5623SAvi Kivity }
23900fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
23910fce5623SAvi Kivity 
kvm_vcpu_is_visible_gfn(struct kvm_vcpu * vcpu,gfn_t gfn)2392995decb6SVitaly Kuznetsov bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2393995decb6SVitaly Kuznetsov {
2394995decb6SVitaly Kuznetsov 	struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2395995decb6SVitaly Kuznetsov 
2396995decb6SVitaly Kuznetsov 	return kvm_is_visible_memslot(memslot);
2397995decb6SVitaly Kuznetsov }
2398995decb6SVitaly Kuznetsov EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2399995decb6SVitaly Kuznetsov 
kvm_host_page_size(struct kvm_vcpu * vcpu,gfn_t gfn)2400f9b84e19SSean Christopherson unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
24018f0b1ab6SJoerg Roedel {
24028f0b1ab6SJoerg Roedel 	struct vm_area_struct *vma;
24038f0b1ab6SJoerg Roedel 	unsigned long addr, size;
24048f0b1ab6SJoerg Roedel 
24058f0b1ab6SJoerg Roedel 	size = PAGE_SIZE;
24068f0b1ab6SJoerg Roedel 
240742cde48bSSean Christopherson 	addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
24088f0b1ab6SJoerg Roedel 	if (kvm_is_error_hva(addr))
24098f0b1ab6SJoerg Roedel 		return PAGE_SIZE;
24108f0b1ab6SJoerg Roedel 
2411d8ed45c5SMichel Lespinasse 	mmap_read_lock(current->mm);
24128f0b1ab6SJoerg Roedel 	vma = find_vma(current->mm, addr);
24138f0b1ab6SJoerg Roedel 	if (!vma)
24148f0b1ab6SJoerg Roedel 		goto out;
24158f0b1ab6SJoerg Roedel 
24168f0b1ab6SJoerg Roedel 	size = vma_kernel_pagesize(vma);
24178f0b1ab6SJoerg Roedel 
24188f0b1ab6SJoerg Roedel out:
2419d8ed45c5SMichel Lespinasse 	mmap_read_unlock(current->mm);
24208f0b1ab6SJoerg Roedel 
24218f0b1ab6SJoerg Roedel 	return size;
24228f0b1ab6SJoerg Roedel }
24238f0b1ab6SJoerg Roedel 
memslot_is_readonly(const struct kvm_memory_slot * slot)24248283e36aSBen Gardon static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
24254d8b81abSXiao Guangrong {
24264d8b81abSXiao Guangrong 	return slot->flags & KVM_MEM_READONLY;
24274d8b81abSXiao Guangrong }
24284d8b81abSXiao Guangrong 
__gfn_to_hva_many(const struct kvm_memory_slot * slot,gfn_t gfn,gfn_t * nr_pages,bool write)24298283e36aSBen Gardon static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
24304d8b81abSXiao Guangrong 				       gfn_t *nr_pages, bool write)
24310fce5623SAvi Kivity {
2432bc6678a3SMarcelo Tosatti 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2433ca3a490cSXiao Guangrong 		return KVM_HVA_ERR_BAD;
243448987781SXiao Guangrong 
24354d8b81abSXiao Guangrong 	if (memslot_is_readonly(slot) && write)
24364d8b81abSXiao Guangrong 		return KVM_HVA_ERR_RO_BAD;
243748987781SXiao Guangrong 
243848987781SXiao Guangrong 	if (nr_pages)
243948987781SXiao Guangrong 		*nr_pages = slot->npages - (gfn - slot->base_gfn);
244048987781SXiao Guangrong 
24414d8b81abSXiao Guangrong 	return __gfn_to_hva_memslot(slot, gfn);
24420fce5623SAvi Kivity }
244348987781SXiao Guangrong 
gfn_to_hva_many(struct kvm_memory_slot * slot,gfn_t gfn,gfn_t * nr_pages)24444d8b81abSXiao Guangrong static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
24454d8b81abSXiao Guangrong 				     gfn_t *nr_pages)
24464d8b81abSXiao Guangrong {
24474d8b81abSXiao Guangrong 	return __gfn_to_hva_many(slot, gfn, nr_pages, true);
24484d8b81abSXiao Guangrong }
24494d8b81abSXiao Guangrong 
gfn_to_hva_memslot(struct kvm_memory_slot * slot,gfn_t gfn)24504d8b81abSXiao Guangrong unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
24514d8b81abSXiao Guangrong 					gfn_t gfn)
24524d8b81abSXiao Guangrong {
24534d8b81abSXiao Guangrong 	return gfn_to_hva_many(slot, gfn, NULL);
24544d8b81abSXiao Guangrong }
24554d8b81abSXiao Guangrong EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
24564d8b81abSXiao Guangrong 
gfn_to_hva(struct kvm * kvm,gfn_t gfn)245748987781SXiao Guangrong unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
245848987781SXiao Guangrong {
245949c7754cSGleb Natapov 	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
246048987781SXiao Guangrong }
24610d150298SSheng Yang EXPORT_SYMBOL_GPL(gfn_to_hva);
24620fce5623SAvi Kivity 
kvm_vcpu_gfn_to_hva(struct kvm_vcpu * vcpu,gfn_t gfn)24638e73485cSPaolo Bonzini unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
24648e73485cSPaolo Bonzini {
24658e73485cSPaolo Bonzini 	return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
24668e73485cSPaolo Bonzini }
24678e73485cSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
24688e73485cSPaolo Bonzini 
246986ab8cffSXiao Guangrong /*
2470970c0d4bSWei Yang  * Return the hva of a @gfn and the R/W attribute if possible.
2471970c0d4bSWei Yang  *
2472970c0d4bSWei Yang  * @slot: the kvm_memory_slot which contains @gfn
2473970c0d4bSWei Yang  * @gfn: the gfn to be translated
2474970c0d4bSWei Yang  * @writable: used to return the read/write attribute of the @slot if the hva
2475970c0d4bSWei Yang  * is valid and @writable is not NULL
247686ab8cffSXiao Guangrong  */
gfn_to_hva_memslot_prot(struct kvm_memory_slot * slot,gfn_t gfn,bool * writable)247764d83126SChristoffer Dall unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
247864d83126SChristoffer Dall 				      gfn_t gfn, bool *writable)
24798030089fSGleb Natapov {
2480a2ac07feSGleb Natapov 	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2481a2ac07feSGleb Natapov 
2482a2ac07feSGleb Natapov 	if (!kvm_is_error_hva(hva) && writable)
2483ba6a3541SPaolo Bonzini 		*writable = !memslot_is_readonly(slot);
2484ba6a3541SPaolo Bonzini 
2485a2ac07feSGleb Natapov 	return hva;
248686ab8cffSXiao Guangrong }
248786ab8cffSXiao Guangrong 
gfn_to_hva_prot(struct kvm * kvm,gfn_t gfn,bool * writable)248864d83126SChristoffer Dall unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
248964d83126SChristoffer Dall {
249064d83126SChristoffer Dall 	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
249164d83126SChristoffer Dall 
249264d83126SChristoffer Dall 	return gfn_to_hva_memslot_prot(slot, gfn, writable);
249364d83126SChristoffer Dall }
249464d83126SChristoffer Dall 
kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu * vcpu,gfn_t gfn,bool * writable)24958e73485cSPaolo Bonzini unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
24968e73485cSPaolo Bonzini {
24978e73485cSPaolo Bonzini 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
24988e73485cSPaolo Bonzini 
24998e73485cSPaolo Bonzini 	return gfn_to_hva_memslot_prot(slot, gfn, writable);
25008e73485cSPaolo Bonzini }
25018e73485cSPaolo Bonzini 
check_user_page_hwpoison(unsigned long addr)2502fafc3dbaSHuang Ying static inline int check_user_page_hwpoison(unsigned long addr)
2503fafc3dbaSHuang Ying {
25040d731759SLorenzo Stoakes 	int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
2505fafc3dbaSHuang Ying 
250654d02069SLorenzo Stoakes 	rc = get_user_pages(addr, 1, flags, NULL);
2507fafc3dbaSHuang Ying 	return rc == -EHWPOISON;
2508fafc3dbaSHuang Ying }
2509fafc3dbaSHuang Ying 
25102fc84311SXiao Guangrong /*
2511b9b33da2SPaolo Bonzini  * The fast path to get the writable pfn which will be stored in @pfn,
2512b9b33da2SPaolo Bonzini  * true indicates success, otherwise false is returned.  It's also the
2513311497e0SMiaohe Lin  * only part that runs if we can in atomic context.
25142fc84311SXiao Guangrong  */
hva_to_pfn_fast(unsigned long addr,bool write_fault,bool * writable,kvm_pfn_t * pfn)2515b9b33da2SPaolo Bonzini static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2516b9b33da2SPaolo Bonzini 			    bool *writable, kvm_pfn_t *pfn)
25170fce5623SAvi Kivity {
25180fce5623SAvi Kivity 	struct page *page[1];
25190fce5623SAvi Kivity 
252012ce13feSXiao Guangrong 	/*
252112ce13feSXiao Guangrong 	 * Fast pin a writable pfn only if it is a write fault request
252212ce13feSXiao Guangrong 	 * or the caller allows to map a writable pfn for a read fault
252312ce13feSXiao Guangrong 	 * request.
252412ce13feSXiao Guangrong 	 */
252512ce13feSXiao Guangrong 	if (!(write_fault || writable))
252612ce13feSXiao Guangrong 		return false;
252712ce13feSXiao Guangrong 
2528dadbb612SSouptick Joarder 	if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
25292fc84311SXiao Guangrong 		*pfn = page_to_pfn(page[0]);
2530612819c3SMarcelo Tosatti 
2531612819c3SMarcelo Tosatti 		if (writable)
2532612819c3SMarcelo Tosatti 			*writable = true;
25332fc84311SXiao Guangrong 		return true;
25342fc84311SXiao Guangrong 	}
2535612819c3SMarcelo Tosatti 
25362fc84311SXiao Guangrong 	return false;
25372fc84311SXiao Guangrong }
2538af585b92SGleb Natapov 
25392fc84311SXiao Guangrong /*
25402fc84311SXiao Guangrong  * The slow path to get the pfn of the specified host virtual address,
25412fc84311SXiao Guangrong  * 1 indicates success, -errno is returned if error is detected.
25422fc84311SXiao Guangrong  */
hva_to_pfn_slow(unsigned long addr,bool * async,bool write_fault,bool interruptible,bool * writable,kvm_pfn_t * pfn)25432fc84311SXiao Guangrong static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2544c8b88b33SPeter Xu 			   bool interruptible, bool *writable, kvm_pfn_t *pfn)
25452fc84311SXiao Guangrong {
2546b1e1296dSDavid Hildenbrand 	/*
2547b1e1296dSDavid Hildenbrand 	 * When a VCPU accesses a page that is not mapped into the secondary
2548b1e1296dSDavid Hildenbrand 	 * MMU, we lookup the page using GUP to map it, so the guest VCPU can
2549b1e1296dSDavid Hildenbrand 	 * make progress. We always want to honor NUMA hinting faults in that
2550b1e1296dSDavid Hildenbrand 	 * case, because GUP usage corresponds to memory accesses from the VCPU.
2551b1e1296dSDavid Hildenbrand 	 * Otherwise, we'd not trigger NUMA hinting faults once a page is
2552b1e1296dSDavid Hildenbrand 	 * mapped into the secondary MMU and gets accessed by a VCPU.
2553b1e1296dSDavid Hildenbrand 	 *
2554b1e1296dSDavid Hildenbrand 	 * Note that get_user_page_fast_only() and FOLL_WRITE for now
2555b1e1296dSDavid Hildenbrand 	 * implicitly honor NUMA hinting faults and don't need this flag.
2556b1e1296dSDavid Hildenbrand 	 */
2557b1e1296dSDavid Hildenbrand 	unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT;
2558ce53053cSAl Viro 	struct page *page;
255928249139SLi kunyu 	int npages;
25602fc84311SXiao Guangrong 
25610fce5623SAvi Kivity 	might_sleep();
2562612819c3SMarcelo Tosatti 
2563612819c3SMarcelo Tosatti 	if (writable)
2564612819c3SMarcelo Tosatti 		*writable = write_fault;
2565612819c3SMarcelo Tosatti 
2566d4944b0eSLorenzo Stoakes 	if (write_fault)
2567d4944b0eSLorenzo Stoakes 		flags |= FOLL_WRITE;
2568ce53053cSAl Viro 	if (async)
2569ce53053cSAl Viro 		flags |= FOLL_NOWAIT;
2570c8b88b33SPeter Xu 	if (interruptible)
2571c8b88b33SPeter Xu 		flags |= FOLL_INTERRUPTIBLE;
2572d4944b0eSLorenzo Stoakes 
2573ce53053cSAl Viro 	npages = get_user_pages_unlocked(addr, 1, &page, flags);
25742fc84311SXiao Guangrong 	if (npages != 1)
25752fc84311SXiao Guangrong 		return npages;
2576612819c3SMarcelo Tosatti 
2577612819c3SMarcelo Tosatti 	/* map read fault as writable if possible */
257812ce13feSXiao Guangrong 	if (unlikely(!write_fault) && writable) {
2579ce53053cSAl Viro 		struct page *wpage;
2580612819c3SMarcelo Tosatti 
2581dadbb612SSouptick Joarder 		if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2582612819c3SMarcelo Tosatti 			*writable = true;
2583ce53053cSAl Viro 			put_page(page);
2584ce53053cSAl Viro 			page = wpage;
2585612819c3SMarcelo Tosatti 		}
2586612819c3SMarcelo Tosatti 	}
2587ce53053cSAl Viro 	*pfn = page_to_pfn(page);
25882fc84311SXiao Guangrong 	return npages;
2589887c08acSXiao Guangrong }
25900fce5623SAvi Kivity 
vma_is_valid(struct vm_area_struct * vma,bool write_fault)25914d8b81abSXiao Guangrong static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
25924d8b81abSXiao Guangrong {
25934d8b81abSXiao Guangrong 	if (unlikely(!(vma->vm_flags & VM_READ)))
25944d8b81abSXiao Guangrong 		return false;
25954d8b81abSXiao Guangrong 
25964d8b81abSXiao Guangrong 	if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
25974d8b81abSXiao Guangrong 		return false;
25984d8b81abSXiao Guangrong 
25994d8b81abSXiao Guangrong 	return true;
26004d8b81abSXiao Guangrong }
26014d8b81abSXiao Guangrong 
kvm_try_get_pfn(kvm_pfn_t pfn)2602f8be156bSNicholas Piggin static int kvm_try_get_pfn(kvm_pfn_t pfn)
2603f8be156bSNicholas Piggin {
2604b14b2690SSean Christopherson 	struct page *page = kvm_pfn_to_refcounted_page(pfn);
2605b14b2690SSean Christopherson 
2606b14b2690SSean Christopherson 	if (!page)
2607f8be156bSNicholas Piggin 		return 1;
2608b14b2690SSean Christopherson 
2609b14b2690SSean Christopherson 	return get_page_unless_zero(page);
2610f8be156bSNicholas Piggin }
2611f8be156bSNicholas Piggin 
hva_to_pfn_remapped(struct vm_area_struct * vma,unsigned long addr,bool write_fault,bool * writable,kvm_pfn_t * p_pfn)261292176a8eSPaolo Bonzini static int hva_to_pfn_remapped(struct vm_area_struct *vma,
26131625566eSXianting Tian 			       unsigned long addr, bool write_fault,
26141625566eSXianting Tian 			       bool *writable, kvm_pfn_t *p_pfn)
261592176a8eSPaolo Bonzini {
2616a9545779SSean Christopherson 	kvm_pfn_t pfn;
2617bd2fae8dSPaolo Bonzini 	pte_t *ptep;
2618c33c7948SRyan Roberts 	pte_t pte;
2619bd2fae8dSPaolo Bonzini 	spinlock_t *ptl;
2620add6a0cdSPaolo Bonzini 	int r;
2621add6a0cdSPaolo Bonzini 
26229fd6dad1SPaolo Bonzini 	r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2623add6a0cdSPaolo Bonzini 	if (r) {
2624add6a0cdSPaolo Bonzini 		/*
2625add6a0cdSPaolo Bonzini 		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
2626add6a0cdSPaolo Bonzini 		 * not call the fault handler, so do it here.
2627add6a0cdSPaolo Bonzini 		 */
2628add6a0cdSPaolo Bonzini 		bool unlocked = false;
262964019a2eSPeter Xu 		r = fixup_user_fault(current->mm, addr,
2630add6a0cdSPaolo Bonzini 				     (write_fault ? FAULT_FLAG_WRITE : 0),
2631add6a0cdSPaolo Bonzini 				     &unlocked);
2632a8387d0bSPaolo Bonzini 		if (unlocked)
2633a8387d0bSPaolo Bonzini 			return -EAGAIN;
2634add6a0cdSPaolo Bonzini 		if (r)
2635add6a0cdSPaolo Bonzini 			return r;
2636add6a0cdSPaolo Bonzini 
26379fd6dad1SPaolo Bonzini 		r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2638add6a0cdSPaolo Bonzini 		if (r)
2639add6a0cdSPaolo Bonzini 			return r;
2640bd2fae8dSPaolo Bonzini 	}
2641add6a0cdSPaolo Bonzini 
2642c33c7948SRyan Roberts 	pte = ptep_get(ptep);
2643c33c7948SRyan Roberts 
2644c33c7948SRyan Roberts 	if (write_fault && !pte_write(pte)) {
2645bd2fae8dSPaolo Bonzini 		pfn = KVM_PFN_ERR_RO_FAULT;
2646bd2fae8dSPaolo Bonzini 		goto out;
2647add6a0cdSPaolo Bonzini 	}
2648add6a0cdSPaolo Bonzini 
2649a340b3e2SKarimAllah Ahmed 	if (writable)
2650c33c7948SRyan Roberts 		*writable = pte_write(pte);
2651c33c7948SRyan Roberts 	pfn = pte_pfn(pte);
2652add6a0cdSPaolo Bonzini 
2653add6a0cdSPaolo Bonzini 	/*
2654add6a0cdSPaolo Bonzini 	 * Get a reference here because callers of *hva_to_pfn* and
2655add6a0cdSPaolo Bonzini 	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
2656add6a0cdSPaolo Bonzini 	 * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
265736c3ce6cSMarc Zyngier 	 * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
2658add6a0cdSPaolo Bonzini 	 * simply do nothing for reserved pfns.
2659add6a0cdSPaolo Bonzini 	 *
2660add6a0cdSPaolo Bonzini 	 * Whoever called remap_pfn_range is also going to call e.g.
2661add6a0cdSPaolo Bonzini 	 * unmap_mapping_range before the underlying pages are freed,
2662add6a0cdSPaolo Bonzini 	 * causing a call to our MMU notifier.
2663f8be156bSNicholas Piggin 	 *
2664f8be156bSNicholas Piggin 	 * Certain IO or PFNMAP mappings can be backed with valid
2665f8be156bSNicholas Piggin 	 * struct pages, but be allocated without refcounting e.g.,
2666f8be156bSNicholas Piggin 	 * tail pages of non-compound higher order allocations, which
2667f8be156bSNicholas Piggin 	 * would then underflow the refcount when the caller does the
2668f8be156bSNicholas Piggin 	 * required put_page. Don't allow those pages here.
2669add6a0cdSPaolo Bonzini 	 */
2670f8be156bSNicholas Piggin 	if (!kvm_try_get_pfn(pfn))
2671f8be156bSNicholas Piggin 		r = -EFAULT;
2672add6a0cdSPaolo Bonzini 
2673bd2fae8dSPaolo Bonzini out:
2674bd2fae8dSPaolo Bonzini 	pte_unmap_unlock(ptep, ptl);
2675add6a0cdSPaolo Bonzini 	*p_pfn = pfn;
2676f8be156bSNicholas Piggin 
2677f8be156bSNicholas Piggin 	return r;
267892176a8eSPaolo Bonzini }
267992176a8eSPaolo Bonzini 
268012ce13feSXiao Guangrong /*
268112ce13feSXiao Guangrong  * Pin guest page in memory and return its pfn.
268212ce13feSXiao Guangrong  * @addr: host virtual address which maps memory to the guest
268312ce13feSXiao Guangrong  * @atomic: whether this function can sleep
2684c8b88b33SPeter Xu  * @interruptible: whether the process can be interrupted by non-fatal signals
268512ce13feSXiao Guangrong  * @async: whether this function need to wait IO complete if the
268612ce13feSXiao Guangrong  *         host page is not in the memory
268712ce13feSXiao Guangrong  * @write_fault: whether we should get a writable host page
268812ce13feSXiao Guangrong  * @writable: whether it allows to map a writable host page for !@write_fault
268912ce13feSXiao Guangrong  *
269012ce13feSXiao Guangrong  * The function will map a writable host page for these two cases:
269112ce13feSXiao Guangrong  * 1): @write_fault = true
269212ce13feSXiao Guangrong  * 2): @write_fault = false && @writable, @writable will tell the caller
269312ce13feSXiao Guangrong  *     whether the mapping is writable.
269412ce13feSXiao Guangrong  */
hva_to_pfn(unsigned long addr,bool atomic,bool interruptible,bool * async,bool write_fault,bool * writable)2695c8b88b33SPeter Xu kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
2696c8b88b33SPeter Xu 		     bool *async, bool write_fault, bool *writable)
26972fc84311SXiao Guangrong {
26982e2e3738SAnthony Liguori 	struct vm_area_struct *vma;
2699943dfea8SSean Christopherson 	kvm_pfn_t pfn;
270092176a8eSPaolo Bonzini 	int npages, r;
27012fc84311SXiao Guangrong 
27022fc84311SXiao Guangrong 	/* we can do it either atomically or asynchronously, not both */
27032fc84311SXiao Guangrong 	BUG_ON(atomic && async);
27042fc84311SXiao Guangrong 
2705b9b33da2SPaolo Bonzini 	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
27062fc84311SXiao Guangrong 		return pfn;
27072e2e3738SAnthony Liguori 
2708887c08acSXiao Guangrong 	if (atomic)
27096c8ee57bSXiao Guangrong 		return KVM_PFN_ERR_FAULT;
2710887c08acSXiao Guangrong 
2711c8b88b33SPeter Xu 	npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
2712c8b88b33SPeter Xu 				 writable, &pfn);
27132fc84311SXiao Guangrong 	if (npages == 1)
27142fc84311SXiao Guangrong 		return pfn;
2715fe5ed56cSPeter Xu 	if (npages == -EINTR)
2716fe5ed56cSPeter Xu 		return KVM_PFN_ERR_SIGPENDING;
27172e2e3738SAnthony Liguori 
2718d8ed45c5SMichel Lespinasse 	mmap_read_lock(current->mm);
27190857b9e9SGleb Natapov 	if (npages == -EHWPOISON ||
27200857b9e9SGleb Natapov 	      (!async && check_user_page_hwpoison(addr))) {
27212fc84311SXiao Guangrong 		pfn = KVM_PFN_ERR_HWPOISON;
27222fc84311SXiao Guangrong 		goto exit;
2723bf998156SHuang Ying 	}
2724bf998156SHuang Ying 
2725a8387d0bSPaolo Bonzini retry:
2726fc98c03bSLiam Howlett 	vma = vma_lookup(current->mm, addr);
27274c2155ceSMarcelo Tosatti 
27288030089fSGleb Natapov 	if (vma == NULL)
27296c8ee57bSXiao Guangrong 		pfn = KVM_PFN_ERR_FAULT;
273092176a8eSPaolo Bonzini 	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
27311625566eSXianting Tian 		r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
2732a8387d0bSPaolo Bonzini 		if (r == -EAGAIN)
2733a8387d0bSPaolo Bonzini 			goto retry;
273492176a8eSPaolo Bonzini 		if (r < 0)
273592176a8eSPaolo Bonzini 			pfn = KVM_PFN_ERR_FAULT;
27368030089fSGleb Natapov 	} else {
27374d8b81abSXiao Guangrong 		if (async && vma_is_valid(vma, write_fault))
27388030089fSGleb Natapov 			*async = true;
27396c8ee57bSXiao Guangrong 		pfn = KVM_PFN_ERR_FAULT;
27408030089fSGleb Natapov 	}
27412fc84311SXiao Guangrong exit:
2742d8ed45c5SMichel Lespinasse 	mmap_read_unlock(current->mm);
27432e2e3738SAnthony Liguori 	return pfn;
274435149e21SAnthony Liguori }
274535149e21SAnthony Liguori 
__gfn_to_pfn_memslot(const struct kvm_memory_slot * slot,gfn_t gfn,bool atomic,bool interruptible,bool * async,bool write_fault,bool * writable,hva_t * hva)27468283e36aSBen Gardon kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
2747c8b88b33SPeter Xu 			       bool atomic, bool interruptible, bool *async,
2748c8b88b33SPeter Xu 			       bool write_fault, bool *writable, hva_t *hva)
2749887c08acSXiao Guangrong {
27504d8b81abSXiao Guangrong 	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
27514d8b81abSXiao Guangrong 
27524a42d848SDavid Stevens 	if (hva)
27534a42d848SDavid Stevens 		*hva = addr;
27544a42d848SDavid Stevens 
2755b2740d35SPaolo Bonzini 	if (addr == KVM_HVA_ERR_RO_BAD) {
2756b2740d35SPaolo Bonzini 		if (writable)
2757b2740d35SPaolo Bonzini 			*writable = false;
27584d8b81abSXiao Guangrong 		return KVM_PFN_ERR_RO_FAULT;
2759b2740d35SPaolo Bonzini 	}
27604d8b81abSXiao Guangrong 
2761b2740d35SPaolo Bonzini 	if (kvm_is_error_hva(addr)) {
2762b2740d35SPaolo Bonzini 		if (writable)
2763b2740d35SPaolo Bonzini 			*writable = false;
276481c52c56SXiao Guangrong 		return KVM_PFN_NOSLOT;
2765b2740d35SPaolo Bonzini 	}
27664d8b81abSXiao Guangrong 
27674d8b81abSXiao Guangrong 	/* Do not map writable pfn in the readonly memslot. */
27684d8b81abSXiao Guangrong 	if (writable && memslot_is_readonly(slot)) {
27694d8b81abSXiao Guangrong 		*writable = false;
27704d8b81abSXiao Guangrong 		writable = NULL;
2771887c08acSXiao Guangrong 	}
27724d8b81abSXiao Guangrong 
2773c8b88b33SPeter Xu 	return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
27744d8b81abSXiao Guangrong 			  writable);
27754d8b81abSXiao Guangrong }
27763520469dSPaolo Bonzini EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
2777887c08acSXiao Guangrong 
gfn_to_pfn_prot(struct kvm * kvm,gfn_t gfn,bool write_fault,bool * writable)2778ba049e93SDan Williams kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
2779612819c3SMarcelo Tosatti 		      bool *writable)
2780612819c3SMarcelo Tosatti {
2781c8b88b33SPeter Xu 	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
2782c8b88b33SPeter Xu 				    NULL, write_fault, writable, NULL);
2783612819c3SMarcelo Tosatti }
2784612819c3SMarcelo Tosatti EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2785612819c3SMarcelo Tosatti 
gfn_to_pfn_memslot(const struct kvm_memory_slot * slot,gfn_t gfn)27868283e36aSBen Gardon kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
2787506f0d6fSMarcelo Tosatti {
2788c8b88b33SPeter Xu 	return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
2789c8b88b33SPeter Xu 				    NULL, NULL);
2790506f0d6fSMarcelo Tosatti }
2791e37afc6eSPaolo Bonzini EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
2792506f0d6fSMarcelo Tosatti 
gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot * slot,gfn_t gfn)27938283e36aSBen Gardon kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
2794037d92dcSXiao Guangrong {
2795c8b88b33SPeter Xu 	return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
2796c8b88b33SPeter Xu 				    NULL, NULL);
2797037d92dcSXiao Guangrong }
2798037d92dcSXiao Guangrong EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2799037d92dcSXiao Guangrong 
kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu * vcpu,gfn_t gfn)2800ba049e93SDan Williams kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
28018e73485cSPaolo Bonzini {
28028e73485cSPaolo Bonzini 	return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
28038e73485cSPaolo Bonzini }
28048e73485cSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
28058e73485cSPaolo Bonzini 
gfn_to_pfn(struct kvm * kvm,gfn_t gfn)2806ba049e93SDan Williams kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
2807e37afc6eSPaolo Bonzini {
2808e37afc6eSPaolo Bonzini 	return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2809e37afc6eSPaolo Bonzini }
2810e37afc6eSPaolo Bonzini EXPORT_SYMBOL_GPL(gfn_to_pfn);
2811e37afc6eSPaolo Bonzini 
kvm_vcpu_gfn_to_pfn(struct kvm_vcpu * vcpu,gfn_t gfn)2812ba049e93SDan Williams kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
28138e73485cSPaolo Bonzini {
28148e73485cSPaolo Bonzini 	return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
28158e73485cSPaolo Bonzini }
28168e73485cSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
28178e73485cSPaolo Bonzini 
gfn_to_page_many_atomic(struct kvm_memory_slot * slot,gfn_t gfn,struct page ** pages,int nr_pages)2818d9ef13c2SPaolo Bonzini int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2819d9ef13c2SPaolo Bonzini 			    struct page **pages, int nr_pages)
282048987781SXiao Guangrong {
282148987781SXiao Guangrong 	unsigned long addr;
2822076b925dSArnd Bergmann 	gfn_t entry = 0;
282348987781SXiao Guangrong 
2824d9ef13c2SPaolo Bonzini 	addr = gfn_to_hva_many(slot, gfn, &entry);
282548987781SXiao Guangrong 	if (kvm_is_error_hva(addr))
282648987781SXiao Guangrong 		return -1;
282748987781SXiao Guangrong 
282848987781SXiao Guangrong 	if (entry < nr_pages)
282948987781SXiao Guangrong 		return 0;
283048987781SXiao Guangrong 
2831dadbb612SSouptick Joarder 	return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
283248987781SXiao Guangrong }
283348987781SXiao Guangrong EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
283448987781SXiao Guangrong 
2835b1624f99SSean Christopherson /*
2836b1624f99SSean Christopherson  * Do not use this helper unless you are absolutely certain the gfn _must_ be
2837b1624f99SSean Christopherson  * backed by 'struct page'.  A valid example is if the backing memslot is
2838b1624f99SSean Christopherson  * controlled by KVM.  Note, if the returned page is valid, it's refcount has
2839b1624f99SSean Christopherson  * been elevated by gfn_to_pfn().
2840b1624f99SSean Christopherson  */
gfn_to_page(struct kvm * kvm,gfn_t gfn)284135149e21SAnthony Liguori struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
284235149e21SAnthony Liguori {
2843b14b2690SSean Christopherson 	struct page *page;
2844ba049e93SDan Williams 	kvm_pfn_t pfn;
28452e2e3738SAnthony Liguori 
28462e2e3738SAnthony Liguori 	pfn = gfn_to_pfn(kvm, gfn);
28472e2e3738SAnthony Liguori 
2848c77fb9dcSXiantao Zhang 	if (is_error_noslot_pfn(pfn))
28492e2e3738SAnthony Liguori 		return KVM_ERR_PTR_BAD_PAGE;
28502e2e3738SAnthony Liguori 
2851b14b2690SSean Christopherson 	page = kvm_pfn_to_refcounted_page(pfn);
2852b14b2690SSean Christopherson 	if (!page)
28530fce5623SAvi Kivity 		return KVM_ERR_PTR_BAD_PAGE;
28540fce5623SAvi Kivity 
2855b14b2690SSean Christopherson 	return page;
28560fce5623SAvi Kivity }
28570fce5623SAvi Kivity EXPORT_SYMBOL_GPL(gfn_to_page);
28580fce5623SAvi Kivity 
kvm_release_pfn(kvm_pfn_t pfn,bool dirty)2859357a18adSDavid Woodhouse void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
286091724814SBoris Ostrovsky {
286191724814SBoris Ostrovsky 	if (dirty)
286291724814SBoris Ostrovsky 		kvm_release_pfn_dirty(pfn);
286391724814SBoris Ostrovsky 	else
286491724814SBoris Ostrovsky 		kvm_release_pfn_clean(pfn);
286591724814SBoris Ostrovsky }
286691724814SBoris Ostrovsky 
kvm_vcpu_map(struct kvm_vcpu * vcpu,gfn_t gfn,struct kvm_host_map * map)2867357a18adSDavid Woodhouse int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
2868e45adf66SKarimAllah Ahmed {
2869e45adf66SKarimAllah Ahmed 	kvm_pfn_t pfn;
2870e45adf66SKarimAllah Ahmed 	void *hva = NULL;
2871e45adf66SKarimAllah Ahmed 	struct page *page = KVM_UNMAPPED_PAGE;
2872e45adf66SKarimAllah Ahmed 
2873e45adf66SKarimAllah Ahmed 	if (!map)
2874e45adf66SKarimAllah Ahmed 		return -EINVAL;
2875e45adf66SKarimAllah Ahmed 
2876357a18adSDavid Woodhouse 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
2877e45adf66SKarimAllah Ahmed 	if (is_error_noslot_pfn(pfn))
2878e45adf66SKarimAllah Ahmed 		return -EINVAL;
2879e45adf66SKarimAllah Ahmed 
2880e45adf66SKarimAllah Ahmed 	if (pfn_valid(pfn)) {
2881e45adf66SKarimAllah Ahmed 		page = pfn_to_page(pfn);
2882e45adf66SKarimAllah Ahmed 		hva = kmap(page);
2883d30b214dSPaolo Bonzini #ifdef CONFIG_HAS_IOMEM
288491724814SBoris Ostrovsky 	} else {
2885357a18adSDavid Woodhouse 		hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
2886d30b214dSPaolo Bonzini #endif
2887e45adf66SKarimAllah Ahmed 	}
2888e45adf66SKarimAllah Ahmed 
2889e45adf66SKarimAllah Ahmed 	if (!hva)
2890e45adf66SKarimAllah Ahmed 		return -EFAULT;
2891e45adf66SKarimAllah Ahmed 
2892e45adf66SKarimAllah Ahmed 	map->page = page;
2893e45adf66SKarimAllah Ahmed 	map->hva = hva;
2894e45adf66SKarimAllah Ahmed 	map->pfn = pfn;
2895e45adf66SKarimAllah Ahmed 	map->gfn = gfn;
2896e45adf66SKarimAllah Ahmed 
2897e45adf66SKarimAllah Ahmed 	return 0;
2898e45adf66SKarimAllah Ahmed }
2899e45adf66SKarimAllah Ahmed EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2900e45adf66SKarimAllah Ahmed 
kvm_vcpu_unmap(struct kvm_vcpu * vcpu,struct kvm_host_map * map,bool dirty)2901357a18adSDavid Woodhouse void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
2902e45adf66SKarimAllah Ahmed {
2903e45adf66SKarimAllah Ahmed 	if (!map)
2904e45adf66SKarimAllah Ahmed 		return;
2905e45adf66SKarimAllah Ahmed 
2906e45adf66SKarimAllah Ahmed 	if (!map->hva)
2907e45adf66SKarimAllah Ahmed 		return;
2908e45adf66SKarimAllah Ahmed 
2909357a18adSDavid Woodhouse 	if (map->page != KVM_UNMAPPED_PAGE)
291091724814SBoris Ostrovsky 		kunmap(map->page);
291191724814SBoris Ostrovsky #ifdef CONFIG_HAS_IOMEM
291291724814SBoris Ostrovsky 	else
2913357a18adSDavid Woodhouse 		memunmap(map->hva);
2914eb1f2f38SChristian Borntraeger #endif
2915e45adf66SKarimAllah Ahmed 
291691724814SBoris Ostrovsky 	if (dirty)
2917357a18adSDavid Woodhouse 		kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
291891724814SBoris Ostrovsky 
2919357a18adSDavid Woodhouse 	kvm_release_pfn(map->pfn, dirty);
2920e45adf66SKarimAllah Ahmed 
2921e45adf66SKarimAllah Ahmed 	map->hva = NULL;
2922e45adf66SKarimAllah Ahmed 	map->page = NULL;
2923e45adf66SKarimAllah Ahmed }
2924e45adf66SKarimAllah Ahmed EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
2925e45adf66SKarimAllah Ahmed 
kvm_is_ad_tracked_page(struct page * page)29268e1c6914SSean Christopherson static bool kvm_is_ad_tracked_page(struct page *page)
29278e73485cSPaolo Bonzini {
29288e1c6914SSean Christopherson 	/*
29298e1c6914SSean Christopherson 	 * Per page-flags.h, pages tagged PG_reserved "should in general not be
29308e1c6914SSean Christopherson 	 * touched (e.g. set dirty) except by its owner".
29318e1c6914SSean Christopherson 	 */
29328e1c6914SSean Christopherson 	return !PageReserved(page);
29338e73485cSPaolo Bonzini }
29348e1c6914SSean Christopherson 
kvm_set_page_dirty(struct page * page)29358e1c6914SSean Christopherson static void kvm_set_page_dirty(struct page *page)
29368e1c6914SSean Christopherson {
29378e1c6914SSean Christopherson 	if (kvm_is_ad_tracked_page(page))
29388e1c6914SSean Christopherson 		SetPageDirty(page);
29398e1c6914SSean Christopherson }
29408e1c6914SSean Christopherson 
kvm_set_page_accessed(struct page * page)29418e1c6914SSean Christopherson static void kvm_set_page_accessed(struct page *page)
29428e1c6914SSean Christopherson {
29438e1c6914SSean Christopherson 	if (kvm_is_ad_tracked_page(page))
29448e1c6914SSean Christopherson 		mark_page_accessed(page);
29458e1c6914SSean Christopherson }
29468e73485cSPaolo Bonzini 
kvm_release_page_clean(struct page * page)29470fce5623SAvi Kivity void kvm_release_page_clean(struct page *page)
29480fce5623SAvi Kivity {
294932cad84fSXiao Guangrong 	WARN_ON(is_error_page(page));
295032cad84fSXiao Guangrong 
29518e1c6914SSean Christopherson 	kvm_set_page_accessed(page);
29528e1c6914SSean Christopherson 	put_page(page);
29530fce5623SAvi Kivity }
29540fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_release_page_clean);
29550fce5623SAvi Kivity 
kvm_release_pfn_clean(kvm_pfn_t pfn)2956ba049e93SDan Williams void kvm_release_pfn_clean(kvm_pfn_t pfn)
295735149e21SAnthony Liguori {
2958b14b2690SSean Christopherson 	struct page *page;
2959b14b2690SSean Christopherson 
2960b14b2690SSean Christopherson 	if (is_error_noslot_pfn(pfn))
2961b14b2690SSean Christopherson 		return;
2962b14b2690SSean Christopherson 
2963b14b2690SSean Christopherson 	page = kvm_pfn_to_refcounted_page(pfn);
2964b14b2690SSean Christopherson 	if (!page)
2965b14b2690SSean Christopherson 		return;
2966b14b2690SSean Christopherson 
2967b14b2690SSean Christopherson 	kvm_release_page_clean(page);
296835149e21SAnthony Liguori }
296935149e21SAnthony Liguori EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
297035149e21SAnthony Liguori 
kvm_release_page_dirty(struct page * page)29710fce5623SAvi Kivity void kvm_release_page_dirty(struct page *page)
29720fce5623SAvi Kivity {
2973a2766325SXiao Guangrong 	WARN_ON(is_error_page(page));
2974a2766325SXiao Guangrong 
29758e1c6914SSean Christopherson 	kvm_set_page_dirty(page);
29768e1c6914SSean Christopherson 	kvm_release_page_clean(page);
29770fce5623SAvi Kivity }
29780fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
29790fce5623SAvi Kivity 
kvm_release_pfn_dirty(kvm_pfn_t pfn)2980f7a6509fSDavid Hildenbrand void kvm_release_pfn_dirty(kvm_pfn_t pfn)
298135149e21SAnthony Liguori {
2982b14b2690SSean Christopherson 	struct page *page;
2983b14b2690SSean Christopherson 
2984b14b2690SSean Christopherson 	if (is_error_noslot_pfn(pfn))
2985b14b2690SSean Christopherson 		return;
2986b14b2690SSean Christopherson 
2987b14b2690SSean Christopherson 	page = kvm_pfn_to_refcounted_page(pfn);
2988b14b2690SSean Christopherson 	if (!page)
2989b14b2690SSean Christopherson 		return;
2990b14b2690SSean Christopherson 
2991b14b2690SSean Christopherson 	kvm_release_page_dirty(page);
299235149e21SAnthony Liguori }
2993f7a6509fSDavid Hildenbrand EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
299435149e21SAnthony Liguori 
2995a1040b0dSSean Christopherson /*
29968e1c6914SSean Christopherson  * Note, checking for an error/noslot pfn is the caller's responsibility when
29978e1c6914SSean Christopherson  * directly marking a page dirty/accessed.  Unlike the "release" helpers, the
29988e1c6914SSean Christopherson  * "set" helpers are not to be used when the pfn might point at garbage.
2999a1040b0dSSean Christopherson  */
kvm_set_pfn_dirty(kvm_pfn_t pfn)3000ba049e93SDan Williams void kvm_set_pfn_dirty(kvm_pfn_t pfn)
300135149e21SAnthony Liguori {
30028e1c6914SSean Christopherson 	if (WARN_ON(is_error_noslot_pfn(pfn)))
30038e1c6914SSean Christopherson 		return;
30048e1c6914SSean Christopherson 
30058e1c6914SSean Christopherson 	if (pfn_valid(pfn))
30068e1c6914SSean Christopherson 		kvm_set_page_dirty(pfn_to_page(pfn));
30072e2e3738SAnthony Liguori }
300835149e21SAnthony Liguori EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
300935149e21SAnthony Liguori 
kvm_set_pfn_accessed(kvm_pfn_t pfn)3010ba049e93SDan Williams void kvm_set_pfn_accessed(kvm_pfn_t pfn)
301135149e21SAnthony Liguori {
30128e1c6914SSean Christopherson 	if (WARN_ON(is_error_noslot_pfn(pfn)))
30138e1c6914SSean Christopherson 		return;
30148e1c6914SSean Christopherson 
30158e1c6914SSean Christopherson 	if (pfn_valid(pfn))
30168e1c6914SSean Christopherson 		kvm_set_page_accessed(pfn_to_page(pfn));
301735149e21SAnthony Liguori }
301835149e21SAnthony Liguori EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
301935149e21SAnthony Liguori 
next_segment(unsigned long len,int offset)30200fce5623SAvi Kivity static int next_segment(unsigned long len, int offset)
30210fce5623SAvi Kivity {
30220fce5623SAvi Kivity 	if (len > PAGE_SIZE - offset)
30230fce5623SAvi Kivity 		return PAGE_SIZE - offset;
30240fce5623SAvi Kivity 	else
30250fce5623SAvi Kivity 		return len;
30260fce5623SAvi Kivity }
30270fce5623SAvi Kivity 
__kvm_read_guest_page(struct kvm_memory_slot * slot,gfn_t gfn,void * data,int offset,int len)30288e73485cSPaolo Bonzini static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
30298e73485cSPaolo Bonzini 				 void *data, int offset, int len)
30300fce5623SAvi Kivity {
30310fce5623SAvi Kivity 	int r;
30320fce5623SAvi Kivity 	unsigned long addr;
30330fce5623SAvi Kivity 
30348e73485cSPaolo Bonzini 	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
30350fce5623SAvi Kivity 	if (kvm_is_error_hva(addr))
30360fce5623SAvi Kivity 		return -EFAULT;
30373180a7fcSPaolo Bonzini 	r = __copy_from_user(data, (void __user *)addr + offset, len);
30380fce5623SAvi Kivity 	if (r)
30390fce5623SAvi Kivity 		return -EFAULT;
30400fce5623SAvi Kivity 	return 0;
30410fce5623SAvi Kivity }
30428e73485cSPaolo Bonzini 
kvm_read_guest_page(struct kvm * kvm,gfn_t gfn,void * data,int offset,int len)30438e73485cSPaolo Bonzini int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
30448e73485cSPaolo Bonzini 			int len)
30458e73485cSPaolo Bonzini {
30468e73485cSPaolo Bonzini 	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
30478e73485cSPaolo Bonzini 
30488e73485cSPaolo Bonzini 	return __kvm_read_guest_page(slot, gfn, data, offset, len);
30498e73485cSPaolo Bonzini }
30500fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_read_guest_page);
30510fce5623SAvi Kivity 
kvm_vcpu_read_guest_page(struct kvm_vcpu * vcpu,gfn_t gfn,void * data,int offset,int len)30528e73485cSPaolo Bonzini int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
30538e73485cSPaolo Bonzini 			     int offset, int len)
30548e73485cSPaolo Bonzini {
30558e73485cSPaolo Bonzini 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
30568e73485cSPaolo Bonzini 
30578e73485cSPaolo Bonzini 	return __kvm_read_guest_page(slot, gfn, data, offset, len);
30588e73485cSPaolo Bonzini }
30598e73485cSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
30608e73485cSPaolo Bonzini 
kvm_read_guest(struct kvm * kvm,gpa_t gpa,void * data,unsigned long len)30610fce5623SAvi Kivity int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
30620fce5623SAvi Kivity {
30630fce5623SAvi Kivity 	gfn_t gfn = gpa >> PAGE_SHIFT;
30640fce5623SAvi Kivity 	int seg;
30650fce5623SAvi Kivity 	int offset = offset_in_page(gpa);
30660fce5623SAvi Kivity 	int ret;
30670fce5623SAvi Kivity 
30680fce5623SAvi Kivity 	while ((seg = next_segment(len, offset)) != 0) {
30690fce5623SAvi Kivity 		ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
30700fce5623SAvi Kivity 		if (ret < 0)
30710fce5623SAvi Kivity 			return ret;
30720fce5623SAvi Kivity 		offset = 0;
30730fce5623SAvi Kivity 		len -= seg;
30740fce5623SAvi Kivity 		data += seg;
30750fce5623SAvi Kivity 		++gfn;
30760fce5623SAvi Kivity 	}
30770fce5623SAvi Kivity 	return 0;
30780fce5623SAvi Kivity }
30790fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_read_guest);
30800fce5623SAvi Kivity 
kvm_vcpu_read_guest(struct kvm_vcpu * vcpu,gpa_t gpa,void * data,unsigned long len)30818e73485cSPaolo Bonzini int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
30828e73485cSPaolo Bonzini {
30838e73485cSPaolo Bonzini 	gfn_t gfn = gpa >> PAGE_SHIFT;
30848e73485cSPaolo Bonzini 	int seg;
30858e73485cSPaolo Bonzini 	int offset = offset_in_page(gpa);
30868e73485cSPaolo Bonzini 	int ret;
30878e73485cSPaolo Bonzini 
30888e73485cSPaolo Bonzini 	while ((seg = next_segment(len, offset)) != 0) {
30898e73485cSPaolo Bonzini 		ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
30908e73485cSPaolo Bonzini 		if (ret < 0)
30918e73485cSPaolo Bonzini 			return ret;
30928e73485cSPaolo Bonzini 		offset = 0;
30938e73485cSPaolo Bonzini 		len -= seg;
30948e73485cSPaolo Bonzini 		data += seg;
30958e73485cSPaolo Bonzini 		++gfn;
30968e73485cSPaolo Bonzini 	}
30978e73485cSPaolo Bonzini 	return 0;
30988e73485cSPaolo Bonzini }
30998e73485cSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
31008e73485cSPaolo Bonzini 
__kvm_read_guest_atomic(struct kvm_memory_slot * slot,gfn_t gfn,void * data,int offset,unsigned long len)31018e73485cSPaolo Bonzini static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
31028e73485cSPaolo Bonzini 			           void *data, int offset, unsigned long len)
31037ec54588SMarcelo Tosatti {
31047ec54588SMarcelo Tosatti 	int r;
31057ec54588SMarcelo Tosatti 	unsigned long addr;
31067ec54588SMarcelo Tosatti 
31078e73485cSPaolo Bonzini 	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
31087ec54588SMarcelo Tosatti 	if (kvm_is_error_hva(addr))
31097ec54588SMarcelo Tosatti 		return -EFAULT;
31100aac03f0SAndrea Arcangeli 	pagefault_disable();
31113180a7fcSPaolo Bonzini 	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
31120aac03f0SAndrea Arcangeli 	pagefault_enable();
31137ec54588SMarcelo Tosatti 	if (r)
31147ec54588SMarcelo Tosatti 		return -EFAULT;
31157ec54588SMarcelo Tosatti 	return 0;
31167ec54588SMarcelo Tosatti }
31177ec54588SMarcelo Tosatti 
kvm_vcpu_read_guest_atomic(struct kvm_vcpu * vcpu,gpa_t gpa,void * data,unsigned long len)31188e73485cSPaolo Bonzini int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
31198e73485cSPaolo Bonzini 			       void *data, unsigned long len)
31208e73485cSPaolo Bonzini {
31218e73485cSPaolo Bonzini 	gfn_t gfn = gpa >> PAGE_SHIFT;
31228e73485cSPaolo Bonzini 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
31238e73485cSPaolo Bonzini 	int offset = offset_in_page(gpa);
31248e73485cSPaolo Bonzini 
31258e73485cSPaolo Bonzini 	return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
31268e73485cSPaolo Bonzini }
31278e73485cSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
31288e73485cSPaolo Bonzini 
__kvm_write_guest_page(struct kvm * kvm,struct kvm_memory_slot * memslot,gfn_t gfn,const void * data,int offset,int len)312928bd726aSPeter Xu static int __kvm_write_guest_page(struct kvm *kvm,
313028bd726aSPeter Xu 				  struct kvm_memory_slot *memslot, gfn_t gfn,
31318e73485cSPaolo Bonzini 			          const void *data, int offset, int len)
31320fce5623SAvi Kivity {
31330fce5623SAvi Kivity 	int r;
31340fce5623SAvi Kivity 	unsigned long addr;
31350fce5623SAvi Kivity 
3136251eb841SRadim Krčmář 	addr = gfn_to_hva_memslot(memslot, gfn);
31370fce5623SAvi Kivity 	if (kvm_is_error_hva(addr))
31380fce5623SAvi Kivity 		return -EFAULT;
31398b0cedffSXiao Guangrong 	r = __copy_to_user((void __user *)addr + offset, data, len);
31400fce5623SAvi Kivity 	if (r)
31410fce5623SAvi Kivity 		return -EFAULT;
314228bd726aSPeter Xu 	mark_page_dirty_in_slot(kvm, memslot, gfn);
31430fce5623SAvi Kivity 	return 0;
31440fce5623SAvi Kivity }
31458e73485cSPaolo Bonzini 
kvm_write_guest_page(struct kvm * kvm,gfn_t gfn,const void * data,int offset,int len)31468e73485cSPaolo Bonzini int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
31478e73485cSPaolo Bonzini 			 const void *data, int offset, int len)
31488e73485cSPaolo Bonzini {
31498e73485cSPaolo Bonzini 	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
31508e73485cSPaolo Bonzini 
315128bd726aSPeter Xu 	return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
31528e73485cSPaolo Bonzini }
31530fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_write_guest_page);
31540fce5623SAvi Kivity 
kvm_vcpu_write_guest_page(struct kvm_vcpu * vcpu,gfn_t gfn,const void * data,int offset,int len)31558e73485cSPaolo Bonzini int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
31568e73485cSPaolo Bonzini 			      const void *data, int offset, int len)
31578e73485cSPaolo Bonzini {
31588e73485cSPaolo Bonzini 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
31598e73485cSPaolo Bonzini 
316028bd726aSPeter Xu 	return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
31618e73485cSPaolo Bonzini }
31628e73485cSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
31638e73485cSPaolo Bonzini 
kvm_write_guest(struct kvm * kvm,gpa_t gpa,const void * data,unsigned long len)31640fce5623SAvi Kivity int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
31650fce5623SAvi Kivity 		    unsigned long len)
31660fce5623SAvi Kivity {
31670fce5623SAvi Kivity 	gfn_t gfn = gpa >> PAGE_SHIFT;
31680fce5623SAvi Kivity 	int seg;
31690fce5623SAvi Kivity 	int offset = offset_in_page(gpa);
31700fce5623SAvi Kivity 	int ret;
31710fce5623SAvi Kivity 
31720fce5623SAvi Kivity 	while ((seg = next_segment(len, offset)) != 0) {
31730fce5623SAvi Kivity 		ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
31740fce5623SAvi Kivity 		if (ret < 0)
31750fce5623SAvi Kivity 			return ret;
31760fce5623SAvi Kivity 		offset = 0;
31770fce5623SAvi Kivity 		len -= seg;
31780fce5623SAvi Kivity 		data += seg;
31790fce5623SAvi Kivity 		++gfn;
31800fce5623SAvi Kivity 	}
31810fce5623SAvi Kivity 	return 0;
31820fce5623SAvi Kivity }
3183ff651cb6SWincy Van EXPORT_SYMBOL_GPL(kvm_write_guest);
31840fce5623SAvi Kivity 
kvm_vcpu_write_guest(struct kvm_vcpu * vcpu,gpa_t gpa,const void * data,unsigned long len)31858e73485cSPaolo Bonzini int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
31868e73485cSPaolo Bonzini 		         unsigned long len)
31878e73485cSPaolo Bonzini {
31888e73485cSPaolo Bonzini 	gfn_t gfn = gpa >> PAGE_SHIFT;
31898e73485cSPaolo Bonzini 	int seg;
31908e73485cSPaolo Bonzini 	int offset = offset_in_page(gpa);
31918e73485cSPaolo Bonzini 	int ret;
31928e73485cSPaolo Bonzini 
31938e73485cSPaolo Bonzini 	while ((seg = next_segment(len, offset)) != 0) {
31948e73485cSPaolo Bonzini 		ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
31958e73485cSPaolo Bonzini 		if (ret < 0)
31968e73485cSPaolo Bonzini 			return ret;
31978e73485cSPaolo Bonzini 		offset = 0;
31988e73485cSPaolo Bonzini 		len -= seg;
31998e73485cSPaolo Bonzini 		data += seg;
32008e73485cSPaolo Bonzini 		++gfn;
32018e73485cSPaolo Bonzini 	}
32028e73485cSPaolo Bonzini 	return 0;
32038e73485cSPaolo Bonzini }
32048e73485cSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
32058e73485cSPaolo Bonzini 
__kvm_gfn_to_hva_cache_init(struct kvm_memslots * slots,struct gfn_to_hva_cache * ghc,gpa_t gpa,unsigned long len)32065a2d4365SPaolo Bonzini static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
32075a2d4365SPaolo Bonzini 				       struct gfn_to_hva_cache *ghc,
32088f964525SAndrew Honig 				       gpa_t gpa, unsigned long len)
320949c7754cSGleb Natapov {
321049c7754cSGleb Natapov 	int offset = offset_in_page(gpa);
32118f964525SAndrew Honig 	gfn_t start_gfn = gpa >> PAGE_SHIFT;
32128f964525SAndrew Honig 	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
32138f964525SAndrew Honig 	gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
32148f964525SAndrew Honig 	gfn_t nr_pages_avail;
321549c7754cSGleb Natapov 
32166ad1e29fSSean Christopherson 	/* Update ghc->generation before performing any error checks. */
321749c7754cSGleb Natapov 	ghc->generation = slots->generation;
32186ad1e29fSSean Christopherson 
32196ad1e29fSSean Christopherson 	if (start_gfn > end_gfn) {
3220f1b9dd5eSJim Mattson 		ghc->hva = KVM_HVA_ERR_BAD;
32216ad1e29fSSean Christopherson 		return -EINVAL;
32226ad1e29fSSean Christopherson 	}
3223f1b9dd5eSJim Mattson 
32248f964525SAndrew Honig 	/*
32258f964525SAndrew Honig 	 * If the requested region crosses two memslots, we still
32268f964525SAndrew Honig 	 * verify that the entire region is valid here.
32278f964525SAndrew Honig 	 */
32286ad1e29fSSean Christopherson 	for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
32295a2d4365SPaolo Bonzini 		ghc->memslot = __gfn_to_memslot(slots, start_gfn);
32308f964525SAndrew Honig 		ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
32318f964525SAndrew Honig 					   &nr_pages_avail);
32328f964525SAndrew Honig 		if (kvm_is_error_hva(ghc->hva))
32336ad1e29fSSean Christopherson 			return -EFAULT;
32348f964525SAndrew Honig 	}
3235f1b9dd5eSJim Mattson 
32368f964525SAndrew Honig 	/* Use the slow path for cross page reads and writes. */
32376ad1e29fSSean Christopherson 	if (nr_pages_needed == 1)
3238f1b9dd5eSJim Mattson 		ghc->hva += offset;
3239f1b9dd5eSJim Mattson 	else
32408f964525SAndrew Honig 		ghc->memslot = NULL;
3241f1b9dd5eSJim Mattson 
32426ad1e29fSSean Christopherson 	ghc->gpa = gpa;
32436ad1e29fSSean Christopherson 	ghc->len = len;
32446ad1e29fSSean Christopherson 	return 0;
324549c7754cSGleb Natapov }
32465a2d4365SPaolo Bonzini 
kvm_gfn_to_hva_cache_init(struct kvm * kvm,struct gfn_to_hva_cache * ghc,gpa_t gpa,unsigned long len)32474e335d9eSPaolo Bonzini int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
32485a2d4365SPaolo Bonzini 			      gpa_t gpa, unsigned long len)
32495a2d4365SPaolo Bonzini {
32504e335d9eSPaolo Bonzini 	struct kvm_memslots *slots = kvm_memslots(kvm);
32515a2d4365SPaolo Bonzini 	return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
32525a2d4365SPaolo Bonzini }
32534e335d9eSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
325449c7754cSGleb Natapov 
kvm_write_guest_offset_cached(struct kvm * kvm,struct gfn_to_hva_cache * ghc,void * data,unsigned int offset,unsigned long len)32554e335d9eSPaolo Bonzini int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
32567a86dab8SJim Mattson 				  void *data, unsigned int offset,
32577a86dab8SJim Mattson 				  unsigned long len)
325849c7754cSGleb Natapov {
32594e335d9eSPaolo Bonzini 	struct kvm_memslots *slots = kvm_memslots(kvm);
326049c7754cSGleb Natapov 	int r;
32614ec6e863SPan Xinhui 	gpa_t gpa = ghc->gpa + offset;
326249c7754cSGleb Natapov 
32635f25e71eSPaolo Bonzini 	if (WARN_ON_ONCE(len + offset > ghc->len))
32645f25e71eSPaolo Bonzini 		return -EINVAL;
32658f964525SAndrew Honig 
3266dc9ce71eSSean Christopherson 	if (slots->generation != ghc->generation) {
3267dc9ce71eSSean Christopherson 		if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3268dc9ce71eSSean Christopherson 			return -EFAULT;
3269dc9ce71eSSean Christopherson 	}
32708f964525SAndrew Honig 
327149c7754cSGleb Natapov 	if (kvm_is_error_hva(ghc->hva))
327249c7754cSGleb Natapov 		return -EFAULT;
327349c7754cSGleb Natapov 
3274fcfbc617SSean Christopherson 	if (unlikely(!ghc->memslot))
3275fcfbc617SSean Christopherson 		return kvm_write_guest(kvm, gpa, data, len);
3276fcfbc617SSean Christopherson 
32774ec6e863SPan Xinhui 	r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
327849c7754cSGleb Natapov 	if (r)
327949c7754cSGleb Natapov 		return -EFAULT;
328028bd726aSPeter Xu 	mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
328149c7754cSGleb Natapov 
328249c7754cSGleb Natapov 	return 0;
328349c7754cSGleb Natapov }
32844e335d9eSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
32854ec6e863SPan Xinhui 
kvm_write_guest_cached(struct kvm * kvm,struct gfn_to_hva_cache * ghc,void * data,unsigned long len)32864e335d9eSPaolo Bonzini int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
32874ec6e863SPan Xinhui 			   void *data, unsigned long len)
32884ec6e863SPan Xinhui {
32894e335d9eSPaolo Bonzini 	return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
32904ec6e863SPan Xinhui }
32914e335d9eSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
329249c7754cSGleb Natapov 
kvm_read_guest_offset_cached(struct kvm * kvm,struct gfn_to_hva_cache * ghc,void * data,unsigned int offset,unsigned long len)32930958f0ceSVitaly Kuznetsov int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
32940958f0ceSVitaly Kuznetsov 				 void *data, unsigned int offset,
32950958f0ceSVitaly Kuznetsov 				 unsigned long len)
3296e03b644fSGleb Natapov {
32974e335d9eSPaolo Bonzini 	struct kvm_memslots *slots = kvm_memslots(kvm);
3298e03b644fSGleb Natapov 	int r;
32990958f0ceSVitaly Kuznetsov 	gpa_t gpa = ghc->gpa + offset;
3300e03b644fSGleb Natapov 
33015f25e71eSPaolo Bonzini 	if (WARN_ON_ONCE(len + offset > ghc->len))
33025f25e71eSPaolo Bonzini 		return -EINVAL;
33038f964525SAndrew Honig 
3304dc9ce71eSSean Christopherson 	if (slots->generation != ghc->generation) {
3305dc9ce71eSSean Christopherson 		if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3306dc9ce71eSSean Christopherson 			return -EFAULT;
3307dc9ce71eSSean Christopherson 	}
33088f964525SAndrew Honig 
3309e03b644fSGleb Natapov 	if (kvm_is_error_hva(ghc->hva))
3310e03b644fSGleb Natapov 		return -EFAULT;
3311e03b644fSGleb Natapov 
3312fcfbc617SSean Christopherson 	if (unlikely(!ghc->memslot))
33130958f0ceSVitaly Kuznetsov 		return kvm_read_guest(kvm, gpa, data, len);
3314fcfbc617SSean Christopherson 
33150958f0ceSVitaly Kuznetsov 	r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
3316e03b644fSGleb Natapov 	if (r)
3317e03b644fSGleb Natapov 		return -EFAULT;
3318e03b644fSGleb Natapov 
3319e03b644fSGleb Natapov 	return 0;
3320e03b644fSGleb Natapov }
33210958f0ceSVitaly Kuznetsov EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
33220958f0ceSVitaly Kuznetsov 
kvm_read_guest_cached(struct kvm * kvm,struct gfn_to_hva_cache * ghc,void * data,unsigned long len)33230958f0ceSVitaly Kuznetsov int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
33240958f0ceSVitaly Kuznetsov 			  void *data, unsigned long len)
33250958f0ceSVitaly Kuznetsov {
33260958f0ceSVitaly Kuznetsov 	return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
33270958f0ceSVitaly Kuznetsov }
33284e335d9eSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
3329e03b644fSGleb Natapov 
kvm_clear_guest(struct kvm * kvm,gpa_t gpa,unsigned long len)33300fce5623SAvi Kivity int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
33310fce5623SAvi Kivity {
33322f541442SPaolo Bonzini 	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
33330fce5623SAvi Kivity 	gfn_t gfn = gpa >> PAGE_SHIFT;
33340fce5623SAvi Kivity 	int seg;
33350fce5623SAvi Kivity 	int offset = offset_in_page(gpa);
33360fce5623SAvi Kivity 	int ret;
33370fce5623SAvi Kivity 
33380fce5623SAvi Kivity 	while ((seg = next_segment(len, offset)) != 0) {
33392f541442SPaolo Bonzini 		ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
33400fce5623SAvi Kivity 		if (ret < 0)
33410fce5623SAvi Kivity 			return ret;
33420fce5623SAvi Kivity 		offset = 0;
33430fce5623SAvi Kivity 		len -= seg;
33440fce5623SAvi Kivity 		++gfn;
33450fce5623SAvi Kivity 	}
33460fce5623SAvi Kivity 	return 0;
33470fce5623SAvi Kivity }
33480fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_clear_guest);
33490fce5623SAvi Kivity 
mark_page_dirty_in_slot(struct kvm * kvm,const struct kvm_memory_slot * memslot,gfn_t gfn)335028bd726aSPeter Xu void mark_page_dirty_in_slot(struct kvm *kvm,
33518283e36aSBen Gardon 			     const struct kvm_memory_slot *memslot,
335228bd726aSPeter Xu 		 	     gfn_t gfn)
33530fce5623SAvi Kivity {
33542efd61a6SDavid Woodhouse 	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
33552efd61a6SDavid Woodhouse 
3356e09fccb5SChristian Borntraeger #ifdef CONFIG_HAVE_KVM_DIRTY_RING
335786bdf3ebSGavin Shan 	if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
33582efd61a6SDavid Woodhouse 		return;
335986bdf3ebSGavin Shan 
3360c57351a7SGavin Shan 	WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
3361e09fccb5SChristian Borntraeger #endif
33622efd61a6SDavid Woodhouse 
3363044c59c4SPeter Xu 	if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
33640fce5623SAvi Kivity 		unsigned long rel_gfn = gfn - memslot->base_gfn;
3365fb04a1edSPeter Xu 		u32 slot = (memslot->as_id << 16) | memslot->id;
33660fce5623SAvi Kivity 
336786bdf3ebSGavin Shan 		if (kvm->dirty_ring_size && vcpu)
3368cf87ac73SGavin Shan 			kvm_dirty_ring_push(vcpu, slot, rel_gfn);
3369c57351a7SGavin Shan 		else if (memslot->dirty_bitmap)
3370b74ca3b3STakuya Yoshikawa 			set_bit_le(rel_gfn, memslot->dirty_bitmap);
33710fce5623SAvi Kivity 	}
33720fce5623SAvi Kivity }
3373a6a0b05dSBen Gardon EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
33740fce5623SAvi Kivity 
mark_page_dirty(struct kvm * kvm,gfn_t gfn)337549c7754cSGleb Natapov void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
337649c7754cSGleb Natapov {
337749c7754cSGleb Natapov 	struct kvm_memory_slot *memslot;
337849c7754cSGleb Natapov 
337949c7754cSGleb Natapov 	memslot = gfn_to_memslot(kvm, gfn);
338028bd726aSPeter Xu 	mark_page_dirty_in_slot(kvm, memslot, gfn);
338149c7754cSGleb Natapov }
33822ba9f0d8SAneesh Kumar K.V EXPORT_SYMBOL_GPL(mark_page_dirty);
338349c7754cSGleb Natapov 
kvm_vcpu_mark_page_dirty(struct kvm_vcpu * vcpu,gfn_t gfn)33848e73485cSPaolo Bonzini void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
33858e73485cSPaolo Bonzini {
33868e73485cSPaolo Bonzini 	struct kvm_memory_slot *memslot;
33878e73485cSPaolo Bonzini 
33888e73485cSPaolo Bonzini 	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
338928bd726aSPeter Xu 	mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
33908e73485cSPaolo Bonzini }
33918e73485cSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
33928e73485cSPaolo Bonzini 
kvm_sigset_activate(struct kvm_vcpu * vcpu)339320b7035cSJan H. Schönherr void kvm_sigset_activate(struct kvm_vcpu *vcpu)
339420b7035cSJan H. Schönherr {
339520b7035cSJan H. Schönherr 	if (!vcpu->sigset_active)
339620b7035cSJan H. Schönherr 		return;
339720b7035cSJan H. Schönherr 
339820b7035cSJan H. Schönherr 	/*
339920b7035cSJan H. Schönherr 	 * This does a lockless modification of ->real_blocked, which is fine
340020b7035cSJan H. Schönherr 	 * because, only current can change ->real_blocked and all readers of
340120b7035cSJan H. Schönherr 	 * ->real_blocked don't care as long ->real_blocked is always a subset
340220b7035cSJan H. Schönherr 	 * of ->blocked.
340320b7035cSJan H. Schönherr 	 */
340420b7035cSJan H. Schönherr 	sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
340520b7035cSJan H. Schönherr }
340620b7035cSJan H. Schönherr 
kvm_sigset_deactivate(struct kvm_vcpu * vcpu)340720b7035cSJan H. Schönherr void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
340820b7035cSJan H. Schönherr {
340920b7035cSJan H. Schönherr 	if (!vcpu->sigset_active)
341020b7035cSJan H. Schönherr 		return;
341120b7035cSJan H. Schönherr 
341220b7035cSJan H. Schönherr 	sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
341320b7035cSJan H. Schönherr 	sigemptyset(&current->real_blocked);
341420b7035cSJan H. Schönherr }
341520b7035cSJan H. Schönherr 
grow_halt_poll_ns(struct kvm_vcpu * vcpu)3416aca6ff29SWanpeng Li static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3417aca6ff29SWanpeng Li {
3418dee339b5SNir Weiner 	unsigned int old, val, grow, grow_start;
3419aca6ff29SWanpeng Li 
34202cbd7824SWanpeng Li 	old = val = vcpu->halt_poll_ns;
3421dee339b5SNir Weiner 	grow_start = READ_ONCE(halt_poll_ns_grow_start);
34226b6de68cSChristian Borntraeger 	grow = READ_ONCE(halt_poll_ns_grow);
34237fa08e71SNir Weiner 	if (!grow)
34247fa08e71SNir Weiner 		goto out;
34257fa08e71SNir Weiner 
34266b6de68cSChristian Borntraeger 	val *= grow;
3427dee339b5SNir Weiner 	if (val < grow_start)
3428dee339b5SNir Weiner 		val = grow_start;
3429aca6ff29SWanpeng Li 
3430aca6ff29SWanpeng Li 	vcpu->halt_poll_ns = val;
34317fa08e71SNir Weiner out:
34322cbd7824SWanpeng Li 	trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3433aca6ff29SWanpeng Li }
3434aca6ff29SWanpeng Li 
shrink_halt_poll_ns(struct kvm_vcpu * vcpu)3435aca6ff29SWanpeng Li static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3436aca6ff29SWanpeng Li {
3437ae232ea4SSergey Senozhatsky 	unsigned int old, val, shrink, grow_start;
3438aca6ff29SWanpeng Li 
34392cbd7824SWanpeng Li 	old = val = vcpu->halt_poll_ns;
34406b6de68cSChristian Borntraeger 	shrink = READ_ONCE(halt_poll_ns_shrink);
3441ae232ea4SSergey Senozhatsky 	grow_start = READ_ONCE(halt_poll_ns_grow_start);
34426b6de68cSChristian Borntraeger 	if (shrink == 0)
3443aca6ff29SWanpeng Li 		val = 0;
3444aca6ff29SWanpeng Li 	else
34456b6de68cSChristian Borntraeger 		val /= shrink;
3446aca6ff29SWanpeng Li 
3447ae232ea4SSergey Senozhatsky 	if (val < grow_start)
3448ae232ea4SSergey Senozhatsky 		val = 0;
3449ae232ea4SSergey Senozhatsky 
3450aca6ff29SWanpeng Li 	vcpu->halt_poll_ns = val;
34512cbd7824SWanpeng Li 	trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3452aca6ff29SWanpeng Li }
3453aca6ff29SWanpeng Li 
kvm_vcpu_check_block(struct kvm_vcpu * vcpu)3454f7819512SPaolo Bonzini static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3455f7819512SPaolo Bonzini {
345650c28f21SJunaid Shahid 	int ret = -EINTR;
345750c28f21SJunaid Shahid 	int idx = srcu_read_lock(&vcpu->kvm->srcu);
345850c28f21SJunaid Shahid 
3459c59fb127SPaolo Bonzini 	if (kvm_arch_vcpu_runnable(vcpu))
346050c28f21SJunaid Shahid 		goto out;
3461f7819512SPaolo Bonzini 	if (kvm_cpu_has_pending_timer(vcpu))
346250c28f21SJunaid Shahid 		goto out;
3463f7819512SPaolo Bonzini 	if (signal_pending(current))
346450c28f21SJunaid Shahid 		goto out;
3465084071d5SMarcelo Tosatti 	if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3466084071d5SMarcelo Tosatti 		goto out;
3467f7819512SPaolo Bonzini 
346850c28f21SJunaid Shahid 	ret = 0;
346950c28f21SJunaid Shahid out:
347050c28f21SJunaid Shahid 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
347150c28f21SJunaid Shahid 	return ret;
3472f7819512SPaolo Bonzini }
3473f7819512SPaolo Bonzini 
34740fce5623SAvi Kivity /*
3475fac42688SSean Christopherson  * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
3476fac42688SSean Christopherson  * pending.  This is mostly used when halting a vCPU, but may also be used
3477fac42688SSean Christopherson  * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
34780fce5623SAvi Kivity  */
kvm_vcpu_block(struct kvm_vcpu * vcpu)3479fac42688SSean Christopherson bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
34800fce5623SAvi Kivity {
3481fac42688SSean Christopherson 	struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
3482f7819512SPaolo Bonzini 	bool waited = false;
3483fac42688SSean Christopherson 
3484c3858335SJing Zhang 	vcpu->stat.generic.blocking = 1;
3485f7819512SPaolo Bonzini 
348618869f26SMaxim Levitsky 	preempt_disable();
348707ab0f8dSMarc Zyngier 	kvm_arch_vcpu_blocking(vcpu);
3488fac42688SSean Christopherson 	prepare_to_rcuwait(wait);
348918869f26SMaxim Levitsky 	preempt_enable();
349018869f26SMaxim Levitsky 
3491e5c239cfSMarcelo Tosatti 	for (;;) {
3492da4ad88cSDavidlohr Bueso 		set_current_state(TASK_INTERRUPTIBLE);
34930fce5623SAvi Kivity 
3494f7819512SPaolo Bonzini 		if (kvm_vcpu_check_block(vcpu) < 0)
3495e5c239cfSMarcelo Tosatti 			break;
3496e5c239cfSMarcelo Tosatti 
3497f7819512SPaolo Bonzini 		waited = true;
34980fce5623SAvi Kivity 		schedule();
34990fce5623SAvi Kivity 	}
3500fac42688SSean Christopherson 
350118869f26SMaxim Levitsky 	preempt_disable();
350218869f26SMaxim Levitsky 	finish_rcuwait(wait);
3503fac42688SSean Christopherson 	kvm_arch_vcpu_unblocking(vcpu);
350418869f26SMaxim Levitsky 	preempt_enable();
3505fac42688SSean Christopherson 
3506c3858335SJing Zhang 	vcpu->stat.generic.blocking = 0;
3507c3858335SJing Zhang 
3508fac42688SSean Christopherson 	return waited;
3509fac42688SSean Christopherson }
3510fac42688SSean Christopherson 
update_halt_poll_stats(struct kvm_vcpu * vcpu,ktime_t start,ktime_t end,bool success)351129e72893SSean Christopherson static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
351229e72893SSean Christopherson 					  ktime_t end, bool success)
35130fce5623SAvi Kivity {
351430c94347SSean Christopherson 	struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
351529e72893SSean Christopherson 	u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
351629e72893SSean Christopherson 
351730c94347SSean Christopherson 	++vcpu->stat.generic.halt_attempted_poll;
351830c94347SSean Christopherson 
351930c94347SSean Christopherson 	if (success) {
352030c94347SSean Christopherson 		++vcpu->stat.generic.halt_successful_poll;
352130c94347SSean Christopherson 
352230c94347SSean Christopherson 		if (!vcpu_valid_wakeup(vcpu))
352330c94347SSean Christopherson 			++vcpu->stat.generic.halt_poll_invalid;
352430c94347SSean Christopherson 
352530c94347SSean Christopherson 		stats->halt_poll_success_ns += poll_ns;
352630c94347SSean Christopherson 		KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
352730c94347SSean Christopherson 	} else {
352830c94347SSean Christopherson 		stats->halt_poll_fail_ns += poll_ns;
352930c94347SSean Christopherson 		KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
353030c94347SSean Christopherson 	}
3531e5c239cfSMarcelo Tosatti }
35320fce5623SAvi Kivity 
kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu * vcpu)3533175d5dc7SDavid Matlack static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
3534175d5dc7SDavid Matlack {
35359eb8ca04SDavid Matlack 	struct kvm *kvm = vcpu->kvm;
35369eb8ca04SDavid Matlack 
35379eb8ca04SDavid Matlack 	if (kvm->override_halt_poll_ns) {
35389eb8ca04SDavid Matlack 		/*
35399eb8ca04SDavid Matlack 		 * Ensure kvm->max_halt_poll_ns is not read before
35409eb8ca04SDavid Matlack 		 * kvm->override_halt_poll_ns.
35419eb8ca04SDavid Matlack 		 *
35429eb8ca04SDavid Matlack 		 * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
35439eb8ca04SDavid Matlack 		 */
35449eb8ca04SDavid Matlack 		smp_rmb();
35459eb8ca04SDavid Matlack 		return READ_ONCE(kvm->max_halt_poll_ns);
35469eb8ca04SDavid Matlack 	}
35479eb8ca04SDavid Matlack 
35489eb8ca04SDavid Matlack 	return READ_ONCE(halt_poll_ns);
3549175d5dc7SDavid Matlack }
3550175d5dc7SDavid Matlack 
3551fac42688SSean Christopherson /*
3552fac42688SSean Christopherson  * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc...  If halt
3553fac42688SSean Christopherson  * polling is enabled, busy wait for a short time before blocking to avoid the
3554fac42688SSean Christopherson  * expensive block+unblock sequence if a wake event arrives soon after the vCPU
3555fac42688SSean Christopherson  * is halted.
3556fac42688SSean Christopherson  */
kvm_vcpu_halt(struct kvm_vcpu * vcpu)355791b99ea7SSean Christopherson void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
35580fce5623SAvi Kivity {
3559175d5dc7SDavid Matlack 	unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
35606f390916SSean Christopherson 	bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
35610fce5623SAvi Kivity 	ktime_t start, cur, poll_end;
35620fce5623SAvi Kivity 	bool waited = false;
356397b6847aSDavid Matlack 	bool do_halt_poll;
356491b99ea7SSean Christopherson 	u64 halt_ns;
35650fce5623SAvi Kivity 
3566175d5dc7SDavid Matlack 	if (vcpu->halt_poll_ns > max_halt_poll_ns)
3567175d5dc7SDavid Matlack 		vcpu->halt_poll_ns = max_halt_poll_ns;
356897b6847aSDavid Matlack 
356997b6847aSDavid Matlack 	do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
357097b6847aSDavid Matlack 
35710fce5623SAvi Kivity 	start = cur = poll_end = ktime_get();
35728df6a61cSSean Christopherson 	if (do_halt_poll) {
3573109a9826SSean Christopherson 		ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
3574d255f4f2SZhai, Edwin 
3575d255f4f2SZhai, Edwin 		do {
357630c94347SSean Christopherson 			if (kvm_vcpu_check_block(vcpu) < 0)
35770fce5623SAvi Kivity 				goto out;
35780fce5623SAvi Kivity 			cpu_relax();
35790fce5623SAvi Kivity 			poll_end = cur = ktime_get();
35800fce5623SAvi Kivity 		} while (kvm_vcpu_can_poll(cur, stop));
35810fce5623SAvi Kivity 	}
35820fce5623SAvi Kivity 
3583fac42688SSean Christopherson 	waited = kvm_vcpu_block(vcpu);
3584f6c60d08SSean Christopherson 
3585f7819512SPaolo Bonzini 	cur = ktime_get();
358687bcc5faSJing Zhang 	if (waited) {
358787bcc5faSJing Zhang 		vcpu->stat.generic.halt_wait_ns +=
358887bcc5faSJing Zhang 			ktime_to_ns(cur) - ktime_to_ns(poll_end);
35898ccba534SJing Zhang 		KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
35908ccba534SJing Zhang 				ktime_to_ns(cur) - ktime_to_ns(poll_end));
359187bcc5faSJing Zhang 	}
3592f7819512SPaolo Bonzini out:
359391b99ea7SSean Christopherson 	/* The total time the vCPU was "halted", including polling time. */
359491b99ea7SSean Christopherson 	halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);
3595aca6ff29SWanpeng Li 
359629e72893SSean Christopherson 	/*
359729e72893SSean Christopherson 	 * Note, halt-polling is considered successful so long as the vCPU was
359829e72893SSean Christopherson 	 * never actually scheduled out, i.e. even if the wake event arrived
359929e72893SSean Christopherson 	 * after of the halt-polling loop itself, but before the full wait.
360029e72893SSean Christopherson 	 */
36018df6a61cSSean Christopherson 	if (do_halt_poll)
360229e72893SSean Christopherson 		update_halt_poll_stats(vcpu, start, poll_end, !waited);
3603cb953129SDavid Matlack 
36046f390916SSean Christopherson 	if (halt_poll_allowed) {
3605175d5dc7SDavid Matlack 		/* Recompute the max halt poll time in case it changed. */
3606175d5dc7SDavid Matlack 		max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
3607175d5dc7SDavid Matlack 
360844551b2fSWanpeng Li 		if (!vcpu_valid_wakeup(vcpu)) {
36092086d320SChristian Borntraeger 			shrink_halt_poll_ns(vcpu);
3610175d5dc7SDavid Matlack 		} else if (max_halt_poll_ns) {
361191b99ea7SSean Christopherson 			if (halt_ns <= vcpu->halt_poll_ns)
3612aca6ff29SWanpeng Li 				;
3613aca6ff29SWanpeng Li 			/* we had a long block, shrink polling */
3614acd05785SDavid Matlack 			else if (vcpu->halt_poll_ns &&
3615175d5dc7SDavid Matlack 				 halt_ns > max_halt_poll_ns)
3616aca6ff29SWanpeng Li 				shrink_halt_poll_ns(vcpu);
3617aca6ff29SWanpeng Li 			/* we had a short halt and our poll time is too small */
3618175d5dc7SDavid Matlack 			else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
3619175d5dc7SDavid Matlack 				 halt_ns < max_halt_poll_ns)
3620aca6ff29SWanpeng Li 				grow_halt_poll_ns(vcpu);
362144551b2fSWanpeng Li 		} else {
3622edb9272fSWanpeng Li 			vcpu->halt_poll_ns = 0;
362344551b2fSWanpeng Li 		}
362444551b2fSWanpeng Li 	}
3625aca6ff29SWanpeng Li 
362691b99ea7SSean Christopherson 	trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
36270fce5623SAvi Kivity }
362891b99ea7SSean Christopherson EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
36290fce5623SAvi Kivity 
kvm_vcpu_wake_up(struct kvm_vcpu * vcpu)3630178f02ffSRadim Krčmář bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3631b6d33834SChristoffer Dall {
3632d92a5d1cSSean Christopherson 	if (__kvm_vcpu_wake_up(vcpu)) {
3633d73eb57bSWanpeng Li 		WRITE_ONCE(vcpu->ready, true);
36340193cc90SJing Zhang 		++vcpu->stat.generic.halt_wakeup;
3635178f02ffSRadim Krčmář 		return true;
3636b6d33834SChristoffer Dall 	}
3637b6d33834SChristoffer Dall 
3638178f02ffSRadim Krčmář 	return false;
3639dd1a4cc1SRadim Krčmář }
3640dd1a4cc1SRadim Krčmář EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3641dd1a4cc1SRadim Krčmář 
36420266c894SPaolo Bonzini #ifndef CONFIG_S390
3643dd1a4cc1SRadim Krčmář /*
3644dd1a4cc1SRadim Krčmář  * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3645dd1a4cc1SRadim Krčmář  */
kvm_vcpu_kick(struct kvm_vcpu * vcpu)3646dd1a4cc1SRadim Krčmář void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3647dd1a4cc1SRadim Krčmář {
364885b64045SSean Christopherson 	int me, cpu;
3649dd1a4cc1SRadim Krčmář 
3650178f02ffSRadim Krčmář 	if (kvm_vcpu_wake_up(vcpu))
3651178f02ffSRadim Krčmář 		return;
3652178f02ffSRadim Krčmář 
3653aefdc2edSPaolo Bonzini 	me = get_cpu();
3654aefdc2edSPaolo Bonzini 	/*
3655aefdc2edSPaolo Bonzini 	 * The only state change done outside the vcpu mutex is IN_GUEST_MODE
3656aefdc2edSPaolo Bonzini 	 * to EXITING_GUEST_MODE.  Therefore the moderately expensive "should
3657aefdc2edSPaolo Bonzini 	 * kick" check does not need atomic operations if kvm_vcpu_kick is used
3658aefdc2edSPaolo Bonzini 	 * within the vCPU thread itself.
3659aefdc2edSPaolo Bonzini 	 */
3660aefdc2edSPaolo Bonzini 	if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
3661aefdc2edSPaolo Bonzini 		if (vcpu->mode == IN_GUEST_MODE)
3662aefdc2edSPaolo Bonzini 			WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
3663aefdc2edSPaolo Bonzini 		goto out;
3664aefdc2edSPaolo Bonzini 	}
3665aefdc2edSPaolo Bonzini 
366685b64045SSean Christopherson 	/*
366785b64045SSean Christopherson 	 * Note, the vCPU could get migrated to a different pCPU at any point
366885b64045SSean Christopherson 	 * after kvm_arch_vcpu_should_kick(), which could result in sending an
366985b64045SSean Christopherson 	 * IPI to the previous pCPU.  But, that's ok because the purpose of the
367085b64045SSean Christopherson 	 * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
367185b64045SSean Christopherson 	 * vCPU also requires it to leave IN_GUEST_MODE.
367285b64045SSean Christopherson 	 */
367385b64045SSean Christopherson 	if (kvm_arch_vcpu_should_kick(vcpu)) {
367485b64045SSean Christopherson 		cpu = READ_ONCE(vcpu->cpu);
3675b6d33834SChristoffer Dall 		if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3676b6d33834SChristoffer Dall 			smp_send_reschedule(cpu);
367785b64045SSean Christopherson 	}
3678aefdc2edSPaolo Bonzini out:
3679b6d33834SChristoffer Dall 	put_cpu();
3680b6d33834SChristoffer Dall }
3681a20ed54dSYang Zhang EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
36820266c894SPaolo Bonzini #endif /* !CONFIG_S390 */
3683b6d33834SChristoffer Dall 
kvm_vcpu_yield_to(struct kvm_vcpu * target)3684fa93384fSDan Carpenter int kvm_vcpu_yield_to(struct kvm_vcpu *target)
368541628d33SKonstantin Weitz {
368641628d33SKonstantin Weitz 	struct pid *pid;
368741628d33SKonstantin Weitz 	struct task_struct *task = NULL;
3688fa93384fSDan Carpenter 	int ret = 0;
368941628d33SKonstantin Weitz 
369041628d33SKonstantin Weitz 	rcu_read_lock();
369141628d33SKonstantin Weitz 	pid = rcu_dereference(target->pid);
369241628d33SKonstantin Weitz 	if (pid)
369327fbe64bSSam Bobroff 		task = get_pid_task(pid, PIDTYPE_PID);
369441628d33SKonstantin Weitz 	rcu_read_unlock();
369541628d33SKonstantin Weitz 	if (!task)
3696c45c528eSRaghavendra K T 		return ret;
3697c45c528eSRaghavendra K T 	ret = yield_to(task, 1);
369841628d33SKonstantin Weitz 	put_task_struct(task);
3699c45c528eSRaghavendra K T 
3700c45c528eSRaghavendra K T 	return ret;
370141628d33SKonstantin Weitz }
370241628d33SKonstantin Weitz EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
370341628d33SKonstantin Weitz 
370406e48c51SRaghavendra K T /*
370506e48c51SRaghavendra K T  * Helper that checks whether a VCPU is eligible for directed yield.
370606e48c51SRaghavendra K T  * Most eligible candidate to yield is decided by following heuristics:
370706e48c51SRaghavendra K T  *
370806e48c51SRaghavendra K T  *  (a) VCPU which has not done pl-exit or cpu relax intercepted recently
370906e48c51SRaghavendra K T  *  (preempted lock holder), indicated by @in_spin_loop.
3710656012c7SFuad Tabba  *  Set at the beginning and cleared at the end of interception/PLE handler.
371106e48c51SRaghavendra K T  *
371206e48c51SRaghavendra K T  *  (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
371306e48c51SRaghavendra K T  *  chance last time (mostly it has become eligible now since we have probably
371406e48c51SRaghavendra K T  *  yielded to lockholder in last iteration. This is done by toggling
371506e48c51SRaghavendra K T  *  @dy_eligible each time a VCPU checked for eligibility.)
371606e48c51SRaghavendra K T  *
371706e48c51SRaghavendra K T  *  Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
371806e48c51SRaghavendra K T  *  to preempted lock-holder could result in wrong VCPU selection and CPU
371906e48c51SRaghavendra K T  *  burning. Giving priority for a potential lock-holder increases lock
372006e48c51SRaghavendra K T  *  progress.
372106e48c51SRaghavendra K T  *
372206e48c51SRaghavendra K T  *  Since algorithm is based on heuristics, accessing another VCPU data without
372306e48c51SRaghavendra K T  *  locking does not harm. It may result in trying to yield to  same VCPU, fail
372406e48c51SRaghavendra K T  *  and continue with next VCPU and so on.
372506e48c51SRaghavendra K T  */
kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu * vcpu)37267940876eSStephen Hemminger static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
372706e48c51SRaghavendra K T {
37284a55dd72SScott Wood #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
372906e48c51SRaghavendra K T 	bool eligible;
373006e48c51SRaghavendra K T 
373106e48c51SRaghavendra K T 	eligible = !vcpu->spin_loop.in_spin_loop ||
373234656113SChristian Borntraeger 		    vcpu->spin_loop.dy_eligible;
373306e48c51SRaghavendra K T 
373406e48c51SRaghavendra K T 	if (vcpu->spin_loop.in_spin_loop)
373506e48c51SRaghavendra K T 		kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
373606e48c51SRaghavendra K T 
373706e48c51SRaghavendra K T 	return eligible;
37384a55dd72SScott Wood #else
37394a55dd72SScott Wood 	return true;
374006e48c51SRaghavendra K T #endif
37414a55dd72SScott Wood }
3742c45c528eSRaghavendra K T 
374317e433b5SWanpeng Li /*
374417e433b5SWanpeng Li  * Unlike kvm_arch_vcpu_runnable, this function is called outside
374517e433b5SWanpeng Li  * a vcpu_load/vcpu_put pair.  However, for most architectures
374617e433b5SWanpeng Li  * kvm_arch_vcpu_runnable does not require vcpu_load.
374717e433b5SWanpeng Li  */
kvm_arch_dy_runnable(struct kvm_vcpu * vcpu)374817e433b5SWanpeng Li bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
374917e433b5SWanpeng Li {
375017e433b5SWanpeng Li 	return kvm_arch_vcpu_runnable(vcpu);
375117e433b5SWanpeng Li }
375217e433b5SWanpeng Li 
vcpu_dy_runnable(struct kvm_vcpu * vcpu)375317e433b5SWanpeng Li static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
375417e433b5SWanpeng Li {
375517e433b5SWanpeng Li 	if (kvm_arch_dy_runnable(vcpu))
375617e433b5SWanpeng Li 		return true;
375717e433b5SWanpeng Li 
375817e433b5SWanpeng Li #ifdef CONFIG_KVM_ASYNC_PF
375917e433b5SWanpeng Li 	if (!list_empty_careful(&vcpu->async_pf.done))
376017e433b5SWanpeng Li 		return true;
376117e433b5SWanpeng Li #endif
376217e433b5SWanpeng Li 
376317e433b5SWanpeng Li 	return false;
376417e433b5SWanpeng Li }
376517e433b5SWanpeng Li 
kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu * vcpu)376652acd22fSWanpeng Li bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
376752acd22fSWanpeng Li {
376852acd22fSWanpeng Li 	return false;
376952acd22fSWanpeng Li }
377052acd22fSWanpeng Li 
kvm_vcpu_on_spin(struct kvm_vcpu * me,bool yield_to_kernel_mode)3771199b5763SLongpeng(Mike) void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
3772d255f4f2SZhai, Edwin {
3773217ece61SRik van Riel 	struct kvm *kvm = me->kvm;
3774217ece61SRik van Riel 	struct kvm_vcpu *vcpu;
3775a937ef95SBreno Leitao 	int last_boosted_vcpu;
377646808a4cSMarc Zyngier 	unsigned long i;
3777217ece61SRik van Riel 	int yielded = 0;
3778c45c528eSRaghavendra K T 	int try = 3;
3779217ece61SRik van Riel 	int pass;
3780d255f4f2SZhai, Edwin 
3781a937ef95SBreno Leitao 	last_boosted_vcpu = READ_ONCE(kvm->last_boosted_vcpu);
37824c088493SRaghavendra K T 	kvm_vcpu_set_in_spin_loop(me, true);
3783217ece61SRik van Riel 	/*
3784217ece61SRik van Riel 	 * We boost the priority of a VCPU that is runnable but not
3785217ece61SRik van Riel 	 * currently running, because it got preempted by something
3786217ece61SRik van Riel 	 * else and called schedule in __vcpu_run.  Hopefully that
3787217ece61SRik van Riel 	 * VCPU is holding the lock that we need and will release it.
3788217ece61SRik van Riel 	 * We approximate round-robin by starting at the last boosted VCPU.
3789217ece61SRik van Riel 	 */
3790c45c528eSRaghavendra K T 	for (pass = 0; pass < 2 && !yielded && try; pass++) {
3791217ece61SRik van Riel 		kvm_for_each_vcpu(i, vcpu, kvm) {
37925cfc2aabSRik van Riel 			if (!pass && i <= last_boosted_vcpu) {
3793217ece61SRik van Riel 				i = last_boosted_vcpu;
3794217ece61SRik van Riel 				continue;
3795217ece61SRik van Riel 			} else if (pass && i > last_boosted_vcpu)
3796217ece61SRik van Riel 				break;
3797d73eb57bSWanpeng Li 			if (!READ_ONCE(vcpu->ready))
37987bc7ae25SRaghavendra K T 				continue;
3799217ece61SRik van Riel 			if (vcpu == me)
3800217ece61SRik van Riel 				continue;
3801d92a5d1cSSean Christopherson 			if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
3802217ece61SRik van Riel 				continue;
3803046ddeedSWanpeng Li 			if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
380452acd22fSWanpeng Li 			    !kvm_arch_dy_has_pending_interrupt(vcpu) &&
3805046ddeedSWanpeng Li 			    !kvm_arch_vcpu_in_kernel(vcpu))
3806199b5763SLongpeng(Mike) 				continue;
380706e48c51SRaghavendra K T 			if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
380806e48c51SRaghavendra K T 				continue;
3809c45c528eSRaghavendra K T 
3810c45c528eSRaghavendra K T 			yielded = kvm_vcpu_yield_to(vcpu);
3811c45c528eSRaghavendra K T 			if (yielded > 0) {
3812a937ef95SBreno Leitao 				WRITE_ONCE(kvm->last_boosted_vcpu, i);
3813c45c528eSRaghavendra K T 				break;
3814c45c528eSRaghavendra K T 			} else if (yielded < 0) {
3815c45c528eSRaghavendra K T 				try--;
3816c45c528eSRaghavendra K T 				if (!try)
3817217ece61SRik van Riel 					break;
3818217ece61SRik van Riel 			}
3819217ece61SRik van Riel 		}
3820217ece61SRik van Riel 	}
38214c088493SRaghavendra K T 	kvm_vcpu_set_in_spin_loop(me, false);
382206e48c51SRaghavendra K T 
382306e48c51SRaghavendra K T 	/* Ensure vcpu is not eligible during next spinloop */
382406e48c51SRaghavendra K T 	kvm_vcpu_set_dy_eligible(me, false);
3825d255f4f2SZhai, Edwin }
3826d255f4f2SZhai, Edwin EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3827d255f4f2SZhai, Edwin 
kvm_page_in_dirty_ring(struct kvm * kvm,unsigned long pgoff)3828fb04a1edSPeter Xu static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3829fb04a1edSPeter Xu {
3830dc70ec21SDavid Woodhouse #ifdef CONFIG_HAVE_KVM_DIRTY_RING
3831fb04a1edSPeter Xu 	return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3832fb04a1edSPeter Xu 	    (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3833fb04a1edSPeter Xu 	     kvm->dirty_ring_size / PAGE_SIZE);
3834fb04a1edSPeter Xu #else
3835fb04a1edSPeter Xu 	return false;
3836fb04a1edSPeter Xu #endif
3837fb04a1edSPeter Xu }
3838fb04a1edSPeter Xu 
kvm_vcpu_fault(struct vm_fault * vmf)38391499fa80SSouptick Joarder static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
38400fce5623SAvi Kivity {
384111bac800SDave Jiang 	struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
38420fce5623SAvi Kivity 	struct page *page;
38430fce5623SAvi Kivity 
38440fce5623SAvi Kivity 	if (vmf->pgoff == 0)
38450fce5623SAvi Kivity 		page = virt_to_page(vcpu->run);
384609566765SAvi Kivity #ifdef CONFIG_X86
38470fce5623SAvi Kivity 	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
38480fce5623SAvi Kivity 		page = virt_to_page(vcpu->arch.pio_data);
384909566765SAvi Kivity #endif
38504b4357e0SPaolo Bonzini #ifdef CONFIG_KVM_MMIO
38515f94c174SLaurent Vivier 	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
38525f94c174SLaurent Vivier 		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
38535f94c174SLaurent Vivier #endif
3854fb04a1edSPeter Xu 	else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
3855fb04a1edSPeter Xu 		page = kvm_dirty_ring_get_page(
3856fb04a1edSPeter Xu 		    &vcpu->dirty_ring,
3857fb04a1edSPeter Xu 		    vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
38580fce5623SAvi Kivity 	else
38595b1c1493SCarsten Otte 		return kvm_arch_vcpu_fault(vcpu, vmf);
38600fce5623SAvi Kivity 	get_page(page);
38610fce5623SAvi Kivity 	vmf->page = page;
38620fce5623SAvi Kivity 	return 0;
38630fce5623SAvi Kivity }
38640fce5623SAvi Kivity 
3865f0f37e2fSAlexey Dobriyan static const struct vm_operations_struct kvm_vcpu_vm_ops = {
38660fce5623SAvi Kivity 	.fault = kvm_vcpu_fault,
38670fce5623SAvi Kivity };
38680fce5623SAvi Kivity 
kvm_vcpu_mmap(struct file * file,struct vm_area_struct * vma)38690fce5623SAvi Kivity static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
38700fce5623SAvi Kivity {
3871fb04a1edSPeter Xu 	struct kvm_vcpu *vcpu = file->private_data;
387211476d27SYang Li 	unsigned long pages = vma_pages(vma);
3873fb04a1edSPeter Xu 
3874fb04a1edSPeter Xu 	if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3875fb04a1edSPeter Xu 	     kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3876fb04a1edSPeter Xu 	    ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3877fb04a1edSPeter Xu 		return -EINVAL;
3878fb04a1edSPeter Xu 
38790fce5623SAvi Kivity 	vma->vm_ops = &kvm_vcpu_vm_ops;
38800fce5623SAvi Kivity 	return 0;
38810fce5623SAvi Kivity }
38820fce5623SAvi Kivity 
kvm_vcpu_release(struct inode * inode,struct file * filp)38830fce5623SAvi Kivity static int kvm_vcpu_release(struct inode *inode, struct file *filp)
38840fce5623SAvi Kivity {
38850fce5623SAvi Kivity 	struct kvm_vcpu *vcpu = filp->private_data;
38860fce5623SAvi Kivity 
388766c0b394SAl Viro 	kvm_put_kvm(vcpu->kvm);
38880fce5623SAvi Kivity 	return 0;
38890fce5623SAvi Kivity }
38900fce5623SAvi Kivity 
389170375c2dSDavid Matlack static const struct file_operations kvm_vcpu_fops = {
38920fce5623SAvi Kivity 	.release        = kvm_vcpu_release,
38930fce5623SAvi Kivity 	.unlocked_ioctl = kvm_vcpu_ioctl,
38940fce5623SAvi Kivity 	.mmap           = kvm_vcpu_mmap,
38956038f373SArnd Bergmann 	.llseek		= noop_llseek,
38967ddfd3e0SMarc Zyngier 	KVM_COMPAT(kvm_vcpu_compat_ioctl),
38970fce5623SAvi Kivity };
38980fce5623SAvi Kivity 
38990fce5623SAvi Kivity /*
39000fce5623SAvi Kivity  * Allocates an inode for the vcpu.
39010fce5623SAvi Kivity  */
create_vcpu_fd(struct kvm_vcpu * vcpu)39020fce5623SAvi Kivity static int create_vcpu_fd(struct kvm_vcpu *vcpu)
39030fce5623SAvi Kivity {
3904e46b4692SMasatake YAMATO 	char name[8 + 1 + ITOA_MAX_LEN + 1];
3905e46b4692SMasatake YAMATO 
3906e46b4692SMasatake YAMATO 	snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3907e46b4692SMasatake YAMATO 	return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
39080fce5623SAvi Kivity }
39090fce5623SAvi Kivity 
3910e36de87dSVineeth Pillai #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
vcpu_get_pid(void * data,u64 * val)3911e36de87dSVineeth Pillai static int vcpu_get_pid(void *data, u64 *val)
3912e36de87dSVineeth Pillai {
391314aa40a1SLi kunyu 	struct kvm_vcpu *vcpu = data;
391476021e96SSean Christopherson 
391576021e96SSean Christopherson 	rcu_read_lock();
391676021e96SSean Christopherson 	*val = pid_nr(rcu_dereference(vcpu->pid));
391776021e96SSean Christopherson 	rcu_read_unlock();
3918e36de87dSVineeth Pillai 	return 0;
3919e36de87dSVineeth Pillai }
3920e36de87dSVineeth Pillai 
3921e36de87dSVineeth Pillai DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
3922e36de87dSVineeth Pillai 
kvm_create_vcpu_debugfs(struct kvm_vcpu * vcpu)39233e7093d0SGreg KH static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
392445b5939eSLuiz Capitulino {
3925d56f5136SPaolo Bonzini 	struct dentry *debugfs_dentry;
392645b5939eSLuiz Capitulino 	char dir_name[ITOA_MAX_LEN * 2];
392745b5939eSLuiz Capitulino 
392845b5939eSLuiz Capitulino 	if (!debugfs_initialized())
39293e7093d0SGreg KH 		return;
393045b5939eSLuiz Capitulino 
393145b5939eSLuiz Capitulino 	snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
3932d56f5136SPaolo Bonzini 	debugfs_dentry = debugfs_create_dir(dir_name,
393345b5939eSLuiz Capitulino 					    vcpu->kvm->debugfs_dentry);
3934e36de87dSVineeth Pillai 	debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
3935e36de87dSVineeth Pillai 			    &vcpu_get_pid_fops);
393645b5939eSLuiz Capitulino 
3937d56f5136SPaolo Bonzini 	kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
393845b5939eSLuiz Capitulino }
3939e36de87dSVineeth Pillai #endif
394045b5939eSLuiz Capitulino 
39410fce5623SAvi Kivity /*
39420fce5623SAvi Kivity  * Creates some virtual cpus.  Good luck creating more than one.
39430fce5623SAvi Kivity  */
kvm_vm_ioctl_create_vcpu(struct kvm * kvm,u32 id)394473880c80SGleb Natapov static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
39450fce5623SAvi Kivity {
39460fce5623SAvi Kivity 	int r;
3947e09fefdeSDavid Hildenbrand 	struct kvm_vcpu *vcpu;
39488bd826d6SSean Christopherson 	struct page *page;
39490fce5623SAvi Kivity 
3950a1c42ddeSJuergen Gross 	if (id >= KVM_MAX_VCPU_IDS)
3951338c7dbaSAndy Honig 		return -EINVAL;
3952338c7dbaSAndy Honig 
39536c7caebcSPaolo Bonzini 	mutex_lock(&kvm->lock);
3954f502cc56SSean Christopherson 	if (kvm->created_vcpus >= kvm->max_vcpus) {
39556c7caebcSPaolo Bonzini 		mutex_unlock(&kvm->lock);
39566c7caebcSPaolo Bonzini 		return -EINVAL;
39576c7caebcSPaolo Bonzini 	}
39586c7caebcSPaolo Bonzini 
39591d5e740dSZeng Guang 	r = kvm_arch_vcpu_precreate(kvm, id);
39601d5e740dSZeng Guang 	if (r) {
39611d5e740dSZeng Guang 		mutex_unlock(&kvm->lock);
39621d5e740dSZeng Guang 		return r;
39631d5e740dSZeng Guang 	}
39641d5e740dSZeng Guang 
39656c7caebcSPaolo Bonzini 	kvm->created_vcpus++;
39666c7caebcSPaolo Bonzini 	mutex_unlock(&kvm->lock);
39676c7caebcSPaolo Bonzini 
396885f47930SSean Christopherson 	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
3969e529ef66SSean Christopherson 	if (!vcpu) {
3970e529ef66SSean Christopherson 		r = -ENOMEM;
39716c7caebcSPaolo Bonzini 		goto vcpu_decrement;
39726c7caebcSPaolo Bonzini 	}
39730fce5623SAvi Kivity 
3974fcd97ad5SPeter Xu 	BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
397593bb59caSShakeel Butt 	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
39768bd826d6SSean Christopherson 	if (!page) {
39778bd826d6SSean Christopherson 		r = -ENOMEM;
3978e529ef66SSean Christopherson 		goto vcpu_free;
39798bd826d6SSean Christopherson 	}
39808bd826d6SSean Christopherson 	vcpu->run = page_address(page);
39818bd826d6SSean Christopherson 
39828bd826d6SSean Christopherson 	kvm_vcpu_init(vcpu, kvm, id);
3983e529ef66SSean Christopherson 
3984e529ef66SSean Christopherson 	r = kvm_arch_vcpu_create(vcpu);
3985e529ef66SSean Christopherson 	if (r)
39868bd826d6SSean Christopherson 		goto vcpu_free_run_page;
3987e529ef66SSean Christopherson 
3988fb04a1edSPeter Xu 	if (kvm->dirty_ring_size) {
3989fb04a1edSPeter Xu 		r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
3990fb04a1edSPeter Xu 					 id, kvm->dirty_ring_size);
3991fb04a1edSPeter Xu 		if (r)
3992fb04a1edSPeter Xu 			goto arch_vcpu_destroy;
3993fb04a1edSPeter Xu 	}
3994fb04a1edSPeter Xu 
39950fce5623SAvi Kivity 	mutex_lock(&kvm->lock);
399642a90008SDavid Woodhouse 
399742a90008SDavid Woodhouse #ifdef CONFIG_LOCKDEP
399842a90008SDavid Woodhouse 	/* Ensure that lockdep knows vcpu->mutex is taken *inside* kvm->lock */
399942a90008SDavid Woodhouse 	mutex_lock(&vcpu->mutex);
400042a90008SDavid Woodhouse 	mutex_unlock(&vcpu->mutex);
400142a90008SDavid Woodhouse #endif
400242a90008SDavid Woodhouse 
4003e09fefdeSDavid Hildenbrand 	if (kvm_get_vcpu_by_id(kvm, id)) {
40040fce5623SAvi Kivity 		r = -EEXIST;
4005d780592bSJan Kiszka 		goto unlock_vcpu_destroy;
40060fce5623SAvi Kivity 	}
400773880c80SGleb Natapov 
40088750e72aSRadim Krčmář 	vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
4009afb2acb2SMichal Luczaj 	r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT);
4010c5b07754SMarc Zyngier 	if (r)
4011c5b07754SMarc Zyngier 		goto unlock_vcpu_destroy;
40120fce5623SAvi Kivity 
40130fce5623SAvi Kivity 	/* Now it's all set up, let userspace reach it */
401466c0b394SAl Viro 	kvm_get_kvm(kvm);
40150fce5623SAvi Kivity 	r = create_vcpu_fd(vcpu);
4016afb2acb2SMichal Luczaj 	if (r < 0)
4017afb2acb2SMichal Luczaj 		goto kvm_put_xa_release;
4018afb2acb2SMichal Luczaj 
40195f643e46SMichal Luczaj 	if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
4020afb2acb2SMichal Luczaj 		r = -EINVAL;
4021afb2acb2SMichal Luczaj 		goto kvm_put_xa_release;
402273880c80SGleb Natapov 	}
402373880c80SGleb Natapov 
4024dd489240SPaolo Bonzini 	/*
4025c5b07754SMarc Zyngier 	 * Pairs with smp_rmb() in kvm_get_vcpu.  Store the vcpu
4026c5b07754SMarc Zyngier 	 * pointer before kvm->online_vcpu's incremented value.
4027dd489240SPaolo Bonzini 	 */
402873880c80SGleb Natapov 	smp_wmb();
402973880c80SGleb Natapov 	atomic_inc(&kvm->online_vcpus);
403073880c80SGleb Natapov 
403173880c80SGleb Natapov 	mutex_unlock(&kvm->lock);
403242897d86SMarcelo Tosatti 	kvm_arch_vcpu_postcreate(vcpu);
403363d04348SPaolo Bonzini 	kvm_create_vcpu_debugfs(vcpu);
40340fce5623SAvi Kivity 	return r;
40350fce5623SAvi Kivity 
4036afb2acb2SMichal Luczaj kvm_put_xa_release:
4037afb2acb2SMichal Luczaj 	kvm_put_kvm_no_destroy(kvm);
4038afb2acb2SMichal Luczaj 	xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
4039d780592bSJan Kiszka unlock_vcpu_destroy:
40407d8fece6SGlauber Costa 	mutex_unlock(&kvm->lock);
4041fb04a1edSPeter Xu 	kvm_dirty_ring_free(&vcpu->dirty_ring);
4042fb04a1edSPeter Xu arch_vcpu_destroy:
40430fce5623SAvi Kivity 	kvm_arch_vcpu_destroy(vcpu);
40448bd826d6SSean Christopherson vcpu_free_run_page:
40458bd826d6SSean Christopherson 	free_page((unsigned long)vcpu->run);
4046e529ef66SSean Christopherson vcpu_free:
4047e529ef66SSean Christopherson 	kmem_cache_free(kvm_vcpu_cache, vcpu);
40486c7caebcSPaolo Bonzini vcpu_decrement:
40496c7caebcSPaolo Bonzini 	mutex_lock(&kvm->lock);
40506c7caebcSPaolo Bonzini 	kvm->created_vcpus--;
40516c7caebcSPaolo Bonzini 	mutex_unlock(&kvm->lock);
40520fce5623SAvi Kivity 	return r;
40530fce5623SAvi Kivity }
40540fce5623SAvi Kivity 
kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu * vcpu,sigset_t * sigset)40550fce5623SAvi Kivity static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
40560fce5623SAvi Kivity {
40570fce5623SAvi Kivity 	if (sigset) {
40580fce5623SAvi Kivity 		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
40590fce5623SAvi Kivity 		vcpu->sigset_active = 1;
40600fce5623SAvi Kivity 		vcpu->sigset = *sigset;
40610fce5623SAvi Kivity 	} else
40620fce5623SAvi Kivity 		vcpu->sigset_active = 0;
40630fce5623SAvi Kivity 	return 0;
40640fce5623SAvi Kivity }
40650fce5623SAvi Kivity 
kvm_vcpu_stats_read(struct file * file,char __user * user_buffer,size_t size,loff_t * offset)4066ce55c049SJing Zhang static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
4067ce55c049SJing Zhang 			      size_t size, loff_t *offset)
4068ce55c049SJing Zhang {
4069ce55c049SJing Zhang 	struct kvm_vcpu *vcpu = file->private_data;
4070ce55c049SJing Zhang 
4071ce55c049SJing Zhang 	return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
4072ce55c049SJing Zhang 			&kvm_vcpu_stats_desc[0], &vcpu->stat,
4073ce55c049SJing Zhang 			sizeof(vcpu->stat), user_buffer, size, offset);
4074ce55c049SJing Zhang }
4075ce55c049SJing Zhang 
kvm_vcpu_stats_release(struct inode * inode,struct file * file)4076eed3013fSSean Christopherson static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
4077eed3013fSSean Christopherson {
4078eed3013fSSean Christopherson 	struct kvm_vcpu *vcpu = file->private_data;
4079eed3013fSSean Christopherson 
4080eed3013fSSean Christopherson 	kvm_put_kvm(vcpu->kvm);
4081eed3013fSSean Christopherson 	return 0;
4082eed3013fSSean Christopherson }
4083eed3013fSSean Christopherson 
4084ce55c049SJing Zhang static const struct file_operations kvm_vcpu_stats_fops = {
4085ce55c049SJing Zhang 	.read = kvm_vcpu_stats_read,
4086eed3013fSSean Christopherson 	.release = kvm_vcpu_stats_release,
4087ce55c049SJing Zhang 	.llseek = noop_llseek,
4088ce55c049SJing Zhang };
4089ce55c049SJing Zhang 
kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu * vcpu)4090ce55c049SJing Zhang static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
4091ce55c049SJing Zhang {
4092ce55c049SJing Zhang 	int fd;
4093ce55c049SJing Zhang 	struct file *file;
4094ce55c049SJing Zhang 	char name[15 + ITOA_MAX_LEN + 1];
4095ce55c049SJing Zhang 
4096ce55c049SJing Zhang 	snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
4097ce55c049SJing Zhang 
4098ce55c049SJing Zhang 	fd = get_unused_fd_flags(O_CLOEXEC);
4099ce55c049SJing Zhang 	if (fd < 0)
4100ce55c049SJing Zhang 		return fd;
4101ce55c049SJing Zhang 
4102ce55c049SJing Zhang 	file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
4103ce55c049SJing Zhang 	if (IS_ERR(file)) {
4104ce55c049SJing Zhang 		put_unused_fd(fd);
4105ce55c049SJing Zhang 		return PTR_ERR(file);
4106ce55c049SJing Zhang 	}
4107eed3013fSSean Christopherson 
4108eed3013fSSean Christopherson 	kvm_get_kvm(vcpu->kvm);
4109eed3013fSSean Christopherson 
4110ce55c049SJing Zhang 	file->f_mode |= FMODE_PREAD;
4111ce55c049SJing Zhang 	fd_install(fd, file);
4112ce55c049SJing Zhang 
4113ce55c049SJing Zhang 	return fd;
4114ce55c049SJing Zhang }
4115ce55c049SJing Zhang 
kvm_vcpu_ioctl(struct file * filp,unsigned int ioctl,unsigned long arg)41160fce5623SAvi Kivity static long kvm_vcpu_ioctl(struct file *filp,
41170fce5623SAvi Kivity 			   unsigned int ioctl, unsigned long arg)
41180fce5623SAvi Kivity {
41190fce5623SAvi Kivity 	struct kvm_vcpu *vcpu = filp->private_data;
41200fce5623SAvi Kivity 	void __user *argp = (void __user *)arg;
41210fce5623SAvi Kivity 	int r;
4122fa3795a7SDave Hansen 	struct kvm_fpu *fpu = NULL;
4123fa3795a7SDave Hansen 	struct kvm_sregs *kvm_sregs = NULL;
41240fce5623SAvi Kivity 
4125f4d31653SPaolo Bonzini 	if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
41260fce5623SAvi Kivity 		return -EIO;
41272122ff5eSAvi Kivity 
41282ea75be3SDavid Matlack 	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
41292ea75be3SDavid Matlack 		return -EINVAL;
41302ea75be3SDavid Matlack 
41312122ff5eSAvi Kivity 	/*
41325cb0944cSPaolo Bonzini 	 * Some architectures have vcpu ioctls that are asynchronous to vcpu
41335cb0944cSPaolo Bonzini 	 * execution; mutex_lock() would break them.
41342122ff5eSAvi Kivity 	 */
41355cb0944cSPaolo Bonzini 	r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
41365cb0944cSPaolo Bonzini 	if (r != -ENOIOCTLCMD)
41379fc77441SMichael S. Tsirkin 		return r;
41382122ff5eSAvi Kivity 
4139ec7660ccSChristoffer Dall 	if (mutex_lock_killable(&vcpu->mutex))
4140ec7660ccSChristoffer Dall 		return -EINTR;
41410fce5623SAvi Kivity 	switch (ioctl) {
41420e4524a5SChristian Borntraeger 	case KVM_RUN: {
41430e4524a5SChristian Borntraeger 		struct pid *oldpid;
41440fce5623SAvi Kivity 		r = -EINVAL;
41450fce5623SAvi Kivity 		if (arg)
41460fce5623SAvi Kivity 			goto out;
41470e4524a5SChristian Borntraeger 		oldpid = rcu_access_pointer(vcpu->pid);
414871dbc8a9SEric W. Biederman 		if (unlikely(oldpid != task_pid(current))) {
41497a72f7a1SChristian Borntraeger 			/* The thread running this VCPU changed. */
4150bd2a6394SChristoffer Dall 			struct pid *newpid;
4151f95ef0cdSXiubo Li 
4152bd2a6394SChristoffer Dall 			r = kvm_arch_vcpu_run_pid_change(vcpu);
4153bd2a6394SChristoffer Dall 			if (r)
4154bd2a6394SChristoffer Dall 				break;
4155bd2a6394SChristoffer Dall 
4156bd2a6394SChristoffer Dall 			newpid = get_task_pid(current, PIDTYPE_PID);
41577a72f7a1SChristian Borntraeger 			rcu_assign_pointer(vcpu->pid, newpid);
41587a72f7a1SChristian Borntraeger 			if (oldpid)
41597a72f7a1SChristian Borntraeger 				synchronize_rcu();
41607a72f7a1SChristian Borntraeger 			put_pid(oldpid);
41617a72f7a1SChristian Borntraeger 		}
41621b94f6f8STianjia Zhang 		r = kvm_arch_vcpu_ioctl_run(vcpu);
416364be5007SGleb Natapov 		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
41640fce5623SAvi Kivity 		break;
41650e4524a5SChristian Borntraeger 	}
41660fce5623SAvi Kivity 	case KVM_GET_REGS: {
41673e4bb3acSXiantao Zhang 		struct kvm_regs *kvm_regs;
41680fce5623SAvi Kivity 
41693e4bb3acSXiantao Zhang 		r = -ENOMEM;
4170b12ce36aSBen Gardon 		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
41713e4bb3acSXiantao Zhang 		if (!kvm_regs)
41723e4bb3acSXiantao Zhang 			goto out;
41733e4bb3acSXiantao Zhang 		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
41740fce5623SAvi Kivity 		if (r)
41753e4bb3acSXiantao Zhang 			goto out_free1;
41760fce5623SAvi Kivity 		r = -EFAULT;
41773e4bb3acSXiantao Zhang 		if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
41783e4bb3acSXiantao Zhang 			goto out_free1;
41790fce5623SAvi Kivity 		r = 0;
41803e4bb3acSXiantao Zhang out_free1:
41813e4bb3acSXiantao Zhang 		kfree(kvm_regs);
41820fce5623SAvi Kivity 		break;
41830fce5623SAvi Kivity 	}
41840fce5623SAvi Kivity 	case KVM_SET_REGS: {
41853e4bb3acSXiantao Zhang 		struct kvm_regs *kvm_regs;
41860fce5623SAvi Kivity 
4187ff5c2c03SSasha Levin 		kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
4188ff5c2c03SSasha Levin 		if (IS_ERR(kvm_regs)) {
4189ff5c2c03SSasha Levin 			r = PTR_ERR(kvm_regs);
41903e4bb3acSXiantao Zhang 			goto out;
4191ff5c2c03SSasha Levin 		}
41923e4bb3acSXiantao Zhang 		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
41933e4bb3acSXiantao Zhang 		kfree(kvm_regs);
41940fce5623SAvi Kivity 		break;
41950fce5623SAvi Kivity 	}
41960fce5623SAvi Kivity 	case KVM_GET_SREGS: {
4197b12ce36aSBen Gardon 		kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
4198b12ce36aSBen Gardon 				    GFP_KERNEL_ACCOUNT);
4199fa3795a7SDave Hansen 		r = -ENOMEM;
4200fa3795a7SDave Hansen 		if (!kvm_sregs)
4201fa3795a7SDave Hansen 			goto out;
4202fa3795a7SDave Hansen 		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
42030fce5623SAvi Kivity 		if (r)
42040fce5623SAvi Kivity 			goto out;
42050fce5623SAvi Kivity 		r = -EFAULT;
4206fa3795a7SDave Hansen 		if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
42070fce5623SAvi Kivity 			goto out;
42080fce5623SAvi Kivity 		r = 0;
42090fce5623SAvi Kivity 		break;
42100fce5623SAvi Kivity 	}
42110fce5623SAvi Kivity 	case KVM_SET_SREGS: {
4212ff5c2c03SSasha Levin 		kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
4213ff5c2c03SSasha Levin 		if (IS_ERR(kvm_sregs)) {
4214ff5c2c03SSasha Levin 			r = PTR_ERR(kvm_sregs);
421518595411SGuo Chao 			kvm_sregs = NULL;
42160fce5623SAvi Kivity 			goto out;
4217ff5c2c03SSasha Levin 		}
4218fa3795a7SDave Hansen 		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
42190fce5623SAvi Kivity 		break;
42200fce5623SAvi Kivity 	}
422162d9f0dbSMarcelo Tosatti 	case KVM_GET_MP_STATE: {
422262d9f0dbSMarcelo Tosatti 		struct kvm_mp_state mp_state;
422362d9f0dbSMarcelo Tosatti 
422462d9f0dbSMarcelo Tosatti 		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
422562d9f0dbSMarcelo Tosatti 		if (r)
422662d9f0dbSMarcelo Tosatti 			goto out;
422762d9f0dbSMarcelo Tosatti 		r = -EFAULT;
4228893bdbf1SXiubo Li 		if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
422962d9f0dbSMarcelo Tosatti 			goto out;
423062d9f0dbSMarcelo Tosatti 		r = 0;
423162d9f0dbSMarcelo Tosatti 		break;
423262d9f0dbSMarcelo Tosatti 	}
423362d9f0dbSMarcelo Tosatti 	case KVM_SET_MP_STATE: {
423462d9f0dbSMarcelo Tosatti 		struct kvm_mp_state mp_state;
423562d9f0dbSMarcelo Tosatti 
423662d9f0dbSMarcelo Tosatti 		r = -EFAULT;
4237893bdbf1SXiubo Li 		if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
423862d9f0dbSMarcelo Tosatti 			goto out;
423962d9f0dbSMarcelo Tosatti 		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
424062d9f0dbSMarcelo Tosatti 		break;
424162d9f0dbSMarcelo Tosatti 	}
42420fce5623SAvi Kivity 	case KVM_TRANSLATE: {
42430fce5623SAvi Kivity 		struct kvm_translation tr;
42440fce5623SAvi Kivity 
42450fce5623SAvi Kivity 		r = -EFAULT;
4246893bdbf1SXiubo Li 		if (copy_from_user(&tr, argp, sizeof(tr)))
42470fce5623SAvi Kivity 			goto out;
42480fce5623SAvi Kivity 		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
42490fce5623SAvi Kivity 		if (r)
42500fce5623SAvi Kivity 			goto out;
42510fce5623SAvi Kivity 		r = -EFAULT;
4252893bdbf1SXiubo Li 		if (copy_to_user(argp, &tr, sizeof(tr)))
42530fce5623SAvi Kivity 			goto out;
42540fce5623SAvi Kivity 		r = 0;
42550fce5623SAvi Kivity 		break;
42560fce5623SAvi Kivity 	}
4257d0bfb940SJan Kiszka 	case KVM_SET_GUEST_DEBUG: {
4258d0bfb940SJan Kiszka 		struct kvm_guest_debug dbg;
42590fce5623SAvi Kivity 
42600fce5623SAvi Kivity 		r = -EFAULT;
4261893bdbf1SXiubo Li 		if (copy_from_user(&dbg, argp, sizeof(dbg)))
42620fce5623SAvi Kivity 			goto out;
4263d0bfb940SJan Kiszka 		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
42640fce5623SAvi Kivity 		break;
42650fce5623SAvi Kivity 	}
42660fce5623SAvi Kivity 	case KVM_SET_SIGNAL_MASK: {
42670fce5623SAvi Kivity 		struct kvm_signal_mask __user *sigmask_arg = argp;
42680fce5623SAvi Kivity 		struct kvm_signal_mask kvm_sigmask;
42690fce5623SAvi Kivity 		sigset_t sigset, *p;
42700fce5623SAvi Kivity 
42710fce5623SAvi Kivity 		p = NULL;
42720fce5623SAvi Kivity 		if (argp) {
42730fce5623SAvi Kivity 			r = -EFAULT;
42740fce5623SAvi Kivity 			if (copy_from_user(&kvm_sigmask, argp,
4275893bdbf1SXiubo Li 					   sizeof(kvm_sigmask)))
42760fce5623SAvi Kivity 				goto out;
42770fce5623SAvi Kivity 			r = -EINVAL;
4278893bdbf1SXiubo Li 			if (kvm_sigmask.len != sizeof(sigset))
42790fce5623SAvi Kivity 				goto out;
42800fce5623SAvi Kivity 			r = -EFAULT;
42810fce5623SAvi Kivity 			if (copy_from_user(&sigset, sigmask_arg->sigset,
4282893bdbf1SXiubo Li 					   sizeof(sigset)))
42830fce5623SAvi Kivity 				goto out;
42840fce5623SAvi Kivity 			p = &sigset;
42850fce5623SAvi Kivity 		}
4286376d41ffSAndi Kleen 		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
42870fce5623SAvi Kivity 		break;
42880fce5623SAvi Kivity 	}
42890fce5623SAvi Kivity 	case KVM_GET_FPU: {
4290b12ce36aSBen Gardon 		fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
4291fa3795a7SDave Hansen 		r = -ENOMEM;
4292fa3795a7SDave Hansen 		if (!fpu)
4293fa3795a7SDave Hansen 			goto out;
4294fa3795a7SDave Hansen 		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
42950fce5623SAvi Kivity 		if (r)
42960fce5623SAvi Kivity 			goto out;
42970fce5623SAvi Kivity 		r = -EFAULT;
4298fa3795a7SDave Hansen 		if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
42990fce5623SAvi Kivity 			goto out;
43000fce5623SAvi Kivity 		r = 0;
43010fce5623SAvi Kivity 		break;
43020fce5623SAvi Kivity 	}
43030fce5623SAvi Kivity 	case KVM_SET_FPU: {
4304ff5c2c03SSasha Levin 		fpu = memdup_user(argp, sizeof(*fpu));
4305ff5c2c03SSasha Levin 		if (IS_ERR(fpu)) {
4306ff5c2c03SSasha Levin 			r = PTR_ERR(fpu);
430718595411SGuo Chao 			fpu = NULL;
43080fce5623SAvi Kivity 			goto out;
4309ff5c2c03SSasha Levin 		}
4310fa3795a7SDave Hansen 		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
43110fce5623SAvi Kivity 		break;
43120fce5623SAvi Kivity 	}
4313ce55c049SJing Zhang 	case KVM_GET_STATS_FD: {
4314ce55c049SJing Zhang 		r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
4315ce55c049SJing Zhang 		break;
4316ce55c049SJing Zhang 	}
43170fce5623SAvi Kivity 	default:
43180fce5623SAvi Kivity 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
43190fce5623SAvi Kivity 	}
43200fce5623SAvi Kivity out:
4321ec7660ccSChristoffer Dall 	mutex_unlock(&vcpu->mutex);
4322fa3795a7SDave Hansen 	kfree(fpu);
4323fa3795a7SDave Hansen 	kfree(kvm_sregs);
43240fce5623SAvi Kivity 	return r;
43250fce5623SAvi Kivity }
43260fce5623SAvi Kivity 
4327de8e5d74SChristian Borntraeger #ifdef CONFIG_KVM_COMPAT
kvm_vcpu_compat_ioctl(struct file * filp,unsigned int ioctl,unsigned long arg)43281dda606cSAlexander Graf static long kvm_vcpu_compat_ioctl(struct file *filp,
43291dda606cSAlexander Graf 				  unsigned int ioctl, unsigned long arg)
43301dda606cSAlexander Graf {
43311dda606cSAlexander Graf 	struct kvm_vcpu *vcpu = filp->private_data;
43321dda606cSAlexander Graf 	void __user *argp = compat_ptr(arg);
43331dda606cSAlexander Graf 	int r;
43341dda606cSAlexander Graf 
4335f4d31653SPaolo Bonzini 	if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
43361dda606cSAlexander Graf 		return -EIO;
43371dda606cSAlexander Graf 
43381dda606cSAlexander Graf 	switch (ioctl) {
43391dda606cSAlexander Graf 	case KVM_SET_SIGNAL_MASK: {
43401dda606cSAlexander Graf 		struct kvm_signal_mask __user *sigmask_arg = argp;
43411dda606cSAlexander Graf 		struct kvm_signal_mask kvm_sigmask;
43421dda606cSAlexander Graf 		sigset_t sigset;
43431dda606cSAlexander Graf 
43441dda606cSAlexander Graf 		if (argp) {
43451dda606cSAlexander Graf 			r = -EFAULT;
43461dda606cSAlexander Graf 			if (copy_from_user(&kvm_sigmask, argp,
4347893bdbf1SXiubo Li 					   sizeof(kvm_sigmask)))
43481dda606cSAlexander Graf 				goto out;
43491dda606cSAlexander Graf 			r = -EINVAL;
43503968cf62SAl Viro 			if (kvm_sigmask.len != sizeof(compat_sigset_t))
43511dda606cSAlexander Graf 				goto out;
43521dda606cSAlexander Graf 			r = -EFAULT;
43531393b4aaSPaolo Bonzini 			if (get_compat_sigset(&sigset,
43541393b4aaSPaolo Bonzini 					      (compat_sigset_t __user *)sigmask_arg->sigset))
43551dda606cSAlexander Graf 				goto out;
43561dda606cSAlexander Graf 			r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
4357760a9a30SAlan Cox 		} else
4358760a9a30SAlan Cox 			r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
43591dda606cSAlexander Graf 		break;
43601dda606cSAlexander Graf 	}
43611dda606cSAlexander Graf 	default:
43621dda606cSAlexander Graf 		r = kvm_vcpu_ioctl(filp, ioctl, arg);
43631dda606cSAlexander Graf 	}
43641dda606cSAlexander Graf 
43651dda606cSAlexander Graf out:
43661dda606cSAlexander Graf 	return r;
43671dda606cSAlexander Graf }
43681dda606cSAlexander Graf #endif
43691dda606cSAlexander Graf 
kvm_device_mmap(struct file * filp,struct vm_area_struct * vma)4370a1cd3f08SCédric Le Goater static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
4371a1cd3f08SCédric Le Goater {
4372a1cd3f08SCédric Le Goater 	struct kvm_device *dev = filp->private_data;
4373a1cd3f08SCédric Le Goater 
4374a1cd3f08SCédric Le Goater 	if (dev->ops->mmap)
4375a1cd3f08SCédric Le Goater 		return dev->ops->mmap(dev, vma);
4376a1cd3f08SCédric Le Goater 
4377a1cd3f08SCédric Le Goater 	return -ENODEV;
4378a1cd3f08SCédric Le Goater }
4379a1cd3f08SCédric Le Goater 
kvm_device_ioctl_attr(struct kvm_device * dev,int (* accessor)(struct kvm_device * dev,struct kvm_device_attr * attr),unsigned long arg)4380852b6d57SScott Wood static int kvm_device_ioctl_attr(struct kvm_device *dev,
4381852b6d57SScott Wood 				 int (*accessor)(struct kvm_device *dev,
4382852b6d57SScott Wood 						 struct kvm_device_attr *attr),
4383852b6d57SScott Wood 				 unsigned long arg)
4384852b6d57SScott Wood {
4385852b6d57SScott Wood 	struct kvm_device_attr attr;
4386852b6d57SScott Wood 
4387852b6d57SScott Wood 	if (!accessor)
4388852b6d57SScott Wood 		return -EPERM;
4389852b6d57SScott Wood 
4390852b6d57SScott Wood 	if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4391852b6d57SScott Wood 		return -EFAULT;
4392852b6d57SScott Wood 
4393852b6d57SScott Wood 	return accessor(dev, &attr);
4394852b6d57SScott Wood }
4395852b6d57SScott Wood 
kvm_device_ioctl(struct file * filp,unsigned int ioctl,unsigned long arg)4396852b6d57SScott Wood static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4397852b6d57SScott Wood 			     unsigned long arg)
4398852b6d57SScott Wood {
4399852b6d57SScott Wood 	struct kvm_device *dev = filp->private_data;
4400852b6d57SScott Wood 
4401f4d31653SPaolo Bonzini 	if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
4402ddba9180SSean Christopherson 		return -EIO;
4403ddba9180SSean Christopherson 
4404852b6d57SScott Wood 	switch (ioctl) {
4405852b6d57SScott Wood 	case KVM_SET_DEVICE_ATTR:
4406852b6d57SScott Wood 		return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4407852b6d57SScott Wood 	case KVM_GET_DEVICE_ATTR:
4408852b6d57SScott Wood 		return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4409852b6d57SScott Wood 	case KVM_HAS_DEVICE_ATTR:
4410852b6d57SScott Wood 		return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4411852b6d57SScott Wood 	default:
4412852b6d57SScott Wood 		if (dev->ops->ioctl)
4413852b6d57SScott Wood 			return dev->ops->ioctl(dev, ioctl, arg);
4414852b6d57SScott Wood 
4415852b6d57SScott Wood 		return -ENOTTY;
4416852b6d57SScott Wood 	}
4417852b6d57SScott Wood }
4418852b6d57SScott Wood 
kvm_device_release(struct inode * inode,struct file * filp)4419852b6d57SScott Wood static int kvm_device_release(struct inode *inode, struct file *filp)
4420852b6d57SScott Wood {
4421852b6d57SScott Wood 	struct kvm_device *dev = filp->private_data;
4422852b6d57SScott Wood 	struct kvm *kvm = dev->kvm;
4423852b6d57SScott Wood 
44242bde9b3eSCédric Le Goater 	if (dev->ops->release) {
44252bde9b3eSCédric Le Goater 		mutex_lock(&kvm->lock);
44262bde9b3eSCédric Le Goater 		list_del(&dev->vm_node);
44272bde9b3eSCédric Le Goater 		dev->ops->release(dev);
44282bde9b3eSCédric Le Goater 		mutex_unlock(&kvm->lock);
44292bde9b3eSCédric Le Goater 	}
44302bde9b3eSCédric Le Goater 
4431852b6d57SScott Wood 	kvm_put_kvm(kvm);
4432852b6d57SScott Wood 	return 0;
4433852b6d57SScott Wood }
4434852b6d57SScott Wood 
4435852b6d57SScott Wood static const struct file_operations kvm_device_fops = {
4436852b6d57SScott Wood 	.unlocked_ioctl = kvm_device_ioctl,
4437852b6d57SScott Wood 	.release = kvm_device_release,
44387ddfd3e0SMarc Zyngier 	KVM_COMPAT(kvm_device_ioctl),
4439a1cd3f08SCédric Le Goater 	.mmap = kvm_device_mmap,
4440852b6d57SScott Wood };
4441852b6d57SScott Wood 
kvm_device_from_filp(struct file * filp)4442852b6d57SScott Wood struct kvm_device *kvm_device_from_filp(struct file *filp)
4443852b6d57SScott Wood {
4444852b6d57SScott Wood 	if (filp->f_op != &kvm_device_fops)
4445852b6d57SScott Wood 		return NULL;
4446852b6d57SScott Wood 
4447852b6d57SScott Wood 	return filp->private_data;
4448852b6d57SScott Wood }
4449852b6d57SScott Wood 
44508538cb22SSteven Price static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
4451d60eacb0SWill Deacon #ifdef CONFIG_KVM_MPIC
4452d60eacb0SWill Deacon 	[KVM_DEV_TYPE_FSL_MPIC_20]	= &kvm_mpic_ops,
4453d60eacb0SWill Deacon 	[KVM_DEV_TYPE_FSL_MPIC_42]	= &kvm_mpic_ops,
4454d60eacb0SWill Deacon #endif
4455d60eacb0SWill Deacon };
4456d60eacb0SWill Deacon 
kvm_register_device_ops(const struct kvm_device_ops * ops,u32 type)44578538cb22SSteven Price int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4458d60eacb0SWill Deacon {
4459d60eacb0SWill Deacon 	if (type >= ARRAY_SIZE(kvm_device_ops_table))
4460d60eacb0SWill Deacon 		return -ENOSPC;
4461d60eacb0SWill Deacon 
4462d60eacb0SWill Deacon 	if (kvm_device_ops_table[type] != NULL)
4463d60eacb0SWill Deacon 		return -EEXIST;
4464d60eacb0SWill Deacon 
4465d60eacb0SWill Deacon 	kvm_device_ops_table[type] = ops;
4466d60eacb0SWill Deacon 	return 0;
4467d60eacb0SWill Deacon }
4468d60eacb0SWill Deacon 
kvm_unregister_device_ops(u32 type)4469571ee1b6SWanpeng Li void kvm_unregister_device_ops(u32 type)
4470571ee1b6SWanpeng Li {
4471571ee1b6SWanpeng Li 	if (kvm_device_ops_table[type] != NULL)
4472571ee1b6SWanpeng Li 		kvm_device_ops_table[type] = NULL;
4473571ee1b6SWanpeng Li }
4474571ee1b6SWanpeng Li 
kvm_ioctl_create_device(struct kvm * kvm,struct kvm_create_device * cd)4475852b6d57SScott Wood static int kvm_ioctl_create_device(struct kvm *kvm,
4476852b6d57SScott Wood 				   struct kvm_create_device *cd)
4477852b6d57SScott Wood {
4478eceb6e1dSLi kunyu 	const struct kvm_device_ops *ops;
4479852b6d57SScott Wood 	struct kvm_device *dev;
4480852b6d57SScott Wood 	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
44811d487e9bSPaolo Bonzini 	int type;
4482852b6d57SScott Wood 	int ret;
4483852b6d57SScott Wood 
4484d60eacb0SWill Deacon 	if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4485852b6d57SScott Wood 		return -ENODEV;
4486d60eacb0SWill Deacon 
44871d487e9bSPaolo Bonzini 	type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
44881d487e9bSPaolo Bonzini 	ops = kvm_device_ops_table[type];
4489d60eacb0SWill Deacon 	if (ops == NULL)
4490d60eacb0SWill Deacon 		return -ENODEV;
4491852b6d57SScott Wood 
4492852b6d57SScott Wood 	if (test)
4493852b6d57SScott Wood 		return 0;
4494852b6d57SScott Wood 
4495b12ce36aSBen Gardon 	dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
4496852b6d57SScott Wood 	if (!dev)
4497852b6d57SScott Wood 		return -ENOMEM;
4498852b6d57SScott Wood 
4499852b6d57SScott Wood 	dev->ops = ops;
4500852b6d57SScott Wood 	dev->kvm = kvm;
4501852b6d57SScott Wood 
4502a28ebea2SChristoffer Dall 	mutex_lock(&kvm->lock);
45031d487e9bSPaolo Bonzini 	ret = ops->create(dev, type);
4504852b6d57SScott Wood 	if (ret < 0) {
4505a28ebea2SChristoffer Dall 		mutex_unlock(&kvm->lock);
4506852b6d57SScott Wood 		kfree(dev);
4507852b6d57SScott Wood 		return ret;
4508852b6d57SScott Wood 	}
4509a28ebea2SChristoffer Dall 	list_add(&dev->vm_node, &kvm->devices);
4510a28ebea2SChristoffer Dall 	mutex_unlock(&kvm->lock);
4511852b6d57SScott Wood 
4512023e9fddSChristoffer Dall 	if (ops->init)
4513023e9fddSChristoffer Dall 		ops->init(dev);
4514023e9fddSChristoffer Dall 
4515cfa39381SJann Horn 	kvm_get_kvm(kvm);
451624009b05SYann Droneaud 	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
4517852b6d57SScott Wood 	if (ret < 0) {
4518149487bdSSean Christopherson 		kvm_put_kvm_no_destroy(kvm);
4519a28ebea2SChristoffer Dall 		mutex_lock(&kvm->lock);
4520a28ebea2SChristoffer Dall 		list_del(&dev->vm_node);
4521e8bc2427SAlexey Kardashevskiy 		if (ops->release)
4522e8bc2427SAlexey Kardashevskiy 			ops->release(dev);
4523a28ebea2SChristoffer Dall 		mutex_unlock(&kvm->lock);
4524e8bc2427SAlexey Kardashevskiy 		if (ops->destroy)
4525a0f1d21cSDan Carpenter 			ops->destroy(dev);
4526852b6d57SScott Wood 		return ret;
4527852b6d57SScott Wood 	}
4528852b6d57SScott Wood 
4529852b6d57SScott Wood 	cd->fd = ret;
4530852b6d57SScott Wood 	return 0;
4531852b6d57SScott Wood }
4532852b6d57SScott Wood 
kvm_vm_ioctl_check_extension_generic(struct kvm * kvm,long arg)4533f15ba52bSThomas Huth static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
453492b591a4SAlexander Graf {
453592b591a4SAlexander Graf 	switch (arg) {
453692b591a4SAlexander Graf 	case KVM_CAP_USER_MEMORY:
453792b591a4SAlexander Graf 	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
453892b591a4SAlexander Graf 	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
453992b591a4SAlexander Graf 	case KVM_CAP_INTERNAL_ERROR_DATA:
454092b591a4SAlexander Graf #ifdef CONFIG_HAVE_KVM_MSI
454192b591a4SAlexander Graf 	case KVM_CAP_SIGNAL_MSI:
454292b591a4SAlexander Graf #endif
4543297e2105SPaul Mackerras #ifdef CONFIG_HAVE_KVM_IRQFD
4544dc9be0faSPaolo Bonzini 	case KVM_CAP_IRQFD:
454592b591a4SAlexander Graf #endif
4546e9ea5069SJason Wang 	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
454792b591a4SAlexander Graf 	case KVM_CAP_CHECK_EXTENSION_VM:
4548e5d83c74SPaolo Bonzini 	case KVM_CAP_ENABLE_CAP_VM:
4549acd05785SDavid Matlack 	case KVM_CAP_HALT_POLL:
455092b591a4SAlexander Graf 		return 1;
45514b4357e0SPaolo Bonzini #ifdef CONFIG_KVM_MMIO
455230422558SPaolo Bonzini 	case KVM_CAP_COALESCED_MMIO:
455330422558SPaolo Bonzini 		return KVM_COALESCED_MMIO_PAGE_OFFSET;
45540804c849SPeng Hao 	case KVM_CAP_COALESCED_PIO:
45550804c849SPeng Hao 		return 1;
455630422558SPaolo Bonzini #endif
45573c9bd400SJay Zhou #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
45583c9bd400SJay Zhou 	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
45593c9bd400SJay Zhou 		return KVM_DIRTY_LOG_MANUAL_CAPS;
45603c9bd400SJay Zhou #endif
456192b591a4SAlexander Graf #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
456292b591a4SAlexander Graf 	case KVM_CAP_IRQ_ROUTING:
456392b591a4SAlexander Graf 		return KVM_MAX_IRQ_ROUTES;
456492b591a4SAlexander Graf #endif
4565f481b069SPaolo Bonzini #if KVM_ADDRESS_SPACE_NUM > 1
4566f481b069SPaolo Bonzini 	case KVM_CAP_MULTI_ADDRESS_SPACE:
4567f481b069SPaolo Bonzini 		return KVM_ADDRESS_SPACE_NUM;
4568f481b069SPaolo Bonzini #endif
4569c110ae57SPaolo Bonzini 	case KVM_CAP_NR_MEMSLOTS:
4570c110ae57SPaolo Bonzini 		return KVM_USER_MEM_SLOTS;
4571fb04a1edSPeter Xu 	case KVM_CAP_DIRTY_LOG_RING:
457217601bfeSMarc Zyngier #ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
457317601bfeSMarc Zyngier 		return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
457417601bfeSMarc Zyngier #else
457517601bfeSMarc Zyngier 		return 0;
457617601bfeSMarc Zyngier #endif
457717601bfeSMarc Zyngier 	case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
457817601bfeSMarc Zyngier #ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
4579fb04a1edSPeter Xu 		return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4580fb04a1edSPeter Xu #else
4581fb04a1edSPeter Xu 		return 0;
4582fb04a1edSPeter Xu #endif
458386bdf3ebSGavin Shan #ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
458486bdf3ebSGavin Shan 	case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
458586bdf3ebSGavin Shan #endif
4586ce55c049SJing Zhang 	case KVM_CAP_BINARY_STATS_FD:
4587d495f942SPaolo Bonzini 	case KVM_CAP_SYSTEM_EVENT_DATA:
4588ce55c049SJing Zhang 		return 1;
458992b591a4SAlexander Graf 	default:
459092b591a4SAlexander Graf 		break;
459192b591a4SAlexander Graf 	}
459292b591a4SAlexander Graf 	return kvm_vm_ioctl_check_extension(kvm, arg);
459392b591a4SAlexander Graf }
459492b591a4SAlexander Graf 
kvm_vm_ioctl_enable_dirty_log_ring(struct kvm * kvm,u32 size)4595fb04a1edSPeter Xu static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4596fb04a1edSPeter Xu {
4597fb04a1edSPeter Xu 	int r;
4598fb04a1edSPeter Xu 
4599fb04a1edSPeter Xu 	if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4600fb04a1edSPeter Xu 		return -EINVAL;
4601fb04a1edSPeter Xu 
4602fb04a1edSPeter Xu 	/* the size should be power of 2 */
4603fb04a1edSPeter Xu 	if (!size || (size & (size - 1)))
4604fb04a1edSPeter Xu 		return -EINVAL;
4605fb04a1edSPeter Xu 
4606fb04a1edSPeter Xu 	/* Should be bigger to keep the reserved entries, or a page */
4607fb04a1edSPeter Xu 	if (size < kvm_dirty_ring_get_rsvd_entries() *
4608fb04a1edSPeter Xu 	    sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4609fb04a1edSPeter Xu 		return -EINVAL;
4610fb04a1edSPeter Xu 
4611fb04a1edSPeter Xu 	if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4612fb04a1edSPeter Xu 	    sizeof(struct kvm_dirty_gfn))
4613fb04a1edSPeter Xu 		return -E2BIG;
4614fb04a1edSPeter Xu 
4615fb04a1edSPeter Xu 	/* We only allow it to set once */
4616fb04a1edSPeter Xu 	if (kvm->dirty_ring_size)
4617fb04a1edSPeter Xu 		return -EINVAL;
4618fb04a1edSPeter Xu 
4619fb04a1edSPeter Xu 	mutex_lock(&kvm->lock);
4620fb04a1edSPeter Xu 
4621fb04a1edSPeter Xu 	if (kvm->created_vcpus) {
4622fb04a1edSPeter Xu 		/* We don't allow to change this value after vcpu created */
4623fb04a1edSPeter Xu 		r = -EINVAL;
4624fb04a1edSPeter Xu 	} else {
4625fb04a1edSPeter Xu 		kvm->dirty_ring_size = size;
4626fb04a1edSPeter Xu 		r = 0;
4627fb04a1edSPeter Xu 	}
4628fb04a1edSPeter Xu 
4629fb04a1edSPeter Xu 	mutex_unlock(&kvm->lock);
4630fb04a1edSPeter Xu 	return r;
4631fb04a1edSPeter Xu }
4632fb04a1edSPeter Xu 
kvm_vm_ioctl_reset_dirty_pages(struct kvm * kvm)4633fb04a1edSPeter Xu static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4634fb04a1edSPeter Xu {
463546808a4cSMarc Zyngier 	unsigned long i;
4636fb04a1edSPeter Xu 	struct kvm_vcpu *vcpu;
4637fb04a1edSPeter Xu 	int cleared = 0;
4638fb04a1edSPeter Xu 
4639fb04a1edSPeter Xu 	if (!kvm->dirty_ring_size)
4640fb04a1edSPeter Xu 		return -EINVAL;
4641fb04a1edSPeter Xu 
4642fb04a1edSPeter Xu 	mutex_lock(&kvm->slots_lock);
4643fb04a1edSPeter Xu 
4644fb04a1edSPeter Xu 	kvm_for_each_vcpu(i, vcpu, kvm)
4645fb04a1edSPeter Xu 		cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4646fb04a1edSPeter Xu 
4647fb04a1edSPeter Xu 	mutex_unlock(&kvm->slots_lock);
4648fb04a1edSPeter Xu 
4649fb04a1edSPeter Xu 	if (cleared)
4650fb04a1edSPeter Xu 		kvm_flush_remote_tlbs(kvm);
4651fb04a1edSPeter Xu 
4652fb04a1edSPeter Xu 	return cleared;
4653fb04a1edSPeter Xu }
4654fb04a1edSPeter Xu 
kvm_vm_ioctl_enable_cap(struct kvm * kvm,struct kvm_enable_cap * cap)4655e5d83c74SPaolo Bonzini int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4656e5d83c74SPaolo Bonzini 						  struct kvm_enable_cap *cap)
4657e5d83c74SPaolo Bonzini {
4658e5d83c74SPaolo Bonzini 	return -EINVAL;
4659e5d83c74SPaolo Bonzini }
4660e5d83c74SPaolo Bonzini 
kvm_are_all_memslots_empty(struct kvm * kvm)466126f45714SRicardo Koller bool kvm_are_all_memslots_empty(struct kvm *kvm)
466286bdf3ebSGavin Shan {
466386bdf3ebSGavin Shan 	int i;
466486bdf3ebSGavin Shan 
466586bdf3ebSGavin Shan 	lockdep_assert_held(&kvm->slots_lock);
466686bdf3ebSGavin Shan 
466786bdf3ebSGavin Shan 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
466886bdf3ebSGavin Shan 		if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
466986bdf3ebSGavin Shan 			return false;
467086bdf3ebSGavin Shan 	}
467186bdf3ebSGavin Shan 
467286bdf3ebSGavin Shan 	return true;
467386bdf3ebSGavin Shan }
467426f45714SRicardo Koller EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
467586bdf3ebSGavin Shan 
kvm_vm_ioctl_enable_cap_generic(struct kvm * kvm,struct kvm_enable_cap * cap)4676e5d83c74SPaolo Bonzini static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4677e5d83c74SPaolo Bonzini 					   struct kvm_enable_cap *cap)
4678e5d83c74SPaolo Bonzini {
4679e5d83c74SPaolo Bonzini 	switch (cap->cap) {
46802a31b9dbSPaolo Bonzini #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
46813c9bd400SJay Zhou 	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
46823c9bd400SJay Zhou 		u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
46833c9bd400SJay Zhou 
46843c9bd400SJay Zhou 		if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
46853c9bd400SJay Zhou 			allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
46863c9bd400SJay Zhou 
46873c9bd400SJay Zhou 		if (cap->flags || (cap->args[0] & ~allowed_options))
46882a31b9dbSPaolo Bonzini 			return -EINVAL;
46892a31b9dbSPaolo Bonzini 		kvm->manual_dirty_log_protect = cap->args[0];
46902a31b9dbSPaolo Bonzini 		return 0;
46913c9bd400SJay Zhou 	}
46922a31b9dbSPaolo Bonzini #endif
4693acd05785SDavid Matlack 	case KVM_CAP_HALT_POLL: {
4694acd05785SDavid Matlack 		if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4695acd05785SDavid Matlack 			return -EINVAL;
4696acd05785SDavid Matlack 
4697acd05785SDavid Matlack 		kvm->max_halt_poll_ns = cap->args[0];
46989eb8ca04SDavid Matlack 
46999eb8ca04SDavid Matlack 		/*
47009eb8ca04SDavid Matlack 		 * Ensure kvm->override_halt_poll_ns does not become visible
47019eb8ca04SDavid Matlack 		 * before kvm->max_halt_poll_ns.
47029eb8ca04SDavid Matlack 		 *
47039eb8ca04SDavid Matlack 		 * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
47049eb8ca04SDavid Matlack 		 */
47059eb8ca04SDavid Matlack 		smp_wmb();
47069eb8ca04SDavid Matlack 		kvm->override_halt_poll_ns = true;
47079eb8ca04SDavid Matlack 
4708acd05785SDavid Matlack 		return 0;
4709acd05785SDavid Matlack 	}
4710fb04a1edSPeter Xu 	case KVM_CAP_DIRTY_LOG_RING:
471117601bfeSMarc Zyngier 	case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
47127a2726ecSGavin Shan 		if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
47137a2726ecSGavin Shan 			return -EINVAL;
47147a2726ecSGavin Shan 
4715fb04a1edSPeter Xu 		return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
471686bdf3ebSGavin Shan 	case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
471786bdf3ebSGavin Shan 		int r = -EINVAL;
471886bdf3ebSGavin Shan 
471986bdf3ebSGavin Shan 		if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
472086bdf3ebSGavin Shan 		    !kvm->dirty_ring_size || cap->flags)
472186bdf3ebSGavin Shan 			return r;
472286bdf3ebSGavin Shan 
472386bdf3ebSGavin Shan 		mutex_lock(&kvm->slots_lock);
472486bdf3ebSGavin Shan 
472586bdf3ebSGavin Shan 		/*
472686bdf3ebSGavin Shan 		 * For simplicity, allow enabling ring+bitmap if and only if
472786bdf3ebSGavin Shan 		 * there are no memslots, e.g. to ensure all memslots allocate
472886bdf3ebSGavin Shan 		 * a bitmap after the capability is enabled.
472986bdf3ebSGavin Shan 		 */
473086bdf3ebSGavin Shan 		if (kvm_are_all_memslots_empty(kvm)) {
473186bdf3ebSGavin Shan 			kvm->dirty_ring_with_bitmap = true;
473286bdf3ebSGavin Shan 			r = 0;
473386bdf3ebSGavin Shan 		}
473486bdf3ebSGavin Shan 
473586bdf3ebSGavin Shan 		mutex_unlock(&kvm->slots_lock);
473686bdf3ebSGavin Shan 
473786bdf3ebSGavin Shan 		return r;
473886bdf3ebSGavin Shan 	}
4739e5d83c74SPaolo Bonzini 	default:
4740e5d83c74SPaolo Bonzini 		return kvm_vm_ioctl_enable_cap(kvm, cap);
4741e5d83c74SPaolo Bonzini 	}
4742e5d83c74SPaolo Bonzini }
4743e5d83c74SPaolo Bonzini 
kvm_vm_stats_read(struct file * file,char __user * user_buffer,size_t size,loff_t * offset)4744fcfe1baeSJing Zhang static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
4745fcfe1baeSJing Zhang 			      size_t size, loff_t *offset)
4746fcfe1baeSJing Zhang {
4747fcfe1baeSJing Zhang 	struct kvm *kvm = file->private_data;
4748fcfe1baeSJing Zhang 
4749fcfe1baeSJing Zhang 	return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
4750fcfe1baeSJing Zhang 				&kvm_vm_stats_desc[0], &kvm->stat,
4751fcfe1baeSJing Zhang 				sizeof(kvm->stat), user_buffer, size, offset);
4752fcfe1baeSJing Zhang }
4753fcfe1baeSJing Zhang 
kvm_vm_stats_release(struct inode * inode,struct file * file)4754eed3013fSSean Christopherson static int kvm_vm_stats_release(struct inode *inode, struct file *file)
4755eed3013fSSean Christopherson {
4756eed3013fSSean Christopherson 	struct kvm *kvm = file->private_data;
4757eed3013fSSean Christopherson 
4758eed3013fSSean Christopherson 	kvm_put_kvm(kvm);
4759eed3013fSSean Christopherson 	return 0;
4760eed3013fSSean Christopherson }
4761eed3013fSSean Christopherson 
4762fcfe1baeSJing Zhang static const struct file_operations kvm_vm_stats_fops = {
4763fcfe1baeSJing Zhang 	.read = kvm_vm_stats_read,
4764eed3013fSSean Christopherson 	.release = kvm_vm_stats_release,
4765fcfe1baeSJing Zhang 	.llseek = noop_llseek,
4766fcfe1baeSJing Zhang };
4767fcfe1baeSJing Zhang 
kvm_vm_ioctl_get_stats_fd(struct kvm * kvm)4768fcfe1baeSJing Zhang static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4769fcfe1baeSJing Zhang {
4770fcfe1baeSJing Zhang 	int fd;
4771fcfe1baeSJing Zhang 	struct file *file;
4772fcfe1baeSJing Zhang 
4773fcfe1baeSJing Zhang 	fd = get_unused_fd_flags(O_CLOEXEC);
4774fcfe1baeSJing Zhang 	if (fd < 0)
4775fcfe1baeSJing Zhang 		return fd;
4776fcfe1baeSJing Zhang 
4777fcfe1baeSJing Zhang 	file = anon_inode_getfile("kvm-vm-stats",
4778fcfe1baeSJing Zhang 			&kvm_vm_stats_fops, kvm, O_RDONLY);
4779fcfe1baeSJing Zhang 	if (IS_ERR(file)) {
4780fcfe1baeSJing Zhang 		put_unused_fd(fd);
4781fcfe1baeSJing Zhang 		return PTR_ERR(file);
4782fcfe1baeSJing Zhang 	}
4783eed3013fSSean Christopherson 
4784eed3013fSSean Christopherson 	kvm_get_kvm(kvm);
4785eed3013fSSean Christopherson 
4786fcfe1baeSJing Zhang 	file->f_mode |= FMODE_PREAD;
4787fcfe1baeSJing Zhang 	fd_install(fd, file);
4788fcfe1baeSJing Zhang 
4789fcfe1baeSJing Zhang 	return fd;
4790fcfe1baeSJing Zhang }
4791fcfe1baeSJing Zhang 
kvm_vm_ioctl(struct file * filp,unsigned int ioctl,unsigned long arg)47920fce5623SAvi Kivity static long kvm_vm_ioctl(struct file *filp,
47930fce5623SAvi Kivity 			   unsigned int ioctl, unsigned long arg)
47940fce5623SAvi Kivity {
47950fce5623SAvi Kivity 	struct kvm *kvm = filp->private_data;
47960fce5623SAvi Kivity 	void __user *argp = (void __user *)arg;
47970fce5623SAvi Kivity 	int r;
47980fce5623SAvi Kivity 
4799f4d31653SPaolo Bonzini 	if (kvm->mm != current->mm || kvm->vm_dead)
48000fce5623SAvi Kivity 		return -EIO;
48010fce5623SAvi Kivity 	switch (ioctl) {
48020fce5623SAvi Kivity 	case KVM_CREATE_VCPU:
48030fce5623SAvi Kivity 		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
48040fce5623SAvi Kivity 		break;
4805e5d83c74SPaolo Bonzini 	case KVM_ENABLE_CAP: {
4806e5d83c74SPaolo Bonzini 		struct kvm_enable_cap cap;
4807e5d83c74SPaolo Bonzini 
4808e5d83c74SPaolo Bonzini 		r = -EFAULT;
4809e5d83c74SPaolo Bonzini 		if (copy_from_user(&cap, argp, sizeof(cap)))
4810e5d83c74SPaolo Bonzini 			goto out;
4811e5d83c74SPaolo Bonzini 		r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
4812e5d83c74SPaolo Bonzini 		break;
4813e5d83c74SPaolo Bonzini 	}
48140fce5623SAvi Kivity 	case KVM_SET_USER_MEMORY_REGION: {
48150fce5623SAvi Kivity 		struct kvm_userspace_memory_region kvm_userspace_mem;
48160fce5623SAvi Kivity 
48170fce5623SAvi Kivity 		r = -EFAULT;
48180fce5623SAvi Kivity 		if (copy_from_user(&kvm_userspace_mem, argp,
4819893bdbf1SXiubo Li 						sizeof(kvm_userspace_mem)))
48200fce5623SAvi Kivity 			goto out;
48210fce5623SAvi Kivity 
482247ae31e2STakuya Yoshikawa 		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
48230fce5623SAvi Kivity 		break;
48240fce5623SAvi Kivity 	}
48250fce5623SAvi Kivity 	case KVM_GET_DIRTY_LOG: {
48260fce5623SAvi Kivity 		struct kvm_dirty_log log;
48270fce5623SAvi Kivity 
48280fce5623SAvi Kivity 		r = -EFAULT;
4829893bdbf1SXiubo Li 		if (copy_from_user(&log, argp, sizeof(log)))
48300fce5623SAvi Kivity 			goto out;
48310fce5623SAvi Kivity 		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
48320fce5623SAvi Kivity 		break;
48330fce5623SAvi Kivity 	}
48342a31b9dbSPaolo Bonzini #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
48352a31b9dbSPaolo Bonzini 	case KVM_CLEAR_DIRTY_LOG: {
48362a31b9dbSPaolo Bonzini 		struct kvm_clear_dirty_log log;
48372a31b9dbSPaolo Bonzini 
48382a31b9dbSPaolo Bonzini 		r = -EFAULT;
48392a31b9dbSPaolo Bonzini 		if (copy_from_user(&log, argp, sizeof(log)))
48402a31b9dbSPaolo Bonzini 			goto out;
48412a31b9dbSPaolo Bonzini 		r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
48422a31b9dbSPaolo Bonzini 		break;
48432a31b9dbSPaolo Bonzini 	}
48442a31b9dbSPaolo Bonzini #endif
48454b4357e0SPaolo Bonzini #ifdef CONFIG_KVM_MMIO
48465f94c174SLaurent Vivier 	case KVM_REGISTER_COALESCED_MMIO: {
48475f94c174SLaurent Vivier 		struct kvm_coalesced_mmio_zone zone;
4848f95ef0cdSXiubo Li 
48495f94c174SLaurent Vivier 		r = -EFAULT;
4850893bdbf1SXiubo Li 		if (copy_from_user(&zone, argp, sizeof(zone)))
48515f94c174SLaurent Vivier 			goto out;
48525f94c174SLaurent Vivier 		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
48535f94c174SLaurent Vivier 		break;
48545f94c174SLaurent Vivier 	}
48555f94c174SLaurent Vivier 	case KVM_UNREGISTER_COALESCED_MMIO: {
48565f94c174SLaurent Vivier 		struct kvm_coalesced_mmio_zone zone;
4857f95ef0cdSXiubo Li 
48585f94c174SLaurent Vivier 		r = -EFAULT;
4859893bdbf1SXiubo Li 		if (copy_from_user(&zone, argp, sizeof(zone)))
48605f94c174SLaurent Vivier 			goto out;
48615f94c174SLaurent Vivier 		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
48625f94c174SLaurent Vivier 		break;
48635f94c174SLaurent Vivier 	}
48645f94c174SLaurent Vivier #endif
4865721eecbfSGregory Haskins 	case KVM_IRQFD: {
4866721eecbfSGregory Haskins 		struct kvm_irqfd data;
4867721eecbfSGregory Haskins 
4868721eecbfSGregory Haskins 		r = -EFAULT;
4869893bdbf1SXiubo Li 		if (copy_from_user(&data, argp, sizeof(data)))
4870721eecbfSGregory Haskins 			goto out;
4871d4db2935SAlex Williamson 		r = kvm_irqfd(kvm, &data);
4872721eecbfSGregory Haskins 		break;
4873721eecbfSGregory Haskins 	}
4874d34e6b17SGregory Haskins 	case KVM_IOEVENTFD: {
4875d34e6b17SGregory Haskins 		struct kvm_ioeventfd data;
4876d34e6b17SGregory Haskins 
4877d34e6b17SGregory Haskins 		r = -EFAULT;
4878893bdbf1SXiubo Li 		if (copy_from_user(&data, argp, sizeof(data)))
4879d34e6b17SGregory Haskins 			goto out;
4880d34e6b17SGregory Haskins 		r = kvm_ioeventfd(kvm, &data);
4881d34e6b17SGregory Haskins 		break;
4882d34e6b17SGregory Haskins 	}
488307975ad3SJan Kiszka #ifdef CONFIG_HAVE_KVM_MSI
488407975ad3SJan Kiszka 	case KVM_SIGNAL_MSI: {
488507975ad3SJan Kiszka 		struct kvm_msi msi;
488607975ad3SJan Kiszka 
488707975ad3SJan Kiszka 		r = -EFAULT;
4888893bdbf1SXiubo Li 		if (copy_from_user(&msi, argp, sizeof(msi)))
488907975ad3SJan Kiszka 			goto out;
489007975ad3SJan Kiszka 		r = kvm_send_userspace_msi(kvm, &msi);
489107975ad3SJan Kiszka 		break;
489207975ad3SJan Kiszka 	}
489307975ad3SJan Kiszka #endif
489423d43cf9SChristoffer Dall #ifdef __KVM_HAVE_IRQ_LINE
489523d43cf9SChristoffer Dall 	case KVM_IRQ_LINE_STATUS:
489623d43cf9SChristoffer Dall 	case KVM_IRQ_LINE: {
489723d43cf9SChristoffer Dall 		struct kvm_irq_level irq_event;
489823d43cf9SChristoffer Dall 
489923d43cf9SChristoffer Dall 		r = -EFAULT;
4900893bdbf1SXiubo Li 		if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
490123d43cf9SChristoffer Dall 			goto out;
490223d43cf9SChristoffer Dall 
4903aa2fbe6dSYang Zhang 		r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
4904aa2fbe6dSYang Zhang 					ioctl == KVM_IRQ_LINE_STATUS);
490523d43cf9SChristoffer Dall 		if (r)
490623d43cf9SChristoffer Dall 			goto out;
490723d43cf9SChristoffer Dall 
490823d43cf9SChristoffer Dall 		r = -EFAULT;
490923d43cf9SChristoffer Dall 		if (ioctl == KVM_IRQ_LINE_STATUS) {
4910893bdbf1SXiubo Li 			if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
491123d43cf9SChristoffer Dall 				goto out;
491223d43cf9SChristoffer Dall 		}
491323d43cf9SChristoffer Dall 
491423d43cf9SChristoffer Dall 		r = 0;
491523d43cf9SChristoffer Dall 		break;
491623d43cf9SChristoffer Dall 	}
491723d43cf9SChristoffer Dall #endif
4918aa8d5944SAlexander Graf #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4919aa8d5944SAlexander Graf 	case KVM_SET_GSI_ROUTING: {
4920aa8d5944SAlexander Graf 		struct kvm_irq_routing routing;
4921aa8d5944SAlexander Graf 		struct kvm_irq_routing __user *urouting;
4922f8c1b85bSPaolo Bonzini 		struct kvm_irq_routing_entry *entries = NULL;
4923aa8d5944SAlexander Graf 
4924aa8d5944SAlexander Graf 		r = -EFAULT;
4925aa8d5944SAlexander Graf 		if (copy_from_user(&routing, argp, sizeof(routing)))
4926aa8d5944SAlexander Graf 			goto out;
4927aa8d5944SAlexander Graf 		r = -EINVAL;
49285c0aea0eSDavid Hildenbrand 		if (!kvm_arch_can_set_irq_routing(kvm))
49295c0aea0eSDavid Hildenbrand 			goto out;
4930caf1ff26SXiubo Li 		if (routing.nr > KVM_MAX_IRQ_ROUTES)
4931aa8d5944SAlexander Graf 			goto out;
4932aa8d5944SAlexander Graf 		if (routing.flags)
4933aa8d5944SAlexander Graf 			goto out;
4934f8c1b85bSPaolo Bonzini 		if (routing.nr) {
4935aa8d5944SAlexander Graf 			urouting = argp;
49367ec28e26SDenis Efremov 			entries = vmemdup_user(urouting->entries,
49377ec28e26SDenis Efremov 					       array_size(sizeof(*entries),
49387ec28e26SDenis Efremov 							  routing.nr));
49397ec28e26SDenis Efremov 			if (IS_ERR(entries)) {
49407ec28e26SDenis Efremov 				r = PTR_ERR(entries);
49417ec28e26SDenis Efremov 				goto out;
49427ec28e26SDenis Efremov 			}
4943f8c1b85bSPaolo Bonzini 		}
4944aa8d5944SAlexander Graf 		r = kvm_set_irq_routing(kvm, entries, routing.nr,
4945aa8d5944SAlexander Graf 					routing.flags);
49467ec28e26SDenis Efremov 		kvfree(entries);
4947aa8d5944SAlexander Graf 		break;
4948aa8d5944SAlexander Graf 	}
4949aa8d5944SAlexander Graf #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
4950852b6d57SScott Wood 	case KVM_CREATE_DEVICE: {
4951852b6d57SScott Wood 		struct kvm_create_device cd;
4952852b6d57SScott Wood 
4953852b6d57SScott Wood 		r = -EFAULT;
4954852b6d57SScott Wood 		if (copy_from_user(&cd, argp, sizeof(cd)))
4955852b6d57SScott Wood 			goto out;
4956852b6d57SScott Wood 
4957852b6d57SScott Wood 		r = kvm_ioctl_create_device(kvm, &cd);
4958852b6d57SScott Wood 		if (r)
4959852b6d57SScott Wood 			goto out;
4960852b6d57SScott Wood 
4961852b6d57SScott Wood 		r = -EFAULT;
4962852b6d57SScott Wood 		if (copy_to_user(argp, &cd, sizeof(cd)))
4963852b6d57SScott Wood 			goto out;
4964852b6d57SScott Wood 
4965852b6d57SScott Wood 		r = 0;
4966852b6d57SScott Wood 		break;
4967852b6d57SScott Wood 	}
496892b591a4SAlexander Graf 	case KVM_CHECK_EXTENSION:
496992b591a4SAlexander Graf 		r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
497092b591a4SAlexander Graf 		break;
4971fb04a1edSPeter Xu 	case KVM_RESET_DIRTY_RINGS:
4972fb04a1edSPeter Xu 		r = kvm_vm_ioctl_reset_dirty_pages(kvm);
4973fb04a1edSPeter Xu 		break;
4974fcfe1baeSJing Zhang 	case KVM_GET_STATS_FD:
4975fcfe1baeSJing Zhang 		r = kvm_vm_ioctl_get_stats_fd(kvm);
4976fcfe1baeSJing Zhang 		break;
49770fce5623SAvi Kivity 	default:
49780fce5623SAvi Kivity 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
49790fce5623SAvi Kivity 	}
49800fce5623SAvi Kivity out:
49810fce5623SAvi Kivity 	return r;
49820fce5623SAvi Kivity }
49830fce5623SAvi Kivity 
4984de8e5d74SChristian Borntraeger #ifdef CONFIG_KVM_COMPAT
49856ff5894cSArnd Bergmann struct compat_kvm_dirty_log {
49866ff5894cSArnd Bergmann 	__u32 slot;
49876ff5894cSArnd Bergmann 	__u32 padding1;
49886ff5894cSArnd Bergmann 	union {
49896ff5894cSArnd Bergmann 		compat_uptr_t dirty_bitmap; /* one bit per page */
49906ff5894cSArnd Bergmann 		__u64 padding2;
49916ff5894cSArnd Bergmann 	};
49926ff5894cSArnd Bergmann };
49936ff5894cSArnd Bergmann 
49948750f9bbSPaolo Bonzini struct compat_kvm_clear_dirty_log {
49958750f9bbSPaolo Bonzini 	__u32 slot;
49968750f9bbSPaolo Bonzini 	__u32 num_pages;
49978750f9bbSPaolo Bonzini 	__u64 first_page;
49988750f9bbSPaolo Bonzini 	union {
49998750f9bbSPaolo Bonzini 		compat_uptr_t dirty_bitmap; /* one bit per page */
50008750f9bbSPaolo Bonzini 		__u64 padding2;
50018750f9bbSPaolo Bonzini 	};
50028750f9bbSPaolo Bonzini };
50038750f9bbSPaolo Bonzini 
kvm_arch_vm_compat_ioctl(struct file * filp,unsigned int ioctl,unsigned long arg)5004ed51862fSAlexander Graf long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
5005ed51862fSAlexander Graf 				     unsigned long arg)
5006ed51862fSAlexander Graf {
5007ed51862fSAlexander Graf 	return -ENOTTY;
5008ed51862fSAlexander Graf }
5009ed51862fSAlexander Graf 
kvm_vm_compat_ioctl(struct file * filp,unsigned int ioctl,unsigned long arg)50106ff5894cSArnd Bergmann static long kvm_vm_compat_ioctl(struct file *filp,
50116ff5894cSArnd Bergmann 			   unsigned int ioctl, unsigned long arg)
50126ff5894cSArnd Bergmann {
50136ff5894cSArnd Bergmann 	struct kvm *kvm = filp->private_data;
50146ff5894cSArnd Bergmann 	int r;
50156ff5894cSArnd Bergmann 
5016f4d31653SPaolo Bonzini 	if (kvm->mm != current->mm || kvm->vm_dead)
50176ff5894cSArnd Bergmann 		return -EIO;
5018ed51862fSAlexander Graf 
5019ed51862fSAlexander Graf 	r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
5020ed51862fSAlexander Graf 	if (r != -ENOTTY)
5021ed51862fSAlexander Graf 		return r;
5022ed51862fSAlexander Graf 
50236ff5894cSArnd Bergmann 	switch (ioctl) {
50248750f9bbSPaolo Bonzini #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
50258750f9bbSPaolo Bonzini 	case KVM_CLEAR_DIRTY_LOG: {
50268750f9bbSPaolo Bonzini 		struct compat_kvm_clear_dirty_log compat_log;
50278750f9bbSPaolo Bonzini 		struct kvm_clear_dirty_log log;
50288750f9bbSPaolo Bonzini 
50298750f9bbSPaolo Bonzini 		if (copy_from_user(&compat_log, (void __user *)arg,
50308750f9bbSPaolo Bonzini 				   sizeof(compat_log)))
50318750f9bbSPaolo Bonzini 			return -EFAULT;
50328750f9bbSPaolo Bonzini 		log.slot	 = compat_log.slot;
50338750f9bbSPaolo Bonzini 		log.num_pages	 = compat_log.num_pages;
50348750f9bbSPaolo Bonzini 		log.first_page	 = compat_log.first_page;
50358750f9bbSPaolo Bonzini 		log.padding2	 = compat_log.padding2;
50368750f9bbSPaolo Bonzini 		log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
50378750f9bbSPaolo Bonzini 
50388750f9bbSPaolo Bonzini 		r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
50398750f9bbSPaolo Bonzini 		break;
50408750f9bbSPaolo Bonzini 	}
50418750f9bbSPaolo Bonzini #endif
50426ff5894cSArnd Bergmann 	case KVM_GET_DIRTY_LOG: {
50436ff5894cSArnd Bergmann 		struct compat_kvm_dirty_log compat_log;
50446ff5894cSArnd Bergmann 		struct kvm_dirty_log log;
50456ff5894cSArnd Bergmann 
50466ff5894cSArnd Bergmann 		if (copy_from_user(&compat_log, (void __user *)arg,
50476ff5894cSArnd Bergmann 				   sizeof(compat_log)))
5048f6a3b168SMarkus Elfring 			return -EFAULT;
50496ff5894cSArnd Bergmann 		log.slot	 = compat_log.slot;
50506ff5894cSArnd Bergmann 		log.padding1	 = compat_log.padding1;
50516ff5894cSArnd Bergmann 		log.padding2	 = compat_log.padding2;
50526ff5894cSArnd Bergmann 		log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
50536ff5894cSArnd Bergmann 
50546ff5894cSArnd Bergmann 		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
50556ff5894cSArnd Bergmann 		break;
50566ff5894cSArnd Bergmann 	}
50576ff5894cSArnd Bergmann 	default:
50586ff5894cSArnd Bergmann 		r = kvm_vm_ioctl(filp, ioctl, arg);
50596ff5894cSArnd Bergmann 	}
50606ff5894cSArnd Bergmann 	return r;
50616ff5894cSArnd Bergmann }
50626ff5894cSArnd Bergmann #endif
50636ff5894cSArnd Bergmann 
506470375c2dSDavid Matlack static const struct file_operations kvm_vm_fops = {
50650fce5623SAvi Kivity 	.release        = kvm_vm_release,
50660fce5623SAvi Kivity 	.unlocked_ioctl = kvm_vm_ioctl,
50676038f373SArnd Bergmann 	.llseek		= noop_llseek,
50687ddfd3e0SMarc Zyngier 	KVM_COMPAT(kvm_vm_compat_ioctl),
50690fce5623SAvi Kivity };
50700fce5623SAvi Kivity 
file_is_kvm(struct file * file)507154526d1fSNathan Tempelman bool file_is_kvm(struct file *file)
507254526d1fSNathan Tempelman {
507354526d1fSNathan Tempelman 	return file && file->f_op == &kvm_vm_fops;
507454526d1fSNathan Tempelman }
507554526d1fSNathan Tempelman EXPORT_SYMBOL_GPL(file_is_kvm);
507654526d1fSNathan Tempelman 
kvm_dev_ioctl_create_vm(unsigned long type)5077e08b9637SCarsten Otte static int kvm_dev_ioctl_create_vm(unsigned long type)
50780fce5623SAvi Kivity {
507959f82aadSOliver Upton 	char fdname[ITOA_MAX_LEN + 1];
508020020f4cSOliver Upton 	int r, fd;
50810fce5623SAvi Kivity 	struct kvm *kvm;
5082506cfba9SAl Viro 	struct file *file;
50830fce5623SAvi Kivity 
508420020f4cSOliver Upton 	fd = get_unused_fd_flags(O_CLOEXEC);
508520020f4cSOliver Upton 	if (fd < 0)
508620020f4cSOliver Upton 		return fd;
508720020f4cSOliver Upton 
508859f82aadSOliver Upton 	snprintf(fdname, sizeof(fdname), "%d", fd);
508959f82aadSOliver Upton 
5090b74ed7a6SOliver Upton 	kvm = kvm_create_vm(type, fdname);
509120020f4cSOliver Upton 	if (IS_ERR(kvm)) {
509220020f4cSOliver Upton 		r = PTR_ERR(kvm);
509320020f4cSOliver Upton 		goto put_fd;
509420020f4cSOliver Upton 	}
509520020f4cSOliver Upton 
5096506cfba9SAl Viro 	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
5097506cfba9SAl Viro 	if (IS_ERR(file)) {
509878588335SMarkus Elfring 		r = PTR_ERR(file);
509978588335SMarkus Elfring 		goto put_kvm;
5100506cfba9SAl Viro 	}
5101536a6f88SJanosch Frank 
5102525df861SPaolo Bonzini 	/*
5103525df861SPaolo Bonzini 	 * Don't call kvm_put_kvm anymore at this point; file->f_op is
5104525df861SPaolo Bonzini 	 * already set, with ->release() being kvm_vm_release().  In error
5105525df861SPaolo Bonzini 	 * cases it will be called by the final fput(file) and will take
5106525df861SPaolo Bonzini 	 * care of doing kvm_put_kvm(kvm).
5107525df861SPaolo Bonzini 	 */
5108286de8f6SClaudio Imbrenda 	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
51090fce5623SAvi Kivity 
511020020f4cSOliver Upton 	fd_install(fd, file);
511120020f4cSOliver Upton 	return fd;
511278588335SMarkus Elfring 
511378588335SMarkus Elfring put_kvm:
511478588335SMarkus Elfring 	kvm_put_kvm(kvm);
511520020f4cSOliver Upton put_fd:
511620020f4cSOliver Upton 	put_unused_fd(fd);
511778588335SMarkus Elfring 	return r;
51180fce5623SAvi Kivity }
51190fce5623SAvi Kivity 
kvm_dev_ioctl(struct file * filp,unsigned int ioctl,unsigned long arg)51200fce5623SAvi Kivity static long kvm_dev_ioctl(struct file *filp,
51210fce5623SAvi Kivity 			  unsigned int ioctl, unsigned long arg)
51220fce5623SAvi Kivity {
5123f15ba52bSThomas Huth 	int r = -EINVAL;
51240fce5623SAvi Kivity 
51250fce5623SAvi Kivity 	switch (ioctl) {
51260fce5623SAvi Kivity 	case KVM_GET_API_VERSION:
51270fce5623SAvi Kivity 		if (arg)
51280fce5623SAvi Kivity 			goto out;
51290fce5623SAvi Kivity 		r = KVM_API_VERSION;
51300fce5623SAvi Kivity 		break;
51310fce5623SAvi Kivity 	case KVM_CREATE_VM:
5132e08b9637SCarsten Otte 		r = kvm_dev_ioctl_create_vm(arg);
51330fce5623SAvi Kivity 		break;
51340fce5623SAvi Kivity 	case KVM_CHECK_EXTENSION:
5135784aa3d7SAlexander Graf 		r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
51360fce5623SAvi Kivity 		break;
51370fce5623SAvi Kivity 	case KVM_GET_VCPU_MMAP_SIZE:
51380fce5623SAvi Kivity 		if (arg)
51390fce5623SAvi Kivity 			goto out;
5140adb1ff46SAvi Kivity 		r = PAGE_SIZE;     /* struct kvm_run */
5141adb1ff46SAvi Kivity #ifdef CONFIG_X86
5142adb1ff46SAvi Kivity 		r += PAGE_SIZE;    /* pio data page */
5143adb1ff46SAvi Kivity #endif
51444b4357e0SPaolo Bonzini #ifdef CONFIG_KVM_MMIO
51455f94c174SLaurent Vivier 		r += PAGE_SIZE;    /* coalesced mmio ring page */
51465f94c174SLaurent Vivier #endif
51470fce5623SAvi Kivity 		break;
5148d4c9ff2dSFeng(Eric) Liu 	case KVM_TRACE_ENABLE:
5149d4c9ff2dSFeng(Eric) Liu 	case KVM_TRACE_PAUSE:
5150d4c9ff2dSFeng(Eric) Liu 	case KVM_TRACE_DISABLE:
51512023a29cSMarcelo Tosatti 		r = -EOPNOTSUPP;
5152d4c9ff2dSFeng(Eric) Liu 		break;
51530fce5623SAvi Kivity 	default:
51540fce5623SAvi Kivity 		return kvm_arch_dev_ioctl(filp, ioctl, arg);
51550fce5623SAvi Kivity 	}
51560fce5623SAvi Kivity out:
51570fce5623SAvi Kivity 	return r;
51580fce5623SAvi Kivity }
51590fce5623SAvi Kivity 
51600fce5623SAvi Kivity static struct file_operations kvm_chardev_ops = {
51610fce5623SAvi Kivity 	.unlocked_ioctl = kvm_dev_ioctl,
51626038f373SArnd Bergmann 	.llseek		= noop_llseek,
51637ddfd3e0SMarc Zyngier 	KVM_COMPAT(kvm_dev_ioctl),
51640fce5623SAvi Kivity };
51650fce5623SAvi Kivity 
51660fce5623SAvi Kivity static struct miscdevice kvm_dev = {
51670fce5623SAvi Kivity 	KVM_MINOR,
51680fce5623SAvi Kivity 	"kvm",
51690fce5623SAvi Kivity 	&kvm_chardev_ops,
51700fce5623SAvi Kivity };
51710fce5623SAvi Kivity 
5172441f7bfaSSean Christopherson #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
5173441f7bfaSSean Christopherson __visible bool kvm_rebooting;
5174441f7bfaSSean Christopherson EXPORT_SYMBOL_GPL(kvm_rebooting);
5175441f7bfaSSean Christopherson 
5176441f7bfaSSean Christopherson static DEFINE_PER_CPU(bool, hardware_enabled);
5177*4777225eSSean Christopherson static DEFINE_MUTEX(kvm_usage_lock);
5178441f7bfaSSean Christopherson static int kvm_usage_count;
5179441f7bfaSSean Christopherson 
__hardware_enable_nolock(void)5180e6fb7d6eSIsaku Yamahata static int __hardware_enable_nolock(void)
51810fce5623SAvi Kivity {
518237d25881SSean Christopherson 	if (__this_cpu_read(hardware_enabled))
5183e6fb7d6eSIsaku Yamahata 		return 0;
518410474ae8SAlexander Graf 
518537d25881SSean Christopherson 	if (kvm_arch_hardware_enable()) {
518637d25881SSean Christopherson 		pr_info("kvm: enabling virtualization on CPU%d failed\n",
518737d25881SSean Christopherson 			raw_smp_processor_id());
5188e6fb7d6eSIsaku Yamahata 		return -EIO;
518910474ae8SAlexander Graf 	}
519037d25881SSean Christopherson 
519137d25881SSean Christopherson 	__this_cpu_write(hardware_enabled, true);
5192e6fb7d6eSIsaku Yamahata 	return 0;
5193e6fb7d6eSIsaku Yamahata }
5194e6fb7d6eSIsaku Yamahata 
hardware_enable_nolock(void * failed)5195e6fb7d6eSIsaku Yamahata static void hardware_enable_nolock(void *failed)
5196e6fb7d6eSIsaku Yamahata {
5197e6fb7d6eSIsaku Yamahata 	if (__hardware_enable_nolock())
5198e6fb7d6eSIsaku Yamahata 		atomic_inc(failed);
51990fce5623SAvi Kivity }
52000fce5623SAvi Kivity 
kvm_online_cpu(unsigned int cpu)5201aaf12a7bSChao Gao static int kvm_online_cpu(unsigned int cpu)
520275b7127cSTakuya Yoshikawa {
5203aaf12a7bSChao Gao 	int ret = 0;
5204aaf12a7bSChao Gao 
5205aaf12a7bSChao Gao 	/*
5206aaf12a7bSChao Gao 	 * Abort the CPU online process if hardware virtualization cannot
5207aaf12a7bSChao Gao 	 * be enabled. Otherwise running VMs would encounter unrecoverable
5208aaf12a7bSChao Gao 	 * errors when scheduled to this CPU.
5209aaf12a7bSChao Gao 	 */
5210*4777225eSSean Christopherson 	mutex_lock(&kvm_usage_lock);
5211e6fb7d6eSIsaku Yamahata 	if (kvm_usage_count)
5212e6fb7d6eSIsaku Yamahata 		ret = __hardware_enable_nolock();
5213*4777225eSSean Christopherson 	mutex_unlock(&kvm_usage_lock);
5214aaf12a7bSChao Gao 	return ret;
521575b7127cSTakuya Yoshikawa }
521675b7127cSTakuya Yoshikawa 
hardware_disable_nolock(void * junk)521775b7127cSTakuya Yoshikawa static void hardware_disable_nolock(void *junk)
52180fce5623SAvi Kivity {
521937d25881SSean Christopherson 	/*
522037d25881SSean Christopherson 	 * Note, hardware_disable_all_nolock() tells all online CPUs to disable
522137d25881SSean Christopherson 	 * hardware, not just CPUs that successfully enabled hardware!
522237d25881SSean Christopherson 	 */
522337d25881SSean Christopherson 	if (!__this_cpu_read(hardware_enabled))
52240fce5623SAvi Kivity 		return;
522537d25881SSean Christopherson 
522613a34e06SRadim Krčmář 	kvm_arch_hardware_disable();
522737d25881SSean Christopherson 
522837d25881SSean Christopherson 	__this_cpu_write(hardware_enabled, false);
52290fce5623SAvi Kivity }
52300fce5623SAvi Kivity 
kvm_offline_cpu(unsigned int cpu)5231aaf12a7bSChao Gao static int kvm_offline_cpu(unsigned int cpu)
523275b7127cSTakuya Yoshikawa {
5233*4777225eSSean Christopherson 	mutex_lock(&kvm_usage_lock);
52344fa92fb2SPaolo Bonzini 	if (kvm_usage_count)
52354fa92fb2SPaolo Bonzini 		hardware_disable_nolock(NULL);
5236*4777225eSSean Christopherson 	mutex_unlock(&kvm_usage_lock);
52378c18b2d2SThomas Gleixner 	return 0;
523875b7127cSTakuya Yoshikawa }
523975b7127cSTakuya Yoshikawa 
hardware_disable_all_nolock(void)524010474ae8SAlexander Graf static void hardware_disable_all_nolock(void)
524110474ae8SAlexander Graf {
524210474ae8SAlexander Graf 	BUG_ON(!kvm_usage_count);
524310474ae8SAlexander Graf 
524410474ae8SAlexander Graf 	kvm_usage_count--;
524510474ae8SAlexander Graf 	if (!kvm_usage_count)
524675b7127cSTakuya Yoshikawa 		on_each_cpu(hardware_disable_nolock, NULL, 1);
524710474ae8SAlexander Graf }
524810474ae8SAlexander Graf 
hardware_disable_all(void)524910474ae8SAlexander Graf static void hardware_disable_all(void)
525010474ae8SAlexander Graf {
5251e4aa7f88SChao Gao 	cpus_read_lock();
5252*4777225eSSean Christopherson 	mutex_lock(&kvm_usage_lock);
525310474ae8SAlexander Graf 	hardware_disable_all_nolock();
5254*4777225eSSean Christopherson 	mutex_unlock(&kvm_usage_lock);
5255e4aa7f88SChao Gao 	cpus_read_unlock();
525610474ae8SAlexander Graf }
525710474ae8SAlexander Graf 
hardware_enable_all(void)525810474ae8SAlexander Graf static int hardware_enable_all(void)
525910474ae8SAlexander Graf {
5260e6fb7d6eSIsaku Yamahata 	atomic_t failed = ATOMIC_INIT(0);
5261e0ceec22SSean Christopherson 	int r;
5262e0ceec22SSean Christopherson 
5263e0ceec22SSean Christopherson 	/*
5264e0ceec22SSean Christopherson 	 * Do not enable hardware virtualization if the system is going down.
5265e0ceec22SSean Christopherson 	 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
5266e0ceec22SSean Christopherson 	 * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
5267e0ceec22SSean Christopherson 	 * after kvm_reboot() is called.  Note, this relies on system_state
5268e0ceec22SSean Christopherson 	 * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
5269e0ceec22SSean Christopherson 	 * hook instead of registering a dedicated reboot notifier (the latter
5270e0ceec22SSean Christopherson 	 * runs before system_state is updated).
5271e0ceec22SSean Christopherson 	 */
5272e0ceec22SSean Christopherson 	if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
5273e0ceec22SSean Christopherson 	    system_state == SYSTEM_RESTART)
5274e0ceec22SSean Christopherson 		return -EBUSY;
527510474ae8SAlexander Graf 
5276e4aa7f88SChao Gao 	/*
5277e4aa7f88SChao Gao 	 * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
5278e4aa7f88SChao Gao 	 * is called, and so on_each_cpu() between them includes the CPU that
5279e4aa7f88SChao Gao 	 * is being onlined.  As a result, hardware_enable_nolock() may get
5280e4aa7f88SChao Gao 	 * invoked before kvm_online_cpu(), which also enables hardware if the
5281e4aa7f88SChao Gao 	 * usage count is non-zero.  Disable CPU hotplug to avoid attempting to
5282e4aa7f88SChao Gao 	 * enable hardware multiple times.
5283e4aa7f88SChao Gao 	 */
5284e4aa7f88SChao Gao 	cpus_read_lock();
5285*4777225eSSean Christopherson 	mutex_lock(&kvm_usage_lock);
528610474ae8SAlexander Graf 
5287e0ceec22SSean Christopherson 	r = 0;
5288e0ceec22SSean Christopherson 
528910474ae8SAlexander Graf 	kvm_usage_count++;
529010474ae8SAlexander Graf 	if (kvm_usage_count == 1) {
5291e6fb7d6eSIsaku Yamahata 		on_each_cpu(hardware_enable_nolock, &failed, 1);
529210474ae8SAlexander Graf 
5293e6fb7d6eSIsaku Yamahata 		if (atomic_read(&failed)) {
529410474ae8SAlexander Graf 			hardware_disable_all_nolock();
529510474ae8SAlexander Graf 			r = -EBUSY;
529610474ae8SAlexander Graf 		}
529710474ae8SAlexander Graf 	}
529810474ae8SAlexander Graf 
5299*4777225eSSean Christopherson 	mutex_unlock(&kvm_usage_lock);
5300e4aa7f88SChao Gao 	cpus_read_unlock();
530110474ae8SAlexander Graf 
530210474ae8SAlexander Graf 	return r;
530310474ae8SAlexander Graf }
530410474ae8SAlexander Graf 
kvm_shutdown(void)53056735150bSSean Christopherson static void kvm_shutdown(void)
53060fce5623SAvi Kivity {
53070fce5623SAvi Kivity 	/*
53086735150bSSean Christopherson 	 * Disable hardware virtualization and set kvm_rebooting to indicate
53096735150bSSean Christopherson 	 * that KVM has asynchronously disabled hardware virtualization, i.e.
53106735150bSSean Christopherson 	 * that relevant errors and exceptions aren't entirely unexpected.
53116735150bSSean Christopherson 	 * Some flavors of hardware virtualization need to be disabled before
53126735150bSSean Christopherson 	 * transferring control to firmware (to perform shutdown/reboot), e.g.
53136735150bSSean Christopherson 	 * on x86, virtualization can block INIT interrupts, which are used by
53146735150bSSean Christopherson 	 * firmware to pull APs back under firmware control.  Note, this path
53156735150bSSean Christopherson 	 * is used for both shutdown and reboot scenarios, i.e. neither name is
53166735150bSSean Christopherson 	 * 100% comprehensive.
53170fce5623SAvi Kivity 	 */
53181170adc6SXiubo Li 	pr_info("kvm: exiting hardware virtualization\n");
53194ecac3fdSAvi Kivity 	kvm_rebooting = true;
532075b7127cSTakuya Yoshikawa 	on_each_cpu(hardware_disable_nolock, NULL, 1);
53210fce5623SAvi Kivity }
53220fce5623SAvi Kivity 
kvm_suspend(void)532335774a9fSSean Christopherson static int kvm_suspend(void)
532435774a9fSSean Christopherson {
532535774a9fSSean Christopherson 	/*
532635774a9fSSean Christopherson 	 * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
5327*4777225eSSean Christopherson 	 * callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage
5328*4777225eSSean Christopherson 	 * count is stable.  Assert that kvm_usage_lock is not held to ensure
5329*4777225eSSean Christopherson 	 * the system isn't suspended while KVM is enabling hardware.  Hardware
5330*4777225eSSean Christopherson 	 * enabling can be preempted, but the task cannot be frozen until it has
5331*4777225eSSean Christopherson 	 * dropped all locks (userspace tasks are frozen via a fake signal).
533235774a9fSSean Christopherson 	 */
5333*4777225eSSean Christopherson 	lockdep_assert_not_held(&kvm_usage_lock);
533435774a9fSSean Christopherson 	lockdep_assert_irqs_disabled();
533535774a9fSSean Christopherson 
533635774a9fSSean Christopherson 	if (kvm_usage_count)
533735774a9fSSean Christopherson 		hardware_disable_nolock(NULL);
533835774a9fSSean Christopherson 	return 0;
533935774a9fSSean Christopherson }
534035774a9fSSean Christopherson 
kvm_resume(void)534135774a9fSSean Christopherson static void kvm_resume(void)
534235774a9fSSean Christopherson {
5343*4777225eSSean Christopherson 	lockdep_assert_not_held(&kvm_usage_lock);
534435774a9fSSean Christopherson 	lockdep_assert_irqs_disabled();
534535774a9fSSean Christopherson 
534635774a9fSSean Christopherson 	if (kvm_usage_count)
534735774a9fSSean Christopherson 		WARN_ON_ONCE(__hardware_enable_nolock());
534835774a9fSSean Christopherson }
534935774a9fSSean Christopherson 
535035774a9fSSean Christopherson static struct syscore_ops kvm_syscore_ops = {
535135774a9fSSean Christopherson 	.suspend = kvm_suspend,
535235774a9fSSean Christopherson 	.resume = kvm_resume,
53536735150bSSean Christopherson 	.shutdown = kvm_shutdown,
535435774a9fSSean Christopherson };
5355441f7bfaSSean Christopherson #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
hardware_enable_all(void)5356441f7bfaSSean Christopherson static int hardware_enable_all(void)
5357441f7bfaSSean Christopherson {
5358441f7bfaSSean Christopherson 	return 0;
5359441f7bfaSSean Christopherson }
5360441f7bfaSSean Christopherson 
hardware_disable_all(void)5361441f7bfaSSean Christopherson static void hardware_disable_all(void)
5362441f7bfaSSean Christopherson {
5363441f7bfaSSean Christopherson 
5364441f7bfaSSean Christopherson }
5365441f7bfaSSean Christopherson #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
536635774a9fSSean Christopherson 
kvm_iodevice_destructor(struct kvm_io_device * dev)53675ea5ca3cSWei Wang static void kvm_iodevice_destructor(struct kvm_io_device *dev)
53685ea5ca3cSWei Wang {
53695ea5ca3cSWei Wang 	if (dev->ops->destructor)
53705ea5ca3cSWei Wang 		dev->ops->destructor(dev);
53715ea5ca3cSWei Wang }
53725ea5ca3cSWei Wang 
kvm_io_bus_destroy(struct kvm_io_bus * bus)5373e93f8a0fSMarcelo Tosatti static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
53740fce5623SAvi Kivity {
53750fce5623SAvi Kivity 	int i;
53760fce5623SAvi Kivity 
53770fce5623SAvi Kivity 	for (i = 0; i < bus->dev_count; i++) {
5378743eeb0bSSasha Levin 		struct kvm_io_device *pos = bus->range[i].dev;
53790fce5623SAvi Kivity 
53800fce5623SAvi Kivity 		kvm_iodevice_destructor(pos);
53810fce5623SAvi Kivity 	}
5382e93f8a0fSMarcelo Tosatti 	kfree(bus);
53830fce5623SAvi Kivity }
53840fce5623SAvi Kivity 
kvm_io_bus_cmp(const struct kvm_io_range * r1,const struct kvm_io_range * r2)5385c21fbff1SPaolo Bonzini static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
5386a343c9b7SPaolo Bonzini 				 const struct kvm_io_range *r2)
5387743eeb0bSSasha Levin {
53888f4216c7SJason Wang 	gpa_t addr1 = r1->addr;
53898f4216c7SJason Wang 	gpa_t addr2 = r2->addr;
53908f4216c7SJason Wang 
53918f4216c7SJason Wang 	if (addr1 < addr2)
5392743eeb0bSSasha Levin 		return -1;
53938f4216c7SJason Wang 
53948f4216c7SJason Wang 	/* If r2->len == 0, match the exact address.  If r2->len != 0,
53958f4216c7SJason Wang 	 * accept any overlapping write.  Any order is acceptable for
53968f4216c7SJason Wang 	 * overlapping ranges, because kvm_io_bus_get_first_dev ensures
53978f4216c7SJason Wang 	 * we process all of them.
53988f4216c7SJason Wang 	 */
53998f4216c7SJason Wang 	if (r2->len) {
54008f4216c7SJason Wang 		addr1 += r1->len;
54018f4216c7SJason Wang 		addr2 += r2->len;
54028f4216c7SJason Wang 	}
54038f4216c7SJason Wang 
54048f4216c7SJason Wang 	if (addr1 > addr2)
5405743eeb0bSSasha Levin 		return 1;
54068f4216c7SJason Wang 
5407743eeb0bSSasha Levin 	return 0;
5408743eeb0bSSasha Levin }
5409743eeb0bSSasha Levin 
kvm_io_bus_sort_cmp(const void * p1,const void * p2)5410a343c9b7SPaolo Bonzini static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
5411a343c9b7SPaolo Bonzini {
5412c21fbff1SPaolo Bonzini 	return kvm_io_bus_cmp(p1, p2);
5413a343c9b7SPaolo Bonzini }
5414a343c9b7SPaolo Bonzini 
kvm_io_bus_get_first_dev(struct kvm_io_bus * bus,gpa_t addr,int len)541539369f7aSGeoff Levand static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
5416743eeb0bSSasha Levin 			     gpa_t addr, int len)
5417743eeb0bSSasha Levin {
5418743eeb0bSSasha Levin 	struct kvm_io_range *range, key;
5419743eeb0bSSasha Levin 	int off;
5420743eeb0bSSasha Levin 
5421743eeb0bSSasha Levin 	key = (struct kvm_io_range) {
5422743eeb0bSSasha Levin 		.addr = addr,
5423743eeb0bSSasha Levin 		.len = len,
5424743eeb0bSSasha Levin 	};
5425743eeb0bSSasha Levin 
5426743eeb0bSSasha Levin 	range = bsearch(&key, bus->range, bus->dev_count,
5427743eeb0bSSasha Levin 			sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
5428743eeb0bSSasha Levin 	if (range == NULL)
5429743eeb0bSSasha Levin 		return -ENOENT;
5430743eeb0bSSasha Levin 
5431743eeb0bSSasha Levin 	off = range - bus->range;
5432743eeb0bSSasha Levin 
5433c21fbff1SPaolo Bonzini 	while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
5434743eeb0bSSasha Levin 		off--;
5435743eeb0bSSasha Levin 
5436743eeb0bSSasha Levin 	return off;
5437743eeb0bSSasha Levin }
5438743eeb0bSSasha Levin 
__kvm_io_bus_write(struct kvm_vcpu * vcpu,struct kvm_io_bus * bus,struct kvm_io_range * range,const void * val)5439e32edf4fSNikolay Nikolaev static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5440126a5af5SCornelia Huck 			      struct kvm_io_range *range, const void *val)
5441126a5af5SCornelia Huck {
5442126a5af5SCornelia Huck 	int idx;
5443126a5af5SCornelia Huck 
5444126a5af5SCornelia Huck 	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5445126a5af5SCornelia Huck 	if (idx < 0)
5446126a5af5SCornelia Huck 		return -EOPNOTSUPP;
5447126a5af5SCornelia Huck 
5448126a5af5SCornelia Huck 	while (idx < bus->dev_count &&
5449c21fbff1SPaolo Bonzini 		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5450e32edf4fSNikolay Nikolaev 		if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
5451126a5af5SCornelia Huck 					range->len, val))
5452126a5af5SCornelia Huck 			return idx;
5453126a5af5SCornelia Huck 		idx++;
5454126a5af5SCornelia Huck 	}
5455126a5af5SCornelia Huck 
5456126a5af5SCornelia Huck 	return -EOPNOTSUPP;
5457126a5af5SCornelia Huck }
5458126a5af5SCornelia Huck 
5459bda9020eSMichael S. Tsirkin /* kvm_io_bus_write - called under kvm->slots_lock */
kvm_io_bus_write(struct kvm_vcpu * vcpu,enum kvm_bus bus_idx,gpa_t addr,int len,const void * val)5460e32edf4fSNikolay Nikolaev int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5461bda9020eSMichael S. Tsirkin 		     int len, const void *val)
54620fce5623SAvi Kivity {
5463126a5af5SCornelia Huck 	struct kvm_io_bus *bus;
5464126a5af5SCornelia Huck 	struct kvm_io_range range;
5465126a5af5SCornelia Huck 	int r;
5466126a5af5SCornelia Huck 
5467126a5af5SCornelia Huck 	range = (struct kvm_io_range) {
5468126a5af5SCornelia Huck 		.addr = addr,
5469126a5af5SCornelia Huck 		.len = len,
5470126a5af5SCornelia Huck 	};
5471126a5af5SCornelia Huck 
5472e32edf4fSNikolay Nikolaev 	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
547390db1043SDavid Hildenbrand 	if (!bus)
547490db1043SDavid Hildenbrand 		return -ENOMEM;
5475e32edf4fSNikolay Nikolaev 	r = __kvm_io_bus_write(vcpu, bus, &range, val);
5476126a5af5SCornelia Huck 	return r < 0 ? r : 0;
5477126a5af5SCornelia Huck }
5478a2420107SLeo Yan EXPORT_SYMBOL_GPL(kvm_io_bus_write);
5479126a5af5SCornelia Huck 
5480126a5af5SCornelia Huck /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
kvm_io_bus_write_cookie(struct kvm_vcpu * vcpu,enum kvm_bus bus_idx,gpa_t addr,int len,const void * val,long cookie)5481e32edf4fSNikolay Nikolaev int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
5482e32edf4fSNikolay Nikolaev 			    gpa_t addr, int len, const void *val, long cookie)
5483126a5af5SCornelia Huck {
548490d83dc3SLai Jiangshan 	struct kvm_io_bus *bus;
5485743eeb0bSSasha Levin 	struct kvm_io_range range;
5486743eeb0bSSasha Levin 
5487743eeb0bSSasha Levin 	range = (struct kvm_io_range) {
5488743eeb0bSSasha Levin 		.addr = addr,
5489743eeb0bSSasha Levin 		.len = len,
5490743eeb0bSSasha Levin 	};
549190d83dc3SLai Jiangshan 
5492e32edf4fSNikolay Nikolaev 	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
549390db1043SDavid Hildenbrand 	if (!bus)
549490db1043SDavid Hildenbrand 		return -ENOMEM;
5495126a5af5SCornelia Huck 
5496126a5af5SCornelia Huck 	/* First try the device referenced by cookie. */
5497126a5af5SCornelia Huck 	if ((cookie >= 0) && (cookie < bus->dev_count) &&
5498c21fbff1SPaolo Bonzini 	    (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
5499e32edf4fSNikolay Nikolaev 		if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
5500126a5af5SCornelia Huck 					val))
5501126a5af5SCornelia Huck 			return cookie;
5502126a5af5SCornelia Huck 
5503126a5af5SCornelia Huck 	/*
5504126a5af5SCornelia Huck 	 * cookie contained garbage; fall back to search and return the
5505126a5af5SCornelia Huck 	 * correct cookie value.
5506126a5af5SCornelia Huck 	 */
5507e32edf4fSNikolay Nikolaev 	return __kvm_io_bus_write(vcpu, bus, &range, val);
5508126a5af5SCornelia Huck }
5509126a5af5SCornelia Huck 
__kvm_io_bus_read(struct kvm_vcpu * vcpu,struct kvm_io_bus * bus,struct kvm_io_range * range,void * val)5510e32edf4fSNikolay Nikolaev static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5511e32edf4fSNikolay Nikolaev 			     struct kvm_io_range *range, void *val)
5512126a5af5SCornelia Huck {
5513126a5af5SCornelia Huck 	int idx;
5514126a5af5SCornelia Huck 
5515126a5af5SCornelia Huck 	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5516743eeb0bSSasha Levin 	if (idx < 0)
5517743eeb0bSSasha Levin 		return -EOPNOTSUPP;
5518743eeb0bSSasha Levin 
5519743eeb0bSSasha Levin 	while (idx < bus->dev_count &&
5520c21fbff1SPaolo Bonzini 		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5521e32edf4fSNikolay Nikolaev 		if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
5522126a5af5SCornelia Huck 				       range->len, val))
5523126a5af5SCornelia Huck 			return idx;
5524743eeb0bSSasha Levin 		idx++;
5525743eeb0bSSasha Levin 	}
5526743eeb0bSSasha Levin 
5527bda9020eSMichael S. Tsirkin 	return -EOPNOTSUPP;
55280fce5623SAvi Kivity }
55290fce5623SAvi Kivity 
5530bda9020eSMichael S. Tsirkin /* kvm_io_bus_read - called under kvm->slots_lock */
kvm_io_bus_read(struct kvm_vcpu * vcpu,enum kvm_bus bus_idx,gpa_t addr,int len,void * val)5531e32edf4fSNikolay Nikolaev int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5532e93f8a0fSMarcelo Tosatti 		    int len, void *val)
5533bda9020eSMichael S. Tsirkin {
5534126a5af5SCornelia Huck 	struct kvm_io_bus *bus;
5535126a5af5SCornelia Huck 	struct kvm_io_range range;
5536126a5af5SCornelia Huck 	int r;
5537126a5af5SCornelia Huck 
5538126a5af5SCornelia Huck 	range = (struct kvm_io_range) {
5539126a5af5SCornelia Huck 		.addr = addr,
5540126a5af5SCornelia Huck 		.len = len,
5541126a5af5SCornelia Huck 	};
5542126a5af5SCornelia Huck 
5543e32edf4fSNikolay Nikolaev 	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
554490db1043SDavid Hildenbrand 	if (!bus)
554590db1043SDavid Hildenbrand 		return -ENOMEM;
5546e32edf4fSNikolay Nikolaev 	r = __kvm_io_bus_read(vcpu, bus, &range, val);
5547126a5af5SCornelia Huck 	return r < 0 ? r : 0;
5548126a5af5SCornelia Huck }
5549126a5af5SCornelia Huck 
555079fac95eSMarcelo Tosatti /* Caller must hold slots_lock. */
kvm_io_bus_register_dev(struct kvm * kvm,enum kvm_bus bus_idx,gpa_t addr,int len,struct kvm_io_device * dev)5551743eeb0bSSasha Levin int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
5552743eeb0bSSasha Levin 			    int len, struct kvm_io_device *dev)
5553090b7affSGregory Haskins {
5554d4c67a7aSGal Hammer 	int i;
5555e93f8a0fSMarcelo Tosatti 	struct kvm_io_bus *new_bus, *bus;
5556d4c67a7aSGal Hammer 	struct kvm_io_range range;
5557090b7affSGregory Haskins 
55584a12f951SChristian Borntraeger 	bus = kvm_get_bus(kvm, bus_idx);
555990db1043SDavid Hildenbrand 	if (!bus)
556090db1043SDavid Hildenbrand 		return -ENOMEM;
556190db1043SDavid Hildenbrand 
55626ea34c9bSAmos Kong 	/* exclude ioeventfd which is limited by maximum fd */
55636ea34c9bSAmos Kong 	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
5564090b7affSGregory Haskins 		return -ENOSPC;
5565090b7affSGregory Haskins 
556690952cd3SGustavo A. R. Silva 	new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
5567b12ce36aSBen Gardon 			  GFP_KERNEL_ACCOUNT);
5568e93f8a0fSMarcelo Tosatti 	if (!new_bus)
5569e93f8a0fSMarcelo Tosatti 		return -ENOMEM;
5570d4c67a7aSGal Hammer 
5571d4c67a7aSGal Hammer 	range = (struct kvm_io_range) {
5572d4c67a7aSGal Hammer 		.addr = addr,
5573d4c67a7aSGal Hammer 		.len = len,
5574d4c67a7aSGal Hammer 		.dev = dev,
5575d4c67a7aSGal Hammer 	};
5576d4c67a7aSGal Hammer 
5577d4c67a7aSGal Hammer 	for (i = 0; i < bus->dev_count; i++)
5578d4c67a7aSGal Hammer 		if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
5579d4c67a7aSGal Hammer 			break;
5580d4c67a7aSGal Hammer 
5581d4c67a7aSGal Hammer 	memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
5582d4c67a7aSGal Hammer 	new_bus->dev_count++;
5583d4c67a7aSGal Hammer 	new_bus->range[i] = range;
5584d4c67a7aSGal Hammer 	memcpy(new_bus->range + i + 1, bus->range + i,
5585d4c67a7aSGal Hammer 		(bus->dev_count - i) * sizeof(struct kvm_io_range));
5586e93f8a0fSMarcelo Tosatti 	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5587e93f8a0fSMarcelo Tosatti 	synchronize_srcu_expedited(&kvm->srcu);
5588e93f8a0fSMarcelo Tosatti 	kfree(bus);
5589090b7affSGregory Haskins 
5590090b7affSGregory Haskins 	return 0;
5591090b7affSGregory Haskins }
5592090b7affSGregory Haskins 
kvm_io_bus_unregister_dev(struct kvm * kvm,enum kvm_bus bus_idx,struct kvm_io_device * dev)55935d3c4c79SSean Christopherson int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
55946c474694SMichael S. Tsirkin 			      struct kvm_io_device *dev)
55956c474694SMichael S. Tsirkin {
55965ea5ca3cSWei Wang 	int i;
5597e93f8a0fSMarcelo Tosatti 	struct kvm_io_bus *new_bus, *bus;
55986c474694SMichael S. Tsirkin 
55997c896d37SSean Christopherson 	lockdep_assert_held(&kvm->slots_lock);
56007c896d37SSean Christopherson 
56014a12f951SChristian Borntraeger 	bus = kvm_get_bus(kvm, bus_idx);
5602df630b8cSPeter Xu 	if (!bus)
56035d3c4c79SSean Christopherson 		return 0;
5604df630b8cSPeter Xu 
56057c896d37SSean Christopherson 	for (i = 0; i < bus->dev_count; i++) {
5606a1300716SAmos Kong 		if (bus->range[i].dev == dev) {
5607090b7affSGregory Haskins 			break;
5608090b7affSGregory Haskins 		}
56097c896d37SSean Christopherson 	}
5610e93f8a0fSMarcelo Tosatti 
561190db1043SDavid Hildenbrand 	if (i == bus->dev_count)
56125d3c4c79SSean Christopherson 		return 0;
5613a1300716SAmos Kong 
561490952cd3SGustavo A. R. Silva 	new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
5615b12ce36aSBen Gardon 			  GFP_KERNEL_ACCOUNT);
5616f6588660SRustam Kovhaev 	if (new_bus) {
5617871c433bSRustam Kovhaev 		memcpy(new_bus, bus, struct_size(bus, range, i));
5618a1300716SAmos Kong 		new_bus->dev_count--;
5619a1300716SAmos Kong 		memcpy(new_bus->range + i, bus->range + i + 1,
5620871c433bSRustam Kovhaev 				flex_array_size(new_bus, range, new_bus->dev_count - i));
56212ee37574SSean Christopherson 	}
56222ee37574SSean Christopherson 
56232ee37574SSean Christopherson 	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
56242ee37574SSean Christopherson 	synchronize_srcu_expedited(&kvm->srcu);
56252ee37574SSean Christopherson 
56265ea5ca3cSWei Wang 	/*
56275ea5ca3cSWei Wang 	 * If NULL bus is installed, destroy the old bus, including all the
56285ea5ca3cSWei Wang 	 * attached devices. Otherwise, destroy the caller's device only.
56295ea5ca3cSWei Wang 	 */
56302ee37574SSean Christopherson 	if (!new_bus) {
5631f6588660SRustam Kovhaev 		pr_err("kvm: failed to shrink bus, removing it completely\n");
56325ea5ca3cSWei Wang 		kvm_io_bus_destroy(bus);
56335ea5ca3cSWei Wang 		return -ENOMEM;
5634f6588660SRustam Kovhaev 	}
5635e93f8a0fSMarcelo Tosatti 
56365ea5ca3cSWei Wang 	kvm_iodevice_destructor(dev);
5637e93f8a0fSMarcelo Tosatti 	kfree(bus);
56385ea5ca3cSWei Wang 	return 0;
56390fce5623SAvi Kivity }
56400fce5623SAvi Kivity 
kvm_io_bus_get_dev(struct kvm * kvm,enum kvm_bus bus_idx,gpa_t addr)56418a39d006SAndre Przywara struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
56428a39d006SAndre Przywara 					 gpa_t addr)
56438a39d006SAndre Przywara {
56448a39d006SAndre Przywara 	struct kvm_io_bus *bus;
56458a39d006SAndre Przywara 	int dev_idx, srcu_idx;
56468a39d006SAndre Przywara 	struct kvm_io_device *iodev = NULL;
56478a39d006SAndre Przywara 
56488a39d006SAndre Przywara 	srcu_idx = srcu_read_lock(&kvm->srcu);
56498a39d006SAndre Przywara 
56508a39d006SAndre Przywara 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
565190db1043SDavid Hildenbrand 	if (!bus)
565290db1043SDavid Hildenbrand 		goto out_unlock;
56538a39d006SAndre Przywara 
56548a39d006SAndre Przywara 	dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
56558a39d006SAndre Przywara 	if (dev_idx < 0)
56568a39d006SAndre Przywara 		goto out_unlock;
56578a39d006SAndre Przywara 
56588a39d006SAndre Przywara 	iodev = bus->range[dev_idx].dev;
56598a39d006SAndre Przywara 
56608a39d006SAndre Przywara out_unlock:
56618a39d006SAndre Przywara 	srcu_read_unlock(&kvm->srcu, srcu_idx);
56628a39d006SAndre Przywara 
56638a39d006SAndre Przywara 	return iodev;
56648a39d006SAndre Przywara }
56658a39d006SAndre Przywara EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
56668a39d006SAndre Przywara 
kvm_debugfs_open(struct inode * inode,struct file * file,int (* get)(void *,u64 *),int (* set)(void *,u64),const char * fmt)5667536a6f88SJanosch Frank static int kvm_debugfs_open(struct inode *inode, struct file *file,
5668536a6f88SJanosch Frank 			   int (*get)(void *, u64 *), int (*set)(void *, u64),
5669536a6f88SJanosch Frank 			   const char *fmt)
5670536a6f88SJanosch Frank {
5671180418e2SHou Wenlong 	int ret;
567214aa40a1SLi kunyu 	struct kvm_stat_data *stat_data = inode->i_private;
5673536a6f88SJanosch Frank 
5674605c7130SPeter Xu 	/*
5675605c7130SPeter Xu 	 * The debugfs files are a reference to the kvm struct which
5676605c7130SPeter Xu         * is still valid when kvm_destroy_vm is called.  kvm_get_kvm_safe
5677605c7130SPeter Xu         * avoids the race between open and the removal of the debugfs directory.
5678536a6f88SJanosch Frank 	 */
5679605c7130SPeter Xu 	if (!kvm_get_kvm_safe(stat_data->kvm))
5680536a6f88SJanosch Frank 		return -ENOENT;
5681536a6f88SJanosch Frank 
5682180418e2SHou Wenlong 	ret = simple_attr_open(inode, file, get,
5683bc9e9e67SJing Zhang 			       kvm_stats_debugfs_mode(stat_data->desc) & 0222
5684180418e2SHou Wenlong 			       ? set : NULL, fmt);
5685180418e2SHou Wenlong 	if (ret)
5686536a6f88SJanosch Frank 		kvm_put_kvm(stat_data->kvm);
5687536a6f88SJanosch Frank 
5688180418e2SHou Wenlong 	return ret;
5689536a6f88SJanosch Frank }
5690536a6f88SJanosch Frank 
kvm_debugfs_release(struct inode * inode,struct file * file)5691536a6f88SJanosch Frank static int kvm_debugfs_release(struct inode *inode, struct file *file)
5692536a6f88SJanosch Frank {
569314aa40a1SLi kunyu 	struct kvm_stat_data *stat_data = inode->i_private;
5694536a6f88SJanosch Frank 
5695536a6f88SJanosch Frank 	simple_attr_release(inode, file);
5696536a6f88SJanosch Frank 	kvm_put_kvm(stat_data->kvm);
5697536a6f88SJanosch Frank 
5698536a6f88SJanosch Frank 	return 0;
5699536a6f88SJanosch Frank }
5700536a6f88SJanosch Frank 
kvm_get_stat_per_vm(struct kvm * kvm,size_t offset,u64 * val)570109cbcef6SMilan Pandurov static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
5702536a6f88SJanosch Frank {
5703bc9e9e67SJing Zhang 	*val = *(u64 *)((void *)(&kvm->stat) + offset);
5704536a6f88SJanosch Frank 
5705536a6f88SJanosch Frank 	return 0;
5706536a6f88SJanosch Frank }
5707536a6f88SJanosch Frank 
kvm_clear_stat_per_vm(struct kvm * kvm,size_t offset)570809cbcef6SMilan Pandurov static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
5709ce35ef27SSuraj Jitindar Singh {
5710bc9e9e67SJing Zhang 	*(u64 *)((void *)(&kvm->stat) + offset) = 0;
5711ce35ef27SSuraj Jitindar Singh 
5712ce35ef27SSuraj Jitindar Singh 	return 0;
5713ce35ef27SSuraj Jitindar Singh }
5714ce35ef27SSuraj Jitindar Singh 
kvm_get_stat_per_vcpu(struct kvm * kvm,size_t offset,u64 * val)571509cbcef6SMilan Pandurov static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
5716536a6f88SJanosch Frank {
571746808a4cSMarc Zyngier 	unsigned long i;
5718536a6f88SJanosch Frank 	struct kvm_vcpu *vcpu;
5719536a6f88SJanosch Frank 
5720536a6f88SJanosch Frank 	*val = 0;
5721536a6f88SJanosch Frank 
572209cbcef6SMilan Pandurov 	kvm_for_each_vcpu(i, vcpu, kvm)
5723bc9e9e67SJing Zhang 		*val += *(u64 *)((void *)(&vcpu->stat) + offset);
5724536a6f88SJanosch Frank 
5725536a6f88SJanosch Frank 	return 0;
5726536a6f88SJanosch Frank }
5727536a6f88SJanosch Frank 
kvm_clear_stat_per_vcpu(struct kvm * kvm,size_t offset)572809cbcef6SMilan Pandurov static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
5729ce35ef27SSuraj Jitindar Singh {
573046808a4cSMarc Zyngier 	unsigned long i;
5731ce35ef27SSuraj Jitindar Singh 	struct kvm_vcpu *vcpu;
5732ce35ef27SSuraj Jitindar Singh 
573309cbcef6SMilan Pandurov 	kvm_for_each_vcpu(i, vcpu, kvm)
5734bc9e9e67SJing Zhang 		*(u64 *)((void *)(&vcpu->stat) + offset) = 0;
573509cbcef6SMilan Pandurov 
573609cbcef6SMilan Pandurov 	return 0;
573709cbcef6SMilan Pandurov }
573809cbcef6SMilan Pandurov 
kvm_stat_data_get(void * data,u64 * val)573909cbcef6SMilan Pandurov static int kvm_stat_data_get(void *data, u64 *val)
574009cbcef6SMilan Pandurov {
574109cbcef6SMilan Pandurov 	int r = -EFAULT;
574214aa40a1SLi kunyu 	struct kvm_stat_data *stat_data = data;
574309cbcef6SMilan Pandurov 
5744bc9e9e67SJing Zhang 	switch (stat_data->kind) {
574509cbcef6SMilan Pandurov 	case KVM_STAT_VM:
574609cbcef6SMilan Pandurov 		r = kvm_get_stat_per_vm(stat_data->kvm,
5747bc9e9e67SJing Zhang 					stat_data->desc->desc.offset, val);
574809cbcef6SMilan Pandurov 		break;
574909cbcef6SMilan Pandurov 	case KVM_STAT_VCPU:
575009cbcef6SMilan Pandurov 		r = kvm_get_stat_per_vcpu(stat_data->kvm,
5751bc9e9e67SJing Zhang 					  stat_data->desc->desc.offset, val);
575209cbcef6SMilan Pandurov 		break;
575309cbcef6SMilan Pandurov 	}
575409cbcef6SMilan Pandurov 
575509cbcef6SMilan Pandurov 	return r;
575609cbcef6SMilan Pandurov }
575709cbcef6SMilan Pandurov 
kvm_stat_data_clear(void * data,u64 val)575809cbcef6SMilan Pandurov static int kvm_stat_data_clear(void *data, u64 val)
575909cbcef6SMilan Pandurov {
576009cbcef6SMilan Pandurov 	int r = -EFAULT;
576114aa40a1SLi kunyu 	struct kvm_stat_data *stat_data = data;
576209cbcef6SMilan Pandurov 
5763ce35ef27SSuraj Jitindar Singh 	if (val)
5764ce35ef27SSuraj Jitindar Singh 		return -EINVAL;
5765ce35ef27SSuraj Jitindar Singh 
5766bc9e9e67SJing Zhang 	switch (stat_data->kind) {
576709cbcef6SMilan Pandurov 	case KVM_STAT_VM:
576809cbcef6SMilan Pandurov 		r = kvm_clear_stat_per_vm(stat_data->kvm,
5769bc9e9e67SJing Zhang 					  stat_data->desc->desc.offset);
577009cbcef6SMilan Pandurov 		break;
577109cbcef6SMilan Pandurov 	case KVM_STAT_VCPU:
577209cbcef6SMilan Pandurov 		r = kvm_clear_stat_per_vcpu(stat_data->kvm,
5773bc9e9e67SJing Zhang 					    stat_data->desc->desc.offset);
577409cbcef6SMilan Pandurov 		break;
5775ce35ef27SSuraj Jitindar Singh 	}
5776ce35ef27SSuraj Jitindar Singh 
577709cbcef6SMilan Pandurov 	return r;
577809cbcef6SMilan Pandurov }
577909cbcef6SMilan Pandurov 
kvm_stat_data_open(struct inode * inode,struct file * file)578009cbcef6SMilan Pandurov static int kvm_stat_data_open(struct inode *inode, struct file *file)
5781536a6f88SJanosch Frank {
5782536a6f88SJanosch Frank 	__simple_attr_check_format("%llu\n", 0ull);
578309cbcef6SMilan Pandurov 	return kvm_debugfs_open(inode, file, kvm_stat_data_get,
578409cbcef6SMilan Pandurov 				kvm_stat_data_clear, "%llu\n");
5785536a6f88SJanosch Frank }
5786536a6f88SJanosch Frank 
578709cbcef6SMilan Pandurov static const struct file_operations stat_fops_per_vm = {
5788536a6f88SJanosch Frank 	.owner = THIS_MODULE,
578909cbcef6SMilan Pandurov 	.open = kvm_stat_data_open,
5790536a6f88SJanosch Frank 	.release = kvm_debugfs_release,
5791536a6f88SJanosch Frank 	.read = simple_attr_read,
5792536a6f88SJanosch Frank 	.write = simple_attr_write,
57933bed8888SGeliang Tang 	.llseek = no_llseek,
5794536a6f88SJanosch Frank };
5795536a6f88SJanosch Frank 
vm_stat_get(void * _offset,u64 * val)57968b88b099SChristoph Hellwig static int vm_stat_get(void *_offset, u64 *val)
57970fce5623SAvi Kivity {
57980fce5623SAvi Kivity 	unsigned offset = (long)_offset;
57990fce5623SAvi Kivity 	struct kvm *kvm;
5800536a6f88SJanosch Frank 	u64 tmp_val;
58010fce5623SAvi Kivity 
58028b88b099SChristoph Hellwig 	*val = 0;
58030d9ce162SJunaid Shahid 	mutex_lock(&kvm_lock);
5804536a6f88SJanosch Frank 	list_for_each_entry(kvm, &vm_list, vm_list) {
580509cbcef6SMilan Pandurov 		kvm_get_stat_per_vm(kvm, offset, &tmp_val);
5806536a6f88SJanosch Frank 		*val += tmp_val;
5807536a6f88SJanosch Frank 	}
58080d9ce162SJunaid Shahid 	mutex_unlock(&kvm_lock);
58098b88b099SChristoph Hellwig 	return 0;
58100fce5623SAvi Kivity }
58110fce5623SAvi Kivity 
vm_stat_clear(void * _offset,u64 val)5812ce35ef27SSuraj Jitindar Singh static int vm_stat_clear(void *_offset, u64 val)
5813ce35ef27SSuraj Jitindar Singh {
5814ce35ef27SSuraj Jitindar Singh 	unsigned offset = (long)_offset;
5815ce35ef27SSuraj Jitindar Singh 	struct kvm *kvm;
5816ce35ef27SSuraj Jitindar Singh 
5817ce35ef27SSuraj Jitindar Singh 	if (val)
5818ce35ef27SSuraj Jitindar Singh 		return -EINVAL;
5819ce35ef27SSuraj Jitindar Singh 
58200d9ce162SJunaid Shahid 	mutex_lock(&kvm_lock);
5821ce35ef27SSuraj Jitindar Singh 	list_for_each_entry(kvm, &vm_list, vm_list) {
582209cbcef6SMilan Pandurov 		kvm_clear_stat_per_vm(kvm, offset);
5823ce35ef27SSuraj Jitindar Singh 	}
58240d9ce162SJunaid Shahid 	mutex_unlock(&kvm_lock);
5825ce35ef27SSuraj Jitindar Singh 
5826ce35ef27SSuraj Jitindar Singh 	return 0;
5827ce35ef27SSuraj Jitindar Singh }
5828ce35ef27SSuraj Jitindar Singh 
5829ce35ef27SSuraj Jitindar Singh DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
5830bc9e9e67SJing Zhang DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
58310fce5623SAvi Kivity 
vcpu_stat_get(void * _offset,u64 * val)58328b88b099SChristoph Hellwig static int vcpu_stat_get(void *_offset, u64 *val)
58330fce5623SAvi Kivity {
58340fce5623SAvi Kivity 	unsigned offset = (long)_offset;
58350fce5623SAvi Kivity 	struct kvm *kvm;
5836536a6f88SJanosch Frank 	u64 tmp_val;
58370fce5623SAvi Kivity 
58388b88b099SChristoph Hellwig 	*val = 0;
58390d9ce162SJunaid Shahid 	mutex_lock(&kvm_lock);
5840536a6f88SJanosch Frank 	list_for_each_entry(kvm, &vm_list, vm_list) {
584109cbcef6SMilan Pandurov 		kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
5842536a6f88SJanosch Frank 		*val += tmp_val;
5843536a6f88SJanosch Frank 	}
58440d9ce162SJunaid Shahid 	mutex_unlock(&kvm_lock);
58458b88b099SChristoph Hellwig 	return 0;
58460fce5623SAvi Kivity }
58470fce5623SAvi Kivity 
vcpu_stat_clear(void * _offset,u64 val)5848ce35ef27SSuraj Jitindar Singh static int vcpu_stat_clear(void *_offset, u64 val)
5849ce35ef27SSuraj Jitindar Singh {
5850ce35ef27SSuraj Jitindar Singh 	unsigned offset = (long)_offset;
5851ce35ef27SSuraj Jitindar Singh 	struct kvm *kvm;
5852ce35ef27SSuraj Jitindar Singh 
5853ce35ef27SSuraj Jitindar Singh 	if (val)
5854ce35ef27SSuraj Jitindar Singh 		return -EINVAL;
5855ce35ef27SSuraj Jitindar Singh 
58560d9ce162SJunaid Shahid 	mutex_lock(&kvm_lock);
5857ce35ef27SSuraj Jitindar Singh 	list_for_each_entry(kvm, &vm_list, vm_list) {
585809cbcef6SMilan Pandurov 		kvm_clear_stat_per_vcpu(kvm, offset);
5859ce35ef27SSuraj Jitindar Singh 	}
58600d9ce162SJunaid Shahid 	mutex_unlock(&kvm_lock);
5861ce35ef27SSuraj Jitindar Singh 
5862ce35ef27SSuraj Jitindar Singh 	return 0;
5863ce35ef27SSuraj Jitindar Singh }
5864ce35ef27SSuraj Jitindar Singh 
5865ce35ef27SSuraj Jitindar Singh DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
5866ce35ef27SSuraj Jitindar Singh 			"%llu\n");
5867bc9e9e67SJing Zhang DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
58680fce5623SAvi Kivity 
kvm_uevent_notify_change(unsigned int type,struct kvm * kvm)5869286de8f6SClaudio Imbrenda static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
5870286de8f6SClaudio Imbrenda {
5871286de8f6SClaudio Imbrenda 	struct kobj_uevent_env *env;
5872286de8f6SClaudio Imbrenda 	unsigned long long created, active;
5873286de8f6SClaudio Imbrenda 
5874286de8f6SClaudio Imbrenda 	if (!kvm_dev.this_device || !kvm)
5875286de8f6SClaudio Imbrenda 		return;
5876286de8f6SClaudio Imbrenda 
58770d9ce162SJunaid Shahid 	mutex_lock(&kvm_lock);
5878286de8f6SClaudio Imbrenda 	if (type == KVM_EVENT_CREATE_VM) {
5879286de8f6SClaudio Imbrenda 		kvm_createvm_count++;
5880286de8f6SClaudio Imbrenda 		kvm_active_vms++;
5881286de8f6SClaudio Imbrenda 	} else if (type == KVM_EVENT_DESTROY_VM) {
5882286de8f6SClaudio Imbrenda 		kvm_active_vms--;
5883286de8f6SClaudio Imbrenda 	}
5884286de8f6SClaudio Imbrenda 	created = kvm_createvm_count;
5885286de8f6SClaudio Imbrenda 	active = kvm_active_vms;
58860d9ce162SJunaid Shahid 	mutex_unlock(&kvm_lock);
5887286de8f6SClaudio Imbrenda 
5888b12ce36aSBen Gardon 	env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
5889286de8f6SClaudio Imbrenda 	if (!env)
5890286de8f6SClaudio Imbrenda 		return;
5891286de8f6SClaudio Imbrenda 
5892286de8f6SClaudio Imbrenda 	add_uevent_var(env, "CREATED=%llu", created);
5893286de8f6SClaudio Imbrenda 	add_uevent_var(env, "COUNT=%llu", active);
5894286de8f6SClaudio Imbrenda 
5895fdeaf7e3SClaudio Imbrenda 	if (type == KVM_EVENT_CREATE_VM) {
5896286de8f6SClaudio Imbrenda 		add_uevent_var(env, "EVENT=create");
5897fdeaf7e3SClaudio Imbrenda 		kvm->userspace_pid = task_pid_nr(current);
5898fdeaf7e3SClaudio Imbrenda 	} else if (type == KVM_EVENT_DESTROY_VM) {
5899286de8f6SClaudio Imbrenda 		add_uevent_var(env, "EVENT=destroy");
5900fdeaf7e3SClaudio Imbrenda 	}
5901fdeaf7e3SClaudio Imbrenda 	add_uevent_var(env, "PID=%d", kvm->userspace_pid);
5902286de8f6SClaudio Imbrenda 
5903a44a4cc1SOliver Upton 	if (!IS_ERR(kvm->debugfs_dentry)) {
5904b12ce36aSBen Gardon 		char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
5905286de8f6SClaudio Imbrenda 
5906fdeaf7e3SClaudio Imbrenda 		if (p) {
5907fdeaf7e3SClaudio Imbrenda 			tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
5908fdeaf7e3SClaudio Imbrenda 			if (!IS_ERR(tmp))
5909fdeaf7e3SClaudio Imbrenda 				add_uevent_var(env, "STATS_PATH=%s", tmp);
5910fdeaf7e3SClaudio Imbrenda 			kfree(p);
5911286de8f6SClaudio Imbrenda 		}
5912286de8f6SClaudio Imbrenda 	}
5913286de8f6SClaudio Imbrenda 	/* no need for checks, since we are adding at most only 5 keys */
5914286de8f6SClaudio Imbrenda 	env->envp[env->envp_idx++] = NULL;
5915286de8f6SClaudio Imbrenda 	kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
5916286de8f6SClaudio Imbrenda 	kfree(env);
5917286de8f6SClaudio Imbrenda }
5918286de8f6SClaudio Imbrenda 
kvm_init_debug(void)5919929f45e3SGreg Kroah-Hartman static void kvm_init_debug(void)
59200fce5623SAvi Kivity {
5921bc9e9e67SJing Zhang 	const struct file_operations *fops;
5922bc9e9e67SJing Zhang 	const struct _kvm_stats_desc *pdesc;
5923bc9e9e67SJing Zhang 	int i;
59240fce5623SAvi Kivity 
592576f7c879SHollis Blanchard 	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
59264f69b680SHamo 
5927bc9e9e67SJing Zhang 	for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
5928bc9e9e67SJing Zhang 		pdesc = &kvm_vm_stats_desc[i];
5929bc9e9e67SJing Zhang 		if (kvm_stats_debugfs_mode(pdesc) & 0222)
5930bc9e9e67SJing Zhang 			fops = &vm_stat_fops;
5931bc9e9e67SJing Zhang 		else
5932bc9e9e67SJing Zhang 			fops = &vm_stat_readonly_fops;
5933bc9e9e67SJing Zhang 		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5934bc9e9e67SJing Zhang 				kvm_debugfs_dir,
5935bc9e9e67SJing Zhang 				(void *)(long)pdesc->desc.offset, fops);
5936bc9e9e67SJing Zhang 	}
5937bc9e9e67SJing Zhang 
5938bc9e9e67SJing Zhang 	for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
5939bc9e9e67SJing Zhang 		pdesc = &kvm_vcpu_stats_desc[i];
5940bc9e9e67SJing Zhang 		if (kvm_stats_debugfs_mode(pdesc) & 0222)
5941bc9e9e67SJing Zhang 			fops = &vcpu_stat_fops;
5942bc9e9e67SJing Zhang 		else
5943bc9e9e67SJing Zhang 			fops = &vcpu_stat_readonly_fops;
5944bc9e9e67SJing Zhang 		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5945bc9e9e67SJing Zhang 				kvm_debugfs_dir,
5946bc9e9e67SJing Zhang 				(void *)(long)pdesc->desc.offset, fops);
59474f69b680SHamo 	}
59480fce5623SAvi Kivity }
59490fce5623SAvi Kivity 
59500fce5623SAvi Kivity static inline
preempt_notifier_to_vcpu(struct preempt_notifier * pn)59510fce5623SAvi Kivity struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
59520fce5623SAvi Kivity {
59530fce5623SAvi Kivity 	return container_of(pn, struct kvm_vcpu, preempt_notifier);
59540fce5623SAvi Kivity }
59550fce5623SAvi Kivity 
kvm_sched_in(struct preempt_notifier * pn,int cpu)59560fce5623SAvi Kivity static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
59570fce5623SAvi Kivity {
59580fce5623SAvi Kivity 	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5959f95ef0cdSXiubo Li 
5960046ddeedSWanpeng Li 	WRITE_ONCE(vcpu->preempted, false);
5961d73eb57bSWanpeng Li 	WRITE_ONCE(vcpu->ready, false);
59620fce5623SAvi Kivity 
59637495e22bSPaolo Bonzini 	__this_cpu_write(kvm_running_vcpu, vcpu);
5964e790d9efSRadim Krčmář 	kvm_arch_sched_in(vcpu, cpu);
59650fce5623SAvi Kivity 	kvm_arch_vcpu_load(vcpu, cpu);
59660fce5623SAvi Kivity }
59670fce5623SAvi Kivity 
kvm_sched_out(struct preempt_notifier * pn,struct task_struct * next)59680fce5623SAvi Kivity static void kvm_sched_out(struct preempt_notifier *pn,
59690fce5623SAvi Kivity 			  struct task_struct *next)
59700fce5623SAvi Kivity {
59710fce5623SAvi Kivity 	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
59720fce5623SAvi Kivity 
59733ba9f93bSPeter Zijlstra 	if (current->on_rq) {
5974046ddeedSWanpeng Li 		WRITE_ONCE(vcpu->preempted, true);
5975d73eb57bSWanpeng Li 		WRITE_ONCE(vcpu->ready, true);
5976d73eb57bSWanpeng Li 	}
59770fce5623SAvi Kivity 	kvm_arch_vcpu_put(vcpu);
59787495e22bSPaolo Bonzini 	__this_cpu_write(kvm_running_vcpu, NULL);
59797495e22bSPaolo Bonzini }
59807495e22bSPaolo Bonzini 
59817495e22bSPaolo Bonzini /**
59827495e22bSPaolo Bonzini  * kvm_get_running_vcpu - get the vcpu running on the current CPU.
59831f03b2bcSMarc Zyngier  *
59841f03b2bcSMarc Zyngier  * We can disable preemption locally around accessing the per-CPU variable,
59851f03b2bcSMarc Zyngier  * and use the resolved vcpu pointer after enabling preemption again,
59861f03b2bcSMarc Zyngier  * because even if the current thread is migrated to another CPU, reading
59871f03b2bcSMarc Zyngier  * the per-CPU value later will give us the same value as we update the
59881f03b2bcSMarc Zyngier  * per-CPU variable in the preempt notifier handlers.
59897495e22bSPaolo Bonzini  */
kvm_get_running_vcpu(void)59907495e22bSPaolo Bonzini struct kvm_vcpu *kvm_get_running_vcpu(void)
59917495e22bSPaolo Bonzini {
59921f03b2bcSMarc Zyngier 	struct kvm_vcpu *vcpu;
59931f03b2bcSMarc Zyngier 
59941f03b2bcSMarc Zyngier 	preempt_disable();
59951f03b2bcSMarc Zyngier 	vcpu = __this_cpu_read(kvm_running_vcpu);
59961f03b2bcSMarc Zyngier 	preempt_enable();
59971f03b2bcSMarc Zyngier 
59981f03b2bcSMarc Zyngier 	return vcpu;
59997495e22bSPaolo Bonzini }
6000379a3c8eSWanpeng Li EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
60017495e22bSPaolo Bonzini 
60027495e22bSPaolo Bonzini /**
60037495e22bSPaolo Bonzini  * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
60047495e22bSPaolo Bonzini  */
kvm_get_running_vcpus(void)60057495e22bSPaolo Bonzini struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
60067495e22bSPaolo Bonzini {
60077495e22bSPaolo Bonzini         return &kvm_running_vcpu;
60080fce5623SAvi Kivity }
60090fce5623SAvi Kivity 
6010e1bfc245SSean Christopherson #ifdef CONFIG_GUEST_PERF_EVENTS
kvm_guest_state(void)6011e1bfc245SSean Christopherson static unsigned int kvm_guest_state(void)
6012e1bfc245SSean Christopherson {
6013e1bfc245SSean Christopherson 	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6014e1bfc245SSean Christopherson 	unsigned int state;
6015e1bfc245SSean Christopherson 
6016e1bfc245SSean Christopherson 	if (!kvm_arch_pmi_in_guest(vcpu))
6017e1bfc245SSean Christopherson 		return 0;
6018e1bfc245SSean Christopherson 
6019e1bfc245SSean Christopherson 	state = PERF_GUEST_ACTIVE;
6020e1bfc245SSean Christopherson 	if (!kvm_arch_vcpu_in_kernel(vcpu))
6021e1bfc245SSean Christopherson 		state |= PERF_GUEST_USER;
6022e1bfc245SSean Christopherson 
6023e1bfc245SSean Christopherson 	return state;
6024e1bfc245SSean Christopherson }
6025e1bfc245SSean Christopherson 
kvm_guest_get_ip(void)6026e1bfc245SSean Christopherson static unsigned long kvm_guest_get_ip(void)
6027e1bfc245SSean Christopherson {
6028e1bfc245SSean Christopherson 	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6029e1bfc245SSean Christopherson 
6030e1bfc245SSean Christopherson 	/* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
6031e1bfc245SSean Christopherson 	if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
6032e1bfc245SSean Christopherson 		return 0;
6033e1bfc245SSean Christopherson 
6034e1bfc245SSean Christopherson 	return kvm_arch_vcpu_get_ip(vcpu);
6035e1bfc245SSean Christopherson }
6036e1bfc245SSean Christopherson 
6037e1bfc245SSean Christopherson static struct perf_guest_info_callbacks kvm_guest_cbs = {
6038e1bfc245SSean Christopherson 	.state			= kvm_guest_state,
6039e1bfc245SSean Christopherson 	.get_ip			= kvm_guest_get_ip,
6040e1bfc245SSean Christopherson 	.handle_intel_pt_intr	= NULL,
6041e1bfc245SSean Christopherson };
6042e1bfc245SSean Christopherson 
kvm_register_perf_callbacks(unsigned int (* pt_intr_handler)(void))6043e1bfc245SSean Christopherson void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
6044e1bfc245SSean Christopherson {
6045e1bfc245SSean Christopherson 	kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
6046e1bfc245SSean Christopherson 	perf_register_guest_info_callbacks(&kvm_guest_cbs);
6047e1bfc245SSean Christopherson }
kvm_unregister_perf_callbacks(void)6048e1bfc245SSean Christopherson void kvm_unregister_perf_callbacks(void)
6049e1bfc245SSean Christopherson {
6050e1bfc245SSean Christopherson 	perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
6051e1bfc245SSean Christopherson }
6052e1bfc245SSean Christopherson #endif
6053e1bfc245SSean Christopherson 
kvm_init(unsigned vcpu_size,unsigned vcpu_align,struct module * module)605481a1cf9fSSean Christopherson int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
6055f257d6dcSSean Christopherson {
60560fce5623SAvi Kivity 	int r;
60570fce5623SAvi Kivity 	int cpu;
60580fce5623SAvi Kivity 
6059441f7bfaSSean Christopherson #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6060aaf12a7bSChao Gao 	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
6061aaf12a7bSChao Gao 				      kvm_online_cpu, kvm_offline_cpu);
60620fce5623SAvi Kivity 	if (r)
606337d25881SSean Christopherson 		return r;
606437d25881SSean Christopherson 
606535774a9fSSean Christopherson 	register_syscore_ops(&kvm_syscore_ops);
6066441f7bfaSSean Christopherson #endif
60670fce5623SAvi Kivity 
60680fce5623SAvi Kivity 	/* A kmem cache lets us meet the alignment requirements of fx_save. */
60690ee75beaSAvi Kivity 	if (!vcpu_align)
60700ee75beaSAvi Kivity 		vcpu_align = __alignof__(struct kvm_vcpu);
607146515736SPaolo Bonzini 	kvm_vcpu_cache =
607246515736SPaolo Bonzini 		kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
607346515736SPaolo Bonzini 					   SLAB_ACCOUNT,
607446515736SPaolo Bonzini 					   offsetof(struct kvm_vcpu, arch),
6075ce55c049SJing Zhang 					   offsetofend(struct kvm_vcpu, stats_id)
6076ce55c049SJing Zhang 					   - offsetof(struct kvm_vcpu, arch),
607746515736SPaolo Bonzini 					   NULL);
60780fce5623SAvi Kivity 	if (!kvm_vcpu_cache) {
60790fce5623SAvi Kivity 		r = -ENOMEM;
60809f1a4c00SSean Christopherson 		goto err_vcpu_cache;
60810fce5623SAvi Kivity 	}
60820fce5623SAvi Kivity 
6083baff59ccSVitaly Kuznetsov 	for_each_possible_cpu(cpu) {
6084baff59ccSVitaly Kuznetsov 		if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
6085baff59ccSVitaly Kuznetsov 					    GFP_KERNEL, cpu_to_node(cpu))) {
6086baff59ccSVitaly Kuznetsov 			r = -ENOMEM;
60879f1a4c00SSean Christopherson 			goto err_cpu_kick_mask;
6088baff59ccSVitaly Kuznetsov 		}
6089baff59ccSVitaly Kuznetsov 	}
6090baff59ccSVitaly Kuznetsov 
60915910ccf0SSean Christopherson 	r = kvm_irqfd_init();
60925910ccf0SSean Christopherson 	if (r)
60935910ccf0SSean Christopherson 		goto err_irqfd;
60945910ccf0SSean Christopherson 
6095af585b92SGleb Natapov 	r = kvm_async_pf_init();
6096af585b92SGleb Natapov 	if (r)
60975910ccf0SSean Christopherson 		goto err_async_pf;
6098af585b92SGleb Natapov 
60990fce5623SAvi Kivity 	kvm_chardev_ops.owner = module;
61000fce5623SAvi Kivity 
61010fce5623SAvi Kivity 	kvm_preempt_ops.sched_in = kvm_sched_in;
61020fce5623SAvi Kivity 	kvm_preempt_ops.sched_out = kvm_sched_out;
61030fce5623SAvi Kivity 
6104929f45e3SGreg Kroah-Hartman 	kvm_init_debug();
61050ea4ed8eSDarrick J. Wong 
61063c3c29fdSPaolo Bonzini 	r = kvm_vfio_ops_init();
61072b012812SSean Christopherson 	if (WARN_ON_ONCE(r))
61082b012812SSean Christopherson 		goto err_vfio;
61092b012812SSean Christopherson 
61102b012812SSean Christopherson 	/*
61112b012812SSean Christopherson 	 * Registration _must_ be the very last thing done, as this exposes
61122b012812SSean Christopherson 	 * /dev/kvm to userspace, i.e. all infrastructure must be setup!
61132b012812SSean Christopherson 	 */
61142b012812SSean Christopherson 	r = misc_register(&kvm_dev);
61152b012812SSean Christopherson 	if (r) {
61162b012812SSean Christopherson 		pr_err("kvm: misc device register failed\n");
61172b012812SSean Christopherson 		goto err_register;
61182b012812SSean Christopherson 	}
61193c3c29fdSPaolo Bonzini 
61200fce5623SAvi Kivity 	return 0;
61210fce5623SAvi Kivity 
61222b012812SSean Christopherson err_register:
61232b012812SSean Christopherson 	kvm_vfio_ops_exit();
61242b012812SSean Christopherson err_vfio:
6125af585b92SGleb Natapov 	kvm_async_pf_deinit();
61265910ccf0SSean Christopherson err_async_pf:
61275910ccf0SSean Christopherson 	kvm_irqfd_exit();
61285910ccf0SSean Christopherson err_irqfd:
61299f1a4c00SSean Christopherson err_cpu_kick_mask:
6130baff59ccSVitaly Kuznetsov 	for_each_possible_cpu(cpu)
6131baff59ccSVitaly Kuznetsov 		free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
61320fce5623SAvi Kivity 	kmem_cache_destroy(kvm_vcpu_cache);
61339f1a4c00SSean Christopherson err_vcpu_cache:
6134441f7bfaSSean Christopherson #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
613535774a9fSSean Christopherson 	unregister_syscore_ops(&kvm_syscore_ops);
6136aaf12a7bSChao Gao 	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
6137441f7bfaSSean Christopherson #endif
61380fce5623SAvi Kivity 	return r;
61390fce5623SAvi Kivity }
61400fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_init);
61410fce5623SAvi Kivity 
kvm_exit(void)61420fce5623SAvi Kivity void kvm_exit(void)
61430fce5623SAvi Kivity {
6144baff59ccSVitaly Kuznetsov 	int cpu;
6145baff59ccSVitaly Kuznetsov 
61462b012812SSean Christopherson 	/*
61472b012812SSean Christopherson 	 * Note, unregistering /dev/kvm doesn't strictly need to come first,
61482b012812SSean Christopherson 	 * fops_get(), a.k.a. try_module_get(), prevents acquiring references
61492b012812SSean Christopherson 	 * to KVM while the module is being stopped.
61502b012812SSean Christopherson 	 */
61510fce5623SAvi Kivity 	misc_deregister(&kvm_dev);
61522b012812SSean Christopherson 
61532b012812SSean Christopherson 	debugfs_remove_recursive(kvm_debugfs_dir);
6154baff59ccSVitaly Kuznetsov 	for_each_possible_cpu(cpu)
6155baff59ccSVitaly Kuznetsov 		free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
61560fce5623SAvi Kivity 	kmem_cache_destroy(kvm_vcpu_cache);
615773b8dc04SSean Christopherson 	kvm_vfio_ops_exit();
6158af585b92SGleb Natapov 	kvm_async_pf_deinit();
6159441f7bfaSSean Christopherson #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6160fb3600ccSRafael J. Wysocki 	unregister_syscore_ops(&kvm_syscore_ops);
6161aaf12a7bSChao Gao 	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
6162441f7bfaSSean Christopherson #endif
61635910ccf0SSean Christopherson 	kvm_irqfd_exit();
61640fce5623SAvi Kivity }
61650fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_exit);
6166c57c8046SJunaid Shahid 
6167c57c8046SJunaid Shahid struct kvm_vm_worker_thread_context {
6168c57c8046SJunaid Shahid 	struct kvm *kvm;
6169c57c8046SJunaid Shahid 	struct task_struct *parent;
6170c57c8046SJunaid Shahid 	struct completion init_done;
6171c57c8046SJunaid Shahid 	kvm_vm_thread_fn_t thread_fn;
6172c57c8046SJunaid Shahid 	uintptr_t data;
6173c57c8046SJunaid Shahid 	int err;
6174c57c8046SJunaid Shahid };
6175c57c8046SJunaid Shahid 
kvm_vm_worker_thread(void * context)6176c57c8046SJunaid Shahid static int kvm_vm_worker_thread(void *context)
6177c57c8046SJunaid Shahid {
6178c57c8046SJunaid Shahid 	/*
6179c57c8046SJunaid Shahid 	 * The init_context is allocated on the stack of the parent thread, so
6180c57c8046SJunaid Shahid 	 * we have to locally copy anything that is needed beyond initialization
6181c57c8046SJunaid Shahid 	 */
6182c57c8046SJunaid Shahid 	struct kvm_vm_worker_thread_context *init_context = context;
6183e45cce30SVipin Sharma 	struct task_struct *parent;
6184c57c8046SJunaid Shahid 	struct kvm *kvm = init_context->kvm;
6185c57c8046SJunaid Shahid 	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
6186c57c8046SJunaid Shahid 	uintptr_t data = init_context->data;
6187c57c8046SJunaid Shahid 	int err;
6188c57c8046SJunaid Shahid 
6189c57c8046SJunaid Shahid 	err = kthread_park(current);
6190c57c8046SJunaid Shahid 	/* kthread_park(current) is never supposed to return an error */
6191c57c8046SJunaid Shahid 	WARN_ON(err != 0);
6192c57c8046SJunaid Shahid 	if (err)
6193c57c8046SJunaid Shahid 		goto init_complete;
6194c57c8046SJunaid Shahid 
6195c57c8046SJunaid Shahid 	err = cgroup_attach_task_all(init_context->parent, current);
6196c57c8046SJunaid Shahid 	if (err) {
6197c57c8046SJunaid Shahid 		kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
6198c57c8046SJunaid Shahid 			__func__, err);
6199c57c8046SJunaid Shahid 		goto init_complete;
6200c57c8046SJunaid Shahid 	}
6201c57c8046SJunaid Shahid 
6202c57c8046SJunaid Shahid 	set_user_nice(current, task_nice(init_context->parent));
6203c57c8046SJunaid Shahid 
6204c57c8046SJunaid Shahid init_complete:
6205c57c8046SJunaid Shahid 	init_context->err = err;
6206c57c8046SJunaid Shahid 	complete(&init_context->init_done);
6207c57c8046SJunaid Shahid 	init_context = NULL;
6208c57c8046SJunaid Shahid 
6209c57c8046SJunaid Shahid 	if (err)
6210e45cce30SVipin Sharma 		goto out;
6211c57c8046SJunaid Shahid 
6212c57c8046SJunaid Shahid 	/* Wait to be woken up by the spawner before proceeding. */
6213c57c8046SJunaid Shahid 	kthread_parkme();
6214c57c8046SJunaid Shahid 
6215c57c8046SJunaid Shahid 	if (!kthread_should_stop())
6216c57c8046SJunaid Shahid 		err = thread_fn(kvm, data);
6217c57c8046SJunaid Shahid 
6218e45cce30SVipin Sharma out:
6219e45cce30SVipin Sharma 	/*
6220e45cce30SVipin Sharma 	 * Move kthread back to its original cgroup to prevent it lingering in
6221e45cce30SVipin Sharma 	 * the cgroup of the VM process, after the latter finishes its
6222e45cce30SVipin Sharma 	 * execution.
6223e45cce30SVipin Sharma 	 *
6224e45cce30SVipin Sharma 	 * kthread_stop() waits on the 'exited' completion condition which is
6225e45cce30SVipin Sharma 	 * set in exit_mm(), via mm_release(), in do_exit(). However, the
6226e45cce30SVipin Sharma 	 * kthread is removed from the cgroup in the cgroup_exit() which is
6227e45cce30SVipin Sharma 	 * called after the exit_mm(). This causes the kthread_stop() to return
6228e45cce30SVipin Sharma 	 * before the kthread actually quits the cgroup.
6229e45cce30SVipin Sharma 	 */
6230e45cce30SVipin Sharma 	rcu_read_lock();
6231e45cce30SVipin Sharma 	parent = rcu_dereference(current->real_parent);
6232e45cce30SVipin Sharma 	get_task_struct(parent);
6233e45cce30SVipin Sharma 	rcu_read_unlock();
6234e45cce30SVipin Sharma 	cgroup_attach_task_all(parent, current);
6235e45cce30SVipin Sharma 	put_task_struct(parent);
6236e45cce30SVipin Sharma 
6237c57c8046SJunaid Shahid 	return err;
6238c57c8046SJunaid Shahid }
6239c57c8046SJunaid Shahid 
kvm_vm_create_worker_thread(struct kvm * kvm,kvm_vm_thread_fn_t thread_fn,uintptr_t data,const char * name,struct task_struct ** thread_ptr)6240c57c8046SJunaid Shahid int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
6241c57c8046SJunaid Shahid 				uintptr_t data, const char *name,
6242c57c8046SJunaid Shahid 				struct task_struct **thread_ptr)
6243c57c8046SJunaid Shahid {
6244c57c8046SJunaid Shahid 	struct kvm_vm_worker_thread_context init_context = {};
6245c57c8046SJunaid Shahid 	struct task_struct *thread;
6246c57c8046SJunaid Shahid 
6247c57c8046SJunaid Shahid 	*thread_ptr = NULL;
6248c57c8046SJunaid Shahid 	init_context.kvm = kvm;
6249c57c8046SJunaid Shahid 	init_context.parent = current;
6250c57c8046SJunaid Shahid 	init_context.thread_fn = thread_fn;
6251c57c8046SJunaid Shahid 	init_context.data = data;
6252c57c8046SJunaid Shahid 	init_completion(&init_context.init_done);
6253c57c8046SJunaid Shahid 
6254c57c8046SJunaid Shahid 	thread = kthread_run(kvm_vm_worker_thread, &init_context,
6255c57c8046SJunaid Shahid 			     "%s-%d", name, task_pid_nr(current));
6256c57c8046SJunaid Shahid 	if (IS_ERR(thread))
6257c57c8046SJunaid Shahid 		return PTR_ERR(thread);
6258c57c8046SJunaid Shahid 
6259c57c8046SJunaid Shahid 	/* kthread_run is never supposed to return NULL */
6260c57c8046SJunaid Shahid 	WARN_ON(thread == NULL);
6261c57c8046SJunaid Shahid 
6262c57c8046SJunaid Shahid 	wait_for_completion(&init_context.init_done);
6263c57c8046SJunaid Shahid 
6264c57c8046SJunaid Shahid 	if (!init_context.err)
6265c57c8046SJunaid Shahid 		*thread_ptr = thread;
6266c57c8046SJunaid Shahid 
6267c57c8046SJunaid Shahid 	return init_context.err;
6268c57c8046SJunaid Shahid }
6269