// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */

#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/suspend.h>

#include <asm/processor.h>
#include <asm/ioctl.h>
#include <linux/uaccess.h>

#include "coalesced_mmio.h"
#include "async_pf.h"
#include "kvm_mm.h"
#include "vfio.h"

#include <trace/events/ipi.h>

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

#include <linux/kvm_dirty_ring.h>


/* Worst case buffer size needed for holding an integer. */
#define ITOA_MAX_LEN 12

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns from */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* The default (0) resets per-vcpu halt_poll_ns when shrinking. */
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);

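/*
 * Illustrative sketch (not part of the upstream source): how the three
 * parameters above are typically combined by the per-vCPU grow/shrink
 * helpers further down in this file; the exact trigger conditions and
 * clamping are omitted here.
 *
 *	grown  = max(old * halt_poll_ns_grow, halt_poll_ns_grow_start);
 *	shrunk = halt_poll_ns_shrink ? old / halt_poll_ns_shrink : 0;
 */
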
/*
 * Ordering of locks:
 *
 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_MUTEX(kvm_lock);
LIST_HEAD(vm_list);

static struct kmem_cache *kvm_vcpu_cache;

static __read_mostly struct preempt_ops kvm_preempt_ops;
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);

static const struct file_operations stat_fops_per_vm;

static struct file_operations kvm_chardev_ops;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
#define KVM_COMPAT(c)	.compat_ioctl = (c)
#else
/*
 * For architectures that don't implement a compat infrastructure,
 * adopt a double line of defense:
 * - Prevent a compat task from opening /dev/kvm.
 * - If the open was done by a 64-bit task and the KVM fd is then
 *   passed to a compat task, let the ioctls fail.
 */
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
				unsigned long arg) { return -EINVAL; }

static int kvm_no_compat_open(struct inode *inode, struct file *file)
{
	return is_compat_task() ? -ENODEV : 0;
}
#define KVM_COMPAT(c)	.compat_ioctl = kvm_no_compat_ioctl,	\
			.open = kvm_no_compat_open
#endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;

static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);

__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
{
}

bool kvm_is_zone_device_page(struct page *page)
{
	/*
	 * The metadata used by is_zone_device_page() to determine whether or
	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
	 * page_count() is zero to help detect bad usage of this helper.
	 */
	if (WARN_ON_ONCE(!page_count(page)))
		return false;

	return is_zone_device_page(page);
}

/*
 * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
 * page, NULL otherwise.  Note, the list of refcounted PG_reserved page types
 * is likely incomplete; it has been compiled purely from reports by people
 * wanting to back guests with a certain type of memory and encountering
 * issues.
 */
struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)
{
	struct page *page;

	if (!pfn_valid(pfn))
		return NULL;

	page = pfn_to_page(pfn);
	if (!PageReserved(page))
		return page;

	/* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */
	if (is_zero_pfn(pfn))
		return page;

	/*
	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
	 * perspective they are "normal" pages, albeit with slightly different
	 * usage rules.
	 */
	if (kvm_is_zone_device_page(page))
		return page;

	return NULL;
}

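/*
 * Illustrative sketch (not part of the upstream file): a hypothetical caller
 * that only touches the refcount when the pfn maps to a refcounted page.
 *
 *	struct page *page = kvm_pfn_to_refcounted_page(pfn);
 *
 *	if (page)
 *		get_page(page);		// safe: the page is known to be refcounted
 */
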
/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu = get_cpu();

	__this_cpu_write(kvm_running_vcpu, vcpu);
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}
EXPORT_SYMBOL_GPL(vcpu_load);

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	__this_cpu_write(kvm_running_vcpu, NULL);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(vcpu_put);

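/*
 * Illustrative sketch (not part of the upstream file): vcpu_load() and
 * vcpu_put() are used as a bracketing pair around code that needs the vCPU's
 * architecture state loaded on the current physical CPU, typically with
 * vcpu->mutex already held, e.g. from a vCPU ioctl handler:
 *
 *	vcpu_load(vcpu);
 *	r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &regs);	// hypothetical example body
 *	vcpu_put(vcpu);
 */
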
/* TODO: merge with kvm_arch_vcpu_should_kick */
static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
{
	int mode = kvm_vcpu_exiting_guest_mode(vcpu);

	/*
	 * We need to wait for the VCPU to reenable interrupts and get out of
	 * READING_SHADOW_PAGE_TABLES mode.
	 */
	if (req & KVM_REQUEST_WAIT)
		return mode != OUTSIDE_GUEST_MODE;

	/*
	 * Need to kick a running VCPU, but otherwise there is nothing to do.
	 */
	return mode == IN_GUEST_MODE;
}

static void ack_kick(void *_completed)
{
}

static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
{
	if (cpumask_empty(cpus))
		return false;

	smp_call_function_many(cpus, ack_kick, NULL, wait);
	return true;
}

static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
				  struct cpumask *tmp, int current_cpu)
{
	int cpu;

	if (likely(!(req & KVM_REQUEST_NO_ACTION)))
		__kvm_make_request(req, vcpu);

	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
		return;

	/*
	 * Note, the vCPU could get migrated to a different pCPU at any point
	 * after kvm_request_needs_ipi(), which could result in sending an IPI
	 * to the previous pCPU.  But, that's OK because the purpose of the IPI
	 * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
	 * satisfied if the vCPU migrates.  Entering READING_SHADOW_PAGE_TABLES
	 * after this point is also OK, as the requirement is only that KVM wait
	 * for vCPUs that were reading SPTEs _before_ any changes were
	 * finalized.  See kvm_vcpu_kick() for more details on handling requests.
	 */
	if (kvm_request_needs_ipi(vcpu, req)) {
		cpu = READ_ONCE(vcpu->cpu);
		if (cpu != -1 && cpu != current_cpu)
			__cpumask_set_cpu(cpu, tmp);
	}
}

bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
				 unsigned long *vcpu_bitmap)
{
	struct kvm_vcpu *vcpu;
	struct cpumask *cpus;
	int i, me;
	bool called;

	me = get_cpu();

	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
	cpumask_clear(cpus);

	for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
		vcpu = kvm_get_vcpu(kvm, i);
		if (!vcpu)
			continue;
		kvm_make_vcpu_request(vcpu, req, cpus, me);
	}

	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
	put_cpu();

	return called;
}

bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
				      struct kvm_vcpu *except)
{
	struct kvm_vcpu *vcpu;
	struct cpumask *cpus;
	unsigned long i;
	bool called;
	int me;

	me = get_cpu();

	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
	cpumask_clear(cpus);

	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (vcpu == except)
			continue;
		kvm_make_vcpu_request(vcpu, req, cpus, me);
	}

	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
	put_cpu();

	return called;
}

bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
	return kvm_make_all_cpus_request_except(kvm, req, NULL);
}
EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);

void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	++kvm->stat.generic.remote_tlb_flush_requests;

	/*
	 * We want to publish modifications to the page tables before reading
	 * mode.  Pairs with a memory barrier in arch-specific code.
	 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
	 *   and smp_mb in walk_shadow_page_lockless_begin/end.
	 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
	 *
	 * There is already an smp_mb__after_atomic() before
	 * kvm_make_all_cpus_request() reads vcpu->mode.  We reuse that
	 * barrier here.
	 */
	if (!kvm_arch_flush_remote_tlbs(kvm)
	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.generic.remote_tlb_flush;
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);

void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
{
	if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
		return;

	/*
	 * Fall back to flushing the entire TLB if the architecture's
	 * range-based TLB invalidation is unsupported or can't be performed
	 * for whatever reason.
	 */
	kvm_flush_remote_tlbs(kvm);
}

void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
				   const struct kvm_memory_slot *memslot)
{
	/*
	 * All current use cases for flushing the TLBs for a specific memslot
	 * are related to dirty logging, and many do the TLB flush out of
	 * mmu_lock.  The interaction between the various operations on the
	 * memslot must be serialized by slots_lock to ensure the TLB flush
	 * from one operation is observed by any other operation on the same
	 * memslot.
	 */
	lockdep_assert_held(&kvm->slots_lock);
	kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
}
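
/*
 * Illustrative sketch (not part of the upstream file): a hypothetical
 * dirty-logging path flushing a single memslot while holding slots_lock,
 * which is what the lockdep assertion above enforces.
 *
 *	mutex_lock(&kvm->slots_lock);
 *	...					// e.g. collect and clear dirty bits for the slot
 *	kvm_flush_remote_tlbs_memslot(kvm, memslot);
 *	mutex_unlock(&kvm->slots_lock);
 */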

static void kvm_flush_shadow_all(struct kvm *kvm)
{
	kvm_arch_flush_shadow_all(kvm);
	kvm_arch_guest_memory_reclaimed(kvm);
}

#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
					       gfp_t gfp_flags)
{
	gfp_flags |= mc->gfp_zero;

	if (mc->kmem_cache)
		return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
	else
		return (void *)__get_free_page(gfp_flags);
}

int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
{
	gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
	void *obj;

	if (mc->nobjs >= min)
		return 0;

	if (unlikely(!mc->objects)) {
		if (WARN_ON_ONCE(!capacity))
			return -EIO;

		mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
		if (!mc->objects)
			return -ENOMEM;

		mc->capacity = capacity;
	}

	/* It is illegal to request a different capacity across topups. */
	if (WARN_ON_ONCE(mc->capacity != capacity))
		return -EIO;

	while (mc->nobjs < mc->capacity) {
		obj = mmu_memory_cache_alloc_obj(mc, gfp);
		if (!obj)
			return mc->nobjs >= min ? 0 : -ENOMEM;
		mc->objects[mc->nobjs++] = obj;
	}
	return 0;
}

int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
{
	return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
}

int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
{
	return mc->nobjs;
}

void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs) {
		if (mc->kmem_cache)
			kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
		else
			free_page((unsigned long)mc->objects[--mc->nobjs]);
	}

	kvfree(mc->objects);

	mc->objects = NULL;
	mc->capacity = 0;
}

void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	if (WARN_ON(!mc->nobjs))
		p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
	else
		p = mc->objects[--mc->nobjs];
	BUG_ON(!p);
	return p;
}
#endif

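/*
 * Illustrative sketch (not part of the upstream file): the intended usage
 * pattern for the MMU memory caches above.  Allocations that may sleep are
 * done up front via the topup helper, so that kvm_mmu_memory_cache_alloc()
 * can later be called while holding mmu_lock without sleeping allocations.
 *
 *	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_cache, min_objs);
 *	if (r)
 *		return r;
 *
 *	KVM_MMU_LOCK(kvm);
 *	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
 *	...
 *	KVM_MMU_UNLOCK(kvm);
 *
 * (vcpu->arch.mmu_page_cache and min_objs are hypothetical, arch-specific
 * names used only for illustration.)
 */
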
static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	vcpu->pid = NULL;
#ifndef __KVM_HAVE_ARCH_WQP
	rcuwait_init(&vcpu->wait);
#endif
	kvm_async_pf_vcpu_init(vcpu);

	kvm_vcpu_set_in_spin_loop(vcpu, false);
	kvm_vcpu_set_dy_eligible(vcpu, false);
	vcpu->preempted = false;
	vcpu->ready = false;
	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
	vcpu->last_used_slot = NULL;

	/* Fill the stats id string for the vcpu */
	snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
		 task_pid_nr(current), id);
}

static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_arch_vcpu_destroy(vcpu);
	kvm_dirty_ring_free(&vcpu->dirty_ring);

	/*
	 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
	 * the vcpu->pid pointer, and at destruction time all file descriptors
	 * are already gone.
	 */
	put_pid(rcu_dereference_protected(vcpu->pid, 1));

	free_page((unsigned long)vcpu->run);
	kmem_cache_free(kvm_vcpu_cache, vcpu);
}

void kvm_destroy_vcpus(struct kvm *kvm)
{
	unsigned long i;
	struct kvm_vcpu *vcpu;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		kvm_vcpu_destroy(vcpu);
		xa_erase(&kvm->vcpu_array, i);
	}

	atomic_set(&kvm->online_vcpus, 0);
}
EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);

typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
			     unsigned long end);

typedef void (*on_unlock_fn_t)(struct kvm *kvm);

struct kvm_hva_range {
	unsigned long start;
	unsigned long end;
	union kvm_mmu_notifier_arg arg;
	hva_handler_t handler;
	on_lock_fn_t on_lock;
	on_unlock_fn_t on_unlock;
	bool flush_on_ret;
	bool may_block;
};

/*
 * Use a dedicated stub instead of NULL to indicate that there is no callback
 * function/handler.  The compiler technically can't guarantee that a real
 * function will have a non-zero address, and so it will generate code to
 * check for !NULL, whereas comparing against a stub will be elided at compile
 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
 */
static void kvm_null_fn(void)
{

}
#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)

static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG;

/* Iterate over each memslot intersecting [start, last] (inclusive) range */
#define kvm_for_each_memslot_in_hva_range(node, slots, start, last)	     \
	for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
	     node;							     \
	     node = interval_tree_iter_next(node, start, last))	     \

static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
						  const struct kvm_hva_range *range)
{
	bool ret = false, locked = false;
	struct kvm_gfn_range gfn_range;
	struct kvm_memory_slot *slot;
	struct kvm_memslots *slots;
	int i, idx;

	if (WARN_ON_ONCE(range->end <= range->start))
		return 0;

	/* A null handler is allowed if and only if on_lock() is provided. */
	if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
			 IS_KVM_NULL_FN(range->handler)))
		return 0;

	idx = srcu_read_lock(&kvm->srcu);

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		struct interval_tree_node *node;

		slots = __kvm_memslots(kvm, i);
		kvm_for_each_memslot_in_hva_range(node, slots,
						  range->start, range->end - 1) {
			unsigned long hva_start, hva_end;

			slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
			hva_start = max(range->start, slot->userspace_addr);
			hva_end = min(range->end, slot->userspace_addr +
						  (slot->npages << PAGE_SHIFT));

			/*
			 * To optimize for the likely case where the address
			 * range is covered by zero or one memslots, don't
			 * bother making these conditional (to avoid writes on
			 * the second or later invocation of the handler).
			 */
			gfn_range.arg = range->arg;
			gfn_range.may_block = range->may_block;

			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
			gfn_range.slot = slot;

			if (!locked) {
				locked = true;
				KVM_MMU_LOCK(kvm);
				if (!IS_KVM_NULL_FN(range->on_lock))
					range->on_lock(kvm, range->start, range->end);
				if (IS_KVM_NULL_FN(range->handler))
					break;
			}
			ret |= range->handler(kvm, &gfn_range);
		}
	}

	if (range->flush_on_ret && ret)
		kvm_flush_remote_tlbs(kvm);

	if (locked) {
		KVM_MMU_UNLOCK(kvm);
		if (!IS_KVM_NULL_FN(range->on_unlock))
			range->on_unlock(kvm);
	}

	srcu_read_unlock(&kvm->srcu, idx);

	/* The notifiers are averse to booleans. :-( */
	return (int)ret;
}

static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
						unsigned long start,
						unsigned long end,
						union kvm_mmu_notifier_arg arg,
						hva_handler_t handler)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range range = {
		.start		= start,
		.end		= end,
		.arg		= arg,
		.handler	= handler,
		.on_lock	= (void *)kvm_null_fn,
		.on_unlock	= (void *)kvm_null_fn,
		.flush_on_ret	= true,
		.may_block	= false,
	};

	return __kvm_handle_hva_range(kvm, &range);
}

static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
							 unsigned long start,
							 unsigned long end,
							 hva_handler_t handler)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range range = {
		.start		= start,
		.end		= end,
		.handler	= handler,
		.on_lock	= (void *)kvm_null_fn,
		.on_unlock	= (void *)kvm_null_fn,
		.flush_on_ret	= false,
		.may_block	= false,
	};

	return __kvm_handle_hva_range(kvm, &range);
}

static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	/*
	 * Skipping invalid memslots is correct if and only if change_pte() is
	 * surrounded by invalidate_range_{start,end}(), which is currently
	 * guaranteed by the primary MMU.  If that ever changes, KVM needs to
	 * unmap the memslot instead of skipping the memslot to ensure that KVM
	 * doesn't hold references to the old PFN.
	 */
	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));

	if (range->slot->flags & KVM_MEMSLOT_INVALID)
		return false;

	return kvm_set_spte_gfn(kvm, range);
}

static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long address,
					pte_t pte)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const union kvm_mmu_notifier_arg arg = { .pte = pte };

	trace_kvm_set_spte_hva(address);

	/*
	 * .change_pte() must be surrounded by .invalidate_range_{start,end}().
	 * If mmu_invalidate_in_progress is zero, then no in-progress
	 * invalidations, including this one, found a relevant memslot at
	 * start(); rechecking memslots here is unnecessary.  Note, a false
	 * positive (count elevated by a different invalidation) is sub-optimal
	 * but functionally ok.
	 */
	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
	if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
		return;

	kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn);
}

void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_invalidate_in_progress++;
	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
		kvm->mmu_invalidate_range_start = start;
		kvm->mmu_invalidate_range_end = end;
	} else {
		/*
		 * Fully tracking multiple concurrent ranges has diminishing
		 * returns.  Keep things simple and just find the minimal range
		 * which includes the current and new ranges.  As there won't be
		 * enough information to subtract a range after its invalidate
		 * completes, any ranges invalidated concurrently will
		 * accumulate and persist until all outstanding invalidates
		 * complete.
		 */
		kvm->mmu_invalidate_range_start =
			min(kvm->mmu_invalidate_range_start, start);
		kvm->mmu_invalidate_range_end =
			max(kvm->mmu_invalidate_range_end, end);
	}
}
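
/*
 * Worked example (illustration only, not part of the upstream file): if one
 * invalidation of HVAs [0x1000, 0x3000) is still in progress when a second
 * one for [0x2000, 0x5000) begins, the tracked range becomes the covering
 * range [0x1000, 0x5000) and stays that wide until
 * mmu_invalidate_in_progress drops back to zero.
 */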

static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range hva_range = {
		.start		= range->start,
		.end		= range->end,
		.handler	= kvm_unmap_gfn_range,
		.on_lock	= kvm_mmu_invalidate_begin,
		.on_unlock	= kvm_arch_guest_memory_reclaimed,
		.flush_on_ret	= true,
		.may_block	= mmu_notifier_range_blockable(range),
	};

	trace_kvm_unmap_hva_range(range->start, range->end);

	/*
	 * Prevent memslot modification between range_start() and range_end()
	 * so that conditionally locking provides the same result in both
	 * functions.  Without that guarantee, the mmu_invalidate_in_progress
	 * adjustments will be imbalanced.
	 *
	 * Pairs with the decrement in range_end().
	 */
	spin_lock(&kvm->mn_invalidate_lock);
	kvm->mn_active_invalidate_count++;
	spin_unlock(&kvm->mn_invalidate_lock);

	/*
	 * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
	 * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
	 * each cache's lock.  There are relatively few caches in existence at
	 * any given time, and the caches themselves can check for hva overlap,
	 * i.e. don't need to rely on memslot overlap checks for performance.
	 * Because this runs without holding mmu_lock, the pfn caches must use
	 * mn_active_invalidate_count (see above) instead of
	 * mmu_invalidate_in_progress.
	 */
	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
					  hva_range.may_block);

	__kvm_handle_hva_range(kvm, &hva_range);

	return 0;
}

void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
			    unsigned long end)
{
	/*
	 * This sequence increase notifies the KVM page fault handler that the
	 * page which is going to be mapped in the SPTE could have been freed,
	 * forcing the fault to be retried.
	 */
	kvm->mmu_invalidate_seq++;
	smp_wmb();
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, which is ensured by the smp_wmb above
	 * in conjunction with the smp_rmb in mmu_invalidate_retry().
	 */
	kvm->mmu_invalidate_in_progress--;
}
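
/*
 * Illustrative sketch (not part of the upstream file): how a page fault
 * handler typically consumes mmu_invalidate_seq.  It samples the sequence
 * before resolving the pfn and, under mmu_lock, bails out and retries if an
 * invalidation raced with it (see mmu_invalidate_retry() in kvm_host.h).
 *
 *	mmu_seq = kvm->mmu_invalidate_seq;
 *	smp_rmb();
 *	pfn = ...;				// translate hva -> pfn, may sleep
 *
 *	KVM_MMU_LOCK(kvm);
 *	if (mmu_invalidate_retry(kvm, mmu_seq))
 *		goto out_unlock;		// the pfn may be stale, retry the fault
 *	...					// install the mapping
 *	KVM_MMU_UNLOCK(kvm);
 */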
833f922bd9bSSean Christopherson
kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier * mn,const struct mmu_notifier_range * range)834f922bd9bSSean Christopherson static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
835f922bd9bSSean Christopherson const struct mmu_notifier_range *range)
836f922bd9bSSean Christopherson {
837f922bd9bSSean Christopherson struct kvm *kvm = mmu_notifier_to_kvm(mn);
838f922bd9bSSean Christopherson const struct kvm_hva_range hva_range = {
839f922bd9bSSean Christopherson .start = range->start,
840f922bd9bSSean Christopherson .end = range->end,
841f922bd9bSSean Christopherson .handler = (void *)kvm_null_fn,
84220ec3ebdSChao Peng .on_lock = kvm_mmu_invalidate_end,
843683412ccSMingwei Zhang .on_unlock = (void *)kvm_null_fn,
844f922bd9bSSean Christopherson .flush_on_ret = false,
845f922bd9bSSean Christopherson .may_block = mmu_notifier_range_blockable(range),
846f922bd9bSSean Christopherson };
84752ac8b35SPaolo Bonzini bool wake;
848f922bd9bSSean Christopherson
849f922bd9bSSean Christopherson __kvm_handle_hva_range(kvm, &hva_range);
850e930bffeSAndrea Arcangeli
85152ac8b35SPaolo Bonzini /* Pairs with the increment in range_start(). */
85252ac8b35SPaolo Bonzini spin_lock(&kvm->mn_invalidate_lock);
85352ac8b35SPaolo Bonzini wake = (--kvm->mn_active_invalidate_count == 0);
85452ac8b35SPaolo Bonzini spin_unlock(&kvm->mn_invalidate_lock);
85552ac8b35SPaolo Bonzini
85652ac8b35SPaolo Bonzini /*
85752ac8b35SPaolo Bonzini * There can only be one waiter, since the wait happens under
85852ac8b35SPaolo Bonzini * slots_lock.
85952ac8b35SPaolo Bonzini */
86052ac8b35SPaolo Bonzini if (wake)
86152ac8b35SPaolo Bonzini rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
86252ac8b35SPaolo Bonzini
86320ec3ebdSChao Peng BUG_ON(kvm->mmu_invalidate_in_progress < 0);
864e930bffeSAndrea Arcangeli }
865e930bffeSAndrea Arcangeli
kvm_mmu_notifier_clear_flush_young(struct mmu_notifier * mn,struct mm_struct * mm,unsigned long start,unsigned long end)866e930bffeSAndrea Arcangeli static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
867e930bffeSAndrea Arcangeli struct mm_struct *mm,
86857128468SAndres Lagar-Cavilla unsigned long start,
86957128468SAndres Lagar-Cavilla unsigned long end)
870e930bffeSAndrea Arcangeli {
871501b9185SSean Christopherson trace_kvm_age_hva(start, end);
872501b9185SSean Christopherson
8733e1efe2bSSean Christopherson return kvm_handle_hva_range(mn, start, end, KVM_MMU_NOTIFIER_NO_ARG,
8743e1efe2bSSean Christopherson kvm_age_gfn);
875e930bffeSAndrea Arcangeli }
876e930bffeSAndrea Arcangeli
kvm_mmu_notifier_clear_young(struct mmu_notifier * mn,struct mm_struct * mm,unsigned long start,unsigned long end)8771d7715c6SVladimir Davydov static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
8781d7715c6SVladimir Davydov struct mm_struct *mm,
8791d7715c6SVladimir Davydov unsigned long start,
8801d7715c6SVladimir Davydov unsigned long end)
8811d7715c6SVladimir Davydov {
882501b9185SSean Christopherson trace_kvm_age_hva(start, end);
883501b9185SSean Christopherson
8841d7715c6SVladimir Davydov /*
8851d7715c6SVladimir Davydov * Even though we do not flush TLB, this will still adversely
8861d7715c6SVladimir Davydov * affect performance on pre-Haswell Intel EPT, where there is
8871d7715c6SVladimir Davydov * no EPT Access Bit to clear so that we have to tear down EPT
8881d7715c6SVladimir Davydov * tables instead. If we find this unacceptable, we can always
8891d7715c6SVladimir Davydov * add a parameter to kvm_age_hva so that it effectively doesn't
8901d7715c6SVladimir Davydov * do anything on clear_young.
8911d7715c6SVladimir Davydov *
8921d7715c6SVladimir Davydov * Also note that currently we never issue secondary TLB flushes
8931d7715c6SVladimir Davydov * from clear_young, leaving this job up to the regular system
8941d7715c6SVladimir Davydov * cadence. If we find this inaccurate, we might come up with a
8951d7715c6SVladimir Davydov * more sophisticated heuristic later.
8961d7715c6SVladimir Davydov */
8973039bcc7SSean Christopherson return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
8981d7715c6SVladimir Davydov }
8991d7715c6SVladimir Davydov
kvm_mmu_notifier_test_young(struct mmu_notifier * mn,struct mm_struct * mm,unsigned long address)9008ee53820SAndrea Arcangeli static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
9018ee53820SAndrea Arcangeli struct mm_struct *mm,
9028ee53820SAndrea Arcangeli unsigned long address)
9038ee53820SAndrea Arcangeli {
904501b9185SSean Christopherson trace_kvm_test_age_hva(address);
905501b9185SSean Christopherson
9063039bcc7SSean Christopherson return kvm_handle_hva_range_no_flush(mn, address, address + 1,
9073039bcc7SSean Christopherson kvm_test_age_gfn);
9088ee53820SAndrea Arcangeli }
9098ee53820SAndrea Arcangeli
kvm_mmu_notifier_release(struct mmu_notifier * mn,struct mm_struct * mm)91085db06e5SMarcelo Tosatti static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
91185db06e5SMarcelo Tosatti struct mm_struct *mm)
91285db06e5SMarcelo Tosatti {
91385db06e5SMarcelo Tosatti struct kvm *kvm = mmu_notifier_to_kvm(mn);
914eda2bedaSLai Jiangshan int idx;
915eda2bedaSLai Jiangshan
916eda2bedaSLai Jiangshan idx = srcu_read_lock(&kvm->srcu);
917683412ccSMingwei Zhang kvm_flush_shadow_all(kvm);
918eda2bedaSLai Jiangshan srcu_read_unlock(&kvm->srcu, idx);
91985db06e5SMarcelo Tosatti }
92085db06e5SMarcelo Tosatti
921e930bffeSAndrea Arcangeli static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
922e930bffeSAndrea Arcangeli .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
923e930bffeSAndrea Arcangeli .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
924e930bffeSAndrea Arcangeli .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
9251d7715c6SVladimir Davydov .clear_young = kvm_mmu_notifier_clear_young,
9268ee53820SAndrea Arcangeli .test_young = kvm_mmu_notifier_test_young,
9273da0dd43SIzik Eidus .change_pte = kvm_mmu_notifier_change_pte,
92885db06e5SMarcelo Tosatti .release = kvm_mmu_notifier_release,
929e930bffeSAndrea Arcangeli };
9304c07b0a4SAvi Kivity
kvm_init_mmu_notifier(struct kvm * kvm)9314c07b0a4SAvi Kivity static int kvm_init_mmu_notifier(struct kvm *kvm)
9324c07b0a4SAvi Kivity {
9334c07b0a4SAvi Kivity kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
9344c07b0a4SAvi Kivity return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
9354c07b0a4SAvi Kivity }
9364c07b0a4SAvi Kivity
9374c07b0a4SAvi Kivity #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
9384c07b0a4SAvi Kivity
kvm_init_mmu_notifier(struct kvm * kvm)9394c07b0a4SAvi Kivity static int kvm_init_mmu_notifier(struct kvm *kvm)
9404c07b0a4SAvi Kivity {
9414c07b0a4SAvi Kivity return 0;
9424c07b0a4SAvi Kivity }
9434c07b0a4SAvi Kivity
944e930bffeSAndrea Arcangeli #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
945e930bffeSAndrea Arcangeli
9462fdef3a2SSergey Senozhatsky #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
kvm_pm_notifier_call(struct notifier_block * bl,unsigned long state,void * unused)9472fdef3a2SSergey Senozhatsky static int kvm_pm_notifier_call(struct notifier_block *bl,
9482fdef3a2SSergey Senozhatsky unsigned long state,
9492fdef3a2SSergey Senozhatsky void *unused)
9502fdef3a2SSergey Senozhatsky {
9512fdef3a2SSergey Senozhatsky struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
9522fdef3a2SSergey Senozhatsky
9532fdef3a2SSergey Senozhatsky return kvm_arch_pm_notifier(kvm, state);
9542fdef3a2SSergey Senozhatsky }
9552fdef3a2SSergey Senozhatsky
kvm_init_pm_notifier(struct kvm * kvm)9562fdef3a2SSergey Senozhatsky static void kvm_init_pm_notifier(struct kvm *kvm)
9572fdef3a2SSergey Senozhatsky {
9582fdef3a2SSergey Senozhatsky kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
9592fdef3a2SSergey Senozhatsky /* Suspend KVM before we suspend ftrace, RCU, etc. */
9602fdef3a2SSergey Senozhatsky kvm->pm_notifier.priority = INT_MAX;
9612fdef3a2SSergey Senozhatsky register_pm_notifier(&kvm->pm_notifier);
9622fdef3a2SSergey Senozhatsky }
9632fdef3a2SSergey Senozhatsky
kvm_destroy_pm_notifier(struct kvm * kvm)9642fdef3a2SSergey Senozhatsky static void kvm_destroy_pm_notifier(struct kvm *kvm)
9652fdef3a2SSergey Senozhatsky {
9662fdef3a2SSergey Senozhatsky unregister_pm_notifier(&kvm->pm_notifier);
9672fdef3a2SSergey Senozhatsky }
9682fdef3a2SSergey Senozhatsky #else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
kvm_init_pm_notifier(struct kvm * kvm)9692fdef3a2SSergey Senozhatsky static void kvm_init_pm_notifier(struct kvm *kvm)
9702fdef3a2SSergey Senozhatsky {
9712fdef3a2SSergey Senozhatsky }
9722fdef3a2SSergey Senozhatsky
kvm_destroy_pm_notifier(struct kvm * kvm)9732fdef3a2SSergey Senozhatsky static void kvm_destroy_pm_notifier(struct kvm *kvm)
9742fdef3a2SSergey Senozhatsky {
9752fdef3a2SSergey Senozhatsky }
9762fdef3a2SSergey Senozhatsky #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
9772fdef3a2SSergey Senozhatsky
kvm_destroy_dirty_bitmap(struct kvm_memory_slot * memslot)978a47d2b07SPaolo Bonzini static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
979a47d2b07SPaolo Bonzini {
980a47d2b07SPaolo Bonzini if (!memslot->dirty_bitmap)
981a47d2b07SPaolo Bonzini return;
982a47d2b07SPaolo Bonzini
983a47d2b07SPaolo Bonzini kvfree(memslot->dirty_bitmap);
984a47d2b07SPaolo Bonzini memslot->dirty_bitmap = NULL;
985a47d2b07SPaolo Bonzini }
986a47d2b07SPaolo Bonzini
987a54d8066SMaciej S. Szmigiero /* This does not remove the slot from struct kvm_memslots data structures */
988e96c81eeSSean Christopherson static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
989a47d2b07SPaolo Bonzini {
990e96c81eeSSean Christopherson kvm_destroy_dirty_bitmap(slot);
991a47d2b07SPaolo Bonzini
992e96c81eeSSean Christopherson kvm_arch_free_memslot(kvm, slot);
993a47d2b07SPaolo Bonzini
994a54d8066SMaciej S. Szmigiero kfree(slot);
995a47d2b07SPaolo Bonzini }
996a47d2b07SPaolo Bonzini
997a47d2b07SPaolo Bonzini static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
998a47d2b07SPaolo Bonzini {
999a54d8066SMaciej S. Szmigiero struct hlist_node *idnode;
1000a47d2b07SPaolo Bonzini struct kvm_memory_slot *memslot;
1001a54d8066SMaciej S. Szmigiero int bkt;
1002a47d2b07SPaolo Bonzini
1003a54d8066SMaciej S. Szmigiero /*
1004a54d8066SMaciej S. Szmigiero * The same memslot objects live in both active and inactive sets,
1005a54d8066SMaciej S. Szmigiero * arbitrarily free using index '1' so the second invocation of this
1006a54d8066SMaciej S. Szmigiero * function isn't operating over a structure with dangling pointers
1007a54d8066SMaciej S. Szmigiero * (even though this function isn't actually touching them).
1008a54d8066SMaciej S. Szmigiero */
1009a54d8066SMaciej S. Szmigiero if (!slots->node_idx)
1010a47d2b07SPaolo Bonzini return;
1011a47d2b07SPaolo Bonzini
1012a54d8066SMaciej S. Szmigiero hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
1013e96c81eeSSean Christopherson kvm_free_memslot(kvm, memslot);
1014bf3e05bcSXiao Guangrong }
1015bf3e05bcSXiao Guangrong
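/*
 * Instantaneous stats are exposed read-only (0444); cumulative and peak
 * stats are exposed writable (0644) so that userspace can clear them.
 */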
1016bc9e9e67SJing Zhang static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
1017bc9e9e67SJing Zhang {
1018bc9e9e67SJing Zhang switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
1019bc9e9e67SJing Zhang case KVM_STATS_TYPE_INSTANT:
1020bc9e9e67SJing Zhang return 0444;
1021bc9e9e67SJing Zhang case KVM_STATS_TYPE_CUMULATIVE:
1022bc9e9e67SJing Zhang case KVM_STATS_TYPE_PEAK:
1023bc9e9e67SJing Zhang default:
1024bc9e9e67SJing Zhang return 0644;
1025bc9e9e67SJing Zhang }
1026bc9e9e67SJing Zhang }
1027bc9e9e67SJing Zhang
1028bc9e9e67SJing Zhang
1029536a6f88SJanosch Frank static void kvm_destroy_vm_debugfs(struct kvm *kvm)
1030536a6f88SJanosch Frank {
1031536a6f88SJanosch Frank int i;
1032bc9e9e67SJing Zhang int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1033bc9e9e67SJing Zhang kvm_vcpu_stats_header.num_desc;
1034536a6f88SJanosch Frank
1035a44a4cc1SOliver Upton if (IS_ERR(kvm->debugfs_dentry))
1036536a6f88SJanosch Frank return;
1037536a6f88SJanosch Frank
1038536a6f88SJanosch Frank debugfs_remove_recursive(kvm->debugfs_dentry);
1039536a6f88SJanosch Frank
10409d5a1dceSLuiz Capitulino if (kvm->debugfs_stat_data) {
1041536a6f88SJanosch Frank for (i = 0; i < kvm_debugfs_num_entries; i++)
1042536a6f88SJanosch Frank kfree(kvm->debugfs_stat_data[i]);
1043536a6f88SJanosch Frank kfree(kvm->debugfs_stat_data);
1044536a6f88SJanosch Frank }
10459d5a1dceSLuiz Capitulino }
1046536a6f88SJanosch Frank
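/*
 * The per-VM debugfs directory is named "<pid>-<fdname>", the creating
 * task's PID followed by the caller-provided fdname string.
 */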
104759f82aadSOliver Upton static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
1048536a6f88SJanosch Frank {
104985cd39afSPaolo Bonzini static DEFINE_MUTEX(kvm_debugfs_lock);
105085cd39afSPaolo Bonzini struct dentry *dent;
1051536a6f88SJanosch Frank char dir_name[ITOA_MAX_LEN * 2];
1052536a6f88SJanosch Frank struct kvm_stat_data *stat_data;
1053bc9e9e67SJing Zhang const struct _kvm_stats_desc *pdesc;
1054b74ed7a6SOliver Upton int i, ret = -ENOMEM;
1055bc9e9e67SJing Zhang int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1056bc9e9e67SJing Zhang kvm_vcpu_stats_header.num_desc;
1057536a6f88SJanosch Frank
1058536a6f88SJanosch Frank if (!debugfs_initialized())
1059536a6f88SJanosch Frank return 0;
1060536a6f88SJanosch Frank
106159f82aadSOliver Upton snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname);
106285cd39afSPaolo Bonzini mutex_lock(&kvm_debugfs_lock);
106385cd39afSPaolo Bonzini dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
106485cd39afSPaolo Bonzini if (dent) {
106585cd39afSPaolo Bonzini pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
106685cd39afSPaolo Bonzini dput(dent);
106785cd39afSPaolo Bonzini mutex_unlock(&kvm_debugfs_lock);
106885cd39afSPaolo Bonzini return 0;
106985cd39afSPaolo Bonzini }
107085cd39afSPaolo Bonzini dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
107185cd39afSPaolo Bonzini mutex_unlock(&kvm_debugfs_lock);
107285cd39afSPaolo Bonzini if (IS_ERR(dent))
107385cd39afSPaolo Bonzini return 0;
1074536a6f88SJanosch Frank
107585cd39afSPaolo Bonzini kvm->debugfs_dentry = dent;
1076536a6f88SJanosch Frank kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
1077536a6f88SJanosch Frank sizeof(*kvm->debugfs_stat_data),
1078b12ce36aSBen Gardon GFP_KERNEL_ACCOUNT);
1079536a6f88SJanosch Frank if (!kvm->debugfs_stat_data)
1080b74ed7a6SOliver Upton goto out_err;
1081536a6f88SJanosch Frank
1082bc9e9e67SJing Zhang for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
1083bc9e9e67SJing Zhang pdesc = &kvm_vm_stats_desc[i];
1084b12ce36aSBen Gardon stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1085536a6f88SJanosch Frank if (!stat_data)
1086b74ed7a6SOliver Upton goto out_err;
1087536a6f88SJanosch Frank
1088536a6f88SJanosch Frank stat_data->kvm = kvm;
1089bc9e9e67SJing Zhang stat_data->desc = pdesc;
1090bc9e9e67SJing Zhang stat_data->kind = KVM_STAT_VM;
1091bc9e9e67SJing Zhang kvm->debugfs_stat_data[i] = stat_data;
1092bc9e9e67SJing Zhang debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1093bc9e9e67SJing Zhang kvm->debugfs_dentry, stat_data,
1094bc9e9e67SJing Zhang &stat_fops_per_vm);
1095bc9e9e67SJing Zhang }
1096bc9e9e67SJing Zhang
1097bc9e9e67SJing Zhang for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
1098bc9e9e67SJing Zhang pdesc = &kvm_vcpu_stats_desc[i];
1099bc9e9e67SJing Zhang stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1100bc9e9e67SJing Zhang if (!stat_data)
1101b74ed7a6SOliver Upton goto out_err;
1102bc9e9e67SJing Zhang
1103bc9e9e67SJing Zhang stat_data->kvm = kvm;
1104bc9e9e67SJing Zhang stat_data->desc = pdesc;
1105bc9e9e67SJing Zhang stat_data->kind = KVM_STAT_VCPU;
1106004d62ebSPavel Skripkin kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
1107bc9e9e67SJing Zhang debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
110809cbcef6SMilan Pandurov kvm->debugfs_dentry, stat_data,
110909cbcef6SMilan Pandurov &stat_fops_per_vm);
1110536a6f88SJanosch Frank }
11113165af73SPeter Xu
11123165af73SPeter Xu ret = kvm_arch_create_vm_debugfs(kvm);
1113b74ed7a6SOliver Upton if (ret)
1114b74ed7a6SOliver Upton goto out_err;
11153165af73SPeter Xu
1116536a6f88SJanosch Frank return 0;
1117b74ed7a6SOliver Upton out_err:
1118b74ed7a6SOliver Upton kvm_destroy_vm_debugfs(kvm);
1119b74ed7a6SOliver Upton return ret;
1120536a6f88SJanosch Frank }
1121536a6f88SJanosch Frank
11221aa9b957SJunaid Shahid /*
11231aa9b957SJunaid Shahid * Called after the VM is otherwise initialized, but just before adding it to
11241aa9b957SJunaid Shahid * the vm_list.
11251aa9b957SJunaid Shahid */
11261aa9b957SJunaid Shahid int __weak kvm_arch_post_init_vm(struct kvm *kvm)
11271aa9b957SJunaid Shahid {
11281aa9b957SJunaid Shahid return 0;
11291aa9b957SJunaid Shahid }
11301aa9b957SJunaid Shahid
11311aa9b957SJunaid Shahid /*
11321aa9b957SJunaid Shahid * Called just after removing the VM from the vm_list, but before doing any
11331aa9b957SJunaid Shahid * other destruction.
11341aa9b957SJunaid Shahid */
11351aa9b957SJunaid Shahid void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
11361aa9b957SJunaid Shahid {
11371aa9b957SJunaid Shahid }
11381aa9b957SJunaid Shahid
11393165af73SPeter Xu /*
11403165af73SPeter Xu  * Called after the per-VM debugfs is created; kvm->debugfs_dentry is already
11413165af73SPeter Xu  * set up at that point, so arch-specific debugfs entries can be created under it.
11423165af73SPeter Xu  * Cleanup is done automatically and recursively by kvm_destroy_vm_debugfs(), so
11433165af73SPeter Xu  * a per-arch destroy interface is not needed.
11443165af73SPeter Xu */
11453165af73SPeter Xu int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
11463165af73SPeter Xu {
11473165af73SPeter Xu return 0;
11483165af73SPeter Xu }
11493165af73SPeter Xu
1150b74ed7a6SOliver Upton static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
11510fce5623SAvi Kivity {
1152d89f5effSJan Kiszka struct kvm *kvm = kvm_arch_alloc_vm();
1153a54d8066SMaciej S. Szmigiero struct kvm_memslots *slots;
11549121923cSJim Mattson int r = -ENOMEM;
1155a54d8066SMaciej S. Szmigiero int i, j;
11560fce5623SAvi Kivity
1157d89f5effSJan Kiszka if (!kvm)
1158d89f5effSJan Kiszka return ERR_PTR(-ENOMEM);
1159d89f5effSJan Kiszka
1160405294f2SSean Christopherson /* KVM is pinned via open("/dev/kvm"), the fd passed to this ioctl(). */
1161405294f2SSean Christopherson __module_get(kvm_chardev_ops.owner);
1162405294f2SSean Christopherson
1163531810caSBen Gardon KVM_MMU_LOCK_INIT(kvm);
1164f1f10076SVegard Nossum mmgrab(current->mm);
1165e9ad4ec8SPaolo Bonzini kvm->mm = current->mm;
1166e9ad4ec8SPaolo Bonzini kvm_eventfd_init(kvm);
1167e9ad4ec8SPaolo Bonzini mutex_init(&kvm->lock);
1168e9ad4ec8SPaolo Bonzini mutex_init(&kvm->irq_lock);
1169e9ad4ec8SPaolo Bonzini mutex_init(&kvm->slots_lock);
1170b10a038eSBen Gardon mutex_init(&kvm->slots_arch_lock);
117152ac8b35SPaolo Bonzini spin_lock_init(&kvm->mn_invalidate_lock);
117252ac8b35SPaolo Bonzini rcuwait_init(&kvm->mn_memslots_update_rcuwait);
1173c5b07754SMarc Zyngier xa_init(&kvm->vcpu_array);
117452ac8b35SPaolo Bonzini
1175982ed0deSDavid Woodhouse INIT_LIST_HEAD(&kvm->gpc_list);
1176982ed0deSDavid Woodhouse spin_lock_init(&kvm->gpc_lock);
1177e9ad4ec8SPaolo Bonzini
1178e9ad4ec8SPaolo Bonzini INIT_LIST_HEAD(&kvm->devices);
1179f502cc56SSean Christopherson kvm->max_vcpus = KVM_MAX_VCPUS;
1180e9ad4ec8SPaolo Bonzini
11819121923cSJim Mattson BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
11829121923cSJim Mattson
11835c697c36SSean Christopherson /*
11845c697c36SSean Christopherson * Force subsequent debugfs file creations to fail if the VM directory
11855c697c36SSean Christopherson * is not created (by kvm_create_vm_debugfs()).
11865c697c36SSean Christopherson */
11875c697c36SSean Christopherson kvm->debugfs_dentry = ERR_PTR(-ENOENT);
11885c697c36SSean Christopherson
1189f2759c08SOliver Upton snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
1190f2759c08SOliver Upton task_pid_nr(current));
1191f2759c08SOliver Upton
11928a44119aSPaolo Bonzini if (init_srcu_struct(&kvm->srcu))
11938a44119aSPaolo Bonzini goto out_err_no_srcu;
11948a44119aSPaolo Bonzini if (init_srcu_struct(&kvm->irq_srcu))
11958a44119aSPaolo Bonzini goto out_err_no_irq_srcu;
11968a44119aSPaolo Bonzini
1197e2d3fcafSPaolo Bonzini refcount_set(&kvm->users_count, 1);
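/*
 * Each address space gets two memslot sets; one is active while the other
 * serves as the inactive set that updates are staged in before the two are
 * swapped.
 */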
11989121923cSJim Mattson for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1199a54d8066SMaciej S. Szmigiero for (j = 0; j < 2; j++) {
1200a54d8066SMaciej S. Szmigiero slots = &kvm->__memslots[i][j];
12019121923cSJim Mattson
1202a54d8066SMaciej S. Szmigiero atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
1203a54d8066SMaciej S. Szmigiero slots->hva_tree = RB_ROOT_CACHED;
1204a54d8066SMaciej S. Szmigiero slots->gfn_tree = RB_ROOT;
1205a54d8066SMaciej S. Szmigiero hash_init(slots->id_hash);
1206a54d8066SMaciej S. Szmigiero slots->node_idx = j;
1207a54d8066SMaciej S. Szmigiero
12089121923cSJim Mattson /* Generations must be different for each address space. */
12099121923cSJim Mattson slots->generation = i;
1210a54d8066SMaciej S. Szmigiero }
1211a54d8066SMaciej S. Szmigiero
1212a54d8066SMaciej S. Szmigiero rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
12139121923cSJim Mattson }
12149121923cSJim Mattson
12159121923cSJim Mattson for (i = 0; i < KVM_NR_BUSES; i++) {
12169121923cSJim Mattson rcu_assign_pointer(kvm->buses[i],
12179121923cSJim Mattson kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
12189121923cSJim Mattson if (!kvm->buses[i])
1219a97b0e77SJim Mattson goto out_err_no_arch_destroy_vm;
12209121923cSJim Mattson }
12219121923cSJim Mattson
1222e08b9637SCarsten Otte r = kvm_arch_init_vm(kvm, type);
1223d89f5effSJan Kiszka if (r)
1224a97b0e77SJim Mattson goto out_err_no_arch_destroy_vm;
122510474ae8SAlexander Graf
122610474ae8SAlexander Graf r = hardware_enable_all();
122710474ae8SAlexander Graf if (r)
1228719d93cdSChristian Borntraeger goto out_err_no_disable;
122910474ae8SAlexander Graf
1230c77dcacbSPaolo Bonzini #ifdef CONFIG_HAVE_KVM_IRQFD
1231136bdfeeSGleb Natapov INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
123275858a84SAvi Kivity #endif
12330fce5623SAvi Kivity
123474b5c5bfSMike Waychison r = kvm_init_mmu_notifier(kvm);
123574b5c5bfSMike Waychison if (r)
12361aa9b957SJunaid Shahid goto out_err_no_mmu_notifier;
12371aa9b957SJunaid Shahid
1238c2b82397SSean Christopherson r = kvm_coalesced_mmio_init(kvm);
1239c2b82397SSean Christopherson if (r < 0)
1240c2b82397SSean Christopherson goto out_no_coalesced_mmio;
1241c2b82397SSean Christopherson
12424ba4f419SSean Christopherson r = kvm_create_vm_debugfs(kvm, fdname);
12434ba4f419SSean Christopherson if (r)
12444ba4f419SSean Christopherson goto out_err_no_debugfs;
12454ba4f419SSean Christopherson
12461aa9b957SJunaid Shahid r = kvm_arch_post_init_vm(kvm);
12471aa9b957SJunaid Shahid if (r)
12484ba4f419SSean Christopherson goto out_err;
124974b5c5bfSMike Waychison
12500d9ce162SJunaid Shahid mutex_lock(&kvm_lock);
12510fce5623SAvi Kivity list_add(&kvm->vm_list, &vm_list);
12520d9ce162SJunaid Shahid mutex_unlock(&kvm_lock);
1253d89f5effSJan Kiszka
12542ecd9d29SPeter Zijlstra preempt_notifier_inc();
12552fdef3a2SSergey Senozhatsky kvm_init_pm_notifier(kvm);
12562ecd9d29SPeter Zijlstra
12570fce5623SAvi Kivity return kvm;
125810474ae8SAlexander Graf
125910474ae8SAlexander Graf out_err:
12604ba4f419SSean Christopherson kvm_destroy_vm_debugfs(kvm);
12614ba4f419SSean Christopherson out_err_no_debugfs:
1262c2b82397SSean Christopherson kvm_coalesced_mmio_free(kvm);
1263c2b82397SSean Christopherson out_no_coalesced_mmio:
12641aa9b957SJunaid Shahid #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
12651aa9b957SJunaid Shahid if (kvm->mmu_notifier.ops)
12661aa9b957SJunaid Shahid mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
12671aa9b957SJunaid Shahid #endif
12681aa9b957SJunaid Shahid out_err_no_mmu_notifier:
126910474ae8SAlexander Graf hardware_disable_all();
1270719d93cdSChristian Borntraeger out_err_no_disable:
1271a97b0e77SJim Mattson kvm_arch_destroy_vm(kvm);
1272a97b0e77SJim Mattson out_err_no_arch_destroy_vm:
1273e2d3fcafSPaolo Bonzini WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
1274e93f8a0fSMarcelo Tosatti for (i = 0; i < KVM_NR_BUSES; i++)
12753898da94SPaolo Bonzini kfree(kvm_get_bus(kvm, i));
12768a44119aSPaolo Bonzini cleanup_srcu_struct(&kvm->irq_srcu);
12778a44119aSPaolo Bonzini out_err_no_irq_srcu:
12788a44119aSPaolo Bonzini cleanup_srcu_struct(&kvm->srcu);
12798a44119aSPaolo Bonzini out_err_no_srcu:
1280d89f5effSJan Kiszka kvm_arch_free_vm(kvm);
1281e9ad4ec8SPaolo Bonzini mmdrop(current->mm);
1282405294f2SSean Christopherson module_put(kvm_chardev_ops.owner);
128310474ae8SAlexander Graf return ERR_PTR(r);
12840fce5623SAvi Kivity }
12850fce5623SAvi Kivity
128607f0a7bdSScott Wood static void kvm_destroy_devices(struct kvm *kvm)
128707f0a7bdSScott Wood {
1288e6e3b5a6SGeliang Tang struct kvm_device *dev, *tmp;
128907f0a7bdSScott Wood
1290a28ebea2SChristoffer Dall /*
1291a28ebea2SChristoffer Dall * We do not need to take the kvm->lock here, because nobody else
1292a28ebea2SChristoffer Dall * has a reference to the struct kvm at this point and therefore
1293a28ebea2SChristoffer Dall * cannot access the devices list anyhow.
1294a28ebea2SChristoffer Dall */
1295e6e3b5a6SGeliang Tang list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1296e6e3b5a6SGeliang Tang list_del(&dev->vm_node);
129707f0a7bdSScott Wood dev->ops->destroy(dev);
129807f0a7bdSScott Wood }
129907f0a7bdSScott Wood }
130007f0a7bdSScott Wood
13010fce5623SAvi Kivity static void kvm_destroy_vm(struct kvm *kvm)
13020fce5623SAvi Kivity {
1303e93f8a0fSMarcelo Tosatti int i;
13040fce5623SAvi Kivity struct mm_struct *mm = kvm->mm;
13050fce5623SAvi Kivity
13062fdef3a2SSergey Senozhatsky kvm_destroy_pm_notifier(kvm);
1307286de8f6SClaudio Imbrenda kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
1308536a6f88SJanosch Frank kvm_destroy_vm_debugfs(kvm);
1309ad8ba2cdSSheng Yang kvm_arch_sync_events(kvm);
13100d9ce162SJunaid Shahid mutex_lock(&kvm_lock);
13110fce5623SAvi Kivity list_del(&kvm->vm_list);
13120d9ce162SJunaid Shahid mutex_unlock(&kvm_lock);
13131aa9b957SJunaid Shahid kvm_arch_pre_destroy_vm(kvm);
13141aa9b957SJunaid Shahid
1315399ec807SAvi Kivity kvm_free_irq_routing(kvm);
1316df630b8cSPeter Xu for (i = 0; i < KVM_NR_BUSES; i++) {
13173898da94SPaolo Bonzini struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
13184a12f951SChristian Borntraeger
13194a12f951SChristian Borntraeger if (bus)
13204a12f951SChristian Borntraeger kvm_io_bus_destroy(bus);
1321df630b8cSPeter Xu kvm->buses[i] = NULL;
1322df630b8cSPeter Xu }
1323980da6ceSAvi Kivity kvm_coalesced_mmio_free(kvm);
1324e930bffeSAndrea Arcangeli #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1325e930bffeSAndrea Arcangeli mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
132652ac8b35SPaolo Bonzini /*
132752ac8b35SPaolo Bonzini * At this point, pending calls to invalidate_range_start()
132852ac8b35SPaolo Bonzini * have completed but no more MMU notifiers will run, so
132952ac8b35SPaolo Bonzini * mn_active_invalidate_count may remain unbalanced.
1330b0d23708SJun Miao * No threads can be waiting in kvm_swap_active_memslots() as the
133152ac8b35SPaolo Bonzini * last reference on KVM has been dropped, but freeing
133252ac8b35SPaolo Bonzini * memslots would deadlock without this manual intervention.
133352ac8b35SPaolo Bonzini */
133452ac8b35SPaolo Bonzini WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
133552ac8b35SPaolo Bonzini kvm->mn_active_invalidate_count = 0;
1336f00be0caSGleb Natapov #else
1337683412ccSMingwei Zhang kvm_flush_shadow_all(kvm);
1338e930bffeSAndrea Arcangeli #endif
13390fce5623SAvi Kivity kvm_arch_destroy_vm(kvm);
134007f0a7bdSScott Wood kvm_destroy_devices(kvm);
1341a54d8066SMaciej S. Szmigiero for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1342a54d8066SMaciej S. Szmigiero kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
1343a54d8066SMaciej S. Szmigiero kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
1344a54d8066SMaciej S. Szmigiero }
1345820b3fcdSPaolo Bonzini cleanup_srcu_struct(&kvm->irq_srcu);
1346d89f5effSJan Kiszka cleanup_srcu_struct(&kvm->srcu);
1347d89f5effSJan Kiszka kvm_arch_free_vm(kvm);
13482ecd9d29SPeter Zijlstra preempt_notifier_dec();
134910474ae8SAlexander Graf hardware_disable_all();
13500fce5623SAvi Kivity mmdrop(mm);
13515f6de5cbSDavid Matlack module_put(kvm_chardev_ops.owner);
13520fce5623SAvi Kivity }
13530fce5623SAvi Kivity
1354d39f13b0SIzik Eidus void kvm_get_kvm(struct kvm *kvm)
1355d39f13b0SIzik Eidus {
1356e3736c3eSElena Reshetova refcount_inc(&kvm->users_count);
1357d39f13b0SIzik Eidus }
1358d39f13b0SIzik Eidus EXPORT_SYMBOL_GPL(kvm_get_kvm);
1359d39f13b0SIzik Eidus
1360605c7130SPeter Xu /*
1361605c7130SPeter Xu  * A safe version of kvm_get_kvm() that fails if the VM is already being
1362605c7130SPeter Xu  * destroyed. Returns true if kvm was referenced successfully, false otherwise.
1363605c7130SPeter Xu */
1364605c7130SPeter Xu bool kvm_get_kvm_safe(struct kvm *kvm)
1365605c7130SPeter Xu {
1366605c7130SPeter Xu return refcount_inc_not_zero(&kvm->users_count);
1367605c7130SPeter Xu }
1368605c7130SPeter Xu EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
1369605c7130SPeter Xu
1370d39f13b0SIzik Eidus void kvm_put_kvm(struct kvm *kvm)
1371d39f13b0SIzik Eidus {
1372e3736c3eSElena Reshetova if (refcount_dec_and_test(&kvm->users_count))
1373d39f13b0SIzik Eidus kvm_destroy_vm(kvm);
1374d39f13b0SIzik Eidus }
1375d39f13b0SIzik Eidus EXPORT_SYMBOL_GPL(kvm_put_kvm);
1376d39f13b0SIzik Eidus
1377149487bdSSean Christopherson /*
1378149487bdSSean Christopherson * Used to put a reference that was taken on behalf of an object associated
1379149487bdSSean Christopherson * with a user-visible file descriptor, e.g. a vcpu or device, if installation
1380149487bdSSean Christopherson * of the new file descriptor fails and the reference cannot be transferred to
1381149487bdSSean Christopherson * its final owner. In such cases, the caller is still actively using @kvm and
1382149487bdSSean Christopherson * will fail miserably if the refcount unexpectedly hits zero.
1383149487bdSSean Christopherson */
1384149487bdSSean Christopherson void kvm_put_kvm_no_destroy(struct kvm *kvm)
1385149487bdSSean Christopherson {
1386149487bdSSean Christopherson WARN_ON(refcount_dec_and_test(&kvm->users_count));
1387149487bdSSean Christopherson }
1388149487bdSSean Christopherson EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
1389d39f13b0SIzik Eidus
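/* Invoked when the last file reference to the VM's fd is released. */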
13900fce5623SAvi Kivity static int kvm_vm_release(struct inode *inode, struct file *filp)
13910fce5623SAvi Kivity {
13920fce5623SAvi Kivity struct kvm *kvm = filp->private_data;
13930fce5623SAvi Kivity
1394721eecbfSGregory Haskins kvm_irqfd_release(kvm);
1395721eecbfSGregory Haskins
1396d39f13b0SIzik Eidus kvm_put_kvm(kvm);
13970fce5623SAvi Kivity return 0;
13980fce5623SAvi Kivity }
13990fce5623SAvi Kivity
1400515a0127STakuya Yoshikawa /*
1401515a0127STakuya Yoshikawa * Allocation size is twice as large as the actual dirty bitmap size.
14020dff0846SSean Christopherson * See kvm_vm_ioctl_get_dirty_log() why this is needed.
1403515a0127STakuya Yoshikawa */
14043c9bd400SJay Zhou static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1405a36a57b1STakuya Yoshikawa {
140637b2a651SPaolo Bonzini unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);
1407a36a57b1STakuya Yoshikawa
140837b2a651SPaolo Bonzini memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
1409a36a57b1STakuya Yoshikawa if (!memslot->dirty_bitmap)
1410a36a57b1STakuya Yoshikawa return -ENOMEM;
1411a36a57b1STakuya Yoshikawa
1412a36a57b1STakuya Yoshikawa return 0;
1413a36a57b1STakuya Yoshikawa }
1414a36a57b1STakuya Yoshikawa
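/*
 * The two memslot sets of an address space use node indices 0 and 1; XORing
 * the active set's node_idx with 1 yields the inactive set.
 */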
1415a54d8066SMaciej S. Szmigiero static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
1416bf3e05bcSXiao Guangrong {
1417a54d8066SMaciej S. Szmigiero struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
1418a54d8066SMaciej S. Szmigiero int node_idx_inactive = active->node_idx ^ 1;
1419bf3e05bcSXiao Guangrong
1420a54d8066SMaciej S. Szmigiero return &kvm->__memslots[as_id][node_idx_inactive];
14218593176cSPaolo Bonzini }
1422efbeec70SPaolo Bonzini
1423efbeec70SPaolo Bonzini /*
1424a54d8066SMaciej S. Szmigiero * Helper to get the address space ID when one of memslot pointers may be NULL.
1425a54d8066SMaciej S. Szmigiero  * This also serves as a sanity check that at least one of the pointers is non-NULL,
1426a54d8066SMaciej S. Szmigiero * and that their address space IDs don't diverge.
1427efbeec70SPaolo Bonzini */
1428a54d8066SMaciej S. Szmigiero static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
1429a54d8066SMaciej S. Szmigiero struct kvm_memory_slot *b)
14300577d1abSSean Christopherson {
1431a54d8066SMaciej S. Szmigiero if (WARN_ON_ONCE(!a && !b))
1432a54d8066SMaciej S. Szmigiero return 0;
1433a54d8066SMaciej S. Szmigiero
1434a54d8066SMaciej S. Szmigiero if (!a)
1435a54d8066SMaciej S. Szmigiero return b->as_id;
1436a54d8066SMaciej S. Szmigiero if (!b)
1437a54d8066SMaciej S. Szmigiero return a->as_id;
1438a54d8066SMaciej S. Szmigiero
1439a54d8066SMaciej S. Szmigiero WARN_ON_ONCE(a->as_id != b->as_id);
1440a54d8066SMaciej S. Szmigiero return a->as_id;
14410577d1abSSean Christopherson }
14420577d1abSSean Christopherson
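/*
 * Standard rbtree insertion keyed by base_gfn. Memslots must not overlap,
 * so finding an existing node with the same base_gfn is a bug.
 */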
1443a54d8066SMaciej S. Szmigiero static void kvm_insert_gfn_node(struct kvm_memslots *slots,
1444a54d8066SMaciej S. Szmigiero struct kvm_memory_slot *slot)
14450577d1abSSean Christopherson {
1446a54d8066SMaciej S. Szmigiero struct rb_root *gfn_tree = &slots->gfn_tree;
1447a54d8066SMaciej S. Szmigiero struct rb_node **node, *parent;
1448a54d8066SMaciej S. Szmigiero int idx = slots->node_idx;
14490577d1abSSean Christopherson
1450a54d8066SMaciej S. Szmigiero parent = NULL;
1451a54d8066SMaciej S. Szmigiero for (node = &gfn_tree->rb_node; *node; ) {
1452a54d8066SMaciej S. Szmigiero struct kvm_memory_slot *tmp;
14530577d1abSSean Christopherson
1454a54d8066SMaciej S. Szmigiero tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
1455a54d8066SMaciej S. Szmigiero parent = *node;
1456a54d8066SMaciej S. Szmigiero if (slot->base_gfn < tmp->base_gfn)
1457a54d8066SMaciej S. Szmigiero node = &(*node)->rb_left;
1458a54d8066SMaciej S. Szmigiero else if (slot->base_gfn > tmp->base_gfn)
1459a54d8066SMaciej S. Szmigiero node = &(*node)->rb_right;
14600577d1abSSean Christopherson else
1461a54d8066SMaciej S. Szmigiero BUG();
1462a54d8066SMaciej S. Szmigiero }
1463a54d8066SMaciej S. Szmigiero
1464a54d8066SMaciej S. Szmigiero rb_link_node(&slot->gfn_node[idx], parent, node);
1465a54d8066SMaciej S. Szmigiero rb_insert_color(&slot->gfn_node[idx], gfn_tree);
1466a54d8066SMaciej S. Szmigiero }
1467a54d8066SMaciej S. Szmigiero
1468a54d8066SMaciej S. Szmigiero static void kvm_erase_gfn_node(struct kvm_memslots *slots,
1469a54d8066SMaciej S. Szmigiero struct kvm_memory_slot *slot)
1470a54d8066SMaciej S. Szmigiero {
1471a54d8066SMaciej S. Szmigiero rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
1472a54d8066SMaciej S. Szmigiero }
1473a54d8066SMaciej S. Szmigiero
1474a54d8066SMaciej S. Szmigiero static void kvm_replace_gfn_node(struct kvm_memslots *slots,
147526b8345aSMaciej S. Szmigiero struct kvm_memory_slot *old,
147626b8345aSMaciej S. Szmigiero struct kvm_memory_slot *new)
147726b8345aSMaciej S. Szmigiero {
1478a54d8066SMaciej S. Szmigiero int idx = slots->node_idx;
1479a54d8066SMaciej S. Szmigiero
1480a54d8066SMaciej S. Szmigiero WARN_ON_ONCE(old->base_gfn != new->base_gfn);
1481a54d8066SMaciej S. Szmigiero
1482a54d8066SMaciej S. Szmigiero rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
1483a54d8066SMaciej S. Szmigiero &slots->gfn_tree);
1484a54d8066SMaciej S. Szmigiero }
14850577d1abSSean Christopherson
14860577d1abSSean Christopherson /*
1487a54d8066SMaciej S. Szmigiero * Replace @old with @new in the inactive memslots.
1488a54d8066SMaciej S. Szmigiero *
1489a54d8066SMaciej S. Szmigiero * With NULL @old this simply adds @new.
1490a54d8066SMaciej S. Szmigiero * With NULL @new this simply removes @old.
1491a54d8066SMaciej S. Szmigiero *
1492a54d8066SMaciej S. Szmigiero * If @new is non-NULL its hva_node[slots_idx] range has to be set
1493a54d8066SMaciej S. Szmigiero * appropriately.
14940577d1abSSean Christopherson */
1495a54d8066SMaciej S. Szmigiero static void kvm_replace_memslot(struct kvm *kvm,
1496a54d8066SMaciej S. Szmigiero struct kvm_memory_slot *old,
1497a54d8066SMaciej S. Szmigiero struct kvm_memory_slot *new)
1498a54d8066SMaciej S. Szmigiero {
1499a54d8066SMaciej S. Szmigiero int as_id = kvm_memslots_get_as_id(old, new);
1500a54d8066SMaciej S. Szmigiero struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1501a54d8066SMaciej S. Szmigiero int idx = slots->node_idx;
1502a54d8066SMaciej S. Szmigiero
150326b8345aSMaciej S. Szmigiero if (old) {
1504a54d8066SMaciej S. Szmigiero hash_del(&old->id_node[idx]);
1505a54d8066SMaciej S. Szmigiero interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);
150626b8345aSMaciej S. Szmigiero
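/*
 * Repoint the cached last-used slot at the replacement, or clear the
 * cache when the slot is being removed (@new is NULL).
 */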
1507a54d8066SMaciej S. Szmigiero if ((long)old == atomic_long_read(&slots->last_used_slot))
1508a54d8066SMaciej S. Szmigiero atomic_long_set(&slots->last_used_slot, (long)new);
1509a54d8066SMaciej S. Szmigiero
1510a54d8066SMaciej S. Szmigiero if (!new) {
1511a54d8066SMaciej S. Szmigiero kvm_erase_gfn_node(slots, old);
151226b8345aSMaciej S. Szmigiero return;
1513a54d8066SMaciej S. Szmigiero }
1514a54d8066SMaciej S. Szmigiero }
151526b8345aSMaciej S. Szmigiero
1516a54d8066SMaciej S. Szmigiero /*
1517a54d8066SMaciej S. Szmigiero * Initialize @new's hva range. Do this even when replacing an @old
1518a54d8066SMaciej S. Szmigiero * slot, kvm_copy_memslot() deliberately does not touch node data.
1519a54d8066SMaciej S. Szmigiero */
1520a54d8066SMaciej S. Szmigiero new->hva_node[idx].start = new->userspace_addr;
1521a54d8066SMaciej S. Szmigiero new->hva_node[idx].last = new->userspace_addr +
1522ed922739SMaciej S. Szmigiero (new->npages << PAGE_SHIFT) - 1;
152326b8345aSMaciej S. Szmigiero
15240fce5623SAvi Kivity /*
1525a54d8066SMaciej S. Szmigiero  * (Re)Add the new memslot. There is no O(1) interval_tree_replace(), so
1526a54d8066SMaciej S. Szmigiero  * hva_node needs to be swapped with remove+insert even though the hva can't
1527a54d8066SMaciej S. Szmigiero * change when replacing an existing slot.
15280fce5623SAvi Kivity */
1529a54d8066SMaciej S. Szmigiero hash_add(slots->id_hash, &new->id_node[idx], new->id);
1530a54d8066SMaciej S. Szmigiero interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);
15310fce5623SAvi Kivity
153226b8345aSMaciej S. Szmigiero /*
1533a54d8066SMaciej S. Szmigiero * If the memslot gfn is unchanged, rb_replace_node() can be used to
1534a54d8066SMaciej S. Szmigiero * switch the node in the gfn tree instead of removing the old and
1535a54d8066SMaciej S. Szmigiero * inserting the new as two separate operations. Replacement is a
1536a54d8066SMaciej S. Szmigiero * single O(1) operation versus two O(log(n)) operations for
1537a54d8066SMaciej S. Szmigiero * remove+insert.
153826b8345aSMaciej S. Szmigiero */
1539a54d8066SMaciej S. Szmigiero if (old && old->base_gfn == new->base_gfn) {
1540a54d8066SMaciej S. Szmigiero kvm_replace_gfn_node(slots, old, new);
15410577d1abSSean Christopherson } else {
1542a54d8066SMaciej S. Szmigiero if (old)
1543a54d8066SMaciej S. Szmigiero kvm_erase_gfn_node(slots, old);
1544a54d8066SMaciej S. Szmigiero kvm_insert_gfn_node(slots, new);
15450577d1abSSean Christopherson }
1546bf3e05bcSXiao Guangrong }
1547bf3e05bcSXiao Guangrong
154809170a49SPaolo Bonzini static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
1549a50d64d6SXiao Guangrong {
15504d8b81abSXiao Guangrong u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
15514d8b81abSXiao Guangrong
15520f8a4de3SChristoffer Dall #ifdef __KVM_HAVE_READONLY_MEM
15534d8b81abSXiao Guangrong valid_flags |= KVM_MEM_READONLY;
15544d8b81abSXiao Guangrong #endif
15554d8b81abSXiao Guangrong
15564d8b81abSXiao Guangrong if (mem->flags & ~valid_flags)
1557a50d64d6SXiao Guangrong return -EINVAL;
1558a50d64d6SXiao Guangrong
1559a50d64d6SXiao Guangrong return 0;
1560a50d64d6SXiao Guangrong }
1561a50d64d6SXiao Guangrong
1562a54d8066SMaciej S. Szmigiero static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
15637ec4fb44SGleb Natapov {
1564a54d8066SMaciej S. Szmigiero struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1565a54d8066SMaciej S. Szmigiero
1566a54d8066SMaciej S. Szmigiero /* Grab the generation from the active memslots. */
1567a54d8066SMaciej S. Szmigiero u64 gen = __kvm_memslots(kvm, as_id)->generation;
15687ec4fb44SGleb Natapov
1569361209e0SSean Christopherson WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1570361209e0SSean Christopherson slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1571ee3d1570SDavid Matlack
157252ac8b35SPaolo Bonzini /*
157352ac8b35SPaolo Bonzini * Do not store the new memslots while there are invalidations in
1574071064f1SPaolo Bonzini * progress, otherwise the locking in invalidate_range_start and
1575071064f1SPaolo Bonzini * invalidate_range_end will be unbalanced.
157652ac8b35SPaolo Bonzini */
157752ac8b35SPaolo Bonzini spin_lock(&kvm->mn_invalidate_lock);
157852ac8b35SPaolo Bonzini prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
157952ac8b35SPaolo Bonzini while (kvm->mn_active_invalidate_count) {
158052ac8b35SPaolo Bonzini set_current_state(TASK_UNINTERRUPTIBLE);
158152ac8b35SPaolo Bonzini spin_unlock(&kvm->mn_invalidate_lock);
158252ac8b35SPaolo Bonzini schedule();
158352ac8b35SPaolo Bonzini spin_lock(&kvm->mn_invalidate_lock);
158452ac8b35SPaolo Bonzini }
158552ac8b35SPaolo Bonzini finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
1586f481b069SPaolo Bonzini rcu_assign_pointer(kvm->memslots[as_id], slots);
158752ac8b35SPaolo Bonzini spin_unlock(&kvm->mn_invalidate_lock);
1588b10a038eSBen Gardon
1589b10a038eSBen Gardon /*
1590b10a038eSBen Gardon * Acquired in kvm_set_memslot. Must be released before synchronize
1591b10a038eSBen Gardon * SRCU below in order to avoid deadlock with another thread
1592b10a038eSBen Gardon * acquiring the slots_arch_lock in an srcu critical section.
1593b10a038eSBen Gardon */
1594b10a038eSBen Gardon mutex_unlock(&kvm->slots_arch_lock);
1595b10a038eSBen Gardon
15967ec4fb44SGleb Natapov synchronize_srcu_expedited(&kvm->srcu);
1597e59dbe09STakuya Yoshikawa
1598ee3d1570SDavid Matlack /*
1599361209e0SSean Christopherson * Increment the new memslot generation a second time, dropping the
160000116795SMiaohe Lin * update in-progress flag and incrementing the generation based on
1601361209e0SSean Christopherson * the number of address spaces. This provides a unique and easily
1602361209e0SSean Christopherson * identifiable generation number while the memslots are in flux.
1603361209e0SSean Christopherson */
1604361209e0SSean Christopherson gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1605361209e0SSean Christopherson
1606361209e0SSean Christopherson /*
16074bd518f1SPaolo Bonzini * Generations must be unique even across address spaces. We do not need
16084bd518f1SPaolo Bonzini * a global counter for that, instead the generation space is evenly split
16094bd518f1SPaolo Bonzini * across address spaces. For example, with two address spaces, address
1610164bf7e5SSean Christopherson * space 0 will use generations 0, 2, 4, ... while address space 1 will
1611164bf7e5SSean Christopherson * use generations 1, 3, 5, ...
1612ee3d1570SDavid Matlack */
1613164bf7e5SSean Christopherson gen += KVM_ADDRESS_SPACE_NUM;
1614ee3d1570SDavid Matlack
161515248258SSean Christopherson kvm_arch_memslots_updated(kvm, gen);
161615248258SSean Christopherson
161715248258SSean Christopherson slots->generation = gen;
16187ec4fb44SGleb Natapov }
16197ec4fb44SGleb Natapov
162007921665SSean Christopherson static int kvm_prepare_memory_region(struct kvm *kvm,
162107921665SSean Christopherson const struct kvm_memory_slot *old,
162207921665SSean Christopherson struct kvm_memory_slot *new,
162336947254SSean Christopherson enum kvm_mr_change change)
162436947254SSean Christopherson {
1625cf47f50bSSean Christopherson int r;
1626cf47f50bSSean Christopherson
1627b10a038eSBen Gardon /*
162807921665SSean Christopherson * If dirty logging is disabled, nullify the bitmap; the old bitmap
162907921665SSean Christopherson * will be freed on "commit". If logging is enabled in both old and
163007921665SSean Christopherson * new, reuse the existing bitmap. If logging is enabled only in the
163107921665SSean Christopherson * new and KVM isn't using a ring buffer, allocate and initialize a
163207921665SSean Christopherson * new bitmap.
163307921665SSean Christopherson */
1634244893faSSean Christopherson if (change != KVM_MR_DELETE) {
163507921665SSean Christopherson if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
163607921665SSean Christopherson new->dirty_bitmap = NULL;
1637244893faSSean Christopherson else if (old && old->dirty_bitmap)
163807921665SSean Christopherson new->dirty_bitmap = old->dirty_bitmap;
163986bdf3ebSGavin Shan else if (kvm_use_dirty_bitmap(kvm)) {
164007921665SSean Christopherson r = kvm_alloc_dirty_bitmap(new);
164107921665SSean Christopherson if (r)
164207921665SSean Christopherson return r;
164307921665SSean Christopherson
164407921665SSean Christopherson if (kvm_dirty_log_manual_protect_and_init_set(kvm))
164507921665SSean Christopherson bitmap_set(new->dirty_bitmap, 0, new->npages);
164607921665SSean Christopherson }
1647244893faSSean Christopherson }
164807921665SSean Christopherson
164907921665SSean Christopherson r = kvm_arch_prepare_memory_region(kvm, old, new, change);
165007921665SSean Christopherson
165107921665SSean Christopherson /* Free the bitmap on failure if it was allocated above. */
1652c87661f8SSean Christopherson if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
165307921665SSean Christopherson kvm_destroy_dirty_bitmap(new);
165407921665SSean Christopherson
165507921665SSean Christopherson return r;
165607921665SSean Christopherson }
165707921665SSean Christopherson
165807921665SSean Christopherson static void kvm_commit_memory_region(struct kvm *kvm,
165907921665SSean Christopherson struct kvm_memory_slot *old,
166007921665SSean Christopherson const struct kvm_memory_slot *new,
166107921665SSean Christopherson enum kvm_mr_change change)
166207921665SSean Christopherson {
16636c7b2202SPaolo Bonzini int old_flags = old ? old->flags : 0;
16646c7b2202SPaolo Bonzini int new_flags = new ? new->flags : 0;
166507921665SSean Christopherson /*
166607921665SSean Christopherson * Update the total number of memslot pages before calling the arch
166707921665SSean Christopherson * hook so that architectures can consume the result directly.
166807921665SSean Christopherson */
166907921665SSean Christopherson if (change == KVM_MR_DELETE)
167007921665SSean Christopherson kvm->nr_memslot_pages -= old->npages;
167107921665SSean Christopherson else if (change == KVM_MR_CREATE)
167207921665SSean Christopherson kvm->nr_memslot_pages += new->npages;
167307921665SSean Christopherson
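/* Keep the count of memslots with dirty logging enabled in sync. */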
16746c7b2202SPaolo Bonzini if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
16756c7b2202SPaolo Bonzini int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;
16766c7b2202SPaolo Bonzini atomic_set(&kvm->nr_memslots_dirty_logging,
16776c7b2202SPaolo Bonzini atomic_read(&kvm->nr_memslots_dirty_logging) + change);
16786c7b2202SPaolo Bonzini }
16796c7b2202SPaolo Bonzini
168007921665SSean Christopherson kvm_arch_commit_memory_region(kvm, old, new, change);
168107921665SSean Christopherson
1682a54d8066SMaciej S. Szmigiero switch (change) {
1683a54d8066SMaciej S. Szmigiero case KVM_MR_CREATE:
1684a54d8066SMaciej S. Szmigiero /* Nothing more to do. */
1685a54d8066SMaciej S. Szmigiero break;
1686a54d8066SMaciej S. Szmigiero case KVM_MR_DELETE:
1687a54d8066SMaciej S. Szmigiero /* Free the old memslot and all its metadata. */
168807921665SSean Christopherson kvm_free_memslot(kvm, old);
1689a54d8066SMaciej S. Szmigiero break;
1690a54d8066SMaciej S. Szmigiero case KVM_MR_MOVE:
1691a54d8066SMaciej S. Szmigiero case KVM_MR_FLAGS_ONLY:
1692a54d8066SMaciej S. Szmigiero /*
1693a54d8066SMaciej S. Szmigiero  * Free the dirty bitmap as needed; the check below encompasses
1694a54d8066SMaciej S. Szmigiero  * both the flags and whether a ring buffer is being used.
1695a54d8066SMaciej S. Szmigiero */
1696a54d8066SMaciej S. Szmigiero if (old->dirty_bitmap && !new->dirty_bitmap)
169707921665SSean Christopherson kvm_destroy_dirty_bitmap(old);
1698a54d8066SMaciej S. Szmigiero
1699a54d8066SMaciej S. Szmigiero /*
1700a54d8066SMaciej S. Szmigiero * The final quirk. Free the detached, old slot, but only its
1701a54d8066SMaciej S. Szmigiero * memory, not any metadata. Metadata, including arch specific
1702a54d8066SMaciej S. Szmigiero * data, may be reused by @new.
1703a54d8066SMaciej S. Szmigiero */
1704a54d8066SMaciej S. Szmigiero kfree(old);
1705a54d8066SMaciej S. Szmigiero break;
1706a54d8066SMaciej S. Szmigiero default:
1707a54d8066SMaciej S. Szmigiero BUG();
1708a54d8066SMaciej S. Szmigiero }
1709a54d8066SMaciej S. Szmigiero }
1710a54d8066SMaciej S. Szmigiero
1711a54d8066SMaciej S. Szmigiero /*
1712a54d8066SMaciej S. Szmigiero * Activate @new, which must be installed in the inactive slots by the caller,
1713a54d8066SMaciej S. Szmigiero * by swapping the active slots and then propagating @new to @old once @old is
1714a54d8066SMaciej S. Szmigiero * unreachable and can be safely modified.
1715a54d8066SMaciej S. Szmigiero *
1716a54d8066SMaciej S. Szmigiero * With NULL @old this simply adds @new to @active (while swapping the sets).
1717a54d8066SMaciej S. Szmigiero * With NULL @new this simply removes @old from @active and frees it
1718a54d8066SMaciej S. Szmigiero * (while also swapping the sets).
1719a54d8066SMaciej S. Szmigiero */
1720a54d8066SMaciej S. Szmigiero static void kvm_activate_memslot(struct kvm *kvm,
1721a54d8066SMaciej S. Szmigiero struct kvm_memory_slot *old,
1722a54d8066SMaciej S. Szmigiero struct kvm_memory_slot *new)
1723a54d8066SMaciej S. Szmigiero {
1724a54d8066SMaciej S. Szmigiero int as_id = kvm_memslots_get_as_id(old, new);
1725a54d8066SMaciej S. Szmigiero
1726a54d8066SMaciej S. Szmigiero kvm_swap_active_memslots(kvm, as_id);
1727a54d8066SMaciej S. Szmigiero
1728a54d8066SMaciej S. Szmigiero /* Propagate the new memslot to the now inactive memslots. */
1729a54d8066SMaciej S. Szmigiero kvm_replace_memslot(kvm, old, new);
1730a54d8066SMaciej S. Szmigiero }
1731a54d8066SMaciej S. Szmigiero
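/*
 * Copy only the memslot payload; the hash, interval tree and gfn tree node
 * fields are deliberately left untouched and must be set up by the caller.
 */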
1732a54d8066SMaciej S. Szmigiero static void kvm_copy_memslot(struct kvm_memory_slot *dest,
1733a54d8066SMaciej S. Szmigiero const struct kvm_memory_slot *src)
1734a54d8066SMaciej S. Szmigiero {
1735a54d8066SMaciej S. Szmigiero dest->base_gfn = src->base_gfn;
1736a54d8066SMaciej S. Szmigiero dest->npages = src->npages;
1737a54d8066SMaciej S. Szmigiero dest->dirty_bitmap = src->dirty_bitmap;
1738a54d8066SMaciej S. Szmigiero dest->arch = src->arch;
1739a54d8066SMaciej S. Szmigiero dest->userspace_addr = src->userspace_addr;
1740a54d8066SMaciej S. Szmigiero dest->flags = src->flags;
1741a54d8066SMaciej S. Szmigiero dest->id = src->id;
1742a54d8066SMaciej S. Szmigiero dest->as_id = src->as_id;
1743a54d8066SMaciej S. Szmigiero }
1744a54d8066SMaciej S. Szmigiero
1745a54d8066SMaciej S. Szmigiero static void kvm_invalidate_memslot(struct kvm *kvm,
1746a54d8066SMaciej S. Szmigiero struct kvm_memory_slot *old,
1747244893faSSean Christopherson struct kvm_memory_slot *invalid_slot)
1748a54d8066SMaciej S. Szmigiero {
1749a54d8066SMaciej S. Szmigiero /*
1750a54d8066SMaciej S. Szmigiero * Mark the current slot INVALID. As with all memslot modifications,
1751a54d8066SMaciej S. Szmigiero * this must be done on an unreachable slot to avoid modifying the
1752a54d8066SMaciej S. Szmigiero * current slot in the active tree.
1753a54d8066SMaciej S. Szmigiero */
1754244893faSSean Christopherson kvm_copy_memslot(invalid_slot, old);
1755244893faSSean Christopherson invalid_slot->flags |= KVM_MEMSLOT_INVALID;
1756244893faSSean Christopherson kvm_replace_memslot(kvm, old, invalid_slot);
1757a54d8066SMaciej S. Szmigiero
1758a54d8066SMaciej S. Szmigiero /*
1759a54d8066SMaciej S. Szmigiero * Activate the slot that is now marked INVALID, but don't propagate
1760a54d8066SMaciej S. Szmigiero * the slot to the now inactive slots. The slot is either going to be
1761a54d8066SMaciej S. Szmigiero * deleted or recreated as a new slot.
1762a54d8066SMaciej S. Szmigiero */
1763a54d8066SMaciej S. Szmigiero kvm_swap_active_memslots(kvm, old->as_id);
1764a54d8066SMaciej S. Szmigiero
1765a54d8066SMaciej S. Szmigiero /*
1766a54d8066SMaciej S. Szmigiero * From this point no new shadow pages pointing to a deleted, or moved,
1767a54d8066SMaciej S. Szmigiero * memslot will be created. Validation of sp->gfn happens in:
1768a54d8066SMaciej S. Szmigiero * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1769a54d8066SMaciej S. Szmigiero * - kvm_is_visible_gfn (mmu_check_root)
1770a54d8066SMaciej S. Szmigiero */
1771bcb63dcdSMaciej S. Szmigiero kvm_arch_flush_shadow_memslot(kvm, old);
1772683412ccSMingwei Zhang kvm_arch_guest_memory_reclaimed(kvm);
1773a54d8066SMaciej S. Szmigiero
1774b0d23708SJun Miao /* Was released by kvm_swap_active_memslots(), reacquire. */
1775a54d8066SMaciej S. Szmigiero mutex_lock(&kvm->slots_arch_lock);
1776a54d8066SMaciej S. Szmigiero
1777a54d8066SMaciej S. Szmigiero /*
1778a54d8066SMaciej S. Szmigiero * Copy the arch-specific field of the newly-installed slot back to the
1779a54d8066SMaciej S. Szmigiero * old slot as the arch data could have changed between releasing
1780b0d23708SJun Miao * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
1781a54d8066SMaciej S. Szmigiero * above. Writers are required to retrieve memslots *after* acquiring
1782a54d8066SMaciej S. Szmigiero * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
1783a54d8066SMaciej S. Szmigiero */
1784244893faSSean Christopherson old->arch = invalid_slot->arch;
1785a54d8066SMaciej S. Szmigiero }
1786a54d8066SMaciej S. Szmigiero
1787a54d8066SMaciej S. Szmigiero static void kvm_create_memslot(struct kvm *kvm,
1788244893faSSean Christopherson struct kvm_memory_slot *new)
1789a54d8066SMaciej S. Szmigiero {
1790244893faSSean Christopherson /* Add the new memslot to the inactive set and activate. */
1791244893faSSean Christopherson kvm_replace_memslot(kvm, NULL, new);
1792244893faSSean Christopherson kvm_activate_memslot(kvm, NULL, new);
1793a54d8066SMaciej S. Szmigiero }
1794a54d8066SMaciej S. Szmigiero
1795a54d8066SMaciej S. Szmigiero static void kvm_delete_memslot(struct kvm *kvm,
1796a54d8066SMaciej S. Szmigiero struct kvm_memory_slot *old,
1797a54d8066SMaciej S. Szmigiero struct kvm_memory_slot *invalid_slot)
1798a54d8066SMaciej S. Szmigiero {
1799a54d8066SMaciej S. Szmigiero /*
1800a54d8066SMaciej S. Szmigiero * Remove the old memslot (in the inactive memslots) by passing NULL as
1801244893faSSean Christopherson  * the "new" slot, and do the same for the invalid version in the active slots.
1802a54d8066SMaciej S. Szmigiero */
1803a54d8066SMaciej S. Szmigiero kvm_replace_memslot(kvm, old, NULL);
1804a54d8066SMaciej S. Szmigiero kvm_activate_memslot(kvm, invalid_slot, NULL);
1805a54d8066SMaciej S. Szmigiero }
1806a54d8066SMaciej S. Szmigiero
1807244893faSSean Christopherson static void kvm_move_memslot(struct kvm *kvm,
1808a54d8066SMaciej S. Szmigiero struct kvm_memory_slot *old,
1809244893faSSean Christopherson struct kvm_memory_slot *new,
1810a54d8066SMaciej S. Szmigiero struct kvm_memory_slot *invalid_slot)
1811a54d8066SMaciej S. Szmigiero {
1812a54d8066SMaciej S. Szmigiero /*
1813244893faSSean Christopherson * Replace the old memslot in the inactive slots, and then swap slots
1814244893faSSean Christopherson * and replace the current INVALID with the new as well.
1815a54d8066SMaciej S. Szmigiero */
1816244893faSSean Christopherson kvm_replace_memslot(kvm, old, new);
1817244893faSSean Christopherson kvm_activate_memslot(kvm, invalid_slot, new);
1818a54d8066SMaciej S. Szmigiero }
1819a54d8066SMaciej S. Szmigiero
1820a54d8066SMaciej S. Szmigiero static void kvm_update_flags_memslot(struct kvm *kvm,
1821a54d8066SMaciej S. Szmigiero struct kvm_memory_slot *old,
1822244893faSSean Christopherson struct kvm_memory_slot *new)
1823a54d8066SMaciej S. Szmigiero {
1824a54d8066SMaciej S. Szmigiero /*
1825a54d8066SMaciej S. Szmigiero * Similar to the MOVE case, but the slot doesn't need to be zapped as
1826a54d8066SMaciej S. Szmigiero * an intermediate step. Instead, the old memslot is simply replaced
1827a54d8066SMaciej S. Szmigiero * with a new, updated copy in both memslot sets.
1828a54d8066SMaciej S. Szmigiero */
1829244893faSSean Christopherson kvm_replace_memslot(kvm, old, new);
1830244893faSSean Christopherson kvm_activate_memslot(kvm, old, new);
183107921665SSean Christopherson }
183207921665SSean Christopherson
1833cf47f50bSSean Christopherson static int kvm_set_memslot(struct kvm *kvm,
1834a54d8066SMaciej S. Szmigiero struct kvm_memory_slot *old,
1835ce5f0215SSean Christopherson struct kvm_memory_slot *new,
1836cf47f50bSSean Christopherson enum kvm_mr_change change)
1837cf47f50bSSean Christopherson {
1838244893faSSean Christopherson struct kvm_memory_slot *invalid_slot;
1839cf47f50bSSean Christopherson int r;
1840cf47f50bSSean Christopherson
1841b10a038eSBen Gardon /*
1842b0d23708SJun Miao * Released in kvm_swap_active_memslots().
1843b10a038eSBen Gardon *
1844b0d23708SJun Miao * Must be held from before the current memslots are copied until after
1845b0d23708SJun Miao * the new memslots are installed with rcu_assign_pointer, then
1846b0d23708SJun Miao * released before the synchronize srcu in kvm_swap_active_memslots().
1847b10a038eSBen Gardon *
1848b10a038eSBen Gardon  * When modifying memslots outside of the slots_lock, this lock must be held
1849b10a038eSBen Gardon * before reading the pointer to the current memslots until after all
1850b10a038eSBen Gardon * changes to those memslots are complete.
1851b10a038eSBen Gardon *
1852b10a038eSBen Gardon * These rules ensure that installing new memslots does not lose
1853b10a038eSBen Gardon * changes made to the previous memslots.
1854b10a038eSBen Gardon */
1855b10a038eSBen Gardon mutex_lock(&kvm->slots_arch_lock);
1856b10a038eSBen Gardon
1857cf47f50bSSean Christopherson /*
1858a54d8066SMaciej S. Szmigiero * Invalidate the old slot if it's being deleted or moved. This is
1859a54d8066SMaciej S. Szmigiero * done prior to actually deleting/moving the memslot to allow vCPUs to
1860a54d8066SMaciej S. Szmigiero * continue running by ensuring there are no mappings or shadow pages
1861a54d8066SMaciej S. Szmigiero * for the memslot when it is deleted/moved. Without pre-invalidation
1862a54d8066SMaciej S. Szmigiero * (and without a lock), a window would exist between effecting the
1863a54d8066SMaciej S. Szmigiero * delete/move and committing the changes in arch code where KVM or a
1864a54d8066SMaciej S. Szmigiero * guest could access a non-existent memslot.
1865244893faSSean Christopherson *
1866244893faSSean Christopherson * Modifications are done on a temporary, unreachable slot. The old
1867244893faSSean Christopherson * slot needs to be preserved in case a later step fails and the
1868244893faSSean Christopherson * invalidation needs to be reverted.
1869cf47f50bSSean Christopherson */
1870244893faSSean Christopherson if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1871244893faSSean Christopherson invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
1872244893faSSean Christopherson if (!invalid_slot) {
1873cf47f50bSSean Christopherson mutex_unlock(&kvm->slots_arch_lock);
1874cf47f50bSSean Christopherson return -ENOMEM;
1875cf47f50bSSean Christopherson }
1876244893faSSean Christopherson kvm_invalidate_memslot(kvm, old, invalid_slot);
1877cf47f50bSSean Christopherson }
1878cf47f50bSSean Christopherson
1879a54d8066SMaciej S. Szmigiero r = kvm_prepare_memory_region(kvm, old, new, change);
1880a54d8066SMaciej S. Szmigiero if (r) {
1881bda44d84SSean Christopherson /*
1882a54d8066SMaciej S. Szmigiero * For DELETE/MOVE, revert the above INVALID change. No
1883a54d8066SMaciej S. Szmigiero * modifications required since the original slot was preserved
1884a54d8066SMaciej S. Szmigiero * in the inactive slots. Changing the active memslots also
1885a54d8066SMaciej S. Szmigiero  * releases slots_arch_lock.
1886bda44d84SSean Christopherson */
1887b10a038eSBen Gardon if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1888244893faSSean Christopherson kvm_activate_memslot(kvm, invalid_slot, old);
1889244893faSSean Christopherson kfree(invalid_slot);
1890b10a038eSBen Gardon } else {
1891b10a038eSBen Gardon mutex_unlock(&kvm->slots_arch_lock);
1892b10a038eSBen Gardon }
1893cf47f50bSSean Christopherson return r;
1894cf47f50bSSean Christopherson }
1895cf47f50bSSean Christopherson
18969e9eb226SPeter Xu /*
1897a54d8066SMaciej S. Szmigiero  * For DELETE and MOVE, the temporary invalid_slot is now active in place
1898a54d8066SMaciej S. Szmigiero  * of the old slot; it is a copy of the old slot marked KVM_MEMSLOT_INVALID
1899a54d8066SMaciej S. Szmigiero  * (see kvm_invalidate_memslot()).
1900a54d8066SMaciej S. Szmigiero  * For CREATE, there is no old slot. For DELETE and FLAGS_ONLY, the
1901a54d8066SMaciej S. Szmigiero  * old slot is detached but otherwise preserved.
19029e9eb226SPeter Xu */
1903a54d8066SMaciej S. Szmigiero if (change == KVM_MR_CREATE)
1904244893faSSean Christopherson kvm_create_memslot(kvm, new);
1905a54d8066SMaciej S. Szmigiero else if (change == KVM_MR_DELETE)
1906244893faSSean Christopherson kvm_delete_memslot(kvm, old, invalid_slot);
1907a54d8066SMaciej S. Szmigiero else if (change == KVM_MR_MOVE)
1908244893faSSean Christopherson kvm_move_memslot(kvm, old, new, invalid_slot);
1909a54d8066SMaciej S. Szmigiero else if (change == KVM_MR_FLAGS_ONLY)
1910244893faSSean Christopherson kvm_update_flags_memslot(kvm, old, new);
1911a54d8066SMaciej S. Szmigiero else
1912a54d8066SMaciej S. Szmigiero BUG();
19135c0b4f3dSSean Christopherson
1914244893faSSean Christopherson /* Free the temporary INVALID slot used for DELETE and MOVE. */
1915244893faSSean Christopherson if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1916244893faSSean Christopherson kfree(invalid_slot);
1917244893faSSean Christopherson
1918a54d8066SMaciej S. Szmigiero /*
1919a54d8066SMaciej S. Szmigiero * No need to refresh new->arch, changes after dropping slots_arch_lock
1920a413a625STom Rix * will directly hit the final, active memslot. Architectures are
1921a54d8066SMaciej S. Szmigiero * responsible for knowing that new->arch may be stale.
1922a54d8066SMaciej S. Szmigiero */
1923a54d8066SMaciej S. Szmigiero kvm_commit_memory_region(kvm, old, new, change);
1924a54d8066SMaciej S. Szmigiero
1925a54d8066SMaciej S. Szmigiero return 0;
1926a54d8066SMaciej S. Szmigiero }
1927a54d8066SMaciej S. Szmigiero
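/*
 * Returns true if the given gfn range overlaps a memslot other than the one
 * identified by @id.
 */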
192844401a20SMaciej S. Szmigiero static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
192944401a20SMaciej S. Szmigiero gfn_t start, gfn_t end)
193044401a20SMaciej S. Szmigiero {
193144401a20SMaciej S. Szmigiero struct kvm_memslot_iter iter;
193244401a20SMaciej S. Szmigiero
193344401a20SMaciej S. Szmigiero kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
193444401a20SMaciej S. Szmigiero if (iter.slot->id != id)
193544401a20SMaciej S. Szmigiero return true;
193644401a20SMaciej S. Szmigiero }
193744401a20SMaciej S. Szmigiero
193844401a20SMaciej S. Szmigiero return false;
19395c0b4f3dSSean Christopherson }
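/*
 * Worked example of the overlap rule (slot ids and GFNs are made up):
 * suppose slot 0 already covers GFNs [0x100, 0x200).  Creating slot 1 with
 * base_gfn = 0x180 and npages = 0x100 spans [0x180, 0x280); the iterator
 * finds slot 0 inside that range and its id differs, so this returns true
 * and __kvm_set_memory_region() below fails with -EEXIST.  Re-installing
 * slot 0 itself over the same range is not an overlap, because slots with
 * the requested id are skipped.
 */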
19405c0b4f3dSSean Christopherson
19410fce5623SAvi Kivity /*
19420fce5623SAvi Kivity * Allocate some memory and give it an address in the guest physical address
19430fce5623SAvi Kivity * space.
19440fce5623SAvi Kivity *
19450fce5623SAvi Kivity * Discontiguous memory is allowed, mostly for framebuffers.
19460fce5623SAvi Kivity *
194702d5d55bSDominik Dingel * Must be called holding kvm->slots_lock for write.
19480fce5623SAvi Kivity */
19490fce5623SAvi Kivity int __kvm_set_memory_region(struct kvm *kvm,
195009170a49SPaolo Bonzini const struct kvm_userspace_memory_region *mem)
19510fce5623SAvi Kivity {
1952244893faSSean Christopherson struct kvm_memory_slot *old, *new;
195344401a20SMaciej S. Szmigiero struct kvm_memslots *slots;
1954f64c0398STakuya Yoshikawa enum kvm_mr_change change;
19550f9bdef3SSean Christopherson unsigned long npages;
19560f9bdef3SSean Christopherson gfn_t base_gfn;
1957163da372SSean Christopherson int as_id, id;
1958163da372SSean Christopherson int r;
19590fce5623SAvi Kivity
1960a50d64d6SXiao Guangrong r = check_memory_region_flags(mem);
1961a50d64d6SXiao Guangrong if (r)
196271a4c30bSSean Christopherson return r;
1963a50d64d6SXiao Guangrong
1964f481b069SPaolo Bonzini as_id = mem->slot >> 16;
1965f481b069SPaolo Bonzini id = (u16)mem->slot;
1966f481b069SPaolo Bonzini
19670fce5623SAvi Kivity /* General sanity checks */
19686b285a55SSean Christopherson if ((mem->memory_size & (PAGE_SIZE - 1)) ||
19696b285a55SSean Christopherson (mem->memory_size != (unsigned long)mem->memory_size))
197071a4c30bSSean Christopherson return -EINVAL;
19710fce5623SAvi Kivity if (mem->guest_phys_addr & (PAGE_SIZE - 1))
197271a4c30bSSean Christopherson return -EINVAL;
1973fa3d315aSTakuya Yoshikawa /* We can read the guest memory with __xxx_user() later on. */
197409d952c9SPaolo Bonzini if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1975139bc8a6SMarc Zyngier (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
197696d4f267SLinus Torvalds !access_ok((void __user *)(unsigned long)mem->userspace_addr,
197709d952c9SPaolo Bonzini mem->memory_size))
197871a4c30bSSean Christopherson return -EINVAL;
1979f481b069SPaolo Bonzini if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
198071a4c30bSSean Christopherson return -EINVAL;
19810fce5623SAvi Kivity if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
198271a4c30bSSean Christopherson return -EINVAL;
19830f9bdef3SSean Christopherson if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
19840f9bdef3SSean Christopherson return -EINVAL;
19850fce5623SAvi Kivity
198644401a20SMaciej S. Szmigiero slots = __kvm_memslots(kvm, as_id);
19870fce5623SAvi Kivity
19885c0b4f3dSSean Christopherson /*
19897cd08553SSean Christopherson * Note, the old memslot (and the pointer itself!) may be invalidated
19907cd08553SSean Christopherson * and/or destroyed by kvm_set_memslot().
19915c0b4f3dSSean Christopherson */
199244401a20SMaciej S. Szmigiero old = id_to_memslot(slots, id);
1993163da372SSean Christopherson
199447ea7d90SSean Christopherson if (!mem->memory_size) {
19957cd08553SSean Christopherson if (!old || !old->npages)
199647ea7d90SSean Christopherson return -EINVAL;
199747ea7d90SSean Christopherson
19987cd08553SSean Christopherson if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
199947ea7d90SSean Christopherson return -EIO;
200047ea7d90SSean Christopherson
2001244893faSSean Christopherson return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
20025c0b4f3dSSean Christopherson }
20035c0b4f3dSSean Christopherson
20040f9bdef3SSean Christopherson base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
20050f9bdef3SSean Christopherson npages = (mem->memory_size >> PAGE_SHIFT);
20065c0b4f3dSSean Christopherson
20077cd08553SSean Christopherson if (!old || !old->npages) {
2008f64c0398STakuya Yoshikawa change = KVM_MR_CREATE;
2009afa319a5SSean Christopherson
2010afa319a5SSean Christopherson /*
2011afa319a5SSean Christopherson * To simplify KVM internals, the total number of pages across
2012afa319a5SSean Christopherson * all memslots must fit in an unsigned long.
2013afa319a5SSean Christopherson */
20140f9bdef3SSean Christopherson if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
2015afa319a5SSean Christopherson return -EINVAL;
20165c0b4f3dSSean Christopherson } else { /* Modify an existing slot. */
20170f9bdef3SSean Christopherson if ((mem->userspace_addr != old->userspace_addr) ||
20180f9bdef3SSean Christopherson (npages != old->npages) ||
20190f9bdef3SSean Christopherson ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
202071a4c30bSSean Christopherson return -EINVAL;
20210fce5623SAvi Kivity
20220f9bdef3SSean Christopherson if (base_gfn != old->base_gfn)
2023f64c0398STakuya Yoshikawa change = KVM_MR_MOVE;
20240f9bdef3SSean Christopherson else if (mem->flags != old->flags)
2025f64c0398STakuya Yoshikawa change = KVM_MR_FLAGS_ONLY;
202671a4c30bSSean Christopherson else /* Nothing to change. */
202771a4c30bSSean Christopherson return 0;
2028f64c0398STakuya Yoshikawa }
202909170a49SPaolo Bonzini
203044401a20SMaciej S. Szmigiero if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
20310f9bdef3SSean Christopherson kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
203271a4c30bSSean Christopherson return -EEXIST;
20330fce5623SAvi Kivity
2034244893faSSean Christopherson /* Allocate a slot that will persist in the memslots. */
2035244893faSSean Christopherson new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
2036244893faSSean Christopherson if (!new)
2037244893faSSean Christopherson return -ENOMEM;
20380f9bdef3SSean Christopherson
2039244893faSSean Christopherson new->as_id = as_id;
2040244893faSSean Christopherson new->id = id;
2041244893faSSean Christopherson new->base_gfn = base_gfn;
2042244893faSSean Christopherson new->npages = npages;
2043244893faSSean Christopherson new->flags = mem->flags;
2044244893faSSean Christopherson new->userspace_addr = mem->userspace_addr;
2045244893faSSean Christopherson
2046244893faSSean Christopherson r = kvm_set_memslot(kvm, old, new, change);
204771a4c30bSSean Christopherson if (r)
2048244893faSSean Christopherson kfree(new);
20490fce5623SAvi Kivity return r;
20500fce5623SAvi Kivity }
20510fce5623SAvi Kivity EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
20520fce5623SAvi Kivity
20530fce5623SAvi Kivity int kvm_set_memory_region(struct kvm *kvm,
205409170a49SPaolo Bonzini const struct kvm_userspace_memory_region *mem)
20550fce5623SAvi Kivity {
20560fce5623SAvi Kivity int r;
20570fce5623SAvi Kivity
205879fac95eSMarcelo Tosatti mutex_lock(&kvm->slots_lock);
205947ae31e2STakuya Yoshikawa r = __kvm_set_memory_region(kvm, mem);
206079fac95eSMarcelo Tosatti mutex_unlock(&kvm->slots_lock);
20610fce5623SAvi Kivity return r;
20620fce5623SAvi Kivity }
20630fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_set_memory_region);
20640fce5623SAvi Kivity
20657940876eSStephen Hemminger static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
206647ae31e2STakuya Yoshikawa struct kvm_userspace_memory_region *mem)
20670fce5623SAvi Kivity {
2068f481b069SPaolo Bonzini if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
20690fce5623SAvi Kivity return -EINVAL;
207009170a49SPaolo Bonzini
207147ae31e2STakuya Yoshikawa return kvm_set_memory_region(kvm, mem);
20720fce5623SAvi Kivity }
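/*
 * Example (illustrative sketch, not kernel code): a minimal userspace caller
 * of the ioctl handled above.  It assumes a VM fd from KVM_CREATE_VM, a
 * page-aligned 2 MiB mmap() at "backing", and <linux/kvm.h>/<sys/ioctl.h>;
 * error handling is reduced to err().
 *
 *	struct kvm_userspace_memory_region region = {
 *		.slot            = 0,          // bits 16-31 select the address space
 *		.flags           = KVM_MEM_LOG_DIRTY_PAGES,
 *		.guest_phys_addr = 0x100000,   // page aligned
 *		.memory_size     = 0x200000,   // page aligned; 0 deletes the slot
 *		.userspace_addr  = (__u64)(unsigned long)backing,
 *	};
 *
 *	if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region))
 *		err(1, "KVM_SET_USER_MEMORY_REGION");
 */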
20730fce5623SAvi Kivity
20740dff0846SSean Christopherson #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
20752a49f61dSSean Christopherson /**
20762a49f61dSSean Christopherson * kvm_get_dirty_log - get a snapshot of dirty pages
20772a49f61dSSean Christopherson * @kvm: pointer to kvm instance
20782a49f61dSSean Christopherson * @log: slot id and address to which we copy the log
20792a49f61dSSean Christopherson * @is_dirty: set to '1' if any dirty pages were found
20802a49f61dSSean Christopherson * @memslot: set to the associated memslot, always valid on success
20812a49f61dSSean Christopherson */
20822a49f61dSSean Christopherson int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
20832a49f61dSSean Christopherson int *is_dirty, struct kvm_memory_slot **memslot)
20840fce5623SAvi Kivity {
20859f6b8029SPaolo Bonzini struct kvm_memslots *slots;
2086843574a3SMarkus Elfring int i, as_id, id;
208787bf6e7dSTakuya Yoshikawa unsigned long n;
20880fce5623SAvi Kivity unsigned long any = 0;
20890fce5623SAvi Kivity
209086bdf3ebSGavin Shan /* Dirty ring tracking may be exclusive to dirty log tracking */
209186bdf3ebSGavin Shan if (!kvm_use_dirty_bitmap(kvm))
2092b2cc64c4SPeter Xu return -ENXIO;
2093b2cc64c4SPeter Xu
20942a49f61dSSean Christopherson *memslot = NULL;
20952a49f61dSSean Christopherson *is_dirty = 0;
20962a49f61dSSean Christopherson
2097f481b069SPaolo Bonzini as_id = log->slot >> 16;
2098f481b069SPaolo Bonzini id = (u16)log->slot;
2099f481b069SPaolo Bonzini if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2100843574a3SMarkus Elfring return -EINVAL;
21010fce5623SAvi Kivity
2102f481b069SPaolo Bonzini slots = __kvm_memslots(kvm, as_id);
21032a49f61dSSean Christopherson *memslot = id_to_memslot(slots, id);
21040577d1abSSean Christopherson if (!(*memslot) || !(*memslot)->dirty_bitmap)
2105843574a3SMarkus Elfring return -ENOENT;
21060fce5623SAvi Kivity
21072a49f61dSSean Christopherson kvm_arch_sync_dirty_log(kvm, *memslot);
21082a49f61dSSean Christopherson
21092a49f61dSSean Christopherson n = kvm_dirty_bitmap_bytes(*memslot);
21100fce5623SAvi Kivity
21110fce5623SAvi Kivity for (i = 0; !any && i < n/sizeof(long); ++i)
21122a49f61dSSean Christopherson any = (*memslot)->dirty_bitmap[i];
21130fce5623SAvi Kivity
21142a49f61dSSean Christopherson if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
2115843574a3SMarkus Elfring return -EFAULT;
21160fce5623SAvi Kivity
21170fce5623SAvi Kivity if (any)
21180fce5623SAvi Kivity *is_dirty = 1;
2119843574a3SMarkus Elfring return 0;
21200fce5623SAvi Kivity }
21212ba9f0d8SAneesh Kumar K.V EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
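/*
 * Sketch of how an architecture that does not select
 * CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT might wrap the helper above in
 * its own KVM_GET_DIRTY_LOG handler.  The flush hook name below is a
 * placeholder, not a real KVM symbol.
 *
 *	int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 *	{
 *		struct kvm_memory_slot *memslot;
 *		int is_dirty = 0;
 *		int r;
 *
 *		mutex_lock(&kvm->slots_lock);
 *		r = kvm_get_dirty_log(kvm, log, &is_dirty, &memslot);
 *		if (!r && is_dirty)
 *			arch_flush_dirty_shadow(kvm, memslot);	// placeholder hook
 *		mutex_unlock(&kvm->slots_lock);
 *		return r;
 *	}
 */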
21220fce5623SAvi Kivity
21230dff0846SSean Christopherson #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2124ba0513b5SMario Smarduch /**
2125b8b00220SJiang Biao * kvm_get_dirty_log_protect - get a snapshot of dirty pages
21262a31b9dbSPaolo Bonzini * and reenable dirty page tracking for the corresponding pages.
2127ba0513b5SMario Smarduch * @kvm: pointer to kvm instance
2128ba0513b5SMario Smarduch * @log: slot id and address to which we copy the log
2129ba0513b5SMario Smarduch *
2130ba0513b5SMario Smarduch * Keep in mind that VCPU threads can write to the bitmap
2131ba0513b5SMario Smarduch * concurrently. So, to avoid losing track of dirty pages, we keep the
2132ba0513b5SMario Smarduch * following order:
2133ba0513b5SMario Smarduch *
2134ba0513b5SMario Smarduch * 1. Take a snapshot of the bit and clear it if needed.
2135ba0513b5SMario Smarduch * 2. Write protect the corresponding page.
2136ba0513b5SMario Smarduch * 3. Copy the snapshot to userspace.
2137ba0513b5SMario Smarduch * 4. Upon return, the caller flushes TLBs if needed.
2138ba0513b5SMario Smarduch *
2139ba0513b5SMario Smarduch * Between 2 and 4, the guest may write to the page using the remaining TLB
2140ba0513b5SMario Smarduch * entry. This is not a problem because the page is reported dirty using
2141ba0513b5SMario Smarduch * the snapshot taken before and step 4 ensures that writes done after
2142ba0513b5SMario Smarduch * exiting to userspace will be logged for the next call.
2143ba0513b5SMario Smarduch *
2144ba0513b5SMario Smarduch */
21450dff0846SSean Christopherson static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
2146ba0513b5SMario Smarduch {
21479f6b8029SPaolo Bonzini struct kvm_memslots *slots;
2148ba0513b5SMario Smarduch struct kvm_memory_slot *memslot;
214958d6db34SMarkus Elfring int i, as_id, id;
2150ba0513b5SMario Smarduch unsigned long n;
2151ba0513b5SMario Smarduch unsigned long *dirty_bitmap;
2152ba0513b5SMario Smarduch unsigned long *dirty_bitmap_buffer;
21530dff0846SSean Christopherson bool flush;
2154ba0513b5SMario Smarduch
215586bdf3ebSGavin Shan /* Dirty ring tracking may be exclusive to dirty log tracking */
215686bdf3ebSGavin Shan if (!kvm_use_dirty_bitmap(kvm))
2157b2cc64c4SPeter Xu return -ENXIO;
2158b2cc64c4SPeter Xu
2159f481b069SPaolo Bonzini as_id = log->slot >> 16;
2160f481b069SPaolo Bonzini id = (u16)log->slot;
2161f481b069SPaolo Bonzini if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
216258d6db34SMarkus Elfring return -EINVAL;
2163ba0513b5SMario Smarduch
2164f481b069SPaolo Bonzini slots = __kvm_memslots(kvm, as_id);
2165f481b069SPaolo Bonzini memslot = id_to_memslot(slots, id);
21660577d1abSSean Christopherson if (!memslot || !memslot->dirty_bitmap)
21670577d1abSSean Christopherson return -ENOENT;
2168ba0513b5SMario Smarduch
2169ba0513b5SMario Smarduch dirty_bitmap = memslot->dirty_bitmap;
2170ba0513b5SMario Smarduch
21710dff0846SSean Christopherson kvm_arch_sync_dirty_log(kvm, memslot);
21720dff0846SSean Christopherson
2173ba0513b5SMario Smarduch n = kvm_dirty_bitmap_bytes(memslot);
21740dff0846SSean Christopherson flush = false;
21752a31b9dbSPaolo Bonzini if (kvm->manual_dirty_log_protect) {
21762a31b9dbSPaolo Bonzini /*
21772a31b9dbSPaolo Bonzini * Unlike kvm_get_dirty_log, we always return false in *flush,
21782a31b9dbSPaolo Bonzini * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
21792a31b9dbSPaolo Bonzini * is some code duplication between this function and
21802a31b9dbSPaolo Bonzini * kvm_get_dirty_log, but hopefully all architectures will
21812a31b9dbSPaolo Bonzini * transition to kvm_get_dirty_log_protect so that kvm_get_dirty_log
21822a31b9dbSPaolo Bonzini * can be eliminated.
21832a31b9dbSPaolo Bonzini */
21842a31b9dbSPaolo Bonzini dirty_bitmap_buffer = dirty_bitmap;
21852a31b9dbSPaolo Bonzini } else {
218603133347SClaudio Imbrenda dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2187ba0513b5SMario Smarduch memset(dirty_bitmap_buffer, 0, n);
2188ba0513b5SMario Smarduch
2189531810caSBen Gardon KVM_MMU_LOCK(kvm);
2190ba0513b5SMario Smarduch for (i = 0; i < n / sizeof(long); i++) {
2191ba0513b5SMario Smarduch unsigned long mask;
2192ba0513b5SMario Smarduch gfn_t offset;
2193ba0513b5SMario Smarduch
2194ba0513b5SMario Smarduch if (!dirty_bitmap[i])
2195ba0513b5SMario Smarduch continue;
2196ba0513b5SMario Smarduch
21970dff0846SSean Christopherson flush = true;
2198ba0513b5SMario Smarduch mask = xchg(&dirty_bitmap[i], 0);
2199ba0513b5SMario Smarduch dirty_bitmap_buffer[i] = mask;
2200ba0513b5SMario Smarduch
2201ba0513b5SMario Smarduch offset = i * BITS_PER_LONG;
220258d2930fSTakuya Yoshikawa kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
220358d2930fSTakuya Yoshikawa offset, mask);
220458d2930fSTakuya Yoshikawa }
2205531810caSBen Gardon KVM_MMU_UNLOCK(kvm);
22062a31b9dbSPaolo Bonzini }
22072a31b9dbSPaolo Bonzini
22080dff0846SSean Christopherson if (flush)
2209619b5072SDavid Matlack kvm_flush_remote_tlbs_memslot(kvm, memslot);
22100dff0846SSean Christopherson
2211ba0513b5SMario Smarduch if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
221258d6db34SMarkus Elfring return -EFAULT;
221358d6db34SMarkus Elfring return 0;
2214ba0513b5SMario Smarduch }
22150dff0846SSean Christopherson
22160dff0846SSean Christopherson
22170dff0846SSean Christopherson /**
22180dff0846SSean Christopherson * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
22190dff0846SSean Christopherson * @kvm: kvm instance
22200dff0846SSean Christopherson * @log: slot id and address to which we copy the log
22210dff0846SSean Christopherson *
22220dff0846SSean Christopherson * Steps 1-4 below provide general overview of dirty page logging. See
22230dff0846SSean Christopherson * kvm_get_dirty_log_protect() function description for additional details.
22240dff0846SSean Christopherson *
22250dff0846SSean Christopherson * We call kvm_get_dirty_log_protect() to handle steps 1-3; upon return we
22260dff0846SSean Christopherson * always flush the TLB (step 4) even if a previous step failed and the dirty
22270dff0846SSean Christopherson * bitmap may be corrupt. Regardless of the previous outcome, the KVM logging
22280dff0846SSean Christopherson * API does not preclude a subsequent dirty log read by userspace. Flushing
22290dff0846SSean Christopherson * the TLB ensures writes will be marked dirty for the next log read.
22300dff0846SSean Christopherson *
22310dff0846SSean Christopherson * 1. Take a snapshot of the bit and clear it if needed.
22320dff0846SSean Christopherson * 2. Write protect the corresponding page.
22330dff0846SSean Christopherson * 3. Copy the snapshot to userspace.
22340dff0846SSean Christopherson * 4. Flush TLBs if needed.
22350dff0846SSean Christopherson */
22360dff0846SSean Christopherson static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
22370dff0846SSean Christopherson struct kvm_dirty_log *log)
22380dff0846SSean Christopherson {
22390dff0846SSean Christopherson int r;
22400dff0846SSean Christopherson
22410dff0846SSean Christopherson mutex_lock(&kvm->slots_lock);
22420dff0846SSean Christopherson
22430dff0846SSean Christopherson r = kvm_get_dirty_log_protect(kvm, log);
22440dff0846SSean Christopherson
22450dff0846SSean Christopherson mutex_unlock(&kvm->slots_lock);
22460dff0846SSean Christopherson return r;
22470dff0846SSean Christopherson }
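/*
 * Example (illustrative sketch, not kernel code): the matching userspace
 * call.  It assumes the 2 MiB slot 0 created with KVM_MEM_LOG_DIRTY_PAGES
 * in the earlier sketch and 4 KiB pages; error handling is reduced to err().
 *
 *	__u64 npages = 0x200000 / 4096;
 *	__u64 *bitmap = calloc((npages + 63) / 64, sizeof(__u64));
 *	struct kvm_dirty_log log = {
 *		.slot         = 0,
 *		.dirty_bitmap = bitmap,
 *	};
 *
 *	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log))
 *		err(1, "KVM_GET_DIRTY_LOG");
 *	// bit N set => the guest wrote the page at slot offset N * 4096
 */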
22482a31b9dbSPaolo Bonzini
22492a31b9dbSPaolo Bonzini /**
22502a31b9dbSPaolo Bonzini * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
22512a31b9dbSPaolo Bonzini * and reenable dirty page tracking for the corresponding pages.
22522a31b9dbSPaolo Bonzini * @kvm: pointer to kvm instance
22532a31b9dbSPaolo Bonzini * @log: slot id and address from which to fetch the bitmap of dirty pages
22542a31b9dbSPaolo Bonzini */
22550dff0846SSean Christopherson static int kvm_clear_dirty_log_protect(struct kvm *kvm,
22560dff0846SSean Christopherson struct kvm_clear_dirty_log *log)
22572a31b9dbSPaolo Bonzini {
22582a31b9dbSPaolo Bonzini struct kvm_memslots *slots;
22592a31b9dbSPaolo Bonzini struct kvm_memory_slot *memslot;
226098938aa8STomas Bortoli int as_id, id;
22612a31b9dbSPaolo Bonzini gfn_t offset;
226298938aa8STomas Bortoli unsigned long i, n;
22632a31b9dbSPaolo Bonzini unsigned long *dirty_bitmap;
22642a31b9dbSPaolo Bonzini unsigned long *dirty_bitmap_buffer;
22650dff0846SSean Christopherson bool flush;
22662a31b9dbSPaolo Bonzini
226786bdf3ebSGavin Shan /* Dirty ring tracking may be exclusive to dirty log tracking */
226886bdf3ebSGavin Shan if (!kvm_use_dirty_bitmap(kvm))
2269b2cc64c4SPeter Xu return -ENXIO;
2270b2cc64c4SPeter Xu
22712a31b9dbSPaolo Bonzini as_id = log->slot >> 16;
22722a31b9dbSPaolo Bonzini id = (u16)log->slot;
22732a31b9dbSPaolo Bonzini if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
22742a31b9dbSPaolo Bonzini return -EINVAL;
22752a31b9dbSPaolo Bonzini
227676d58e0fSPaolo Bonzini if (log->first_page & 63)
22772a31b9dbSPaolo Bonzini return -EINVAL;
22782a31b9dbSPaolo Bonzini
22792a31b9dbSPaolo Bonzini slots = __kvm_memslots(kvm, as_id);
22802a31b9dbSPaolo Bonzini memslot = id_to_memslot(slots, id);
22810577d1abSSean Christopherson if (!memslot || !memslot->dirty_bitmap)
22820577d1abSSean Christopherson return -ENOENT;
22832a31b9dbSPaolo Bonzini
22842a31b9dbSPaolo Bonzini dirty_bitmap = memslot->dirty_bitmap;
22852a31b9dbSPaolo Bonzini
22864ddc9204SPeter Xu n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
228798938aa8STomas Bortoli
228898938aa8STomas Bortoli if (log->first_page > memslot->npages ||
228976d58e0fSPaolo Bonzini log->num_pages > memslot->npages - log->first_page ||
229076d58e0fSPaolo Bonzini (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
229198938aa8STomas Bortoli return -EINVAL;
229298938aa8STomas Bortoli
22930dff0846SSean Christopherson kvm_arch_sync_dirty_log(kvm, memslot);
22940dff0846SSean Christopherson
22950dff0846SSean Christopherson flush = false;
22962a31b9dbSPaolo Bonzini dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
22972a31b9dbSPaolo Bonzini if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
22982a31b9dbSPaolo Bonzini return -EFAULT;
22992a31b9dbSPaolo Bonzini
2300531810caSBen Gardon KVM_MMU_LOCK(kvm);
230153eac7a8SPeter Xu for (offset = log->first_page, i = offset / BITS_PER_LONG,
230253eac7a8SPeter Xu n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
23032a31b9dbSPaolo Bonzini i++, offset += BITS_PER_LONG) {
23042a31b9dbSPaolo Bonzini unsigned long mask = *dirty_bitmap_buffer++;
23052a31b9dbSPaolo Bonzini atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
23062a31b9dbSPaolo Bonzini if (!mask)
23072a31b9dbSPaolo Bonzini continue;
23082a31b9dbSPaolo Bonzini
23092a31b9dbSPaolo Bonzini mask &= atomic_long_fetch_andnot(mask, p);
23102a31b9dbSPaolo Bonzini
23112a31b9dbSPaolo Bonzini /*
23122a31b9dbSPaolo Bonzini * mask contains the bits that really have been cleared. This
23132a31b9dbSPaolo Bonzini * never includes any bits beyond the length of the memslot (if
23142a31b9dbSPaolo Bonzini * the length is not aligned to 64 pages), therefore it is not
23152a31b9dbSPaolo Bonzini * a problem if userspace sets them in log->dirty_bitmap.
23162a31b9dbSPaolo Bonzini */
23172a31b9dbSPaolo Bonzini if (mask) {
23180dff0846SSean Christopherson flush = true;
23192a31b9dbSPaolo Bonzini kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
23202a31b9dbSPaolo Bonzini offset, mask);
23212a31b9dbSPaolo Bonzini }
23222a31b9dbSPaolo Bonzini }
2323531810caSBen Gardon KVM_MMU_UNLOCK(kvm);
23242a31b9dbSPaolo Bonzini
23250dff0846SSean Christopherson if (flush)
2326619b5072SDavid Matlack kvm_flush_remote_tlbs_memslot(kvm, memslot);
23270dff0846SSean Christopherson
23282a31b9dbSPaolo Bonzini return 0;
23292a31b9dbSPaolo Bonzini }
23300dff0846SSean Christopherson
23310dff0846SSean Christopherson static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
23320dff0846SSean Christopherson struct kvm_clear_dirty_log *log)
23330dff0846SSean Christopherson {
23340dff0846SSean Christopherson int r;
23350dff0846SSean Christopherson
23360dff0846SSean Christopherson mutex_lock(&kvm->slots_lock);
23370dff0846SSean Christopherson
23380dff0846SSean Christopherson r = kvm_clear_dirty_log_protect(kvm, log);
23390dff0846SSean Christopherson
23400dff0846SSean Christopherson mutex_unlock(&kvm->slots_lock);
23410dff0846SSean Christopherson return r;
23420dff0846SSean Christopherson }
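/*
 * Example (illustrative sketch, not kernel code): clearing the bits that
 * userspace has processed.  It assumes KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2
 * was enabled on the VM and "bitmap" holds bits previously returned by
 * KVM_GET_DIRTY_LOG for the 512-page slot 0; error handling is reduced
 * to err().
 *
 *	struct kvm_clear_dirty_log clear = {
 *		.slot         = 0,
 *		.first_page   = 0,        // must be a multiple of 64
 *		.num_pages    = 512,      // multiple of 64, or runs to the slot end
 *		.dirty_bitmap = bitmap,   // bits to clear and write-protect again
 *	};
 *
 *	if (ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear))
 *		err(1, "KVM_CLEAR_DIRTY_LOG");
 */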
23430dff0846SSean Christopherson #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2344ba0513b5SMario Smarduch
234549c7754cSGleb Natapov struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
234649c7754cSGleb Natapov {
234749c7754cSGleb Natapov return __gfn_to_memslot(kvm_memslots(kvm), gfn);
234849c7754cSGleb Natapov }
2349a1f4d395SAvi Kivity EXPORT_SYMBOL_GPL(gfn_to_memslot);
23500fce5623SAvi Kivity
23518e73485cSPaolo Bonzini struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
23528e73485cSPaolo Bonzini {
2353fe22ed82SDavid Matlack struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2354a54d8066SMaciej S. Szmigiero u64 gen = slots->generation;
2355fe22ed82SDavid Matlack struct kvm_memory_slot *slot;
2356fe22ed82SDavid Matlack
2357a54d8066SMaciej S. Szmigiero /*
2358a54d8066SMaciej S. Szmigiero * This also protects against using a memslot from a different address space,
2359a54d8066SMaciej S. Szmigiero * since different address spaces have different generation numbers.
2360a54d8066SMaciej S. Szmigiero */
2361a54d8066SMaciej S. Szmigiero if (unlikely(gen != vcpu->last_used_slot_gen)) {
2362a54d8066SMaciej S. Szmigiero vcpu->last_used_slot = NULL;
2363a54d8066SMaciej S. Szmigiero vcpu->last_used_slot_gen = gen;
2364a54d8066SMaciej S. Szmigiero }
2365a54d8066SMaciej S. Szmigiero
2366a54d8066SMaciej S. Szmigiero slot = try_get_memslot(vcpu->last_used_slot, gfn);
2367fe22ed82SDavid Matlack if (slot)
2368fe22ed82SDavid Matlack return slot;
2369fe22ed82SDavid Matlack
2370fe22ed82SDavid Matlack /*
2371fe22ed82SDavid Matlack * Fall back to searching all memslots. We purposely use
2372fe22ed82SDavid Matlack * search_memslots() instead of __gfn_to_memslot() to avoid
2373a54d8066SMaciej S. Szmigiero * thrashing the VM-wide last_used_slot in kvm_memslots.
2374fe22ed82SDavid Matlack */
2375a54d8066SMaciej S. Szmigiero slot = search_memslots(slots, gfn, false);
2376fe22ed82SDavid Matlack if (slot) {
2377a54d8066SMaciej S. Szmigiero vcpu->last_used_slot = slot;
2378fe22ed82SDavid Matlack return slot;
2379fe22ed82SDavid Matlack }
2380fe22ed82SDavid Matlack
2381fe22ed82SDavid Matlack return NULL;
23828e73485cSPaolo Bonzini }
23838e73485cSPaolo Bonzini
238433e94154SYaowei Bai bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
23850fce5623SAvi Kivity {
2386bf3e05bcSXiao Guangrong struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
23870fce5623SAvi Kivity
2388c36b7150SPaolo Bonzini return kvm_is_visible_memslot(memslot);
23890fce5623SAvi Kivity }
23900fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
23910fce5623SAvi Kivity
2392995decb6SVitaly Kuznetsov bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2393995decb6SVitaly Kuznetsov {
2394995decb6SVitaly Kuznetsov struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2395995decb6SVitaly Kuznetsov
2396995decb6SVitaly Kuznetsov return kvm_is_visible_memslot(memslot);
2397995decb6SVitaly Kuznetsov }
2398995decb6SVitaly Kuznetsov EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2399995decb6SVitaly Kuznetsov
2400f9b84e19SSean Christopherson unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
24018f0b1ab6SJoerg Roedel {
24028f0b1ab6SJoerg Roedel struct vm_area_struct *vma;
24038f0b1ab6SJoerg Roedel unsigned long addr, size;
24048f0b1ab6SJoerg Roedel
24058f0b1ab6SJoerg Roedel size = PAGE_SIZE;
24068f0b1ab6SJoerg Roedel
240742cde48bSSean Christopherson addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
24088f0b1ab6SJoerg Roedel if (kvm_is_error_hva(addr))
24098f0b1ab6SJoerg Roedel return PAGE_SIZE;
24108f0b1ab6SJoerg Roedel
2411d8ed45c5SMichel Lespinasse mmap_read_lock(current->mm);
24128f0b1ab6SJoerg Roedel vma = find_vma(current->mm, addr);
24138f0b1ab6SJoerg Roedel if (!vma)
24148f0b1ab6SJoerg Roedel goto out;
24158f0b1ab6SJoerg Roedel
24168f0b1ab6SJoerg Roedel size = vma_kernel_pagesize(vma);
24178f0b1ab6SJoerg Roedel
24188f0b1ab6SJoerg Roedel out:
2419d8ed45c5SMichel Lespinasse mmap_read_unlock(current->mm);
24208f0b1ab6SJoerg Roedel
24218f0b1ab6SJoerg Roedel return size;
24228f0b1ab6SJoerg Roedel }
24238f0b1ab6SJoerg Roedel
24248283e36aSBen Gardon static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
24254d8b81abSXiao Guangrong {
24264d8b81abSXiao Guangrong return slot->flags & KVM_MEM_READONLY;
24274d8b81abSXiao Guangrong }
24284d8b81abSXiao Guangrong
24298283e36aSBen Gardon static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
24304d8b81abSXiao Guangrong gfn_t *nr_pages, bool write)
24310fce5623SAvi Kivity {
2432bc6678a3SMarcelo Tosatti if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2433ca3a490cSXiao Guangrong return KVM_HVA_ERR_BAD;
243448987781SXiao Guangrong
24354d8b81abSXiao Guangrong if (memslot_is_readonly(slot) && write)
24364d8b81abSXiao Guangrong return KVM_HVA_ERR_RO_BAD;
243748987781SXiao Guangrong
243848987781SXiao Guangrong if (nr_pages)
243948987781SXiao Guangrong *nr_pages = slot->npages - (gfn - slot->base_gfn);
244048987781SXiao Guangrong
24414d8b81abSXiao Guangrong return __gfn_to_hva_memslot(slot, gfn);
24420fce5623SAvi Kivity }
244348987781SXiao Guangrong
24444d8b81abSXiao Guangrong static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
24454d8b81abSXiao Guangrong gfn_t *nr_pages)
24464d8b81abSXiao Guangrong {
24474d8b81abSXiao Guangrong return __gfn_to_hva_many(slot, gfn, nr_pages, true);
24484d8b81abSXiao Guangrong }
24494d8b81abSXiao Guangrong
24504d8b81abSXiao Guangrong unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
24514d8b81abSXiao Guangrong gfn_t gfn)
24524d8b81abSXiao Guangrong {
24534d8b81abSXiao Guangrong return gfn_to_hva_many(slot, gfn, NULL);
24544d8b81abSXiao Guangrong }
24554d8b81abSXiao Guangrong EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
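/*
 * The translation done by __gfn_to_hva_memslot() (defined in kvm_host.h) is
 * plain arithmetic, roughly (modulo an array_index_nospec() clamp):
 *
 *	hva = slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
 *
 * e.g. with 4 KiB pages, a slot with base_gfn 0x100 and userspace_addr
 * 0x7f0000000000 maps gfn 0x102 to hva 0x7f0000002000.  The wrappers above
 * and below only add the slot lookup, validity and read-only checks.
 */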
24564d8b81abSXiao Guangrong
245748987781SXiao Guangrong unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
245848987781SXiao Guangrong {
245949c7754cSGleb Natapov return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
246048987781SXiao Guangrong }
24610d150298SSheng Yang EXPORT_SYMBOL_GPL(gfn_to_hva);
24620fce5623SAvi Kivity
24638e73485cSPaolo Bonzini unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
24648e73485cSPaolo Bonzini {
24658e73485cSPaolo Bonzini return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
24668e73485cSPaolo Bonzini }
24678e73485cSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
24688e73485cSPaolo Bonzini
246986ab8cffSXiao Guangrong /*
2470970c0d4bSWei Yang * Return the hva of a @gfn and the R/W attribute if possible.
2471970c0d4bSWei Yang *
2472970c0d4bSWei Yang * @slot: the kvm_memory_slot which contains @gfn
2473970c0d4bSWei Yang * @gfn: the gfn to be translated
2474970c0d4bSWei Yang * @writable: used to return the read/write attribute of the @slot if the hva
2475970c0d4bSWei Yang * is valid and @writable is not NULL
247686ab8cffSXiao Guangrong */
247764d83126SChristoffer Dall unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
247864d83126SChristoffer Dall gfn_t gfn, bool *writable)
24798030089fSGleb Natapov {
2480a2ac07feSGleb Natapov unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2481a2ac07feSGleb Natapov
2482a2ac07feSGleb Natapov if (!kvm_is_error_hva(hva) && writable)
2483ba6a3541SPaolo Bonzini *writable = !memslot_is_readonly(slot);
2484ba6a3541SPaolo Bonzini
2485a2ac07feSGleb Natapov return hva;
248686ab8cffSXiao Guangrong }
248786ab8cffSXiao Guangrong
248864d83126SChristoffer Dall unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
248964d83126SChristoffer Dall {
249064d83126SChristoffer Dall struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
249164d83126SChristoffer Dall
249264d83126SChristoffer Dall return gfn_to_hva_memslot_prot(slot, gfn, writable);
249364d83126SChristoffer Dall }
249464d83126SChristoffer Dall
24958e73485cSPaolo Bonzini unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
24968e73485cSPaolo Bonzini {
24978e73485cSPaolo Bonzini struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
24988e73485cSPaolo Bonzini
24998e73485cSPaolo Bonzini return gfn_to_hva_memslot_prot(slot, gfn, writable);
25008e73485cSPaolo Bonzini }
25018e73485cSPaolo Bonzini
2502fafc3dbaSHuang Ying static inline int check_user_page_hwpoison(unsigned long addr)
2503fafc3dbaSHuang Ying {
25040d731759SLorenzo Stoakes int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
2505fafc3dbaSHuang Ying
250654d02069SLorenzo Stoakes rc = get_user_pages(addr, 1, flags, NULL);
2507fafc3dbaSHuang Ying return rc == -EHWPOISON;
2508fafc3dbaSHuang Ying }
2509fafc3dbaSHuang Ying
25102fc84311SXiao Guangrong /*
2511b9b33da2SPaolo Bonzini * The fast path to get the writable pfn which will be stored in @pfn,
2512b9b33da2SPaolo Bonzini * true indicates success, otherwise false is returned. It's also the
2513311497e0SMiaohe Lin * only part that can run in atomic context.
25142fc84311SXiao Guangrong */
2515b9b33da2SPaolo Bonzini static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2516b9b33da2SPaolo Bonzini bool *writable, kvm_pfn_t *pfn)
25170fce5623SAvi Kivity {
25180fce5623SAvi Kivity struct page *page[1];
25190fce5623SAvi Kivity
252012ce13feSXiao Guangrong /*
252112ce13feSXiao Guangrong * Fast pin a writable pfn only if it is a write fault request
252212ce13feSXiao Guangrong * or the caller allows to map a writable pfn for a read fault
252312ce13feSXiao Guangrong * request.
252412ce13feSXiao Guangrong */
252512ce13feSXiao Guangrong if (!(write_fault || writable))
252612ce13feSXiao Guangrong return false;
252712ce13feSXiao Guangrong
2528dadbb612SSouptick Joarder if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
25292fc84311SXiao Guangrong *pfn = page_to_pfn(page[0]);
2530612819c3SMarcelo Tosatti
2531612819c3SMarcelo Tosatti if (writable)
2532612819c3SMarcelo Tosatti *writable = true;
25332fc84311SXiao Guangrong return true;
25342fc84311SXiao Guangrong }
2535612819c3SMarcelo Tosatti
25362fc84311SXiao Guangrong return false;
25372fc84311SXiao Guangrong }
2538af585b92SGleb Natapov
25392fc84311SXiao Guangrong /*
25402fc84311SXiao Guangrong * The slow path to get the pfn of the specified host virtual address,
25412fc84311SXiao Guangrong * 1 indicates success, -errno is returned if error is detected.
25422fc84311SXiao Guangrong */
25432fc84311SXiao Guangrong static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2544c8b88b33SPeter Xu bool interruptible, bool *writable, kvm_pfn_t *pfn)
25452fc84311SXiao Guangrong {
2546b1e1296dSDavid Hildenbrand /*
2547b1e1296dSDavid Hildenbrand * When a VCPU accesses a page that is not mapped into the secondary
2548b1e1296dSDavid Hildenbrand * MMU, we lookup the page using GUP to map it, so the guest VCPU can
2549b1e1296dSDavid Hildenbrand * make progress. We always want to honor NUMA hinting faults in that
2550b1e1296dSDavid Hildenbrand * case, because GUP usage corresponds to memory accesses from the VCPU.
2551b1e1296dSDavid Hildenbrand * Otherwise, we'd not trigger NUMA hinting faults once a page is
2552b1e1296dSDavid Hildenbrand * mapped into the secondary MMU and gets accessed by a VCPU.
2553b1e1296dSDavid Hildenbrand *
2554b1e1296dSDavid Hildenbrand * Note that get_user_page_fast_only() and FOLL_WRITE for now
2555b1e1296dSDavid Hildenbrand * implicitly honor NUMA hinting faults and don't need this flag.
2556b1e1296dSDavid Hildenbrand */
2557b1e1296dSDavid Hildenbrand unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT;
2558ce53053cSAl Viro struct page *page;
255928249139SLi kunyu int npages;
25602fc84311SXiao Guangrong
25610fce5623SAvi Kivity might_sleep();
2562612819c3SMarcelo Tosatti
2563612819c3SMarcelo Tosatti if (writable)
2564612819c3SMarcelo Tosatti *writable = write_fault;
2565612819c3SMarcelo Tosatti
2566d4944b0eSLorenzo Stoakes if (write_fault)
2567d4944b0eSLorenzo Stoakes flags |= FOLL_WRITE;
2568ce53053cSAl Viro if (async)
2569ce53053cSAl Viro flags |= FOLL_NOWAIT;
2570c8b88b33SPeter Xu if (interruptible)
2571c8b88b33SPeter Xu flags |= FOLL_INTERRUPTIBLE;
2572d4944b0eSLorenzo Stoakes
2573ce53053cSAl Viro npages = get_user_pages_unlocked(addr, 1, &page, flags);
25742fc84311SXiao Guangrong if (npages != 1)
25752fc84311SXiao Guangrong return npages;
2576612819c3SMarcelo Tosatti
2577612819c3SMarcelo Tosatti /* map read fault as writable if possible */
257812ce13feSXiao Guangrong if (unlikely(!write_fault) && writable) {
2579ce53053cSAl Viro struct page *wpage;
2580612819c3SMarcelo Tosatti
2581dadbb612SSouptick Joarder if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2582612819c3SMarcelo Tosatti *writable = true;
2583ce53053cSAl Viro put_page(page);
2584ce53053cSAl Viro page = wpage;
2585612819c3SMarcelo Tosatti }
2586612819c3SMarcelo Tosatti }
2587ce53053cSAl Viro *pfn = page_to_pfn(page);
25882fc84311SXiao Guangrong return npages;
2589887c08acSXiao Guangrong }
25900fce5623SAvi Kivity
25914d8b81abSXiao Guangrong static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
25924d8b81abSXiao Guangrong {
25934d8b81abSXiao Guangrong if (unlikely(!(vma->vm_flags & VM_READ)))
25944d8b81abSXiao Guangrong return false;
25954d8b81abSXiao Guangrong
25964d8b81abSXiao Guangrong if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
25974d8b81abSXiao Guangrong return false;
25984d8b81abSXiao Guangrong
25994d8b81abSXiao Guangrong return true;
26004d8b81abSXiao Guangrong }
26014d8b81abSXiao Guangrong
2602f8be156bSNicholas Piggin static int kvm_try_get_pfn(kvm_pfn_t pfn)
2603f8be156bSNicholas Piggin {
2604b14b2690SSean Christopherson struct page *page = kvm_pfn_to_refcounted_page(pfn);
2605b14b2690SSean Christopherson
2606b14b2690SSean Christopherson if (!page)
2607f8be156bSNicholas Piggin return 1;
2608b14b2690SSean Christopherson
2609b14b2690SSean Christopherson return get_page_unless_zero(page);
2610f8be156bSNicholas Piggin }
2611f8be156bSNicholas Piggin
261392176a8eSPaolo Bonzini static int hva_to_pfn_remapped(struct vm_area_struct *vma,
26131625566eSXianting Tian unsigned long addr, bool write_fault,
26141625566eSXianting Tian bool *writable, kvm_pfn_t *p_pfn)
261592176a8eSPaolo Bonzini {
2616a9545779SSean Christopherson kvm_pfn_t pfn;
2617bd2fae8dSPaolo Bonzini pte_t *ptep;
2618c33c7948SRyan Roberts pte_t pte;
2619bd2fae8dSPaolo Bonzini spinlock_t *ptl;
2620add6a0cdSPaolo Bonzini int r;
2621add6a0cdSPaolo Bonzini
26229fd6dad1SPaolo Bonzini r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2623add6a0cdSPaolo Bonzini if (r) {
2624add6a0cdSPaolo Bonzini /*
2625add6a0cdSPaolo Bonzini * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
2626add6a0cdSPaolo Bonzini * not call the fault handler, so do it here.
2627add6a0cdSPaolo Bonzini */
2628add6a0cdSPaolo Bonzini bool unlocked = false;
262964019a2eSPeter Xu r = fixup_user_fault(current->mm, addr,
2630add6a0cdSPaolo Bonzini (write_fault ? FAULT_FLAG_WRITE : 0),
2631add6a0cdSPaolo Bonzini &unlocked);
2632a8387d0bSPaolo Bonzini if (unlocked)
2633a8387d0bSPaolo Bonzini return -EAGAIN;
2634add6a0cdSPaolo Bonzini if (r)
2635add6a0cdSPaolo Bonzini return r;
2636add6a0cdSPaolo Bonzini
26379fd6dad1SPaolo Bonzini r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2638add6a0cdSPaolo Bonzini if (r)
2639add6a0cdSPaolo Bonzini return r;
2640bd2fae8dSPaolo Bonzini }
2641add6a0cdSPaolo Bonzini
2642c33c7948SRyan Roberts pte = ptep_get(ptep);
2643c33c7948SRyan Roberts
2644c33c7948SRyan Roberts if (write_fault && !pte_write(pte)) {
2645bd2fae8dSPaolo Bonzini pfn = KVM_PFN_ERR_RO_FAULT;
2646bd2fae8dSPaolo Bonzini goto out;
2647add6a0cdSPaolo Bonzini }
2648add6a0cdSPaolo Bonzini
2649a340b3e2SKarimAllah Ahmed if (writable)
2650c33c7948SRyan Roberts *writable = pte_write(pte);
2651c33c7948SRyan Roberts pfn = pte_pfn(pte);
2652add6a0cdSPaolo Bonzini
2653add6a0cdSPaolo Bonzini /*
2654add6a0cdSPaolo Bonzini * Get a reference here because callers of *hva_to_pfn* and
2655add6a0cdSPaolo Bonzini * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
2656add6a0cdSPaolo Bonzini * returned pfn. This is only needed if the VMA has VM_MIXEDMAP
265736c3ce6cSMarc Zyngier * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
2658add6a0cdSPaolo Bonzini * simply do nothing for reserved pfns.
2659add6a0cdSPaolo Bonzini *
2660add6a0cdSPaolo Bonzini * Whoever called remap_pfn_range is also going to call e.g.
2661add6a0cdSPaolo Bonzini * unmap_mapping_range before the underlying pages are freed,
2662add6a0cdSPaolo Bonzini * causing a call to our MMU notifier.
2663f8be156bSNicholas Piggin *
2664f8be156bSNicholas Piggin * Certain IO or PFNMAP mappings can be backed with valid
2665f8be156bSNicholas Piggin * struct pages, but be allocated without refcounting e.g.,
2666f8be156bSNicholas Piggin * tail pages of non-compound higher order allocations, which
2667f8be156bSNicholas Piggin * would then underflow the refcount when the caller does the
2668f8be156bSNicholas Piggin * required put_page. Don't allow those pages here.
2669add6a0cdSPaolo Bonzini */
2670f8be156bSNicholas Piggin if (!kvm_try_get_pfn(pfn))
2671f8be156bSNicholas Piggin r = -EFAULT;
2672add6a0cdSPaolo Bonzini
2673bd2fae8dSPaolo Bonzini out:
2674bd2fae8dSPaolo Bonzini pte_unmap_unlock(ptep, ptl);
2675add6a0cdSPaolo Bonzini *p_pfn = pfn;
2676f8be156bSNicholas Piggin
2677f8be156bSNicholas Piggin return r;
267892176a8eSPaolo Bonzini }
267992176a8eSPaolo Bonzini
268012ce13feSXiao Guangrong /*
268112ce13feSXiao Guangrong * Pin guest page in memory and return its pfn.
268212ce13feSXiao Guangrong * @addr: host virtual address which maps memory to the guest
268312ce13feSXiao Guangrong * @atomic: whether this function can sleep
2684c8b88b33SPeter Xu * @interruptible: whether the process can be interrupted by non-fatal signals
268512ce13feSXiao Guangrong * @async: whether this function needs to wait for IO to complete if the
268612ce13feSXiao Guangrong * host page is not in memory
268712ce13feSXiao Guangrong * @write_fault: whether we should get a writable host page
268812ce13feSXiao Guangrong * @writable: whether to allow mapping a writable host page for !@write_fault
268912ce13feSXiao Guangrong *
269012ce13feSXiao Guangrong * The function will map a writable host page for these two cases:
269112ce13feSXiao Guangrong * 1): @write_fault = true
269212ce13feSXiao Guangrong * 2): @write_fault = false && @writable, @writable will tell the caller
269312ce13feSXiao Guangrong * whether the mapping is writable.
269412ce13feSXiao Guangrong */
2695c8b88b33SPeter Xu kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
2696c8b88b33SPeter Xu bool *async, bool write_fault, bool *writable)
26972fc84311SXiao Guangrong {
26982e2e3738SAnthony Liguori struct vm_area_struct *vma;
2699943dfea8SSean Christopherson kvm_pfn_t pfn;
270092176a8eSPaolo Bonzini int npages, r;
27012fc84311SXiao Guangrong
27022fc84311SXiao Guangrong /* we can do it either atomically or asynchronously, not both */
27032fc84311SXiao Guangrong BUG_ON(atomic && async);
27042fc84311SXiao Guangrong
2705b9b33da2SPaolo Bonzini if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
27062fc84311SXiao Guangrong return pfn;
27072e2e3738SAnthony Liguori
2708887c08acSXiao Guangrong if (atomic)
27096c8ee57bSXiao Guangrong return KVM_PFN_ERR_FAULT;
2710887c08acSXiao Guangrong
2711c8b88b33SPeter Xu npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
2712c8b88b33SPeter Xu writable, &pfn);
27132fc84311SXiao Guangrong if (npages == 1)
27142fc84311SXiao Guangrong return pfn;
2715fe5ed56cSPeter Xu if (npages == -EINTR)
2716fe5ed56cSPeter Xu return KVM_PFN_ERR_SIGPENDING;
27172e2e3738SAnthony Liguori
2718d8ed45c5SMichel Lespinasse mmap_read_lock(current->mm);
27190857b9e9SGleb Natapov if (npages == -EHWPOISON ||
27200857b9e9SGleb Natapov (!async && check_user_page_hwpoison(addr))) {
27212fc84311SXiao Guangrong pfn = KVM_PFN_ERR_HWPOISON;
27222fc84311SXiao Guangrong goto exit;
2723bf998156SHuang Ying }
2724bf998156SHuang Ying
2725a8387d0bSPaolo Bonzini retry:
2726fc98c03bSLiam Howlett vma = vma_lookup(current->mm, addr);
27274c2155ceSMarcelo Tosatti
27288030089fSGleb Natapov if (vma == NULL)
27296c8ee57bSXiao Guangrong pfn = KVM_PFN_ERR_FAULT;
273092176a8eSPaolo Bonzini else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
27311625566eSXianting Tian r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
2732a8387d0bSPaolo Bonzini if (r == -EAGAIN)
2733a8387d0bSPaolo Bonzini goto retry;
273492176a8eSPaolo Bonzini if (r < 0)
273592176a8eSPaolo Bonzini pfn = KVM_PFN_ERR_FAULT;
27368030089fSGleb Natapov } else {
27374d8b81abSXiao Guangrong if (async && vma_is_valid(vma, write_fault))
27388030089fSGleb Natapov *async = true;
27396c8ee57bSXiao Guangrong pfn = KVM_PFN_ERR_FAULT;
27408030089fSGleb Natapov }
27412fc84311SXiao Guangrong exit:
2742d8ed45c5SMichel Lespinasse mmap_read_unlock(current->mm);
27432e2e3738SAnthony Liguori return pfn;
274435149e21SAnthony Liguori }
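/*
 * Usage sketch for hva_to_pfn() above: a typical non-atomic caller that
 * tolerates a read-only mapping, where "addr" was obtained from
 * gfn_to_hva_many() or similar (illustrative, not taken from a real caller).
 *
 *	bool writable;
 *	kvm_pfn_t pfn;
 *
 *	pfn = hva_to_pfn(addr, false, false, NULL, false, &writable);
 *	if (is_error_noslot_pfn(pfn))
 *		return -EFAULT;
 *	// ... use pfn; "writable" reports whether it may be written ...
 *	kvm_release_pfn_clean(pfn);
 */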
274535149e21SAnthony Liguori
27468283e36aSBen Gardon kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
2747c8b88b33SPeter Xu bool atomic, bool interruptible, bool *async,
2748c8b88b33SPeter Xu bool write_fault, bool *writable, hva_t *hva)
2749887c08acSXiao Guangrong {
27504d8b81abSXiao Guangrong unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
27514d8b81abSXiao Guangrong
27524a42d848SDavid Stevens if (hva)
27534a42d848SDavid Stevens *hva = addr;
27544a42d848SDavid Stevens
2755b2740d35SPaolo Bonzini if (addr == KVM_HVA_ERR_RO_BAD) {
2756b2740d35SPaolo Bonzini if (writable)
2757b2740d35SPaolo Bonzini *writable = false;
27584d8b81abSXiao Guangrong return KVM_PFN_ERR_RO_FAULT;
2759b2740d35SPaolo Bonzini }
27604d8b81abSXiao Guangrong
2761b2740d35SPaolo Bonzini if (kvm_is_error_hva(addr)) {
2762b2740d35SPaolo Bonzini if (writable)
2763b2740d35SPaolo Bonzini *writable = false;
276481c52c56SXiao Guangrong return KVM_PFN_NOSLOT;
2765b2740d35SPaolo Bonzini }
27664d8b81abSXiao Guangrong
27674d8b81abSXiao Guangrong /* Do not map writable pfn in the readonly memslot. */
27684d8b81abSXiao Guangrong if (writable && memslot_is_readonly(slot)) {
27694d8b81abSXiao Guangrong *writable = false;
27704d8b81abSXiao Guangrong writable = NULL;
2771887c08acSXiao Guangrong }
27724d8b81abSXiao Guangrong
2773c8b88b33SPeter Xu return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
27744d8b81abSXiao Guangrong writable);
27754d8b81abSXiao Guangrong }
27763520469dSPaolo Bonzini EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
2777887c08acSXiao Guangrong
2778ba049e93SDan Williams kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
2779612819c3SMarcelo Tosatti bool *writable)
2780612819c3SMarcelo Tosatti {
2781c8b88b33SPeter Xu return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
2782c8b88b33SPeter Xu NULL, write_fault, writable, NULL);
2783612819c3SMarcelo Tosatti }
2784612819c3SMarcelo Tosatti EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2785612819c3SMarcelo Tosatti
27868283e36aSBen Gardon kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
2787506f0d6fSMarcelo Tosatti {
2788c8b88b33SPeter Xu return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
2789c8b88b33SPeter Xu NULL, NULL);
2790506f0d6fSMarcelo Tosatti }
2791e37afc6eSPaolo Bonzini EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
2792506f0d6fSMarcelo Tosatti
27938283e36aSBen Gardon kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
2794037d92dcSXiao Guangrong {
2795c8b88b33SPeter Xu return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
2796c8b88b33SPeter Xu NULL, NULL);
2797037d92dcSXiao Guangrong }
2798037d92dcSXiao Guangrong EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2799037d92dcSXiao Guangrong
2800ba049e93SDan Williams kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
28018e73485cSPaolo Bonzini {
28028e73485cSPaolo Bonzini return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
28038e73485cSPaolo Bonzini }
28048e73485cSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
28058e73485cSPaolo Bonzini
2806ba049e93SDan Williams kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
2807e37afc6eSPaolo Bonzini {
2808e37afc6eSPaolo Bonzini return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2809e37afc6eSPaolo Bonzini }
2810e37afc6eSPaolo Bonzini EXPORT_SYMBOL_GPL(gfn_to_pfn);
2811e37afc6eSPaolo Bonzini
2812ba049e93SDan Williams kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
28138e73485cSPaolo Bonzini {
28148e73485cSPaolo Bonzini return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
28158e73485cSPaolo Bonzini }
28168e73485cSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
28178e73485cSPaolo Bonzini
2818d9ef13c2SPaolo Bonzini int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2819d9ef13c2SPaolo Bonzini struct page **pages, int nr_pages)
282048987781SXiao Guangrong {
282148987781SXiao Guangrong unsigned long addr;
2822076b925dSArnd Bergmann gfn_t entry = 0;
282348987781SXiao Guangrong
2824d9ef13c2SPaolo Bonzini addr = gfn_to_hva_many(slot, gfn, &entry);
282548987781SXiao Guangrong if (kvm_is_error_hva(addr))
282648987781SXiao Guangrong return -1;
282748987781SXiao Guangrong
282848987781SXiao Guangrong if (entry < nr_pages)
282948987781SXiao Guangrong return 0;
283048987781SXiao Guangrong
2831dadbb612SSouptick Joarder return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
283248987781SXiao Guangrong }
283348987781SXiao Guangrong EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
283448987781SXiao Guangrong
2835b1624f99SSean Christopherson /*
2836b1624f99SSean Christopherson * Do not use this helper unless you are absolutely certain the gfn _must_ be
2837b1624f99SSean Christopherson * backed by 'struct page'. A valid example is if the backing memslot is
2838b1624f99SSean Christopherson * controlled by KVM. Note, if the returned page is valid, its refcount has
2839b1624f99SSean Christopherson * been elevated by gfn_to_pfn().
2840b1624f99SSean Christopherson */
284135149e21SAnthony Liguori struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
284235149e21SAnthony Liguori {
2843b14b2690SSean Christopherson struct page *page;
2844ba049e93SDan Williams kvm_pfn_t pfn;
28452e2e3738SAnthony Liguori
28462e2e3738SAnthony Liguori pfn = gfn_to_pfn(kvm, gfn);
28472e2e3738SAnthony Liguori
2848c77fb9dcSXiantao Zhang if (is_error_noslot_pfn(pfn))
28492e2e3738SAnthony Liguori return KVM_ERR_PTR_BAD_PAGE;
28502e2e3738SAnthony Liguori
2851b14b2690SSean Christopherson page = kvm_pfn_to_refcounted_page(pfn);
2852b14b2690SSean Christopherson if (!page)
28530fce5623SAvi Kivity return KVM_ERR_PTR_BAD_PAGE;
28540fce5623SAvi Kivity
2855b14b2690SSean Christopherson return page;
28560fce5623SAvi Kivity }
28570fce5623SAvi Kivity EXPORT_SYMBOL_GPL(gfn_to_page);
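/*
 * Usage sketch matching the refcount rule described above (illustrative):
 *
 *	struct page *page = gfn_to_page(kvm, gfn);
 *
 *	if (is_error_page(page))
 *		return -EFAULT;
 *	// ... access the page, e.g. via kmap_local_page() ...
 *	kvm_release_page_dirty(page);	// or kvm_release_page_clean() if untouched
 */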
28580fce5623SAvi Kivity
2859357a18adSDavid Woodhouse void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
286091724814SBoris Ostrovsky {
286191724814SBoris Ostrovsky if (dirty)
286291724814SBoris Ostrovsky kvm_release_pfn_dirty(pfn);
286391724814SBoris Ostrovsky else
286491724814SBoris Ostrovsky kvm_release_pfn_clean(pfn);
286591724814SBoris Ostrovsky }
286691724814SBoris Ostrovsky
2867357a18adSDavid Woodhouse int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
2868e45adf66SKarimAllah Ahmed {
2869e45adf66SKarimAllah Ahmed kvm_pfn_t pfn;
2870e45adf66SKarimAllah Ahmed void *hva = NULL;
2871e45adf66SKarimAllah Ahmed struct page *page = KVM_UNMAPPED_PAGE;
2872e45adf66SKarimAllah Ahmed
2873e45adf66SKarimAllah Ahmed if (!map)
2874e45adf66SKarimAllah Ahmed return -EINVAL;
2875e45adf66SKarimAllah Ahmed
2876357a18adSDavid Woodhouse pfn = gfn_to_pfn(vcpu->kvm, gfn);
2877e45adf66SKarimAllah Ahmed if (is_error_noslot_pfn(pfn))
2878e45adf66SKarimAllah Ahmed return -EINVAL;
2879e45adf66SKarimAllah Ahmed
2880e45adf66SKarimAllah Ahmed if (pfn_valid(pfn)) {
2881e45adf66SKarimAllah Ahmed page = pfn_to_page(pfn);
2882e45adf66SKarimAllah Ahmed hva = kmap(page);
2883d30b214dSPaolo Bonzini #ifdef CONFIG_HAS_IOMEM
288491724814SBoris Ostrovsky } else {
2885357a18adSDavid Woodhouse hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
2886d30b214dSPaolo Bonzini #endif
2887e45adf66SKarimAllah Ahmed }
2888e45adf66SKarimAllah Ahmed
2889e45adf66SKarimAllah Ahmed if (!hva)
2890e45adf66SKarimAllah Ahmed return -EFAULT;
2891e45adf66SKarimAllah Ahmed
2892e45adf66SKarimAllah Ahmed map->page = page;
2893e45adf66SKarimAllah Ahmed map->hva = hva;
2894e45adf66SKarimAllah Ahmed map->pfn = pfn;
2895e45adf66SKarimAllah Ahmed map->gfn = gfn;
2896e45adf66SKarimAllah Ahmed
2897e45adf66SKarimAllah Ahmed return 0;
2898e45adf66SKarimAllah Ahmed }
2899e45adf66SKarimAllah Ahmed EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2900e45adf66SKarimAllah Ahmed
2901357a18adSDavid Woodhouse void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
2902e45adf66SKarimAllah Ahmed {
2903e45adf66SKarimAllah Ahmed if (!map)
2904e45adf66SKarimAllah Ahmed return;
2905e45adf66SKarimAllah Ahmed
2906e45adf66SKarimAllah Ahmed if (!map->hva)
2907e45adf66SKarimAllah Ahmed return;
2908e45adf66SKarimAllah Ahmed
2909357a18adSDavid Woodhouse if (map->page != KVM_UNMAPPED_PAGE)
291091724814SBoris Ostrovsky kunmap(map->page);
291191724814SBoris Ostrovsky #ifdef CONFIG_HAS_IOMEM
291291724814SBoris Ostrovsky else
2913357a18adSDavid Woodhouse memunmap(map->hva);
2914eb1f2f38SChristian Borntraeger #endif
2915e45adf66SKarimAllah Ahmed
291691724814SBoris Ostrovsky if (dirty)
2917357a18adSDavid Woodhouse kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
291891724814SBoris Ostrovsky
2919357a18adSDavid Woodhouse kvm_release_pfn(map->pfn, dirty);
2920e45adf66SKarimAllah Ahmed
2921e45adf66SKarimAllah Ahmed map->hva = NULL;
2922e45adf66SKarimAllah Ahmed map->page = NULL;
2923e45adf66SKarimAllah Ahmed }
2924e45adf66SKarimAllah Ahmed EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
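/*
 * Usage sketch for the map/unmap pair above (illustrative): kvm_vcpu_map()
 * handles both struct-page and, with CONFIG_HAS_IOMEM, PFNMAP-backed guest
 * memory; the caller must always unmap and say whether it dirtied the page.
 * "offset", "data" and "len" are assumed to stay within the mapped page.
 *
 *	struct kvm_host_map map;
 *
 *	if (kvm_vcpu_map(vcpu, gfn, &map))
 *		return -EFAULT;
 *	memcpy(map.hva + offset, data, len);	// example access
 *	kvm_vcpu_unmap(vcpu, &map, true);	// true: mark the gfn dirty
 */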
2925e45adf66SKarimAllah Ahmed
29268e1c6914SSean Christopherson static bool kvm_is_ad_tracked_page(struct page *page)
29278e73485cSPaolo Bonzini {
29288e1c6914SSean Christopherson /*
29298e1c6914SSean Christopherson * Per page-flags.h, pages tagged PG_reserved "should in general not be
29308e1c6914SSean Christopherson * touched (e.g. set dirty) except by its owner".
29318e1c6914SSean Christopherson */
29328e1c6914SSean Christopherson return !PageReserved(page);
29338e73485cSPaolo Bonzini }
29348e1c6914SSean Christopherson
29358e1c6914SSean Christopherson static void kvm_set_page_dirty(struct page *page)
29368e1c6914SSean Christopherson {
29378e1c6914SSean Christopherson if (kvm_is_ad_tracked_page(page))
29388e1c6914SSean Christopherson SetPageDirty(page);
29398e1c6914SSean Christopherson }
29408e1c6914SSean Christopherson
29418e1c6914SSean Christopherson static void kvm_set_page_accessed(struct page *page)
29428e1c6914SSean Christopherson {
29438e1c6914SSean Christopherson if (kvm_is_ad_tracked_page(page))
29448e1c6914SSean Christopherson mark_page_accessed(page);
29458e1c6914SSean Christopherson }
29468e73485cSPaolo Bonzini
29470fce5623SAvi Kivity void kvm_release_page_clean(struct page *page)
29480fce5623SAvi Kivity {
294932cad84fSXiao Guangrong WARN_ON(is_error_page(page));
295032cad84fSXiao Guangrong
29518e1c6914SSean Christopherson kvm_set_page_accessed(page);
29528e1c6914SSean Christopherson put_page(page);
29530fce5623SAvi Kivity }
29540fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_release_page_clean);
29550fce5623SAvi Kivity
2956ba049e93SDan Williams void kvm_release_pfn_clean(kvm_pfn_t pfn)
295735149e21SAnthony Liguori {
2958b14b2690SSean Christopherson struct page *page;
2959b14b2690SSean Christopherson
2960b14b2690SSean Christopherson if (is_error_noslot_pfn(pfn))
2961b14b2690SSean Christopherson return;
2962b14b2690SSean Christopherson
2963b14b2690SSean Christopherson page = kvm_pfn_to_refcounted_page(pfn);
2964b14b2690SSean Christopherson if (!page)
2965b14b2690SSean Christopherson return;
2966b14b2690SSean Christopherson
2967b14b2690SSean Christopherson kvm_release_page_clean(page);
296835149e21SAnthony Liguori }
296935149e21SAnthony Liguori EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
297035149e21SAnthony Liguori
29710fce5623SAvi Kivity void kvm_release_page_dirty(struct page *page)
29720fce5623SAvi Kivity {
2973a2766325SXiao Guangrong WARN_ON(is_error_page(page));
2974a2766325SXiao Guangrong
29758e1c6914SSean Christopherson kvm_set_page_dirty(page);
29768e1c6914SSean Christopherson kvm_release_page_clean(page);
29770fce5623SAvi Kivity }
29780fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
29790fce5623SAvi Kivity
2980f7a6509fSDavid Hildenbrand void kvm_release_pfn_dirty(kvm_pfn_t pfn)
298135149e21SAnthony Liguori {
2982b14b2690SSean Christopherson struct page *page;
2983b14b2690SSean Christopherson
2984b14b2690SSean Christopherson if (is_error_noslot_pfn(pfn))
2985b14b2690SSean Christopherson return;
2986b14b2690SSean Christopherson
2987b14b2690SSean Christopherson page = kvm_pfn_to_refcounted_page(pfn);
2988b14b2690SSean Christopherson if (!page)
2989b14b2690SSean Christopherson return;
2990b14b2690SSean Christopherson
2991b14b2690SSean Christopherson kvm_release_page_dirty(page);
299235149e21SAnthony Liguori }
2993f7a6509fSDavid Hildenbrand EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
299435149e21SAnthony Liguori
2995a1040b0dSSean Christopherson /*
29968e1c6914SSean Christopherson * Note, checking for an error/noslot pfn is the caller's responsibility when
29978e1c6914SSean Christopherson * directly marking a page dirty/accessed. Unlike the "release" helpers, the
29988e1c6914SSean Christopherson * "set" helpers are not to be used when the pfn might point at garbage.
2999a1040b0dSSean Christopherson */
3000ba049e93SDan Williams void kvm_set_pfn_dirty(kvm_pfn_t pfn)
300135149e21SAnthony Liguori {
30028e1c6914SSean Christopherson if (WARN_ON(is_error_noslot_pfn(pfn)))
30038e1c6914SSean Christopherson return;
30048e1c6914SSean Christopherson
30058e1c6914SSean Christopherson if (pfn_valid(pfn))
30068e1c6914SSean Christopherson kvm_set_page_dirty(pfn_to_page(pfn));
30072e2e3738SAnthony Liguori }
300835149e21SAnthony Liguori EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
300935149e21SAnthony Liguori
3010ba049e93SDan Williams void kvm_set_pfn_accessed(kvm_pfn_t pfn)
301135149e21SAnthony Liguori {
30128e1c6914SSean Christopherson if (WARN_ON(is_error_noslot_pfn(pfn)))
30138e1c6914SSean Christopherson return;
30148e1c6914SSean Christopherson
30158e1c6914SSean Christopherson if (pfn_valid(pfn))
30168e1c6914SSean Christopherson kvm_set_page_accessed(pfn_to_page(pfn));
301735149e21SAnthony Liguori }
301835149e21SAnthony Liguori EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
301935149e21SAnthony Liguori
30200fce5623SAvi Kivity static int next_segment(unsigned long len, int offset)
30210fce5623SAvi Kivity {
30220fce5623SAvi Kivity if (len > PAGE_SIZE - offset)
30230fce5623SAvi Kivity return PAGE_SIZE - offset;
30240fce5623SAvi Kivity else
30250fce5623SAvi Kivity return len;
30260fce5623SAvi Kivity }
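/*
 * next_segment() caps a single transfer at the end of the current page.
 * For example, with PAGE_SIZE == 4096, len == 6000 and offset == 300 it
 * returns 3796 (4096 - 300); the caller then moves to the next gfn with
 * offset 0 and len == 2204, and the second call returns 2204.
 */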
30270fce5623SAvi Kivity
30288e73485cSPaolo Bonzini static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
30298e73485cSPaolo Bonzini void *data, int offset, int len)
30300fce5623SAvi Kivity {
30310fce5623SAvi Kivity int r;
30320fce5623SAvi Kivity unsigned long addr;
30330fce5623SAvi Kivity
30348e73485cSPaolo Bonzini addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
30350fce5623SAvi Kivity if (kvm_is_error_hva(addr))
30360fce5623SAvi Kivity return -EFAULT;
30373180a7fcSPaolo Bonzini r = __copy_from_user(data, (void __user *)addr + offset, len);
30380fce5623SAvi Kivity if (r)
30390fce5623SAvi Kivity return -EFAULT;
30400fce5623SAvi Kivity return 0;
30410fce5623SAvi Kivity }
30428e73485cSPaolo Bonzini
30438e73485cSPaolo Bonzini int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
30448e73485cSPaolo Bonzini int len)
30458e73485cSPaolo Bonzini {
30468e73485cSPaolo Bonzini struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
30478e73485cSPaolo Bonzini
30488e73485cSPaolo Bonzini return __kvm_read_guest_page(slot, gfn, data, offset, len);
30498e73485cSPaolo Bonzini }
30500fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_read_guest_page);
30510fce5623SAvi Kivity
30528e73485cSPaolo Bonzini int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
30538e73485cSPaolo Bonzini int offset, int len)
30548e73485cSPaolo Bonzini {
30558e73485cSPaolo Bonzini struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
30568e73485cSPaolo Bonzini
30578e73485cSPaolo Bonzini return __kvm_read_guest_page(slot, gfn, data, offset, len);
30588e73485cSPaolo Bonzini }
30598e73485cSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
30608e73485cSPaolo Bonzini
30610fce5623SAvi Kivity int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
30620fce5623SAvi Kivity {
30630fce5623SAvi Kivity gfn_t gfn = gpa >> PAGE_SHIFT;
30640fce5623SAvi Kivity int seg;
30650fce5623SAvi Kivity int offset = offset_in_page(gpa);
30660fce5623SAvi Kivity int ret;
30670fce5623SAvi Kivity
30680fce5623SAvi Kivity while ((seg = next_segment(len, offset)) != 0) {
30690fce5623SAvi Kivity ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
30700fce5623SAvi Kivity if (ret < 0)
30710fce5623SAvi Kivity return ret;
30720fce5623SAvi Kivity offset = 0;
30730fce5623SAvi Kivity len -= seg;
30740fce5623SAvi Kivity data += seg;
30750fce5623SAvi Kivity ++gfn;
30760fce5623SAvi Kivity }
30770fce5623SAvi Kivity return 0;
30780fce5623SAvi Kivity }
30790fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_read_guest);
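/*
 * For illustration, a typical call (the destination struct is hypothetical):
 *
 *	struct some_guest_struct val;
 *
 *	if (kvm_read_guest(kvm, gpa, &val, sizeof(val)))
 *		return -EFAULT;
 *
 * The gpa may straddle a page boundary; the per-page split is handled here.
 */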
30800fce5623SAvi Kivity
30818e73485cSPaolo Bonzini int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
30828e73485cSPaolo Bonzini {
30838e73485cSPaolo Bonzini gfn_t gfn = gpa >> PAGE_SHIFT;
30848e73485cSPaolo Bonzini int seg;
30858e73485cSPaolo Bonzini int offset = offset_in_page(gpa);
30868e73485cSPaolo Bonzini int ret;
30878e73485cSPaolo Bonzini
30888e73485cSPaolo Bonzini while ((seg = next_segment(len, offset)) != 0) {
30898e73485cSPaolo Bonzini ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
30908e73485cSPaolo Bonzini if (ret < 0)
30918e73485cSPaolo Bonzini return ret;
30928e73485cSPaolo Bonzini offset = 0;
30938e73485cSPaolo Bonzini len -= seg;
30948e73485cSPaolo Bonzini data += seg;
30958e73485cSPaolo Bonzini ++gfn;
30968e73485cSPaolo Bonzini }
30978e73485cSPaolo Bonzini return 0;
30988e73485cSPaolo Bonzini }
30998e73485cSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
31008e73485cSPaolo Bonzini
31018e73485cSPaolo Bonzini static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
31028e73485cSPaolo Bonzini void *data, int offset, unsigned long len)
31037ec54588SMarcelo Tosatti {
31047ec54588SMarcelo Tosatti int r;
31057ec54588SMarcelo Tosatti unsigned long addr;
31067ec54588SMarcelo Tosatti
31078e73485cSPaolo Bonzini addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
31087ec54588SMarcelo Tosatti if (kvm_is_error_hva(addr))
31097ec54588SMarcelo Tosatti return -EFAULT;
31100aac03f0SAndrea Arcangeli pagefault_disable();
31113180a7fcSPaolo Bonzini r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
31120aac03f0SAndrea Arcangeli pagefault_enable();
31137ec54588SMarcelo Tosatti if (r)
31147ec54588SMarcelo Tosatti return -EFAULT;
31157ec54588SMarcelo Tosatti return 0;
31167ec54588SMarcelo Tosatti }
31177ec54588SMarcelo Tosatti
31188e73485cSPaolo Bonzini int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
31198e73485cSPaolo Bonzini void *data, unsigned long len)
31208e73485cSPaolo Bonzini {
31218e73485cSPaolo Bonzini gfn_t gfn = gpa >> PAGE_SHIFT;
31228e73485cSPaolo Bonzini struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
31238e73485cSPaolo Bonzini int offset = offset_in_page(gpa);
31248e73485cSPaolo Bonzini
31258e73485cSPaolo Bonzini return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
31268e73485cSPaolo Bonzini }
31278e73485cSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
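/*
 * The _atomic variant wraps the copy in pagefault_disable(), so it never
 * sleeps to fault in the guest page and instead fails with -EFAULT if the
 * mapping is not resident.  It is intended for contexts that cannot sleep,
 * with callers falling back to the sleeping variants (or retrying) outside
 * the atomic section.
 */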
31288e73485cSPaolo Bonzini
312928bd726aSPeter Xu static int __kvm_write_guest_page(struct kvm *kvm,
313028bd726aSPeter Xu struct kvm_memory_slot *memslot, gfn_t gfn,
31318e73485cSPaolo Bonzini const void *data, int offset, int len)
31320fce5623SAvi Kivity {
31330fce5623SAvi Kivity int r;
31340fce5623SAvi Kivity unsigned long addr;
31350fce5623SAvi Kivity
3136251eb841SRadim Krčmář addr = gfn_to_hva_memslot(memslot, gfn);
31370fce5623SAvi Kivity if (kvm_is_error_hva(addr))
31380fce5623SAvi Kivity return -EFAULT;
31398b0cedffSXiao Guangrong r = __copy_to_user((void __user *)addr + offset, data, len);
31400fce5623SAvi Kivity if (r)
31410fce5623SAvi Kivity return -EFAULT;
314228bd726aSPeter Xu mark_page_dirty_in_slot(kvm, memslot, gfn);
31430fce5623SAvi Kivity return 0;
31440fce5623SAvi Kivity }
31458e73485cSPaolo Bonzini
31468e73485cSPaolo Bonzini int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
31478e73485cSPaolo Bonzini const void *data, int offset, int len)
31488e73485cSPaolo Bonzini {
31498e73485cSPaolo Bonzini struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
31508e73485cSPaolo Bonzini
315128bd726aSPeter Xu return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
31528e73485cSPaolo Bonzini }
31530fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_write_guest_page);
31540fce5623SAvi Kivity
31558e73485cSPaolo Bonzini int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
31568e73485cSPaolo Bonzini const void *data, int offset, int len)
31578e73485cSPaolo Bonzini {
31588e73485cSPaolo Bonzini struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
31598e73485cSPaolo Bonzini
316028bd726aSPeter Xu return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
31618e73485cSPaolo Bonzini }
31628e73485cSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
31638e73485cSPaolo Bonzini
31640fce5623SAvi Kivity int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
31650fce5623SAvi Kivity unsigned long len)
31660fce5623SAvi Kivity {
31670fce5623SAvi Kivity gfn_t gfn = gpa >> PAGE_SHIFT;
31680fce5623SAvi Kivity int seg;
31690fce5623SAvi Kivity int offset = offset_in_page(gpa);
31700fce5623SAvi Kivity int ret;
31710fce5623SAvi Kivity
31720fce5623SAvi Kivity while ((seg = next_segment(len, offset)) != 0) {
31730fce5623SAvi Kivity ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
31740fce5623SAvi Kivity if (ret < 0)
31750fce5623SAvi Kivity return ret;
31760fce5623SAvi Kivity offset = 0;
31770fce5623SAvi Kivity len -= seg;
31780fce5623SAvi Kivity data += seg;
31790fce5623SAvi Kivity ++gfn;
31800fce5623SAvi Kivity }
31810fce5623SAvi Kivity return 0;
31820fce5623SAvi Kivity }
3183ff651cb6SWincy Van EXPORT_SYMBOL_GPL(kvm_write_guest);
31840fce5623SAvi Kivity
31858e73485cSPaolo Bonzini int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
31868e73485cSPaolo Bonzini unsigned long len)
31878e73485cSPaolo Bonzini {
31888e73485cSPaolo Bonzini gfn_t gfn = gpa >> PAGE_SHIFT;
31898e73485cSPaolo Bonzini int seg;
31908e73485cSPaolo Bonzini int offset = offset_in_page(gpa);
31918e73485cSPaolo Bonzini int ret;
31928e73485cSPaolo Bonzini
31938e73485cSPaolo Bonzini while ((seg = next_segment(len, offset)) != 0) {
31948e73485cSPaolo Bonzini ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
31958e73485cSPaolo Bonzini if (ret < 0)
31968e73485cSPaolo Bonzini return ret;
31978e73485cSPaolo Bonzini offset = 0;
31988e73485cSPaolo Bonzini len -= seg;
31998e73485cSPaolo Bonzini data += seg;
32008e73485cSPaolo Bonzini ++gfn;
32018e73485cSPaolo Bonzini }
32028e73485cSPaolo Bonzini return 0;
32038e73485cSPaolo Bonzini }
32048e73485cSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
32058e73485cSPaolo Bonzini
32065a2d4365SPaolo Bonzini static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
32075a2d4365SPaolo Bonzini struct gfn_to_hva_cache *ghc,
32088f964525SAndrew Honig gpa_t gpa, unsigned long len)
320949c7754cSGleb Natapov {
321049c7754cSGleb Natapov int offset = offset_in_page(gpa);
32118f964525SAndrew Honig gfn_t start_gfn = gpa >> PAGE_SHIFT;
32128f964525SAndrew Honig gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
32138f964525SAndrew Honig gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
32148f964525SAndrew Honig gfn_t nr_pages_avail;
321549c7754cSGleb Natapov
32166ad1e29fSSean Christopherson /* Update ghc->generation before performing any error checks. */
321749c7754cSGleb Natapov ghc->generation = slots->generation;
32186ad1e29fSSean Christopherson
32196ad1e29fSSean Christopherson if (start_gfn > end_gfn) {
3220f1b9dd5eSJim Mattson ghc->hva = KVM_HVA_ERR_BAD;
32216ad1e29fSSean Christopherson return -EINVAL;
32226ad1e29fSSean Christopherson }
3223f1b9dd5eSJim Mattson
32248f964525SAndrew Honig /*
32258f964525SAndrew Honig * If the requested region crosses two memslots, we still
32268f964525SAndrew Honig * verify that the entire region is valid here.
32278f964525SAndrew Honig */
32286ad1e29fSSean Christopherson for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
32295a2d4365SPaolo Bonzini ghc->memslot = __gfn_to_memslot(slots, start_gfn);
32308f964525SAndrew Honig ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
32318f964525SAndrew Honig &nr_pages_avail);
32328f964525SAndrew Honig if (kvm_is_error_hva(ghc->hva))
32336ad1e29fSSean Christopherson return -EFAULT;
32348f964525SAndrew Honig }
3235f1b9dd5eSJim Mattson
32368f964525SAndrew Honig /* Use the slow path for cross page reads and writes. */
32376ad1e29fSSean Christopherson if (nr_pages_needed == 1)
3238f1b9dd5eSJim Mattson ghc->hva += offset;
3239f1b9dd5eSJim Mattson else
32408f964525SAndrew Honig ghc->memslot = NULL;
3241f1b9dd5eSJim Mattson
32426ad1e29fSSean Christopherson ghc->gpa = gpa;
32436ad1e29fSSean Christopherson ghc->len = len;
32446ad1e29fSSean Christopherson return 0;
324549c7754cSGleb Natapov }
32465a2d4365SPaolo Bonzini
32474e335d9eSPaolo Bonzini int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
32485a2d4365SPaolo Bonzini gpa_t gpa, unsigned long len)
32495a2d4365SPaolo Bonzini {
32504e335d9eSPaolo Bonzini struct kvm_memslots *slots = kvm_memslots(kvm);
32515a2d4365SPaolo Bonzini return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
32525a2d4365SPaolo Bonzini }
32534e335d9eSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
325449c7754cSGleb Natapov
32554e335d9eSPaolo Bonzini int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
32567a86dab8SJim Mattson void *data, unsigned int offset,
32577a86dab8SJim Mattson unsigned long len)
325849c7754cSGleb Natapov {
32594e335d9eSPaolo Bonzini struct kvm_memslots *slots = kvm_memslots(kvm);
326049c7754cSGleb Natapov int r;
32614ec6e863SPan Xinhui gpa_t gpa = ghc->gpa + offset;
326249c7754cSGleb Natapov
32635f25e71eSPaolo Bonzini if (WARN_ON_ONCE(len + offset > ghc->len))
32645f25e71eSPaolo Bonzini return -EINVAL;
32658f964525SAndrew Honig
3266dc9ce71eSSean Christopherson if (slots->generation != ghc->generation) {
3267dc9ce71eSSean Christopherson if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3268dc9ce71eSSean Christopherson return -EFAULT;
3269dc9ce71eSSean Christopherson }
32708f964525SAndrew Honig
327149c7754cSGleb Natapov if (kvm_is_error_hva(ghc->hva))
327249c7754cSGleb Natapov return -EFAULT;
327349c7754cSGleb Natapov
3274fcfbc617SSean Christopherson if (unlikely(!ghc->memslot))
3275fcfbc617SSean Christopherson return kvm_write_guest(kvm, gpa, data, len);
3276fcfbc617SSean Christopherson
32774ec6e863SPan Xinhui r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
327849c7754cSGleb Natapov if (r)
327949c7754cSGleb Natapov return -EFAULT;
328028bd726aSPeter Xu mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
328149c7754cSGleb Natapov
328249c7754cSGleb Natapov return 0;
328349c7754cSGleb Natapov }
32844e335d9eSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
32854ec6e863SPan Xinhui
32864e335d9eSPaolo Bonzini int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
32874ec6e863SPan Xinhui void *data, unsigned long len)
32884ec6e863SPan Xinhui {
32894e335d9eSPaolo Bonzini return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
32904ec6e863SPan Xinhui }
32914e335d9eSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
329249c7754cSGleb Natapov
32930958f0ceSVitaly Kuznetsov int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
32940958f0ceSVitaly Kuznetsov void *data, unsigned int offset,
32950958f0ceSVitaly Kuznetsov unsigned long len)
3296e03b644fSGleb Natapov {
32974e335d9eSPaolo Bonzini struct kvm_memslots *slots = kvm_memslots(kvm);
3298e03b644fSGleb Natapov int r;
32990958f0ceSVitaly Kuznetsov gpa_t gpa = ghc->gpa + offset;
3300e03b644fSGleb Natapov
33015f25e71eSPaolo Bonzini if (WARN_ON_ONCE(len + offset > ghc->len))
33025f25e71eSPaolo Bonzini return -EINVAL;
33038f964525SAndrew Honig
3304dc9ce71eSSean Christopherson if (slots->generation != ghc->generation) {
3305dc9ce71eSSean Christopherson if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3306dc9ce71eSSean Christopherson return -EFAULT;
3307dc9ce71eSSean Christopherson }
33088f964525SAndrew Honig
3309e03b644fSGleb Natapov if (kvm_is_error_hva(ghc->hva))
3310e03b644fSGleb Natapov return -EFAULT;
3311e03b644fSGleb Natapov
3312fcfbc617SSean Christopherson if (unlikely(!ghc->memslot))
33130958f0ceSVitaly Kuznetsov return kvm_read_guest(kvm, gpa, data, len);
3314fcfbc617SSean Christopherson
33150958f0ceSVitaly Kuznetsov r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
3316e03b644fSGleb Natapov if (r)
3317e03b644fSGleb Natapov return -EFAULT;
3318e03b644fSGleb Natapov
3319e03b644fSGleb Natapov return 0;
3320e03b644fSGleb Natapov }
33210958f0ceSVitaly Kuznetsov EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
33220958f0ceSVitaly Kuznetsov
33230958f0ceSVitaly Kuznetsov int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
33240958f0ceSVitaly Kuznetsov void *data, unsigned long len)
33250958f0ceSVitaly Kuznetsov {
33260958f0ceSVitaly Kuznetsov return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
33270958f0ceSVitaly Kuznetsov }
33284e335d9eSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
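/*
 * For illustration: the usual gfn_to_hva_cache pattern, initialized once for
 * a fixed guest address and then used for repeated accesses (the structure
 * and gpa are hypothetical).  A stale cache is revalidated automatically
 * when the memslot generation changes.
 *
 *	struct gfn_to_hva_cache ghc;
 *	struct some_shared_page data;
 *
 *	if (kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa, sizeof(data)))
 *		return -EFAULT;
 *	...
 *	kvm_write_guest_cached(kvm, &ghc, &data, sizeof(data));
 */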
3329e03b644fSGleb Natapov
33300fce5623SAvi Kivity int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
33310fce5623SAvi Kivity {
33322f541442SPaolo Bonzini const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
33330fce5623SAvi Kivity gfn_t gfn = gpa >> PAGE_SHIFT;
33340fce5623SAvi Kivity int seg;
33350fce5623SAvi Kivity int offset = offset_in_page(gpa);
33360fce5623SAvi Kivity int ret;
33370fce5623SAvi Kivity
33380fce5623SAvi Kivity while ((seg = next_segment(len, offset)) != 0) {
33392f541442SPaolo Bonzini ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
33400fce5623SAvi Kivity if (ret < 0)
33410fce5623SAvi Kivity return ret;
33420fce5623SAvi Kivity offset = 0;
33430fce5623SAvi Kivity len -= seg;
33440fce5623SAvi Kivity ++gfn;
33450fce5623SAvi Kivity }
33460fce5623SAvi Kivity return 0;
33470fce5623SAvi Kivity }
33480fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_clear_guest);
33490fce5623SAvi Kivity
335028bd726aSPeter Xu void mark_page_dirty_in_slot(struct kvm *kvm,
33518283e36aSBen Gardon const struct kvm_memory_slot *memslot,
335228bd726aSPeter Xu gfn_t gfn)
33530fce5623SAvi Kivity {
33542efd61a6SDavid Woodhouse struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
33552efd61a6SDavid Woodhouse
3356e09fccb5SChristian Borntraeger #ifdef CONFIG_HAVE_KVM_DIRTY_RING
335786bdf3ebSGavin Shan if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
33582efd61a6SDavid Woodhouse return;
335986bdf3ebSGavin Shan
3360c57351a7SGavin Shan WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
3361e09fccb5SChristian Borntraeger #endif
33622efd61a6SDavid Woodhouse
3363044c59c4SPeter Xu if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
33640fce5623SAvi Kivity unsigned long rel_gfn = gfn - memslot->base_gfn;
3365fb04a1edSPeter Xu u32 slot = (memslot->as_id << 16) | memslot->id;
33660fce5623SAvi Kivity
336786bdf3ebSGavin Shan if (kvm->dirty_ring_size && vcpu)
3368cf87ac73SGavin Shan kvm_dirty_ring_push(vcpu, slot, rel_gfn);
3369c57351a7SGavin Shan else if (memslot->dirty_bitmap)
3370b74ca3b3STakuya Yoshikawa set_bit_le(rel_gfn, memslot->dirty_bitmap);
33710fce5623SAvi Kivity }
33720fce5623SAvi Kivity }
3373a6a0b05dSBen Gardon EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
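/*
 * When the dirty ring is in use, the pushed entry encodes the target memslot
 * as (as_id << 16) | id.  For example, on x86 slot id 3 in the SMM address
 * space (as_id == 1) is pushed as 0x10003, while the same slot id in the
 * default address space is pushed as 0x3.
 */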
33740fce5623SAvi Kivity
337549c7754cSGleb Natapov void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
337649c7754cSGleb Natapov {
337749c7754cSGleb Natapov struct kvm_memory_slot *memslot;
337849c7754cSGleb Natapov
337949c7754cSGleb Natapov memslot = gfn_to_memslot(kvm, gfn);
338028bd726aSPeter Xu mark_page_dirty_in_slot(kvm, memslot, gfn);
338149c7754cSGleb Natapov }
33822ba9f0d8SAneesh Kumar K.V EXPORT_SYMBOL_GPL(mark_page_dirty);
338349c7754cSGleb Natapov
33848e73485cSPaolo Bonzini void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
33858e73485cSPaolo Bonzini {
33868e73485cSPaolo Bonzini struct kvm_memory_slot *memslot;
33878e73485cSPaolo Bonzini
33888e73485cSPaolo Bonzini memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
338928bd726aSPeter Xu mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
33908e73485cSPaolo Bonzini }
33918e73485cSPaolo Bonzini EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
33928e73485cSPaolo Bonzini
339320b7035cSJan H. Schönherr void kvm_sigset_activate(struct kvm_vcpu *vcpu)
339420b7035cSJan H. Schönherr {
339520b7035cSJan H. Schönherr if (!vcpu->sigset_active)
339620b7035cSJan H. Schönherr return;
339720b7035cSJan H. Schönherr
339820b7035cSJan H. Schönherr /*
339920b7035cSJan H. Schönherr * This does a lockless modification of ->real_blocked, which is fine
340020b7035cSJan H. Schönherr * because only current can change ->real_blocked, and all readers of
340120b7035cSJan H. Schönherr * ->real_blocked don't care as long as ->real_blocked is always a subset
340220b7035cSJan H. Schönherr * of ->blocked.
340320b7035cSJan H. Schönherr */
340420b7035cSJan H. Schönherr sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
340520b7035cSJan H. Schönherr }
340620b7035cSJan H. Schönherr
340720b7035cSJan H. Schönherr void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
340820b7035cSJan H. Schönherr {
340920b7035cSJan H. Schönherr if (!vcpu->sigset_active)
341020b7035cSJan H. Schönherr return;
341120b7035cSJan H. Schönherr
341220b7035cSJan H. Schönherr sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
341320b7035cSJan H. Schönherr sigemptyset(&current->real_blocked);
341420b7035cSJan H. Schönherr }
341520b7035cSJan H. Schönherr
3416aca6ff29SWanpeng Li static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3417aca6ff29SWanpeng Li {
3418dee339b5SNir Weiner unsigned int old, val, grow, grow_start;
3419aca6ff29SWanpeng Li
34202cbd7824SWanpeng Li old = val = vcpu->halt_poll_ns;
3421dee339b5SNir Weiner grow_start = READ_ONCE(halt_poll_ns_grow_start);
34226b6de68cSChristian Borntraeger grow = READ_ONCE(halt_poll_ns_grow);
34237fa08e71SNir Weiner if (!grow)
34247fa08e71SNir Weiner goto out;
34257fa08e71SNir Weiner
34266b6de68cSChristian Borntraeger val *= grow;
3427dee339b5SNir Weiner if (val < grow_start)
3428dee339b5SNir Weiner val = grow_start;
3429aca6ff29SWanpeng Li
3430aca6ff29SWanpeng Li vcpu->halt_poll_ns = val;
34317fa08e71SNir Weiner out:
34322cbd7824SWanpeng Li trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3433aca6ff29SWanpeng Li }
3434aca6ff29SWanpeng Li
3435aca6ff29SWanpeng Li static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3436aca6ff29SWanpeng Li {
3437ae232ea4SSergey Senozhatsky unsigned int old, val, shrink, grow_start;
3438aca6ff29SWanpeng Li
34392cbd7824SWanpeng Li old = val = vcpu->halt_poll_ns;
34406b6de68cSChristian Borntraeger shrink = READ_ONCE(halt_poll_ns_shrink);
3441ae232ea4SSergey Senozhatsky grow_start = READ_ONCE(halt_poll_ns_grow_start);
34426b6de68cSChristian Borntraeger if (shrink == 0)
3443aca6ff29SWanpeng Li val = 0;
3444aca6ff29SWanpeng Li else
34456b6de68cSChristian Borntraeger val /= shrink;
3446aca6ff29SWanpeng Li
3447ae232ea4SSergey Senozhatsky if (val < grow_start)
3448ae232ea4SSergey Senozhatsky val = 0;
3449ae232ea4SSergey Senozhatsky
3450aca6ff29SWanpeng Li vcpu->halt_poll_ns = val;
34512cbd7824SWanpeng Li trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3452aca6ff29SWanpeng Li }
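/*
 * A worked example, assuming the default module parameters
 * (halt_poll_ns_grow == 2, halt_poll_ns_grow_start == 10000,
 * halt_poll_ns_shrink == 0): growing takes vcpu->halt_poll_ns from 0 to
 * 10000 ns and then doubles it on each subsequent grow (20000, 40000, ...),
 * while a shrink with a divisor of 0 drops it straight back to 0.
 */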
3453aca6ff29SWanpeng Li
3454f7819512SPaolo Bonzini static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3455f7819512SPaolo Bonzini {
345650c28f21SJunaid Shahid int ret = -EINTR;
345750c28f21SJunaid Shahid int idx = srcu_read_lock(&vcpu->kvm->srcu);
345850c28f21SJunaid Shahid
3459c59fb127SPaolo Bonzini if (kvm_arch_vcpu_runnable(vcpu))
346050c28f21SJunaid Shahid goto out;
3461f7819512SPaolo Bonzini if (kvm_cpu_has_pending_timer(vcpu))
346250c28f21SJunaid Shahid goto out;
3463f7819512SPaolo Bonzini if (signal_pending(current))
346450c28f21SJunaid Shahid goto out;
3465084071d5SMarcelo Tosatti if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3466084071d5SMarcelo Tosatti goto out;
3467f7819512SPaolo Bonzini
346850c28f21SJunaid Shahid ret = 0;
346950c28f21SJunaid Shahid out:
347050c28f21SJunaid Shahid srcu_read_unlock(&vcpu->kvm->srcu, idx);
347150c28f21SJunaid Shahid return ret;
3472f7819512SPaolo Bonzini }
3473f7819512SPaolo Bonzini
34740fce5623SAvi Kivity /*
3475fac42688SSean Christopherson * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
3476fac42688SSean Christopherson * pending. This is mostly used when halting a vCPU, but may also be used
3477fac42688SSean Christopherson * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
34780fce5623SAvi Kivity */
3479fac42688SSean Christopherson bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
34800fce5623SAvi Kivity {
3481fac42688SSean Christopherson struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
3482f7819512SPaolo Bonzini bool waited = false;
3483fac42688SSean Christopherson
3484c3858335SJing Zhang vcpu->stat.generic.blocking = 1;
3485f7819512SPaolo Bonzini
348618869f26SMaxim Levitsky preempt_disable();
348707ab0f8dSMarc Zyngier kvm_arch_vcpu_blocking(vcpu);
3488fac42688SSean Christopherson prepare_to_rcuwait(wait);
348918869f26SMaxim Levitsky preempt_enable();
349018869f26SMaxim Levitsky
3491e5c239cfSMarcelo Tosatti for (;;) {
3492da4ad88cSDavidlohr Bueso set_current_state(TASK_INTERRUPTIBLE);
34930fce5623SAvi Kivity
3494f7819512SPaolo Bonzini if (kvm_vcpu_check_block(vcpu) < 0)
3495e5c239cfSMarcelo Tosatti break;
3496e5c239cfSMarcelo Tosatti
3497f7819512SPaolo Bonzini waited = true;
34980fce5623SAvi Kivity schedule();
34990fce5623SAvi Kivity }
3500fac42688SSean Christopherson
350118869f26SMaxim Levitsky preempt_disable();
350218869f26SMaxim Levitsky finish_rcuwait(wait);
3503fac42688SSean Christopherson kvm_arch_vcpu_unblocking(vcpu);
350418869f26SMaxim Levitsky preempt_enable();
3505fac42688SSean Christopherson
3506c3858335SJing Zhang vcpu->stat.generic.blocking = 0;
3507c3858335SJing Zhang
3508fac42688SSean Christopherson return waited;
3509fac42688SSean Christopherson }
3510fac42688SSean Christopherson
351129e72893SSean Christopherson static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
351229e72893SSean Christopherson ktime_t end, bool success)
35130fce5623SAvi Kivity {
351430c94347SSean Christopherson struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
351529e72893SSean Christopherson u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
351629e72893SSean Christopherson
351730c94347SSean Christopherson ++vcpu->stat.generic.halt_attempted_poll;
351830c94347SSean Christopherson
351930c94347SSean Christopherson if (success) {
352030c94347SSean Christopherson ++vcpu->stat.generic.halt_successful_poll;
352130c94347SSean Christopherson
352230c94347SSean Christopherson if (!vcpu_valid_wakeup(vcpu))
352330c94347SSean Christopherson ++vcpu->stat.generic.halt_poll_invalid;
352430c94347SSean Christopherson
352530c94347SSean Christopherson stats->halt_poll_success_ns += poll_ns;
352630c94347SSean Christopherson KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
352730c94347SSean Christopherson } else {
352830c94347SSean Christopherson stats->halt_poll_fail_ns += poll_ns;
352930c94347SSean Christopherson KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
353030c94347SSean Christopherson }
3531e5c239cfSMarcelo Tosatti }
35320fce5623SAvi Kivity
3533175d5dc7SDavid Matlack static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
3534175d5dc7SDavid Matlack {
35359eb8ca04SDavid Matlack struct kvm *kvm = vcpu->kvm;
35369eb8ca04SDavid Matlack
35379eb8ca04SDavid Matlack if (kvm->override_halt_poll_ns) {
35389eb8ca04SDavid Matlack /*
35399eb8ca04SDavid Matlack * Ensure kvm->max_halt_poll_ns is not read before
35409eb8ca04SDavid Matlack * kvm->override_halt_poll_ns.
35419eb8ca04SDavid Matlack *
35429eb8ca04SDavid Matlack * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
35439eb8ca04SDavid Matlack */
35449eb8ca04SDavid Matlack smp_rmb();
35459eb8ca04SDavid Matlack return READ_ONCE(kvm->max_halt_poll_ns);
35469eb8ca04SDavid Matlack }
35479eb8ca04SDavid Matlack
35489eb8ca04SDavid Matlack return READ_ONCE(halt_poll_ns);
3549175d5dc7SDavid Matlack }
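/*
 * Once userspace enables KVM_CAP_HALT_POLL for a VM, the per-VM
 * kvm->max_halt_poll_ns takes precedence over the halt_poll_ns module
 * parameter.  For example, enabling the capability with a value of 0
 * disables halt polling for that VM even if the module-wide default is
 * nonzero.
 */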
3550175d5dc7SDavid Matlack
3551fac42688SSean Christopherson /*
3552fac42688SSean Christopherson * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc... If halt
3553fac42688SSean Christopherson * polling is enabled, busy wait for a short time before blocking to avoid the
3554fac42688SSean Christopherson * expensive block+unblock sequence if a wake event arrives soon after the vCPU
3555fac42688SSean Christopherson * is halted.
3556fac42688SSean Christopherson */
355791b99ea7SSean Christopherson void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
35580fce5623SAvi Kivity {
3559175d5dc7SDavid Matlack unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
35606f390916SSean Christopherson bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
35610fce5623SAvi Kivity ktime_t start, cur, poll_end;
35620fce5623SAvi Kivity bool waited = false;
356397b6847aSDavid Matlack bool do_halt_poll;
356491b99ea7SSean Christopherson u64 halt_ns;
35650fce5623SAvi Kivity
3566175d5dc7SDavid Matlack if (vcpu->halt_poll_ns > max_halt_poll_ns)
3567175d5dc7SDavid Matlack vcpu->halt_poll_ns = max_halt_poll_ns;
356897b6847aSDavid Matlack
356997b6847aSDavid Matlack do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
357097b6847aSDavid Matlack
35710fce5623SAvi Kivity start = cur = poll_end = ktime_get();
35728df6a61cSSean Christopherson if (do_halt_poll) {
3573109a9826SSean Christopherson ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
3574d255f4f2SZhai, Edwin
3575d255f4f2SZhai, Edwin do {
357630c94347SSean Christopherson if (kvm_vcpu_check_block(vcpu) < 0)
35770fce5623SAvi Kivity goto out;
35780fce5623SAvi Kivity cpu_relax();
35790fce5623SAvi Kivity poll_end = cur = ktime_get();
35800fce5623SAvi Kivity } while (kvm_vcpu_can_poll(cur, stop));
35810fce5623SAvi Kivity }
35820fce5623SAvi Kivity
3583fac42688SSean Christopherson waited = kvm_vcpu_block(vcpu);
3584f6c60d08SSean Christopherson
3585f7819512SPaolo Bonzini cur = ktime_get();
358687bcc5faSJing Zhang if (waited) {
358787bcc5faSJing Zhang vcpu->stat.generic.halt_wait_ns +=
358887bcc5faSJing Zhang ktime_to_ns(cur) - ktime_to_ns(poll_end);
35898ccba534SJing Zhang KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
35908ccba534SJing Zhang ktime_to_ns(cur) - ktime_to_ns(poll_end));
359187bcc5faSJing Zhang }
3592f7819512SPaolo Bonzini out:
359391b99ea7SSean Christopherson /* The total time the vCPU was "halted", including polling time. */
359491b99ea7SSean Christopherson halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);
3595aca6ff29SWanpeng Li
359629e72893SSean Christopherson /*
359729e72893SSean Christopherson * Note, halt-polling is considered successful so long as the vCPU was
359829e72893SSean Christopherson * never actually scheduled out, i.e. even if the wake event arrived
359929e72893SSean Christopherson * after the halt-polling loop itself, but before the full wait.
360029e72893SSean Christopherson */
36018df6a61cSSean Christopherson if (do_halt_poll)
360229e72893SSean Christopherson update_halt_poll_stats(vcpu, start, poll_end, !waited);
3603cb953129SDavid Matlack
36046f390916SSean Christopherson if (halt_poll_allowed) {
3605175d5dc7SDavid Matlack /* Recompute the max halt poll time in case it changed. */
3606175d5dc7SDavid Matlack max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
3607175d5dc7SDavid Matlack
360844551b2fSWanpeng Li if (!vcpu_valid_wakeup(vcpu)) {
36092086d320SChristian Borntraeger shrink_halt_poll_ns(vcpu);
3610175d5dc7SDavid Matlack } else if (max_halt_poll_ns) {
361191b99ea7SSean Christopherson if (halt_ns <= vcpu->halt_poll_ns)
3612aca6ff29SWanpeng Li ;
3613aca6ff29SWanpeng Li /* we had a long block, shrink polling */
3614acd05785SDavid Matlack else if (vcpu->halt_poll_ns &&
3615175d5dc7SDavid Matlack halt_ns > max_halt_poll_ns)
3616aca6ff29SWanpeng Li shrink_halt_poll_ns(vcpu);
3617aca6ff29SWanpeng Li /* we had a short halt and our poll time is too small */
3618175d5dc7SDavid Matlack else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
3619175d5dc7SDavid Matlack halt_ns < max_halt_poll_ns)
3620aca6ff29SWanpeng Li grow_halt_poll_ns(vcpu);
362144551b2fSWanpeng Li } else {
3622edb9272fSWanpeng Li vcpu->halt_poll_ns = 0;
362344551b2fSWanpeng Li }
362444551b2fSWanpeng Li }
3625aca6ff29SWanpeng Li
362691b99ea7SSean Christopherson trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
36270fce5623SAvi Kivity }
362891b99ea7SSean Christopherson EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
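/*
 * In concrete terms: if the vCPU woke up within its polling window
 * (halt_ns <= vcpu->halt_poll_ns) the window is left alone; if polling was
 * enabled but the vCPU slept longer than the per-VM limit, the window
 * shrinks; and if the halt was shorter than the limit but the window was too
 * small to catch it, the window grows.  The clamp at the top of the next
 * kvm_vcpu_halt() keeps the window at or below kvm_vcpu_max_halt_poll_ns().
 */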
36290fce5623SAvi Kivity
3630178f02ffSRadim Krčmář bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3631b6d33834SChristoffer Dall {
3632d92a5d1cSSean Christopherson if (__kvm_vcpu_wake_up(vcpu)) {
3633d73eb57bSWanpeng Li WRITE_ONCE(vcpu->ready, true);
36340193cc90SJing Zhang ++vcpu->stat.generic.halt_wakeup;
3635178f02ffSRadim Krčmář return true;
3636b6d33834SChristoffer Dall }
3637b6d33834SChristoffer Dall
3638178f02ffSRadim Krčmář return false;
3639dd1a4cc1SRadim Krčmář }
3640dd1a4cc1SRadim Krčmář EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3641dd1a4cc1SRadim Krčmář
36420266c894SPaolo Bonzini #ifndef CONFIG_S390
3643dd1a4cc1SRadim Krčmář /*
3644dd1a4cc1SRadim Krčmář * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3645dd1a4cc1SRadim Krčmář */
3646dd1a4cc1SRadim Krčmář void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3647dd1a4cc1SRadim Krčmář {
364885b64045SSean Christopherson int me, cpu;
3649dd1a4cc1SRadim Krčmář
3650178f02ffSRadim Krčmář if (kvm_vcpu_wake_up(vcpu))
3651178f02ffSRadim Krčmář return;
3652178f02ffSRadim Krčmář
3653aefdc2edSPaolo Bonzini me = get_cpu();
3654aefdc2edSPaolo Bonzini /*
3655aefdc2edSPaolo Bonzini * The only state change done outside the vcpu mutex is IN_GUEST_MODE
3656aefdc2edSPaolo Bonzini * to EXITING_GUEST_MODE. Therefore the moderately expensive "should
3657aefdc2edSPaolo Bonzini * kick" check does not need atomic operations if kvm_vcpu_kick is used
3658aefdc2edSPaolo Bonzini * within the vCPU thread itself.
3659aefdc2edSPaolo Bonzini */
3660aefdc2edSPaolo Bonzini if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
3661aefdc2edSPaolo Bonzini if (vcpu->mode == IN_GUEST_MODE)
3662aefdc2edSPaolo Bonzini WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
3663aefdc2edSPaolo Bonzini goto out;
3664aefdc2edSPaolo Bonzini }
3665aefdc2edSPaolo Bonzini
366685b64045SSean Christopherson /*
366785b64045SSean Christopherson * Note, the vCPU could get migrated to a different pCPU at any point
366885b64045SSean Christopherson * after kvm_arch_vcpu_should_kick(), which could result in sending an
366985b64045SSean Christopherson * IPI to the previous pCPU. But, that's ok because the purpose of the
367085b64045SSean Christopherson * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
367185b64045SSean Christopherson * vCPU also requires it to leave IN_GUEST_MODE.
367285b64045SSean Christopherson */
367385b64045SSean Christopherson if (kvm_arch_vcpu_should_kick(vcpu)) {
367485b64045SSean Christopherson cpu = READ_ONCE(vcpu->cpu);
3675b6d33834SChristoffer Dall if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3676b6d33834SChristoffer Dall smp_send_reschedule(cpu);
367785b64045SSean Christopherson }
3678aefdc2edSPaolo Bonzini out:
3679b6d33834SChristoffer Dall put_cpu();
3680b6d33834SChristoffer Dall }
3681a20ed54dSYang Zhang EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
36820266c894SPaolo Bonzini #endif /* !CONFIG_S390 */
3683b6d33834SChristoffer Dall
3684fa93384fSDan Carpenter int kvm_vcpu_yield_to(struct kvm_vcpu *target)
368541628d33SKonstantin Weitz {
368641628d33SKonstantin Weitz struct pid *pid;
368741628d33SKonstantin Weitz struct task_struct *task = NULL;
3688fa93384fSDan Carpenter int ret = 0;
368941628d33SKonstantin Weitz
369041628d33SKonstantin Weitz rcu_read_lock();
369141628d33SKonstantin Weitz pid = rcu_dereference(target->pid);
369241628d33SKonstantin Weitz if (pid)
369327fbe64bSSam Bobroff task = get_pid_task(pid, PIDTYPE_PID);
369441628d33SKonstantin Weitz rcu_read_unlock();
369541628d33SKonstantin Weitz if (!task)
3696c45c528eSRaghavendra K T return ret;
3697c45c528eSRaghavendra K T ret = yield_to(task, 1);
369841628d33SKonstantin Weitz put_task_struct(task);
3699c45c528eSRaghavendra K T
3700c45c528eSRaghavendra K T return ret;
370141628d33SKonstantin Weitz }
370241628d33SKonstantin Weitz EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
370341628d33SKonstantin Weitz
370406e48c51SRaghavendra K T /*
370506e48c51SRaghavendra K T * Helper that checks whether a VCPU is eligible for directed yield.
370606e48c51SRaghavendra K T * Most eligible candidate to yield is decided by following heuristics:
370706e48c51SRaghavendra K T *
370806e48c51SRaghavendra K T * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
370906e48c51SRaghavendra K T * (preempted lock holder), indicated by @in_spin_loop.
3710656012c7SFuad Tabba * Set at the beginning and cleared at the end of interception/PLE handler.
371106e48c51SRaghavendra K T *
371206e48c51SRaghavendra K T * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
371306e48c51SRaghavendra K T * chance last time (mostly it has become eligible now since we have probably
371406e48c51SRaghavendra K T * yielded to the lockholder in the last iteration. This is done by toggling
371506e48c51SRaghavendra K T * @dy_eligible each time a VCPU is checked for eligibility.)
371606e48c51SRaghavendra K T *
371706e48c51SRaghavendra K T * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
371806e48c51SRaghavendra K T * to preempted lock-holder could result in wrong VCPU selection and CPU
371906e48c51SRaghavendra K T * burning. Giving priority for a potential lock-holder increases lock
372006e48c51SRaghavendra K T * progress.
372106e48c51SRaghavendra K T *
372206e48c51SRaghavendra K T * Since algorithm is based on heuristics, accessing another VCPU data without
372306e48c51SRaghavendra K T * locking does not harm. It may result in trying to yield to same VCPU, fail
372406e48c51SRaghavendra K T * and continue with next VCPU and so on.
372506e48c51SRaghavendra K T */
37267940876eSStephen Hemminger static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
372706e48c51SRaghavendra K T {
37284a55dd72SScott Wood #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
372906e48c51SRaghavendra K T bool eligible;
373006e48c51SRaghavendra K T
373106e48c51SRaghavendra K T eligible = !vcpu->spin_loop.in_spin_loop ||
373234656113SChristian Borntraeger vcpu->spin_loop.dy_eligible;
373306e48c51SRaghavendra K T
373406e48c51SRaghavendra K T if (vcpu->spin_loop.in_spin_loop)
373506e48c51SRaghavendra K T kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
373606e48c51SRaghavendra K T
373706e48c51SRaghavendra K T return eligible;
37384a55dd72SScott Wood #else
37394a55dd72SScott Wood return true;
374006e48c51SRaghavendra K T #endif
37414a55dd72SScott Wood }
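/*
 * In practice this means a vCPU that is not itself spinning (e.g. a
 * preempted lock holder) is always an eligible yield target, while a vCPU
 * that is also in a spin loop alternates between ineligible and eligible on
 * successive checks because each check toggles @dy_eligible.  Directed
 * yield is thus biased toward probable lock holders without permanently
 * excluding spinning vCPUs.
 */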
3742c45c528eSRaghavendra K T
374317e433b5SWanpeng Li /*
374417e433b5SWanpeng Li * Unlike kvm_arch_vcpu_runnable, this function is called outside
374517e433b5SWanpeng Li * a vcpu_load/vcpu_put pair. However, for most architectures
374617e433b5SWanpeng Li * kvm_arch_vcpu_runnable does not require vcpu_load.
374717e433b5SWanpeng Li */
374817e433b5SWanpeng Li bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
374917e433b5SWanpeng Li {
375017e433b5SWanpeng Li return kvm_arch_vcpu_runnable(vcpu);
375117e433b5SWanpeng Li }
375217e433b5SWanpeng Li
375317e433b5SWanpeng Li static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
375417e433b5SWanpeng Li {
375517e433b5SWanpeng Li if (kvm_arch_dy_runnable(vcpu))
375617e433b5SWanpeng Li return true;
375717e433b5SWanpeng Li
375817e433b5SWanpeng Li #ifdef CONFIG_KVM_ASYNC_PF
375917e433b5SWanpeng Li if (!list_empty_careful(&vcpu->async_pf.done))
376017e433b5SWanpeng Li return true;
376117e433b5SWanpeng Li #endif
376217e433b5SWanpeng Li
376317e433b5SWanpeng Li return false;
376417e433b5SWanpeng Li }
376517e433b5SWanpeng Li
376652acd22fSWanpeng Li bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
376752acd22fSWanpeng Li {
376852acd22fSWanpeng Li return false;
376952acd22fSWanpeng Li }
377052acd22fSWanpeng Li
3771199b5763SLongpeng(Mike) void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
3772d255f4f2SZhai, Edwin {
3773217ece61SRik van Riel struct kvm *kvm = me->kvm;
3774217ece61SRik van Riel struct kvm_vcpu *vcpu;
3775a937ef95SBreno Leitao int last_boosted_vcpu;
377646808a4cSMarc Zyngier unsigned long i;
3777217ece61SRik van Riel int yielded = 0;
3778c45c528eSRaghavendra K T int try = 3;
3779217ece61SRik van Riel int pass;
3780d255f4f2SZhai, Edwin
3781a937ef95SBreno Leitao last_boosted_vcpu = READ_ONCE(kvm->last_boosted_vcpu);
37824c088493SRaghavendra K T kvm_vcpu_set_in_spin_loop(me, true);
3783217ece61SRik van Riel /*
3784217ece61SRik van Riel * We boost the priority of a VCPU that is runnable but not
3785217ece61SRik van Riel * currently running, because it got preempted by something
3786217ece61SRik van Riel * else and called schedule in __vcpu_run. Hopefully that
3787217ece61SRik van Riel * VCPU is holding the lock that we need and will release it.
3788217ece61SRik van Riel * We approximate round-robin by starting at the last boosted VCPU.
3789217ece61SRik van Riel */
3790c45c528eSRaghavendra K T for (pass = 0; pass < 2 && !yielded && try; pass++) {
3791217ece61SRik van Riel kvm_for_each_vcpu(i, vcpu, kvm) {
37925cfc2aabSRik van Riel if (!pass && i <= last_boosted_vcpu) {
3793217ece61SRik van Riel i = last_boosted_vcpu;
3794217ece61SRik van Riel continue;
3795217ece61SRik van Riel } else if (pass && i > last_boosted_vcpu)
3796217ece61SRik van Riel break;
3797d73eb57bSWanpeng Li if (!READ_ONCE(vcpu->ready))
37987bc7ae25SRaghavendra K T continue;
3799217ece61SRik van Riel if (vcpu == me)
3800217ece61SRik van Riel continue;
3801d92a5d1cSSean Christopherson if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
3802217ece61SRik van Riel continue;
3803046ddeedSWanpeng Li if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
380452acd22fSWanpeng Li !kvm_arch_dy_has_pending_interrupt(vcpu) &&
3805046ddeedSWanpeng Li !kvm_arch_vcpu_in_kernel(vcpu))
3806199b5763SLongpeng(Mike) continue;
380706e48c51SRaghavendra K T if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
380806e48c51SRaghavendra K T continue;
3809c45c528eSRaghavendra K T
3810c45c528eSRaghavendra K T yielded = kvm_vcpu_yield_to(vcpu);
3811c45c528eSRaghavendra K T if (yielded > 0) {
3812a937ef95SBreno Leitao WRITE_ONCE(kvm->last_boosted_vcpu, i);
3813c45c528eSRaghavendra K T break;
3814c45c528eSRaghavendra K T } else if (yielded < 0) {
3815c45c528eSRaghavendra K T try--;
3816c45c528eSRaghavendra K T if (!try)
3817217ece61SRik van Riel break;
3818217ece61SRik van Riel }
3819217ece61SRik van Riel }
3820217ece61SRik van Riel }
38214c088493SRaghavendra K T kvm_vcpu_set_in_spin_loop(me, false);
382206e48c51SRaghavendra K T
382306e48c51SRaghavendra K T /* Ensure vcpu is not eligible during next spinloop */
382406e48c51SRaghavendra K T kvm_vcpu_set_dy_eligible(me, false);
3825d255f4f2SZhai, Edwin }
3826d255f4f2SZhai, Edwin EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3827d255f4f2SZhai, Edwin
3828fb04a1edSPeter Xu static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3829fb04a1edSPeter Xu {
3830dc70ec21SDavid Woodhouse #ifdef CONFIG_HAVE_KVM_DIRTY_RING
3831fb04a1edSPeter Xu return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3832fb04a1edSPeter Xu (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3833fb04a1edSPeter Xu kvm->dirty_ring_size / PAGE_SIZE);
3834fb04a1edSPeter Xu #else
3835fb04a1edSPeter Xu return false;
3836fb04a1edSPeter Xu #endif
3837fb04a1edSPeter Xu }
3838fb04a1edSPeter Xu
38391499fa80SSouptick Joarder static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
38400fce5623SAvi Kivity {
384111bac800SDave Jiang struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
38420fce5623SAvi Kivity struct page *page;
38430fce5623SAvi Kivity
38440fce5623SAvi Kivity if (vmf->pgoff == 0)
38450fce5623SAvi Kivity page = virt_to_page(vcpu->run);
384609566765SAvi Kivity #ifdef CONFIG_X86
38470fce5623SAvi Kivity else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
38480fce5623SAvi Kivity page = virt_to_page(vcpu->arch.pio_data);
384909566765SAvi Kivity #endif
38504b4357e0SPaolo Bonzini #ifdef CONFIG_KVM_MMIO
38515f94c174SLaurent Vivier else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
38525f94c174SLaurent Vivier page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
38535f94c174SLaurent Vivier #endif
3854fb04a1edSPeter Xu else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
3855fb04a1edSPeter Xu page = kvm_dirty_ring_get_page(
3856fb04a1edSPeter Xu &vcpu->dirty_ring,
3857fb04a1edSPeter Xu vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
38580fce5623SAvi Kivity else
38595b1c1493SCarsten Otte return kvm_arch_vcpu_fault(vcpu, vmf);
38600fce5623SAvi Kivity get_page(page);
38610fce5623SAvi Kivity vmf->page = page;
38620fce5623SAvi Kivity return 0;
38630fce5623SAvi Kivity }
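/*
 * These page offsets define the vCPU mmap layout seen by userspace.  On
 * x86, for example, page 0 of the vCPU fd mapping is the kvm_run structure,
 * page KVM_PIO_PAGE_OFFSET (1) is the PIO data page,
 * page KVM_COALESCED_MMIO_PAGE_OFFSET (2) is the coalesced MMIO ring, and
 * the dirty ring pages start at KVM_DIRTY_LOG_PAGE_OFFSET (64) when the
 * dirty ring is enabled.
 */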
38640fce5623SAvi Kivity
3865f0f37e2fSAlexey Dobriyan static const struct vm_operations_struct kvm_vcpu_vm_ops = {
38660fce5623SAvi Kivity .fault = kvm_vcpu_fault,
38670fce5623SAvi Kivity };
38680fce5623SAvi Kivity
38690fce5623SAvi Kivity static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
38700fce5623SAvi Kivity {
3871fb04a1edSPeter Xu struct kvm_vcpu *vcpu = file->private_data;
387211476d27SYang Li unsigned long pages = vma_pages(vma);
3873fb04a1edSPeter Xu
3874fb04a1edSPeter Xu if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3875fb04a1edSPeter Xu kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3876fb04a1edSPeter Xu ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3877fb04a1edSPeter Xu return -EINVAL;
3878fb04a1edSPeter Xu
38790fce5623SAvi Kivity vma->vm_ops = &kvm_vcpu_vm_ops;
38800fce5623SAvi Kivity return 0;
38810fce5623SAvi Kivity }
38820fce5623SAvi Kivity
38830fce5623SAvi Kivity static int kvm_vcpu_release(struct inode *inode, struct file *filp)
38840fce5623SAvi Kivity {
38850fce5623SAvi Kivity struct kvm_vcpu *vcpu = filp->private_data;
38860fce5623SAvi Kivity
388766c0b394SAl Viro kvm_put_kvm(vcpu->kvm);
38880fce5623SAvi Kivity return 0;
38890fce5623SAvi Kivity }
38900fce5623SAvi Kivity
389170375c2dSDavid Matlack static const struct file_operations kvm_vcpu_fops = {
38920fce5623SAvi Kivity .release = kvm_vcpu_release,
38930fce5623SAvi Kivity .unlocked_ioctl = kvm_vcpu_ioctl,
38940fce5623SAvi Kivity .mmap = kvm_vcpu_mmap,
38956038f373SArnd Bergmann .llseek = noop_llseek,
38967ddfd3e0SMarc Zyngier KVM_COMPAT(kvm_vcpu_compat_ioctl),
38970fce5623SAvi Kivity };
38980fce5623SAvi Kivity
38990fce5623SAvi Kivity /*
39000fce5623SAvi Kivity * Allocates an inode for the vcpu.
39010fce5623SAvi Kivity */
39020fce5623SAvi Kivity static int create_vcpu_fd(struct kvm_vcpu *vcpu)
39030fce5623SAvi Kivity {
3904e46b4692SMasatake YAMATO char name[8 + 1 + ITOA_MAX_LEN + 1];
3905e46b4692SMasatake YAMATO
3906e46b4692SMasatake YAMATO snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3907e46b4692SMasatake YAMATO return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
39080fce5623SAvi Kivity }
39090fce5623SAvi Kivity
3910e36de87dSVineeth Pillai #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3911e36de87dSVineeth Pillai static int vcpu_get_pid(void *data, u64 *val)
3912e36de87dSVineeth Pillai {
391314aa40a1SLi kunyu struct kvm_vcpu *vcpu = data;
391476021e96SSean Christopherson
391576021e96SSean Christopherson rcu_read_lock();
391676021e96SSean Christopherson *val = pid_nr(rcu_dereference(vcpu->pid));
391776021e96SSean Christopherson rcu_read_unlock();
3918e36de87dSVineeth Pillai return 0;
3919e36de87dSVineeth Pillai }
3920e36de87dSVineeth Pillai
3921e36de87dSVineeth Pillai DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
3922e36de87dSVineeth Pillai
39233e7093d0SGreg KH static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
392445b5939eSLuiz Capitulino {
3925d56f5136SPaolo Bonzini struct dentry *debugfs_dentry;
392645b5939eSLuiz Capitulino char dir_name[ITOA_MAX_LEN * 2];
392745b5939eSLuiz Capitulino
392845b5939eSLuiz Capitulino if (!debugfs_initialized())
39293e7093d0SGreg KH return;
393045b5939eSLuiz Capitulino
393145b5939eSLuiz Capitulino snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
3932d56f5136SPaolo Bonzini debugfs_dentry = debugfs_create_dir(dir_name,
393345b5939eSLuiz Capitulino vcpu->kvm->debugfs_dentry);
3934e36de87dSVineeth Pillai debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
3935e36de87dSVineeth Pillai &vcpu_get_pid_fops);
393645b5939eSLuiz Capitulino
3937d56f5136SPaolo Bonzini kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
393845b5939eSLuiz Capitulino }
3939e36de87dSVineeth Pillai #endif
394045b5939eSLuiz Capitulino
39410fce5623SAvi Kivity /*
39420fce5623SAvi Kivity * Creates some virtual cpus. Good luck creating more than one.
39430fce5623SAvi Kivity */
394473880c80SGleb Natapov static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
39450fce5623SAvi Kivity {
39460fce5623SAvi Kivity int r;
3947e09fefdeSDavid Hildenbrand struct kvm_vcpu *vcpu;
39488bd826d6SSean Christopherson struct page *page;
39490fce5623SAvi Kivity
3950a1c42ddeSJuergen Gross if (id >= KVM_MAX_VCPU_IDS)
3951338c7dbaSAndy Honig return -EINVAL;
3952338c7dbaSAndy Honig
39536c7caebcSPaolo Bonzini mutex_lock(&kvm->lock);
3954f502cc56SSean Christopherson if (kvm->created_vcpus >= kvm->max_vcpus) {
39556c7caebcSPaolo Bonzini mutex_unlock(&kvm->lock);
39566c7caebcSPaolo Bonzini return -EINVAL;
39576c7caebcSPaolo Bonzini }
39586c7caebcSPaolo Bonzini
39591d5e740dSZeng Guang r = kvm_arch_vcpu_precreate(kvm, id);
39601d5e740dSZeng Guang if (r) {
39611d5e740dSZeng Guang mutex_unlock(&kvm->lock);
39621d5e740dSZeng Guang return r;
39631d5e740dSZeng Guang }
39641d5e740dSZeng Guang
39656c7caebcSPaolo Bonzini kvm->created_vcpus++;
39666c7caebcSPaolo Bonzini mutex_unlock(&kvm->lock);
39676c7caebcSPaolo Bonzini
396885f47930SSean Christopherson vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
3969e529ef66SSean Christopherson if (!vcpu) {
3970e529ef66SSean Christopherson r = -ENOMEM;
39716c7caebcSPaolo Bonzini goto vcpu_decrement;
39726c7caebcSPaolo Bonzini }
39730fce5623SAvi Kivity
3974fcd97ad5SPeter Xu BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
397593bb59caSShakeel Butt page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
39768bd826d6SSean Christopherson if (!page) {
39778bd826d6SSean Christopherson r = -ENOMEM;
3978e529ef66SSean Christopherson goto vcpu_free;
39798bd826d6SSean Christopherson }
39808bd826d6SSean Christopherson vcpu->run = page_address(page);
39818bd826d6SSean Christopherson
39828bd826d6SSean Christopherson kvm_vcpu_init(vcpu, kvm, id);
3983e529ef66SSean Christopherson
3984e529ef66SSean Christopherson r = kvm_arch_vcpu_create(vcpu);
3985e529ef66SSean Christopherson if (r)
39868bd826d6SSean Christopherson goto vcpu_free_run_page;
3987e529ef66SSean Christopherson
3988fb04a1edSPeter Xu if (kvm->dirty_ring_size) {
3989fb04a1edSPeter Xu r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
3990fb04a1edSPeter Xu id, kvm->dirty_ring_size);
3991fb04a1edSPeter Xu if (r)
3992fb04a1edSPeter Xu goto arch_vcpu_destroy;
3993fb04a1edSPeter Xu }
3994fb04a1edSPeter Xu
39950fce5623SAvi Kivity mutex_lock(&kvm->lock);
399642a90008SDavid Woodhouse
399742a90008SDavid Woodhouse #ifdef CONFIG_LOCKDEP
399842a90008SDavid Woodhouse /* Ensure that lockdep knows vcpu->mutex is taken *inside* kvm->lock */
399942a90008SDavid Woodhouse mutex_lock(&vcpu->mutex);
400042a90008SDavid Woodhouse mutex_unlock(&vcpu->mutex);
400142a90008SDavid Woodhouse #endif
400242a90008SDavid Woodhouse
4003e09fefdeSDavid Hildenbrand if (kvm_get_vcpu_by_id(kvm, id)) {
40040fce5623SAvi Kivity r = -EEXIST;
4005d780592bSJan Kiszka goto unlock_vcpu_destroy;
40060fce5623SAvi Kivity }
400773880c80SGleb Natapov
40088750e72aSRadim Krčmář vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
4009afb2acb2SMichal Luczaj r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT);
4010c5b07754SMarc Zyngier if (r)
4011c5b07754SMarc Zyngier goto unlock_vcpu_destroy;
40120fce5623SAvi Kivity
40130fce5623SAvi Kivity /* Now it's all set up, let userspace reach it */
401466c0b394SAl Viro kvm_get_kvm(kvm);
40150fce5623SAvi Kivity r = create_vcpu_fd(vcpu);
4016afb2acb2SMichal Luczaj if (r < 0)
4017afb2acb2SMichal Luczaj goto kvm_put_xa_release;
4018afb2acb2SMichal Luczaj
40195f643e46SMichal Luczaj if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
4020afb2acb2SMichal Luczaj r = -EINVAL;
4021afb2acb2SMichal Luczaj goto kvm_put_xa_release;
402273880c80SGleb Natapov }
402373880c80SGleb Natapov
4024dd489240SPaolo Bonzini /*
4025c5b07754SMarc Zyngier * Pairs with smp_rmb() in kvm_get_vcpu(). Store the vcpu
4026c5b07754SMarc Zyngier * pointer before kvm->online_vcpus is incremented.
4027dd489240SPaolo Bonzini */
402873880c80SGleb Natapov smp_wmb();
402973880c80SGleb Natapov atomic_inc(&kvm->online_vcpus);
403073880c80SGleb Natapov
403173880c80SGleb Natapov mutex_unlock(&kvm->lock);
403242897d86SMarcelo Tosatti kvm_arch_vcpu_postcreate(vcpu);
403363d04348SPaolo Bonzini kvm_create_vcpu_debugfs(vcpu);
40340fce5623SAvi Kivity return r;
40350fce5623SAvi Kivity
4036afb2acb2SMichal Luczaj kvm_put_xa_release:
4037afb2acb2SMichal Luczaj kvm_put_kvm_no_destroy(kvm);
4038afb2acb2SMichal Luczaj xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
4039d780592bSJan Kiszka unlock_vcpu_destroy:
40407d8fece6SGlauber Costa mutex_unlock(&kvm->lock);
4041fb04a1edSPeter Xu kvm_dirty_ring_free(&vcpu->dirty_ring);
4042fb04a1edSPeter Xu arch_vcpu_destroy:
40430fce5623SAvi Kivity kvm_arch_vcpu_destroy(vcpu);
40448bd826d6SSean Christopherson vcpu_free_run_page:
40458bd826d6SSean Christopherson free_page((unsigned long)vcpu->run);
4046e529ef66SSean Christopherson vcpu_free:
4047e529ef66SSean Christopherson kmem_cache_free(kvm_vcpu_cache, vcpu);
40486c7caebcSPaolo Bonzini vcpu_decrement:
40496c7caebcSPaolo Bonzini mutex_lock(&kvm->lock);
40506c7caebcSPaolo Bonzini kvm->created_vcpus--;
40516c7caebcSPaolo Bonzini mutex_unlock(&kvm->lock);
40520fce5623SAvi Kivity return r;
40530fce5623SAvi Kivity }
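/*
 * Illustrative userspace flow for the ioctl above (sketch, not part of this
 * file; fd variable names are placeholders and error handling is omitted):
 *
 *   vcpu_fd   = ioctl(vm_fd, KVM_CREATE_VCPU, (unsigned long)id);
 *   mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *   run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
 *              vcpu_fd, 0);   // struct kvm_run page shared with the kernel
 */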
40540fce5623SAvi Kivity
40550fce5623SAvi Kivity static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
40560fce5623SAvi Kivity {
40570fce5623SAvi Kivity if (sigset) {
40580fce5623SAvi Kivity sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
40590fce5623SAvi Kivity vcpu->sigset_active = 1;
40600fce5623SAvi Kivity vcpu->sigset = *sigset;
40610fce5623SAvi Kivity } else
40620fce5623SAvi Kivity vcpu->sigset_active = 0;
40630fce5623SAvi Kivity return 0;
40640fce5623SAvi Kivity }
40650fce5623SAvi Kivity
4066ce55c049SJing Zhang static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
4067ce55c049SJing Zhang size_t size, loff_t *offset)
4068ce55c049SJing Zhang {
4069ce55c049SJing Zhang struct kvm_vcpu *vcpu = file->private_data;
4070ce55c049SJing Zhang
4071ce55c049SJing Zhang return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
4072ce55c049SJing Zhang &kvm_vcpu_stats_desc[0], &vcpu->stat,
4073ce55c049SJing Zhang sizeof(vcpu->stat), user_buffer, size, offset);
4074ce55c049SJing Zhang }
4075ce55c049SJing Zhang
4076eed3013fSSean Christopherson static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
4077eed3013fSSean Christopherson {
4078eed3013fSSean Christopherson struct kvm_vcpu *vcpu = file->private_data;
4079eed3013fSSean Christopherson
4080eed3013fSSean Christopherson kvm_put_kvm(vcpu->kvm);
4081eed3013fSSean Christopherson return 0;
4082eed3013fSSean Christopherson }
4083eed3013fSSean Christopherson
4084ce55c049SJing Zhang static const struct file_operations kvm_vcpu_stats_fops = {
4085ce55c049SJing Zhang .read = kvm_vcpu_stats_read,
4086eed3013fSSean Christopherson .release = kvm_vcpu_stats_release,
4087ce55c049SJing Zhang .llseek = noop_llseek,
4088ce55c049SJing Zhang };
4089ce55c049SJing Zhang
4090ce55c049SJing Zhang static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
4091ce55c049SJing Zhang {
4092ce55c049SJing Zhang int fd;
4093ce55c049SJing Zhang struct file *file;
4094ce55c049SJing Zhang char name[15 + ITOA_MAX_LEN + 1];
4095ce55c049SJing Zhang
4096ce55c049SJing Zhang snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
4097ce55c049SJing Zhang
4098ce55c049SJing Zhang fd = get_unused_fd_flags(O_CLOEXEC);
4099ce55c049SJing Zhang if (fd < 0)
4100ce55c049SJing Zhang return fd;
4101ce55c049SJing Zhang
4102ce55c049SJing Zhang file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
4103ce55c049SJing Zhang if (IS_ERR(file)) {
4104ce55c049SJing Zhang put_unused_fd(fd);
4105ce55c049SJing Zhang return PTR_ERR(file);
4106ce55c049SJing Zhang }
4107eed3013fSSean Christopherson
4108eed3013fSSean Christopherson kvm_get_kvm(vcpu->kvm);
4109eed3013fSSean Christopherson
4110ce55c049SJing Zhang file->f_mode |= FMODE_PREAD;
4111ce55c049SJing Zhang fd_install(fd, file);
4112ce55c049SJing Zhang
4113ce55c049SJing Zhang return fd;
4114ce55c049SJing Zhang }
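/*
 * Illustrative userspace usage of the vCPU stats fd (sketch, not part of
 * this file): the fd is read-only and pread-capable, with a
 * struct kvm_stats_header at offset 0 followed by descriptors and data.
 *
 *   stats_fd = ioctl(vcpu_fd, KVM_GET_STATS_FD, NULL);
 *   pread(stats_fd, &header, sizeof(header), 0);
 */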
4115ce55c049SJing Zhang
41160fce5623SAvi Kivity static long kvm_vcpu_ioctl(struct file *filp,
41170fce5623SAvi Kivity unsigned int ioctl, unsigned long arg)
41180fce5623SAvi Kivity {
41190fce5623SAvi Kivity struct kvm_vcpu *vcpu = filp->private_data;
41200fce5623SAvi Kivity void __user *argp = (void __user *)arg;
41210fce5623SAvi Kivity int r;
4122fa3795a7SDave Hansen struct kvm_fpu *fpu = NULL;
4123fa3795a7SDave Hansen struct kvm_sregs *kvm_sregs = NULL;
41240fce5623SAvi Kivity
4125f4d31653SPaolo Bonzini if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
41260fce5623SAvi Kivity return -EIO;
41272122ff5eSAvi Kivity
41282ea75be3SDavid Matlack if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
41292ea75be3SDavid Matlack return -EINVAL;
41302ea75be3SDavid Matlack
41312122ff5eSAvi Kivity /*
41325cb0944cSPaolo Bonzini * Some architectures have vcpu ioctls that are asynchronous to vcpu
41335cb0944cSPaolo Bonzini * execution; mutex_lock() would break them.
41342122ff5eSAvi Kivity */
41355cb0944cSPaolo Bonzini r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
41365cb0944cSPaolo Bonzini if (r != -ENOIOCTLCMD)
41379fc77441SMichael S. Tsirkin return r;
41382122ff5eSAvi Kivity
4139ec7660ccSChristoffer Dall if (mutex_lock_killable(&vcpu->mutex))
4140ec7660ccSChristoffer Dall return -EINTR;
41410fce5623SAvi Kivity switch (ioctl) {
41420e4524a5SChristian Borntraeger case KVM_RUN: {
41430e4524a5SChristian Borntraeger struct pid *oldpid;
41440fce5623SAvi Kivity r = -EINVAL;
41450fce5623SAvi Kivity if (arg)
41460fce5623SAvi Kivity goto out;
41470e4524a5SChristian Borntraeger oldpid = rcu_access_pointer(vcpu->pid);
414871dbc8a9SEric W. Biederman if (unlikely(oldpid != task_pid(current))) {
41497a72f7a1SChristian Borntraeger /* The thread running this VCPU changed. */
4150bd2a6394SChristoffer Dall struct pid *newpid;
4151f95ef0cdSXiubo Li
4152bd2a6394SChristoffer Dall r = kvm_arch_vcpu_run_pid_change(vcpu);
4153bd2a6394SChristoffer Dall if (r)
4154bd2a6394SChristoffer Dall break;
4155bd2a6394SChristoffer Dall
4156bd2a6394SChristoffer Dall newpid = get_task_pid(current, PIDTYPE_PID);
41577a72f7a1SChristian Borntraeger rcu_assign_pointer(vcpu->pid, newpid);
41587a72f7a1SChristian Borntraeger if (oldpid)
41597a72f7a1SChristian Borntraeger synchronize_rcu();
41607a72f7a1SChristian Borntraeger put_pid(oldpid);
41617a72f7a1SChristian Borntraeger }
41621b94f6f8STianjia Zhang r = kvm_arch_vcpu_ioctl_run(vcpu);
416364be5007SGleb Natapov trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
41640fce5623SAvi Kivity break;
41650e4524a5SChristian Borntraeger }
41660fce5623SAvi Kivity case KVM_GET_REGS: {
41673e4bb3acSXiantao Zhang struct kvm_regs *kvm_regs;
41680fce5623SAvi Kivity
41693e4bb3acSXiantao Zhang r = -ENOMEM;
4170b12ce36aSBen Gardon kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
41713e4bb3acSXiantao Zhang if (!kvm_regs)
41723e4bb3acSXiantao Zhang goto out;
41733e4bb3acSXiantao Zhang r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
41740fce5623SAvi Kivity if (r)
41753e4bb3acSXiantao Zhang goto out_free1;
41760fce5623SAvi Kivity r = -EFAULT;
41773e4bb3acSXiantao Zhang if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
41783e4bb3acSXiantao Zhang goto out_free1;
41790fce5623SAvi Kivity r = 0;
41803e4bb3acSXiantao Zhang out_free1:
41813e4bb3acSXiantao Zhang kfree(kvm_regs);
41820fce5623SAvi Kivity break;
41830fce5623SAvi Kivity }
41840fce5623SAvi Kivity case KVM_SET_REGS: {
41853e4bb3acSXiantao Zhang struct kvm_regs *kvm_regs;
41860fce5623SAvi Kivity
4187ff5c2c03SSasha Levin kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
4188ff5c2c03SSasha Levin if (IS_ERR(kvm_regs)) {
4189ff5c2c03SSasha Levin r = PTR_ERR(kvm_regs);
41903e4bb3acSXiantao Zhang goto out;
4191ff5c2c03SSasha Levin }
41923e4bb3acSXiantao Zhang r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
41933e4bb3acSXiantao Zhang kfree(kvm_regs);
41940fce5623SAvi Kivity break;
41950fce5623SAvi Kivity }
41960fce5623SAvi Kivity case KVM_GET_SREGS: {
4197b12ce36aSBen Gardon kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
4198b12ce36aSBen Gardon GFP_KERNEL_ACCOUNT);
4199fa3795a7SDave Hansen r = -ENOMEM;
4200fa3795a7SDave Hansen if (!kvm_sregs)
4201fa3795a7SDave Hansen goto out;
4202fa3795a7SDave Hansen r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
42030fce5623SAvi Kivity if (r)
42040fce5623SAvi Kivity goto out;
42050fce5623SAvi Kivity r = -EFAULT;
4206fa3795a7SDave Hansen if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
42070fce5623SAvi Kivity goto out;
42080fce5623SAvi Kivity r = 0;
42090fce5623SAvi Kivity break;
42100fce5623SAvi Kivity }
42110fce5623SAvi Kivity case KVM_SET_SREGS: {
4212ff5c2c03SSasha Levin kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
4213ff5c2c03SSasha Levin if (IS_ERR(kvm_sregs)) {
4214ff5c2c03SSasha Levin r = PTR_ERR(kvm_sregs);
421518595411SGuo Chao kvm_sregs = NULL;
42160fce5623SAvi Kivity goto out;
4217ff5c2c03SSasha Levin }
4218fa3795a7SDave Hansen r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
42190fce5623SAvi Kivity break;
42200fce5623SAvi Kivity }
422162d9f0dbSMarcelo Tosatti case KVM_GET_MP_STATE: {
422262d9f0dbSMarcelo Tosatti struct kvm_mp_state mp_state;
422362d9f0dbSMarcelo Tosatti
422462d9f0dbSMarcelo Tosatti r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
422562d9f0dbSMarcelo Tosatti if (r)
422662d9f0dbSMarcelo Tosatti goto out;
422762d9f0dbSMarcelo Tosatti r = -EFAULT;
4228893bdbf1SXiubo Li if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
422962d9f0dbSMarcelo Tosatti goto out;
423062d9f0dbSMarcelo Tosatti r = 0;
423162d9f0dbSMarcelo Tosatti break;
423262d9f0dbSMarcelo Tosatti }
423362d9f0dbSMarcelo Tosatti case KVM_SET_MP_STATE: {
423462d9f0dbSMarcelo Tosatti struct kvm_mp_state mp_state;
423562d9f0dbSMarcelo Tosatti
423662d9f0dbSMarcelo Tosatti r = -EFAULT;
4237893bdbf1SXiubo Li if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
423862d9f0dbSMarcelo Tosatti goto out;
423962d9f0dbSMarcelo Tosatti r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
424062d9f0dbSMarcelo Tosatti break;
424162d9f0dbSMarcelo Tosatti }
42420fce5623SAvi Kivity case KVM_TRANSLATE: {
42430fce5623SAvi Kivity struct kvm_translation tr;
42440fce5623SAvi Kivity
42450fce5623SAvi Kivity r = -EFAULT;
4246893bdbf1SXiubo Li if (copy_from_user(&tr, argp, sizeof(tr)))
42470fce5623SAvi Kivity goto out;
42480fce5623SAvi Kivity r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
42490fce5623SAvi Kivity if (r)
42500fce5623SAvi Kivity goto out;
42510fce5623SAvi Kivity r = -EFAULT;
4252893bdbf1SXiubo Li if (copy_to_user(argp, &tr, sizeof(tr)))
42530fce5623SAvi Kivity goto out;
42540fce5623SAvi Kivity r = 0;
42550fce5623SAvi Kivity break;
42560fce5623SAvi Kivity }
4257d0bfb940SJan Kiszka case KVM_SET_GUEST_DEBUG: {
4258d0bfb940SJan Kiszka struct kvm_guest_debug dbg;
42590fce5623SAvi Kivity
42600fce5623SAvi Kivity r = -EFAULT;
4261893bdbf1SXiubo Li if (copy_from_user(&dbg, argp, sizeof(dbg)))
42620fce5623SAvi Kivity goto out;
4263d0bfb940SJan Kiszka r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
42640fce5623SAvi Kivity break;
42650fce5623SAvi Kivity }
42660fce5623SAvi Kivity case KVM_SET_SIGNAL_MASK: {
42670fce5623SAvi Kivity struct kvm_signal_mask __user *sigmask_arg = argp;
42680fce5623SAvi Kivity struct kvm_signal_mask kvm_sigmask;
42690fce5623SAvi Kivity sigset_t sigset, *p;
42700fce5623SAvi Kivity
42710fce5623SAvi Kivity p = NULL;
42720fce5623SAvi Kivity if (argp) {
42730fce5623SAvi Kivity r = -EFAULT;
42740fce5623SAvi Kivity if (copy_from_user(&kvm_sigmask, argp,
4275893bdbf1SXiubo Li sizeof(kvm_sigmask)))
42760fce5623SAvi Kivity goto out;
42770fce5623SAvi Kivity r = -EINVAL;
4278893bdbf1SXiubo Li if (kvm_sigmask.len != sizeof(sigset))
42790fce5623SAvi Kivity goto out;
42800fce5623SAvi Kivity r = -EFAULT;
42810fce5623SAvi Kivity if (copy_from_user(&sigset, sigmask_arg->sigset,
4282893bdbf1SXiubo Li sizeof(sigset)))
42830fce5623SAvi Kivity goto out;
42840fce5623SAvi Kivity p = &sigset;
42850fce5623SAvi Kivity }
4286376d41ffSAndi Kleen r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
42870fce5623SAvi Kivity break;
42880fce5623SAvi Kivity }
42890fce5623SAvi Kivity case KVM_GET_FPU: {
4290b12ce36aSBen Gardon fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
4291fa3795a7SDave Hansen r = -ENOMEM;
4292fa3795a7SDave Hansen if (!fpu)
4293fa3795a7SDave Hansen goto out;
4294fa3795a7SDave Hansen r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
42950fce5623SAvi Kivity if (r)
42960fce5623SAvi Kivity goto out;
42970fce5623SAvi Kivity r = -EFAULT;
4298fa3795a7SDave Hansen if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
42990fce5623SAvi Kivity goto out;
43000fce5623SAvi Kivity r = 0;
43010fce5623SAvi Kivity break;
43020fce5623SAvi Kivity }
43030fce5623SAvi Kivity case KVM_SET_FPU: {
4304ff5c2c03SSasha Levin fpu = memdup_user(argp, sizeof(*fpu));
4305ff5c2c03SSasha Levin if (IS_ERR(fpu)) {
4306ff5c2c03SSasha Levin r = PTR_ERR(fpu);
430718595411SGuo Chao fpu = NULL;
43080fce5623SAvi Kivity goto out;
4309ff5c2c03SSasha Levin }
4310fa3795a7SDave Hansen r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
43110fce5623SAvi Kivity break;
43120fce5623SAvi Kivity }
4313ce55c049SJing Zhang case KVM_GET_STATS_FD: {
4314ce55c049SJing Zhang r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
4315ce55c049SJing Zhang break;
4316ce55c049SJing Zhang }
43170fce5623SAvi Kivity default:
43180fce5623SAvi Kivity r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
43190fce5623SAvi Kivity }
43200fce5623SAvi Kivity out:
4321ec7660ccSChristoffer Dall mutex_unlock(&vcpu->mutex);
4322fa3795a7SDave Hansen kfree(fpu);
4323fa3795a7SDave Hansen kfree(kvm_sregs);
43240fce5623SAvi Kivity return r;
43250fce5623SAvi Kivity }
43260fce5623SAvi Kivity
4327de8e5d74SChristian Borntraeger #ifdef CONFIG_KVM_COMPAT
43281dda606cSAlexander Graf static long kvm_vcpu_compat_ioctl(struct file *filp,
43291dda606cSAlexander Graf unsigned int ioctl, unsigned long arg)
43301dda606cSAlexander Graf {
43311dda606cSAlexander Graf struct kvm_vcpu *vcpu = filp->private_data;
43321dda606cSAlexander Graf void __user *argp = compat_ptr(arg);
43331dda606cSAlexander Graf int r;
43341dda606cSAlexander Graf
4335f4d31653SPaolo Bonzini if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
43361dda606cSAlexander Graf return -EIO;
43371dda606cSAlexander Graf
43381dda606cSAlexander Graf switch (ioctl) {
43391dda606cSAlexander Graf case KVM_SET_SIGNAL_MASK: {
43401dda606cSAlexander Graf struct kvm_signal_mask __user *sigmask_arg = argp;
43411dda606cSAlexander Graf struct kvm_signal_mask kvm_sigmask;
43421dda606cSAlexander Graf sigset_t sigset;
43431dda606cSAlexander Graf
43441dda606cSAlexander Graf if (argp) {
43451dda606cSAlexander Graf r = -EFAULT;
43461dda606cSAlexander Graf if (copy_from_user(&kvm_sigmask, argp,
4347893bdbf1SXiubo Li sizeof(kvm_sigmask)))
43481dda606cSAlexander Graf goto out;
43491dda606cSAlexander Graf r = -EINVAL;
43503968cf62SAl Viro if (kvm_sigmask.len != sizeof(compat_sigset_t))
43511dda606cSAlexander Graf goto out;
43521dda606cSAlexander Graf r = -EFAULT;
43531393b4aaSPaolo Bonzini if (get_compat_sigset(&sigset,
43541393b4aaSPaolo Bonzini (compat_sigset_t __user *)sigmask_arg->sigset))
43551dda606cSAlexander Graf goto out;
43561dda606cSAlexander Graf r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
4357760a9a30SAlan Cox } else
4358760a9a30SAlan Cox r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
43591dda606cSAlexander Graf break;
43601dda606cSAlexander Graf }
43611dda606cSAlexander Graf default:
43621dda606cSAlexander Graf r = kvm_vcpu_ioctl(filp, ioctl, arg);
43631dda606cSAlexander Graf }
43641dda606cSAlexander Graf
43651dda606cSAlexander Graf out:
43661dda606cSAlexander Graf return r;
43671dda606cSAlexander Graf }
43681dda606cSAlexander Graf #endif
43691dda606cSAlexander Graf
4370a1cd3f08SCédric Le Goater static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
4371a1cd3f08SCédric Le Goater {
4372a1cd3f08SCédric Le Goater struct kvm_device *dev = filp->private_data;
4373a1cd3f08SCédric Le Goater
4374a1cd3f08SCédric Le Goater if (dev->ops->mmap)
4375a1cd3f08SCédric Le Goater return dev->ops->mmap(dev, vma);
4376a1cd3f08SCédric Le Goater
4377a1cd3f08SCédric Le Goater return -ENODEV;
4378a1cd3f08SCédric Le Goater }
4379a1cd3f08SCédric Le Goater
4380852b6d57SScott Wood static int kvm_device_ioctl_attr(struct kvm_device *dev,
4381852b6d57SScott Wood int (*accessor)(struct kvm_device *dev,
4382852b6d57SScott Wood struct kvm_device_attr *attr),
4383852b6d57SScott Wood unsigned long arg)
4384852b6d57SScott Wood {
4385852b6d57SScott Wood struct kvm_device_attr attr;
4386852b6d57SScott Wood
4387852b6d57SScott Wood if (!accessor)
4388852b6d57SScott Wood return -EPERM;
4389852b6d57SScott Wood
4390852b6d57SScott Wood if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4391852b6d57SScott Wood return -EFAULT;
4392852b6d57SScott Wood
4393852b6d57SScott Wood return accessor(dev, &attr);
4394852b6d57SScott Wood }
4395852b6d57SScott Wood
4396852b6d57SScott Wood static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4397852b6d57SScott Wood unsigned long arg)
4398852b6d57SScott Wood {
4399852b6d57SScott Wood struct kvm_device *dev = filp->private_data;
4400852b6d57SScott Wood
4401f4d31653SPaolo Bonzini if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
4402ddba9180SSean Christopherson return -EIO;
4403ddba9180SSean Christopherson
4404852b6d57SScott Wood switch (ioctl) {
4405852b6d57SScott Wood case KVM_SET_DEVICE_ATTR:
4406852b6d57SScott Wood return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4407852b6d57SScott Wood case KVM_GET_DEVICE_ATTR:
4408852b6d57SScott Wood return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4409852b6d57SScott Wood case KVM_HAS_DEVICE_ATTR:
4410852b6d57SScott Wood return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4411852b6d57SScott Wood default:
4412852b6d57SScott Wood if (dev->ops->ioctl)
4413852b6d57SScott Wood return dev->ops->ioctl(dev, ioctl, arg);
4414852b6d57SScott Wood
4415852b6d57SScott Wood return -ENOTTY;
4416852b6d57SScott Wood }
4417852b6d57SScott Wood }
4418852b6d57SScott Wood
4419852b6d57SScott Wood static int kvm_device_release(struct inode *inode, struct file *filp)
4420852b6d57SScott Wood {
4421852b6d57SScott Wood struct kvm_device *dev = filp->private_data;
4422852b6d57SScott Wood struct kvm *kvm = dev->kvm;
4423852b6d57SScott Wood
44242bde9b3eSCédric Le Goater if (dev->ops->release) {
44252bde9b3eSCédric Le Goater mutex_lock(&kvm->lock);
44262bde9b3eSCédric Le Goater list_del(&dev->vm_node);
44272bde9b3eSCédric Le Goater dev->ops->release(dev);
44282bde9b3eSCédric Le Goater mutex_unlock(&kvm->lock);
44292bde9b3eSCédric Le Goater }
44302bde9b3eSCédric Le Goater
4431852b6d57SScott Wood kvm_put_kvm(kvm);
4432852b6d57SScott Wood return 0;
4433852b6d57SScott Wood }
4434852b6d57SScott Wood
4435852b6d57SScott Wood static const struct file_operations kvm_device_fops = {
4436852b6d57SScott Wood .unlocked_ioctl = kvm_device_ioctl,
4437852b6d57SScott Wood .release = kvm_device_release,
44387ddfd3e0SMarc Zyngier KVM_COMPAT(kvm_device_ioctl),
4439a1cd3f08SCédric Le Goater .mmap = kvm_device_mmap,
4440852b6d57SScott Wood };
4441852b6d57SScott Wood
4442852b6d57SScott Wood struct kvm_device *kvm_device_from_filp(struct file *filp)
4443852b6d57SScott Wood {
4444852b6d57SScott Wood if (filp->f_op != &kvm_device_fops)
4445852b6d57SScott Wood return NULL;
4446852b6d57SScott Wood
4447852b6d57SScott Wood return filp->private_data;
4448852b6d57SScott Wood }
4449852b6d57SScott Wood
44508538cb22SSteven Price static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
4451d60eacb0SWill Deacon #ifdef CONFIG_KVM_MPIC
4452d60eacb0SWill Deacon [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
4453d60eacb0SWill Deacon [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
4454d60eacb0SWill Deacon #endif
4455d60eacb0SWill Deacon };
4456d60eacb0SWill Deacon
44578538cb22SSteven Price int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4458d60eacb0SWill Deacon {
4459d60eacb0SWill Deacon if (type >= ARRAY_SIZE(kvm_device_ops_table))
4460d60eacb0SWill Deacon return -ENOSPC;
4461d60eacb0SWill Deacon
4462d60eacb0SWill Deacon if (kvm_device_ops_table[type] != NULL)
4463d60eacb0SWill Deacon return -EEXIST;
4464d60eacb0SWill Deacon
4465d60eacb0SWill Deacon kvm_device_ops_table[type] = ops;
4466d60eacb0SWill Deacon return 0;
4467d60eacb0SWill Deacon }
4468d60eacb0SWill Deacon
4469571ee1b6SWanpeng Li void kvm_unregister_device_ops(u32 type)
4470571ee1b6SWanpeng Li {
4471571ee1b6SWanpeng Li if (kvm_device_ops_table[type] != NULL)
4472571ee1b6SWanpeng Li kvm_device_ops_table[type] = NULL;
4473571ee1b6SWanpeng Li }
4474571ee1b6SWanpeng Li
4475852b6d57SScott Wood static int kvm_ioctl_create_device(struct kvm *kvm,
4476852b6d57SScott Wood struct kvm_create_device *cd)
4477852b6d57SScott Wood {
4478eceb6e1dSLi kunyu const struct kvm_device_ops *ops;
4479852b6d57SScott Wood struct kvm_device *dev;
4480852b6d57SScott Wood bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
44811d487e9bSPaolo Bonzini int type;
4482852b6d57SScott Wood int ret;
4483852b6d57SScott Wood
4484d60eacb0SWill Deacon if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4485852b6d57SScott Wood return -ENODEV;
4486d60eacb0SWill Deacon
44871d487e9bSPaolo Bonzini type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
44881d487e9bSPaolo Bonzini ops = kvm_device_ops_table[type];
4489d60eacb0SWill Deacon if (ops == NULL)
4490d60eacb0SWill Deacon return -ENODEV;
4491852b6d57SScott Wood
4492852b6d57SScott Wood if (test)
4493852b6d57SScott Wood return 0;
4494852b6d57SScott Wood
4495b12ce36aSBen Gardon dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
4496852b6d57SScott Wood if (!dev)
4497852b6d57SScott Wood return -ENOMEM;
4498852b6d57SScott Wood
4499852b6d57SScott Wood dev->ops = ops;
4500852b6d57SScott Wood dev->kvm = kvm;
4501852b6d57SScott Wood
4502a28ebea2SChristoffer Dall mutex_lock(&kvm->lock);
45031d487e9bSPaolo Bonzini ret = ops->create(dev, type);
4504852b6d57SScott Wood if (ret < 0) {
4505a28ebea2SChristoffer Dall mutex_unlock(&kvm->lock);
4506852b6d57SScott Wood kfree(dev);
4507852b6d57SScott Wood return ret;
4508852b6d57SScott Wood }
4509a28ebea2SChristoffer Dall list_add(&dev->vm_node, &kvm->devices);
4510a28ebea2SChristoffer Dall mutex_unlock(&kvm->lock);
4511852b6d57SScott Wood
4512023e9fddSChristoffer Dall if (ops->init)
4513023e9fddSChristoffer Dall ops->init(dev);
4514023e9fddSChristoffer Dall
4515cfa39381SJann Horn kvm_get_kvm(kvm);
451624009b05SYann Droneaud ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
4517852b6d57SScott Wood if (ret < 0) {
4518149487bdSSean Christopherson kvm_put_kvm_no_destroy(kvm);
4519a28ebea2SChristoffer Dall mutex_lock(&kvm->lock);
4520a28ebea2SChristoffer Dall list_del(&dev->vm_node);
4521e8bc2427SAlexey Kardashevskiy if (ops->release)
4522e8bc2427SAlexey Kardashevskiy ops->release(dev);
4523a28ebea2SChristoffer Dall mutex_unlock(&kvm->lock);
4524e8bc2427SAlexey Kardashevskiy if (ops->destroy)
4525a0f1d21cSDan Carpenter ops->destroy(dev);
4526852b6d57SScott Wood return ret;
4527852b6d57SScott Wood }
4528852b6d57SScott Wood
4529852b6d57SScott Wood cd->fd = ret;
4530852b6d57SScott Wood return 0;
4531852b6d57SScott Wood }
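/*
 * Illustrative userspace usage (sketch, not part of this file): probing a
 * device type with KVM_CREATE_DEVICE_TEST does not instantiate anything,
 * while a real call returns the new device fd in cd.fd.
 *
 *   struct kvm_create_device cd = {
 *           .type  = KVM_DEV_TYPE_FSL_MPIC_20,      // example type
 *           .flags = KVM_CREATE_DEVICE_TEST,
 *   };
 *   if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) == 0)
 *           ;  // device type is supported
 */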
4532852b6d57SScott Wood
4533f15ba52bSThomas Huth static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
453492b591a4SAlexander Graf {
453592b591a4SAlexander Graf switch (arg) {
453692b591a4SAlexander Graf case KVM_CAP_USER_MEMORY:
453792b591a4SAlexander Graf case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
453892b591a4SAlexander Graf case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
453992b591a4SAlexander Graf case KVM_CAP_INTERNAL_ERROR_DATA:
454092b591a4SAlexander Graf #ifdef CONFIG_HAVE_KVM_MSI
454192b591a4SAlexander Graf case KVM_CAP_SIGNAL_MSI:
454292b591a4SAlexander Graf #endif
4543297e2105SPaul Mackerras #ifdef CONFIG_HAVE_KVM_IRQFD
4544dc9be0faSPaolo Bonzini case KVM_CAP_IRQFD:
454592b591a4SAlexander Graf #endif
4546e9ea5069SJason Wang case KVM_CAP_IOEVENTFD_ANY_LENGTH:
454792b591a4SAlexander Graf case KVM_CAP_CHECK_EXTENSION_VM:
4548e5d83c74SPaolo Bonzini case KVM_CAP_ENABLE_CAP_VM:
4549acd05785SDavid Matlack case KVM_CAP_HALT_POLL:
455092b591a4SAlexander Graf return 1;
45514b4357e0SPaolo Bonzini #ifdef CONFIG_KVM_MMIO
455230422558SPaolo Bonzini case KVM_CAP_COALESCED_MMIO:
455330422558SPaolo Bonzini return KVM_COALESCED_MMIO_PAGE_OFFSET;
45540804c849SPeng Hao case KVM_CAP_COALESCED_PIO:
45550804c849SPeng Hao return 1;
455630422558SPaolo Bonzini #endif
45573c9bd400SJay Zhou #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
45583c9bd400SJay Zhou case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
45593c9bd400SJay Zhou return KVM_DIRTY_LOG_MANUAL_CAPS;
45603c9bd400SJay Zhou #endif
456192b591a4SAlexander Graf #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
456292b591a4SAlexander Graf case KVM_CAP_IRQ_ROUTING:
456392b591a4SAlexander Graf return KVM_MAX_IRQ_ROUTES;
456492b591a4SAlexander Graf #endif
4565f481b069SPaolo Bonzini #if KVM_ADDRESS_SPACE_NUM > 1
4566f481b069SPaolo Bonzini case KVM_CAP_MULTI_ADDRESS_SPACE:
4567f481b069SPaolo Bonzini return KVM_ADDRESS_SPACE_NUM;
4568f481b069SPaolo Bonzini #endif
4569c110ae57SPaolo Bonzini case KVM_CAP_NR_MEMSLOTS:
4570c110ae57SPaolo Bonzini return KVM_USER_MEM_SLOTS;
4571fb04a1edSPeter Xu case KVM_CAP_DIRTY_LOG_RING:
457217601bfeSMarc Zyngier #ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
457317601bfeSMarc Zyngier return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
457417601bfeSMarc Zyngier #else
457517601bfeSMarc Zyngier return 0;
457617601bfeSMarc Zyngier #endif
457717601bfeSMarc Zyngier case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
457817601bfeSMarc Zyngier #ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
4579fb04a1edSPeter Xu return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4580fb04a1edSPeter Xu #else
4581fb04a1edSPeter Xu return 0;
4582fb04a1edSPeter Xu #endif
458386bdf3ebSGavin Shan #ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
458486bdf3ebSGavin Shan case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
458586bdf3ebSGavin Shan #endif
4586ce55c049SJing Zhang case KVM_CAP_BINARY_STATS_FD:
4587d495f942SPaolo Bonzini case KVM_CAP_SYSTEM_EVENT_DATA:
4588ce55c049SJing Zhang return 1;
458992b591a4SAlexander Graf default:
459092b591a4SAlexander Graf break;
459192b591a4SAlexander Graf }
459292b591a4SAlexander Graf return kvm_vm_ioctl_check_extension(kvm, arg);
459392b591a4SAlexander Graf }
459492b591a4SAlexander Graf
4595fb04a1edSPeter Xu static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4596fb04a1edSPeter Xu {
4597fb04a1edSPeter Xu int r;
4598fb04a1edSPeter Xu
4599fb04a1edSPeter Xu if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4600fb04a1edSPeter Xu return -EINVAL;
4601fb04a1edSPeter Xu
4602fb04a1edSPeter Xu /* the size should be a power of 2 */
4603fb04a1edSPeter Xu if (!size || (size & (size - 1)))
4604fb04a1edSPeter Xu return -EINVAL;
4605fb04a1edSPeter Xu
4606fb04a1edSPeter Xu /* Must be large enough for the reserved entries, and at least a page */
4607fb04a1edSPeter Xu if (size < kvm_dirty_ring_get_rsvd_entries() *
4608fb04a1edSPeter Xu sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4609fb04a1edSPeter Xu return -EINVAL;
4610fb04a1edSPeter Xu
4611fb04a1edSPeter Xu if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4612fb04a1edSPeter Xu sizeof(struct kvm_dirty_gfn))
4613fb04a1edSPeter Xu return -E2BIG;
4614fb04a1edSPeter Xu
4615fb04a1edSPeter Xu /* The ring size can only be set once */
4616fb04a1edSPeter Xu if (kvm->dirty_ring_size)
4617fb04a1edSPeter Xu return -EINVAL;
4618fb04a1edSPeter Xu
4619fb04a1edSPeter Xu mutex_lock(&kvm->lock);
4620fb04a1edSPeter Xu
4621fb04a1edSPeter Xu if (kvm->created_vcpus) {
4622fb04a1edSPeter Xu /* The ring size cannot be changed once vCPUs have been created */
4623fb04a1edSPeter Xu r = -EINVAL;
4624fb04a1edSPeter Xu } else {
4625fb04a1edSPeter Xu kvm->dirty_ring_size = size;
4626fb04a1edSPeter Xu r = 0;
4627fb04a1edSPeter Xu }
4628fb04a1edSPeter Xu
4629fb04a1edSPeter Xu mutex_unlock(&kvm->lock);
4630fb04a1edSPeter Xu return r;
4631fb04a1edSPeter Xu }
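/*
 * Worked example of the size checks above (illustrative): with 4 KiB pages
 * and 16-byte struct kvm_dirty_gfn entries, the smallest acceptable size is
 * one page (256 entries); the size must also be a power of two, cover the
 * reserved entries, and stay within KVM_DIRTY_RING_MAX_ENTRIES entries.
 */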
4632fb04a1edSPeter Xu
4633fb04a1edSPeter Xu static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4634fb04a1edSPeter Xu {
463546808a4cSMarc Zyngier unsigned long i;
4636fb04a1edSPeter Xu struct kvm_vcpu *vcpu;
4637fb04a1edSPeter Xu int cleared = 0;
4638fb04a1edSPeter Xu
4639fb04a1edSPeter Xu if (!kvm->dirty_ring_size)
4640fb04a1edSPeter Xu return -EINVAL;
4641fb04a1edSPeter Xu
4642fb04a1edSPeter Xu mutex_lock(&kvm->slots_lock);
4643fb04a1edSPeter Xu
4644fb04a1edSPeter Xu kvm_for_each_vcpu(i, vcpu, kvm)
4645fb04a1edSPeter Xu cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4646fb04a1edSPeter Xu
4647fb04a1edSPeter Xu mutex_unlock(&kvm->slots_lock);
4648fb04a1edSPeter Xu
4649fb04a1edSPeter Xu if (cleared)
4650fb04a1edSPeter Xu kvm_flush_remote_tlbs(kvm);
4651fb04a1edSPeter Xu
4652fb04a1edSPeter Xu return cleared;
4653fb04a1edSPeter Xu }
4654fb04a1edSPeter Xu
4655e5d83c74SPaolo Bonzini int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4656e5d83c74SPaolo Bonzini struct kvm_enable_cap *cap)
4657e5d83c74SPaolo Bonzini {
4658e5d83c74SPaolo Bonzini return -EINVAL;
4659e5d83c74SPaolo Bonzini }
4660e5d83c74SPaolo Bonzini
466126f45714SRicardo Koller bool kvm_are_all_memslots_empty(struct kvm *kvm)
466286bdf3ebSGavin Shan {
466386bdf3ebSGavin Shan int i;
466486bdf3ebSGavin Shan
466586bdf3ebSGavin Shan lockdep_assert_held(&kvm->slots_lock);
466686bdf3ebSGavin Shan
466786bdf3ebSGavin Shan for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
466886bdf3ebSGavin Shan if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
466986bdf3ebSGavin Shan return false;
467086bdf3ebSGavin Shan }
467186bdf3ebSGavin Shan
467286bdf3ebSGavin Shan return true;
467386bdf3ebSGavin Shan }
467426f45714SRicardo Koller EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
467586bdf3ebSGavin Shan
4676e5d83c74SPaolo Bonzini static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4677e5d83c74SPaolo Bonzini struct kvm_enable_cap *cap)
4678e5d83c74SPaolo Bonzini {
4679e5d83c74SPaolo Bonzini switch (cap->cap) {
46802a31b9dbSPaolo Bonzini #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
46813c9bd400SJay Zhou case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
46823c9bd400SJay Zhou u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
46833c9bd400SJay Zhou
46843c9bd400SJay Zhou if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
46853c9bd400SJay Zhou allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
46863c9bd400SJay Zhou
46873c9bd400SJay Zhou if (cap->flags || (cap->args[0] & ~allowed_options))
46882a31b9dbSPaolo Bonzini return -EINVAL;
46892a31b9dbSPaolo Bonzini kvm->manual_dirty_log_protect = cap->args[0];
46902a31b9dbSPaolo Bonzini return 0;
46913c9bd400SJay Zhou }
46922a31b9dbSPaolo Bonzini #endif
4693acd05785SDavid Matlack case KVM_CAP_HALT_POLL: {
4694acd05785SDavid Matlack if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4695acd05785SDavid Matlack return -EINVAL;
4696acd05785SDavid Matlack
4697acd05785SDavid Matlack kvm->max_halt_poll_ns = cap->args[0];
46989eb8ca04SDavid Matlack
46999eb8ca04SDavid Matlack /*
47009eb8ca04SDavid Matlack * Ensure kvm->override_halt_poll_ns does not become visible
47019eb8ca04SDavid Matlack * before kvm->max_halt_poll_ns.
47029eb8ca04SDavid Matlack *
47039eb8ca04SDavid Matlack * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
47049eb8ca04SDavid Matlack */
47059eb8ca04SDavid Matlack smp_wmb();
47069eb8ca04SDavid Matlack kvm->override_halt_poll_ns = true;
47079eb8ca04SDavid Matlack
4708acd05785SDavid Matlack return 0;
4709acd05785SDavid Matlack }
4710fb04a1edSPeter Xu case KVM_CAP_DIRTY_LOG_RING:
471117601bfeSMarc Zyngier case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
47127a2726ecSGavin Shan if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
47137a2726ecSGavin Shan return -EINVAL;
47147a2726ecSGavin Shan
4715fb04a1edSPeter Xu return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
471686bdf3ebSGavin Shan case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
471786bdf3ebSGavin Shan int r = -EINVAL;
471886bdf3ebSGavin Shan
471986bdf3ebSGavin Shan if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
472086bdf3ebSGavin Shan !kvm->dirty_ring_size || cap->flags)
472186bdf3ebSGavin Shan return r;
472286bdf3ebSGavin Shan
472386bdf3ebSGavin Shan mutex_lock(&kvm->slots_lock);
472486bdf3ebSGavin Shan
472586bdf3ebSGavin Shan /*
472686bdf3ebSGavin Shan * For simplicity, allow enabling ring+bitmap if and only if
472786bdf3ebSGavin Shan * there are no memslots, e.g. to ensure all memslots allocate
472886bdf3ebSGavin Shan * a bitmap after the capability is enabled.
472986bdf3ebSGavin Shan */
473086bdf3ebSGavin Shan if (kvm_are_all_memslots_empty(kvm)) {
473186bdf3ebSGavin Shan kvm->dirty_ring_with_bitmap = true;
473286bdf3ebSGavin Shan r = 0;
473386bdf3ebSGavin Shan }
473486bdf3ebSGavin Shan
473586bdf3ebSGavin Shan mutex_unlock(&kvm->slots_lock);
473686bdf3ebSGavin Shan
473786bdf3ebSGavin Shan return r;
473886bdf3ebSGavin Shan }
4739e5d83c74SPaolo Bonzini default:
4740e5d83c74SPaolo Bonzini return kvm_vm_ioctl_enable_cap(kvm, cap);
4741e5d83c74SPaolo Bonzini }
4742e5d83c74SPaolo Bonzini }
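/*
 * Illustrative userspace usage (sketch, not part of this file): enabling
 * KVM_CAP_HALT_POLL caps the per-VM halt-polling window, in nanoseconds.
 *
 *   struct kvm_enable_cap cap = {
 *           .cap  = KVM_CAP_HALT_POLL,
 *           .args = { 100000 },        // 100 us
 *   };
 *   ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 */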
4743e5d83c74SPaolo Bonzini
4744fcfe1baeSJing Zhang static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
4745fcfe1baeSJing Zhang size_t size, loff_t *offset)
4746fcfe1baeSJing Zhang {
4747fcfe1baeSJing Zhang struct kvm *kvm = file->private_data;
4748fcfe1baeSJing Zhang
4749fcfe1baeSJing Zhang return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
4750fcfe1baeSJing Zhang &kvm_vm_stats_desc[0], &kvm->stat,
4751fcfe1baeSJing Zhang sizeof(kvm->stat), user_buffer, size, offset);
4752fcfe1baeSJing Zhang }
4753fcfe1baeSJing Zhang
4754eed3013fSSean Christopherson static int kvm_vm_stats_release(struct inode *inode, struct file *file)
4755eed3013fSSean Christopherson {
4756eed3013fSSean Christopherson struct kvm *kvm = file->private_data;
4757eed3013fSSean Christopherson
4758eed3013fSSean Christopherson kvm_put_kvm(kvm);
4759eed3013fSSean Christopherson return 0;
4760eed3013fSSean Christopherson }
4761eed3013fSSean Christopherson
4762fcfe1baeSJing Zhang static const struct file_operations kvm_vm_stats_fops = {
4763fcfe1baeSJing Zhang .read = kvm_vm_stats_read,
4764eed3013fSSean Christopherson .release = kvm_vm_stats_release,
4765fcfe1baeSJing Zhang .llseek = noop_llseek,
4766fcfe1baeSJing Zhang };
4767fcfe1baeSJing Zhang
4768fcfe1baeSJing Zhang static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4769fcfe1baeSJing Zhang {
4770fcfe1baeSJing Zhang int fd;
4771fcfe1baeSJing Zhang struct file *file;
4772fcfe1baeSJing Zhang
4773fcfe1baeSJing Zhang fd = get_unused_fd_flags(O_CLOEXEC);
4774fcfe1baeSJing Zhang if (fd < 0)
4775fcfe1baeSJing Zhang return fd;
4776fcfe1baeSJing Zhang
4777fcfe1baeSJing Zhang file = anon_inode_getfile("kvm-vm-stats",
4778fcfe1baeSJing Zhang &kvm_vm_stats_fops, kvm, O_RDONLY);
4779fcfe1baeSJing Zhang if (IS_ERR(file)) {
4780fcfe1baeSJing Zhang put_unused_fd(fd);
4781fcfe1baeSJing Zhang return PTR_ERR(file);
4782fcfe1baeSJing Zhang }
4783eed3013fSSean Christopherson
4784eed3013fSSean Christopherson kvm_get_kvm(kvm);
4785eed3013fSSean Christopherson
4786fcfe1baeSJing Zhang file->f_mode |= FMODE_PREAD;
4787fcfe1baeSJing Zhang fd_install(fd, file);
4788fcfe1baeSJing Zhang
4789fcfe1baeSJing Zhang return fd;
4790fcfe1baeSJing Zhang }
4791fcfe1baeSJing Zhang
47920fce5623SAvi Kivity static long kvm_vm_ioctl(struct file *filp,
47930fce5623SAvi Kivity unsigned int ioctl, unsigned long arg)
47940fce5623SAvi Kivity {
47950fce5623SAvi Kivity struct kvm *kvm = filp->private_data;
47960fce5623SAvi Kivity void __user *argp = (void __user *)arg;
47970fce5623SAvi Kivity int r;
47980fce5623SAvi Kivity
4799f4d31653SPaolo Bonzini if (kvm->mm != current->mm || kvm->vm_dead)
48000fce5623SAvi Kivity return -EIO;
48010fce5623SAvi Kivity switch (ioctl) {
48020fce5623SAvi Kivity case KVM_CREATE_VCPU:
48030fce5623SAvi Kivity r = kvm_vm_ioctl_create_vcpu(kvm, arg);
48040fce5623SAvi Kivity break;
4805e5d83c74SPaolo Bonzini case KVM_ENABLE_CAP: {
4806e5d83c74SPaolo Bonzini struct kvm_enable_cap cap;
4807e5d83c74SPaolo Bonzini
4808e5d83c74SPaolo Bonzini r = -EFAULT;
4809e5d83c74SPaolo Bonzini if (copy_from_user(&cap, argp, sizeof(cap)))
4810e5d83c74SPaolo Bonzini goto out;
4811e5d83c74SPaolo Bonzini r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
4812e5d83c74SPaolo Bonzini break;
4813e5d83c74SPaolo Bonzini }
48140fce5623SAvi Kivity case KVM_SET_USER_MEMORY_REGION: {
48150fce5623SAvi Kivity struct kvm_userspace_memory_region kvm_userspace_mem;
48160fce5623SAvi Kivity
48170fce5623SAvi Kivity r = -EFAULT;
48180fce5623SAvi Kivity if (copy_from_user(&kvm_userspace_mem, argp,
4819893bdbf1SXiubo Li sizeof(kvm_userspace_mem)))
48200fce5623SAvi Kivity goto out;
48210fce5623SAvi Kivity
482247ae31e2STakuya Yoshikawa r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
48230fce5623SAvi Kivity break;
48240fce5623SAvi Kivity }
48250fce5623SAvi Kivity case KVM_GET_DIRTY_LOG: {
48260fce5623SAvi Kivity struct kvm_dirty_log log;
48270fce5623SAvi Kivity
48280fce5623SAvi Kivity r = -EFAULT;
4829893bdbf1SXiubo Li if (copy_from_user(&log, argp, sizeof(log)))
48300fce5623SAvi Kivity goto out;
48310fce5623SAvi Kivity r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
48320fce5623SAvi Kivity break;
48330fce5623SAvi Kivity }
48342a31b9dbSPaolo Bonzini #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
48352a31b9dbSPaolo Bonzini case KVM_CLEAR_DIRTY_LOG: {
48362a31b9dbSPaolo Bonzini struct kvm_clear_dirty_log log;
48372a31b9dbSPaolo Bonzini
48382a31b9dbSPaolo Bonzini r = -EFAULT;
48392a31b9dbSPaolo Bonzini if (copy_from_user(&log, argp, sizeof(log)))
48402a31b9dbSPaolo Bonzini goto out;
48412a31b9dbSPaolo Bonzini r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
48422a31b9dbSPaolo Bonzini break;
48432a31b9dbSPaolo Bonzini }
48442a31b9dbSPaolo Bonzini #endif
48454b4357e0SPaolo Bonzini #ifdef CONFIG_KVM_MMIO
48465f94c174SLaurent Vivier case KVM_REGISTER_COALESCED_MMIO: {
48475f94c174SLaurent Vivier struct kvm_coalesced_mmio_zone zone;
4848f95ef0cdSXiubo Li
48495f94c174SLaurent Vivier r = -EFAULT;
4850893bdbf1SXiubo Li if (copy_from_user(&zone, argp, sizeof(zone)))
48515f94c174SLaurent Vivier goto out;
48525f94c174SLaurent Vivier r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
48535f94c174SLaurent Vivier break;
48545f94c174SLaurent Vivier }
48555f94c174SLaurent Vivier case KVM_UNREGISTER_COALESCED_MMIO: {
48565f94c174SLaurent Vivier struct kvm_coalesced_mmio_zone zone;
4857f95ef0cdSXiubo Li
48585f94c174SLaurent Vivier r = -EFAULT;
4859893bdbf1SXiubo Li if (copy_from_user(&zone, argp, sizeof(zone)))
48605f94c174SLaurent Vivier goto out;
48615f94c174SLaurent Vivier r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
48625f94c174SLaurent Vivier break;
48635f94c174SLaurent Vivier }
48645f94c174SLaurent Vivier #endif
4865721eecbfSGregory Haskins case KVM_IRQFD: {
4866721eecbfSGregory Haskins struct kvm_irqfd data;
4867721eecbfSGregory Haskins
4868721eecbfSGregory Haskins r = -EFAULT;
4869893bdbf1SXiubo Li if (copy_from_user(&data, argp, sizeof(data)))
4870721eecbfSGregory Haskins goto out;
4871d4db2935SAlex Williamson r = kvm_irqfd(kvm, &data);
4872721eecbfSGregory Haskins break;
4873721eecbfSGregory Haskins }
4874d34e6b17SGregory Haskins case KVM_IOEVENTFD: {
4875d34e6b17SGregory Haskins struct kvm_ioeventfd data;
4876d34e6b17SGregory Haskins
4877d34e6b17SGregory Haskins r = -EFAULT;
4878893bdbf1SXiubo Li if (copy_from_user(&data, argp, sizeof(data)))
4879d34e6b17SGregory Haskins goto out;
4880d34e6b17SGregory Haskins r = kvm_ioeventfd(kvm, &data);
4881d34e6b17SGregory Haskins break;
4882d34e6b17SGregory Haskins }
488307975ad3SJan Kiszka #ifdef CONFIG_HAVE_KVM_MSI
488407975ad3SJan Kiszka case KVM_SIGNAL_MSI: {
488507975ad3SJan Kiszka struct kvm_msi msi;
488607975ad3SJan Kiszka
488707975ad3SJan Kiszka r = -EFAULT;
4888893bdbf1SXiubo Li if (copy_from_user(&msi, argp, sizeof(msi)))
488907975ad3SJan Kiszka goto out;
489007975ad3SJan Kiszka r = kvm_send_userspace_msi(kvm, &msi);
489107975ad3SJan Kiszka break;
489207975ad3SJan Kiszka }
489307975ad3SJan Kiszka #endif
489423d43cf9SChristoffer Dall #ifdef __KVM_HAVE_IRQ_LINE
489523d43cf9SChristoffer Dall case KVM_IRQ_LINE_STATUS:
489623d43cf9SChristoffer Dall case KVM_IRQ_LINE: {
489723d43cf9SChristoffer Dall struct kvm_irq_level irq_event;
489823d43cf9SChristoffer Dall
489923d43cf9SChristoffer Dall r = -EFAULT;
4900893bdbf1SXiubo Li if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
490123d43cf9SChristoffer Dall goto out;
490223d43cf9SChristoffer Dall
4903aa2fbe6dSYang Zhang r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
4904aa2fbe6dSYang Zhang ioctl == KVM_IRQ_LINE_STATUS);
490523d43cf9SChristoffer Dall if (r)
490623d43cf9SChristoffer Dall goto out;
490723d43cf9SChristoffer Dall
490823d43cf9SChristoffer Dall r = -EFAULT;
490923d43cf9SChristoffer Dall if (ioctl == KVM_IRQ_LINE_STATUS) {
4910893bdbf1SXiubo Li if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
491123d43cf9SChristoffer Dall goto out;
491223d43cf9SChristoffer Dall }
491323d43cf9SChristoffer Dall
491423d43cf9SChristoffer Dall r = 0;
491523d43cf9SChristoffer Dall break;
491623d43cf9SChristoffer Dall }
491723d43cf9SChristoffer Dall #endif
4918aa8d5944SAlexander Graf #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4919aa8d5944SAlexander Graf case KVM_SET_GSI_ROUTING: {
4920aa8d5944SAlexander Graf struct kvm_irq_routing routing;
4921aa8d5944SAlexander Graf struct kvm_irq_routing __user *urouting;
4922f8c1b85bSPaolo Bonzini struct kvm_irq_routing_entry *entries = NULL;
4923aa8d5944SAlexander Graf
4924aa8d5944SAlexander Graf r = -EFAULT;
4925aa8d5944SAlexander Graf if (copy_from_user(&routing, argp, sizeof(routing)))
4926aa8d5944SAlexander Graf goto out;
4927aa8d5944SAlexander Graf r = -EINVAL;
49285c0aea0eSDavid Hildenbrand if (!kvm_arch_can_set_irq_routing(kvm))
49295c0aea0eSDavid Hildenbrand goto out;
4930caf1ff26SXiubo Li if (routing.nr > KVM_MAX_IRQ_ROUTES)
4931aa8d5944SAlexander Graf goto out;
4932aa8d5944SAlexander Graf if (routing.flags)
4933aa8d5944SAlexander Graf goto out;
4934f8c1b85bSPaolo Bonzini if (routing.nr) {
4935aa8d5944SAlexander Graf urouting = argp;
49367ec28e26SDenis Efremov entries = vmemdup_user(urouting->entries,
49377ec28e26SDenis Efremov array_size(sizeof(*entries),
49387ec28e26SDenis Efremov routing.nr));
49397ec28e26SDenis Efremov if (IS_ERR(entries)) {
49407ec28e26SDenis Efremov r = PTR_ERR(entries);
49417ec28e26SDenis Efremov goto out;
49427ec28e26SDenis Efremov }
4943f8c1b85bSPaolo Bonzini }
4944aa8d5944SAlexander Graf r = kvm_set_irq_routing(kvm, entries, routing.nr,
4945aa8d5944SAlexander Graf routing.flags);
49467ec28e26SDenis Efremov kvfree(entries);
4947aa8d5944SAlexander Graf break;
4948aa8d5944SAlexander Graf }
4949aa8d5944SAlexander Graf #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
4950852b6d57SScott Wood case KVM_CREATE_DEVICE: {
4951852b6d57SScott Wood struct kvm_create_device cd;
4952852b6d57SScott Wood
4953852b6d57SScott Wood r = -EFAULT;
4954852b6d57SScott Wood if (copy_from_user(&cd, argp, sizeof(cd)))
4955852b6d57SScott Wood goto out;
4956852b6d57SScott Wood
4957852b6d57SScott Wood r = kvm_ioctl_create_device(kvm, &cd);
4958852b6d57SScott Wood if (r)
4959852b6d57SScott Wood goto out;
4960852b6d57SScott Wood
4961852b6d57SScott Wood r = -EFAULT;
4962852b6d57SScott Wood if (copy_to_user(argp, &cd, sizeof(cd)))
4963852b6d57SScott Wood goto out;
4964852b6d57SScott Wood
4965852b6d57SScott Wood r = 0;
4966852b6d57SScott Wood break;
4967852b6d57SScott Wood }
496892b591a4SAlexander Graf case KVM_CHECK_EXTENSION:
496992b591a4SAlexander Graf r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
497092b591a4SAlexander Graf break;
4971fb04a1edSPeter Xu case KVM_RESET_DIRTY_RINGS:
4972fb04a1edSPeter Xu r = kvm_vm_ioctl_reset_dirty_pages(kvm);
4973fb04a1edSPeter Xu break;
4974fcfe1baeSJing Zhang case KVM_GET_STATS_FD:
4975fcfe1baeSJing Zhang r = kvm_vm_ioctl_get_stats_fd(kvm);
4976fcfe1baeSJing Zhang break;
49770fce5623SAvi Kivity default:
49780fce5623SAvi Kivity r = kvm_arch_vm_ioctl(filp, ioctl, arg);
49790fce5623SAvi Kivity }
49800fce5623SAvi Kivity out:
49810fce5623SAvi Kivity return r;
49820fce5623SAvi Kivity }
49830fce5623SAvi Kivity
4984de8e5d74SChristian Borntraeger #ifdef CONFIG_KVM_COMPAT
49856ff5894cSArnd Bergmann struct compat_kvm_dirty_log {
49866ff5894cSArnd Bergmann __u32 slot;
49876ff5894cSArnd Bergmann __u32 padding1;
49886ff5894cSArnd Bergmann union {
49896ff5894cSArnd Bergmann compat_uptr_t dirty_bitmap; /* one bit per page */
49906ff5894cSArnd Bergmann __u64 padding2;
49916ff5894cSArnd Bergmann };
49926ff5894cSArnd Bergmann };
49936ff5894cSArnd Bergmann
49948750f9bbSPaolo Bonzini struct compat_kvm_clear_dirty_log {
49958750f9bbSPaolo Bonzini __u32 slot;
49968750f9bbSPaolo Bonzini __u32 num_pages;
49978750f9bbSPaolo Bonzini __u64 first_page;
49988750f9bbSPaolo Bonzini union {
49998750f9bbSPaolo Bonzini compat_uptr_t dirty_bitmap; /* one bit per page */
50008750f9bbSPaolo Bonzini __u64 padding2;
50018750f9bbSPaolo Bonzini };
50028750f9bbSPaolo Bonzini };
50038750f9bbSPaolo Bonzini
5004ed51862fSAlexander Graf long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
5005ed51862fSAlexander Graf unsigned long arg)
5006ed51862fSAlexander Graf {
5007ed51862fSAlexander Graf return -ENOTTY;
5008ed51862fSAlexander Graf }
5009ed51862fSAlexander Graf
50106ff5894cSArnd Bergmann static long kvm_vm_compat_ioctl(struct file *filp,
50116ff5894cSArnd Bergmann unsigned int ioctl, unsigned long arg)
50126ff5894cSArnd Bergmann {
50136ff5894cSArnd Bergmann struct kvm *kvm = filp->private_data;
50146ff5894cSArnd Bergmann int r;
50156ff5894cSArnd Bergmann
5016f4d31653SPaolo Bonzini if (kvm->mm != current->mm || kvm->vm_dead)
50176ff5894cSArnd Bergmann return -EIO;
5018ed51862fSAlexander Graf
5019ed51862fSAlexander Graf r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
5020ed51862fSAlexander Graf if (r != -ENOTTY)
5021ed51862fSAlexander Graf return r;
5022ed51862fSAlexander Graf
50236ff5894cSArnd Bergmann switch (ioctl) {
50248750f9bbSPaolo Bonzini #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
50258750f9bbSPaolo Bonzini case KVM_CLEAR_DIRTY_LOG: {
50268750f9bbSPaolo Bonzini struct compat_kvm_clear_dirty_log compat_log;
50278750f9bbSPaolo Bonzini struct kvm_clear_dirty_log log;
50288750f9bbSPaolo Bonzini
50298750f9bbSPaolo Bonzini if (copy_from_user(&compat_log, (void __user *)arg,
50308750f9bbSPaolo Bonzini sizeof(compat_log)))
50318750f9bbSPaolo Bonzini return -EFAULT;
50328750f9bbSPaolo Bonzini log.slot = compat_log.slot;
50338750f9bbSPaolo Bonzini log.num_pages = compat_log.num_pages;
50348750f9bbSPaolo Bonzini log.first_page = compat_log.first_page;
50358750f9bbSPaolo Bonzini log.padding2 = compat_log.padding2;
50368750f9bbSPaolo Bonzini log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
50378750f9bbSPaolo Bonzini
50388750f9bbSPaolo Bonzini r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
50398750f9bbSPaolo Bonzini break;
50408750f9bbSPaolo Bonzini }
50418750f9bbSPaolo Bonzini #endif
50426ff5894cSArnd Bergmann case KVM_GET_DIRTY_LOG: {
50436ff5894cSArnd Bergmann struct compat_kvm_dirty_log compat_log;
50446ff5894cSArnd Bergmann struct kvm_dirty_log log;
50456ff5894cSArnd Bergmann
50466ff5894cSArnd Bergmann if (copy_from_user(&compat_log, (void __user *)arg,
50476ff5894cSArnd Bergmann sizeof(compat_log)))
5048f6a3b168SMarkus Elfring return -EFAULT;
50496ff5894cSArnd Bergmann log.slot = compat_log.slot;
50506ff5894cSArnd Bergmann log.padding1 = compat_log.padding1;
50516ff5894cSArnd Bergmann log.padding2 = compat_log.padding2;
50526ff5894cSArnd Bergmann log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
50536ff5894cSArnd Bergmann
50546ff5894cSArnd Bergmann r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
50556ff5894cSArnd Bergmann break;
50566ff5894cSArnd Bergmann }
50576ff5894cSArnd Bergmann default:
50586ff5894cSArnd Bergmann r = kvm_vm_ioctl(filp, ioctl, arg);
50596ff5894cSArnd Bergmann }
50606ff5894cSArnd Bergmann return r;
50616ff5894cSArnd Bergmann }
50626ff5894cSArnd Bergmann #endif
50636ff5894cSArnd Bergmann
506470375c2dSDavid Matlack static const struct file_operations kvm_vm_fops = {
50650fce5623SAvi Kivity .release = kvm_vm_release,
50660fce5623SAvi Kivity .unlocked_ioctl = kvm_vm_ioctl,
50676038f373SArnd Bergmann .llseek = noop_llseek,
50687ddfd3e0SMarc Zyngier KVM_COMPAT(kvm_vm_compat_ioctl),
50690fce5623SAvi Kivity };
50700fce5623SAvi Kivity
507154526d1fSNathan Tempelman bool file_is_kvm(struct file *file)
507254526d1fSNathan Tempelman {
507354526d1fSNathan Tempelman return file && file->f_op == &kvm_vm_fops;
507454526d1fSNathan Tempelman }
507554526d1fSNathan Tempelman EXPORT_SYMBOL_GPL(file_is_kvm);
507654526d1fSNathan Tempelman
5077e08b9637SCarsten Otte static int kvm_dev_ioctl_create_vm(unsigned long type)
50780fce5623SAvi Kivity {
507959f82aadSOliver Upton char fdname[ITOA_MAX_LEN + 1];
508020020f4cSOliver Upton int r, fd;
50810fce5623SAvi Kivity struct kvm *kvm;
5082506cfba9SAl Viro struct file *file;
50830fce5623SAvi Kivity
508420020f4cSOliver Upton fd = get_unused_fd_flags(O_CLOEXEC);
508520020f4cSOliver Upton if (fd < 0)
508620020f4cSOliver Upton return fd;
508720020f4cSOliver Upton
508859f82aadSOliver Upton snprintf(fdname, sizeof(fdname), "%d", fd);
508959f82aadSOliver Upton
5090b74ed7a6SOliver Upton kvm = kvm_create_vm(type, fdname);
509120020f4cSOliver Upton if (IS_ERR(kvm)) {
509220020f4cSOliver Upton r = PTR_ERR(kvm);
509320020f4cSOliver Upton goto put_fd;
509420020f4cSOliver Upton }
509520020f4cSOliver Upton
5096506cfba9SAl Viro file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
5097506cfba9SAl Viro if (IS_ERR(file)) {
509878588335SMarkus Elfring r = PTR_ERR(file);
509978588335SMarkus Elfring goto put_kvm;
5100506cfba9SAl Viro }
5101536a6f88SJanosch Frank
5102525df861SPaolo Bonzini /*
5103525df861SPaolo Bonzini * Don't call kvm_put_kvm() from this point on; file->f_op is
5104525df861SPaolo Bonzini * already set, with ->release() being kvm_vm_release(). On error,
5105525df861SPaolo Bonzini * the final fput(file) invokes kvm_vm_release(), which takes
5106525df861SPaolo Bonzini * care of calling kvm_put_kvm(kvm).
5107525df861SPaolo Bonzini */
5108286de8f6SClaudio Imbrenda kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
51090fce5623SAvi Kivity
511020020f4cSOliver Upton fd_install(fd, file);
511120020f4cSOliver Upton return fd;
511278588335SMarkus Elfring
511378588335SMarkus Elfring put_kvm:
511478588335SMarkus Elfring kvm_put_kvm(kvm);
511520020f4cSOliver Upton put_fd:
511620020f4cSOliver Upton put_unused_fd(fd);
511778588335SMarkus Elfring return r;
51180fce5623SAvi Kivity }
51190fce5623SAvi Kivity
51200fce5623SAvi Kivity static long kvm_dev_ioctl(struct file *filp,
51210fce5623SAvi Kivity unsigned int ioctl, unsigned long arg)
51220fce5623SAvi Kivity {
5123f15ba52bSThomas Huth int r = -EINVAL;
51240fce5623SAvi Kivity
51250fce5623SAvi Kivity switch (ioctl) {
51260fce5623SAvi Kivity case KVM_GET_API_VERSION:
51270fce5623SAvi Kivity if (arg)
51280fce5623SAvi Kivity goto out;
51290fce5623SAvi Kivity r = KVM_API_VERSION;
51300fce5623SAvi Kivity break;
51310fce5623SAvi Kivity case KVM_CREATE_VM:
5132e08b9637SCarsten Otte r = kvm_dev_ioctl_create_vm(arg);
51330fce5623SAvi Kivity break;
51340fce5623SAvi Kivity case KVM_CHECK_EXTENSION:
5135784aa3d7SAlexander Graf r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
51360fce5623SAvi Kivity break;
51370fce5623SAvi Kivity case KVM_GET_VCPU_MMAP_SIZE:
51380fce5623SAvi Kivity if (arg)
51390fce5623SAvi Kivity goto out;
5140adb1ff46SAvi Kivity r = PAGE_SIZE; /* struct kvm_run */
5141adb1ff46SAvi Kivity #ifdef CONFIG_X86
5142adb1ff46SAvi Kivity r += PAGE_SIZE; /* pio data page */
5143adb1ff46SAvi Kivity #endif
51444b4357e0SPaolo Bonzini #ifdef CONFIG_KVM_MMIO
51455f94c174SLaurent Vivier r += PAGE_SIZE; /* coalesced mmio ring page */
51465f94c174SLaurent Vivier #endif
51470fce5623SAvi Kivity break;
5148d4c9ff2dSFeng(Eric) Liu case KVM_TRACE_ENABLE:
5149d4c9ff2dSFeng(Eric) Liu case KVM_TRACE_PAUSE:
5150d4c9ff2dSFeng(Eric) Liu case KVM_TRACE_DISABLE:
51512023a29cSMarcelo Tosatti r = -EOPNOTSUPP;
5152d4c9ff2dSFeng(Eric) Liu break;
51530fce5623SAvi Kivity default:
51540fce5623SAvi Kivity return kvm_arch_dev_ioctl(filp, ioctl, arg);
51550fce5623SAvi Kivity }
51560fce5623SAvi Kivity out:
51570fce5623SAvi Kivity return r;
51580fce5623SAvi Kivity }
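/*
 * Illustrative bring-up sequence from userspace (sketch, not part of this
 * file): open /dev/kvm, check the API version, then create a VM.
 *
 *   kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
 *   if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
 *           exit(1);
 *   vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
 */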
51590fce5623SAvi Kivity
51600fce5623SAvi Kivity static struct file_operations kvm_chardev_ops = {
51610fce5623SAvi Kivity .unlocked_ioctl = kvm_dev_ioctl,
51626038f373SArnd Bergmann .llseek = noop_llseek,
51637ddfd3e0SMarc Zyngier KVM_COMPAT(kvm_dev_ioctl),
51640fce5623SAvi Kivity };
51650fce5623SAvi Kivity
51660fce5623SAvi Kivity static struct miscdevice kvm_dev = {
51670fce5623SAvi Kivity KVM_MINOR,
51680fce5623SAvi Kivity "kvm",
51690fce5623SAvi Kivity &kvm_chardev_ops,
51700fce5623SAvi Kivity };
51710fce5623SAvi Kivity
5172441f7bfaSSean Christopherson #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
5173441f7bfaSSean Christopherson __visible bool kvm_rebooting;
5174441f7bfaSSean Christopherson EXPORT_SYMBOL_GPL(kvm_rebooting);
5175441f7bfaSSean Christopherson
5176441f7bfaSSean Christopherson static DEFINE_PER_CPU(bool, hardware_enabled);
5177*4777225eSSean Christopherson static DEFINE_MUTEX(kvm_usage_lock);
5178441f7bfaSSean Christopherson static int kvm_usage_count;
5179441f7bfaSSean Christopherson
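/*
 * Hardware virtualization is enabled on each online CPU while at least one
 * user (typically a VM) exists and disabled again when the last one goes
 * away; kvm_usage_count tracks the users and is serialized by kvm_usage_lock.
 */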
5180e6fb7d6eSIsaku Yamahata static int __hardware_enable_nolock(void)
51810fce5623SAvi Kivity {
518237d25881SSean Christopherson if (__this_cpu_read(hardware_enabled))
5183e6fb7d6eSIsaku Yamahata return 0;
518410474ae8SAlexander Graf
518537d25881SSean Christopherson if (kvm_arch_hardware_enable()) {
518637d25881SSean Christopherson pr_info("kvm: enabling virtualization on CPU%d failed\n",
518737d25881SSean Christopherson raw_smp_processor_id());
5188e6fb7d6eSIsaku Yamahata return -EIO;
518910474ae8SAlexander Graf }
519037d25881SSean Christopherson
519137d25881SSean Christopherson __this_cpu_write(hardware_enabled, true);
5192e6fb7d6eSIsaku Yamahata return 0;
5193e6fb7d6eSIsaku Yamahata }
5194e6fb7d6eSIsaku Yamahata
5195e6fb7d6eSIsaku Yamahata static void hardware_enable_nolock(void *failed)
5196e6fb7d6eSIsaku Yamahata {
5197e6fb7d6eSIsaku Yamahata if (__hardware_enable_nolock())
5198e6fb7d6eSIsaku Yamahata atomic_inc(failed);
51990fce5623SAvi Kivity }
52000fce5623SAvi Kivity
5201aaf12a7bSChao Gao static int kvm_online_cpu(unsigned int cpu)
520275b7127cSTakuya Yoshikawa {
5203aaf12a7bSChao Gao int ret = 0;
5204aaf12a7bSChao Gao
5205aaf12a7bSChao Gao /*
5206aaf12a7bSChao Gao * Abort the CPU online process if hardware virtualization cannot
5207aaf12a7bSChao Gao * be enabled. Otherwise running VMs would encounter unrecoverable
5208aaf12a7bSChao Gao * errors when scheduled to this CPU.
5209aaf12a7bSChao Gao */
5210*4777225eSSean Christopherson mutex_lock(&kvm_usage_lock);
5211e6fb7d6eSIsaku Yamahata if (kvm_usage_count)
5212e6fb7d6eSIsaku Yamahata ret = __hardware_enable_nolock();
5213*4777225eSSean Christopherson mutex_unlock(&kvm_usage_lock);
5214aaf12a7bSChao Gao return ret;
521575b7127cSTakuya Yoshikawa }
521675b7127cSTakuya Yoshikawa
521775b7127cSTakuya Yoshikawa static void hardware_disable_nolock(void *junk)
52180fce5623SAvi Kivity {
521937d25881SSean Christopherson /*
522037d25881SSean Christopherson * Note, hardware_disable_all_nolock() tells all online CPUs to disable
522137d25881SSean Christopherson * hardware, not just CPUs that successfully enabled hardware!
522237d25881SSean Christopherson */
522337d25881SSean Christopherson if (!__this_cpu_read(hardware_enabled))
52240fce5623SAvi Kivity return;
522537d25881SSean Christopherson
522613a34e06SRadim Krčmář kvm_arch_hardware_disable();
522737d25881SSean Christopherson
522837d25881SSean Christopherson __this_cpu_write(hardware_enabled, false);
52290fce5623SAvi Kivity }
52300fce5623SAvi Kivity
5231aaf12a7bSChao Gao static int kvm_offline_cpu(unsigned int cpu)
523275b7127cSTakuya Yoshikawa {
5233*4777225eSSean Christopherson mutex_lock(&kvm_usage_lock);
52344fa92fb2SPaolo Bonzini if (kvm_usage_count)
52354fa92fb2SPaolo Bonzini hardware_disable_nolock(NULL);
5236*4777225eSSean Christopherson mutex_unlock(&kvm_usage_lock);
52378c18b2d2SThomas Gleixner return 0;
523875b7127cSTakuya Yoshikawa }
523975b7127cSTakuya Yoshikawa
524010474ae8SAlexander Graf static void hardware_disable_all_nolock(void)
524110474ae8SAlexander Graf {
524210474ae8SAlexander Graf BUG_ON(!kvm_usage_count);
524310474ae8SAlexander Graf
524410474ae8SAlexander Graf kvm_usage_count--;
524510474ae8SAlexander Graf if (!kvm_usage_count)
524675b7127cSTakuya Yoshikawa on_each_cpu(hardware_disable_nolock, NULL, 1);
524710474ae8SAlexander Graf }
524810474ae8SAlexander Graf
524910474ae8SAlexander Graf static void hardware_disable_all(void)
525010474ae8SAlexander Graf {
5251e4aa7f88SChao Gao cpus_read_lock();
5252*4777225eSSean Christopherson mutex_lock(&kvm_usage_lock);
525310474ae8SAlexander Graf hardware_disable_all_nolock();
5254*4777225eSSean Christopherson mutex_unlock(&kvm_usage_lock);
5255e4aa7f88SChao Gao cpus_read_unlock();
525610474ae8SAlexander Graf }
525710474ae8SAlexander Graf
525810474ae8SAlexander Graf static int hardware_enable_all(void)
525910474ae8SAlexander Graf {
5260e6fb7d6eSIsaku Yamahata atomic_t failed = ATOMIC_INIT(0);
5261e0ceec22SSean Christopherson int r;
5262e0ceec22SSean Christopherson
5263e0ceec22SSean Christopherson /*
5264e0ceec22SSean Christopherson * Do not enable hardware virtualization if the system is going down.
5265e0ceec22SSean Christopherson * If userspace initiated a forced reboot, e.g. reboot -f, then it's
5266e0ceec22SSean Christopherson * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
5267e0ceec22SSean Christopherson * after kvm_reboot() is called. Note, this relies on system_state
5268e0ceec22SSean Christopherson * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
5269e0ceec22SSean Christopherson * hook instead of registering a dedicated reboot notifier (the latter
5270e0ceec22SSean Christopherson * runs before system_state is updated).
5271e0ceec22SSean Christopherson */
5272e0ceec22SSean Christopherson if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
5273e0ceec22SSean Christopherson system_state == SYSTEM_RESTART)
5274e0ceec22SSean Christopherson return -EBUSY;
527510474ae8SAlexander Graf
5276e4aa7f88SChao Gao /*
5277e4aa7f88SChao Gao * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
5278e4aa7f88SChao Gao * is called, and so on_each_cpu() between them includes the CPU that
5279e4aa7f88SChao Gao * is being onlined. As a result, hardware_enable_nolock() may get
5280e4aa7f88SChao Gao * invoked before kvm_online_cpu(), which also enables hardware if the
5281e4aa7f88SChao Gao * usage count is non-zero. Disable CPU hotplug to avoid attempting to
5282e4aa7f88SChao Gao * enable hardware multiple times.
5283e4aa7f88SChao Gao */
5284e4aa7f88SChao Gao cpus_read_lock();
5285*4777225eSSean Christopherson mutex_lock(&kvm_usage_lock);
528610474ae8SAlexander Graf
5287e0ceec22SSean Christopherson r = 0;
5288e0ceec22SSean Christopherson
528910474ae8SAlexander Graf kvm_usage_count++;
529010474ae8SAlexander Graf if (kvm_usage_count == 1) {
5291e6fb7d6eSIsaku Yamahata on_each_cpu(hardware_enable_nolock, &failed, 1);
529210474ae8SAlexander Graf
5293e6fb7d6eSIsaku Yamahata if (atomic_read(&failed)) {
529410474ae8SAlexander Graf hardware_disable_all_nolock();
529510474ae8SAlexander Graf r = -EBUSY;
529610474ae8SAlexander Graf }
529710474ae8SAlexander Graf }
529810474ae8SAlexander Graf
5299*4777225eSSean Christopherson mutex_unlock(&kvm_usage_lock);
5300e4aa7f88SChao Gao cpus_read_unlock();
530110474ae8SAlexander Graf
530210474ae8SAlexander Graf return r;
530310474ae8SAlexander Graf }
530410474ae8SAlexander Graf
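/*
 * Usage note (sketch, not new code): the enable/disable pair above is
 * reference counted per VM, roughly:
 *
 *	kvm_create_vm()  -> hardware_enable_all()   // first VM turns VMX/SVM on
 *	kvm_destroy_vm() -> hardware_disable_all()  // last VM turns it back off
 *
 * so hardware virtualization is only enabled on the host while at least one
 * VM exists (when CONFIG_KVM_GENERIC_HARDWARE_ENABLING is selected).
 */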
53056735150bSSean Christopherson static void kvm_shutdown(void)
53060fce5623SAvi Kivity {
53070fce5623SAvi Kivity /*
53086735150bSSean Christopherson * Disable hardware virtualization and set kvm_rebooting to indicate
53096735150bSSean Christopherson * that KVM has asynchronously disabled hardware virtualization, i.e.
53106735150bSSean Christopherson * that relevant errors and exceptions aren't entirely unexpected.
53116735150bSSean Christopherson * Some flavors of hardware virtualization need to be disabled before
53126735150bSSean Christopherson * transferring control to firmware (to perform shutdown/reboot), e.g.
53136735150bSSean Christopherson * on x86, virtualization can block INIT interrupts, which are used by
53146735150bSSean Christopherson * firmware to pull APs back under firmware control. Note, this path
53156735150bSSean Christopherson * is used for both shutdown and reboot scenarios, i.e. neither name is
53166735150bSSean Christopherson * 100% comprehensive.
53170fce5623SAvi Kivity */
53181170adc6SXiubo Li pr_info("kvm: exiting hardware virtualization\n");
53194ecac3fdSAvi Kivity kvm_rebooting = true;
532075b7127cSTakuya Yoshikawa on_each_cpu(hardware_disable_nolock, NULL, 1);
53210fce5623SAvi Kivity }
53220fce5623SAvi Kivity
532335774a9fSSean Christopherson static int kvm_suspend(void)
532435774a9fSSean Christopherson {
532535774a9fSSean Christopherson /*
532635774a9fSSean Christopherson * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
5327*4777225eSSean Christopherson * callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage
5328*4777225eSSean Christopherson * count is stable. Assert that kvm_usage_lock is not held to ensure
5329*4777225eSSean Christopherson * the system isn't suspended while KVM is enabling hardware. Hardware
5330*4777225eSSean Christopherson * enabling can be preempted, but the task cannot be frozen until it has
5331*4777225eSSean Christopherson * dropped all locks (userspace tasks are frozen via a fake signal).
533235774a9fSSean Christopherson */
5333*4777225eSSean Christopherson lockdep_assert_not_held(&kvm_usage_lock);
533435774a9fSSean Christopherson lockdep_assert_irqs_disabled();
533535774a9fSSean Christopherson
533635774a9fSSean Christopherson if (kvm_usage_count)
533735774a9fSSean Christopherson hardware_disable_nolock(NULL);
533835774a9fSSean Christopherson return 0;
533935774a9fSSean Christopherson }
534035774a9fSSean Christopherson
534135774a9fSSean Christopherson static void kvm_resume(void)
534235774a9fSSean Christopherson {
5343*4777225eSSean Christopherson lockdep_assert_not_held(&kvm_usage_lock);
534435774a9fSSean Christopherson lockdep_assert_irqs_disabled();
534535774a9fSSean Christopherson
534635774a9fSSean Christopherson if (kvm_usage_count)
534735774a9fSSean Christopherson WARN_ON_ONCE(__hardware_enable_nolock());
534835774a9fSSean Christopherson }
534935774a9fSSean Christopherson
535035774a9fSSean Christopherson static struct syscore_ops kvm_syscore_ops = {
535135774a9fSSean Christopherson .suspend = kvm_suspend,
535235774a9fSSean Christopherson .resume = kvm_resume,
53536735150bSSean Christopherson .shutdown = kvm_shutdown,
535435774a9fSSean Christopherson };
5355441f7bfaSSean Christopherson #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
5356441f7bfaSSean Christopherson static int hardware_enable_all(void)
5357441f7bfaSSean Christopherson {
5358441f7bfaSSean Christopherson return 0;
5359441f7bfaSSean Christopherson }
5360441f7bfaSSean Christopherson
5361441f7bfaSSean Christopherson static void hardware_disable_all(void)
5362441f7bfaSSean Christopherson {
5363441f7bfaSSean Christopherson
5364441f7bfaSSean Christopherson }
5365441f7bfaSSean Christopherson #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
536635774a9fSSean Christopherson
53675ea5ca3cSWei Wang static void kvm_iodevice_destructor(struct kvm_io_device *dev)
53685ea5ca3cSWei Wang {
53695ea5ca3cSWei Wang if (dev->ops->destructor)
53705ea5ca3cSWei Wang dev->ops->destructor(dev);
53715ea5ca3cSWei Wang }
53725ea5ca3cSWei Wang
5373e93f8a0fSMarcelo Tosatti static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
53740fce5623SAvi Kivity {
53750fce5623SAvi Kivity int i;
53760fce5623SAvi Kivity
53770fce5623SAvi Kivity for (i = 0; i < bus->dev_count; i++) {
5378743eeb0bSSasha Levin struct kvm_io_device *pos = bus->range[i].dev;
53790fce5623SAvi Kivity
53800fce5623SAvi Kivity kvm_iodevice_destructor(pos);
53810fce5623SAvi Kivity }
5382e93f8a0fSMarcelo Tosatti kfree(bus);
53830fce5623SAvi Kivity }
53840fce5623SAvi Kivity
5385c21fbff1SPaolo Bonzini static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
5386a343c9b7SPaolo Bonzini const struct kvm_io_range *r2)
5387743eeb0bSSasha Levin {
53888f4216c7SJason Wang gpa_t addr1 = r1->addr;
53898f4216c7SJason Wang gpa_t addr2 = r2->addr;
53908f4216c7SJason Wang
53918f4216c7SJason Wang if (addr1 < addr2)
5392743eeb0bSSasha Levin return -1;
53938f4216c7SJason Wang
53948f4216c7SJason Wang /* If r2->len == 0, match the exact address. If r2->len != 0,
53958f4216c7SJason Wang * accept any overlapping access (this helper serves both the read and
53958f4216c7SJason Wang * write paths). Any order is acceptable for
53968f4216c7SJason Wang * overlapping ranges, because kvm_io_bus_get_first_dev ensures
53978f4216c7SJason Wang * we process all of them.
53988f4216c7SJason Wang */
53998f4216c7SJason Wang if (r2->len) {
54008f4216c7SJason Wang addr1 += r1->len;
54018f4216c7SJason Wang addr2 += r2->len;
54028f4216c7SJason Wang }
54038f4216c7SJason Wang
54048f4216c7SJason Wang if (addr1 > addr2)
5405743eeb0bSSasha Levin return 1;
54068f4216c7SJason Wang
5407743eeb0bSSasha Levin return 0;
5408743eeb0bSSasha Levin }
5409743eeb0bSSasha Levin
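/*
 * Worked example (illustrative, not from the source): comparing an access
 * r1 = {.addr = 0x1003, .len = 2} against a registered range
 * r2 = {.addr = 0x1000, .len = 8} skips the first branch (0x1003 >= 0x1000)
 * and, because r2->len != 0, compares the end addresses 0x1005 and 0x1008;
 * neither is greater in the "wrong" direction, so the function returns 0 and
 * the access is routed to that device. An access starting below r2->addr
 * returns -1 before the length check.
 */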
5410a343c9b7SPaolo Bonzini static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
5411a343c9b7SPaolo Bonzini {
5412c21fbff1SPaolo Bonzini return kvm_io_bus_cmp(p1, p2);
5413a343c9b7SPaolo Bonzini }
5414a343c9b7SPaolo Bonzini
541539369f7aSGeoff Levand static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
5416743eeb0bSSasha Levin gpa_t addr, int len)
5417743eeb0bSSasha Levin {
5418743eeb0bSSasha Levin struct kvm_io_range *range, key;
5419743eeb0bSSasha Levin int off;
5420743eeb0bSSasha Levin
5421743eeb0bSSasha Levin key = (struct kvm_io_range) {
5422743eeb0bSSasha Levin .addr = addr,
5423743eeb0bSSasha Levin .len = len,
5424743eeb0bSSasha Levin };
5425743eeb0bSSasha Levin
5426743eeb0bSSasha Levin range = bsearch(&key, bus->range, bus->dev_count,
5427743eeb0bSSasha Levin sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
5428743eeb0bSSasha Levin if (range == NULL)
5429743eeb0bSSasha Levin return -ENOENT;
5430743eeb0bSSasha Levin
5431743eeb0bSSasha Levin off = range - bus->range;
5432743eeb0bSSasha Levin
5433c21fbff1SPaolo Bonzini while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
5434743eeb0bSSasha Levin off--;
5435743eeb0bSSasha Levin
5436743eeb0bSSasha Levin return off;
5437743eeb0bSSasha Levin }
5438743eeb0bSSasha Levin
5439e32edf4fSNikolay Nikolaev static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5440126a5af5SCornelia Huck struct kvm_io_range *range, const void *val)
5441126a5af5SCornelia Huck {
5442126a5af5SCornelia Huck int idx;
5443126a5af5SCornelia Huck
5444126a5af5SCornelia Huck idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5445126a5af5SCornelia Huck if (idx < 0)
5446126a5af5SCornelia Huck return -EOPNOTSUPP;
5447126a5af5SCornelia Huck
5448126a5af5SCornelia Huck while (idx < bus->dev_count &&
5449c21fbff1SPaolo Bonzini kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5450e32edf4fSNikolay Nikolaev if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
5451126a5af5SCornelia Huck range->len, val))
5452126a5af5SCornelia Huck return idx;
5453126a5af5SCornelia Huck idx++;
5454126a5af5SCornelia Huck }
5455126a5af5SCornelia Huck
5456126a5af5SCornelia Huck return -EOPNOTSUPP;
5457126a5af5SCornelia Huck }
5458126a5af5SCornelia Huck
5459bda9020eSMichael S. Tsirkin /* kvm_io_bus_write - called under kvm->slots_lock */
5460e32edf4fSNikolay Nikolaev int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5461bda9020eSMichael S. Tsirkin int len, const void *val)
54620fce5623SAvi Kivity {
5463126a5af5SCornelia Huck struct kvm_io_bus *bus;
5464126a5af5SCornelia Huck struct kvm_io_range range;
5465126a5af5SCornelia Huck int r;
5466126a5af5SCornelia Huck
5467126a5af5SCornelia Huck range = (struct kvm_io_range) {
5468126a5af5SCornelia Huck .addr = addr,
5469126a5af5SCornelia Huck .len = len,
5470126a5af5SCornelia Huck };
5471126a5af5SCornelia Huck
5472e32edf4fSNikolay Nikolaev bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
547390db1043SDavid Hildenbrand if (!bus)
547490db1043SDavid Hildenbrand return -ENOMEM;
5475e32edf4fSNikolay Nikolaev r = __kvm_io_bus_write(vcpu, bus, &range, val);
5476126a5af5SCornelia Huck return r < 0 ? r : 0;
5477126a5af5SCornelia Huck }
5478a2420107SLeo Yan EXPORT_SYMBOL_GPL(kvm_io_bus_write);
5479126a5af5SCornelia Huck
5480126a5af5SCornelia Huck /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
5481e32edf4fSNikolay Nikolaev int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
5482e32edf4fSNikolay Nikolaev gpa_t addr, int len, const void *val, long cookie)
5483126a5af5SCornelia Huck {
548490d83dc3SLai Jiangshan struct kvm_io_bus *bus;
5485743eeb0bSSasha Levin struct kvm_io_range range;
5486743eeb0bSSasha Levin
5487743eeb0bSSasha Levin range = (struct kvm_io_range) {
5488743eeb0bSSasha Levin .addr = addr,
5489743eeb0bSSasha Levin .len = len,
5490743eeb0bSSasha Levin };
549190d83dc3SLai Jiangshan
5492e32edf4fSNikolay Nikolaev bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
549390db1043SDavid Hildenbrand if (!bus)
549490db1043SDavid Hildenbrand return -ENOMEM;
5495126a5af5SCornelia Huck
5496126a5af5SCornelia Huck /* First try the device referenced by cookie. */
5497126a5af5SCornelia Huck if ((cookie >= 0) && (cookie < bus->dev_count) &&
5498c21fbff1SPaolo Bonzini (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
5499e32edf4fSNikolay Nikolaev if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
5500126a5af5SCornelia Huck val))
5501126a5af5SCornelia Huck return cookie;
5502126a5af5SCornelia Huck
5503126a5af5SCornelia Huck /*
5504126a5af5SCornelia Huck * cookie contained garbage; fall back to search and return the
5505126a5af5SCornelia Huck * correct cookie value.
5506126a5af5SCornelia Huck */
5507e32edf4fSNikolay Nikolaev return __kvm_io_bus_write(vcpu, bus, &range, val);
5508126a5af5SCornelia Huck }
5509126a5af5SCornelia Huck
5510e32edf4fSNikolay Nikolaev static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5511e32edf4fSNikolay Nikolaev struct kvm_io_range *range, void *val)
5512126a5af5SCornelia Huck {
5513126a5af5SCornelia Huck int idx;
5514126a5af5SCornelia Huck
5515126a5af5SCornelia Huck idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5516743eeb0bSSasha Levin if (idx < 0)
5517743eeb0bSSasha Levin return -EOPNOTSUPP;
5518743eeb0bSSasha Levin
5519743eeb0bSSasha Levin while (idx < bus->dev_count &&
5520c21fbff1SPaolo Bonzini kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5521e32edf4fSNikolay Nikolaev if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
5522126a5af5SCornelia Huck range->len, val))
5523126a5af5SCornelia Huck return idx;
5524743eeb0bSSasha Levin idx++;
5525743eeb0bSSasha Levin }
5526743eeb0bSSasha Levin
5527bda9020eSMichael S. Tsirkin return -EOPNOTSUPP;
55280fce5623SAvi Kivity }
55290fce5623SAvi Kivity
5530bda9020eSMichael S. Tsirkin /* kvm_io_bus_read - called under kvm->slots_lock */
5531e32edf4fSNikolay Nikolaev int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5532e93f8a0fSMarcelo Tosatti int len, void *val)
5533bda9020eSMichael S. Tsirkin {
5534126a5af5SCornelia Huck struct kvm_io_bus *bus;
5535126a5af5SCornelia Huck struct kvm_io_range range;
5536126a5af5SCornelia Huck int r;
5537126a5af5SCornelia Huck
5538126a5af5SCornelia Huck range = (struct kvm_io_range) {
5539126a5af5SCornelia Huck .addr = addr,
5540126a5af5SCornelia Huck .len = len,
5541126a5af5SCornelia Huck };
5542126a5af5SCornelia Huck
5543e32edf4fSNikolay Nikolaev bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
554490db1043SDavid Hildenbrand if (!bus)
554590db1043SDavid Hildenbrand return -ENOMEM;
5546e32edf4fSNikolay Nikolaev r = __kvm_io_bus_read(vcpu, bus, &range, val);
5547126a5af5SCornelia Huck return r < 0 ? r : 0;
5548126a5af5SCornelia Huck }
5549126a5af5SCornelia Huck
555079fac95eSMarcelo Tosatti /* Caller must hold slots_lock. */
5551743eeb0bSSasha Levin int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
5552743eeb0bSSasha Levin int len, struct kvm_io_device *dev)
5553090b7affSGregory Haskins {
5554d4c67a7aSGal Hammer int i;
5555e93f8a0fSMarcelo Tosatti struct kvm_io_bus *new_bus, *bus;
5556d4c67a7aSGal Hammer struct kvm_io_range range;
5557090b7affSGregory Haskins
55584a12f951SChristian Borntraeger bus = kvm_get_bus(kvm, bus_idx);
555990db1043SDavid Hildenbrand if (!bus)
556090db1043SDavid Hildenbrand return -ENOMEM;
556190db1043SDavid Hildenbrand
55626ea34c9bSAmos Kong /* exclude ioeventfd which is limited by maximum fd */
55636ea34c9bSAmos Kong if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
5564090b7affSGregory Haskins return -ENOSPC;
5565090b7affSGregory Haskins
556690952cd3SGustavo A. R. Silva new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
5567b12ce36aSBen Gardon GFP_KERNEL_ACCOUNT);
5568e93f8a0fSMarcelo Tosatti if (!new_bus)
5569e93f8a0fSMarcelo Tosatti return -ENOMEM;
5570d4c67a7aSGal Hammer
5571d4c67a7aSGal Hammer range = (struct kvm_io_range) {
5572d4c67a7aSGal Hammer .addr = addr,
5573d4c67a7aSGal Hammer .len = len,
5574d4c67a7aSGal Hammer .dev = dev,
5575d4c67a7aSGal Hammer };
5576d4c67a7aSGal Hammer
5577d4c67a7aSGal Hammer for (i = 0; i < bus->dev_count; i++)
5578d4c67a7aSGal Hammer if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
5579d4c67a7aSGal Hammer break;
5580d4c67a7aSGal Hammer
5581d4c67a7aSGal Hammer memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
5582d4c67a7aSGal Hammer new_bus->dev_count++;
5583d4c67a7aSGal Hammer new_bus->range[i] = range;
5584d4c67a7aSGal Hammer memcpy(new_bus->range + i + 1, bus->range + i,
5585d4c67a7aSGal Hammer (bus->dev_count - i) * sizeof(struct kvm_io_range));
5586e93f8a0fSMarcelo Tosatti rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5587e93f8a0fSMarcelo Tosatti synchronize_srcu_expedited(&kvm->srcu);
5588e93f8a0fSMarcelo Tosatti kfree(bus);
5589090b7affSGregory Haskins
5590090b7affSGregory Haskins return 0;
5591090b7affSGregory Haskins }
5592090b7affSGregory Haskins
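/*
 * Registration sketch (hypothetical device, for illustration only): a caller
 * embeds a struct kvm_io_device, points it at its ops via kvm_iodevice_init(),
 * and registers it while holding kvm->slots_lock, along the lines of what the
 * coalesced MMIO code does. "mydev" and "mydev_ops" are placeholders:
 *
 *	kvm_iodevice_init(&mydev->dev, &mydev_ops);    // mydev_ops: .read/.write
 *	mutex_lock(&kvm->slots_lock);
 *	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, gpa, size, &mydev->dev);
 *	mutex_unlock(&kvm->slots_lock);
 */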
55935d3c4c79SSean Christopherson int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
55946c474694SMichael S. Tsirkin struct kvm_io_device *dev)
55956c474694SMichael S. Tsirkin {
55965ea5ca3cSWei Wang int i;
5597e93f8a0fSMarcelo Tosatti struct kvm_io_bus *new_bus, *bus;
55986c474694SMichael S. Tsirkin
55997c896d37SSean Christopherson lockdep_assert_held(&kvm->slots_lock);
56007c896d37SSean Christopherson
56014a12f951SChristian Borntraeger bus = kvm_get_bus(kvm, bus_idx);
5602df630b8cSPeter Xu if (!bus)
56035d3c4c79SSean Christopherson return 0;
5604df630b8cSPeter Xu
56057c896d37SSean Christopherson for (i = 0; i < bus->dev_count; i++) {
5606a1300716SAmos Kong if (bus->range[i].dev == dev) {
5607090b7affSGregory Haskins break;
5608090b7affSGregory Haskins }
56097c896d37SSean Christopherson }
5610e93f8a0fSMarcelo Tosatti
561190db1043SDavid Hildenbrand if (i == bus->dev_count)
56125d3c4c79SSean Christopherson return 0;
5613a1300716SAmos Kong
561490952cd3SGustavo A. R. Silva new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
5615b12ce36aSBen Gardon GFP_KERNEL_ACCOUNT);
5616f6588660SRustam Kovhaev if (new_bus) {
5617871c433bSRustam Kovhaev memcpy(new_bus, bus, struct_size(bus, range, i));
5618a1300716SAmos Kong new_bus->dev_count--;
5619a1300716SAmos Kong memcpy(new_bus->range + i, bus->range + i + 1,
5620871c433bSRustam Kovhaev flex_array_size(new_bus, range, new_bus->dev_count - i));
56212ee37574SSean Christopherson }
56222ee37574SSean Christopherson
56232ee37574SSean Christopherson rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
56242ee37574SSean Christopherson synchronize_srcu_expedited(&kvm->srcu);
56252ee37574SSean Christopherson
56265ea5ca3cSWei Wang /*
56275ea5ca3cSWei Wang * If NULL bus is installed, destroy the old bus, including all the
56285ea5ca3cSWei Wang * attached devices. Otherwise, destroy the caller's device only.
56295ea5ca3cSWei Wang */
56302ee37574SSean Christopherson if (!new_bus) {
5631f6588660SRustam Kovhaev pr_err("kvm: failed to shrink bus, removing it completely\n");
56325ea5ca3cSWei Wang kvm_io_bus_destroy(bus);
56335ea5ca3cSWei Wang return -ENOMEM;
5634f6588660SRustam Kovhaev }
5635e93f8a0fSMarcelo Tosatti
56365ea5ca3cSWei Wang kvm_iodevice_destructor(dev);
5637e93f8a0fSMarcelo Tosatti kfree(bus);
56385ea5ca3cSWei Wang return 0;
56390fce5623SAvi Kivity }
56400fce5623SAvi Kivity
56418a39d006SAndre Przywara struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
56428a39d006SAndre Przywara gpa_t addr)
56438a39d006SAndre Przywara {
56448a39d006SAndre Przywara struct kvm_io_bus *bus;
56458a39d006SAndre Przywara int dev_idx, srcu_idx;
56468a39d006SAndre Przywara struct kvm_io_device *iodev = NULL;
56478a39d006SAndre Przywara
56488a39d006SAndre Przywara srcu_idx = srcu_read_lock(&kvm->srcu);
56498a39d006SAndre Przywara
56508a39d006SAndre Przywara bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
565190db1043SDavid Hildenbrand if (!bus)
565290db1043SDavid Hildenbrand goto out_unlock;
56538a39d006SAndre Przywara
56548a39d006SAndre Przywara dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
56558a39d006SAndre Przywara if (dev_idx < 0)
56568a39d006SAndre Przywara goto out_unlock;
56578a39d006SAndre Przywara
56588a39d006SAndre Przywara iodev = bus->range[dev_idx].dev;
56598a39d006SAndre Przywara
56608a39d006SAndre Przywara out_unlock:
56618a39d006SAndre Przywara srcu_read_unlock(&kvm->srcu, srcu_idx);
56628a39d006SAndre Przywara
56638a39d006SAndre Przywara return iodev;
56648a39d006SAndre Przywara }
56658a39d006SAndre Przywara EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
56668a39d006SAndre Przywara
5667536a6f88SJanosch Frank static int kvm_debugfs_open(struct inode *inode, struct file *file,
5668536a6f88SJanosch Frank int (*get)(void *, u64 *), int (*set)(void *, u64),
5669536a6f88SJanosch Frank const char *fmt)
5670536a6f88SJanosch Frank {
5671180418e2SHou Wenlong int ret;
567214aa40a1SLi kunyu struct kvm_stat_data *stat_data = inode->i_private;
5673536a6f88SJanosch Frank
5674605c7130SPeter Xu /*
5675605c7130SPeter Xu * The debugfs files are a reference to the kvm struct which
5676605c7130SPeter Xu * is still valid when kvm_destroy_vm is called. kvm_get_kvm_safe
5677605c7130SPeter Xu * avoids the race between open and the removal of the debugfs directory.
5678536a6f88SJanosch Frank */
5679605c7130SPeter Xu if (!kvm_get_kvm_safe(stat_data->kvm))
5680536a6f88SJanosch Frank return -ENOENT;
5681536a6f88SJanosch Frank
5682180418e2SHou Wenlong ret = simple_attr_open(inode, file, get,
5683bc9e9e67SJing Zhang kvm_stats_debugfs_mode(stat_data->desc) & 0222
5684180418e2SHou Wenlong ? set : NULL, fmt);
5685180418e2SHou Wenlong if (ret)
5686536a6f88SJanosch Frank kvm_put_kvm(stat_data->kvm);
5687536a6f88SJanosch Frank
5688180418e2SHou Wenlong return ret;
5689536a6f88SJanosch Frank }
5690536a6f88SJanosch Frank
5691536a6f88SJanosch Frank static int kvm_debugfs_release(struct inode *inode, struct file *file)
5692536a6f88SJanosch Frank {
569314aa40a1SLi kunyu struct kvm_stat_data *stat_data = inode->i_private;
5694536a6f88SJanosch Frank
5695536a6f88SJanosch Frank simple_attr_release(inode, file);
5696536a6f88SJanosch Frank kvm_put_kvm(stat_data->kvm);
5697536a6f88SJanosch Frank
5698536a6f88SJanosch Frank return 0;
5699536a6f88SJanosch Frank }
5700536a6f88SJanosch Frank
570109cbcef6SMilan Pandurov static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
5702536a6f88SJanosch Frank {
5703bc9e9e67SJing Zhang *val = *(u64 *)((void *)(&kvm->stat) + offset);
5704536a6f88SJanosch Frank
5705536a6f88SJanosch Frank return 0;
5706536a6f88SJanosch Frank }
5707536a6f88SJanosch Frank
570809cbcef6SMilan Pandurov static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
5709ce35ef27SSuraj Jitindar Singh {
5710bc9e9e67SJing Zhang *(u64 *)((void *)(&kvm->stat) + offset) = 0;
5711ce35ef27SSuraj Jitindar Singh
5712ce35ef27SSuraj Jitindar Singh return 0;
5713ce35ef27SSuraj Jitindar Singh }
5714ce35ef27SSuraj Jitindar Singh
571509cbcef6SMilan Pandurov static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
5716536a6f88SJanosch Frank {
571746808a4cSMarc Zyngier unsigned long i;
5718536a6f88SJanosch Frank struct kvm_vcpu *vcpu;
5719536a6f88SJanosch Frank
5720536a6f88SJanosch Frank *val = 0;
5721536a6f88SJanosch Frank
572209cbcef6SMilan Pandurov kvm_for_each_vcpu(i, vcpu, kvm)
5723bc9e9e67SJing Zhang *val += *(u64 *)((void *)(&vcpu->stat) + offset);
5724536a6f88SJanosch Frank
5725536a6f88SJanosch Frank return 0;
5726536a6f88SJanosch Frank }
5727536a6f88SJanosch Frank
572809cbcef6SMilan Pandurov static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
5729ce35ef27SSuraj Jitindar Singh {
573046808a4cSMarc Zyngier unsigned long i;
5731ce35ef27SSuraj Jitindar Singh struct kvm_vcpu *vcpu;
5732ce35ef27SSuraj Jitindar Singh
573309cbcef6SMilan Pandurov kvm_for_each_vcpu(i, vcpu, kvm)
5734bc9e9e67SJing Zhang *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
573509cbcef6SMilan Pandurov
573609cbcef6SMilan Pandurov return 0;
573709cbcef6SMilan Pandurov }
573809cbcef6SMilan Pandurov
573909cbcef6SMilan Pandurov static int kvm_stat_data_get(void *data, u64 *val)
574009cbcef6SMilan Pandurov {
574109cbcef6SMilan Pandurov int r = -EFAULT;
574214aa40a1SLi kunyu struct kvm_stat_data *stat_data = data;
574309cbcef6SMilan Pandurov
5744bc9e9e67SJing Zhang switch (stat_data->kind) {
574509cbcef6SMilan Pandurov case KVM_STAT_VM:
574609cbcef6SMilan Pandurov r = kvm_get_stat_per_vm(stat_data->kvm,
5747bc9e9e67SJing Zhang stat_data->desc->desc.offset, val);
574809cbcef6SMilan Pandurov break;
574909cbcef6SMilan Pandurov case KVM_STAT_VCPU:
575009cbcef6SMilan Pandurov r = kvm_get_stat_per_vcpu(stat_data->kvm,
5751bc9e9e67SJing Zhang stat_data->desc->desc.offset, val);
575209cbcef6SMilan Pandurov break;
575309cbcef6SMilan Pandurov }
575409cbcef6SMilan Pandurov
575509cbcef6SMilan Pandurov return r;
575609cbcef6SMilan Pandurov }
575709cbcef6SMilan Pandurov
575809cbcef6SMilan Pandurov static int kvm_stat_data_clear(void *data, u64 val)
575909cbcef6SMilan Pandurov {
576009cbcef6SMilan Pandurov int r = -EFAULT;
576114aa40a1SLi kunyu struct kvm_stat_data *stat_data = data;
576209cbcef6SMilan Pandurov
5763ce35ef27SSuraj Jitindar Singh if (val)
5764ce35ef27SSuraj Jitindar Singh return -EINVAL;
5765ce35ef27SSuraj Jitindar Singh
5766bc9e9e67SJing Zhang switch (stat_data->kind) {
576709cbcef6SMilan Pandurov case KVM_STAT_VM:
576809cbcef6SMilan Pandurov r = kvm_clear_stat_per_vm(stat_data->kvm,
5769bc9e9e67SJing Zhang stat_data->desc->desc.offset);
577009cbcef6SMilan Pandurov break;
577109cbcef6SMilan Pandurov case KVM_STAT_VCPU:
577209cbcef6SMilan Pandurov r = kvm_clear_stat_per_vcpu(stat_data->kvm,
5773bc9e9e67SJing Zhang stat_data->desc->desc.offset);
577409cbcef6SMilan Pandurov break;
5775ce35ef27SSuraj Jitindar Singh }
5776ce35ef27SSuraj Jitindar Singh
577709cbcef6SMilan Pandurov return r;
577809cbcef6SMilan Pandurov }
577909cbcef6SMilan Pandurov
578009cbcef6SMilan Pandurov static int kvm_stat_data_open(struct inode *inode, struct file *file)
5781536a6f88SJanosch Frank {
5782536a6f88SJanosch Frank __simple_attr_check_format("%llu\n", 0ull);
578309cbcef6SMilan Pandurov return kvm_debugfs_open(inode, file, kvm_stat_data_get,
578409cbcef6SMilan Pandurov kvm_stat_data_clear, "%llu\n");
5785536a6f88SJanosch Frank }
5786536a6f88SJanosch Frank
578709cbcef6SMilan Pandurov static const struct file_operations stat_fops_per_vm = {
5788536a6f88SJanosch Frank .owner = THIS_MODULE,
578909cbcef6SMilan Pandurov .open = kvm_stat_data_open,
5790536a6f88SJanosch Frank .release = kvm_debugfs_release,
5791536a6f88SJanosch Frank .read = simple_attr_read,
5792536a6f88SJanosch Frank .write = simple_attr_write,
57933bed8888SGeliang Tang .llseek = no_llseek,
5794536a6f88SJanosch Frank };
5795536a6f88SJanosch Frank
57968b88b099SChristoph Hellwig static int vm_stat_get(void *_offset, u64 *val)
57970fce5623SAvi Kivity {
57980fce5623SAvi Kivity unsigned offset = (long)_offset;
57990fce5623SAvi Kivity struct kvm *kvm;
5800536a6f88SJanosch Frank u64 tmp_val;
58010fce5623SAvi Kivity
58028b88b099SChristoph Hellwig *val = 0;
58030d9ce162SJunaid Shahid mutex_lock(&kvm_lock);
5804536a6f88SJanosch Frank list_for_each_entry(kvm, &vm_list, vm_list) {
580509cbcef6SMilan Pandurov kvm_get_stat_per_vm(kvm, offset, &tmp_val);
5806536a6f88SJanosch Frank *val += tmp_val;
5807536a6f88SJanosch Frank }
58080d9ce162SJunaid Shahid mutex_unlock(&kvm_lock);
58098b88b099SChristoph Hellwig return 0;
58100fce5623SAvi Kivity }
58110fce5623SAvi Kivity
5812ce35ef27SSuraj Jitindar Singh static int vm_stat_clear(void *_offset, u64 val)
5813ce35ef27SSuraj Jitindar Singh {
5814ce35ef27SSuraj Jitindar Singh unsigned offset = (long)_offset;
5815ce35ef27SSuraj Jitindar Singh struct kvm *kvm;
5816ce35ef27SSuraj Jitindar Singh
5817ce35ef27SSuraj Jitindar Singh if (val)
5818ce35ef27SSuraj Jitindar Singh return -EINVAL;
5819ce35ef27SSuraj Jitindar Singh
58200d9ce162SJunaid Shahid mutex_lock(&kvm_lock);
5821ce35ef27SSuraj Jitindar Singh list_for_each_entry(kvm, &vm_list, vm_list) {
582209cbcef6SMilan Pandurov kvm_clear_stat_per_vm(kvm, offset);
5823ce35ef27SSuraj Jitindar Singh }
58240d9ce162SJunaid Shahid mutex_unlock(&kvm_lock);
5825ce35ef27SSuraj Jitindar Singh
5826ce35ef27SSuraj Jitindar Singh return 0;
5827ce35ef27SSuraj Jitindar Singh }
5828ce35ef27SSuraj Jitindar Singh
5829ce35ef27SSuraj Jitindar Singh DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
5830bc9e9e67SJing Zhang DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
58310fce5623SAvi Kivity
58328b88b099SChristoph Hellwig static int vcpu_stat_get(void *_offset, u64 *val)
58330fce5623SAvi Kivity {
58340fce5623SAvi Kivity unsigned offset = (long)_offset;
58350fce5623SAvi Kivity struct kvm *kvm;
5836536a6f88SJanosch Frank u64 tmp_val;
58370fce5623SAvi Kivity
58388b88b099SChristoph Hellwig *val = 0;
58390d9ce162SJunaid Shahid mutex_lock(&kvm_lock);
5840536a6f88SJanosch Frank list_for_each_entry(kvm, &vm_list, vm_list) {
584109cbcef6SMilan Pandurov kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
5842536a6f88SJanosch Frank *val += tmp_val;
5843536a6f88SJanosch Frank }
58440d9ce162SJunaid Shahid mutex_unlock(&kvm_lock);
58458b88b099SChristoph Hellwig return 0;
58460fce5623SAvi Kivity }
58470fce5623SAvi Kivity
5848ce35ef27SSuraj Jitindar Singh static int vcpu_stat_clear(void *_offset, u64 val)
5849ce35ef27SSuraj Jitindar Singh {
5850ce35ef27SSuraj Jitindar Singh unsigned offset = (long)_offset;
5851ce35ef27SSuraj Jitindar Singh struct kvm *kvm;
5852ce35ef27SSuraj Jitindar Singh
5853ce35ef27SSuraj Jitindar Singh if (val)
5854ce35ef27SSuraj Jitindar Singh return -EINVAL;
5855ce35ef27SSuraj Jitindar Singh
58560d9ce162SJunaid Shahid mutex_lock(&kvm_lock);
5857ce35ef27SSuraj Jitindar Singh list_for_each_entry(kvm, &vm_list, vm_list) {
585809cbcef6SMilan Pandurov kvm_clear_stat_per_vcpu(kvm, offset);
5859ce35ef27SSuraj Jitindar Singh }
58600d9ce162SJunaid Shahid mutex_unlock(&kvm_lock);
5861ce35ef27SSuraj Jitindar Singh
5862ce35ef27SSuraj Jitindar Singh return 0;
5863ce35ef27SSuraj Jitindar Singh }
5864ce35ef27SSuraj Jitindar Singh
5865ce35ef27SSuraj Jitindar Singh DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
5866ce35ef27SSuraj Jitindar Singh "%llu\n");
5867bc9e9e67SJing Zhang DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
58680fce5623SAvi Kivity
5869286de8f6SClaudio Imbrenda static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
5870286de8f6SClaudio Imbrenda {
5871286de8f6SClaudio Imbrenda struct kobj_uevent_env *env;
5872286de8f6SClaudio Imbrenda unsigned long long created, active;
5873286de8f6SClaudio Imbrenda
5874286de8f6SClaudio Imbrenda if (!kvm_dev.this_device || !kvm)
5875286de8f6SClaudio Imbrenda return;
5876286de8f6SClaudio Imbrenda
58770d9ce162SJunaid Shahid mutex_lock(&kvm_lock);
5878286de8f6SClaudio Imbrenda if (type == KVM_EVENT_CREATE_VM) {
5879286de8f6SClaudio Imbrenda kvm_createvm_count++;
5880286de8f6SClaudio Imbrenda kvm_active_vms++;
5881286de8f6SClaudio Imbrenda } else if (type == KVM_EVENT_DESTROY_VM) {
5882286de8f6SClaudio Imbrenda kvm_active_vms--;
5883286de8f6SClaudio Imbrenda }
5884286de8f6SClaudio Imbrenda created = kvm_createvm_count;
5885286de8f6SClaudio Imbrenda active = kvm_active_vms;
58860d9ce162SJunaid Shahid mutex_unlock(&kvm_lock);
5887286de8f6SClaudio Imbrenda
5888b12ce36aSBen Gardon env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
5889286de8f6SClaudio Imbrenda if (!env)
5890286de8f6SClaudio Imbrenda return;
5891286de8f6SClaudio Imbrenda
5892286de8f6SClaudio Imbrenda add_uevent_var(env, "CREATED=%llu", created);
5893286de8f6SClaudio Imbrenda add_uevent_var(env, "COUNT=%llu", active);
5894286de8f6SClaudio Imbrenda
5895fdeaf7e3SClaudio Imbrenda if (type == KVM_EVENT_CREATE_VM) {
5896286de8f6SClaudio Imbrenda add_uevent_var(env, "EVENT=create");
5897fdeaf7e3SClaudio Imbrenda kvm->userspace_pid = task_pid_nr(current);
5898fdeaf7e3SClaudio Imbrenda } else if (type == KVM_EVENT_DESTROY_VM) {
5899286de8f6SClaudio Imbrenda add_uevent_var(env, "EVENT=destroy");
5900fdeaf7e3SClaudio Imbrenda }
5901fdeaf7e3SClaudio Imbrenda add_uevent_var(env, "PID=%d", kvm->userspace_pid);
5902286de8f6SClaudio Imbrenda
5903a44a4cc1SOliver Upton if (!IS_ERR(kvm->debugfs_dentry)) {
5904b12ce36aSBen Gardon char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
5905286de8f6SClaudio Imbrenda
5906fdeaf7e3SClaudio Imbrenda if (p) {
5907fdeaf7e3SClaudio Imbrenda tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
5908fdeaf7e3SClaudio Imbrenda if (!IS_ERR(tmp))
5909fdeaf7e3SClaudio Imbrenda add_uevent_var(env, "STATS_PATH=%s", tmp);
5910fdeaf7e3SClaudio Imbrenda kfree(p);
5911286de8f6SClaudio Imbrenda }
5912286de8f6SClaudio Imbrenda }
5913286de8f6SClaudio Imbrenda /* no need for checks, since we are adding at most only 5 keys */
5914286de8f6SClaudio Imbrenda env->envp[env->envp_idx++] = NULL;
5915286de8f6SClaudio Imbrenda kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
5916286de8f6SClaudio Imbrenda kfree(env);
5917286de8f6SClaudio Imbrenda }
5918286de8f6SClaudio Imbrenda
5919929f45e3SGreg Kroah-Hartman static void kvm_init_debug(void)
59200fce5623SAvi Kivity {
5921bc9e9e67SJing Zhang const struct file_operations *fops;
5922bc9e9e67SJing Zhang const struct _kvm_stats_desc *pdesc;
5923bc9e9e67SJing Zhang int i;
59240fce5623SAvi Kivity
592576f7c879SHollis Blanchard kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
59264f69b680SHamo
5927bc9e9e67SJing Zhang for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
5928bc9e9e67SJing Zhang pdesc = &kvm_vm_stats_desc[i];
5929bc9e9e67SJing Zhang if (kvm_stats_debugfs_mode(pdesc) & 0222)
5930bc9e9e67SJing Zhang fops = &vm_stat_fops;
5931bc9e9e67SJing Zhang else
5932bc9e9e67SJing Zhang fops = &vm_stat_readonly_fops;
5933bc9e9e67SJing Zhang debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5934bc9e9e67SJing Zhang kvm_debugfs_dir,
5935bc9e9e67SJing Zhang (void *)(long)pdesc->desc.offset, fops);
5936bc9e9e67SJing Zhang }
5937bc9e9e67SJing Zhang
5938bc9e9e67SJing Zhang for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
5939bc9e9e67SJing Zhang pdesc = &kvm_vcpu_stats_desc[i];
5940bc9e9e67SJing Zhang if (kvm_stats_debugfs_mode(pdesc) & 0222)
5941bc9e9e67SJing Zhang fops = &vcpu_stat_fops;
5942bc9e9e67SJing Zhang else
5943bc9e9e67SJing Zhang fops = &vcpu_stat_readonly_fops;
5944bc9e9e67SJing Zhang debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5945bc9e9e67SJing Zhang kvm_debugfs_dir,
5946bc9e9e67SJing Zhang (void *)(long)pdesc->desc.offset, fops);
59474f69b680SHamo }
59480fce5623SAvi Kivity }
59490fce5623SAvi Kivity
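/*
 * Resulting layout (illustrative): the loops above create one file per stat
 * descriptor under the "kvm" debugfs directory; with debugfs at its usual
 * mount point that looks roughly like (actual names come from the
 * architecture's descriptor tables, "some_stat" is a placeholder):
 *
 *	/sys/kernel/debug/kvm/some_stat
 *
 * Descriptors with mode 0222 get the writable fops, and writing "0" to such
 * a file clears the counter across all VMs via the *_clear helpers above.
 */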
59500fce5623SAvi Kivity static inline
59510fce5623SAvi Kivity struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
59520fce5623SAvi Kivity {
59530fce5623SAvi Kivity return container_of(pn, struct kvm_vcpu, preempt_notifier);
59540fce5623SAvi Kivity }
59550fce5623SAvi Kivity
59560fce5623SAvi Kivity static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
59570fce5623SAvi Kivity {
59580fce5623SAvi Kivity struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5959f95ef0cdSXiubo Li
5960046ddeedSWanpeng Li WRITE_ONCE(vcpu->preempted, false);
5961d73eb57bSWanpeng Li WRITE_ONCE(vcpu->ready, false);
59620fce5623SAvi Kivity
59637495e22bSPaolo Bonzini __this_cpu_write(kvm_running_vcpu, vcpu);
5964e790d9efSRadim Krčmář kvm_arch_sched_in(vcpu, cpu);
59650fce5623SAvi Kivity kvm_arch_vcpu_load(vcpu, cpu);
59660fce5623SAvi Kivity }
59670fce5623SAvi Kivity
59680fce5623SAvi Kivity static void kvm_sched_out(struct preempt_notifier *pn,
59690fce5623SAvi Kivity struct task_struct *next)
59700fce5623SAvi Kivity {
59710fce5623SAvi Kivity struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
59720fce5623SAvi Kivity
59733ba9f93bSPeter Zijlstra if (current->on_rq) {
5974046ddeedSWanpeng Li WRITE_ONCE(vcpu->preempted, true);
5975d73eb57bSWanpeng Li WRITE_ONCE(vcpu->ready, true);
5976d73eb57bSWanpeng Li }
59770fce5623SAvi Kivity kvm_arch_vcpu_put(vcpu);
59787495e22bSPaolo Bonzini __this_cpu_write(kvm_running_vcpu, NULL);
59797495e22bSPaolo Bonzini }
59807495e22bSPaolo Bonzini
59817495e22bSPaolo Bonzini /**
59827495e22bSPaolo Bonzini * kvm_get_running_vcpu - get the vcpu running on the current CPU.
59831f03b2bcSMarc Zyngier *
59841f03b2bcSMarc Zyngier * We can disable preemption locally around accessing the per-CPU variable,
59851f03b2bcSMarc Zyngier * and use the resolved vcpu pointer after enabling preemption again,
59861f03b2bcSMarc Zyngier * because even if the current thread is migrated to another CPU, reading
59871f03b2bcSMarc Zyngier * the per-CPU value later will give us the same value as we update the
59881f03b2bcSMarc Zyngier * per-CPU variable in the preempt notifier handlers.
59897495e22bSPaolo Bonzini */
59907495e22bSPaolo Bonzini struct kvm_vcpu *kvm_get_running_vcpu(void)
59917495e22bSPaolo Bonzini {
59921f03b2bcSMarc Zyngier struct kvm_vcpu *vcpu;
59931f03b2bcSMarc Zyngier
59941f03b2bcSMarc Zyngier preempt_disable();
59951f03b2bcSMarc Zyngier vcpu = __this_cpu_read(kvm_running_vcpu);
59961f03b2bcSMarc Zyngier preempt_enable();
59971f03b2bcSMarc Zyngier
59981f03b2bcSMarc Zyngier return vcpu;
59997495e22bSPaolo Bonzini }
6000379a3c8eSWanpeng Li EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
60017495e22bSPaolo Bonzini
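/*
 * Usage sketch (illustrative): callers that may run either in vCPU context
 * or from an unrelated task can use the helper to pick a fast path, e.g.:
 *
 *	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
 *
 *	if (vcpu && vcpu->kvm == kvm)
 *		// fast path: executing on behalf of this VM's vCPU
 *	else
 *		// slow path: kick the target vCPU or defer the work
 *
 * The "kvm" variable and the two paths are placeholders for caller code.
 */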
60027495e22bSPaolo Bonzini /**
60037495e22bSPaolo Bonzini * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
60047495e22bSPaolo Bonzini */
60057495e22bSPaolo Bonzini struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
60067495e22bSPaolo Bonzini {
60077495e22bSPaolo Bonzini return &kvm_running_vcpu;
60080fce5623SAvi Kivity }
60090fce5623SAvi Kivity
6010e1bfc245SSean Christopherson #ifdef CONFIG_GUEST_PERF_EVENTS
6011e1bfc245SSean Christopherson static unsigned int kvm_guest_state(void)
6012e1bfc245SSean Christopherson {
6013e1bfc245SSean Christopherson struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6014e1bfc245SSean Christopherson unsigned int state;
6015e1bfc245SSean Christopherson
6016e1bfc245SSean Christopherson if (!kvm_arch_pmi_in_guest(vcpu))
6017e1bfc245SSean Christopherson return 0;
6018e1bfc245SSean Christopherson
6019e1bfc245SSean Christopherson state = PERF_GUEST_ACTIVE;
6020e1bfc245SSean Christopherson if (!kvm_arch_vcpu_in_kernel(vcpu))
6021e1bfc245SSean Christopherson state |= PERF_GUEST_USER;
6022e1bfc245SSean Christopherson
6023e1bfc245SSean Christopherson return state;
6024e1bfc245SSean Christopherson }
6025e1bfc245SSean Christopherson
6026e1bfc245SSean Christopherson static unsigned long kvm_guest_get_ip(void)
6027e1bfc245SSean Christopherson {
6028e1bfc245SSean Christopherson struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6029e1bfc245SSean Christopherson
6030e1bfc245SSean Christopherson /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
6031e1bfc245SSean Christopherson if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
6032e1bfc245SSean Christopherson return 0;
6033e1bfc245SSean Christopherson
6034e1bfc245SSean Christopherson return kvm_arch_vcpu_get_ip(vcpu);
6035e1bfc245SSean Christopherson }
6036e1bfc245SSean Christopherson
6037e1bfc245SSean Christopherson static struct perf_guest_info_callbacks kvm_guest_cbs = {
6038e1bfc245SSean Christopherson .state = kvm_guest_state,
6039e1bfc245SSean Christopherson .get_ip = kvm_guest_get_ip,
6040e1bfc245SSean Christopherson .handle_intel_pt_intr = NULL,
6041e1bfc245SSean Christopherson };
6042e1bfc245SSean Christopherson
6043e1bfc245SSean Christopherson void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
6044e1bfc245SSean Christopherson {
6045e1bfc245SSean Christopherson kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
6046e1bfc245SSean Christopherson perf_register_guest_info_callbacks(&kvm_guest_cbs);
6047e1bfc245SSean Christopherson }
6048e1bfc245SSean Christopherson void kvm_unregister_perf_callbacks(void)
6049e1bfc245SSean Christopherson {
6050e1bfc245SSean Christopherson perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
6051e1bfc245SSean Christopherson }
6052e1bfc245SSean Christopherson #endif
6053e1bfc245SSean Christopherson
605481a1cf9fSSean Christopherson int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
6055f257d6dcSSean Christopherson {
60560fce5623SAvi Kivity int r;
60570fce5623SAvi Kivity int cpu;
60580fce5623SAvi Kivity
6059441f7bfaSSean Christopherson #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6060aaf12a7bSChao Gao r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
6061aaf12a7bSChao Gao kvm_online_cpu, kvm_offline_cpu);
60620fce5623SAvi Kivity if (r)
606337d25881SSean Christopherson return r;
606437d25881SSean Christopherson
606535774a9fSSean Christopherson register_syscore_ops(&kvm_syscore_ops);
6066441f7bfaSSean Christopherson #endif
60670fce5623SAvi Kivity
60680fce5623SAvi Kivity /* A kmem cache lets us meet the alignment requirements of fx_save. */
60690ee75beaSAvi Kivity if (!vcpu_align)
60700ee75beaSAvi Kivity vcpu_align = __alignof__(struct kvm_vcpu);
607146515736SPaolo Bonzini kvm_vcpu_cache =
607246515736SPaolo Bonzini kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
607346515736SPaolo Bonzini SLAB_ACCOUNT,
607446515736SPaolo Bonzini offsetof(struct kvm_vcpu, arch),
6075ce55c049SJing Zhang offsetofend(struct kvm_vcpu, stats_id)
6076ce55c049SJing Zhang - offsetof(struct kvm_vcpu, arch),
607746515736SPaolo Bonzini NULL);
60780fce5623SAvi Kivity if (!kvm_vcpu_cache) {
60790fce5623SAvi Kivity r = -ENOMEM;
60809f1a4c00SSean Christopherson goto err_vcpu_cache;
60810fce5623SAvi Kivity }
60820fce5623SAvi Kivity
6083baff59ccSVitaly Kuznetsov for_each_possible_cpu(cpu) {
6084baff59ccSVitaly Kuznetsov if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
6085baff59ccSVitaly Kuznetsov GFP_KERNEL, cpu_to_node(cpu))) {
6086baff59ccSVitaly Kuznetsov r = -ENOMEM;
60879f1a4c00SSean Christopherson goto err_cpu_kick_mask;
6088baff59ccSVitaly Kuznetsov }
6089baff59ccSVitaly Kuznetsov }
6090baff59ccSVitaly Kuznetsov
60915910ccf0SSean Christopherson r = kvm_irqfd_init();
60925910ccf0SSean Christopherson if (r)
60935910ccf0SSean Christopherson goto err_irqfd;
60945910ccf0SSean Christopherson
6095af585b92SGleb Natapov r = kvm_async_pf_init();
6096af585b92SGleb Natapov if (r)
60975910ccf0SSean Christopherson goto err_async_pf;
6098af585b92SGleb Natapov
60990fce5623SAvi Kivity kvm_chardev_ops.owner = module;
61000fce5623SAvi Kivity
61010fce5623SAvi Kivity kvm_preempt_ops.sched_in = kvm_sched_in;
61020fce5623SAvi Kivity kvm_preempt_ops.sched_out = kvm_sched_out;
61030fce5623SAvi Kivity
6104929f45e3SGreg Kroah-Hartman kvm_init_debug();
61050ea4ed8eSDarrick J. Wong
61063c3c29fdSPaolo Bonzini r = kvm_vfio_ops_init();
61072b012812SSean Christopherson if (WARN_ON_ONCE(r))
61082b012812SSean Christopherson goto err_vfio;
61092b012812SSean Christopherson
61102b012812SSean Christopherson /*
61112b012812SSean Christopherson * Registration _must_ be the very last thing done, as this exposes
61122b012812SSean Christopherson * /dev/kvm to userspace, i.e. all infrastructure must be setup!
61132b012812SSean Christopherson */
61142b012812SSean Christopherson r = misc_register(&kvm_dev);
61152b012812SSean Christopherson if (r) {
61162b012812SSean Christopherson pr_err("kvm: misc device register failed\n");
61172b012812SSean Christopherson goto err_register;
61182b012812SSean Christopherson }
61193c3c29fdSPaolo Bonzini
61200fce5623SAvi Kivity return 0;
61210fce5623SAvi Kivity
61222b012812SSean Christopherson err_register:
61232b012812SSean Christopherson kvm_vfio_ops_exit();
61242b012812SSean Christopherson err_vfio:
6125af585b92SGleb Natapov kvm_async_pf_deinit();
61265910ccf0SSean Christopherson err_async_pf:
61275910ccf0SSean Christopherson kvm_irqfd_exit();
61285910ccf0SSean Christopherson err_irqfd:
61299f1a4c00SSean Christopherson err_cpu_kick_mask:
6130baff59ccSVitaly Kuznetsov for_each_possible_cpu(cpu)
6131baff59ccSVitaly Kuznetsov free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
61320fce5623SAvi Kivity kmem_cache_destroy(kvm_vcpu_cache);
61339f1a4c00SSean Christopherson err_vcpu_cache:
6134441f7bfaSSean Christopherson #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
613535774a9fSSean Christopherson unregister_syscore_ops(&kvm_syscore_ops);
6136aaf12a7bSChao Gao cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
6137441f7bfaSSean Christopherson #endif
61380fce5623SAvi Kivity return r;
61390fce5623SAvi Kivity }
61400fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_init);
61410fce5623SAvi Kivity
61420fce5623SAvi Kivity void kvm_exit(void)
61430fce5623SAvi Kivity {
6144baff59ccSVitaly Kuznetsov int cpu;
6145baff59ccSVitaly Kuznetsov
61462b012812SSean Christopherson /*
61472b012812SSean Christopherson * Note, unregistering /dev/kvm doesn't strictly need to come first,
61482b012812SSean Christopherson * fops_get(), a.k.a. try_module_get(), prevents acquiring references
61492b012812SSean Christopherson * to KVM while the module is being stopped.
61502b012812SSean Christopherson */
61510fce5623SAvi Kivity misc_deregister(&kvm_dev);
61522b012812SSean Christopherson
61532b012812SSean Christopherson debugfs_remove_recursive(kvm_debugfs_dir);
6154baff59ccSVitaly Kuznetsov for_each_possible_cpu(cpu)
6155baff59ccSVitaly Kuznetsov free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
61560fce5623SAvi Kivity kmem_cache_destroy(kvm_vcpu_cache);
615773b8dc04SSean Christopherson kvm_vfio_ops_exit();
6158af585b92SGleb Natapov kvm_async_pf_deinit();
6159441f7bfaSSean Christopherson #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6160fb3600ccSRafael J. Wysocki unregister_syscore_ops(&kvm_syscore_ops);
6161aaf12a7bSChao Gao cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
6162441f7bfaSSean Christopherson #endif
61635910ccf0SSean Christopherson kvm_irqfd_exit();
61640fce5623SAvi Kivity }
61650fce5623SAvi Kivity EXPORT_SYMBOL_GPL(kvm_exit);
6166c57c8046SJunaid Shahid
6167c57c8046SJunaid Shahid struct kvm_vm_worker_thread_context {
6168c57c8046SJunaid Shahid struct kvm *kvm;
6169c57c8046SJunaid Shahid struct task_struct *parent;
6170c57c8046SJunaid Shahid struct completion init_done;
6171c57c8046SJunaid Shahid kvm_vm_thread_fn_t thread_fn;
6172c57c8046SJunaid Shahid uintptr_t data;
6173c57c8046SJunaid Shahid int err;
6174c57c8046SJunaid Shahid };
6175c57c8046SJunaid Shahid
6176c57c8046SJunaid Shahid static int kvm_vm_worker_thread(void *context)
6177c57c8046SJunaid Shahid {
6178c57c8046SJunaid Shahid /*
6179c57c8046SJunaid Shahid * The init_context is allocated on the stack of the parent thread, so
6180c57c8046SJunaid Shahid * we have to locally copy anything that is needed beyond initialization
6181c57c8046SJunaid Shahid */
6182c57c8046SJunaid Shahid struct kvm_vm_worker_thread_context *init_context = context;
6183e45cce30SVipin Sharma struct task_struct *parent;
6184c57c8046SJunaid Shahid struct kvm *kvm = init_context->kvm;
6185c57c8046SJunaid Shahid kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
6186c57c8046SJunaid Shahid uintptr_t data = init_context->data;
6187c57c8046SJunaid Shahid int err;
6188c57c8046SJunaid Shahid
6189c57c8046SJunaid Shahid err = kthread_park(current);
6190c57c8046SJunaid Shahid /* kthread_park(current) is never supposed to return an error */
6191c57c8046SJunaid Shahid WARN_ON(err != 0);
6192c57c8046SJunaid Shahid if (err)
6193c57c8046SJunaid Shahid goto init_complete;
6194c57c8046SJunaid Shahid
6195c57c8046SJunaid Shahid err = cgroup_attach_task_all(init_context->parent, current);
6196c57c8046SJunaid Shahid if (err) {
6197c57c8046SJunaid Shahid kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
6198c57c8046SJunaid Shahid __func__, err);
6199c57c8046SJunaid Shahid goto init_complete;
6200c57c8046SJunaid Shahid }
6201c57c8046SJunaid Shahid
6202c57c8046SJunaid Shahid set_user_nice(current, task_nice(init_context->parent));
6203c57c8046SJunaid Shahid
6204c57c8046SJunaid Shahid init_complete:
6205c57c8046SJunaid Shahid init_context->err = err;
6206c57c8046SJunaid Shahid complete(&init_context->init_done);
6207c57c8046SJunaid Shahid init_context = NULL;
6208c57c8046SJunaid Shahid
6209c57c8046SJunaid Shahid if (err)
6210e45cce30SVipin Sharma goto out;
6211c57c8046SJunaid Shahid
6212c57c8046SJunaid Shahid /* Wait to be woken up by the spawner before proceeding. */
6213c57c8046SJunaid Shahid kthread_parkme();
6214c57c8046SJunaid Shahid
6215c57c8046SJunaid Shahid if (!kthread_should_stop())
6216c57c8046SJunaid Shahid err = thread_fn(kvm, data);
6217c57c8046SJunaid Shahid
6218e45cce30SVipin Sharma out:
6219e45cce30SVipin Sharma /*
6220e45cce30SVipin Sharma * Move kthread back to its original cgroup to prevent it lingering in
6221e45cce30SVipin Sharma * the cgroup of the VM process, after the latter finishes its
6222e45cce30SVipin Sharma * execution.
6223e45cce30SVipin Sharma *
6224e45cce30SVipin Sharma * kthread_stop() waits on the 'exited' completion condition which is
6225e45cce30SVipin Sharma * set in exit_mm(), via mm_release(), in do_exit(). However, the
6226e45cce30SVipin Sharma * kthread is removed from the cgroup in the cgroup_exit() which is
6227e45cce30SVipin Sharma * called after the exit_mm(). This causes the kthread_stop() to return
6228e45cce30SVipin Sharma * before the kthread actually quits the cgroup.
6229e45cce30SVipin Sharma */
6230e45cce30SVipin Sharma rcu_read_lock();
6231e45cce30SVipin Sharma parent = rcu_dereference(current->real_parent);
6232e45cce30SVipin Sharma get_task_struct(parent);
6233e45cce30SVipin Sharma rcu_read_unlock();
6234e45cce30SVipin Sharma cgroup_attach_task_all(parent, current);
6235e45cce30SVipin Sharma put_task_struct(parent);
6236e45cce30SVipin Sharma
6237c57c8046SJunaid Shahid return err;
6238c57c8046SJunaid Shahid }
6239c57c8046SJunaid Shahid
6240c57c8046SJunaid Shahid int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
6241c57c8046SJunaid Shahid uintptr_t data, const char *name,
6242c57c8046SJunaid Shahid struct task_struct **thread_ptr)
6243c57c8046SJunaid Shahid {
6244c57c8046SJunaid Shahid struct kvm_vm_worker_thread_context init_context = {};
6245c57c8046SJunaid Shahid struct task_struct *thread;
6246c57c8046SJunaid Shahid
6247c57c8046SJunaid Shahid *thread_ptr = NULL;
6248c57c8046SJunaid Shahid init_context.kvm = kvm;
6249c57c8046SJunaid Shahid init_context.parent = current;
6250c57c8046SJunaid Shahid init_context.thread_fn = thread_fn;
6251c57c8046SJunaid Shahid init_context.data = data;
6252c57c8046SJunaid Shahid init_completion(&init_context.init_done);
6253c57c8046SJunaid Shahid
6254c57c8046SJunaid Shahid thread = kthread_run(kvm_vm_worker_thread, &init_context,
6255c57c8046SJunaid Shahid "%s-%d", name, task_pid_nr(current));
6256c57c8046SJunaid Shahid if (IS_ERR(thread))
6257c57c8046SJunaid Shahid return PTR_ERR(thread);
6258c57c8046SJunaid Shahid
6259c57c8046SJunaid Shahid /* kthread_run is never supposed to return NULL */
6260c57c8046SJunaid Shahid WARN_ON(thread == NULL);
6261c57c8046SJunaid Shahid
6262c57c8046SJunaid Shahid wait_for_completion(&init_context.init_done);
6263c57c8046SJunaid Shahid
6264c57c8046SJunaid Shahid if (!init_context.err)
6265c57c8046SJunaid Shahid *thread_ptr = thread;
6266c57c8046SJunaid Shahid
6267c57c8046SJunaid Shahid return init_context.err;
6268c57c8046SJunaid Shahid }
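/*
 * Usage sketch (illustrative): arch code spawns a per-VM housekeeping thread
 * and later stops it with kthread_stop(); thread_fn(kvm, data) only runs once
 * the spawner unparks the thread, since the worker parks itself during init.
 * "my_worker_fn" and "worker" are placeholders, not fields defined here:
 *
 *	err = kvm_vm_create_worker_thread(kvm, my_worker_fn, 0,
 *					  "kvm-worker", &worker);
 *	if (!err)
 *		kthread_unpark(worker);		// let thread_fn start running
 *	...
 *	kthread_stop(worker);			// on VM teardown
 */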
6269