1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University 4 * Author: Christoffer Dall <c.dall@virtualopensystems.com> 5 */ 6 7 #include <linux/bug.h> 8 #include <linux/cpu_pm.h> 9 #include <linux/entry-kvm.h> 10 #include <linux/errno.h> 11 #include <linux/err.h> 12 #include <linux/kvm_host.h> 13 #include <linux/list.h> 14 #include <linux/module.h> 15 #include <linux/vmalloc.h> 16 #include <linux/fs.h> 17 #include <linux/mman.h> 18 #include <linux/sched.h> 19 #include <linux/kvm.h> 20 #include <linux/kvm_irqfd.h> 21 #include <linux/irqbypass.h> 22 #include <linux/sched/stat.h> 23 #include <linux/psci.h> 24 #include <trace/events/kvm.h> 25 26 #define CREATE_TRACE_POINTS 27 #include "trace_arm.h" 28 29 #include <linux/uaccess.h> 30 #include <asm/ptrace.h> 31 #include <asm/mman.h> 32 #include <asm/tlbflush.h> 33 #include <asm/cacheflush.h> 34 #include <asm/cpufeature.h> 35 #include <asm/virt.h> 36 #include <asm/kvm_arm.h> 37 #include <asm/kvm_asm.h> 38 #include <asm/kvm_mmu.h> 39 #include <asm/kvm_pkvm.h> 40 #include <asm/kvm_emulate.h> 41 #include <asm/sections.h> 42 43 #include <kvm/arm_hypercalls.h> 44 #include <kvm/arm_pmu.h> 45 #include <kvm/arm_psci.h> 46 47 static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; 48 49 DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); 50 51 DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); 52 DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); 53 54 static bool vgic_present; 55 56 static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); 57 DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); 58 59 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) 60 { 61 return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; 62 } 63 64 int kvm_vm_ioctl_enable_cap(struct kvm *kvm, 65 struct kvm_enable_cap *cap) 66 { 67 int r; 68 69 if (cap->flags) 70 return -EINVAL; 71 72 switch (cap->cap) { 73 case KVM_CAP_ARM_NISV_TO_USER: 74 r = 0; 75 set_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER, 76 &kvm->arch.flags); 77 break; 78 case KVM_CAP_ARM_MTE: 79 mutex_lock(&kvm->lock); 80 if (!system_supports_mte() || kvm->created_vcpus) { 81 r = -EINVAL; 82 } else { 83 r = 0; 84 set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags); 85 } 86 mutex_unlock(&kvm->lock); 87 break; 88 case KVM_CAP_ARM_SYSTEM_SUSPEND: 89 r = 0; 90 set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags); 91 break; 92 default: 93 r = -EINVAL; 94 break; 95 } 96 97 return r; 98 } 99 100 static int kvm_arm_default_max_vcpus(void) 101 { 102 return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS; 103 } 104 105 static void set_default_spectre(struct kvm *kvm) 106 { 107 /* 108 * The default is to expose CSV2 == 1 if the HW isn't affected. 109 * Although this is a per-CPU feature, we make it global because 110 * asymmetric systems are just a nuisance. 111 * 112 * Userspace can override this as long as it doesn't promise 113 * the impossible. 114 */ 115 if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) 116 kvm->arch.pfr0_csv2 = 1; 117 if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) 118 kvm->arch.pfr0_csv3 = 1; 119 } 120 121 /** 122 * kvm_arch_init_vm - initializes a VM data structure 123 * @kvm: pointer to the KVM struct 124 */ 125 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) 126 { 127 int ret; 128 129 mutex_init(&kvm->arch.config_lock); 130 131 #ifdef CONFIG_LOCKDEP 132 /* Clue in lockdep that the config_lock must be taken inside kvm->lock */ 133 mutex_lock(&kvm->lock); 134 mutex_lock(&kvm->arch.config_lock); 135 mutex_unlock(&kvm->arch.config_lock); 136 mutex_unlock(&kvm->lock); 137 #endif 138 139 ret = kvm_share_hyp(kvm, kvm + 1); 140 if (ret) 141 return ret; 142 143 ret = pkvm_init_host_vm(kvm); 144 if (ret) 145 goto err_unshare_kvm; 146 147 if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL_ACCOUNT)) { 148 ret = -ENOMEM; 149 goto err_unshare_kvm; 150 } 151 cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask); 152 153 ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu, type); 154 if (ret) 155 goto err_free_cpumask; 156 157 kvm_vgic_early_init(kvm); 158 159 kvm_timer_init_vm(kvm); 160 161 /* The maximum number of VCPUs is limited by the host's GIC model */ 162 kvm->max_vcpus = kvm_arm_default_max_vcpus(); 163 164 set_default_spectre(kvm); 165 kvm_arm_init_hypercalls(kvm); 166 167 /* 168 * Initialise the default PMUver before there is a chance to 169 * create an actual PMU. 170 */ 171 kvm->arch.dfr0_pmuver.imp = kvm_arm_pmu_get_pmuver_limit(); 172 173 return 0; 174 175 err_free_cpumask: 176 free_cpumask_var(kvm->arch.supported_cpus); 177 err_unshare_kvm: 178 kvm_unshare_hyp(kvm, kvm + 1); 179 return ret; 180 } 181 182 vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) 183 { 184 return VM_FAULT_SIGBUS; 185 } 186 187 188 /** 189 * kvm_arch_destroy_vm - destroy the VM data structure 190 * @kvm: pointer to the KVM struct 191 */ 192 void kvm_arch_destroy_vm(struct kvm *kvm) 193 { 194 bitmap_free(kvm->arch.pmu_filter); 195 free_cpumask_var(kvm->arch.supported_cpus); 196 197 kvm_vgic_destroy(kvm); 198 199 if (is_protected_kvm_enabled()) 200 pkvm_destroy_hyp_vm(kvm); 201 202 kvm_destroy_vcpus(kvm); 203 204 kvm_unshare_hyp(kvm, kvm + 1); 205 206 kvm_arm_teardown_hypercalls(kvm); 207 } 208 209 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) 210 { 211 int r; 212 switch (ext) { 213 case KVM_CAP_IRQCHIP: 214 r = vgic_present; 215 break; 216 case KVM_CAP_IOEVENTFD: 217 case KVM_CAP_DEVICE_CTRL: 218 case KVM_CAP_USER_MEMORY: 219 case KVM_CAP_SYNC_MMU: 220 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 221 case KVM_CAP_ONE_REG: 222 case KVM_CAP_ARM_PSCI: 223 case KVM_CAP_ARM_PSCI_0_2: 224 case KVM_CAP_READONLY_MEM: 225 case KVM_CAP_MP_STATE: 226 case KVM_CAP_IMMEDIATE_EXIT: 227 case KVM_CAP_VCPU_EVENTS: 228 case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2: 229 case KVM_CAP_ARM_NISV_TO_USER: 230 case KVM_CAP_ARM_INJECT_EXT_DABT: 231 case KVM_CAP_SET_GUEST_DEBUG: 232 case KVM_CAP_VCPU_ATTRIBUTES: 233 case KVM_CAP_PTP_KVM: 234 case KVM_CAP_ARM_SYSTEM_SUSPEND: 235 case KVM_CAP_IRQFD_RESAMPLE: 236 case KVM_CAP_COUNTER_OFFSET: 237 r = 1; 238 break; 239 case KVM_CAP_SET_GUEST_DEBUG2: 240 return KVM_GUESTDBG_VALID_MASK; 241 case KVM_CAP_ARM_SET_DEVICE_ADDR: 242 r = 1; 243 break; 244 case KVM_CAP_NR_VCPUS: 245 /* 246 * ARM64 treats KVM_CAP_NR_CPUS differently from all other 247 * architectures, as it does not always bound it to 248 * KVM_CAP_MAX_VCPUS. It should not matter much because 249 * this is just an advisory value. 250 */ 251 r = min_t(unsigned int, num_online_cpus(), 252 kvm_arm_default_max_vcpus()); 253 break; 254 case KVM_CAP_MAX_VCPUS: 255 case KVM_CAP_MAX_VCPU_ID: 256 if (kvm) 257 r = kvm->max_vcpus; 258 else 259 r = kvm_arm_default_max_vcpus(); 260 break; 261 case KVM_CAP_MSI_DEVID: 262 if (!kvm) 263 r = -EINVAL; 264 else 265 r = kvm->arch.vgic.msis_require_devid; 266 break; 267 case KVM_CAP_ARM_USER_IRQ: 268 /* 269 * 1: EL1_VTIMER, EL1_PTIMER, and PMU. 270 * (bump this number if adding more devices) 271 */ 272 r = 1; 273 break; 274 case KVM_CAP_ARM_MTE: 275 r = system_supports_mte(); 276 break; 277 case KVM_CAP_STEAL_TIME: 278 r = kvm_arm_pvtime_supported(); 279 break; 280 case KVM_CAP_ARM_EL1_32BIT: 281 r = cpus_have_const_cap(ARM64_HAS_32BIT_EL1); 282 break; 283 case KVM_CAP_GUEST_DEBUG_HW_BPS: 284 r = get_num_brps(); 285 break; 286 case KVM_CAP_GUEST_DEBUG_HW_WPS: 287 r = get_num_wrps(); 288 break; 289 case KVM_CAP_ARM_PMU_V3: 290 r = kvm_arm_support_pmu_v3(); 291 break; 292 case KVM_CAP_ARM_INJECT_SERROR_ESR: 293 r = cpus_have_const_cap(ARM64_HAS_RAS_EXTN); 294 break; 295 case KVM_CAP_ARM_VM_IPA_SIZE: 296 r = get_kvm_ipa_limit(); 297 break; 298 case KVM_CAP_ARM_SVE: 299 r = system_supports_sve(); 300 break; 301 case KVM_CAP_ARM_PTRAUTH_ADDRESS: 302 case KVM_CAP_ARM_PTRAUTH_GENERIC: 303 r = system_has_full_ptr_auth(); 304 break; 305 default: 306 r = 0; 307 } 308 309 return r; 310 } 311 312 long kvm_arch_dev_ioctl(struct file *filp, 313 unsigned int ioctl, unsigned long arg) 314 { 315 return -EINVAL; 316 } 317 318 struct kvm *kvm_arch_alloc_vm(void) 319 { 320 size_t sz = sizeof(struct kvm); 321 322 if (!has_vhe()) 323 return kzalloc(sz, GFP_KERNEL_ACCOUNT); 324 325 return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO); 326 } 327 328 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) 329 { 330 if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) 331 return -EBUSY; 332 333 if (id >= kvm->max_vcpus) 334 return -EINVAL; 335 336 return 0; 337 } 338 339 int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) 340 { 341 int err; 342 343 spin_lock_init(&vcpu->arch.mp_state_lock); 344 345 #ifdef CONFIG_LOCKDEP 346 /* Inform lockdep that the config_lock is acquired after vcpu->mutex */ 347 mutex_lock(&vcpu->mutex); 348 mutex_lock(&vcpu->kvm->arch.config_lock); 349 mutex_unlock(&vcpu->kvm->arch.config_lock); 350 mutex_unlock(&vcpu->mutex); 351 #endif 352 353 /* Force users to call KVM_ARM_VCPU_INIT */ 354 vcpu->arch.target = -1; 355 bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); 356 357 vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO; 358 359 /* 360 * Default value for the FP state, will be overloaded at load 361 * time if we support FP (pretty likely) 362 */ 363 vcpu->arch.fp_state = FP_STATE_FREE; 364 365 /* Set up the timer */ 366 kvm_timer_vcpu_init(vcpu); 367 368 kvm_pmu_vcpu_init(vcpu); 369 370 kvm_arm_reset_debug_ptr(vcpu); 371 372 kvm_arm_pvtime_vcpu_init(&vcpu->arch); 373 374 vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; 375 376 err = kvm_vgic_vcpu_init(vcpu); 377 if (err) 378 return err; 379 380 return kvm_share_hyp(vcpu, vcpu + 1); 381 } 382 383 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 384 { 385 } 386 387 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 388 { 389 if (vcpu_has_run_once(vcpu) && unlikely(!irqchip_in_kernel(vcpu->kvm))) 390 static_branch_dec(&userspace_irqchip_in_use); 391 392 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); 393 kvm_timer_vcpu_terminate(vcpu); 394 kvm_pmu_vcpu_destroy(vcpu); 395 396 kvm_arm_vcpu_destroy(vcpu); 397 } 398 399 void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) 400 { 401 402 } 403 404 void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) 405 { 406 407 } 408 409 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 410 { 411 struct kvm_s2_mmu *mmu; 412 int *last_ran; 413 414 mmu = vcpu->arch.hw_mmu; 415 last_ran = this_cpu_ptr(mmu->last_vcpu_ran); 416 417 /* 418 * We guarantee that both TLBs and I-cache are private to each 419 * vcpu. If detecting that a vcpu from the same VM has 420 * previously run on the same physical CPU, call into the 421 * hypervisor code to nuke the relevant contexts. 422 * 423 * We might get preempted before the vCPU actually runs, but 424 * over-invalidation doesn't affect correctness. 425 */ 426 if (*last_ran != vcpu->vcpu_id) { 427 kvm_call_hyp(__kvm_flush_cpu_context, mmu); 428 *last_ran = vcpu->vcpu_id; 429 } 430 431 vcpu->cpu = cpu; 432 433 kvm_vgic_load(vcpu); 434 kvm_timer_vcpu_load(vcpu); 435 if (has_vhe()) 436 kvm_vcpu_load_sysregs_vhe(vcpu); 437 kvm_arch_vcpu_load_fp(vcpu); 438 kvm_vcpu_pmu_restore_guest(vcpu); 439 if (kvm_arm_is_pvtime_enabled(&vcpu->arch)) 440 kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu); 441 442 if (single_task_running()) 443 vcpu_clear_wfx_traps(vcpu); 444 else 445 vcpu_set_wfx_traps(vcpu); 446 447 if (vcpu_has_ptrauth(vcpu)) 448 vcpu_ptrauth_disable(vcpu); 449 kvm_arch_vcpu_load_debug_state_flags(vcpu); 450 451 if (!cpumask_test_cpu(smp_processor_id(), vcpu->kvm->arch.supported_cpus)) 452 vcpu_set_on_unsupported_cpu(vcpu); 453 } 454 455 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 456 { 457 kvm_arch_vcpu_put_debug_state_flags(vcpu); 458 kvm_arch_vcpu_put_fp(vcpu); 459 if (has_vhe()) 460 kvm_vcpu_put_sysregs_vhe(vcpu); 461 kvm_timer_vcpu_put(vcpu); 462 kvm_vgic_put(vcpu); 463 kvm_vcpu_pmu_restore_host(vcpu); 464 kvm_arm_vmid_clear_active(); 465 466 vcpu_clear_on_unsupported_cpu(vcpu); 467 vcpu->cpu = -1; 468 } 469 470 static void __kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu) 471 { 472 WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED); 473 kvm_make_request(KVM_REQ_SLEEP, vcpu); 474 kvm_vcpu_kick(vcpu); 475 } 476 477 void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu) 478 { 479 spin_lock(&vcpu->arch.mp_state_lock); 480 __kvm_arm_vcpu_power_off(vcpu); 481 spin_unlock(&vcpu->arch.mp_state_lock); 482 } 483 484 bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu) 485 { 486 return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_STOPPED; 487 } 488 489 static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu) 490 { 491 WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_SUSPENDED); 492 kvm_make_request(KVM_REQ_SUSPEND, vcpu); 493 kvm_vcpu_kick(vcpu); 494 } 495 496 static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu) 497 { 498 return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_SUSPENDED; 499 } 500 501 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 502 struct kvm_mp_state *mp_state) 503 { 504 *mp_state = READ_ONCE(vcpu->arch.mp_state); 505 506 return 0; 507 } 508 509 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 510 struct kvm_mp_state *mp_state) 511 { 512 int ret = 0; 513 514 spin_lock(&vcpu->arch.mp_state_lock); 515 516 switch (mp_state->mp_state) { 517 case KVM_MP_STATE_RUNNABLE: 518 WRITE_ONCE(vcpu->arch.mp_state, *mp_state); 519 break; 520 case KVM_MP_STATE_STOPPED: 521 __kvm_arm_vcpu_power_off(vcpu); 522 break; 523 case KVM_MP_STATE_SUSPENDED: 524 kvm_arm_vcpu_suspend(vcpu); 525 break; 526 default: 527 ret = -EINVAL; 528 } 529 530 spin_unlock(&vcpu->arch.mp_state_lock); 531 532 return ret; 533 } 534 535 /** 536 * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled 537 * @v: The VCPU pointer 538 * 539 * If the guest CPU is not waiting for interrupts or an interrupt line is 540 * asserted, the CPU is by definition runnable. 541 */ 542 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) 543 { 544 bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF); 545 return ((irq_lines || kvm_vgic_vcpu_pending_irq(v)) 546 && !kvm_arm_vcpu_stopped(v) && !v->arch.pause); 547 } 548 549 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) 550 { 551 return vcpu_mode_priv(vcpu); 552 } 553 554 #ifdef CONFIG_GUEST_PERF_EVENTS 555 unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) 556 { 557 return *vcpu_pc(vcpu); 558 } 559 #endif 560 561 static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) 562 { 563 return vcpu->arch.target >= 0; 564 } 565 566 /* 567 * Handle both the initialisation that is being done when the vcpu is 568 * run for the first time, as well as the updates that must be 569 * performed each time we get a new thread dealing with this vcpu. 570 */ 571 int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) 572 { 573 struct kvm *kvm = vcpu->kvm; 574 int ret; 575 576 if (!kvm_vcpu_initialized(vcpu)) 577 return -ENOEXEC; 578 579 if (!kvm_arm_vcpu_is_finalized(vcpu)) 580 return -EPERM; 581 582 ret = kvm_arch_vcpu_run_map_fp(vcpu); 583 if (ret) 584 return ret; 585 586 if (likely(vcpu_has_run_once(vcpu))) 587 return 0; 588 589 kvm_arm_vcpu_init_debug(vcpu); 590 591 if (likely(irqchip_in_kernel(kvm))) { 592 /* 593 * Map the VGIC hardware resources before running a vcpu the 594 * first time on this VM. 595 */ 596 ret = kvm_vgic_map_resources(kvm); 597 if (ret) 598 return ret; 599 } 600 601 ret = kvm_timer_enable(vcpu); 602 if (ret) 603 return ret; 604 605 ret = kvm_arm_pmu_v3_enable(vcpu); 606 if (ret) 607 return ret; 608 609 if (is_protected_kvm_enabled()) { 610 ret = pkvm_create_hyp_vm(kvm); 611 if (ret) 612 return ret; 613 } 614 615 if (!irqchip_in_kernel(kvm)) { 616 /* 617 * Tell the rest of the code that there are userspace irqchip 618 * VMs in the wild. 619 */ 620 static_branch_inc(&userspace_irqchip_in_use); 621 } 622 623 /* 624 * Initialize traps for protected VMs. 625 * NOTE: Move to run in EL2 directly, rather than via a hypercall, once 626 * the code is in place for first run initialization at EL2. 627 */ 628 if (kvm_vm_is_protected(kvm)) 629 kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu); 630 631 mutex_lock(&kvm->arch.config_lock); 632 set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags); 633 mutex_unlock(&kvm->arch.config_lock); 634 635 return ret; 636 } 637 638 bool kvm_arch_intc_initialized(struct kvm *kvm) 639 { 640 return vgic_initialized(kvm); 641 } 642 643 void kvm_arm_halt_guest(struct kvm *kvm) 644 { 645 unsigned long i; 646 struct kvm_vcpu *vcpu; 647 648 kvm_for_each_vcpu(i, vcpu, kvm) 649 vcpu->arch.pause = true; 650 kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP); 651 } 652 653 void kvm_arm_resume_guest(struct kvm *kvm) 654 { 655 unsigned long i; 656 struct kvm_vcpu *vcpu; 657 658 kvm_for_each_vcpu(i, vcpu, kvm) { 659 vcpu->arch.pause = false; 660 __kvm_vcpu_wake_up(vcpu); 661 } 662 } 663 664 static void kvm_vcpu_sleep(struct kvm_vcpu *vcpu) 665 { 666 struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); 667 668 rcuwait_wait_event(wait, 669 (!kvm_arm_vcpu_stopped(vcpu)) && (!vcpu->arch.pause), 670 TASK_INTERRUPTIBLE); 671 672 if (kvm_arm_vcpu_stopped(vcpu) || vcpu->arch.pause) { 673 /* Awaken to handle a signal, request we sleep again later. */ 674 kvm_make_request(KVM_REQ_SLEEP, vcpu); 675 } 676 677 /* 678 * Make sure we will observe a potential reset request if we've 679 * observed a change to the power state. Pairs with the smp_wmb() in 680 * kvm_psci_vcpu_on(). 681 */ 682 smp_rmb(); 683 } 684 685 /** 686 * kvm_vcpu_wfi - emulate Wait-For-Interrupt behavior 687 * @vcpu: The VCPU pointer 688 * 689 * Suspend execution of a vCPU until a valid wake event is detected, i.e. until 690 * the vCPU is runnable. The vCPU may or may not be scheduled out, depending 691 * on when a wake event arrives, e.g. there may already be a pending wake event. 692 */ 693 void kvm_vcpu_wfi(struct kvm_vcpu *vcpu) 694 { 695 /* 696 * Sync back the state of the GIC CPU interface so that we have 697 * the latest PMR and group enables. This ensures that 698 * kvm_arch_vcpu_runnable has up-to-date data to decide whether 699 * we have pending interrupts, e.g. when determining if the 700 * vCPU should block. 701 * 702 * For the same reason, we want to tell GICv4 that we need 703 * doorbells to be signalled, should an interrupt become pending. 704 */ 705 preempt_disable(); 706 kvm_vgic_vmcr_sync(vcpu); 707 vgic_v4_put(vcpu, true); 708 preempt_enable(); 709 710 kvm_vcpu_halt(vcpu); 711 vcpu_clear_flag(vcpu, IN_WFIT); 712 713 preempt_disable(); 714 vgic_v4_load(vcpu); 715 preempt_enable(); 716 } 717 718 static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu) 719 { 720 if (!kvm_arm_vcpu_suspended(vcpu)) 721 return 1; 722 723 kvm_vcpu_wfi(vcpu); 724 725 /* 726 * The suspend state is sticky; we do not leave it until userspace 727 * explicitly marks the vCPU as runnable. Request that we suspend again 728 * later. 729 */ 730 kvm_make_request(KVM_REQ_SUSPEND, vcpu); 731 732 /* 733 * Check to make sure the vCPU is actually runnable. If so, exit to 734 * userspace informing it of the wakeup condition. 735 */ 736 if (kvm_arch_vcpu_runnable(vcpu)) { 737 memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); 738 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_WAKEUP; 739 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; 740 return 0; 741 } 742 743 /* 744 * Otherwise, we were unblocked to process a different event, such as a 745 * pending signal. Return 1 and allow kvm_arch_vcpu_ioctl_run() to 746 * process the event. 747 */ 748 return 1; 749 } 750 751 /** 752 * check_vcpu_requests - check and handle pending vCPU requests 753 * @vcpu: the VCPU pointer 754 * 755 * Return: 1 if we should enter the guest 756 * 0 if we should exit to userspace 757 * < 0 if we should exit to userspace, where the return value indicates 758 * an error 759 */ 760 static int check_vcpu_requests(struct kvm_vcpu *vcpu) 761 { 762 if (kvm_request_pending(vcpu)) { 763 if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) 764 kvm_vcpu_sleep(vcpu); 765 766 if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) 767 kvm_reset_vcpu(vcpu); 768 769 /* 770 * Clear IRQ_PENDING requests that were made to guarantee 771 * that a VCPU sees new virtual interrupts. 772 */ 773 kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu); 774 775 if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu)) 776 kvm_update_stolen_time(vcpu); 777 778 if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) { 779 /* The distributor enable bits were changed */ 780 preempt_disable(); 781 vgic_v4_put(vcpu, false); 782 vgic_v4_load(vcpu); 783 preempt_enable(); 784 } 785 786 if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu)) 787 kvm_pmu_handle_pmcr(vcpu, 788 __vcpu_sys_reg(vcpu, PMCR_EL0)); 789 790 if (kvm_check_request(KVM_REQ_SUSPEND, vcpu)) 791 return kvm_vcpu_suspend(vcpu); 792 793 if (kvm_dirty_ring_check_request(vcpu)) 794 return 0; 795 } 796 797 return 1; 798 } 799 800 static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu) 801 { 802 if (likely(!vcpu_mode_is_32bit(vcpu))) 803 return false; 804 805 return !kvm_supports_32bit_el0(); 806 } 807 808 /** 809 * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest 810 * @vcpu: The VCPU pointer 811 * @ret: Pointer to write optional return code 812 * 813 * Returns: true if the VCPU needs to return to a preemptible + interruptible 814 * and skip guest entry. 815 * 816 * This function disambiguates between two different types of exits: exits to a 817 * preemptible + interruptible kernel context and exits to userspace. For an 818 * exit to userspace, this function will write the return code to ret and return 819 * true. For an exit to preemptible + interruptible kernel context (i.e. check 820 * for pending work and re-enter), return true without writing to ret. 821 */ 822 static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret) 823 { 824 struct kvm_run *run = vcpu->run; 825 826 /* 827 * If we're using a userspace irqchip, then check if we need 828 * to tell a userspace irqchip about timer or PMU level 829 * changes and if so, exit to userspace (the actual level 830 * state gets updated in kvm_timer_update_run and 831 * kvm_pmu_update_run below). 832 */ 833 if (static_branch_unlikely(&userspace_irqchip_in_use)) { 834 if (kvm_timer_should_notify_user(vcpu) || 835 kvm_pmu_should_notify_user(vcpu)) { 836 *ret = -EINTR; 837 run->exit_reason = KVM_EXIT_INTR; 838 return true; 839 } 840 } 841 842 if (unlikely(vcpu_on_unsupported_cpu(vcpu))) { 843 run->exit_reason = KVM_EXIT_FAIL_ENTRY; 844 run->fail_entry.hardware_entry_failure_reason = KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED; 845 run->fail_entry.cpu = smp_processor_id(); 846 *ret = 0; 847 return true; 848 } 849 850 return kvm_request_pending(vcpu) || 851 xfer_to_guest_mode_work_pending(); 852 } 853 854 /* 855 * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while 856 * the vCPU is running. 857 * 858 * This must be noinstr as instrumentation may make use of RCU, and this is not 859 * safe during the EQS. 860 */ 861 static int noinstr kvm_arm_vcpu_enter_exit(struct kvm_vcpu *vcpu) 862 { 863 int ret; 864 865 guest_state_enter_irqoff(); 866 ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu); 867 guest_state_exit_irqoff(); 868 869 return ret; 870 } 871 872 /** 873 * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code 874 * @vcpu: The VCPU pointer 875 * 876 * This function is called through the VCPU_RUN ioctl called from user space. It 877 * will execute VM code in a loop until the time slice for the process is used 878 * or some emulation is needed from user space in which case the function will 879 * return with return value 0 and with the kvm_run structure filled in with the 880 * required data for the requested emulation. 881 */ 882 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) 883 { 884 struct kvm_run *run = vcpu->run; 885 int ret; 886 887 if (run->exit_reason == KVM_EXIT_MMIO) { 888 ret = kvm_handle_mmio_return(vcpu); 889 if (ret) 890 return ret; 891 } 892 893 vcpu_load(vcpu); 894 895 if (run->immediate_exit) { 896 ret = -EINTR; 897 goto out; 898 } 899 900 kvm_sigset_activate(vcpu); 901 902 ret = 1; 903 run->exit_reason = KVM_EXIT_UNKNOWN; 904 run->flags = 0; 905 while (ret > 0) { 906 /* 907 * Check conditions before entering the guest 908 */ 909 ret = xfer_to_guest_mode_handle_work(vcpu); 910 if (!ret) 911 ret = 1; 912 913 if (ret > 0) 914 ret = check_vcpu_requests(vcpu); 915 916 /* 917 * Preparing the interrupts to be injected also 918 * involves poking the GIC, which must be done in a 919 * non-preemptible context. 920 */ 921 preempt_disable(); 922 923 /* 924 * The VMID allocator only tracks active VMIDs per 925 * physical CPU, and therefore the VMID allocated may not be 926 * preserved on VMID roll-over if the task was preempted, 927 * making a thread's VMID inactive. So we need to call 928 * kvm_arm_vmid_update() in non-premptible context. 929 */ 930 kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid); 931 932 kvm_pmu_flush_hwstate(vcpu); 933 934 local_irq_disable(); 935 936 kvm_vgic_flush_hwstate(vcpu); 937 938 kvm_pmu_update_vcpu_events(vcpu); 939 940 /* 941 * Ensure we set mode to IN_GUEST_MODE after we disable 942 * interrupts and before the final VCPU requests check. 943 * See the comment in kvm_vcpu_exiting_guest_mode() and 944 * Documentation/virt/kvm/vcpu-requests.rst 945 */ 946 smp_store_mb(vcpu->mode, IN_GUEST_MODE); 947 948 if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) { 949 vcpu->mode = OUTSIDE_GUEST_MODE; 950 isb(); /* Ensure work in x_flush_hwstate is committed */ 951 kvm_pmu_sync_hwstate(vcpu); 952 if (static_branch_unlikely(&userspace_irqchip_in_use)) 953 kvm_timer_sync_user(vcpu); 954 kvm_vgic_sync_hwstate(vcpu); 955 local_irq_enable(); 956 preempt_enable(); 957 continue; 958 } 959 960 kvm_arm_setup_debug(vcpu); 961 kvm_arch_vcpu_ctxflush_fp(vcpu); 962 963 /************************************************************** 964 * Enter the guest 965 */ 966 trace_kvm_entry(*vcpu_pc(vcpu)); 967 guest_timing_enter_irqoff(); 968 969 ret = kvm_arm_vcpu_enter_exit(vcpu); 970 971 vcpu->mode = OUTSIDE_GUEST_MODE; 972 vcpu->stat.exits++; 973 /* 974 * Back from guest 975 *************************************************************/ 976 977 kvm_arm_clear_debug(vcpu); 978 979 /* 980 * We must sync the PMU state before the vgic state so 981 * that the vgic can properly sample the updated state of the 982 * interrupt line. 983 */ 984 kvm_pmu_sync_hwstate(vcpu); 985 986 /* 987 * Sync the vgic state before syncing the timer state because 988 * the timer code needs to know if the virtual timer 989 * interrupts are active. 990 */ 991 kvm_vgic_sync_hwstate(vcpu); 992 993 /* 994 * Sync the timer hardware state before enabling interrupts as 995 * we don't want vtimer interrupts to race with syncing the 996 * timer virtual interrupt state. 997 */ 998 if (static_branch_unlikely(&userspace_irqchip_in_use)) 999 kvm_timer_sync_user(vcpu); 1000 1001 kvm_arch_vcpu_ctxsync_fp(vcpu); 1002 1003 /* 1004 * We must ensure that any pending interrupts are taken before 1005 * we exit guest timing so that timer ticks are accounted as 1006 * guest time. Transiently unmask interrupts so that any 1007 * pending interrupts are taken. 1008 * 1009 * Per ARM DDI 0487G.b section D1.13.4, an ISB (or other 1010 * context synchronization event) is necessary to ensure that 1011 * pending interrupts are taken. 1012 */ 1013 if (ARM_EXCEPTION_CODE(ret) == ARM_EXCEPTION_IRQ) { 1014 local_irq_enable(); 1015 isb(); 1016 local_irq_disable(); 1017 } 1018 1019 guest_timing_exit_irqoff(); 1020 1021 local_irq_enable(); 1022 1023 trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); 1024 1025 /* Exit types that need handling before we can be preempted */ 1026 handle_exit_early(vcpu, ret); 1027 1028 preempt_enable(); 1029 1030 /* 1031 * The ARMv8 architecture doesn't give the hypervisor 1032 * a mechanism to prevent a guest from dropping to AArch32 EL0 1033 * if implemented by the CPU. If we spot the guest in such 1034 * state and that we decided it wasn't supposed to do so (like 1035 * with the asymmetric AArch32 case), return to userspace with 1036 * a fatal error. 1037 */ 1038 if (vcpu_mode_is_bad_32bit(vcpu)) { 1039 /* 1040 * As we have caught the guest red-handed, decide that 1041 * it isn't fit for purpose anymore by making the vcpu 1042 * invalid. The VMM can try and fix it by issuing a 1043 * KVM_ARM_VCPU_INIT if it really wants to. 1044 */ 1045 vcpu->arch.target = -1; 1046 ret = ARM_EXCEPTION_IL; 1047 } 1048 1049 ret = handle_exit(vcpu, ret); 1050 } 1051 1052 /* Tell userspace about in-kernel device output levels */ 1053 if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { 1054 kvm_timer_update_run(vcpu); 1055 kvm_pmu_update_run(vcpu); 1056 } 1057 1058 kvm_sigset_deactivate(vcpu); 1059 1060 out: 1061 /* 1062 * In the unlikely event that we are returning to userspace 1063 * with pending exceptions or PC adjustment, commit these 1064 * adjustments in order to give userspace a consistent view of 1065 * the vcpu state. Note that this relies on __kvm_adjust_pc() 1066 * being preempt-safe on VHE. 1067 */ 1068 if (unlikely(vcpu_get_flag(vcpu, PENDING_EXCEPTION) || 1069 vcpu_get_flag(vcpu, INCREMENT_PC))) 1070 kvm_call_hyp(__kvm_adjust_pc, vcpu); 1071 1072 vcpu_put(vcpu); 1073 return ret; 1074 } 1075 1076 static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level) 1077 { 1078 int bit_index; 1079 bool set; 1080 unsigned long *hcr; 1081 1082 if (number == KVM_ARM_IRQ_CPU_IRQ) 1083 bit_index = __ffs(HCR_VI); 1084 else /* KVM_ARM_IRQ_CPU_FIQ */ 1085 bit_index = __ffs(HCR_VF); 1086 1087 hcr = vcpu_hcr(vcpu); 1088 if (level) 1089 set = test_and_set_bit(bit_index, hcr); 1090 else 1091 set = test_and_clear_bit(bit_index, hcr); 1092 1093 /* 1094 * If we didn't change anything, no need to wake up or kick other CPUs 1095 */ 1096 if (set == level) 1097 return 0; 1098 1099 /* 1100 * The vcpu irq_lines field was updated, wake up sleeping VCPUs and 1101 * trigger a world-switch round on the running physical CPU to set the 1102 * virtual IRQ/FIQ fields in the HCR appropriately. 1103 */ 1104 kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); 1105 kvm_vcpu_kick(vcpu); 1106 1107 return 0; 1108 } 1109 1110 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, 1111 bool line_status) 1112 { 1113 u32 irq = irq_level->irq; 1114 unsigned int irq_type, vcpu_idx, irq_num; 1115 int nrcpus = atomic_read(&kvm->online_vcpus); 1116 struct kvm_vcpu *vcpu = NULL; 1117 bool level = irq_level->level; 1118 1119 irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK; 1120 vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK; 1121 vcpu_idx += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1); 1122 irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK; 1123 1124 trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level); 1125 1126 switch (irq_type) { 1127 case KVM_ARM_IRQ_TYPE_CPU: 1128 if (irqchip_in_kernel(kvm)) 1129 return -ENXIO; 1130 1131 if (vcpu_idx >= nrcpus) 1132 return -EINVAL; 1133 1134 vcpu = kvm_get_vcpu(kvm, vcpu_idx); 1135 if (!vcpu) 1136 return -EINVAL; 1137 1138 if (irq_num > KVM_ARM_IRQ_CPU_FIQ) 1139 return -EINVAL; 1140 1141 return vcpu_interrupt_line(vcpu, irq_num, level); 1142 case KVM_ARM_IRQ_TYPE_PPI: 1143 if (!irqchip_in_kernel(kvm)) 1144 return -ENXIO; 1145 1146 if (vcpu_idx >= nrcpus) 1147 return -EINVAL; 1148 1149 vcpu = kvm_get_vcpu(kvm, vcpu_idx); 1150 if (!vcpu) 1151 return -EINVAL; 1152 1153 if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS) 1154 return -EINVAL; 1155 1156 return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL); 1157 case KVM_ARM_IRQ_TYPE_SPI: 1158 if (!irqchip_in_kernel(kvm)) 1159 return -ENXIO; 1160 1161 if (irq_num < VGIC_NR_PRIVATE_IRQS) 1162 return -EINVAL; 1163 1164 return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL); 1165 } 1166 1167 return -EINVAL; 1168 } 1169 1170 static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, 1171 const struct kvm_vcpu_init *init) 1172 { 1173 unsigned int i, ret; 1174 u32 phys_target = kvm_target_cpu(); 1175 1176 if (init->target != phys_target) 1177 return -EINVAL; 1178 1179 /* 1180 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must 1181 * use the same target. 1182 */ 1183 if (vcpu->arch.target != -1 && vcpu->arch.target != init->target) 1184 return -EINVAL; 1185 1186 /* -ENOENT for unknown features, -EINVAL for invalid combinations. */ 1187 for (i = 0; i < sizeof(init->features) * 8; i++) { 1188 bool set = (init->features[i / 32] & (1 << (i % 32))); 1189 1190 if (set && i >= KVM_VCPU_MAX_FEATURES) 1191 return -ENOENT; 1192 1193 /* 1194 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must 1195 * use the same feature set. 1196 */ 1197 if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES && 1198 test_bit(i, vcpu->arch.features) != set) 1199 return -EINVAL; 1200 1201 if (set) 1202 set_bit(i, vcpu->arch.features); 1203 } 1204 1205 vcpu->arch.target = phys_target; 1206 1207 /* Now we know what it is, we can reset it. */ 1208 ret = kvm_reset_vcpu(vcpu); 1209 if (ret) { 1210 vcpu->arch.target = -1; 1211 bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); 1212 } 1213 1214 return ret; 1215 } 1216 1217 static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, 1218 struct kvm_vcpu_init *init) 1219 { 1220 int ret; 1221 1222 ret = kvm_vcpu_set_target(vcpu, init); 1223 if (ret) 1224 return ret; 1225 1226 /* 1227 * Ensure a rebooted VM will fault in RAM pages and detect if the 1228 * guest MMU is turned off and flush the caches as needed. 1229 * 1230 * S2FWB enforces all memory accesses to RAM being cacheable, 1231 * ensuring that the data side is always coherent. We still 1232 * need to invalidate the I-cache though, as FWB does *not* 1233 * imply CTR_EL0.DIC. 1234 */ 1235 if (vcpu_has_run_once(vcpu)) { 1236 if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) 1237 stage2_unmap_vm(vcpu->kvm); 1238 else 1239 icache_inval_all_pou(); 1240 } 1241 1242 vcpu_reset_hcr(vcpu); 1243 vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT; 1244 1245 /* 1246 * Handle the "start in power-off" case. 1247 */ 1248 spin_lock(&vcpu->arch.mp_state_lock); 1249 1250 if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) 1251 __kvm_arm_vcpu_power_off(vcpu); 1252 else 1253 WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE); 1254 1255 spin_unlock(&vcpu->arch.mp_state_lock); 1256 1257 return 0; 1258 } 1259 1260 static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu, 1261 struct kvm_device_attr *attr) 1262 { 1263 int ret = -ENXIO; 1264 1265 switch (attr->group) { 1266 default: 1267 ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr); 1268 break; 1269 } 1270 1271 return ret; 1272 } 1273 1274 static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu, 1275 struct kvm_device_attr *attr) 1276 { 1277 int ret = -ENXIO; 1278 1279 switch (attr->group) { 1280 default: 1281 ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr); 1282 break; 1283 } 1284 1285 return ret; 1286 } 1287 1288 static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu, 1289 struct kvm_device_attr *attr) 1290 { 1291 int ret = -ENXIO; 1292 1293 switch (attr->group) { 1294 default: 1295 ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr); 1296 break; 1297 } 1298 1299 return ret; 1300 } 1301 1302 static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, 1303 struct kvm_vcpu_events *events) 1304 { 1305 memset(events, 0, sizeof(*events)); 1306 1307 return __kvm_arm_vcpu_get_events(vcpu, events); 1308 } 1309 1310 static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, 1311 struct kvm_vcpu_events *events) 1312 { 1313 int i; 1314 1315 /* check whether the reserved field is zero */ 1316 for (i = 0; i < ARRAY_SIZE(events->reserved); i++) 1317 if (events->reserved[i]) 1318 return -EINVAL; 1319 1320 /* check whether the pad field is zero */ 1321 for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++) 1322 if (events->exception.pad[i]) 1323 return -EINVAL; 1324 1325 return __kvm_arm_vcpu_set_events(vcpu, events); 1326 } 1327 1328 long kvm_arch_vcpu_ioctl(struct file *filp, 1329 unsigned int ioctl, unsigned long arg) 1330 { 1331 struct kvm_vcpu *vcpu = filp->private_data; 1332 void __user *argp = (void __user *)arg; 1333 struct kvm_device_attr attr; 1334 long r; 1335 1336 switch (ioctl) { 1337 case KVM_ARM_VCPU_INIT: { 1338 struct kvm_vcpu_init init; 1339 1340 r = -EFAULT; 1341 if (copy_from_user(&init, argp, sizeof(init))) 1342 break; 1343 1344 r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init); 1345 break; 1346 } 1347 case KVM_SET_ONE_REG: 1348 case KVM_GET_ONE_REG: { 1349 struct kvm_one_reg reg; 1350 1351 r = -ENOEXEC; 1352 if (unlikely(!kvm_vcpu_initialized(vcpu))) 1353 break; 1354 1355 r = -EFAULT; 1356 if (copy_from_user(®, argp, sizeof(reg))) 1357 break; 1358 1359 /* 1360 * We could owe a reset due to PSCI. Handle the pending reset 1361 * here to ensure userspace register accesses are ordered after 1362 * the reset. 1363 */ 1364 if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) 1365 kvm_reset_vcpu(vcpu); 1366 1367 if (ioctl == KVM_SET_ONE_REG) 1368 r = kvm_arm_set_reg(vcpu, ®); 1369 else 1370 r = kvm_arm_get_reg(vcpu, ®); 1371 break; 1372 } 1373 case KVM_GET_REG_LIST: { 1374 struct kvm_reg_list __user *user_list = argp; 1375 struct kvm_reg_list reg_list; 1376 unsigned n; 1377 1378 r = -ENOEXEC; 1379 if (unlikely(!kvm_vcpu_initialized(vcpu))) 1380 break; 1381 1382 r = -EPERM; 1383 if (!kvm_arm_vcpu_is_finalized(vcpu)) 1384 break; 1385 1386 r = -EFAULT; 1387 if (copy_from_user(®_list, user_list, sizeof(reg_list))) 1388 break; 1389 n = reg_list.n; 1390 reg_list.n = kvm_arm_num_regs(vcpu); 1391 if (copy_to_user(user_list, ®_list, sizeof(reg_list))) 1392 break; 1393 r = -E2BIG; 1394 if (n < reg_list.n) 1395 break; 1396 r = kvm_arm_copy_reg_indices(vcpu, user_list->reg); 1397 break; 1398 } 1399 case KVM_SET_DEVICE_ATTR: { 1400 r = -EFAULT; 1401 if (copy_from_user(&attr, argp, sizeof(attr))) 1402 break; 1403 r = kvm_arm_vcpu_set_attr(vcpu, &attr); 1404 break; 1405 } 1406 case KVM_GET_DEVICE_ATTR: { 1407 r = -EFAULT; 1408 if (copy_from_user(&attr, argp, sizeof(attr))) 1409 break; 1410 r = kvm_arm_vcpu_get_attr(vcpu, &attr); 1411 break; 1412 } 1413 case KVM_HAS_DEVICE_ATTR: { 1414 r = -EFAULT; 1415 if (copy_from_user(&attr, argp, sizeof(attr))) 1416 break; 1417 r = kvm_arm_vcpu_has_attr(vcpu, &attr); 1418 break; 1419 } 1420 case KVM_GET_VCPU_EVENTS: { 1421 struct kvm_vcpu_events events; 1422 1423 if (kvm_arm_vcpu_get_events(vcpu, &events)) 1424 return -EINVAL; 1425 1426 if (copy_to_user(argp, &events, sizeof(events))) 1427 return -EFAULT; 1428 1429 return 0; 1430 } 1431 case KVM_SET_VCPU_EVENTS: { 1432 struct kvm_vcpu_events events; 1433 1434 if (copy_from_user(&events, argp, sizeof(events))) 1435 return -EFAULT; 1436 1437 return kvm_arm_vcpu_set_events(vcpu, &events); 1438 } 1439 case KVM_ARM_VCPU_FINALIZE: { 1440 int what; 1441 1442 if (!kvm_vcpu_initialized(vcpu)) 1443 return -ENOEXEC; 1444 1445 if (get_user(what, (const int __user *)argp)) 1446 return -EFAULT; 1447 1448 return kvm_arm_vcpu_finalize(vcpu, what); 1449 } 1450 default: 1451 r = -EINVAL; 1452 } 1453 1454 return r; 1455 } 1456 1457 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) 1458 { 1459 1460 } 1461 1462 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, 1463 const struct kvm_memory_slot *memslot) 1464 { 1465 kvm_flush_remote_tlbs(kvm); 1466 } 1467 1468 static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, 1469 struct kvm_arm_device_addr *dev_addr) 1470 { 1471 switch (FIELD_GET(KVM_ARM_DEVICE_ID_MASK, dev_addr->id)) { 1472 case KVM_ARM_DEVICE_VGIC_V2: 1473 if (!vgic_present) 1474 return -ENXIO; 1475 return kvm_set_legacy_vgic_v2_addr(kvm, dev_addr); 1476 default: 1477 return -ENODEV; 1478 } 1479 } 1480 1481 static int kvm_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) 1482 { 1483 switch (attr->group) { 1484 case KVM_ARM_VM_SMCCC_CTRL: 1485 return kvm_vm_smccc_has_attr(kvm, attr); 1486 default: 1487 return -ENXIO; 1488 } 1489 } 1490 1491 static int kvm_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) 1492 { 1493 switch (attr->group) { 1494 case KVM_ARM_VM_SMCCC_CTRL: 1495 return kvm_vm_smccc_set_attr(kvm, attr); 1496 default: 1497 return -ENXIO; 1498 } 1499 } 1500 1501 int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) 1502 { 1503 struct kvm *kvm = filp->private_data; 1504 void __user *argp = (void __user *)arg; 1505 struct kvm_device_attr attr; 1506 1507 switch (ioctl) { 1508 case KVM_CREATE_IRQCHIP: { 1509 int ret; 1510 if (!vgic_present) 1511 return -ENXIO; 1512 mutex_lock(&kvm->lock); 1513 ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); 1514 mutex_unlock(&kvm->lock); 1515 return ret; 1516 } 1517 case KVM_ARM_SET_DEVICE_ADDR: { 1518 struct kvm_arm_device_addr dev_addr; 1519 1520 if (copy_from_user(&dev_addr, argp, sizeof(dev_addr))) 1521 return -EFAULT; 1522 return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr); 1523 } 1524 case KVM_ARM_PREFERRED_TARGET: { 1525 struct kvm_vcpu_init init; 1526 1527 kvm_vcpu_preferred_target(&init); 1528 1529 if (copy_to_user(argp, &init, sizeof(init))) 1530 return -EFAULT; 1531 1532 return 0; 1533 } 1534 case KVM_ARM_MTE_COPY_TAGS: { 1535 struct kvm_arm_copy_mte_tags copy_tags; 1536 1537 if (copy_from_user(©_tags, argp, sizeof(copy_tags))) 1538 return -EFAULT; 1539 return kvm_vm_ioctl_mte_copy_tags(kvm, ©_tags); 1540 } 1541 case KVM_ARM_SET_COUNTER_OFFSET: { 1542 struct kvm_arm_counter_offset offset; 1543 1544 if (copy_from_user(&offset, argp, sizeof(offset))) 1545 return -EFAULT; 1546 return kvm_vm_ioctl_set_counter_offset(kvm, &offset); 1547 } 1548 case KVM_HAS_DEVICE_ATTR: { 1549 if (copy_from_user(&attr, argp, sizeof(attr))) 1550 return -EFAULT; 1551 1552 return kvm_vm_has_attr(kvm, &attr); 1553 } 1554 case KVM_SET_DEVICE_ATTR: { 1555 if (copy_from_user(&attr, argp, sizeof(attr))) 1556 return -EFAULT; 1557 1558 return kvm_vm_set_attr(kvm, &attr); 1559 } 1560 default: 1561 return -EINVAL; 1562 } 1563 } 1564 1565 /* unlocks vcpus from @vcpu_lock_idx and smaller */ 1566 static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx) 1567 { 1568 struct kvm_vcpu *tmp_vcpu; 1569 1570 for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { 1571 tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); 1572 mutex_unlock(&tmp_vcpu->mutex); 1573 } 1574 } 1575 1576 void unlock_all_vcpus(struct kvm *kvm) 1577 { 1578 lockdep_assert_held(&kvm->lock); 1579 1580 unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1); 1581 } 1582 1583 /* Returns true if all vcpus were locked, false otherwise */ 1584 bool lock_all_vcpus(struct kvm *kvm) 1585 { 1586 struct kvm_vcpu *tmp_vcpu; 1587 unsigned long c; 1588 1589 lockdep_assert_held(&kvm->lock); 1590 1591 /* 1592 * Any time a vcpu is in an ioctl (including running), the 1593 * core KVM code tries to grab the vcpu->mutex. 1594 * 1595 * By grabbing the vcpu->mutex of all VCPUs we ensure that no 1596 * other VCPUs can fiddle with the state while we access it. 1597 */ 1598 kvm_for_each_vcpu(c, tmp_vcpu, kvm) { 1599 if (!mutex_trylock(&tmp_vcpu->mutex)) { 1600 unlock_vcpus(kvm, c - 1); 1601 return false; 1602 } 1603 } 1604 1605 return true; 1606 } 1607 1608 static unsigned long nvhe_percpu_size(void) 1609 { 1610 return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) - 1611 (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start); 1612 } 1613 1614 static unsigned long nvhe_percpu_order(void) 1615 { 1616 unsigned long size = nvhe_percpu_size(); 1617 1618 return size ? get_order(size) : 0; 1619 } 1620 1621 /* A lookup table holding the hypervisor VA for each vector slot */ 1622 static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS]; 1623 1624 static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot) 1625 { 1626 hyp_spectre_vector_selector[slot] = __kvm_vector_slot2addr(base, slot); 1627 } 1628 1629 static int kvm_init_vector_slots(void) 1630 { 1631 int err; 1632 void *base; 1633 1634 base = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector)); 1635 kvm_init_vector_slot(base, HYP_VECTOR_DIRECT); 1636 1637 base = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs)); 1638 kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_DIRECT); 1639 1640 if (kvm_system_needs_idmapped_vectors() && 1641 !is_protected_kvm_enabled()) { 1642 err = create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs), 1643 __BP_HARDEN_HYP_VECS_SZ, &base); 1644 if (err) 1645 return err; 1646 } 1647 1648 kvm_init_vector_slot(base, HYP_VECTOR_INDIRECT); 1649 kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_INDIRECT); 1650 return 0; 1651 } 1652 1653 static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) 1654 { 1655 struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); 1656 unsigned long tcr; 1657 1658 /* 1659 * Calculate the raw per-cpu offset without a translation from the 1660 * kernel's mapping to the linear mapping, and store it in tpidr_el2 1661 * so that we can use adr_l to access per-cpu variables in EL2. 1662 * Also drop the KASAN tag which gets in the way... 1663 */ 1664 params->tpidr_el2 = (unsigned long)kasan_reset_tag(per_cpu_ptr_nvhe_sym(__per_cpu_start, cpu)) - 1665 (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start)); 1666 1667 params->mair_el2 = read_sysreg(mair_el1); 1668 1669 tcr = (read_sysreg(tcr_el1) & TCR_EL2_MASK) | TCR_EL2_RES1; 1670 tcr &= ~TCR_T0SZ_MASK; 1671 tcr |= TCR_T0SZ(hyp_va_bits); 1672 params->tcr_el2 = tcr; 1673 1674 params->pgd_pa = kvm_mmu_get_httbr(); 1675 if (is_protected_kvm_enabled()) 1676 params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS; 1677 else 1678 params->hcr_el2 = HCR_HOST_NVHE_FLAGS; 1679 params->vttbr = params->vtcr = 0; 1680 1681 /* 1682 * Flush the init params from the data cache because the struct will 1683 * be read while the MMU is off. 1684 */ 1685 kvm_flush_dcache_to_poc(params, sizeof(*params)); 1686 } 1687 1688 static void hyp_install_host_vector(void) 1689 { 1690 struct kvm_nvhe_init_params *params; 1691 struct arm_smccc_res res; 1692 1693 /* Switch from the HYP stub to our own HYP init vector */ 1694 __hyp_set_vectors(kvm_get_idmap_vector()); 1695 1696 /* 1697 * Call initialization code, and switch to the full blown HYP code. 1698 * If the cpucaps haven't been finalized yet, something has gone very 1699 * wrong, and hyp will crash and burn when it uses any 1700 * cpus_have_const_cap() wrapper. 1701 */ 1702 BUG_ON(!system_capabilities_finalized()); 1703 params = this_cpu_ptr_nvhe_sym(kvm_init_params); 1704 arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res); 1705 WARN_ON(res.a0 != SMCCC_RET_SUCCESS); 1706 } 1707 1708 static void cpu_init_hyp_mode(void) 1709 { 1710 hyp_install_host_vector(); 1711 1712 /* 1713 * Disabling SSBD on a non-VHE system requires us to enable SSBS 1714 * at EL2. 1715 */ 1716 if (this_cpu_has_cap(ARM64_SSBS) && 1717 arm64_get_spectre_v4_state() == SPECTRE_VULNERABLE) { 1718 kvm_call_hyp_nvhe(__kvm_enable_ssbs); 1719 } 1720 } 1721 1722 static void cpu_hyp_reset(void) 1723 { 1724 if (!is_kernel_in_hyp_mode()) 1725 __hyp_reset_vectors(); 1726 } 1727 1728 /* 1729 * EL2 vectors can be mapped and rerouted in a number of ways, 1730 * depending on the kernel configuration and CPU present: 1731 * 1732 * - If the CPU is affected by Spectre-v2, the hardening sequence is 1733 * placed in one of the vector slots, which is executed before jumping 1734 * to the real vectors. 1735 * 1736 * - If the CPU also has the ARM64_SPECTRE_V3A cap, the slot 1737 * containing the hardening sequence is mapped next to the idmap page, 1738 * and executed before jumping to the real vectors. 1739 * 1740 * - If the CPU only has the ARM64_SPECTRE_V3A cap, then an 1741 * empty slot is selected, mapped next to the idmap page, and 1742 * executed before jumping to the real vectors. 1743 * 1744 * Note that ARM64_SPECTRE_V3A is somewhat incompatible with 1745 * VHE, as we don't have hypervisor-specific mappings. If the system 1746 * is VHE and yet selects this capability, it will be ignored. 1747 */ 1748 static void cpu_set_hyp_vector(void) 1749 { 1750 struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); 1751 void *vector = hyp_spectre_vector_selector[data->slot]; 1752 1753 if (!is_protected_kvm_enabled()) 1754 *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector; 1755 else 1756 kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot); 1757 } 1758 1759 static void cpu_hyp_init_context(void) 1760 { 1761 kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt); 1762 1763 if (!is_kernel_in_hyp_mode()) 1764 cpu_init_hyp_mode(); 1765 } 1766 1767 static void cpu_hyp_init_features(void) 1768 { 1769 cpu_set_hyp_vector(); 1770 kvm_arm_init_debug(); 1771 1772 if (is_kernel_in_hyp_mode()) 1773 kvm_timer_init_vhe(); 1774 1775 if (vgic_present) 1776 kvm_vgic_init_cpu_hardware(); 1777 } 1778 1779 static void cpu_hyp_reinit(void) 1780 { 1781 cpu_hyp_reset(); 1782 cpu_hyp_init_context(); 1783 cpu_hyp_init_features(); 1784 } 1785 1786 static void _kvm_arch_hardware_enable(void *discard) 1787 { 1788 if (!__this_cpu_read(kvm_arm_hardware_enabled)) { 1789 cpu_hyp_reinit(); 1790 __this_cpu_write(kvm_arm_hardware_enabled, 1); 1791 } 1792 } 1793 1794 int kvm_arch_hardware_enable(void) 1795 { 1796 int was_enabled = __this_cpu_read(kvm_arm_hardware_enabled); 1797 1798 _kvm_arch_hardware_enable(NULL); 1799 1800 if (!was_enabled) { 1801 kvm_vgic_cpu_up(); 1802 kvm_timer_cpu_up(); 1803 } 1804 1805 return 0; 1806 } 1807 1808 static void _kvm_arch_hardware_disable(void *discard) 1809 { 1810 if (__this_cpu_read(kvm_arm_hardware_enabled)) { 1811 cpu_hyp_reset(); 1812 __this_cpu_write(kvm_arm_hardware_enabled, 0); 1813 } 1814 } 1815 1816 void kvm_arch_hardware_disable(void) 1817 { 1818 if (__this_cpu_read(kvm_arm_hardware_enabled)) { 1819 kvm_timer_cpu_down(); 1820 kvm_vgic_cpu_down(); 1821 } 1822 1823 if (!is_protected_kvm_enabled()) 1824 _kvm_arch_hardware_disable(NULL); 1825 } 1826 1827 #ifdef CONFIG_CPU_PM 1828 static int hyp_init_cpu_pm_notifier(struct notifier_block *self, 1829 unsigned long cmd, 1830 void *v) 1831 { 1832 /* 1833 * kvm_arm_hardware_enabled is left with its old value over 1834 * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should 1835 * re-enable hyp. 1836 */ 1837 switch (cmd) { 1838 case CPU_PM_ENTER: 1839 if (__this_cpu_read(kvm_arm_hardware_enabled)) 1840 /* 1841 * don't update kvm_arm_hardware_enabled here 1842 * so that the hardware will be re-enabled 1843 * when we resume. See below. 1844 */ 1845 cpu_hyp_reset(); 1846 1847 return NOTIFY_OK; 1848 case CPU_PM_ENTER_FAILED: 1849 case CPU_PM_EXIT: 1850 if (__this_cpu_read(kvm_arm_hardware_enabled)) 1851 /* The hardware was enabled before suspend. */ 1852 cpu_hyp_reinit(); 1853 1854 return NOTIFY_OK; 1855 1856 default: 1857 return NOTIFY_DONE; 1858 } 1859 } 1860 1861 static struct notifier_block hyp_init_cpu_pm_nb = { 1862 .notifier_call = hyp_init_cpu_pm_notifier, 1863 }; 1864 1865 static void __init hyp_cpu_pm_init(void) 1866 { 1867 if (!is_protected_kvm_enabled()) 1868 cpu_pm_register_notifier(&hyp_init_cpu_pm_nb); 1869 } 1870 static void __init hyp_cpu_pm_exit(void) 1871 { 1872 if (!is_protected_kvm_enabled()) 1873 cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb); 1874 } 1875 #else 1876 static inline void __init hyp_cpu_pm_init(void) 1877 { 1878 } 1879 static inline void __init hyp_cpu_pm_exit(void) 1880 { 1881 } 1882 #endif 1883 1884 static void __init init_cpu_logical_map(void) 1885 { 1886 unsigned int cpu; 1887 1888 /* 1889 * Copy the MPIDR <-> logical CPU ID mapping to hyp. 1890 * Only copy the set of online CPUs whose features have been checked 1891 * against the finalized system capabilities. The hypervisor will not 1892 * allow any other CPUs from the `possible` set to boot. 1893 */ 1894 for_each_online_cpu(cpu) 1895 hyp_cpu_logical_map[cpu] = cpu_logical_map(cpu); 1896 } 1897 1898 #define init_psci_0_1_impl_state(config, what) \ 1899 config.psci_0_1_ ## what ## _implemented = psci_ops.what 1900 1901 static bool __init init_psci_relay(void) 1902 { 1903 /* 1904 * If PSCI has not been initialized, protected KVM cannot install 1905 * itself on newly booted CPUs. 1906 */ 1907 if (!psci_ops.get_version) { 1908 kvm_err("Cannot initialize protected mode without PSCI\n"); 1909 return false; 1910 } 1911 1912 kvm_host_psci_config.version = psci_ops.get_version(); 1913 1914 if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) { 1915 kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids(); 1916 init_psci_0_1_impl_state(kvm_host_psci_config, cpu_suspend); 1917 init_psci_0_1_impl_state(kvm_host_psci_config, cpu_on); 1918 init_psci_0_1_impl_state(kvm_host_psci_config, cpu_off); 1919 init_psci_0_1_impl_state(kvm_host_psci_config, migrate); 1920 } 1921 return true; 1922 } 1923 1924 static int __init init_subsystems(void) 1925 { 1926 int err = 0; 1927 1928 /* 1929 * Enable hardware so that subsystem initialisation can access EL2. 1930 */ 1931 on_each_cpu(_kvm_arch_hardware_enable, NULL, 1); 1932 1933 /* 1934 * Register CPU lower-power notifier 1935 */ 1936 hyp_cpu_pm_init(); 1937 1938 /* 1939 * Init HYP view of VGIC 1940 */ 1941 err = kvm_vgic_hyp_init(); 1942 switch (err) { 1943 case 0: 1944 vgic_present = true; 1945 break; 1946 case -ENODEV: 1947 case -ENXIO: 1948 vgic_present = false; 1949 err = 0; 1950 break; 1951 default: 1952 goto out; 1953 } 1954 1955 /* 1956 * Init HYP architected timer support 1957 */ 1958 err = kvm_timer_hyp_init(vgic_present); 1959 if (err) 1960 goto out; 1961 1962 kvm_register_perf_callbacks(NULL); 1963 1964 out: 1965 if (err) 1966 hyp_cpu_pm_exit(); 1967 1968 if (err || !is_protected_kvm_enabled()) 1969 on_each_cpu(_kvm_arch_hardware_disable, NULL, 1); 1970 1971 return err; 1972 } 1973 1974 static void __init teardown_subsystems(void) 1975 { 1976 kvm_unregister_perf_callbacks(); 1977 hyp_cpu_pm_exit(); 1978 } 1979 1980 static void __init teardown_hyp_mode(void) 1981 { 1982 int cpu; 1983 1984 free_hyp_pgds(); 1985 for_each_possible_cpu(cpu) { 1986 free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); 1987 free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order()); 1988 } 1989 } 1990 1991 static int __init do_pkvm_init(u32 hyp_va_bits) 1992 { 1993 void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)); 1994 int ret; 1995 1996 preempt_disable(); 1997 cpu_hyp_init_context(); 1998 ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size, 1999 num_possible_cpus(), kern_hyp_va(per_cpu_base), 2000 hyp_va_bits); 2001 cpu_hyp_init_features(); 2002 2003 /* 2004 * The stub hypercalls are now disabled, so set our local flag to 2005 * prevent a later re-init attempt in kvm_arch_hardware_enable(). 2006 */ 2007 __this_cpu_write(kvm_arm_hardware_enabled, 1); 2008 preempt_enable(); 2009 2010 return ret; 2011 } 2012 2013 static u64 get_hyp_id_aa64pfr0_el1(void) 2014 { 2015 /* 2016 * Track whether the system isn't affected by spectre/meltdown in the 2017 * hypervisor's view of id_aa64pfr0_el1, used for protected VMs. 2018 * Although this is per-CPU, we make it global for simplicity, e.g., not 2019 * to have to worry about vcpu migration. 2020 * 2021 * Unlike for non-protected VMs, userspace cannot override this for 2022 * protected VMs. 2023 */ 2024 u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); 2025 2026 val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | 2027 ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3)); 2028 2029 val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), 2030 arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED); 2031 val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), 2032 arm64_get_meltdown_state() == SPECTRE_UNAFFECTED); 2033 2034 return val; 2035 } 2036 2037 static void kvm_hyp_init_symbols(void) 2038 { 2039 kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = get_hyp_id_aa64pfr0_el1(); 2040 kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); 2041 kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1); 2042 kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1); 2043 kvm_nvhe_sym(id_aa64isar2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1); 2044 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); 2045 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); 2046 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); 2047 kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1); 2048 kvm_nvhe_sym(__icache_flags) = __icache_flags; 2049 kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits; 2050 } 2051 2052 static int __init kvm_hyp_init_protection(u32 hyp_va_bits) 2053 { 2054 void *addr = phys_to_virt(hyp_mem_base); 2055 int ret; 2056 2057 ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP); 2058 if (ret) 2059 return ret; 2060 2061 ret = do_pkvm_init(hyp_va_bits); 2062 if (ret) 2063 return ret; 2064 2065 free_hyp_pgds(); 2066 2067 return 0; 2068 } 2069 2070 /* Inits Hyp-mode on all online CPUs */ 2071 static int __init init_hyp_mode(void) 2072 { 2073 u32 hyp_va_bits; 2074 int cpu; 2075 int err = -ENOMEM; 2076 2077 /* 2078 * The protected Hyp-mode cannot be initialized if the memory pool 2079 * allocation has failed. 2080 */ 2081 if (is_protected_kvm_enabled() && !hyp_mem_base) 2082 goto out_err; 2083 2084 /* 2085 * Allocate Hyp PGD and setup Hyp identity mapping 2086 */ 2087 err = kvm_mmu_init(&hyp_va_bits); 2088 if (err) 2089 goto out_err; 2090 2091 /* 2092 * Allocate stack pages for Hypervisor-mode 2093 */ 2094 for_each_possible_cpu(cpu) { 2095 unsigned long stack_page; 2096 2097 stack_page = __get_free_page(GFP_KERNEL); 2098 if (!stack_page) { 2099 err = -ENOMEM; 2100 goto out_err; 2101 } 2102 2103 per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page; 2104 } 2105 2106 /* 2107 * Allocate and initialize pages for Hypervisor-mode percpu regions. 2108 */ 2109 for_each_possible_cpu(cpu) { 2110 struct page *page; 2111 void *page_addr; 2112 2113 page = alloc_pages(GFP_KERNEL, nvhe_percpu_order()); 2114 if (!page) { 2115 err = -ENOMEM; 2116 goto out_err; 2117 } 2118 2119 page_addr = page_address(page); 2120 memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size()); 2121 kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr; 2122 } 2123 2124 /* 2125 * Map the Hyp-code called directly from the host 2126 */ 2127 err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start), 2128 kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC); 2129 if (err) { 2130 kvm_err("Cannot map world-switch code\n"); 2131 goto out_err; 2132 } 2133 2134 err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start), 2135 kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO); 2136 if (err) { 2137 kvm_err("Cannot map .hyp.rodata section\n"); 2138 goto out_err; 2139 } 2140 2141 err = create_hyp_mappings(kvm_ksym_ref(__start_rodata), 2142 kvm_ksym_ref(__end_rodata), PAGE_HYP_RO); 2143 if (err) { 2144 kvm_err("Cannot map rodata section\n"); 2145 goto out_err; 2146 } 2147 2148 /* 2149 * .hyp.bss is guaranteed to be placed at the beginning of the .bss 2150 * section thanks to an assertion in the linker script. Map it RW and 2151 * the rest of .bss RO. 2152 */ 2153 err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_start), 2154 kvm_ksym_ref(__hyp_bss_end), PAGE_HYP); 2155 if (err) { 2156 kvm_err("Cannot map hyp bss section: %d\n", err); 2157 goto out_err; 2158 } 2159 2160 err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_end), 2161 kvm_ksym_ref(__bss_stop), PAGE_HYP_RO); 2162 if (err) { 2163 kvm_err("Cannot map bss section\n"); 2164 goto out_err; 2165 } 2166 2167 /* 2168 * Map the Hyp stack pages 2169 */ 2170 for_each_possible_cpu(cpu) { 2171 struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); 2172 char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu); 2173 unsigned long hyp_addr; 2174 2175 /* 2176 * Allocate a contiguous HYP private VA range for the stack 2177 * and guard page. The allocation is also aligned based on 2178 * the order of its size. 2179 */ 2180 err = hyp_alloc_private_va_range(PAGE_SIZE * 2, &hyp_addr); 2181 if (err) { 2182 kvm_err("Cannot allocate hyp stack guard page\n"); 2183 goto out_err; 2184 } 2185 2186 /* 2187 * Since the stack grows downwards, map the stack to the page 2188 * at the higher address and leave the lower guard page 2189 * unbacked. 2190 * 2191 * Any valid stack address now has the PAGE_SHIFT bit as 1 2192 * and addresses corresponding to the guard page have the 2193 * PAGE_SHIFT bit as 0 - this is used for overflow detection. 2194 */ 2195 err = __create_hyp_mappings(hyp_addr + PAGE_SIZE, PAGE_SIZE, 2196 __pa(stack_page), PAGE_HYP); 2197 if (err) { 2198 kvm_err("Cannot map hyp stack\n"); 2199 goto out_err; 2200 } 2201 2202 /* 2203 * Save the stack PA in nvhe_init_params. This will be needed 2204 * to recreate the stack mapping in protected nVHE mode. 2205 * __hyp_pa() won't do the right thing there, since the stack 2206 * has been mapped in the flexible private VA space. 2207 */ 2208 params->stack_pa = __pa(stack_page); 2209 2210 params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE); 2211 } 2212 2213 for_each_possible_cpu(cpu) { 2214 char *percpu_begin = (char *)kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu]; 2215 char *percpu_end = percpu_begin + nvhe_percpu_size(); 2216 2217 /* Map Hyp percpu pages */ 2218 err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP); 2219 if (err) { 2220 kvm_err("Cannot map hyp percpu region\n"); 2221 goto out_err; 2222 } 2223 2224 /* Prepare the CPU initialization parameters */ 2225 cpu_prepare_hyp_mode(cpu, hyp_va_bits); 2226 } 2227 2228 kvm_hyp_init_symbols(); 2229 2230 if (is_protected_kvm_enabled()) { 2231 init_cpu_logical_map(); 2232 2233 if (!init_psci_relay()) { 2234 err = -ENODEV; 2235 goto out_err; 2236 } 2237 2238 err = kvm_hyp_init_protection(hyp_va_bits); 2239 if (err) { 2240 kvm_err("Failed to init hyp memory protection\n"); 2241 goto out_err; 2242 } 2243 } 2244 2245 return 0; 2246 2247 out_err: 2248 teardown_hyp_mode(); 2249 kvm_err("error initializing Hyp mode: %d\n", err); 2250 return err; 2251 } 2252 2253 struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) 2254 { 2255 struct kvm_vcpu *vcpu; 2256 unsigned long i; 2257 2258 mpidr &= MPIDR_HWID_BITMASK; 2259 kvm_for_each_vcpu(i, vcpu, kvm) { 2260 if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu)) 2261 return vcpu; 2262 } 2263 return NULL; 2264 } 2265 2266 bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) 2267 { 2268 return irqchip_in_kernel(kvm); 2269 } 2270 2271 bool kvm_arch_has_irq_bypass(void) 2272 { 2273 return true; 2274 } 2275 2276 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, 2277 struct irq_bypass_producer *prod) 2278 { 2279 struct kvm_kernel_irqfd *irqfd = 2280 container_of(cons, struct kvm_kernel_irqfd, consumer); 2281 2282 return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq, 2283 &irqfd->irq_entry); 2284 } 2285 void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, 2286 struct irq_bypass_producer *prod) 2287 { 2288 struct kvm_kernel_irqfd *irqfd = 2289 container_of(cons, struct kvm_kernel_irqfd, consumer); 2290 2291 kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq, 2292 &irqfd->irq_entry); 2293 } 2294 2295 void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons) 2296 { 2297 struct kvm_kernel_irqfd *irqfd = 2298 container_of(cons, struct kvm_kernel_irqfd, consumer); 2299 2300 kvm_arm_halt_guest(irqfd->kvm); 2301 } 2302 2303 void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons) 2304 { 2305 struct kvm_kernel_irqfd *irqfd = 2306 container_of(cons, struct kvm_kernel_irqfd, consumer); 2307 2308 kvm_arm_resume_guest(irqfd->kvm); 2309 } 2310 2311 /* Initialize Hyp-mode and memory mappings on all CPUs */ 2312 static __init int kvm_arm_init(void) 2313 { 2314 int err; 2315 bool in_hyp_mode; 2316 2317 if (!is_hyp_mode_available()) { 2318 kvm_info("HYP mode not available\n"); 2319 return -ENODEV; 2320 } 2321 2322 if (kvm_get_mode() == KVM_MODE_NONE) { 2323 kvm_info("KVM disabled from command line\n"); 2324 return -ENODEV; 2325 } 2326 2327 err = kvm_sys_reg_table_init(); 2328 if (err) { 2329 kvm_info("Error initializing system register tables"); 2330 return err; 2331 } 2332 2333 in_hyp_mode = is_kernel_in_hyp_mode(); 2334 2335 if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) || 2336 cpus_have_final_cap(ARM64_WORKAROUND_1508412)) 2337 kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \ 2338 "Only trusted guests should be used on this system.\n"); 2339 2340 err = kvm_set_ipa_limit(); 2341 if (err) 2342 return err; 2343 2344 err = kvm_arm_init_sve(); 2345 if (err) 2346 return err; 2347 2348 err = kvm_arm_vmid_alloc_init(); 2349 if (err) { 2350 kvm_err("Failed to initialize VMID allocator.\n"); 2351 return err; 2352 } 2353 2354 if (!in_hyp_mode) { 2355 err = init_hyp_mode(); 2356 if (err) 2357 goto out_err; 2358 } 2359 2360 err = kvm_init_vector_slots(); 2361 if (err) { 2362 kvm_err("Cannot initialise vector slots\n"); 2363 goto out_hyp; 2364 } 2365 2366 err = init_subsystems(); 2367 if (err) 2368 goto out_hyp; 2369 2370 if (is_protected_kvm_enabled()) { 2371 kvm_info("Protected nVHE mode initialized successfully\n"); 2372 } else if (in_hyp_mode) { 2373 kvm_info("VHE mode initialized successfully\n"); 2374 } else { 2375 kvm_info("Hyp mode initialized successfully\n"); 2376 } 2377 2378 /* 2379 * FIXME: Do something reasonable if kvm_init() fails after pKVM 2380 * hypervisor protection is finalized. 2381 */ 2382 err = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); 2383 if (err) 2384 goto out_subs; 2385 2386 return 0; 2387 2388 out_subs: 2389 teardown_subsystems(); 2390 out_hyp: 2391 if (!in_hyp_mode) 2392 teardown_hyp_mode(); 2393 out_err: 2394 kvm_arm_vmid_alloc_free(); 2395 return err; 2396 } 2397 2398 static int __init early_kvm_mode_cfg(char *arg) 2399 { 2400 if (!arg) 2401 return -EINVAL; 2402 2403 if (strcmp(arg, "none") == 0) { 2404 kvm_mode = KVM_MODE_NONE; 2405 return 0; 2406 } 2407 2408 if (!is_hyp_mode_available()) { 2409 pr_warn_once("KVM is not available. Ignoring kvm-arm.mode\n"); 2410 return 0; 2411 } 2412 2413 if (strcmp(arg, "protected") == 0) { 2414 if (!is_kernel_in_hyp_mode()) 2415 kvm_mode = KVM_MODE_PROTECTED; 2416 else 2417 pr_warn_once("Protected KVM not available with VHE\n"); 2418 2419 return 0; 2420 } 2421 2422 if (strcmp(arg, "nvhe") == 0 && !WARN_ON(is_kernel_in_hyp_mode())) { 2423 kvm_mode = KVM_MODE_DEFAULT; 2424 return 0; 2425 } 2426 2427 if (strcmp(arg, "nested") == 0 && !WARN_ON(!is_kernel_in_hyp_mode())) { 2428 kvm_mode = KVM_MODE_NV; 2429 return 0; 2430 } 2431 2432 return -EINVAL; 2433 } 2434 early_param("kvm-arm.mode", early_kvm_mode_cfg); 2435 2436 enum kvm_mode kvm_get_mode(void) 2437 { 2438 return kvm_mode; 2439 } 2440 2441 module_init(kvm_arm_init); 2442