// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/bug.h>
#include <linux/cpu_pm.h>
#include <linux/entry-kvm.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/kvm_host.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/kvm.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <linux/sched/stat.h>
#include <linux/psci.h>
#include <trace/events/kvm.h>

#define CREATE_TRACE_POINTS
#include "trace_arm.h"

#include <linux/uaccess.h>
#include <asm/ptrace.h>
#include <asm/mman.h>
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>
#include <asm/cpufeature.h>
#include <asm/virt.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pkvm.h>
#include <asm/kvm_emulate.h>
#include <asm/sections.h>

#include <kvm/arm_hypercalls.h>
#include <kvm/arm_pmu.h>
#include <kvm/arm_psci.h>

static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT;

DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);

DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);

static bool vgic_present;

static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);

int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
{
	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
}

int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
			    struct kvm_enable_cap *cap)
{
	int r;

	if (cap->flags)
		return -EINVAL;

	switch (cap->cap) {
	case KVM_CAP_ARM_NISV_TO_USER:
		r = 0;
		set_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER,
			&kvm->arch.flags);
		break;
	case KVM_CAP_ARM_MTE:
		mutex_lock(&kvm->lock);
		if (!system_supports_mte() || kvm->created_vcpus) {
			r = -EINVAL;
		} else {
			r = 0;
			set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags);
		}
		mutex_unlock(&kvm->lock);
		break;
	case KVM_CAP_ARM_SYSTEM_SUSPEND:
		r = 0;
		set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
		break;
	default:
		r = -EINVAL;
		break;
	}

	return r;
}

static int kvm_arm_default_max_vcpus(void)
{
	return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
}

static void set_default_spectre(struct kvm *kvm)
{
	/*
	 * The default is to expose CSV2 == 1 if the HW isn't affected.
	 * Although this is a per-CPU feature, we make it global because
	 * asymmetric systems are just a nuisance.
	 *
	 * Userspace can override this as long as it doesn't promise
	 * the impossible.
114 */ 115 if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) 116 kvm->arch.pfr0_csv2 = 1; 117 if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) 118 kvm->arch.pfr0_csv3 = 1; 119 } 120 121 /** 122 * kvm_arch_init_vm - initializes a VM data structure 123 * @kvm: pointer to the KVM struct 124 */ 125 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) 126 { 127 int ret; 128 129 ret = kvm_share_hyp(kvm, kvm + 1); 130 if (ret) 131 return ret; 132 133 ret = pkvm_init_host_vm(kvm); 134 if (ret) 135 goto err_unshare_kvm; 136 137 if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL_ACCOUNT)) { 138 ret = -ENOMEM; 139 goto err_unshare_kvm; 140 } 141 cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask); 142 143 ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu, type); 144 if (ret) 145 goto err_free_cpumask; 146 147 kvm_vgic_early_init(kvm); 148 149 /* The maximum number of VCPUs is limited by the host's GIC model */ 150 kvm->max_vcpus = kvm_arm_default_max_vcpus(); 151 152 set_default_spectre(kvm); 153 kvm_arm_init_hypercalls(kvm); 154 155 /* 156 * Initialise the default PMUver before there is a chance to 157 * create an actual PMU. 158 */ 159 kvm->arch.dfr0_pmuver.imp = kvm_arm_pmu_get_pmuver_limit(); 160 161 return 0; 162 163 err_free_cpumask: 164 free_cpumask_var(kvm->arch.supported_cpus); 165 err_unshare_kvm: 166 kvm_unshare_hyp(kvm, kvm + 1); 167 return ret; 168 } 169 170 vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) 171 { 172 return VM_FAULT_SIGBUS; 173 } 174 175 176 /** 177 * kvm_arch_destroy_vm - destroy the VM data structure 178 * @kvm: pointer to the KVM struct 179 */ 180 void kvm_arch_destroy_vm(struct kvm *kvm) 181 { 182 bitmap_free(kvm->arch.pmu_filter); 183 free_cpumask_var(kvm->arch.supported_cpus); 184 185 kvm_vgic_destroy(kvm); 186 187 if (is_protected_kvm_enabled()) 188 pkvm_destroy_hyp_vm(kvm); 189 190 kvm_destroy_vcpus(kvm); 191 192 kvm_unshare_hyp(kvm, kvm + 1); 193 } 194 195 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) 196 { 197 int r; 198 switch (ext) { 199 case KVM_CAP_IRQCHIP: 200 r = vgic_present; 201 break; 202 case KVM_CAP_IOEVENTFD: 203 case KVM_CAP_DEVICE_CTRL: 204 case KVM_CAP_USER_MEMORY: 205 case KVM_CAP_SYNC_MMU: 206 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 207 case KVM_CAP_ONE_REG: 208 case KVM_CAP_ARM_PSCI: 209 case KVM_CAP_ARM_PSCI_0_2: 210 case KVM_CAP_READONLY_MEM: 211 case KVM_CAP_MP_STATE: 212 case KVM_CAP_IMMEDIATE_EXIT: 213 case KVM_CAP_VCPU_EVENTS: 214 case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2: 215 case KVM_CAP_ARM_NISV_TO_USER: 216 case KVM_CAP_ARM_INJECT_EXT_DABT: 217 case KVM_CAP_SET_GUEST_DEBUG: 218 case KVM_CAP_VCPU_ATTRIBUTES: 219 case KVM_CAP_PTP_KVM: 220 case KVM_CAP_ARM_SYSTEM_SUSPEND: 221 case KVM_CAP_IRQFD_RESAMPLE: 222 r = 1; 223 break; 224 case KVM_CAP_SET_GUEST_DEBUG2: 225 return KVM_GUESTDBG_VALID_MASK; 226 case KVM_CAP_ARM_SET_DEVICE_ADDR: 227 r = 1; 228 break; 229 case KVM_CAP_NR_VCPUS: 230 /* 231 * ARM64 treats KVM_CAP_NR_CPUS differently from all other 232 * architectures, as it does not always bound it to 233 * KVM_CAP_MAX_VCPUS. It should not matter much because 234 * this is just an advisory value. 
235 */ 236 r = min_t(unsigned int, num_online_cpus(), 237 kvm_arm_default_max_vcpus()); 238 break; 239 case KVM_CAP_MAX_VCPUS: 240 case KVM_CAP_MAX_VCPU_ID: 241 if (kvm) 242 r = kvm->max_vcpus; 243 else 244 r = kvm_arm_default_max_vcpus(); 245 break; 246 case KVM_CAP_MSI_DEVID: 247 if (!kvm) 248 r = -EINVAL; 249 else 250 r = kvm->arch.vgic.msis_require_devid; 251 break; 252 case KVM_CAP_ARM_USER_IRQ: 253 /* 254 * 1: EL1_VTIMER, EL1_PTIMER, and PMU. 255 * (bump this number if adding more devices) 256 */ 257 r = 1; 258 break; 259 case KVM_CAP_ARM_MTE: 260 r = system_supports_mte(); 261 break; 262 case KVM_CAP_STEAL_TIME: 263 r = kvm_arm_pvtime_supported(); 264 break; 265 case KVM_CAP_ARM_EL1_32BIT: 266 r = cpus_have_const_cap(ARM64_HAS_32BIT_EL1); 267 break; 268 case KVM_CAP_GUEST_DEBUG_HW_BPS: 269 r = get_num_brps(); 270 break; 271 case KVM_CAP_GUEST_DEBUG_HW_WPS: 272 r = get_num_wrps(); 273 break; 274 case KVM_CAP_ARM_PMU_V3: 275 r = kvm_arm_support_pmu_v3(); 276 break; 277 case KVM_CAP_ARM_INJECT_SERROR_ESR: 278 r = cpus_have_const_cap(ARM64_HAS_RAS_EXTN); 279 break; 280 case KVM_CAP_ARM_VM_IPA_SIZE: 281 r = get_kvm_ipa_limit(); 282 break; 283 case KVM_CAP_ARM_SVE: 284 r = system_supports_sve(); 285 break; 286 case KVM_CAP_ARM_PTRAUTH_ADDRESS: 287 case KVM_CAP_ARM_PTRAUTH_GENERIC: 288 r = system_has_full_ptr_auth(); 289 break; 290 default: 291 r = 0; 292 } 293 294 return r; 295 } 296 297 long kvm_arch_dev_ioctl(struct file *filp, 298 unsigned int ioctl, unsigned long arg) 299 { 300 return -EINVAL; 301 } 302 303 struct kvm *kvm_arch_alloc_vm(void) 304 { 305 size_t sz = sizeof(struct kvm); 306 307 if (!has_vhe()) 308 return kzalloc(sz, GFP_KERNEL_ACCOUNT); 309 310 return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO); 311 } 312 313 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) 314 { 315 if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) 316 return -EBUSY; 317 318 if (id >= kvm->max_vcpus) 319 return -EINVAL; 320 321 return 0; 322 } 323 324 int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) 325 { 326 int err; 327 328 /* Force users to call KVM_ARM_VCPU_INIT */ 329 vcpu->arch.target = -1; 330 bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); 331 332 vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO; 333 334 /* 335 * Default value for the FP state, will be overloaded at load 336 * time if we support FP (pretty likely) 337 */ 338 vcpu->arch.fp_state = FP_STATE_FREE; 339 340 /* Set up the timer */ 341 kvm_timer_vcpu_init(vcpu); 342 343 kvm_pmu_vcpu_init(vcpu); 344 345 kvm_arm_reset_debug_ptr(vcpu); 346 347 kvm_arm_pvtime_vcpu_init(&vcpu->arch); 348 349 vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; 350 351 err = kvm_vgic_vcpu_init(vcpu); 352 if (err) 353 return err; 354 355 return kvm_share_hyp(vcpu, vcpu + 1); 356 } 357 358 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 359 { 360 } 361 362 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 363 { 364 if (vcpu_has_run_once(vcpu) && unlikely(!irqchip_in_kernel(vcpu->kvm))) 365 static_branch_dec(&userspace_irqchip_in_use); 366 367 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); 368 kvm_timer_vcpu_terminate(vcpu); 369 kvm_pmu_vcpu_destroy(vcpu); 370 371 kvm_arm_vcpu_destroy(vcpu); 372 } 373 374 void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) 375 { 376 377 } 378 379 void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) 380 { 381 382 } 383 384 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 385 { 386 struct kvm_s2_mmu *mmu; 387 int *last_ran; 388 389 mmu = vcpu->arch.hw_mmu; 390 last_ran = 
this_cpu_ptr(mmu->last_vcpu_ran); 391 392 /* 393 * We guarantee that both TLBs and I-cache are private to each 394 * vcpu. If detecting that a vcpu from the same VM has 395 * previously run on the same physical CPU, call into the 396 * hypervisor code to nuke the relevant contexts. 397 * 398 * We might get preempted before the vCPU actually runs, but 399 * over-invalidation doesn't affect correctness. 400 */ 401 if (*last_ran != vcpu->vcpu_id) { 402 kvm_call_hyp(__kvm_flush_cpu_context, mmu); 403 *last_ran = vcpu->vcpu_id; 404 } 405 406 vcpu->cpu = cpu; 407 408 kvm_vgic_load(vcpu); 409 kvm_timer_vcpu_load(vcpu); 410 if (has_vhe()) 411 kvm_vcpu_load_sysregs_vhe(vcpu); 412 kvm_arch_vcpu_load_fp(vcpu); 413 kvm_vcpu_pmu_restore_guest(vcpu); 414 if (kvm_arm_is_pvtime_enabled(&vcpu->arch)) 415 kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu); 416 417 if (single_task_running()) 418 vcpu_clear_wfx_traps(vcpu); 419 else 420 vcpu_set_wfx_traps(vcpu); 421 422 if (vcpu_has_ptrauth(vcpu)) 423 vcpu_ptrauth_disable(vcpu); 424 kvm_arch_vcpu_load_debug_state_flags(vcpu); 425 426 if (!cpumask_test_cpu(smp_processor_id(), vcpu->kvm->arch.supported_cpus)) 427 vcpu_set_on_unsupported_cpu(vcpu); 428 } 429 430 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 431 { 432 kvm_arch_vcpu_put_debug_state_flags(vcpu); 433 kvm_arch_vcpu_put_fp(vcpu); 434 if (has_vhe()) 435 kvm_vcpu_put_sysregs_vhe(vcpu); 436 kvm_timer_vcpu_put(vcpu); 437 kvm_vgic_put(vcpu); 438 kvm_vcpu_pmu_restore_host(vcpu); 439 kvm_arm_vmid_clear_active(); 440 441 vcpu_clear_on_unsupported_cpu(vcpu); 442 vcpu->cpu = -1; 443 } 444 445 void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu) 446 { 447 vcpu->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; 448 kvm_make_request(KVM_REQ_SLEEP, vcpu); 449 kvm_vcpu_kick(vcpu); 450 } 451 452 bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu) 453 { 454 return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_STOPPED; 455 } 456 457 static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu) 458 { 459 vcpu->arch.mp_state.mp_state = KVM_MP_STATE_SUSPENDED; 460 kvm_make_request(KVM_REQ_SUSPEND, vcpu); 461 kvm_vcpu_kick(vcpu); 462 } 463 464 static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu) 465 { 466 return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_SUSPENDED; 467 } 468 469 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 470 struct kvm_mp_state *mp_state) 471 { 472 *mp_state = vcpu->arch.mp_state; 473 474 return 0; 475 } 476 477 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 478 struct kvm_mp_state *mp_state) 479 { 480 int ret = 0; 481 482 switch (mp_state->mp_state) { 483 case KVM_MP_STATE_RUNNABLE: 484 vcpu->arch.mp_state = *mp_state; 485 break; 486 case KVM_MP_STATE_STOPPED: 487 kvm_arm_vcpu_power_off(vcpu); 488 break; 489 case KVM_MP_STATE_SUSPENDED: 490 kvm_arm_vcpu_suspend(vcpu); 491 break; 492 default: 493 ret = -EINVAL; 494 } 495 496 return ret; 497 } 498 499 /** 500 * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled 501 * @v: The VCPU pointer 502 * 503 * If the guest CPU is not waiting for interrupts or an interrupt line is 504 * asserted, the CPU is by definition runnable. 
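 *
 * Return: 1 if the vCPU can be scheduled (runnable), 0 otherwise.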
505 */ 506 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) 507 { 508 bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF); 509 return ((irq_lines || kvm_vgic_vcpu_pending_irq(v)) 510 && !kvm_arm_vcpu_stopped(v) && !v->arch.pause); 511 } 512 513 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) 514 { 515 return vcpu_mode_priv(vcpu); 516 } 517 518 #ifdef CONFIG_GUEST_PERF_EVENTS 519 unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) 520 { 521 return *vcpu_pc(vcpu); 522 } 523 #endif 524 525 static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) 526 { 527 return vcpu->arch.target >= 0; 528 } 529 530 /* 531 * Handle both the initialisation that is being done when the vcpu is 532 * run for the first time, as well as the updates that must be 533 * performed each time we get a new thread dealing with this vcpu. 534 */ 535 int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) 536 { 537 struct kvm *kvm = vcpu->kvm; 538 int ret; 539 540 if (!kvm_vcpu_initialized(vcpu)) 541 return -ENOEXEC; 542 543 if (!kvm_arm_vcpu_is_finalized(vcpu)) 544 return -EPERM; 545 546 ret = kvm_arch_vcpu_run_map_fp(vcpu); 547 if (ret) 548 return ret; 549 550 if (likely(vcpu_has_run_once(vcpu))) 551 return 0; 552 553 kvm_arm_vcpu_init_debug(vcpu); 554 555 if (likely(irqchip_in_kernel(kvm))) { 556 /* 557 * Map the VGIC hardware resources before running a vcpu the 558 * first time on this VM. 559 */ 560 ret = kvm_vgic_map_resources(kvm); 561 if (ret) 562 return ret; 563 } 564 565 ret = kvm_timer_enable(vcpu); 566 if (ret) 567 return ret; 568 569 ret = kvm_arm_pmu_v3_enable(vcpu); 570 if (ret) 571 return ret; 572 573 if (is_protected_kvm_enabled()) { 574 ret = pkvm_create_hyp_vm(kvm); 575 if (ret) 576 return ret; 577 } 578 579 if (!irqchip_in_kernel(kvm)) { 580 /* 581 * Tell the rest of the code that there are userspace irqchip 582 * VMs in the wild. 583 */ 584 static_branch_inc(&userspace_irqchip_in_use); 585 } 586 587 /* 588 * Initialize traps for protected VMs. 589 * NOTE: Move to run in EL2 directly, rather than via a hypercall, once 590 * the code is in place for first run initialization at EL2. 591 */ 592 if (kvm_vm_is_protected(kvm)) 593 kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu); 594 595 mutex_lock(&kvm->lock); 596 set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags); 597 mutex_unlock(&kvm->lock); 598 599 return ret; 600 } 601 602 bool kvm_arch_intc_initialized(struct kvm *kvm) 603 { 604 return vgic_initialized(kvm); 605 } 606 607 void kvm_arm_halt_guest(struct kvm *kvm) 608 { 609 unsigned long i; 610 struct kvm_vcpu *vcpu; 611 612 kvm_for_each_vcpu(i, vcpu, kvm) 613 vcpu->arch.pause = true; 614 kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP); 615 } 616 617 void kvm_arm_resume_guest(struct kvm *kvm) 618 { 619 unsigned long i; 620 struct kvm_vcpu *vcpu; 621 622 kvm_for_each_vcpu(i, vcpu, kvm) { 623 vcpu->arch.pause = false; 624 __kvm_vcpu_wake_up(vcpu); 625 } 626 } 627 628 static void kvm_vcpu_sleep(struct kvm_vcpu *vcpu) 629 { 630 struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); 631 632 rcuwait_wait_event(wait, 633 (!kvm_arm_vcpu_stopped(vcpu)) && (!vcpu->arch.pause), 634 TASK_INTERRUPTIBLE); 635 636 if (kvm_arm_vcpu_stopped(vcpu) || vcpu->arch.pause) { 637 /* Awaken to handle a signal, request we sleep again later. */ 638 kvm_make_request(KVM_REQ_SLEEP, vcpu); 639 } 640 641 /* 642 * Make sure we will observe a potential reset request if we've 643 * observed a change to the power state. Pairs with the smp_wmb() in 644 * kvm_psci_vcpu_on(). 
645 */ 646 smp_rmb(); 647 } 648 649 /** 650 * kvm_vcpu_wfi - emulate Wait-For-Interrupt behavior 651 * @vcpu: The VCPU pointer 652 * 653 * Suspend execution of a vCPU until a valid wake event is detected, i.e. until 654 * the vCPU is runnable. The vCPU may or may not be scheduled out, depending 655 * on when a wake event arrives, e.g. there may already be a pending wake event. 656 */ 657 void kvm_vcpu_wfi(struct kvm_vcpu *vcpu) 658 { 659 /* 660 * Sync back the state of the GIC CPU interface so that we have 661 * the latest PMR and group enables. This ensures that 662 * kvm_arch_vcpu_runnable has up-to-date data to decide whether 663 * we have pending interrupts, e.g. when determining if the 664 * vCPU should block. 665 * 666 * For the same reason, we want to tell GICv4 that we need 667 * doorbells to be signalled, should an interrupt become pending. 668 */ 669 preempt_disable(); 670 kvm_vgic_vmcr_sync(vcpu); 671 vgic_v4_put(vcpu, true); 672 preempt_enable(); 673 674 kvm_vcpu_halt(vcpu); 675 vcpu_clear_flag(vcpu, IN_WFIT); 676 677 preempt_disable(); 678 vgic_v4_load(vcpu); 679 preempt_enable(); 680 } 681 682 static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu) 683 { 684 if (!kvm_arm_vcpu_suspended(vcpu)) 685 return 1; 686 687 kvm_vcpu_wfi(vcpu); 688 689 /* 690 * The suspend state is sticky; we do not leave it until userspace 691 * explicitly marks the vCPU as runnable. Request that we suspend again 692 * later. 693 */ 694 kvm_make_request(KVM_REQ_SUSPEND, vcpu); 695 696 /* 697 * Check to make sure the vCPU is actually runnable. If so, exit to 698 * userspace informing it of the wakeup condition. 699 */ 700 if (kvm_arch_vcpu_runnable(vcpu)) { 701 memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); 702 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_WAKEUP; 703 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; 704 return 0; 705 } 706 707 /* 708 * Otherwise, we were unblocked to process a different event, such as a 709 * pending signal. Return 1 and allow kvm_arch_vcpu_ioctl_run() to 710 * process the event. 711 */ 712 return 1; 713 } 714 715 /** 716 * check_vcpu_requests - check and handle pending vCPU requests 717 * @vcpu: the VCPU pointer 718 * 719 * Return: 1 if we should enter the guest 720 * 0 if we should exit to userspace 721 * < 0 if we should exit to userspace, where the return value indicates 722 * an error 723 */ 724 static int check_vcpu_requests(struct kvm_vcpu *vcpu) 725 { 726 if (kvm_request_pending(vcpu)) { 727 if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) 728 kvm_vcpu_sleep(vcpu); 729 730 if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) 731 kvm_reset_vcpu(vcpu); 732 733 /* 734 * Clear IRQ_PENDING requests that were made to guarantee 735 * that a VCPU sees new virtual interrupts. 
		 */
		kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);

		if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
			kvm_update_stolen_time(vcpu);

		if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
			/* The distributor enable bits were changed */
			preempt_disable();
			vgic_v4_put(vcpu, false);
			vgic_v4_load(vcpu);
			preempt_enable();
		}

		if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
			kvm_pmu_handle_pmcr(vcpu,
					    __vcpu_sys_reg(vcpu, PMCR_EL0));

		if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
			return kvm_vcpu_suspend(vcpu);

		if (kvm_dirty_ring_check_request(vcpu))
			return 0;
	}

	return 1;
}

static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu)
{
	if (likely(!vcpu_mode_is_32bit(vcpu)))
		return false;

	return !kvm_supports_32bit_el0();
}

/**
 * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest
 * @vcpu: The VCPU pointer
 * @ret: Pointer to write optional return code
 *
 * Returns: true if the VCPU needs to return to a preemptible + interruptible
 *	    kernel context and skip guest entry.
 *
 * This function disambiguates between two different types of exits: exits to a
 * preemptible + interruptible kernel context and exits to userspace. For an
 * exit to userspace, this function will write the return code to ret and return
 * true. For an exit to preemptible + interruptible kernel context (i.e. check
 * for pending work and re-enter), return true without writing to ret.
 */
static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret)
{
	struct kvm_run *run = vcpu->run;

	/*
	 * If we're using a userspace irqchip, then check if we need
	 * to tell a userspace irqchip about timer or PMU level
	 * changes and if so, exit to userspace (the actual level
	 * state gets updated in kvm_timer_update_run and
	 * kvm_pmu_update_run below).
	 */
	if (static_branch_unlikely(&userspace_irqchip_in_use)) {
		if (kvm_timer_should_notify_user(vcpu) ||
		    kvm_pmu_should_notify_user(vcpu)) {
			*ret = -EINTR;
			run->exit_reason = KVM_EXIT_INTR;
			return true;
		}
	}

	if (unlikely(vcpu_on_unsupported_cpu(vcpu))) {
		run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		run->fail_entry.hardware_entry_failure_reason = KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED;
		run->fail_entry.cpu = smp_processor_id();
		*ret = 0;
		return true;
	}

	return kvm_request_pending(vcpu) ||
			xfer_to_guest_mode_work_pending();
}

/*
 * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while
 * the vCPU is running.
 *
 * This must be noinstr as instrumentation may make use of RCU, and this is not
 * safe during the EQS.
 */
static int noinstr kvm_arm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
	int ret;

	guest_state_enter_irqoff();
	ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu);
	guest_state_exit_irqoff();

	return ret;
}

/**
 * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
 * @vcpu: The VCPU pointer
 *
 * This function is called through the VCPU_RUN ioctl called from user space. It
 * will execute VM code in a loop until the time slice for the process is used
 * or some emulation is needed from user space in which case the function will
 * return with return value 0 and with the kvm_run structure filled in with the
 * required data for the requested emulation.
845 */ 846 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) 847 { 848 struct kvm_run *run = vcpu->run; 849 int ret; 850 851 if (run->exit_reason == KVM_EXIT_MMIO) { 852 ret = kvm_handle_mmio_return(vcpu); 853 if (ret) 854 return ret; 855 } 856 857 vcpu_load(vcpu); 858 859 if (run->immediate_exit) { 860 ret = -EINTR; 861 goto out; 862 } 863 864 kvm_sigset_activate(vcpu); 865 866 ret = 1; 867 run->exit_reason = KVM_EXIT_UNKNOWN; 868 run->flags = 0; 869 while (ret > 0) { 870 /* 871 * Check conditions before entering the guest 872 */ 873 ret = xfer_to_guest_mode_handle_work(vcpu); 874 if (!ret) 875 ret = 1; 876 877 if (ret > 0) 878 ret = check_vcpu_requests(vcpu); 879 880 /* 881 * Preparing the interrupts to be injected also 882 * involves poking the GIC, which must be done in a 883 * non-preemptible context. 884 */ 885 preempt_disable(); 886 887 /* 888 * The VMID allocator only tracks active VMIDs per 889 * physical CPU, and therefore the VMID allocated may not be 890 * preserved on VMID roll-over if the task was preempted, 891 * making a thread's VMID inactive. So we need to call 892 * kvm_arm_vmid_update() in non-premptible context. 893 */ 894 kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid); 895 896 kvm_pmu_flush_hwstate(vcpu); 897 898 local_irq_disable(); 899 900 kvm_vgic_flush_hwstate(vcpu); 901 902 kvm_pmu_update_vcpu_events(vcpu); 903 904 /* 905 * Ensure we set mode to IN_GUEST_MODE after we disable 906 * interrupts and before the final VCPU requests check. 907 * See the comment in kvm_vcpu_exiting_guest_mode() and 908 * Documentation/virt/kvm/vcpu-requests.rst 909 */ 910 smp_store_mb(vcpu->mode, IN_GUEST_MODE); 911 912 if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) { 913 vcpu->mode = OUTSIDE_GUEST_MODE; 914 isb(); /* Ensure work in x_flush_hwstate is committed */ 915 kvm_pmu_sync_hwstate(vcpu); 916 if (static_branch_unlikely(&userspace_irqchip_in_use)) 917 kvm_timer_sync_user(vcpu); 918 kvm_vgic_sync_hwstate(vcpu); 919 local_irq_enable(); 920 preempt_enable(); 921 continue; 922 } 923 924 kvm_arm_setup_debug(vcpu); 925 kvm_arch_vcpu_ctxflush_fp(vcpu); 926 927 /************************************************************** 928 * Enter the guest 929 */ 930 trace_kvm_entry(*vcpu_pc(vcpu)); 931 guest_timing_enter_irqoff(); 932 933 ret = kvm_arm_vcpu_enter_exit(vcpu); 934 935 vcpu->mode = OUTSIDE_GUEST_MODE; 936 vcpu->stat.exits++; 937 /* 938 * Back from guest 939 *************************************************************/ 940 941 kvm_arm_clear_debug(vcpu); 942 943 /* 944 * We must sync the PMU state before the vgic state so 945 * that the vgic can properly sample the updated state of the 946 * interrupt line. 947 */ 948 kvm_pmu_sync_hwstate(vcpu); 949 950 /* 951 * Sync the vgic state before syncing the timer state because 952 * the timer code needs to know if the virtual timer 953 * interrupts are active. 954 */ 955 kvm_vgic_sync_hwstate(vcpu); 956 957 /* 958 * Sync the timer hardware state before enabling interrupts as 959 * we don't want vtimer interrupts to race with syncing the 960 * timer virtual interrupt state. 961 */ 962 if (static_branch_unlikely(&userspace_irqchip_in_use)) 963 kvm_timer_sync_user(vcpu); 964 965 kvm_arch_vcpu_ctxsync_fp(vcpu); 966 967 /* 968 * We must ensure that any pending interrupts are taken before 969 * we exit guest timing so that timer ticks are accounted as 970 * guest time. Transiently unmask interrupts so that any 971 * pending interrupts are taken. 
972 * 973 * Per ARM DDI 0487G.b section D1.13.4, an ISB (or other 974 * context synchronization event) is necessary to ensure that 975 * pending interrupts are taken. 976 */ 977 if (ARM_EXCEPTION_CODE(ret) == ARM_EXCEPTION_IRQ) { 978 local_irq_enable(); 979 isb(); 980 local_irq_disable(); 981 } 982 983 guest_timing_exit_irqoff(); 984 985 local_irq_enable(); 986 987 trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); 988 989 /* Exit types that need handling before we can be preempted */ 990 handle_exit_early(vcpu, ret); 991 992 preempt_enable(); 993 994 /* 995 * The ARMv8 architecture doesn't give the hypervisor 996 * a mechanism to prevent a guest from dropping to AArch32 EL0 997 * if implemented by the CPU. If we spot the guest in such 998 * state and that we decided it wasn't supposed to do so (like 999 * with the asymmetric AArch32 case), return to userspace with 1000 * a fatal error. 1001 */ 1002 if (vcpu_mode_is_bad_32bit(vcpu)) { 1003 /* 1004 * As we have caught the guest red-handed, decide that 1005 * it isn't fit for purpose anymore by making the vcpu 1006 * invalid. The VMM can try and fix it by issuing a 1007 * KVM_ARM_VCPU_INIT if it really wants to. 1008 */ 1009 vcpu->arch.target = -1; 1010 ret = ARM_EXCEPTION_IL; 1011 } 1012 1013 ret = handle_exit(vcpu, ret); 1014 } 1015 1016 /* Tell userspace about in-kernel device output levels */ 1017 if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { 1018 kvm_timer_update_run(vcpu); 1019 kvm_pmu_update_run(vcpu); 1020 } 1021 1022 kvm_sigset_deactivate(vcpu); 1023 1024 out: 1025 /* 1026 * In the unlikely event that we are returning to userspace 1027 * with pending exceptions or PC adjustment, commit these 1028 * adjustments in order to give userspace a consistent view of 1029 * the vcpu state. Note that this relies on __kvm_adjust_pc() 1030 * being preempt-safe on VHE. 1031 */ 1032 if (unlikely(vcpu_get_flag(vcpu, PENDING_EXCEPTION) || 1033 vcpu_get_flag(vcpu, INCREMENT_PC))) 1034 kvm_call_hyp(__kvm_adjust_pc, vcpu); 1035 1036 vcpu_put(vcpu); 1037 return ret; 1038 } 1039 1040 static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level) 1041 { 1042 int bit_index; 1043 bool set; 1044 unsigned long *hcr; 1045 1046 if (number == KVM_ARM_IRQ_CPU_IRQ) 1047 bit_index = __ffs(HCR_VI); 1048 else /* KVM_ARM_IRQ_CPU_FIQ */ 1049 bit_index = __ffs(HCR_VF); 1050 1051 hcr = vcpu_hcr(vcpu); 1052 if (level) 1053 set = test_and_set_bit(bit_index, hcr); 1054 else 1055 set = test_and_clear_bit(bit_index, hcr); 1056 1057 /* 1058 * If we didn't change anything, no need to wake up or kick other CPUs 1059 */ 1060 if (set == level) 1061 return 0; 1062 1063 /* 1064 * The vcpu irq_lines field was updated, wake up sleeping VCPUs and 1065 * trigger a world-switch round on the running physical CPU to set the 1066 * virtual IRQ/FIQ fields in the HCR appropriately. 
1067 */ 1068 kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); 1069 kvm_vcpu_kick(vcpu); 1070 1071 return 0; 1072 } 1073 1074 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, 1075 bool line_status) 1076 { 1077 u32 irq = irq_level->irq; 1078 unsigned int irq_type, vcpu_idx, irq_num; 1079 int nrcpus = atomic_read(&kvm->online_vcpus); 1080 struct kvm_vcpu *vcpu = NULL; 1081 bool level = irq_level->level; 1082 1083 irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK; 1084 vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK; 1085 vcpu_idx += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1); 1086 irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK; 1087 1088 trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level); 1089 1090 switch (irq_type) { 1091 case KVM_ARM_IRQ_TYPE_CPU: 1092 if (irqchip_in_kernel(kvm)) 1093 return -ENXIO; 1094 1095 if (vcpu_idx >= nrcpus) 1096 return -EINVAL; 1097 1098 vcpu = kvm_get_vcpu(kvm, vcpu_idx); 1099 if (!vcpu) 1100 return -EINVAL; 1101 1102 if (irq_num > KVM_ARM_IRQ_CPU_FIQ) 1103 return -EINVAL; 1104 1105 return vcpu_interrupt_line(vcpu, irq_num, level); 1106 case KVM_ARM_IRQ_TYPE_PPI: 1107 if (!irqchip_in_kernel(kvm)) 1108 return -ENXIO; 1109 1110 if (vcpu_idx >= nrcpus) 1111 return -EINVAL; 1112 1113 vcpu = kvm_get_vcpu(kvm, vcpu_idx); 1114 if (!vcpu) 1115 return -EINVAL; 1116 1117 if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS) 1118 return -EINVAL; 1119 1120 return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL); 1121 case KVM_ARM_IRQ_TYPE_SPI: 1122 if (!irqchip_in_kernel(kvm)) 1123 return -ENXIO; 1124 1125 if (irq_num < VGIC_NR_PRIVATE_IRQS) 1126 return -EINVAL; 1127 1128 return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL); 1129 } 1130 1131 return -EINVAL; 1132 } 1133 1134 static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, 1135 const struct kvm_vcpu_init *init) 1136 { 1137 unsigned int i, ret; 1138 u32 phys_target = kvm_target_cpu(); 1139 1140 if (init->target != phys_target) 1141 return -EINVAL; 1142 1143 /* 1144 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must 1145 * use the same target. 1146 */ 1147 if (vcpu->arch.target != -1 && vcpu->arch.target != init->target) 1148 return -EINVAL; 1149 1150 /* -ENOENT for unknown features, -EINVAL for invalid combinations. */ 1151 for (i = 0; i < sizeof(init->features) * 8; i++) { 1152 bool set = (init->features[i / 32] & (1 << (i % 32))); 1153 1154 if (set && i >= KVM_VCPU_MAX_FEATURES) 1155 return -ENOENT; 1156 1157 /* 1158 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must 1159 * use the same feature set. 1160 */ 1161 if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES && 1162 test_bit(i, vcpu->arch.features) != set) 1163 return -EINVAL; 1164 1165 if (set) 1166 set_bit(i, vcpu->arch.features); 1167 } 1168 1169 vcpu->arch.target = phys_target; 1170 1171 /* Now we know what it is, we can reset it. */ 1172 ret = kvm_reset_vcpu(vcpu); 1173 if (ret) { 1174 vcpu->arch.target = -1; 1175 bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); 1176 } 1177 1178 return ret; 1179 } 1180 1181 static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, 1182 struct kvm_vcpu_init *init) 1183 { 1184 int ret; 1185 1186 ret = kvm_vcpu_set_target(vcpu, init); 1187 if (ret) 1188 return ret; 1189 1190 /* 1191 * Ensure a rebooted VM will fault in RAM pages and detect if the 1192 * guest MMU is turned off and flush the caches as needed. 
	 *
	 * S2FWB enforces all memory accesses to RAM being cacheable,
	 * ensuring that the data side is always coherent. We still
	 * need to invalidate the I-cache though, as FWB does *not*
	 * imply CTR_EL0.DIC.
	 */
	if (vcpu_has_run_once(vcpu)) {
		if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
			stage2_unmap_vm(vcpu->kvm);
		else
			icache_inval_all_pou();
	}

	vcpu_reset_hcr(vcpu);
	vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT;

	/*
	 * Handle the "start in power-off" case.
	 */
	if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
		kvm_arm_vcpu_power_off(vcpu);
	else
		vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE;

	return 0;
}

static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
				 struct kvm_device_attr *attr)
{
	int ret = -ENXIO;

	switch (attr->group) {
	default:
		ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr);
		break;
	}

	return ret;
}

static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
				 struct kvm_device_attr *attr)
{
	int ret = -ENXIO;

	switch (attr->group) {
	default:
		ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr);
		break;
	}

	return ret;
}

static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
				 struct kvm_device_attr *attr)
{
	int ret = -ENXIO;

	switch (attr->group) {
	default:
		ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr);
		break;
	}

	return ret;
}

static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
				   struct kvm_vcpu_events *events)
{
	memset(events, 0, sizeof(*events));

	return __kvm_arm_vcpu_get_events(vcpu, events);
}

static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
				   struct kvm_vcpu_events *events)
{
	int i;

	/* check whether the reserved field is zero */
	for (i = 0; i < ARRAY_SIZE(events->reserved); i++)
		if (events->reserved[i])
			return -EINVAL;

	/* check whether the pad field is zero */
	for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++)
		if (events->exception.pad[i])
			return -EINVAL;

	return __kvm_arm_vcpu_set_events(vcpu, events);
}

long kvm_arch_vcpu_ioctl(struct file *filp,
			 unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;
	struct kvm_device_attr attr;
	long r;

	switch (ioctl) {
	case KVM_ARM_VCPU_INIT: {
		struct kvm_vcpu_init init;

		r = -EFAULT;
		if (copy_from_user(&init, argp, sizeof(init)))
			break;

		r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
		break;
	}
	case KVM_SET_ONE_REG:
	case KVM_GET_ONE_REG: {
		struct kvm_one_reg reg;

		r = -ENOEXEC;
		if (unlikely(!kvm_vcpu_initialized(vcpu)))
			break;

		r = -EFAULT;
		if (copy_from_user(&reg, argp, sizeof(reg)))
			break;

		/*
		 * We could owe a reset due to PSCI. Handle the pending reset
		 * here to ensure userspace register accesses are ordered after
		 * the reset.
		 */
		if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
			kvm_reset_vcpu(vcpu);

		if (ioctl == KVM_SET_ONE_REG)
			r = kvm_arm_set_reg(vcpu, &reg);
		else
			r = kvm_arm_get_reg(vcpu, &reg);
		break;
	}
	case KVM_GET_REG_LIST: {
		struct kvm_reg_list __user *user_list = argp;
		struct kvm_reg_list reg_list;
		unsigned n;

		r = -ENOEXEC;
		if (unlikely(!kvm_vcpu_initialized(vcpu)))
			break;

		r = -EPERM;
		if (!kvm_arm_vcpu_is_finalized(vcpu))
			break;

		r = -EFAULT;
		if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
			break;
		n = reg_list.n;
		reg_list.n = kvm_arm_num_regs(vcpu);
		if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
			break;
		r = -E2BIG;
		if (n < reg_list.n)
			break;
		r = kvm_arm_copy_reg_indices(vcpu, user_list->reg);
		break;
	}
	case KVM_SET_DEVICE_ATTR: {
		r = -EFAULT;
		if (copy_from_user(&attr, argp, sizeof(attr)))
			break;
		r = kvm_arm_vcpu_set_attr(vcpu, &attr);
		break;
	}
	case KVM_GET_DEVICE_ATTR: {
		r = -EFAULT;
		if (copy_from_user(&attr, argp, sizeof(attr)))
			break;
		r = kvm_arm_vcpu_get_attr(vcpu, &attr);
		break;
	}
	case KVM_HAS_DEVICE_ATTR: {
		r = -EFAULT;
		if (copy_from_user(&attr, argp, sizeof(attr)))
			break;
		r = kvm_arm_vcpu_has_attr(vcpu, &attr);
		break;
	}
	case KVM_GET_VCPU_EVENTS: {
		struct kvm_vcpu_events events;

		if (kvm_arm_vcpu_get_events(vcpu, &events))
			return -EINVAL;

		if (copy_to_user(argp, &events, sizeof(events)))
			return -EFAULT;

		return 0;
	}
	case KVM_SET_VCPU_EVENTS: {
		struct kvm_vcpu_events events;

		if (copy_from_user(&events, argp, sizeof(events)))
			return -EFAULT;

		return kvm_arm_vcpu_set_events(vcpu, &events);
	}
	case KVM_ARM_VCPU_FINALIZE: {
		int what;

		if (!kvm_vcpu_initialized(vcpu))
			return -ENOEXEC;

		if (get_user(what, (const int __user *)argp))
			return -EFAULT;

		return kvm_arm_vcpu_finalize(vcpu, what);
	}
	default:
		r = -EINVAL;
	}

	return r;
}

void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{

}

void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
					const struct kvm_memory_slot *memslot)
{
	kvm_flush_remote_tlbs(kvm);
}

static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
					struct kvm_arm_device_addr *dev_addr)
{
	switch (FIELD_GET(KVM_ARM_DEVICE_ID_MASK, dev_addr->id)) {
	case KVM_ARM_DEVICE_VGIC_V2:
		if (!vgic_present)
			return -ENXIO;
		return kvm_set_legacy_vgic_v2_addr(kvm, dev_addr);
	default:
		return -ENODEV;
	}
}

long kvm_arch_vm_ioctl(struct file *filp,
		       unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;

	switch (ioctl) {
	case KVM_CREATE_IRQCHIP: {
		int ret;
		if (!vgic_present)
			return -ENXIO;
		mutex_lock(&kvm->lock);
		ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
		mutex_unlock(&kvm->lock);
		return ret;
	}
	case KVM_ARM_SET_DEVICE_ADDR: {
		struct kvm_arm_device_addr dev_addr;

		if (copy_from_user(&dev_addr, argp, sizeof(dev_addr)))
			return -EFAULT;
		return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
	}
	case KVM_ARM_PREFERRED_TARGET: {
		struct kvm_vcpu_init init;

		kvm_vcpu_preferred_target(&init);

		if (copy_to_user(argp, &init, sizeof(init)))
			return -EFAULT;

		return 0;
	}
	case KVM_ARM_MTE_COPY_TAGS: {
		struct kvm_arm_copy_mte_tags copy_tags;

		if (copy_from_user(&copy_tags, argp, sizeof(copy_tags)))
			return -EFAULT;
		return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags);
	}
	default:
		return -EINVAL;
	}
}

static unsigned long nvhe_percpu_size(void)
{
	return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
		(unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start);
}

static unsigned long nvhe_percpu_order(void)
{
	unsigned long size = nvhe_percpu_size();

	return size ? get_order(size) : 0;
}

/* A lookup table holding the hypervisor VA for each vector slot */
static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS];

static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot)
{
	hyp_spectre_vector_selector[slot] = __kvm_vector_slot2addr(base, slot);
}

static int kvm_init_vector_slots(void)
{
	int err;
	void *base;

	base = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
	kvm_init_vector_slot(base, HYP_VECTOR_DIRECT);

	base = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs));
	kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_DIRECT);

	if (kvm_system_needs_idmapped_vectors() &&
	    !is_protected_kvm_enabled()) {
		err = create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs),
					       __BP_HARDEN_HYP_VECS_SZ, &base);
		if (err)
			return err;
	}

	kvm_init_vector_slot(base, HYP_VECTOR_INDIRECT);
	kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_INDIRECT);
	return 0;
}

static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
{
	struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
	unsigned long tcr;

	/*
	 * Calculate the raw per-cpu offset without a translation from the
	 * kernel's mapping to the linear mapping, and store it in tpidr_el2
	 * so that we can use adr_l to access per-cpu variables in EL2.
	 * Also drop the KASAN tag which gets in the way...
	 */
	params->tpidr_el2 = (unsigned long)kasan_reset_tag(per_cpu_ptr_nvhe_sym(__per_cpu_start, cpu)) -
			    (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));

	params->mair_el2 = read_sysreg(mair_el1);

	tcr = (read_sysreg(tcr_el1) & TCR_EL2_MASK) | TCR_EL2_RES1;
	tcr &= ~TCR_T0SZ_MASK;
	tcr |= TCR_T0SZ(hyp_va_bits);
	params->tcr_el2 = tcr;

	params->pgd_pa = kvm_mmu_get_httbr();
	if (is_protected_kvm_enabled())
		params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
	else
		params->hcr_el2 = HCR_HOST_NVHE_FLAGS;
	params->vttbr = params->vtcr = 0;

	/*
	 * Flush the init params from the data cache because the struct will
	 * be read while the MMU is off.
	 */
	kvm_flush_dcache_to_poc(params, sizeof(*params));
}

static void hyp_install_host_vector(void)
{
	struct kvm_nvhe_init_params *params;
	struct arm_smccc_res res;

	/* Switch from the HYP stub to our own HYP init vector */
	__hyp_set_vectors(kvm_get_idmap_vector());

	/*
	 * Call initialization code, and switch to the full blown HYP code.
1576 * If the cpucaps haven't been finalized yet, something has gone very 1577 * wrong, and hyp will crash and burn when it uses any 1578 * cpus_have_const_cap() wrapper. 1579 */ 1580 BUG_ON(!system_capabilities_finalized()); 1581 params = this_cpu_ptr_nvhe_sym(kvm_init_params); 1582 arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res); 1583 WARN_ON(res.a0 != SMCCC_RET_SUCCESS); 1584 } 1585 1586 static void cpu_init_hyp_mode(void) 1587 { 1588 hyp_install_host_vector(); 1589 1590 /* 1591 * Disabling SSBD on a non-VHE system requires us to enable SSBS 1592 * at EL2. 1593 */ 1594 if (this_cpu_has_cap(ARM64_SSBS) && 1595 arm64_get_spectre_v4_state() == SPECTRE_VULNERABLE) { 1596 kvm_call_hyp_nvhe(__kvm_enable_ssbs); 1597 } 1598 } 1599 1600 static void cpu_hyp_reset(void) 1601 { 1602 if (!is_kernel_in_hyp_mode()) 1603 __hyp_reset_vectors(); 1604 } 1605 1606 /* 1607 * EL2 vectors can be mapped and rerouted in a number of ways, 1608 * depending on the kernel configuration and CPU present: 1609 * 1610 * - If the CPU is affected by Spectre-v2, the hardening sequence is 1611 * placed in one of the vector slots, which is executed before jumping 1612 * to the real vectors. 1613 * 1614 * - If the CPU also has the ARM64_SPECTRE_V3A cap, the slot 1615 * containing the hardening sequence is mapped next to the idmap page, 1616 * and executed before jumping to the real vectors. 1617 * 1618 * - If the CPU only has the ARM64_SPECTRE_V3A cap, then an 1619 * empty slot is selected, mapped next to the idmap page, and 1620 * executed before jumping to the real vectors. 1621 * 1622 * Note that ARM64_SPECTRE_V3A is somewhat incompatible with 1623 * VHE, as we don't have hypervisor-specific mappings. If the system 1624 * is VHE and yet selects this capability, it will be ignored. 
1625 */ 1626 static void cpu_set_hyp_vector(void) 1627 { 1628 struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); 1629 void *vector = hyp_spectre_vector_selector[data->slot]; 1630 1631 if (!is_protected_kvm_enabled()) 1632 *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector; 1633 else 1634 kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot); 1635 } 1636 1637 static void cpu_hyp_init_context(void) 1638 { 1639 kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt); 1640 1641 if (!is_kernel_in_hyp_mode()) 1642 cpu_init_hyp_mode(); 1643 } 1644 1645 static void cpu_hyp_init_features(void) 1646 { 1647 cpu_set_hyp_vector(); 1648 kvm_arm_init_debug(); 1649 1650 if (is_kernel_in_hyp_mode()) 1651 kvm_timer_init_vhe(); 1652 1653 if (vgic_present) 1654 kvm_vgic_init_cpu_hardware(); 1655 } 1656 1657 static void cpu_hyp_reinit(void) 1658 { 1659 cpu_hyp_reset(); 1660 cpu_hyp_init_context(); 1661 cpu_hyp_init_features(); 1662 } 1663 1664 static void _kvm_arch_hardware_enable(void *discard) 1665 { 1666 if (!__this_cpu_read(kvm_arm_hardware_enabled)) { 1667 cpu_hyp_reinit(); 1668 __this_cpu_write(kvm_arm_hardware_enabled, 1); 1669 } 1670 } 1671 1672 int kvm_arch_hardware_enable(void) 1673 { 1674 int was_enabled = __this_cpu_read(kvm_arm_hardware_enabled); 1675 1676 _kvm_arch_hardware_enable(NULL); 1677 1678 if (!was_enabled) { 1679 kvm_vgic_cpu_up(); 1680 kvm_timer_cpu_up(); 1681 } 1682 1683 return 0; 1684 } 1685 1686 static void _kvm_arch_hardware_disable(void *discard) 1687 { 1688 if (__this_cpu_read(kvm_arm_hardware_enabled)) { 1689 cpu_hyp_reset(); 1690 __this_cpu_write(kvm_arm_hardware_enabled, 0); 1691 } 1692 } 1693 1694 void kvm_arch_hardware_disable(void) 1695 { 1696 if (__this_cpu_read(kvm_arm_hardware_enabled)) { 1697 kvm_timer_cpu_down(); 1698 kvm_vgic_cpu_down(); 1699 } 1700 1701 if (!is_protected_kvm_enabled()) 1702 _kvm_arch_hardware_disable(NULL); 1703 } 1704 1705 #ifdef CONFIG_CPU_PM 1706 static int hyp_init_cpu_pm_notifier(struct notifier_block *self, 1707 unsigned long cmd, 1708 void *v) 1709 { 1710 /* 1711 * kvm_arm_hardware_enabled is left with its old value over 1712 * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should 1713 * re-enable hyp. 1714 */ 1715 switch (cmd) { 1716 case CPU_PM_ENTER: 1717 if (__this_cpu_read(kvm_arm_hardware_enabled)) 1718 /* 1719 * don't update kvm_arm_hardware_enabled here 1720 * so that the hardware will be re-enabled 1721 * when we resume. See below. 1722 */ 1723 cpu_hyp_reset(); 1724 1725 return NOTIFY_OK; 1726 case CPU_PM_ENTER_FAILED: 1727 case CPU_PM_EXIT: 1728 if (__this_cpu_read(kvm_arm_hardware_enabled)) 1729 /* The hardware was enabled before suspend. */ 1730 cpu_hyp_reinit(); 1731 1732 return NOTIFY_OK; 1733 1734 default: 1735 return NOTIFY_DONE; 1736 } 1737 } 1738 1739 static struct notifier_block hyp_init_cpu_pm_nb = { 1740 .notifier_call = hyp_init_cpu_pm_notifier, 1741 }; 1742 1743 static void __init hyp_cpu_pm_init(void) 1744 { 1745 if (!is_protected_kvm_enabled()) 1746 cpu_pm_register_notifier(&hyp_init_cpu_pm_nb); 1747 } 1748 static void __init hyp_cpu_pm_exit(void) 1749 { 1750 if (!is_protected_kvm_enabled()) 1751 cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb); 1752 } 1753 #else 1754 static inline void __init hyp_cpu_pm_init(void) 1755 { 1756 } 1757 static inline void __init hyp_cpu_pm_exit(void) 1758 { 1759 } 1760 #endif 1761 1762 static void __init init_cpu_logical_map(void) 1763 { 1764 unsigned int cpu; 1765 1766 /* 1767 * Copy the MPIDR <-> logical CPU ID mapping to hyp. 
1768 * Only copy the set of online CPUs whose features have been checked 1769 * against the finalized system capabilities. The hypervisor will not 1770 * allow any other CPUs from the `possible` set to boot. 1771 */ 1772 for_each_online_cpu(cpu) 1773 hyp_cpu_logical_map[cpu] = cpu_logical_map(cpu); 1774 } 1775 1776 #define init_psci_0_1_impl_state(config, what) \ 1777 config.psci_0_1_ ## what ## _implemented = psci_ops.what 1778 1779 static bool __init init_psci_relay(void) 1780 { 1781 /* 1782 * If PSCI has not been initialized, protected KVM cannot install 1783 * itself on newly booted CPUs. 1784 */ 1785 if (!psci_ops.get_version) { 1786 kvm_err("Cannot initialize protected mode without PSCI\n"); 1787 return false; 1788 } 1789 1790 kvm_host_psci_config.version = psci_ops.get_version(); 1791 1792 if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) { 1793 kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids(); 1794 init_psci_0_1_impl_state(kvm_host_psci_config, cpu_suspend); 1795 init_psci_0_1_impl_state(kvm_host_psci_config, cpu_on); 1796 init_psci_0_1_impl_state(kvm_host_psci_config, cpu_off); 1797 init_psci_0_1_impl_state(kvm_host_psci_config, migrate); 1798 } 1799 return true; 1800 } 1801 1802 static int __init init_subsystems(void) 1803 { 1804 int err = 0; 1805 1806 /* 1807 * Enable hardware so that subsystem initialisation can access EL2. 1808 */ 1809 on_each_cpu(_kvm_arch_hardware_enable, NULL, 1); 1810 1811 /* 1812 * Register CPU lower-power notifier 1813 */ 1814 hyp_cpu_pm_init(); 1815 1816 /* 1817 * Init HYP view of VGIC 1818 */ 1819 err = kvm_vgic_hyp_init(); 1820 switch (err) { 1821 case 0: 1822 vgic_present = true; 1823 break; 1824 case -ENODEV: 1825 case -ENXIO: 1826 vgic_present = false; 1827 err = 0; 1828 break; 1829 default: 1830 goto out; 1831 } 1832 1833 /* 1834 * Init HYP architected timer support 1835 */ 1836 err = kvm_timer_hyp_init(vgic_present); 1837 if (err) 1838 goto out; 1839 1840 kvm_register_perf_callbacks(NULL); 1841 1842 out: 1843 if (err) 1844 hyp_cpu_pm_exit(); 1845 1846 if (err || !is_protected_kvm_enabled()) 1847 on_each_cpu(_kvm_arch_hardware_disable, NULL, 1); 1848 1849 return err; 1850 } 1851 1852 static void __init teardown_subsystems(void) 1853 { 1854 kvm_unregister_perf_callbacks(); 1855 hyp_cpu_pm_exit(); 1856 } 1857 1858 static void __init teardown_hyp_mode(void) 1859 { 1860 int cpu; 1861 1862 free_hyp_pgds(); 1863 for_each_possible_cpu(cpu) { 1864 free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); 1865 free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order()); 1866 } 1867 } 1868 1869 static int __init do_pkvm_init(u32 hyp_va_bits) 1870 { 1871 void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)); 1872 int ret; 1873 1874 preempt_disable(); 1875 cpu_hyp_init_context(); 1876 ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size, 1877 num_possible_cpus(), kern_hyp_va(per_cpu_base), 1878 hyp_va_bits); 1879 cpu_hyp_init_features(); 1880 1881 /* 1882 * The stub hypercalls are now disabled, so set our local flag to 1883 * prevent a later re-init attempt in kvm_arch_hardware_enable(). 1884 */ 1885 __this_cpu_write(kvm_arm_hardware_enabled, 1); 1886 preempt_enable(); 1887 1888 return ret; 1889 } 1890 1891 static u64 get_hyp_id_aa64pfr0_el1(void) 1892 { 1893 /* 1894 * Track whether the system isn't affected by spectre/meltdown in the 1895 * hypervisor's view of id_aa64pfr0_el1, used for protected VMs. 
1896 * Although this is per-CPU, we make it global for simplicity, e.g., not 1897 * to have to worry about vcpu migration. 1898 * 1899 * Unlike for non-protected VMs, userspace cannot override this for 1900 * protected VMs. 1901 */ 1902 u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); 1903 1904 val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | 1905 ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3)); 1906 1907 val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), 1908 arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED); 1909 val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), 1910 arm64_get_meltdown_state() == SPECTRE_UNAFFECTED); 1911 1912 return val; 1913 } 1914 1915 static void kvm_hyp_init_symbols(void) 1916 { 1917 kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = get_hyp_id_aa64pfr0_el1(); 1918 kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); 1919 kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1); 1920 kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1); 1921 kvm_nvhe_sym(id_aa64isar2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1); 1922 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); 1923 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); 1924 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); 1925 kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1); 1926 kvm_nvhe_sym(__icache_flags) = __icache_flags; 1927 kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits; 1928 } 1929 1930 static int __init kvm_hyp_init_protection(u32 hyp_va_bits) 1931 { 1932 void *addr = phys_to_virt(hyp_mem_base); 1933 int ret; 1934 1935 ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP); 1936 if (ret) 1937 return ret; 1938 1939 ret = do_pkvm_init(hyp_va_bits); 1940 if (ret) 1941 return ret; 1942 1943 free_hyp_pgds(); 1944 1945 return 0; 1946 } 1947 1948 /* Inits Hyp-mode on all online CPUs */ 1949 static int __init init_hyp_mode(void) 1950 { 1951 u32 hyp_va_bits; 1952 int cpu; 1953 int err = -ENOMEM; 1954 1955 /* 1956 * The protected Hyp-mode cannot be initialized if the memory pool 1957 * allocation has failed. 1958 */ 1959 if (is_protected_kvm_enabled() && !hyp_mem_base) 1960 goto out_err; 1961 1962 /* 1963 * Allocate Hyp PGD and setup Hyp identity mapping 1964 */ 1965 err = kvm_mmu_init(&hyp_va_bits); 1966 if (err) 1967 goto out_err; 1968 1969 /* 1970 * Allocate stack pages for Hypervisor-mode 1971 */ 1972 for_each_possible_cpu(cpu) { 1973 unsigned long stack_page; 1974 1975 stack_page = __get_free_page(GFP_KERNEL); 1976 if (!stack_page) { 1977 err = -ENOMEM; 1978 goto out_err; 1979 } 1980 1981 per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page; 1982 } 1983 1984 /* 1985 * Allocate and initialize pages for Hypervisor-mode percpu regions. 
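	 * Each possible CPU gets its own copy of the nVHE hypervisor's per-CPU
	 * section, seeded from the kernel image so that EL2 starts out with
	 * the same initial values as the host's copy.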
1986 */ 1987 for_each_possible_cpu(cpu) { 1988 struct page *page; 1989 void *page_addr; 1990 1991 page = alloc_pages(GFP_KERNEL, nvhe_percpu_order()); 1992 if (!page) { 1993 err = -ENOMEM; 1994 goto out_err; 1995 } 1996 1997 page_addr = page_address(page); 1998 memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size()); 1999 kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr; 2000 } 2001 2002 /* 2003 * Map the Hyp-code called directly from the host 2004 */ 2005 err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start), 2006 kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC); 2007 if (err) { 2008 kvm_err("Cannot map world-switch code\n"); 2009 goto out_err; 2010 } 2011 2012 err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start), 2013 kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO); 2014 if (err) { 2015 kvm_err("Cannot map .hyp.rodata section\n"); 2016 goto out_err; 2017 } 2018 2019 err = create_hyp_mappings(kvm_ksym_ref(__start_rodata), 2020 kvm_ksym_ref(__end_rodata), PAGE_HYP_RO); 2021 if (err) { 2022 kvm_err("Cannot map rodata section\n"); 2023 goto out_err; 2024 } 2025 2026 /* 2027 * .hyp.bss is guaranteed to be placed at the beginning of the .bss 2028 * section thanks to an assertion in the linker script. Map it RW and 2029 * the rest of .bss RO. 2030 */ 2031 err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_start), 2032 kvm_ksym_ref(__hyp_bss_end), PAGE_HYP); 2033 if (err) { 2034 kvm_err("Cannot map hyp bss section: %d\n", err); 2035 goto out_err; 2036 } 2037 2038 err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_end), 2039 kvm_ksym_ref(__bss_stop), PAGE_HYP_RO); 2040 if (err) { 2041 kvm_err("Cannot map bss section\n"); 2042 goto out_err; 2043 } 2044 2045 /* 2046 * Map the Hyp stack pages 2047 */ 2048 for_each_possible_cpu(cpu) { 2049 struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); 2050 char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu); 2051 unsigned long hyp_addr; 2052 2053 /* 2054 * Allocate a contiguous HYP private VA range for the stack 2055 * and guard page. The allocation is also aligned based on 2056 * the order of its size. 2057 */ 2058 err = hyp_alloc_private_va_range(PAGE_SIZE * 2, &hyp_addr); 2059 if (err) { 2060 kvm_err("Cannot allocate hyp stack guard page\n"); 2061 goto out_err; 2062 } 2063 2064 /* 2065 * Since the stack grows downwards, map the stack to the page 2066 * at the higher address and leave the lower guard page 2067 * unbacked. 2068 * 2069 * Any valid stack address now has the PAGE_SHIFT bit as 1 2070 * and addresses corresponding to the guard page have the 2071 * PAGE_SHIFT bit as 0 - this is used for overflow detection. 2072 */ 2073 err = __create_hyp_mappings(hyp_addr + PAGE_SIZE, PAGE_SIZE, 2074 __pa(stack_page), PAGE_HYP); 2075 if (err) { 2076 kvm_err("Cannot map hyp stack\n"); 2077 goto out_err; 2078 } 2079 2080 /* 2081 * Save the stack PA in nvhe_init_params. This will be needed 2082 * to recreate the stack mapping in protected nVHE mode. 2083 * __hyp_pa() won't do the right thing there, since the stack 2084 * has been mapped in the flexible private VA space. 
2085 */ 2086 params->stack_pa = __pa(stack_page); 2087 2088 params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE); 2089 } 2090 2091 for_each_possible_cpu(cpu) { 2092 char *percpu_begin = (char *)kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu]; 2093 char *percpu_end = percpu_begin + nvhe_percpu_size(); 2094 2095 /* Map Hyp percpu pages */ 2096 err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP); 2097 if (err) { 2098 kvm_err("Cannot map hyp percpu region\n"); 2099 goto out_err; 2100 } 2101 2102 /* Prepare the CPU initialization parameters */ 2103 cpu_prepare_hyp_mode(cpu, hyp_va_bits); 2104 } 2105 2106 kvm_hyp_init_symbols(); 2107 2108 if (is_protected_kvm_enabled()) { 2109 init_cpu_logical_map(); 2110 2111 if (!init_psci_relay()) { 2112 err = -ENODEV; 2113 goto out_err; 2114 } 2115 2116 err = kvm_hyp_init_protection(hyp_va_bits); 2117 if (err) { 2118 kvm_err("Failed to init hyp memory protection\n"); 2119 goto out_err; 2120 } 2121 } 2122 2123 return 0; 2124 2125 out_err: 2126 teardown_hyp_mode(); 2127 kvm_err("error initializing Hyp mode: %d\n", err); 2128 return err; 2129 } 2130 2131 struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) 2132 { 2133 struct kvm_vcpu *vcpu; 2134 unsigned long i; 2135 2136 mpidr &= MPIDR_HWID_BITMASK; 2137 kvm_for_each_vcpu(i, vcpu, kvm) { 2138 if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu)) 2139 return vcpu; 2140 } 2141 return NULL; 2142 } 2143 2144 bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) 2145 { 2146 return irqchip_in_kernel(kvm); 2147 } 2148 2149 bool kvm_arch_has_irq_bypass(void) 2150 { 2151 return true; 2152 } 2153 2154 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, 2155 struct irq_bypass_producer *prod) 2156 { 2157 struct kvm_kernel_irqfd *irqfd = 2158 container_of(cons, struct kvm_kernel_irqfd, consumer); 2159 2160 return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq, 2161 &irqfd->irq_entry); 2162 } 2163 void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, 2164 struct irq_bypass_producer *prod) 2165 { 2166 struct kvm_kernel_irqfd *irqfd = 2167 container_of(cons, struct kvm_kernel_irqfd, consumer); 2168 2169 kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq, 2170 &irqfd->irq_entry); 2171 } 2172 2173 void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons) 2174 { 2175 struct kvm_kernel_irqfd *irqfd = 2176 container_of(cons, struct kvm_kernel_irqfd, consumer); 2177 2178 kvm_arm_halt_guest(irqfd->kvm); 2179 } 2180 2181 void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons) 2182 { 2183 struct kvm_kernel_irqfd *irqfd = 2184 container_of(cons, struct kvm_kernel_irqfd, consumer); 2185 2186 kvm_arm_resume_guest(irqfd->kvm); 2187 } 2188 2189 /* Initialize Hyp-mode and memory mappings on all CPUs */ 2190 static __init int kvm_arm_init(void) 2191 { 2192 int err; 2193 bool in_hyp_mode; 2194 2195 if (!is_hyp_mode_available()) { 2196 kvm_info("HYP mode not available\n"); 2197 return -ENODEV; 2198 } 2199 2200 if (kvm_get_mode() == KVM_MODE_NONE) { 2201 kvm_info("KVM disabled from command line\n"); 2202 return -ENODEV; 2203 } 2204 2205 err = kvm_sys_reg_table_init(); 2206 if (err) { 2207 kvm_info("Error initializing system register tables"); 2208 return err; 2209 } 2210 2211 in_hyp_mode = is_kernel_in_hyp_mode(); 2212 2213 if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) || 2214 cpus_have_final_cap(ARM64_WORKAROUND_1508412)) 2215 kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \ 2216 "Only trusted guests should be used on this 
system.\n"); 2217 2218 err = kvm_set_ipa_limit(); 2219 if (err) 2220 return err; 2221 2222 err = kvm_arm_init_sve(); 2223 if (err) 2224 return err; 2225 2226 err = kvm_arm_vmid_alloc_init(); 2227 if (err) { 2228 kvm_err("Failed to initialize VMID allocator.\n"); 2229 return err; 2230 } 2231 2232 if (!in_hyp_mode) { 2233 err = init_hyp_mode(); 2234 if (err) 2235 goto out_err; 2236 } 2237 2238 err = kvm_init_vector_slots(); 2239 if (err) { 2240 kvm_err("Cannot initialise vector slots\n"); 2241 goto out_hyp; 2242 } 2243 2244 err = init_subsystems(); 2245 if (err) 2246 goto out_hyp; 2247 2248 if (is_protected_kvm_enabled()) { 2249 kvm_info("Protected nVHE mode initialized successfully\n"); 2250 } else if (in_hyp_mode) { 2251 kvm_info("VHE mode initialized successfully\n"); 2252 } else { 2253 kvm_info("Hyp mode initialized successfully\n"); 2254 } 2255 2256 /* 2257 * FIXME: Do something reasonable if kvm_init() fails after pKVM 2258 * hypervisor protection is finalized. 2259 */ 2260 err = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); 2261 if (err) 2262 goto out_subs; 2263 2264 return 0; 2265 2266 out_subs: 2267 teardown_subsystems(); 2268 out_hyp: 2269 if (!in_hyp_mode) 2270 teardown_hyp_mode(); 2271 out_err: 2272 kvm_arm_vmid_alloc_free(); 2273 return err; 2274 } 2275 2276 static int __init early_kvm_mode_cfg(char *arg) 2277 { 2278 if (!arg) 2279 return -EINVAL; 2280 2281 if (strcmp(arg, "none") == 0) { 2282 kvm_mode = KVM_MODE_NONE; 2283 return 0; 2284 } 2285 2286 if (!is_hyp_mode_available()) { 2287 pr_warn_once("KVM is not available. Ignoring kvm-arm.mode\n"); 2288 return 0; 2289 } 2290 2291 if (strcmp(arg, "protected") == 0) { 2292 if (!is_kernel_in_hyp_mode()) 2293 kvm_mode = KVM_MODE_PROTECTED; 2294 else 2295 pr_warn_once("Protected KVM not available with VHE\n"); 2296 2297 return 0; 2298 } 2299 2300 if (strcmp(arg, "nvhe") == 0 && !WARN_ON(is_kernel_in_hyp_mode())) { 2301 kvm_mode = KVM_MODE_DEFAULT; 2302 return 0; 2303 } 2304 2305 if (strcmp(arg, "nested") == 0 && !WARN_ON(!is_kernel_in_hyp_mode())) { 2306 kvm_mode = KVM_MODE_NV; 2307 return 0; 2308 } 2309 2310 return -EINVAL; 2311 } 2312 early_param("kvm-arm.mode", early_kvm_mode_cfg); 2313 2314 enum kvm_mode kvm_get_mode(void) 2315 { 2316 return kvm_mode; 2317 } 2318 2319 module_init(kvm_arm_init); 2320