// SPDX-License-Identifier: GPL-2.0
/*
 * Xen SMP support
 *
 * This file implements the Xen versions of smp_ops.  SMP under Xen is
 * very straightforward.  Bringing a CPU up is simply a matter of
 * loading its initial context and setting it running.
 *
 * IPIs are handled through the Xen event mechanism.
 *
 * Because virtual CPUs can be scheduled onto any real CPU, there's no
 * useful topology information for the kernel to make use of.  As a
 * result, all CPUs are treated as if they're single-core and
 * single-threaded.
 */
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/irq_work.h>
#include <linux/tick.h>
#include <linux/nmi.h>
#include <linux/cpuhotplug.h>

#include <asm/paravirt.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/cpu.h>

#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/xenpmu.h>

#include <asm/spec-ctrl.h>
#include <asm/xen/interface.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/events.h>

#include <xen/hvc-console.h>
#include "xen-ops.h"
#include "mmu.h"
#include "smp.h"
#include "pmu.h"

cpumask_var_t xen_cpu_initialized_map;

static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 };

static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id);

static void cpu_bringup(void)
{
        int cpu;

        cpu_init();
        touch_softlockup_watchdog();
        preempt_disable();

        /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */
        if (!xen_feature(XENFEAT_supervisor_mode_kernel)) {
                xen_enable_sysenter();
                xen_enable_syscall();
        }
        cpu = smp_processor_id();
        smp_store_cpu_info(cpu);
        cpu_data(cpu).x86_max_cores = 1;
        set_cpu_sibling_map(cpu);

        speculative_store_bypass_ht_init();

        xen_setup_cpu_clockevents();

        notify_cpu_starting(cpu);

        set_cpu_online(cpu, true);

        cpu_set_state_online(cpu); /* Implies full memory barrier. */

        /* We can take interrupts now: we're officially "up". */
        local_irq_enable();
}

asmlinkage __visible void cpu_bringup_and_idle(void)
{
        cpu_bringup();
        cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}

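/*
 * PV-specific per-CPU interrupts: the irq-work IPI and, when is_xen_pmu()
 * reports that the Xen PMU is in use for this CPU, the VIRQ_XENPMU
 * interrupt.  xen_smp_intr_init_pv() binds them and xen_smp_intr_free_pv()
 * tears them down again.
 */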
void xen_smp_intr_free_pv(unsigned int cpu)
{
        if (per_cpu(xen_irq_work, cpu).irq >= 0) {
                unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL);
                per_cpu(xen_irq_work, cpu).irq = -1;
                kfree(per_cpu(xen_irq_work, cpu).name);
                per_cpu(xen_irq_work, cpu).name = NULL;
        }

        if (per_cpu(xen_pmu_irq, cpu).irq >= 0) {
                unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL);
                per_cpu(xen_pmu_irq, cpu).irq = -1;
                kfree(per_cpu(xen_pmu_irq, cpu).name);
                per_cpu(xen_pmu_irq, cpu).name = NULL;
        }
}

int xen_smp_intr_init_pv(unsigned int cpu)
{
        int rc;
        char *callfunc_name, *pmu_name;

        callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu);
        rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR,
                                    cpu,
                                    xen_irq_work_interrupt,
                                    IRQF_PERCPU|IRQF_NOBALANCING,
                                    callfunc_name,
                                    NULL);
        if (rc < 0)
                goto fail;
        per_cpu(xen_irq_work, cpu).irq = rc;
        per_cpu(xen_irq_work, cpu).name = callfunc_name;

        if (is_xen_pmu(cpu)) {
                pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
                rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
                                             xen_pmu_irq_handler,
                                             IRQF_PERCPU|IRQF_NOBALANCING,
                                             pmu_name, NULL);
                if (rc < 0)
                        goto fail;
                per_cpu(xen_pmu_irq, cpu).irq = rc;
                per_cpu(xen_pmu_irq, cpu).name = pmu_name;
        }

        return 0;

fail:
        xen_smp_intr_free_pv(cpu);
        return rc;
}

static void __init xen_fill_possible_map(void)
{
        int i, rc;

        if (xen_initial_domain())
                return;

        for (i = 0; i < nr_cpu_ids; i++) {
                rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
                if (rc >= 0) {
                        num_processors++;
                        set_cpu_possible(i, true);
                }
        }
}

static void __init xen_filter_cpu_maps(void)
{
        int i, rc;
        unsigned int subtract = 0;

        if (!xen_initial_domain())
                return;

        num_processors = 0;
        disabled_cpus = 0;
        for (i = 0; i < nr_cpu_ids; i++) {
                rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
                if (rc >= 0) {
                        num_processors++;
                        set_cpu_possible(i, true);
                } else {
                        set_cpu_possible(i, false);
                        set_cpu_present(i, false);
                        subtract++;
                }
        }
#ifdef CONFIG_HOTPLUG_CPU
        /*
         * This is akin to using 'nr_cpus' on the Linux command line.
         * That is fine, as with 'dom0_max_vcpus=X' we can only have up
         * to X vCPUs even though nr_cpu_ids may be greater than X.
         * This is normally not a problem, except when CPU hotplug is
         * involved: then there may be more than X CPUs in the guest,
         * which cannot work because there is no hypercall to expand
         * the maximum number of vCPUs of an already running guest.
         * So cap nr_cpu_ids at X.
         */
        if (subtract)
                nr_cpu_ids = nr_cpu_ids - subtract;
#endif

}

static void __init xen_pv_smp_prepare_boot_cpu(void)
{
        BUG_ON(smp_processor_id() != 0);
        native_smp_prepare_boot_cpu();

        if (!xen_feature(XENFEAT_writable_page_tables))
                /*
                 * We've switched to the "real" per-cpu gdt, so make
                 * sure the old memory can be recycled.
                 */
                make_lowmem_page_readwrite(xen_initial_gdt);

#ifdef CONFIG_X86_32
        /*
         * Xen starts us with XEN_FLAT_RING1_DS, but linux code
         * expects __USER_DS
         */
        loadsegment(ds, __USER_DS);
        loadsegment(es, __USER_DS);
#endif

        xen_filter_cpu_maps();
        xen_setup_vcpu_info_placement();

        /*
         * The alternative logic (which patches the unlock/lock) runs before
         * the SMP bootup code is activated.  Hence we need to set this up
         * before the core kernel is patched; otherwise we would have only
         * modules patched but not core code.
         */
        xen_init_spinlocks();
}

static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
{
        unsigned int cpu;
        unsigned int i;

        if (skip_ioapic_setup) {
                char *m = (max_cpus == 0) ?
                        "The nosmp parameter is incompatible with Xen; " \
                        "use Xen dom0_max_vcpus=1 parameter" :
                        "The noapic parameter is incompatible with Xen";

                xen_raw_printk(m);
                panic(m);
        }
        xen_init_lock_cpu(0);

        smp_store_boot_cpu_info();
        cpu_data(0).x86_max_cores = 1;

        for_each_possible_cpu(i) {
                zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
                zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
                zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
        }
        set_cpu_sibling_map(0);

        speculative_store_bypass_ht_init();

        xen_pmu_init(0);

        if (xen_smp_intr_init(0) || xen_smp_intr_init_pv(0))
                BUG();

        if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL))
                panic("could not allocate xen_cpu_initialized_map\n");

        cpumask_copy(xen_cpu_initialized_map, cpumask_of(0));

        /* Restrict the possible_map according to max_cpus. */
        while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
                for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
                        continue;
                set_cpu_possible(cpu, false);
        }

        for_each_possible_cpu(cpu)
                set_cpu_present(cpu, true);
}

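/*
 * Build the initial vcpu_guest_context for a secondary vCPU (entry point,
 * stack, segment selectors, GDT frame, event/failsafe callbacks and CR3)
 * and register it with the hypervisor via VCPUOP_initialise.  The vCPU is
 * only set running later, by the VCPUOP_up call in xen_pv_cpu_up().
 */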
static int
cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
{
        struct vcpu_guest_context *ctxt;
        struct desc_struct *gdt;
        unsigned long gdt_mfn;

        /* used to tell cpu_init() that it can proceed with initialization */
        cpumask_set_cpu(cpu, cpu_callout_mask);
        if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
                return 0;

        ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
        if (ctxt == NULL)
                return -ENOMEM;

        gdt = get_cpu_gdt_rw(cpu);

#ifdef CONFIG_X86_32
        ctxt->user_regs.fs = __KERNEL_PERCPU;
        ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
#endif
        memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));

        /*
         * Bring up the CPU in cpu_bringup_and_idle() with the stack
         * pointing just below where pt_regs would be if it were a normal
         * kernel entry.
         */
        ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
        ctxt->flags = VGCF_IN_KERNEL;
        ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
        ctxt->user_regs.ds = __USER_DS;
        ctxt->user_regs.es = __USER_DS;
        ctxt->user_regs.ss = __KERNEL_DS;
        ctxt->user_regs.cs = __KERNEL_CS;
        ctxt->user_regs.esp = (unsigned long)task_pt_regs(idle);

        xen_copy_trap_info(ctxt->trap_ctxt);

        ctxt->ldt_ents = 0;

        BUG_ON((unsigned long)gdt & ~PAGE_MASK);

        gdt_mfn = arbitrary_virt_to_mfn(gdt);
        make_lowmem_page_readonly(gdt);
        make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));

        ctxt->gdt_frames[0] = gdt_mfn;
        ctxt->gdt_ents = GDT_ENTRIES;

        /*
         * Set SS:SP that Xen will use when entering guest kernel mode
         * from guest user mode.  Subsequent calls to load_sp0() can
         * change this value.
         */
        ctxt->kernel_ss = __KERNEL_DS;
        ctxt->kernel_sp = task_top_of_stack(idle);

#ifdef CONFIG_X86_32
        ctxt->event_callback_cs = __KERNEL_CS;
        ctxt->failsafe_callback_cs = __KERNEL_CS;
#else
        ctxt->gs_base_kernel = per_cpu_offset(cpu);
#endif
        ctxt->event_callback_eip =
                (unsigned long)xen_hypervisor_callback;
        ctxt->failsafe_callback_eip =
                (unsigned long)xen_failsafe_callback;
        per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);

        ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
        if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt))
                BUG();

        kfree(ctxt);
        return 0;
}

static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle)
{
        int rc;

        common_cpu_up(cpu, idle);

        xen_setup_runstate_info(cpu);

        /*
         * PV VCPUs are always successfully taken down (see 'while' loop
         * in xen_pv_cpu_die()), so -EBUSY is an error.
         */
        rc = cpu_check_up_prepare(cpu);
        if (rc)
                return rc;

        /* make sure interrupts start blocked */
        per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;

        rc = cpu_initialize_context(cpu, idle);
        if (rc)
                return rc;

        xen_pmu_init(cpu);

        rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL);
        BUG_ON(rc);

        while (cpu_report_state(cpu) != CPU_ONLINE)
                HYPERVISOR_sched_op(SCHEDOP_yield, NULL);

        return 0;
}

#ifdef CONFIG_HOTPLUG_CPU
static int xen_pv_cpu_disable(void)
{
        unsigned int cpu = smp_processor_id();

        if (cpu == 0)
                return -EBUSY;

        cpu_disable_common();

        load_cr3(swapper_pg_dir);
        return 0;
}

static void xen_pv_cpu_die(unsigned int cpu)
{
        while (HYPERVISOR_vcpu_op(VCPUOP_is_up,
                                  xen_vcpu_nr(cpu), NULL)) {
                __set_current_state(TASK_UNINTERRUPTIBLE);
                schedule_timeout(HZ/10);
        }

        if (common_cpu_die(cpu) == 0) {
                xen_smp_intr_free(cpu);
                xen_uninit_lock_cpu(cpu);
                xen_teardown_timer(cpu);
                xen_pmu_finish(cpu);
        }
}

static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */
{
        play_dead_common();
        HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL);
        cpu_bringup();
        /*
         * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down)
         * clears certain data that the cpu_idle loop (which called us
         * and which we return from) expects.  The only way to get that
         * data back is to call:
         */
        tick_nohz_idle_enter();
        tick_nohz_idle_stop_tick_protected();

        cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE);
}

#else /* !CONFIG_HOTPLUG_CPU */
static int xen_pv_cpu_disable(void)
{
        return -ENOSYS;
}

static void xen_pv_cpu_die(unsigned int cpu)
{
        BUG();
}

static void xen_pv_play_dead(void)
{
        BUG();
}

#endif

static void stop_self(void *v)
{
        int cpu = smp_processor_id();

        /* make sure we're not pinning something down */
        load_cr3(swapper_pg_dir);
        /* should set up a minimal gdt */

        set_cpu_online(cpu, false);

        HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL);
        BUG();
}

static void xen_pv_stop_other_cpus(int wait)
{
        smp_call_function(stop_self, NULL, wait);
}

static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
{
        irq_enter();
        irq_work_run();
        inc_irq_stat(apic_irq_work_irqs);
        irq_exit();

        return IRQ_HANDLED;
}

static const struct smp_ops xen_smp_ops __initconst = {
        .smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu,
        .smp_prepare_cpus = xen_pv_smp_prepare_cpus,
        .smp_cpus_done = xen_smp_cpus_done,

        .cpu_up = xen_pv_cpu_up,
        .cpu_die = xen_pv_cpu_die,
        .cpu_disable = xen_pv_cpu_disable,
        .play_dead = xen_pv_play_dead,

        .stop_other_cpus = xen_pv_stop_other_cpus,
        .smp_send_reschedule = xen_smp_send_reschedule,

        .send_call_func_ipi = xen_smp_send_call_function_ipi,
        .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
};

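/*
 * Install the PV smp_ops.  For domU guests this also populates the
 * possible-CPU map from the vCPUs the hypervisor knows about; dom0
 * is handled later, in xen_filter_cpu_maps().
 */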
void __init xen_smp_init(void)
{
        smp_ops = xen_smp_ops;
        xen_fill_possible_map();
}