// SPDX-License-Identifier: GPL-2.0
/*
 * Xen time implementation.
 *
 * This is implemented in terms of a clocksource driver which uses
 * the hypervisor clock as a nanosecond timebase, and a clockevent
 * driver which uses the hypervisor's timer mechanism.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/pvclock_gtod.h>
#include <linux/timekeeper_internal.h>

#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/cpuid.h>

#include <xen/events.h>
#include <xen/features.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

/* Minimum amount of time until next clock event fires */
#define TIMER_SLOP	100000

static u64 xen_sched_clock_offset __read_mostly;

/* Get the TSC speed from Xen */
static unsigned long xen_tsc_khz(void)
{
	struct pvclock_vcpu_time_info *info =
		&HYPERVISOR_shared_info->vcpu_info[0].time;

	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
	return pvclock_tsc_khz(info);
}

static u64 xen_clocksource_read(void)
{
	struct pvclock_vcpu_time_info *src;
	u64 ret;

	preempt_disable_notrace();
	src = &__this_cpu_read(xen_vcpu)->time;
	ret = pvclock_clocksource_read(src);
	preempt_enable_notrace();
	return ret;
}

static u64 xen_clocksource_get_cycles(struct clocksource *cs)
{
	return xen_clocksource_read();
}

static noinstr u64 xen_sched_clock(void)
{
	struct pvclock_vcpu_time_info *src;
	u64 ret;

	preempt_disable_notrace();
	src = &__this_cpu_read(xen_vcpu)->time;
	ret = pvclock_clocksource_read_nowd(src);
	ret -= xen_sched_clock_offset;
	preempt_enable_notrace();
	return ret;
}

static void xen_read_wallclock(struct timespec64 *ts)
{
	struct shared_info *s = HYPERVISOR_shared_info;
	struct pvclock_wall_clock *wall_clock = &(s->wc);
	struct pvclock_vcpu_time_info *vcpu_time;

	vcpu_time = &get_cpu_var(xen_vcpu)->time;
	pvclock_read_wallclock(wall_clock, vcpu_time, ts);
	put_cpu_var(xen_vcpu);
}

static void xen_get_wallclock(struct timespec64 *now)
{
	xen_read_wallclock(now);
}

static int xen_set_wallclock(const struct timespec64 *now)
{
	return -ENODEV;
}

static int xen_pvclock_gtod_notify(struct notifier_block *nb,
				   unsigned long was_set, void *priv)
{
	/* Protected by the calling core code serialization */
	static struct timespec64 next_sync;

	struct xen_platform_op op;
	struct timespec64 now;
	struct timekeeper *tk = priv;
	static bool settime64_supported = true;
	int ret;

	now.tv_sec = tk->xtime_sec;
	now.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
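	/*
	 * Note: the timekeeper stores tkr_mono.xtime_nsec left-shifted by
	 * tkr_mono.shift so it can carry fractional nanoseconds; shifting
	 * right above recovers whole nanoseconds (e.g. with shift == 8, a
	 * stored value of 256 corresponds to 1 ns).
	 */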
	/*
	 * We only take the expensive HV call when the clock was set
	 * or when the 11 minutes RTC synchronization time elapsed.
	 */
	if (!was_set && timespec64_compare(&now, &next_sync) < 0)
		return NOTIFY_OK;

again:
	if (settime64_supported) {
		op.cmd = XENPF_settime64;
		op.u.settime64.mbz = 0;
		op.u.settime64.secs = now.tv_sec;
		op.u.settime64.nsecs = now.tv_nsec;
		op.u.settime64.system_time = xen_clocksource_read();
	} else {
		op.cmd = XENPF_settime32;
		op.u.settime32.secs = now.tv_sec;
		op.u.settime32.nsecs = now.tv_nsec;
		op.u.settime32.system_time = xen_clocksource_read();
	}

	ret = HYPERVISOR_platform_op(&op);

	if (ret == -ENOSYS && settime64_supported) {
		settime64_supported = false;
		goto again;
	}
	if (ret < 0)
		return NOTIFY_BAD;

	/*
	 * Move the next drift compensation time 11 minutes
	 * ahead. That's emulating the sync_cmos_clock() update for
	 * the hardware RTC.
	 */
	next_sync = now;
	next_sync.tv_sec += 11 * 60;

	return NOTIFY_OK;
}

static struct notifier_block xen_pvclock_gtod_notifier = {
	.notifier_call = xen_pvclock_gtod_notify,
};

static int xen_cs_enable(struct clocksource *cs)
{
	vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK);
	return 0;
}

static struct clocksource xen_clocksource __read_mostly = {
	.name			= "xen",
	.rating			= 400,
	.read			= xen_clocksource_get_cycles,
	.mask			= CLOCKSOURCE_MASK(64),
	.flags			= CLOCK_SOURCE_IS_CONTINUOUS,
	.enable			= xen_cs_enable,
};

/*
 * Xen clockevent implementation
 *
 * Xen has two clockevent implementations:
 *
 * The old timer_op one works with all released versions of Xen prior
 * to version 3.0.4.  This version of the hypervisor provides a
 * single-shot timer with nanosecond resolution.  However, a 100Hz
 * tick, delivered while the vcpu is running, shares the same event
 * channel.  We don't care about or use this tick, but it will cause
 * the core time code to think the timer fired too soon, and will end
 * up resetting it each time.  It could be filtered, but doing so has
 * complications when the ktime clocksource is not yet the xen
 * clocksource (i.e., at boot time).
 *
 * The new vcpu_op-based timer interface allows the tick timer period
 * to be changed or turned off.  The tick timer is not useful as a
 * periodic timer because events are only delivered to running vcpus.
 * The one-shot timer can report when a timeout is in the past, so
 * set_next_event is capable of returning -ETIME when appropriate.
 * This interface is used when available.
 */

/*
 * Get a hypervisor absolute time.  In theory we could maintain an
 * offset between the kernel's time and the hypervisor's time, and
 * apply that to a kernel's absolute timeout.  Unfortunately the
 * hypervisor and kernel times can drift even if the kernel is using
 * the Xen clocksource, because ntp can warp the kernel's clocksource.
 */
static s64 get_abs_timeout(unsigned long delta)
{
	return xen_clocksource_read() + delta;
}

static int xen_timerop_shutdown(struct clock_event_device *evt)
{
	/* cancel timeout */
	HYPERVISOR_set_timer_op(0);

	return 0;
}

static int xen_timerop_set_next_event(unsigned long delta,
				      struct clock_event_device *evt)
{
	WARN_ON(!clockevent_state_oneshot(evt));

	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
		BUG();

	/*
	 * We may have missed the deadline, but there's no real way of
	 * knowing for sure.  If the event was in the past, then we'll
	 * get an immediate interrupt.
	 */

	return 0;
}
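/*
 * With .mult = 1 and .shift = 0 below, the clockevent core's
 * nanoseconds-to-ticks conversion, (delta_ns * mult) >> shift, is the
 * identity: the hypervisor is programmed directly in nanoseconds,
 * which is why the *_ns and *_ticks limits carry the same values.
 */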
static struct clock_event_device xen_timerop_clockevent __ro_after_init = {
	.name			= "xen",
	.features		= CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns		= 0xffffffff,
	.max_delta_ticks	= 0xffffffff,
	.min_delta_ns		= TIMER_SLOP,
	.min_delta_ticks	= TIMER_SLOP,

	.mult			= 1,
	.shift			= 0,
	.rating			= 500,

	.set_state_shutdown	= xen_timerop_shutdown,
	.set_next_event		= xen_timerop_set_next_event,
};

static int xen_vcpuop_shutdown(struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, xen_vcpu_nr(cpu),
			       NULL) ||
	    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
			       NULL))
		BUG();

	return 0;
}

static int xen_vcpuop_set_oneshot(struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
			       NULL))
		BUG();

	return 0;
}

static int xen_vcpuop_set_next_event(unsigned long delta,
				     struct clock_event_device *evt)
{
	int cpu = smp_processor_id();
	struct vcpu_set_singleshot_timer single;
	int ret;

	WARN_ON(!clockevent_state_oneshot(evt));

	single.timeout_abs_ns = get_abs_timeout(delta);
	/* Get an event anyway, even if the timeout is already expired */
	single.flags = 0;

	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, xen_vcpu_nr(cpu),
				 &single);
	BUG_ON(ret != 0);

	return ret;
}

static struct clock_event_device xen_vcpuop_clockevent __ro_after_init = {
	.name			= "xen",
	.features		= CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns		= 0xffffffff,
	.max_delta_ticks	= 0xffffffff,
	.min_delta_ns		= TIMER_SLOP,
	.min_delta_ticks	= TIMER_SLOP,

	.mult			= 1,
	.shift			= 0,
	.rating			= 500,

	.set_state_shutdown	= xen_vcpuop_shutdown,
	.set_state_oneshot	= xen_vcpuop_set_oneshot,
	.set_next_event		= xen_vcpuop_set_next_event,
};

static const struct clock_event_device *xen_clockevent =
	&xen_timerop_clockevent;

struct xen_clock_event_device {
	struct clock_event_device evt;
	char name[16];
};
static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 };

static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = this_cpu_ptr(&xen_clock_events.evt);
	irqreturn_t ret;

	ret = IRQ_NONE;
	if (evt->event_handler) {
		evt->event_handler(evt);
		ret = IRQ_HANDLED;
	}

	return ret;
}

void xen_teardown_timer(int cpu)
{
	struct clock_event_device *evt;

	evt = &per_cpu(xen_clock_events, cpu).evt;

	if (evt->irq >= 0) {
		unbind_from_irqhandler(evt->irq, NULL);
		evt->irq = -1;
	}
}
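/*
 * A rough note on the IRQ flags used below: IRQF_PERCPU and
 * IRQF_NOBALANCING keep the timer VIRQ pinned to its CPU, IRQF_TIMER
 * (which implies IRQF_NO_SUSPEND) keeps the tick alive across the
 * suspend path, and IRQF_FORCE_RESUME plus IRQF_EARLY_RESUME re-enable
 * it early on resume, before ordinary device interrupts.
 */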
void xen_setup_timer(int cpu)
{
	struct xen_clock_event_device *xevt = &per_cpu(xen_clock_events, cpu);
	struct clock_event_device *evt = &xevt->evt;
	int irq;

	WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n",
	     evt->irq, cpu);
	if (evt->irq >= 0)
		xen_teardown_timer(cpu);

	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

	snprintf(xevt->name, sizeof(xevt->name), "timer%d", cpu);

	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
				      IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
				      IRQF_FORCE_RESUME|IRQF_EARLY_RESUME,
				      xevt->name, NULL);
	(void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);

	memcpy(evt, xen_clockevent, sizeof(*evt));

	evt->cpumask = cpumask_of(cpu);
	evt->irq = irq;
}

void xen_setup_cpu_clockevents(void)
{
	clockevents_register_device(this_cpu_ptr(&xen_clock_events.evt));
}

void xen_timer_resume(void)
{
	int cpu;

	if (xen_clockevent != &xen_vcpuop_clockevent)
		return;

	for_each_online_cpu(cpu) {
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
				       xen_vcpu_nr(cpu), NULL))
			BUG();
	}
}

static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
static u64 xen_clock_value_saved;

void xen_save_time_memory_area(void)
{
	struct vcpu_register_time_memory_area t;
	int ret;

	xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset;

	if (!xen_clock)
		return;

	t.addr.v = NULL;

	ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
	if (ret != 0)
		pr_notice("Cannot save secondary vcpu_time_info (err %d)",
			  ret);
	else
		clear_page(xen_clock);
}

void xen_restore_time_memory_area(void)
{
	struct vcpu_register_time_memory_area t;
	int ret;

	if (!xen_clock)
		goto out;

	t.addr.v = &xen_clock->pvti;

	ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);

	/*
	 * We don't disable VDSO_CLOCKMODE_PVCLOCK entirely if we fail to
	 * register the secondary time info with Xen or if we migrated to a
	 * host without the necessary flags. In both of these cases a
	 * process sees either a zeroed-out pvti or one without
	 * PVCLOCK_TSC_STABLE_BIT set. Userspace checks the latter and, if
	 * it is 0, discards the data in pvti and falls back to a system
	 * call for a reliable timestamp.
	 */
	if (ret != 0)
		pr_notice("Cannot restore secondary vcpu_time_info (err %d)",
			  ret);

out:
	/* Need pvclock_resume() before using xen_clocksource_read(). */
	pvclock_resume();
	xen_sched_clock_offset = xen_clocksource_read() - xen_clock_value_saved;
}

static void xen_setup_vsyscall_time_info(void)
{
	struct vcpu_register_time_memory_area t;
	struct pvclock_vsyscall_time_info *ti;
	int ret;

	ti = (struct pvclock_vsyscall_time_info *)get_zeroed_page(GFP_KERNEL);
	if (!ti)
		return;

	t.addr.v = &ti->pvti;

	ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
	if (ret) {
		pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (err %d)\n", ret);
		free_page((unsigned long)ti);
		return;
	}

	/*
	 * If the primary time info had this bit set, the secondary one
	 * should too, since it is the same data in a different memory
	 * region. But we still check it in case the hypervisor is buggy.
	 */
	if (!(ti->pvti.flags & PVCLOCK_TSC_STABLE_BIT)) {
		t.addr.v = NULL;
		ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area,
					 0, &t);
		if (!ret)
			free_page((unsigned long)ti);

		pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (tsc unstable)\n");
		return;
	}

	xen_clock = ti;
	pvclock_set_pvti_cpu0_va(xen_clock);

	xen_clocksource.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
}
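/*
 * Once the page above is registered and vdso_clock_mode is set,
 * clock_gettime() can run entirely in userspace via the vDSO.  A
 * sketch of the generic pvclock read, not code from this file:
 *
 *	do {
 *		version = pvti->version;         (retry while version is odd)
 *		delta = rdtsc() - pvti->tsc_timestamp;
 *		delta = delta << tsc_shift;      (a right shift if negative)
 *		ns = pvti->system_time + ((delta * tsc_to_system_mul) >> 32);
 *	} while (version != pvti->version);
 */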
/*
 * Check if it is possible to safely use the tsc as a clocksource.  This is
 * only true if the hypervisor notifies the guest that its tsc is invariant,
 * the tsc is stable, and the tsc instruction will never be emulated.
 */
static int __init xen_tsc_safe_clocksource(void)
{
	u32 eax, ebx, ecx, edx;

	if (!(boot_cpu_has(X86_FEATURE_CONSTANT_TSC)))
		return 0;

	if (!(boot_cpu_has(X86_FEATURE_NONSTOP_TSC)))
		return 0;

	if (check_tsc_unstable())
		return 0;

	/* Leaf 4, sub-leaf 0 (0x40000x03) */
	cpuid_count(xen_cpuid_base() + 3, 0, &eax, &ebx, &ecx, &edx);

	return ebx == XEN_CPUID_TSC_MODE_NEVER_EMULATE;
}
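/*
 * For context: clocksource ratings are a selection priority (higher
 * wins).  In current kernels the native TSC clocksource registers with
 * rating 300, so the adjustments below deliberately slot "xen" below
 * it (299 or 275) instead of keeping the default rating of 400, which
 * would always beat the TSC.
 */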
static void __init xen_time_init(void)
{
	struct pvclock_vcpu_time_info *pvti;
	int cpu = smp_processor_id();
	struct timespec64 tp;

	/*
	 * As Dom0 is never migrated, there is no penalty in using the TSC
	 * there.
	 *
	 * If it is possible for the guest to determine that the tsc is a
	 * safe clocksource, then set the xen_clocksource rating below that
	 * of the tsc so that the system prefers the tsc instead.
	 */
	if (xen_initial_domain())
		xen_clocksource.rating = 275;
	else if (xen_tsc_safe_clocksource())
		xen_clocksource.rating = 299;

	clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
			       NULL) == 0) {
		/*
		 * Successfully turned off the 100Hz tick, so we have the
		 * vcpuop-based timer interface.
		 */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;
	}

	/* Set initial system time with full resolution */
	xen_read_wallclock(&tp);
	do_settimeofday64(&tp);

	setup_force_cpu_cap(X86_FEATURE_TSC);

	/*
	 * Check the primary time info ahead of time for this bit; if it is
	 * set, pvclock reads can take their fast path, speeding up the Xen
	 * clocksource.
	 */
	pvti = &__this_cpu_read(xen_vcpu)->time;
	if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) {
		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
		xen_setup_vsyscall_time_info();
	}

	xen_setup_runstate_info(cpu);
	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();

	xen_time_setup_guest();

	if (xen_initial_domain())
		pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
}

static void __init xen_init_time_common(void)
{
	xen_sched_clock_offset = xen_clocksource_read();
	static_call_update(pv_steal_clock, xen_steal_clock);
	paravirt_set_sched_clock(xen_sched_clock);

	x86_platform.calibrate_tsc = xen_tsc_khz;
	x86_platform.get_wallclock = xen_get_wallclock;
}

void __init xen_init_time_ops(void)
{
	xen_init_time_common();

	x86_init.timers.timer_init = xen_time_init;
	x86_init.timers.setup_percpu_clockev = x86_init_noop;
	x86_cpuinit.setup_percpu_clockev = x86_init_noop;

	/* Dom0 uses the native method to set the hardware RTC. */
	if (!xen_initial_domain())
		x86_platform.set_wallclock = xen_set_wallclock;
}

#ifdef CONFIG_XEN_PVHVM
static void xen_hvm_setup_cpu_clockevents(void)
{
	int cpu = smp_processor_id();

	xen_setup_runstate_info(cpu);
	/*
	 * xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence
	 * it is done in xen_hvm_cpu_notify (which gets called by smp_init
	 * during early bootup and also during CPU hotplug events).
	 */
	xen_setup_cpu_clockevents();
}

void __init xen_hvm_init_time_ops(void)
{
	static bool hvm_time_initialized;

	if (hvm_time_initialized)
		return;

	/*
	 * The vector callback is needed, otherwise we cannot receive
	 * interrupts on cpu > 0 and at this point we don't know how many
	 * cpus are available.
	 */
	if (!xen_have_vector_callback)
		return;

	if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
		pr_info_once("Xen doesn't support pvclock on HVM, disabling pv timer");
		return;
	}

	/*
	 * Only MAX_VIRT_CPUS 'vcpu_info' are embedded inside 'shared_info'.
	 * __this_cpu_read(xen_vcpu) is still NULL when a Xen HVM guest
	 * boots on a vcpu >= MAX_VIRT_CPUS (e.g., kexec), and accessing
	 * __this_cpu_read(xen_vcpu) via xen_clocksource_read() would panic.
	 *
	 * xen_hvm_init_time_ops() should be called again later, once
	 * __this_cpu_read(xen_vcpu) is available.
	 */
	if (!__this_cpu_read(xen_vcpu)) {
		pr_info("Delay xen_init_time_common() as kernel is running on vcpu=%d\n",
			xen_vcpu_nr(0));
		return;
	}

	xen_init_time_common();

	x86_init.timers.setup_percpu_clockev = xen_time_init;
	x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;

	x86_platform.set_wallclock = xen_set_wallclock;

	hvm_time_initialized = true;
}
#endif

/* Kernel parameter to specify Xen timer slop */
static int __init parse_xen_timer_slop(char *ptr)
{
	unsigned long slop = memparse(ptr, NULL);

	xen_timerop_clockevent.min_delta_ns = slop;
	xen_timerop_clockevent.min_delta_ticks = slop;
	xen_vcpuop_clockevent.min_delta_ns = slop;
	xen_vcpuop_clockevent.min_delta_ticks = slop;

	return 0;
}
early_param("xen_timer_slop", parse_xen_timer_slop);
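/*
 * For example, booting with "xen_timer_slop=50000" (a hypothetical
 * value) halves the default minimum one-shot delta (TIMER_SLOP,
 * 100000 ns = 100 us) for both clockevent devices; memparse() also
 * accepts K/M/G suffixes.
 */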