/*  KVM paravirtual clock driver. A clocksource implementation
    Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include <linux/clocksource.h>
#include <linux/kvm_para.h>
#include <asm/pvclock.h>
#include <asm/msr.h>
#include <asm/apic.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/cpuhotplug.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/mm.h>
#include <linux/slab.h>

#include <asm/hypervisor.h>
#include <asm/mem_encrypt.h>
#include <asm/x86_init.h>
#include <asm/reboot.h>
#include <asm/kvmclock.h>

static int kvmclock __initdata = 1;
static int kvmclock_vsyscall __initdata = 1;
static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME;
static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK;
static u64 kvm_sched_clock_offset __ro_after_init;

static int __init parse_no_kvmclock(char *arg)
{
        kvmclock = 0;
        return 0;
}
early_param("no-kvmclock", parse_no_kvmclock);

static int __init parse_no_kvmclock_vsyscall(char *arg)
{
        kvmclock_vsyscall = 0;
        return 0;
}
early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);

/* Aligned to page sizes to match what's mapped via vsyscalls to userspace */
#define HV_CLOCK_SIZE   (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS)
#define HVC_BOOT_ARRAY_SIZE \
        (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))

static struct pvclock_vsyscall_time_info
                        hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE);
static struct pvclock_wall_clock wall_clock;
static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);

static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
{
        return &this_cpu_read(hv_clock_per_cpu)->pvti;
}

static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void)
{
        return this_cpu_read(hv_clock_per_cpu);
}

/*
 * The wallclock is the time of day when we booted. Some time may have
 * elapsed since the hypervisor wrote this data, so we account for that
 * using the system time.
 */
static void kvm_get_wallclock(struct timespec64 *now)
{
        wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));
        preempt_disable();
        pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);
        preempt_enable();
}

static int kvm_set_wallclock(const struct timespec64 *now)
{
        return -ENODEV;
}

static u64 kvm_clock_read(void)
{
        u64 ret;

        preempt_disable_notrace();
        ret = pvclock_clocksource_read(this_cpu_pvti());
        preempt_enable_notrace();
        return ret;
}
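
/*
 * Illustrative sketch, not part of the original driver: roughly the
 * conversion that pvclock_clocksource_read() performs on the shared
 * pvclock_vcpu_time_info record. The host publishes a TSC snapshot
 * (tsc_timestamp) paired with a nanosecond value (system_time), plus a
 * 32.32 fixed-point multiplier (tsc_to_system_mul) and a power-of-two
 * shift, so the guest can turn a raw TSC delta into nanoseconds without
 * a VM exit. The real helper additionally retries while the version
 * field is odd and honours the flags; the function name here is made up
 * for illustration only.
 */
static inline u64 kvmclock_example_cycles_to_ns(const struct pvclock_vcpu_time_info *pvti,
                                                u64 tsc)
{
        u64 delta = tsc - pvti->tsc_timestamp;

        /* Pre-scale the delta by the host-provided power-of-two shift. */
        if (pvti->tsc_shift >= 0)
                delta <<= pvti->tsc_shift;
        else
                delta >>= -pvti->tsc_shift;

        /* Apply the 32.32 multiplier; mul_u64_u32_shr() is from <linux/math64.h>. */
        return pvti->system_time + mul_u64_u32_shr(delta, pvti->tsc_to_system_mul, 32);
}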

static u64 kvm_clock_get_cycles(struct clocksource *cs)
{
        return kvm_clock_read();
}

static u64 kvm_sched_clock_read(void)
{
        return kvm_clock_read() - kvm_sched_clock_offset;
}

static inline void kvm_sched_clock_init(bool stable)
{
        if (!stable) {
                pv_time_ops.sched_clock = kvm_clock_read;
                clear_sched_clock_stable();
                return;
        }

        kvm_sched_clock_offset = kvm_clock_read();
        pv_time_ops.sched_clock = kvm_sched_clock_read;

        pr_info("kvm-clock: using sched offset of %llu cycles\n",
                kvm_sched_clock_offset);

        BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
                     sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
}

/*
 * If we don't preset lpj, the guest may calibrate its delay loop while the
 * host is under heavy load - thus getting a lower lpj - and then execute
 * delays on an unloaded host, finishing too early; a delay loop must never
 * complete before the requested time. Any calibration heuristic is liable
 * to fail, because ultimately a large pool of guests can be running and
 * disturbing each other. So we preset lpj here.
 */
static unsigned long kvm_get_tsc_khz(void)
{
        setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
        return pvclock_tsc_khz(this_cpu_pvti());
}

static void __init kvm_get_preset_lpj(void)
{
        unsigned long khz;
        u64 lpj;

        khz = kvm_get_tsc_khz();

        lpj = ((u64)khz * 1000);
        do_div(lpj, HZ);
        preset_lpj = lpj;
}
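
/*
 * Worked example for kvm_get_preset_lpj() above (illustrative figures, not
 * measured values): a host reporting a 2000000 kHz (2 GHz) TSC to a guest
 * built with HZ=250 yields preset_lpj = 2000000 * 1000 / 250 = 8000000
 * loops per jiffy, so the boot-time delay-loop calibration - and its
 * sensitivity to host load - is skipped entirely.
 */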

bool kvm_check_and_clear_guest_paused(void)
{
        struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
        bool ret = false;

        if (!src)
                return ret;

        if ((src->pvti.flags & PVCLOCK_GUEST_STOPPED) != 0) {
                src->pvti.flags &= ~PVCLOCK_GUEST_STOPPED;
                pvclock_touch_watchdogs();
                ret = true;
        }
        return ret;
}

struct clocksource kvm_clock = {
        .name   = "kvm-clock",
        .read   = kvm_clock_get_cycles,
        .rating = 400,
        .mask   = CLOCKSOURCE_MASK(64),
        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
};
EXPORT_SYMBOL_GPL(kvm_clock);

static void kvm_register_clock(char *txt)
{
        struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
        u64 pa;

        if (!src)
                return;

        pa = slow_virt_to_phys(&src->pvti) | 0x01ULL;
        wrmsrl(msr_kvm_system_time, pa);
        pr_info("kvm-clock: cpu %d, msr %llx, %s\n", smp_processor_id(), pa, txt);
}

static void kvm_save_sched_clock_state(void)
{
}

static void kvm_restore_sched_clock_state(void)
{
        kvm_register_clock("primary cpu clock, resume");
}

#ifdef CONFIG_X86_LOCAL_APIC
static void kvm_setup_secondary_clock(void)
{
        kvm_register_clock("secondary cpu clock");
}
#endif

/*
 * After the clock is registered, the host will keep writing to the
 * registered memory location. If the guest happens to shut down, this memory
 * won't be valid. In cases like kexec, in which you install a new kernel,
 * this means a random memory location would keep being written to. So before
 * any kind of shutdown from our side, we unregister the clock by writing a
 * value that does not have the 'enable' bit set to the MSR.
 */
#ifdef CONFIG_KEXEC_CORE
static void kvm_crash_shutdown(struct pt_regs *regs)
{
        native_write_msr(msr_kvm_system_time, 0, 0);
        kvm_disable_steal_time();
        native_machine_crash_shutdown(regs);
}
#endif

static void kvm_shutdown(void)
{
        native_write_msr(msr_kvm_system_time, 0, 0);
        kvm_disable_steal_time();
        native_machine_shutdown();
}

static int __init kvm_setup_vsyscall_timeinfo(void)
{
#ifdef CONFIG_X86_64
        u8 flags;

        if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall)
                return 0;

        flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
        if (!(flags & PVCLOCK_TSC_STABLE_BIT))
                return 0;

        kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
#endif
        return 0;
}
early_initcall(kvm_setup_vsyscall_timeinfo);

static int kvmclock_setup_percpu(unsigned int cpu)
{
        struct pvclock_vsyscall_time_info *p = per_cpu(hv_clock_per_cpu, cpu);

        /*
         * The per-cpu area setup replicates CPU0's data to all CPU
         * pointers, so check carefully. CPU0 has already been set up
         * during early init.
         */
        if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0)))
                return 0;

        /* Use the static page for the first CPUs, allocate otherwise */
        if (cpu < HVC_BOOT_ARRAY_SIZE)
                p = &hv_clock_boot[cpu];
        else
                p = kzalloc(sizeof(*p), GFP_KERNEL);

        per_cpu(hv_clock_per_cpu, cpu) = p;
        return p ? 0 : -ENOMEM;
}

void __init kvmclock_init(void)
{
        u8 flags;

        if (!kvm_para_available() || !kvmclock)
                return;

        if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
                msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
                msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
        } else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
                return;
        }

        if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
                              kvmclock_setup_percpu, NULL) < 0) {
                return;
        }

        pr_info("kvm-clock: Using msrs %x and %x\n",
                msr_kvm_system_time, msr_kvm_wall_clock);

        this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]);
        kvm_register_clock("primary cpu clock");
        pvclock_set_pvti_cpu0_va(hv_clock_boot);

        if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
                pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);

        flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
        kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);

        x86_platform.calibrate_tsc = kvm_get_tsc_khz;
        x86_platform.calibrate_cpu = kvm_get_tsc_khz;
        x86_platform.get_wallclock = kvm_get_wallclock;
        x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
        x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
#endif
        x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
        x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
        machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC_CORE
        machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
        kvm_get_preset_lpj();
        clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
        pv_info.name = "KVM";
}
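
/*
 * Illustrative note, not part of the original file: once kvmclock_init() has
 * run and the clocksource core has made its rating-based selection, the
 * active source can be checked from inside the guest via
 * /sys/devices/system/clocksource/clocksource0/current_clocksource, which
 * should report "kvm-clock" (the rating of 400 above normally beats the raw
 * TSC clocksource's rating of 300).
 */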