/*
 * KVM paravirt_ops implementation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 * Copyright IBM Corporation, 2007
 *   Authors: Anthony Liguori <aliguori@us.ibm.com>
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kvm_para.h>
#include <linux/cpu.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/hash.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kprobes.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>
#include <asm/idle.h>
#include <asm/apic.h>
#include <asm/apicdef.h>
#include <asm/hypervisor.h>

static int kvmapf = 1;

static int parse_no_kvmapf(char *arg)
{
	kvmapf = 0;
	return 0;
}

early_param("no-kvmapf", parse_no_kvmapf);

static int steal_acc = 1;
static int parse_no_stealacc(char *arg)
{
	steal_acc = 0;
	return 0;
}

early_param("no-steal-acc", parse_no_stealacc);

static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
static int has_steal_clock = 0;

/*
 * No need for any "IO delay" on KVM
 */
static void kvm_io_delay(void)
{
}

#define KVM_TASK_SLEEP_HASHBITS 8
#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)

struct kvm_task_sleep_node {
	struct hlist_node link;
	wait_queue_head_t wq;
	u32 token;
	int cpu;
	bool halted;
};

static struct kvm_task_sleep_head {
	spinlock_t lock;
	struct hlist_head list;
} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];

static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
						  u32 token)
{
	struct hlist_node *p;

	hlist_for_each(p, &b->list) {
		struct kvm_task_sleep_node *n =
			hlist_entry(p, typeof(*n), link);
		if (n->token == token)
			return n;
	}

	return NULL;
}

void kvm_async_pf_task_wait(u32 token)
{
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	struct kvm_task_sleep_node n, *e;
	DEFINE_WAIT(wait);
	int cpu, idle;

	cpu = get_cpu();
	idle = idle_cpu(cpu);
	put_cpu();

	spin_lock(&b->lock);
	e = _find_apf_task(b, token);
	if (e) {
		/* dummy entry exists -> wake up was delivered ahead of PF */
		hlist_del(&e->link);
		kfree(e);
		spin_unlock(&b->lock);
		return;
	}

	n.token = token;
	n.cpu = smp_processor_id();
	n.halted = idle || preempt_count() > 1;
	init_waitqueue_head(&n.wq);
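	/*
	 * Publish the node on this hash bucket so that a PAGE_READY
	 * wakeup for the same token can find it and unhash it.
	 */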
	hlist_add_head(&n.link, &b->list);
	spin_unlock(&b->lock);

	for (;;) {
		if (!n.halted)
			prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
		if (hlist_unhashed(&n.link))
			break;

		if (!n.halted) {
			local_irq_enable();
			schedule();
			local_irq_disable();
		} else {
			/*
			 * We cannot reschedule. So halt.
			 */
			native_safe_halt();
			local_irq_disable();
		}
	}
	if (!n.halted)
		finish_wait(&n.wq, &wait);

	return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);

static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
	hlist_del_init(&n->link);
	if (n->halted)
		smp_send_reschedule(n->cpu);
	else if (waitqueue_active(&n->wq))
		wake_up(&n->wq);
}

static void apf_task_wake_all(void)
{
	int i;

	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
		struct hlist_node *p, *next;
		struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
		spin_lock(&b->lock);
		hlist_for_each_safe(p, next, &b->list) {
			struct kvm_task_sleep_node *n =
				hlist_entry(p, typeof(*n), link);
			if (n->cpu == smp_processor_id())
				apf_task_wake_one(n);
		}
		spin_unlock(&b->lock);
	}
}

void kvm_async_pf_task_wake(u32 token)
{
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	struct kvm_task_sleep_node *n;

	if (token == ~0) {
		apf_task_wake_all();
		return;
	}

again:
	spin_lock(&b->lock);
	n = _find_apf_task(b, token);
	if (!n) {
		/*
		 * async PF was not yet handled.
		 * Add dummy entry for the token.
		 */
		n = kzalloc(sizeof(*n), GFP_ATOMIC);
		if (!n) {
			/*
			 * Allocation failed! Busy wait while other cpu
			 * handles async PF.
			 */
			spin_unlock(&b->lock);
			cpu_relax();
			goto again;
		}
		n->token = token;
		n->cpu = smp_processor_id();
		init_waitqueue_head(&n->wq);
		hlist_add_head(&n->link, &b->list);
	} else
		apf_task_wake_one(n);
	spin_unlock(&b->lock);
	return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);

u32 kvm_read_and_reset_pf_reason(void)
{
	u32 reason = 0;

	if (__get_cpu_var(apf_reason).enabled) {
		reason = __get_cpu_var(apf_reason).reason;
		__get_cpu_var(apf_reason).reason = 0;
	}

	return reason;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);

dotraplinkage void __kprobes
do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	switch (kvm_read_and_reset_pf_reason()) {
	default:
		do_page_fault(regs, error_code);
		break;
	case KVM_PV_REASON_PAGE_NOT_PRESENT:
		/* page is swapped out by the host. */
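		/*
		 * For an async page fault CR2 carries the token chosen by
		 * the host, not a guest fault address; the current context
		 * sleeps (or halts) until a matching PAGE_READY arrives.
		 */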
		rcu_irq_enter();
		exit_idle();
		kvm_async_pf_task_wait((u32)read_cr2());
		rcu_irq_exit();
		break;
	case KVM_PV_REASON_PAGE_READY:
		rcu_irq_enter();
		exit_idle();
		kvm_async_pf_task_wake((u32)read_cr2());
		rcu_irq_exit();
		break;
	}
}

static void __init paravirt_ops_setup(void)
{
	pv_info.name = "KVM";
	pv_info.paravirt_enabled = 1;

	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
		pv_cpu_ops.io_delay = kvm_io_delay;

#ifdef CONFIG_X86_IO_APIC
	no_timer_check = 1;
#endif
}

static void kvm_register_steal_time(void)
{
	int cpu = smp_processor_id();
	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);

	if (!has_steal_clock)
		return;

	memset(st, 0, sizeof(*st));

	wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED));
	printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n",
		cpu, __pa(st));
}

static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;

static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
{
	/**
	 * This relies on __test_and_clear_bit to modify the memory
	 * in a way that is atomic with respect to the local CPU.
	 * The hypervisor only accesses this memory from the local CPU so
	 * there's no need for lock or memory barriers.
	 * An optimization barrier is implied in apic write.
	 */
	if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi)))
		return;
	apic_write(APIC_EOI, APIC_EOI_ACK);
}

void __cpuinit kvm_guest_cpu_init(void)
{
	if (!kvm_para_available())
		return;

	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
		u64 pa = __pa(&__get_cpu_var(apf_reason));

#ifdef CONFIG_PREEMPT
		pa |= KVM_ASYNC_PF_SEND_ALWAYS;
#endif
		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
		__get_cpu_var(apf_reason).enabled = 1;
		printk(KERN_INFO"KVM setup async PF for cpu %d\n",
		       smp_processor_id());
	}

	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
		unsigned long pa;
		/* Size alignment is implied but just to make it explicit. */
		BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
		__get_cpu_var(kvm_apic_eoi) = 0;
		pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED;
		wrmsrl(MSR_KVM_PV_EOI_EN, pa);
	}

	if (has_steal_clock)
		kvm_register_steal_time();
}

static void kvm_pv_disable_apf(void)
{
	if (!__get_cpu_var(apf_reason).enabled)
		return;

	wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
	__get_cpu_var(apf_reason).enabled = 0;

	printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
	       smp_processor_id());
}

static void kvm_pv_guest_cpu_reboot(void *unused)
{
	/*
	 * We disable PV EOI before we load a new kernel by kexec,
	 * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
	 * New kernel can re-enable when it boots.
	 */
	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
	kvm_pv_disable_apf();
	kvm_disable_steal_time();
}

static int kvm_pv_reboot_notify(struct notifier_block *nb,
				unsigned long code, void *unused)
{
	if (code == SYS_RESTART)
		on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
	return NOTIFY_DONE;
}

static struct notifier_block kvm_pv_reboot_nb = {
	.notifier_call = kvm_pv_reboot_notify,
};

static u64 kvm_steal_clock(int cpu)
{
	u64 steal;
	struct kvm_steal_time *src;
	int version;

	src = &per_cpu(steal_time, cpu);
	do {
		version = src->version;
		rmb();
		steal = src->steal;
		rmb();
	} while ((version & 1) || (version != src->version));

	return steal;
}

void kvm_disable_steal_time(void)
{
	if (!has_steal_clock)
		return;

	wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
}

#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
	WARN_ON(kvm_register_clock("primary cpu clock"));
	kvm_guest_cpu_init();
	native_smp_prepare_boot_cpu();
}

static void __cpuinit kvm_guest_cpu_online(void *dummy)
{
	kvm_guest_cpu_init();
}

static void kvm_guest_cpu_offline(void *dummy)
{
	kvm_disable_steal_time();
	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
	kvm_pv_disable_apf();
	apf_task_wake_all();
}

static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
				    unsigned long action, void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	switch (action) {
	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
	case CPU_ONLINE_FROZEN:
		smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_notify,
};
#endif

static void __init kvm_apf_trap_init(void)
{
	set_intr_gate(14, &async_page_fault);
}

void __init kvm_guest_init(void)
{
	int i;

	if (!kvm_para_available())
		return;

	paravirt_ops_setup();
	register_reboot_notifier(&kvm_pv_reboot_nb);
	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
		spin_lock_init(&async_pf_sleepers[i].lock);
	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
		x86_init.irqs.trap_init = kvm_apf_trap_init;

	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
		has_steal_clock = 1;
		pv_time_ops.steal_clock = kvm_steal_clock;
	}

	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		apic_set_eoi_write(kvm_guest_apic_eoi_write);

#ifdef CONFIG_SMP
	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
	register_cpu_notifier(&kvm_cpu_notifier);
#else
	kvm_guest_cpu_init();
#endif
}

static bool __init kvm_detect(void)
{
	if (!kvm_para_available())
		return false;
	return true;
}

const struct hypervisor_x86 x86_hyper_kvm __refconst = {
	.name	= "KVM",
	.detect	= kvm_detect,
};
EXPORT_SYMBOL_GPL(x86_hyper_kvm);

static __init int activate_jump_labels(void)
{
	if (has_steal_clock) {
		static_key_slow_inc(&paravirt_steal_enabled);
		if (steal_acc)
			static_key_slow_inc(&paravirt_steal_rq_enabled);
	}

	return 0;
}
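/*
 * Run from an initcall: by this point kvm_guest_init() has already set
 * has_steal_clock, and steal_acc reflects the "no-steal-acc" early
 * parameter parsed above.
 */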
arch_initcall(activate_jump_labels);