/*
 * KVM paravirt_ops implementation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 *
 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 * Copyright IBM Corporation, 2007
 *   Authors: Anthony Liguori <aliguori@us.ibm.com>
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kvm_para.h>
#include <linux/cpu.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/hash.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kprobes.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>

#define MMU_QUEUE_SIZE 1024

static int kvmapf = 1;

static int parse_no_kvmapf(char *arg)
{
	kvmapf = 0;
	return 0;
}

early_param("no-kvmapf", parse_no_kvmapf);

struct kvm_para_state {
	u8 mmu_queue[MMU_QUEUE_SIZE];
	int mmu_queue_len;
};

static DEFINE_PER_CPU(struct kvm_para_state, para_state);
static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);

static struct kvm_para_state *kvm_para_state(void)
{
	return &per_cpu(para_state, raw_smp_processor_id());
}

/*
 * No need for any "IO delay" on KVM
 */
static void kvm_io_delay(void)
{
}

#define KVM_TASK_SLEEP_HASHBITS 8
#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)

struct kvm_task_sleep_node {
	struct hlist_node link;
	wait_queue_head_t wq;
	u32 token;
	int cpu;
	bool halted;
	struct mm_struct *mm;
};

static struct kvm_task_sleep_head {
	spinlock_t lock;
	struct hlist_head list;
} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];

static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
						  u32 token)
{
	struct hlist_node *p;

	hlist_for_each(p, &b->list) {
		struct kvm_task_sleep_node *n =
			hlist_entry(p, typeof(*n), link);
		if (n->token == token)
			return n;
	}

	return NULL;
}

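/*
 * Sleep side of the async page fault protocol: the host reported (via
 * do_async_page_fault() below) that the page behind @token is not yet
 * present, so park the current task until kvm_async_pf_task_wake() is
 * called for the same token.  If the wake-up arrived first, a dummy node
 * is already hashed under the token and we return immediately.  When the
 * task cannot reschedule (idle CPU or elevated preempt count) it is
 * marked "halted" and waits in halt instead of calling schedule().
 */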
void kvm_async_pf_task_wait(u32 token)
{
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	struct kvm_task_sleep_node n, *e;
	DEFINE_WAIT(wait);
	int cpu, idle;

	cpu = get_cpu();
	idle = idle_cpu(cpu);
	put_cpu();

	spin_lock(&b->lock);
	e = _find_apf_task(b, token);
	if (e) {
		/* dummy entry exists -> wake up was delivered ahead of PF */
		hlist_del(&e->link);
		kfree(e);
		spin_unlock(&b->lock);
		return;
	}

	n.token = token;
	n.cpu = smp_processor_id();
	n.mm = current->active_mm;
	n.halted = idle || preempt_count() > 1;
	atomic_inc(&n.mm->mm_count);
	init_waitqueue_head(&n.wq);
	hlist_add_head(&n.link, &b->list);
	spin_unlock(&b->lock);

	for (;;) {
		if (!n.halted)
			prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
		if (hlist_unhashed(&n.link))
			break;

		if (!n.halted) {
			local_irq_enable();
			schedule();
			local_irq_disable();
		} else {
			/*
			 * We cannot reschedule. So halt.
			 */
			native_safe_halt();
			local_irq_disable();
		}
	}
	if (!n.halted)
		finish_wait(&n.wq, &wait);

	return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);

static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
	hlist_del_init(&n->link);
	if (!n->mm)
		return;
	mmdrop(n->mm);
	if (n->halted)
		smp_send_reschedule(n->cpu);
	else if (waitqueue_active(&n->wq))
		wake_up(&n->wq);
}

static void apf_task_wake_all(void)
{
	int i;

	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
		struct hlist_node *p, *next;
		struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
		spin_lock(&b->lock);
		hlist_for_each_safe(p, next, &b->list) {
			struct kvm_task_sleep_node *n =
				hlist_entry(p, typeof(*n), link);
			if (n->cpu == smp_processor_id())
				apf_task_wake_one(n);
		}
		spin_unlock(&b->lock);
	}
}

void kvm_async_pf_task_wake(u32 token)
{
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	struct kvm_task_sleep_node *n;

	if (token == ~0) {
		apf_task_wake_all();
		return;
	}

again:
	spin_lock(&b->lock);
	n = _find_apf_task(b, token);
	if (!n) {
		/*
		 * Async PF was not yet handled.
		 * Add a dummy entry for the token.
		 */
		n = kmalloc(sizeof(*n), GFP_ATOMIC);
		if (!n) {
			/*
			 * Allocation failed! Busy wait while another cpu
			 * handles the async PF.
			 */
			spin_unlock(&b->lock);
			cpu_relax();
			goto again;
		}
		n->token = token;
		n->cpu = smp_processor_id();
		n->mm = NULL;
		init_waitqueue_head(&n->wq);
		hlist_add_head(&n->link, &b->list);
	} else
		apf_task_wake_one(n);
	spin_unlock(&b->lock);
	return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);

u32 kvm_read_and_reset_pf_reason(void)
{
	u32 reason = 0;

	if (__get_cpu_var(apf_reason).enabled) {
		reason = __get_cpu_var(apf_reason).reason;
		__get_cpu_var(apf_reason).reason = 0;
	}

	return reason;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);

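/*
 * Replacement #PF handler, installed by kvm_apf_trap_init() when async
 * page faults are available.  The host records the reason for the fault
 * in the per-cpu apf_reason area registered through MSR_KVM_ASYNC_PF_EN;
 * a reason of zero (or anything unknown) means an ordinary page fault
 * and is passed on to do_page_fault().  For the paravirtual reasons,
 * CR2 carries the async PF token rather than a faulting address.
 */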
dotraplinkage void __kprobes
do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	switch (kvm_read_and_reset_pf_reason()) {
	default:
		do_page_fault(regs, error_code);
		break;
	case KVM_PV_REASON_PAGE_NOT_PRESENT:
		/* page is swapped out by the host. */
		kvm_async_pf_task_wait((u32)read_cr2());
		break;
	case KVM_PV_REASON_PAGE_READY:
		kvm_async_pf_task_wake((u32)read_cr2());
		break;
	}
}

static void kvm_mmu_op(void *buffer, unsigned len)
{
	int r;
	unsigned long a1, a2;

	do {
		a1 = __pa(buffer);
		a2 = 0;   /* on i386 __pa() always returns <4G */
		r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2);
		buffer += r;
		len -= r;
	} while (len);
}

static void mmu_queue_flush(struct kvm_para_state *state)
{
	if (state->mmu_queue_len) {
		kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
		state->mmu_queue_len = 0;
	}
}

static void kvm_deferred_mmu_op(void *buffer, int len)
{
	struct kvm_para_state *state = kvm_para_state();

	if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
		kvm_mmu_op(buffer, len);
		return;
	}
	if (state->mmu_queue_len + len > sizeof state->mmu_queue)
		mmu_queue_flush(state);
	memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len);
	state->mmu_queue_len += len;
}

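/*
 * Report a single PTE write to the host.  The hypercall wants the
 * physical address of the PTE: with CONFIG_HIGHPTE the page table page
 * may have no permanent kernel mapping, so the address is reconstructed
 * from the page's PFN plus the offset within the page; otherwise __pa()
 * on the linear-mapped pointer is sufficient.  The operation is routed
 * through kvm_deferred_mmu_op() so it can be batched in lazy MMU mode.
 */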
static void kvm_mmu_write(void *dest, u64 val)
{
	__u64 pte_phys;
	struct kvm_mmu_op_write_pte wpte;

#ifdef CONFIG_HIGHPTE
	struct page *page;
	unsigned long dst = (unsigned long) dest;

	page = kmap_atomic_to_page(dest);
	pte_phys = page_to_pfn(page);
	pte_phys <<= PAGE_SHIFT;
	pte_phys += (dst & ~(PAGE_MASK));
#else
	pte_phys = (unsigned long)__pa(dest);
#endif
	wpte.header.op = KVM_MMU_OP_WRITE_PTE;
	wpte.pte_val = val;
	wpte.pte_phys = pte_phys;

	kvm_deferred_mmu_op(&wpte, sizeof wpte);
}

/*
 * We only need to hook operations that are MMU writes.  We hook these so that
 * we can use lazy MMU mode to batch these operations.  We could probably
 * improve the performance of the host code if we used some of the information
 * here to simplify processing of batched writes.
 */
static void kvm_set_pte(pte_t *ptep, pte_t pte)
{
	kvm_mmu_write(ptep, pte_val(pte));
}

static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
			   pte_t *ptep, pte_t pte)
{
	kvm_mmu_write(ptep, pte_val(pte));
}

static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
	kvm_mmu_write(pmdp, pmd_val(pmd));
}

#if PAGETABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	kvm_mmu_write(ptep, pte_val(pte));
}

static void kvm_pte_clear(struct mm_struct *mm,
			  unsigned long addr, pte_t *ptep)
{
	kvm_mmu_write(ptep, 0);
}

static void kvm_pmd_clear(pmd_t *pmdp)
{
	kvm_mmu_write(pmdp, 0);
}
#endif

static void kvm_set_pud(pud_t *pudp, pud_t pud)
{
	kvm_mmu_write(pudp, pud_val(pud));
}

#if PAGETABLE_LEVELS == 4
static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
	kvm_mmu_write(pgdp, pgd_val(pgd));
}
#endif
#endif /* PAGETABLE_LEVELS >= 3 */

static void kvm_flush_tlb(void)
{
	struct kvm_mmu_op_flush_tlb ftlb = {
		.header.op = KVM_MMU_OP_FLUSH_TLB,
	};

	kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
}

static void kvm_release_pt(unsigned long pfn)
{
	struct kvm_mmu_op_release_pt rpt = {
		.header.op = KVM_MMU_OP_RELEASE_PT,
		.pt_phys = (u64)pfn << PAGE_SHIFT,
	};

	kvm_mmu_op(&rpt, sizeof rpt);
}

static void kvm_enter_lazy_mmu(void)
{
	paravirt_enter_lazy_mmu();
}

static void kvm_leave_lazy_mmu(void)
{
	struct kvm_para_state *state = kvm_para_state();

	mmu_queue_flush(state);
	paravirt_leave_lazy_mmu();
}

static void __init paravirt_ops_setup(void)
{
	pv_info.name = "KVM";
	pv_info.paravirt_enabled = 1;

	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
		pv_cpu_ops.io_delay = kvm_io_delay;

	if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
		pv_mmu_ops.set_pte = kvm_set_pte;
		pv_mmu_ops.set_pte_at = kvm_set_pte_at;
		pv_mmu_ops.set_pmd = kvm_set_pmd;
#if PAGETABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
		pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
		pv_mmu_ops.pte_clear = kvm_pte_clear;
		pv_mmu_ops.pmd_clear = kvm_pmd_clear;
#endif
		pv_mmu_ops.set_pud = kvm_set_pud;
#if PAGETABLE_LEVELS == 4
		pv_mmu_ops.set_pgd = kvm_set_pgd;
#endif
#endif
		pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
		pv_mmu_ops.release_pte = kvm_release_pt;
		pv_mmu_ops.release_pmd = kvm_release_pt;
		pv_mmu_ops.release_pud = kvm_release_pt;

		pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
		pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
	}
#ifdef CONFIG_X86_IO_APIC
	no_timer_check = 1;
#endif
}

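/*
 * Register this CPU's apf_reason area with the host: MSR_KVM_ASYNC_PF_EN
 * is written with the area's physical address (64-byte aligned, see the
 * DEFINE_PER_CPU above) plus the enable bit.  On preemptible kernels
 * KVM_ASYNC_PF_SEND_ALWAYS is set as well, allowing async page faults to
 * be delivered while the vcpu runs in kernel mode, where a preemptible
 * guest can still reschedule.
 */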
void __cpuinit kvm_guest_cpu_init(void)
{
	if (!kvm_para_available())
		return;

	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
		u64 pa = __pa(&__get_cpu_var(apf_reason));

#ifdef CONFIG_PREEMPT
		pa |= KVM_ASYNC_PF_SEND_ALWAYS;
#endif
		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
		__get_cpu_var(apf_reason).enabled = 1;
		printk(KERN_INFO "KVM setup async PF for cpu %d\n",
		       smp_processor_id());
	}
}

static void kvm_pv_disable_apf(void *unused)
{
	if (!__get_cpu_var(apf_reason).enabled)
		return;

	wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
	__get_cpu_var(apf_reason).enabled = 0;

	printk(KERN_INFO "Unregister pv shared memory for cpu %d\n",
	       smp_processor_id());
}

static int kvm_pv_reboot_notify(struct notifier_block *nb,
				unsigned long code, void *unused)
{
	if (code == SYS_RESTART)
		on_each_cpu(kvm_pv_disable_apf, NULL, 1);
	return NOTIFY_DONE;
}

static struct notifier_block kvm_pv_reboot_nb = {
	.notifier_call = kvm_pv_reboot_notify,
};

#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
#ifdef CONFIG_KVM_CLOCK
	WARN_ON(kvm_register_clock("primary cpu clock"));
#endif
	kvm_guest_cpu_init();
	native_smp_prepare_boot_cpu();
}

static void __cpuinit kvm_guest_cpu_online(void *dummy)
{
	kvm_guest_cpu_init();
}

static void kvm_guest_cpu_offline(void *dummy)
{
	kvm_pv_disable_apf(NULL);
	apf_task_wake_all();
}

static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
				    unsigned long action, void *hcpu)
{
	int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
	case CPU_ONLINE_FROZEN:
		smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_notify,
};
#endif

static void __init kvm_apf_trap_init(void)
{
	set_intr_gate(14, &async_page_fault);
}

void __init kvm_guest_init(void)
{
	int i;

	if (!kvm_para_available())
		return;

	paravirt_ops_setup();
	register_reboot_notifier(&kvm_pv_reboot_nb);
	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
		spin_lock_init(&async_pf_sleepers[i].lock);
	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
		x86_init.irqs.trap_init = kvm_apf_trap_init;

#ifdef CONFIG_SMP
	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
	register_cpu_notifier(&kvm_cpu_notifier);
#else
	kvm_guest_cpu_init();
#endif
}