1 /****************************************************************************** 2 * privcmd.c 3 * 4 * Interface to privileged domain-0 commands. 5 * 6 * Copyright (c) 2002-2004, K A Fraser, B Dragovic 7 */ 8 9 #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt 10 11 #include <linux/kernel.h> 12 #include <linux/module.h> 13 #include <linux/sched.h> 14 #include <linux/slab.h> 15 #include <linux/string.h> 16 #include <linux/errno.h> 17 #include <linux/mm.h> 18 #include <linux/mman.h> 19 #include <linux/uaccess.h> 20 #include <linux/swap.h> 21 #include <linux/highmem.h> 22 #include <linux/pagemap.h> 23 #include <linux/seq_file.h> 24 #include <linux/miscdevice.h> 25 26 #include <asm/pgalloc.h> 27 #include <asm/pgtable.h> 28 #include <asm/tlb.h> 29 #include <asm/xen/hypervisor.h> 30 #include <asm/xen/hypercall.h> 31 32 #include <xen/xen.h> 33 #include <xen/privcmd.h> 34 #include <xen/interface/xen.h> 35 #include <xen/features.h> 36 #include <xen/page.h> 37 #include <xen/xen-ops.h> 38 #include <xen/balloon.h> 39 40 #include "privcmd.h" 41 42 MODULE_LICENSE("GPL"); 43 44 #define PRIV_VMA_LOCKED ((void *)1) 45 46 static int privcmd_vma_range_is_mapped( 47 struct vm_area_struct *vma, 48 unsigned long addr, 49 unsigned long nr_pages); 50 51 static long privcmd_ioctl_hypercall(void __user *udata) 52 { 53 struct privcmd_hypercall hypercall; 54 long ret; 55 56 if (copy_from_user(&hypercall, udata, sizeof(hypercall))) 57 return -EFAULT; 58 59 ret = privcmd_call(hypercall.op, 60 hypercall.arg[0], hypercall.arg[1], 61 hypercall.arg[2], hypercall.arg[3], 62 hypercall.arg[4]); 63 64 return ret; 65 } 66 67 static void free_page_list(struct list_head *pages) 68 { 69 struct page *p, *n; 70 71 list_for_each_entry_safe(p, n, pages, lru) 72 __free_page(p); 73 74 INIT_LIST_HEAD(pages); 75 } 76 77 /* 78 * Given an array of items in userspace, return a list of pages 79 * containing the data. If copying fails, either because of memory 80 * allocation failure or a problem reading user memory, return an 81 * error code; its up to the caller to dispose of any partial list. 82 */ 83 static int gather_array(struct list_head *pagelist, 84 unsigned nelem, size_t size, 85 const void __user *data) 86 { 87 unsigned pageidx; 88 void *pagedata; 89 int ret; 90 91 if (size > PAGE_SIZE) 92 return 0; 93 94 pageidx = PAGE_SIZE; 95 pagedata = NULL; /* quiet, gcc */ 96 while (nelem--) { 97 if (pageidx > PAGE_SIZE-size) { 98 struct page *page = alloc_page(GFP_KERNEL); 99 100 ret = -ENOMEM; 101 if (page == NULL) 102 goto fail; 103 104 pagedata = page_address(page); 105 106 list_add_tail(&page->lru, pagelist); 107 pageidx = 0; 108 } 109 110 ret = -EFAULT; 111 if (copy_from_user(pagedata + pageidx, data, size)) 112 goto fail; 113 114 data += size; 115 pageidx += size; 116 } 117 118 ret = 0; 119 120 fail: 121 return ret; 122 } 123 124 /* 125 * Call function "fn" on each element of the array fragmented 126 * over a list of pages. 127 */ 128 static int traverse_pages(unsigned nelem, size_t size, 129 struct list_head *pos, 130 int (*fn)(void *data, void *state), 131 void *state) 132 { 133 void *pagedata; 134 unsigned pageidx; 135 int ret = 0; 136 137 BUG_ON(size > PAGE_SIZE); 138 139 pageidx = PAGE_SIZE; 140 pagedata = NULL; /* hush, gcc */ 141 142 while (nelem--) { 143 if (pageidx > PAGE_SIZE-size) { 144 struct page *page; 145 pos = pos->next; 146 page = list_entry(pos, struct page, lru); 147 pagedata = page_address(page); 148 pageidx = 0; 149 } 150 151 ret = (*fn)(pagedata + pageidx, state); 152 if (ret) 153 break; 154 pageidx += size; 155 } 156 157 return ret; 158 } 159 160 struct mmap_mfn_state { 161 unsigned long va; 162 struct vm_area_struct *vma; 163 domid_t domain; 164 }; 165 166 static int mmap_mfn_range(void *data, void *state) 167 { 168 struct privcmd_mmap_entry *msg = data; 169 struct mmap_mfn_state *st = state; 170 struct vm_area_struct *vma = st->vma; 171 int rc; 172 173 /* Do not allow range to wrap the address space. */ 174 if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) || 175 ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va)) 176 return -EINVAL; 177 178 /* Range chunks must be contiguous in va space. */ 179 if ((msg->va != st->va) || 180 ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end)) 181 return -EINVAL; 182 183 rc = xen_remap_domain_mfn_range(vma, 184 msg->va & PAGE_MASK, 185 msg->mfn, msg->npages, 186 vma->vm_page_prot, 187 st->domain, NULL); 188 if (rc < 0) 189 return rc; 190 191 st->va += msg->npages << PAGE_SHIFT; 192 193 return 0; 194 } 195 196 static long privcmd_ioctl_mmap(void __user *udata) 197 { 198 struct privcmd_mmap mmapcmd; 199 struct mm_struct *mm = current->mm; 200 struct vm_area_struct *vma; 201 int rc; 202 LIST_HEAD(pagelist); 203 struct mmap_mfn_state state; 204 205 /* We only support privcmd_ioctl_mmap_batch for auto translated. */ 206 if (xen_feature(XENFEAT_auto_translated_physmap)) 207 return -ENOSYS; 208 209 if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) 210 return -EFAULT; 211 212 rc = gather_array(&pagelist, 213 mmapcmd.num, sizeof(struct privcmd_mmap_entry), 214 mmapcmd.entry); 215 216 if (rc || list_empty(&pagelist)) 217 goto out; 218 219 down_write(&mm->mmap_sem); 220 221 { 222 struct page *page = list_first_entry(&pagelist, 223 struct page, lru); 224 struct privcmd_mmap_entry *msg = page_address(page); 225 226 vma = find_vma(mm, msg->va); 227 rc = -EINVAL; 228 229 if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data) 230 goto out_up; 231 vma->vm_private_data = PRIV_VMA_LOCKED; 232 } 233 234 state.va = vma->vm_start; 235 state.vma = vma; 236 state.domain = mmapcmd.dom; 237 238 rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry), 239 &pagelist, 240 mmap_mfn_range, &state); 241 242 243 out_up: 244 up_write(&mm->mmap_sem); 245 246 out: 247 free_page_list(&pagelist); 248 249 return rc; 250 } 251 252 struct mmap_batch_state { 253 domid_t domain; 254 unsigned long va; 255 struct vm_area_struct *vma; 256 int index; 257 /* A tristate: 258 * 0 for no errors 259 * 1 if at least one error has happened (and no 260 * -ENOENT errors have happened) 261 * -ENOENT if at least 1 -ENOENT has happened. 262 */ 263 int global_error; 264 int version; 265 266 /* User-space mfn array to store errors in the second pass for V1. */ 267 xen_pfn_t __user *user_mfn; 268 /* User-space int array to store errors in the second pass for V2. */ 269 int __user *user_err; 270 }; 271 272 /* auto translated dom0 note: if domU being created is PV, then mfn is 273 * mfn(addr on bus). If it's auto xlated, then mfn is pfn (input to HAP). 274 */ 275 static int mmap_batch_fn(void *data, void *state) 276 { 277 xen_pfn_t *mfnp = data; 278 struct mmap_batch_state *st = state; 279 struct vm_area_struct *vma = st->vma; 280 struct page **pages = vma->vm_private_data; 281 struct page *cur_page = NULL; 282 int ret; 283 284 if (xen_feature(XENFEAT_auto_translated_physmap)) 285 cur_page = pages[st->index++]; 286 287 ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, 288 st->vma->vm_page_prot, st->domain, 289 &cur_page); 290 291 /* Store error code for second pass. */ 292 if (st->version == 1) { 293 if (ret < 0) { 294 /* 295 * V1 encodes the error codes in the 32bit top nibble of the 296 * mfn (with its known limitations vis-a-vis 64 bit callers). 297 */ 298 *mfnp |= (ret == -ENOENT) ? 299 PRIVCMD_MMAPBATCH_PAGED_ERROR : 300 PRIVCMD_MMAPBATCH_MFN_ERROR; 301 } 302 } else { /* st->version == 2 */ 303 *((int *) mfnp) = ret; 304 } 305 306 /* And see if it affects the global_error. */ 307 if (ret < 0) { 308 if (ret == -ENOENT) 309 st->global_error = -ENOENT; 310 else { 311 /* Record that at least one error has happened. */ 312 if (st->global_error == 0) 313 st->global_error = 1; 314 } 315 } 316 st->va += PAGE_SIZE; 317 318 return 0; 319 } 320 321 static int mmap_return_errors(void *data, void *state) 322 { 323 struct mmap_batch_state *st = state; 324 325 if (st->version == 1) { 326 xen_pfn_t mfnp = *((xen_pfn_t *) data); 327 if (mfnp & PRIVCMD_MMAPBATCH_MFN_ERROR) 328 return __put_user(mfnp, st->user_mfn++); 329 else 330 st->user_mfn++; 331 } else { /* st->version == 2 */ 332 int err = *((int *) data); 333 if (err) 334 return __put_user(err, st->user_err++); 335 else 336 st->user_err++; 337 } 338 339 return 0; 340 } 341 342 /* Allocate pfns that are then mapped with gmfns from foreign domid. Update 343 * the vma with the page info to use later. 344 * Returns: 0 if success, otherwise -errno 345 */ 346 static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) 347 { 348 int rc; 349 struct page **pages; 350 351 pages = kcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL); 352 if (pages == NULL) 353 return -ENOMEM; 354 355 rc = alloc_xenballooned_pages(numpgs, pages, 0); 356 if (rc != 0) { 357 pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__, 358 numpgs, rc); 359 kfree(pages); 360 return -ENOMEM; 361 } 362 BUG_ON(vma->vm_private_data != NULL); 363 vma->vm_private_data = pages; 364 365 return 0; 366 } 367 368 static struct vm_operations_struct privcmd_vm_ops; 369 370 static long privcmd_ioctl_mmap_batch(void __user *udata, int version) 371 { 372 int ret; 373 struct privcmd_mmapbatch_v2 m; 374 struct mm_struct *mm = current->mm; 375 struct vm_area_struct *vma; 376 unsigned long nr_pages; 377 LIST_HEAD(pagelist); 378 struct mmap_batch_state state; 379 380 switch (version) { 381 case 1: 382 if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) 383 return -EFAULT; 384 /* Returns per-frame error in m.arr. */ 385 m.err = NULL; 386 if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) 387 return -EFAULT; 388 break; 389 case 2: 390 if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) 391 return -EFAULT; 392 /* Returns per-frame error code in m.err. */ 393 if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) 394 return -EFAULT; 395 break; 396 default: 397 return -EINVAL; 398 } 399 400 nr_pages = m.num; 401 if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) 402 return -EINVAL; 403 404 ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); 405 406 if (ret) 407 goto out; 408 if (list_empty(&pagelist)) { 409 ret = -EINVAL; 410 goto out; 411 } 412 413 if (version == 2) { 414 /* Zero error array now to only copy back actual errors. */ 415 if (clear_user(m.err, sizeof(int) * m.num)) { 416 ret = -EFAULT; 417 goto out; 418 } 419 } 420 421 down_write(&mm->mmap_sem); 422 423 vma = find_vma(mm, m.addr); 424 if (!vma || 425 vma->vm_ops != &privcmd_vm_ops) { 426 ret = -EINVAL; 427 goto out_unlock; 428 } 429 430 /* 431 * Caller must either: 432 * 433 * Map the whole VMA range, which will also allocate all the 434 * pages required for the auto_translated_physmap case. 435 * 436 * Or 437 * 438 * Map unmapped holes left from a previous map attempt (e.g., 439 * because those foreign frames were previously paged out). 440 */ 441 if (vma->vm_private_data == NULL) { 442 if (m.addr != vma->vm_start || 443 m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) { 444 ret = -EINVAL; 445 goto out_unlock; 446 } 447 if (xen_feature(XENFEAT_auto_translated_physmap)) { 448 ret = alloc_empty_pages(vma, m.num); 449 if (ret < 0) 450 goto out_unlock; 451 } else 452 vma->vm_private_data = PRIV_VMA_LOCKED; 453 } else { 454 if (m.addr < vma->vm_start || 455 m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) { 456 ret = -EINVAL; 457 goto out_unlock; 458 } 459 if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) { 460 ret = -EINVAL; 461 goto out_unlock; 462 } 463 } 464 465 state.domain = m.dom; 466 state.vma = vma; 467 state.va = m.addr; 468 state.index = 0; 469 state.global_error = 0; 470 state.version = version; 471 472 /* mmap_batch_fn guarantees ret == 0 */ 473 BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t), 474 &pagelist, mmap_batch_fn, &state)); 475 476 up_write(&mm->mmap_sem); 477 478 if (state.global_error) { 479 /* Write back errors in second pass. */ 480 state.user_mfn = (xen_pfn_t *)m.arr; 481 state.user_err = m.err; 482 ret = traverse_pages(m.num, sizeof(xen_pfn_t), 483 &pagelist, mmap_return_errors, &state); 484 } else 485 ret = 0; 486 487 /* If we have not had any EFAULT-like global errors then set the global 488 * error to -ENOENT if necessary. */ 489 if ((ret == 0) && (state.global_error == -ENOENT)) 490 ret = -ENOENT; 491 492 out: 493 free_page_list(&pagelist); 494 return ret; 495 496 out_unlock: 497 up_write(&mm->mmap_sem); 498 goto out; 499 } 500 501 static long privcmd_ioctl(struct file *file, 502 unsigned int cmd, unsigned long data) 503 { 504 int ret = -ENOSYS; 505 void __user *udata = (void __user *) data; 506 507 switch (cmd) { 508 case IOCTL_PRIVCMD_HYPERCALL: 509 ret = privcmd_ioctl_hypercall(udata); 510 break; 511 512 case IOCTL_PRIVCMD_MMAP: 513 ret = privcmd_ioctl_mmap(udata); 514 break; 515 516 case IOCTL_PRIVCMD_MMAPBATCH: 517 ret = privcmd_ioctl_mmap_batch(udata, 1); 518 break; 519 520 case IOCTL_PRIVCMD_MMAPBATCH_V2: 521 ret = privcmd_ioctl_mmap_batch(udata, 2); 522 break; 523 524 default: 525 ret = -EINVAL; 526 break; 527 } 528 529 return ret; 530 } 531 532 static void privcmd_close(struct vm_area_struct *vma) 533 { 534 struct page **pages = vma->vm_private_data; 535 int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 536 537 if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages) 538 return; 539 540 xen_unmap_domain_mfn_range(vma, numpgs, pages); 541 free_xenballooned_pages(numpgs, pages); 542 kfree(pages); 543 } 544 545 static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 546 { 547 printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", 548 vma, vma->vm_start, vma->vm_end, 549 vmf->pgoff, vmf->virtual_address); 550 551 return VM_FAULT_SIGBUS; 552 } 553 554 static struct vm_operations_struct privcmd_vm_ops = { 555 .close = privcmd_close, 556 .fault = privcmd_fault 557 }; 558 559 static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) 560 { 561 /* DONTCOPY is essential for Xen because copy_page_range doesn't know 562 * how to recreate these mappings */ 563 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY | 564 VM_DONTEXPAND | VM_DONTDUMP; 565 vma->vm_ops = &privcmd_vm_ops; 566 vma->vm_private_data = NULL; 567 568 return 0; 569 } 570 571 /* 572 * For MMAPBATCH*. This allows asserting the singleshot mapping 573 * on a per pfn/pte basis. Mapping calls that fail with ENOENT 574 * can be then retried until success. 575 */ 576 static int is_mapped_fn(pte_t *pte, struct page *pmd_page, 577 unsigned long addr, void *data) 578 { 579 return pte_none(*pte) ? 0 : -EBUSY; 580 } 581 582 static int privcmd_vma_range_is_mapped( 583 struct vm_area_struct *vma, 584 unsigned long addr, 585 unsigned long nr_pages) 586 { 587 return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT, 588 is_mapped_fn, NULL) != 0; 589 } 590 591 const struct file_operations xen_privcmd_fops = { 592 .owner = THIS_MODULE, 593 .unlocked_ioctl = privcmd_ioctl, 594 .mmap = privcmd_mmap, 595 }; 596 EXPORT_SYMBOL_GPL(xen_privcmd_fops); 597 598 static struct miscdevice privcmd_dev = { 599 .minor = MISC_DYNAMIC_MINOR, 600 .name = "xen/privcmd", 601 .fops = &xen_privcmd_fops, 602 }; 603 604 static int __init privcmd_init(void) 605 { 606 int err; 607 608 if (!xen_domain()) 609 return -ENODEV; 610 611 err = misc_register(&privcmd_dev); 612 if (err != 0) { 613 pr_err("Could not register Xen privcmd device\n"); 614 return err; 615 } 616 return 0; 617 } 618 619 static void __exit privcmd_exit(void) 620 { 621 misc_deregister(&privcmd_dev); 622 } 623 624 module_init(privcmd_init); 625 module_exit(privcmd_exit); 626