// SPDX-License-Identifier: GPL-2.0-only
/******************************************************************************
 * privcmd.c
 *
 * Interface to privileged domain-0 commands.
 *
 * Copyright (c) 2002-2004, K A Fraser, B Dragovic
 */

#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt

#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/workqueue.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/uaccess.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/seq_file.h>
#include <linux/miscdevice.h>
#include <linux/moduleparam.h>

#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/privcmd.h>
#include <xen/interface/xen.h>
#include <xen/interface/memory.h>
#include <xen/interface/hvm/dm_op.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/xen-ops.h>
#include <xen/balloon.h>

#include "privcmd.h"

MODULE_LICENSE("GPL");

#define PRIV_VMA_LOCKED ((void *)1)

static unsigned int privcmd_dm_op_max_num = 16;
module_param_named(dm_op_max_nr_bufs, privcmd_dm_op_max_num, uint, 0644);
MODULE_PARM_DESC(dm_op_max_nr_bufs,
		 "Maximum number of buffers per dm_op hypercall");

static unsigned int privcmd_dm_op_buf_max_size = 4096;
module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint,
		   0644);
MODULE_PARM_DESC(dm_op_buf_max_size,
		 "Maximum size of a dm_op hypercall buffer");

struct privcmd_data {
	domid_t domid;
};

static int privcmd_vma_range_is_mapped(
	       struct vm_area_struct *vma,
	       unsigned long addr,
	       unsigned long nr_pages);

static long privcmd_ioctl_hypercall(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_hypercall hypercall;
	long ret;

	/* Disallow arbitrary hypercalls if restricted */
	if (data->domid != DOMID_INVALID)
		return -EPERM;

	if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
		return -EFAULT;

	xen_preemptible_hcall_begin();
	ret = privcmd_call(hypercall.op,
			   hypercall.arg[0], hypercall.arg[1],
			   hypercall.arg[2], hypercall.arg[3],
			   hypercall.arg[4]);
	xen_preemptible_hcall_end();

	return ret;
}
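/*
 * Rough userspace usage of the ioctl above (a sketch only, error handling
 * omitted; the ioctl number and structure layout come from the uapi
 * xen/privcmd.h header, and the hypercall/constant names are just an
 * illustration):
 *
 *	struct privcmd_hypercall call = {
 *		.op  = __HYPERVISOR_xen_version,
 *		.arg = { XENVER_version },
 *	};
 *	int fd = open("/dev/xen/privcmd", O_RDWR);
 *	long ver = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call);
 *
 * Note that this path is refused with -EPERM once the fd has been
 * restricted with IOCTL_PRIVCMD_RESTRICT.
 */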
static void free_page_list(struct list_head *pages)
{
	struct page *p, *n;

	list_for_each_entry_safe(p, n, pages, lru)
		__free_page(p);

	INIT_LIST_HEAD(pages);
}

/*
 * Given an array of items in userspace, return a list of pages
 * containing the data. If copying fails, either because of memory
 * allocation failure or a problem reading user memory, return an
 * error code; it's up to the caller to dispose of any partial list.
 */
static int gather_array(struct list_head *pagelist,
			unsigned nelem, size_t size,
			const void __user *data)
{
	unsigned pageidx;
	void *pagedata;
	int ret;

	if (size > PAGE_SIZE)
		return 0;

	pageidx = PAGE_SIZE;
	pagedata = NULL;	/* quiet, gcc */
	while (nelem--) {
		if (pageidx > PAGE_SIZE-size) {
			struct page *page = alloc_page(GFP_KERNEL);

			ret = -ENOMEM;
			if (page == NULL)
				goto fail;

			pagedata = page_address(page);

			list_add_tail(&page->lru, pagelist);
			pageidx = 0;
		}

		ret = -EFAULT;
		if (copy_from_user(pagedata + pageidx, data, size))
			goto fail;

		data += size;
		pageidx += size;
	}

	ret = 0;

fail:
	return ret;
}

/*
 * Call function "fn" on each element of the array fragmented
 * over a list of pages.
 */
static int traverse_pages(unsigned nelem, size_t size,
			  struct list_head *pos,
			  int (*fn)(void *data, void *state),
			  void *state)
{
	void *pagedata;
	unsigned pageidx;
	int ret = 0;

	BUG_ON(size > PAGE_SIZE);

	pageidx = PAGE_SIZE;
	pagedata = NULL;	/* hush, gcc */

	while (nelem--) {
		if (pageidx > PAGE_SIZE-size) {
			struct page *page;
			pos = pos->next;
			page = list_entry(pos, struct page, lru);
			pagedata = page_address(page);
			pageidx = 0;
		}

		ret = (*fn)(pagedata + pageidx, state);
		if (ret)
			break;
		pageidx += size;
	}

	return ret;
}

/*
 * Similar to traverse_pages, but use each page as a "block" of
 * data to be processed as one unit.
 */
static int traverse_pages_block(unsigned nelem, size_t size,
				struct list_head *pos,
				int (*fn)(void *data, int nr, void *state),
				void *state)
{
	void *pagedata;
	int ret = 0;

	BUG_ON(size > PAGE_SIZE);

	while (nelem) {
		int nr = (PAGE_SIZE/size);
		struct page *page;
		if (nr > nelem)
			nr = nelem;
		pos = pos->next;
		page = list_entry(pos, struct page, lru);
		pagedata = page_address(page);
		ret = (*fn)(pagedata, nr, state);
		if (ret)
			break;
		nelem -= nr;
	}

	return ret;
}

struct mmap_gfn_state {
	unsigned long va;
	struct vm_area_struct *vma;
	domid_t domain;
};

static int mmap_gfn_range(void *data, void *state)
{
	struct privcmd_mmap_entry *msg = data;
	struct mmap_gfn_state *st = state;
	struct vm_area_struct *vma = st->vma;
	int rc;

	/* Do not allow range to wrap the address space. */
	if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
	    ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
		return -EINVAL;

	/* Range chunks must be contiguous in va space. */
	if ((msg->va != st->va) ||
	    ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
		return -EINVAL;

	rc = xen_remap_domain_gfn_range(vma,
					msg->va & PAGE_MASK,
					msg->mfn, msg->npages,
					vma->vm_page_prot,
					st->domain, NULL);
	if (rc < 0)
		return rc;

	st->va += msg->npages << PAGE_SHIFT;

	return 0;
}

static long privcmd_ioctl_mmap(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_mmap mmapcmd;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	int rc;
	LIST_HEAD(pagelist);
	struct mmap_gfn_state state;

	/* We only support privcmd_ioctl_mmap_batch for non-auto-translated. */
	if (xen_feature(XENFEAT_auto_translated_physmap))
		return -ENOSYS;

	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
		return -EFAULT;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != mmapcmd.dom)
		return -EPERM;

	rc = gather_array(&pagelist,
			  mmapcmd.num, sizeof(struct privcmd_mmap_entry),
			  mmapcmd.entry);

	if (rc || list_empty(&pagelist))
		goto out;

	mmap_write_lock(mm);

	{
		struct page *page = list_first_entry(&pagelist,
						     struct page, lru);
		struct privcmd_mmap_entry *msg = page_address(page);

		vma = vma_lookup(mm, msg->va);
		rc = -EINVAL;

		if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data)
			goto out_up;
		vma->vm_private_data = PRIV_VMA_LOCKED;
	}

	state.va = vma->vm_start;
	state.vma = vma;
	state.domain = mmapcmd.dom;

	rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
			    &pagelist,
			    mmap_gfn_range, &state);

out_up:
	mmap_write_unlock(mm);

out:
	free_page_list(&pagelist);

	return rc;
}

struct mmap_batch_state {
	domid_t domain;
	unsigned long va;
	struct vm_area_struct *vma;
	int index;
	/* A tristate:
	 *      0 for no errors
	 *      1 if at least one error has happened (and no
	 *          -ENOENT errors have happened)
	 *      -ENOENT if at least 1 -ENOENT has happened.
	 */
	int global_error;
	int version;

	/* User-space gfn array to store errors in the second pass for V1. */
	xen_pfn_t __user *user_gfn;
	/* User-space int array to store errors in the second pass for V2. */
	int __user *user_err;
};

/* auto translated dom0 note: if domU being created is PV, then gfn is
 * mfn(addr on bus). If it's auto xlated, then gfn is pfn (input to HAP).
 */
static int mmap_batch_fn(void *data, int nr, void *state)
{
	xen_pfn_t *gfnp = data;
	struct mmap_batch_state *st = state;
	struct vm_area_struct *vma = st->vma;
	struct page **pages = vma->vm_private_data;
	struct page **cur_pages = NULL;
	int ret;

	if (xen_feature(XENFEAT_auto_translated_physmap))
		cur_pages = &pages[st->index];

	BUG_ON(nr < 0);
	ret = xen_remap_domain_gfn_array(st->vma, st->va & PAGE_MASK, gfnp, nr,
					 (int *)gfnp, st->vma->vm_page_prot,
					 st->domain, cur_pages);

	/* Adjust the global_error? */
	if (ret != nr) {
		if (ret == -ENOENT)
			st->global_error = -ENOENT;
		else {
			/* Record that at least one error has happened. */
			if (st->global_error == 0)
				st->global_error = 1;
		}
	}
	st->va += XEN_PAGE_SIZE * nr;
	st->index += nr / XEN_PFN_PER_PAGE;

	return 0;
}

static int mmap_return_error(int err, struct mmap_batch_state *st)
{
	int ret;

	if (st->version == 1) {
		if (err) {
			xen_pfn_t gfn;

			ret = get_user(gfn, st->user_gfn);
			if (ret < 0)
				return ret;
			/*
			 * V1 encodes the error codes in the 32bit top
			 * nibble of the gfn (with its known
			 * limitations vis-a-vis 64 bit callers).
			 */
			gfn |= (err == -ENOENT) ?
			       PRIVCMD_MMAPBATCH_PAGED_ERROR :
			       PRIVCMD_MMAPBATCH_MFN_ERROR;
			return __put_user(gfn, st->user_gfn++);
		} else
			st->user_gfn++;
	} else { /* st->version == 2 */
		if (err)
			return __put_user(err, st->user_err++);
		else
			st->user_err++;
	}

	return 0;
}

static int mmap_return_errors(void *data, int nr, void *state)
{
	struct mmap_batch_state *st = state;
	int *errs = data;
	int i;
	int ret;

	for (i = 0; i < nr; i++) {
		ret = mmap_return_error(errs[i], st);
		if (ret < 0)
			return ret;
	}
	return 0;
}

/* Allocate pfns that are then mapped with gfns from foreign domid. Update
 * the vma with the page info to use later.
 * Returns: 0 if success, otherwise -errno
 */
static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs)
{
	int rc;
	struct page **pages;

	pages = kvcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL);
	if (pages == NULL)
		return -ENOMEM;

	rc = xen_alloc_unpopulated_pages(numpgs, pages);
	if (rc != 0) {
		pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__,
			numpgs, rc);
		kvfree(pages);
		return -ENOMEM;
	}
	BUG_ON(vma->vm_private_data != NULL);
	vma->vm_private_data = pages;

	return 0;
}

static const struct vm_operations_struct privcmd_vm_ops;
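/*
 * Rough userspace flow for IOCTL_PRIVCMD_MMAPBATCH_V2 (a sketch only, using
 * the uapi structures from xen/privcmd.h): mmap() a window of the privcmd fd
 * first, so the VMA looked up below is backed by privcmd_vm_ops, then issue:
 *
 *	struct privcmd_mmapbatch_v2 batch = {
 *		.num  = nr_frames,
 *		.dom  = domid,
 *		.addr = map_addr,	// start of the mmap()ed window
 *		.arr  = gfn_array,
 *		.err  = err_array,
 *	};
 *	ioctl(fd, IOCTL_PRIVCMD_MMAPBATCH_V2, &batch);
 *
 * Per-frame failures are reported through err_array; an overall -ENOENT
 * return means some frames were paged out and the call may be retried for
 * those holes.
 */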
static long privcmd_ioctl_mmap_batch(
	struct file *file, void __user *udata, int version)
{
	struct privcmd_data *data = file->private_data;
	int ret;
	struct privcmd_mmapbatch_v2 m;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long nr_pages;
	LIST_HEAD(pagelist);
	struct mmap_batch_state state;

	switch (version) {
	case 1:
		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch)))
			return -EFAULT;
		/* Returns per-frame error in m.arr. */
		m.err = NULL;
		if (!access_ok(m.arr, m.num * sizeof(*m.arr)))
			return -EFAULT;
		break;
	case 2:
		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2)))
			return -EFAULT;
		/* Returns per-frame error code in m.err. */
		if (!access_ok(m.err, m.num * (sizeof(*m.err))))
			return -EFAULT;
		break;
	default:
		return -EINVAL;
	}

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != m.dom)
		return -EPERM;

	nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE);
	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
		return -EINVAL;

	ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr);

	if (ret)
		goto out;
	if (list_empty(&pagelist)) {
		ret = -EINVAL;
		goto out;
	}

	if (version == 2) {
		/* Zero error array now to only copy back actual errors. */
		if (clear_user(m.err, sizeof(int) * m.num)) {
			ret = -EFAULT;
			goto out;
		}
	}

	mmap_write_lock(mm);

	vma = find_vma(mm, m.addr);
	if (!vma ||
	    vma->vm_ops != &privcmd_vm_ops) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/*
	 * Caller must either:
	 *
	 * Map the whole VMA range, which will also allocate all the
	 * pages required for the auto_translated_physmap case.
	 *
	 * Or
	 *
	 * Map unmapped holes left from a previous map attempt (e.g.,
	 * because those foreign frames were previously paged out).
	 */
	if (vma->vm_private_data == NULL) {
		if (m.addr != vma->vm_start ||
		    m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) {
			ret = -EINVAL;
			goto out_unlock;
		}
		if (xen_feature(XENFEAT_auto_translated_physmap)) {
			ret = alloc_empty_pages(vma, nr_pages);
			if (ret < 0)
				goto out_unlock;
		} else
			vma->vm_private_data = PRIV_VMA_LOCKED;
	} else {
		if (m.addr < vma->vm_start ||
		    m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) {
			ret = -EINVAL;
			goto out_unlock;
		}
		if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) {
			ret = -EINVAL;
			goto out_unlock;
		}
	}

	state.domain = m.dom;
	state.vma = vma;
	state.va = m.addr;
	state.index = 0;
	state.global_error = 0;
	state.version = version;

	BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0);
	/* mmap_batch_fn guarantees ret == 0 */
	BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
				    &pagelist, mmap_batch_fn, &state));

	mmap_write_unlock(mm);

	if (state.global_error) {
		/* Write back errors in second pass. */
		state.user_gfn = (xen_pfn_t *)m.arr;
		state.user_err = m.err;
		ret = traverse_pages_block(m.num, sizeof(xen_pfn_t),
					   &pagelist, mmap_return_errors, &state);
	} else
		ret = 0;

	/* If we have not had any EFAULT-like global errors then set the global
	 * error to -ENOENT if necessary. */
	if ((ret == 0) && (state.global_error == -ENOENT))
		ret = -ENOENT;

out:
	free_page_list(&pagelist);
	return ret;

out_unlock:
	mmap_write_unlock(mm);
	goto out;
}

static int lock_pages(
	struct privcmd_dm_op_buf kbufs[], unsigned int num,
	struct page *pages[], unsigned int nr_pages, unsigned int *pinned)
{
	unsigned int i, off = 0;

	for (i = 0; i < num; ) {
		unsigned int requested;
		int page_count;

		requested = DIV_ROUND_UP(
			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
			PAGE_SIZE) - off;
		if (requested > nr_pages)
			return -ENOSPC;

		page_count = pin_user_pages_fast(
			(unsigned long)kbufs[i].uptr + off * PAGE_SIZE,
			requested, FOLL_WRITE, pages);
		if (page_count <= 0)
			return page_count ? : -EFAULT;

		*pinned += page_count;
		nr_pages -= page_count;
		pages += page_count;

		off = (requested == page_count) ?
		      0 : off + page_count;
		i += !off;
	}

	return 0;
}

static void unlock_pages(struct page *pages[], unsigned int nr_pages)
{
	unpin_user_pages_dirty_lock(pages, nr_pages, true);
}

static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_dm_op kdata;
	struct privcmd_dm_op_buf *kbufs;
	unsigned int nr_pages = 0;
	struct page **pages = NULL;
	struct xen_dm_op_buf *xbufs = NULL;
	unsigned int i;
	long rc;
	unsigned int pinned = 0;

	if (copy_from_user(&kdata, udata, sizeof(kdata)))
		return -EFAULT;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
		return -EPERM;

	if (kdata.num == 0)
		return 0;

	if (kdata.num > privcmd_dm_op_max_num)
		return -E2BIG;

	kbufs = kcalloc(kdata.num, sizeof(*kbufs), GFP_KERNEL);
	if (!kbufs)
		return -ENOMEM;

	if (copy_from_user(kbufs, kdata.ubufs,
			   sizeof(*kbufs) * kdata.num)) {
		rc = -EFAULT;
		goto out;
	}

	for (i = 0; i < kdata.num; i++) {
		if (kbufs[i].size > privcmd_dm_op_buf_max_size) {
			rc = -E2BIG;
			goto out;
		}

		if (!access_ok(kbufs[i].uptr,
			       kbufs[i].size)) {
			rc = -EFAULT;
			goto out;
		}

		nr_pages += DIV_ROUND_UP(
			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
			PAGE_SIZE);
	}

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages) {
		rc = -ENOMEM;
		goto out;
	}

	xbufs = kcalloc(kdata.num, sizeof(*xbufs), GFP_KERNEL);
	if (!xbufs) {
		rc = -ENOMEM;
		goto out;
	}

	rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned);
	if (rc < 0)
		goto out;

	for (i = 0; i < kdata.num; i++) {
		set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr);
		xbufs[i].size = kbufs[i].size;
	}

	xen_preemptible_hcall_begin();
	rc = HYPERVISOR_dm_op(kdata.dom, kdata.num, xbufs);
	xen_preemptible_hcall_end();

out:
	unlock_pages(pages, pinned);
	kfree(xbufs);
	kfree(pages);
	kfree(kbufs);

	return rc;
}

static long privcmd_ioctl_restrict(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	domid_t dom;

	if (copy_from_user(&dom, udata, sizeof(dom)))
		return -EFAULT;

	/* Set restriction to the specified domain, or check it matches */
	if (data->domid == DOMID_INVALID)
		data->domid = dom;
	else if (data->domid != dom)
		return -EINVAL;

	return 0;
}
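/*
 * Typical userspace flow for IOCTL_PRIVCMD_MMAP_RESOURCE (a sketch, field
 * names as in the uapi xen/privcmd.h): call once with addr = 0 and num = 0 to
 * have the number of frames written back into @num, mmap() a window of the
 * privcmd fd large enough for that many frames, then call again with @addr,
 * @idx and @num set to map the resource into that window.
 */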
static long privcmd_ioctl_mmap_resource(struct file *file,
				struct privcmd_mmap_resource __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct privcmd_mmap_resource kdata;
	xen_pfn_t *pfns = NULL;
	struct xen_mem_acquire_resource xdata = { };
	int rc;

	if (copy_from_user(&kdata, udata, sizeof(kdata)))
		return -EFAULT;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
		return -EPERM;

	/* Both fields must be set or unset */
	if (!!kdata.addr != !!kdata.num)
		return -EINVAL;

	xdata.domid = kdata.dom;
	xdata.type = kdata.type;
	xdata.id = kdata.id;

	if (!kdata.addr && !kdata.num) {
		/* Query the size of the resource. */
		rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata);
		if (rc)
			return rc;
		return __put_user(xdata.nr_frames, &udata->num);
	}

	mmap_write_lock(mm);

	vma = find_vma(mm, kdata.addr);
	if (!vma || vma->vm_ops != &privcmd_vm_ops) {
		rc = -EINVAL;
		goto out;
	}

	pfns = kcalloc(kdata.num, sizeof(*pfns), GFP_KERNEL | __GFP_NOWARN);
	if (!pfns) {
		rc = -ENOMEM;
		goto out;
	}

	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
	    xen_feature(XENFEAT_auto_translated_physmap)) {
		unsigned int nr = DIV_ROUND_UP(kdata.num, XEN_PFN_PER_PAGE);
		struct page **pages;
		unsigned int i;

		rc = alloc_empty_pages(vma, nr);
		if (rc < 0)
			goto out;

		pages = vma->vm_private_data;
		for (i = 0; i < kdata.num; i++) {
			xen_pfn_t pfn =
				page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]);

			pfns[i] = pfn + (i % XEN_PFN_PER_PAGE);
		}
	} else
		vma->vm_private_data = PRIV_VMA_LOCKED;

	xdata.frame = kdata.idx;
	xdata.nr_frames = kdata.num;
	set_xen_guest_handle(xdata.frame_list, pfns);

	xen_preemptible_hcall_begin();
	rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata);
	xen_preemptible_hcall_end();

	if (rc)
		goto out;

	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
	    xen_feature(XENFEAT_auto_translated_physmap)) {
		rc = xen_remap_vma_range(vma, kdata.addr, kdata.num << PAGE_SHIFT);
	} else {
		unsigned int domid =
			(xdata.flags & XENMEM_rsrc_acq_caller_owned) ?
			DOMID_SELF : kdata.dom;
		int num, *errs = (int *)pfns;

		BUILD_BUG_ON(sizeof(*errs) > sizeof(*pfns));
		num = xen_remap_domain_mfn_array(vma,
						 kdata.addr & PAGE_MASK,
						 pfns, kdata.num, errs,
						 vma->vm_page_prot,
						 domid);
		if (num < 0)
			rc = num;
		else if (num != kdata.num) {
			unsigned int i;

			for (i = 0; i < num; i++) {
				rc = errs[i];
				if (rc < 0)
					break;
			}
		} else
			rc = 0;
	}

out:
	mmap_write_unlock(mm);
	kfree(pfns);

	return rc;
}

#ifdef CONFIG_XEN_PRIVCMD_IRQFD
/* Irqfd support */
static struct workqueue_struct *irqfd_cleanup_wq;
static DEFINE_SPINLOCK(irqfds_lock);
static LIST_HEAD(irqfds_list);

struct privcmd_kernel_irqfd {
	struct xen_dm_op_buf xbufs;
	domid_t dom;
	bool error;
	struct eventfd_ctx *eventfd;
	struct work_struct shutdown;
	wait_queue_entry_t wait;
	struct list_head list;
	poll_table pt;
};

static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd)
{
	lockdep_assert_held(&irqfds_lock);

	list_del_init(&kirqfd->list);
	queue_work(irqfd_cleanup_wq, &kirqfd->shutdown);
}

static void irqfd_shutdown(struct work_struct *work)
{
	struct privcmd_kernel_irqfd *kirqfd =
		container_of(work, struct privcmd_kernel_irqfd, shutdown);
	u64 cnt;

	eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt);
	eventfd_ctx_put(kirqfd->eventfd);
	kfree(kirqfd);
}

static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd)
{
	u64 cnt;
	long rc;

	eventfd_ctx_do_read(kirqfd->eventfd, &cnt);

	xen_preemptible_hcall_begin();
	rc = HYPERVISOR_dm_op(kirqfd->dom, 1, &kirqfd->xbufs);
	xen_preemptible_hcall_end();

	/* Don't repeat the error message for consecutive failures */
	if (rc && !kirqfd->error) {
		pr_err("Failed to configure irq for guest domain: %d\n",
		       kirqfd->dom);
	}

	kirqfd->error = rc;
}
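/*
 * Wake-up path for an assigned irqfd: whenever the eventfd is signalled
 * (EPOLLIN), irqfd_wakeup() below consumes the event via irqfd_inject() and
 * issues the dm_op that was registered at assign time; on EPOLLHUP the entry
 * is deactivated and freed from the cleanup workqueue.
 */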
static int
irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
{
	struct privcmd_kernel_irqfd *kirqfd =
		container_of(wait, struct privcmd_kernel_irqfd, wait);
	__poll_t flags = key_to_poll(key);

	if (flags & EPOLLIN)
		irqfd_inject(kirqfd);

	if (flags & EPOLLHUP) {
		unsigned long flags;

		spin_lock_irqsave(&irqfds_lock, flags);
		irqfd_deactivate(kirqfd);
		spin_unlock_irqrestore(&irqfds_lock, flags);
	}

	return 0;
}

static void
irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
{
	struct privcmd_kernel_irqfd *kirqfd =
		container_of(pt, struct privcmd_kernel_irqfd, pt);

	add_wait_queue_priority(wqh, &kirqfd->wait);
}

static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
{
	struct privcmd_kernel_irqfd *kirqfd, *tmp;
	unsigned long flags;
	__poll_t events;
	struct fd f;
	void *dm_op;
	int ret;

	kirqfd = kzalloc(sizeof(*kirqfd) + irqfd->size, GFP_KERNEL);
	if (!kirqfd)
		return -ENOMEM;
	dm_op = kirqfd + 1;

	if (copy_from_user(dm_op, u64_to_user_ptr(irqfd->dm_op), irqfd->size)) {
		ret = -EFAULT;
		goto error_kfree;
	}

	kirqfd->xbufs.size = irqfd->size;
	set_xen_guest_handle(kirqfd->xbufs.h, dm_op);
	kirqfd->dom = irqfd->dom;
	INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);

	f = fdget(irqfd->fd);
	if (!f.file) {
		ret = -EBADF;
		goto error_kfree;
	}

	kirqfd->eventfd = eventfd_ctx_fileget(f.file);
	if (IS_ERR(kirqfd->eventfd)) {
		ret = PTR_ERR(kirqfd->eventfd);
		goto error_fd_put;
	}

	/*
	 * Install our own custom wake-up handling so we are notified via a
	 * callback whenever someone signals the underlying eventfd.
	 */
	init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup);
	init_poll_funcptr(&kirqfd->pt, irqfd_poll_func);

	spin_lock_irqsave(&irqfds_lock, flags);

	list_for_each_entry(tmp, &irqfds_list, list) {
		if (kirqfd->eventfd == tmp->eventfd) {
			ret = -EBUSY;
			spin_unlock_irqrestore(&irqfds_lock, flags);
			goto error_eventfd;
		}
	}

	list_add_tail(&kirqfd->list, &irqfds_list);
	spin_unlock_irqrestore(&irqfds_lock, flags);

	/*
	 * Check if there was an event already pending on the eventfd before we
	 * registered, and trigger it as if we didn't miss it.
	 */
	events = vfs_poll(f.file, &kirqfd->pt);
	if (events & EPOLLIN)
		irqfd_inject(kirqfd);

	/*
	 * Do not drop the file until the kirqfd is fully initialized, otherwise
	 * we might race against the EPOLLHUP.
	 */
	fdput(f);
	return 0;

error_eventfd:
	eventfd_ctx_put(kirqfd->eventfd);

error_fd_put:
	fdput(f);

error_kfree:
	kfree(kirqfd);
	return ret;
}

static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd)
{
	struct privcmd_kernel_irqfd *kirqfd;
	struct eventfd_ctx *eventfd;
	unsigned long flags;

	eventfd = eventfd_ctx_fdget(irqfd->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	spin_lock_irqsave(&irqfds_lock, flags);

	list_for_each_entry(kirqfd, &irqfds_list, list) {
		if (kirqfd->eventfd == eventfd) {
			irqfd_deactivate(kirqfd);
			break;
		}
	}

	spin_unlock_irqrestore(&irqfds_lock, flags);

	eventfd_ctx_put(eventfd);

	/*
	 * Block until we know all outstanding shutdown jobs have completed so
	 * that we guarantee there will not be any more interrupts once this
	 * deassign function returns.
	 */
	flush_workqueue(irqfd_cleanup_wq);

	return 0;
}

static long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_irqfd irqfd;

	if (copy_from_user(&irqfd, udata, sizeof(irqfd)))
		return -EFAULT;

	/* No other flags should be set */
	if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN)
		return -EINVAL;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != irqfd.dom)
		return -EPERM;

	if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN)
		return privcmd_irqfd_deassign(&irqfd);

	return privcmd_irqfd_assign(&irqfd);
}

static int privcmd_irqfd_init(void)
{
	irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", 0, 0);
	if (!irqfd_cleanup_wq)
		return -ENOMEM;

	return 0;
}

static void privcmd_irqfd_exit(void)
{
	struct privcmd_kernel_irqfd *kirqfd, *tmp;
	unsigned long flags;

	spin_lock_irqsave(&irqfds_lock, flags);

	list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list)
		irqfd_deactivate(kirqfd);

	spin_unlock_irqrestore(&irqfds_lock, flags);

	destroy_workqueue(irqfd_cleanup_wq);
}
#else
static inline long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
{
	return -EOPNOTSUPP;
}

static inline int privcmd_irqfd_init(void)
{
	return 0;
}

static inline void privcmd_irqfd_exit(void)
{
}
#endif /* CONFIG_XEN_PRIVCMD_IRQFD */

static long privcmd_ioctl(struct file *file,
			  unsigned int cmd, unsigned long data)
{
	int ret = -ENOTTY;
	void __user *udata = (void __user *) data;

	switch (cmd) {
	case IOCTL_PRIVCMD_HYPERCALL:
		ret = privcmd_ioctl_hypercall(file, udata);
		break;

	case IOCTL_PRIVCMD_MMAP:
		ret = privcmd_ioctl_mmap(file, udata);
		break;

	case IOCTL_PRIVCMD_MMAPBATCH:
		ret = privcmd_ioctl_mmap_batch(file, udata, 1);
		break;

	case IOCTL_PRIVCMD_MMAPBATCH_V2:
		ret = privcmd_ioctl_mmap_batch(file, udata, 2);
		break;

	case IOCTL_PRIVCMD_DM_OP:
		ret = privcmd_ioctl_dm_op(file, udata);
		break;

	case IOCTL_PRIVCMD_RESTRICT:
		ret = privcmd_ioctl_restrict(file, udata);
		break;

	case IOCTL_PRIVCMD_MMAP_RESOURCE:
		ret = privcmd_ioctl_mmap_resource(file, udata);
		break;

	case IOCTL_PRIVCMD_IRQFD:
		ret = privcmd_ioctl_irqfd(file, udata);
		break;

	default:
		break;
	}

	return ret;
}

static int privcmd_open(struct inode *ino, struct file *file)
{
	struct privcmd_data *data = kzalloc(sizeof(*data), GFP_KERNEL);

	if (!data)
		return -ENOMEM;

	/* DOMID_INVALID implies no restriction */
	data->domid = DOMID_INVALID;

	file->private_data = data;
	return 0;
}

static int privcmd_release(struct inode *ino, struct file *file)
{
	struct privcmd_data *data = file->private_data;

	kfree(data);
	return 0;
}

static void privcmd_close(struct vm_area_struct *vma)
{
	struct page **pages = vma->vm_private_data;
	int numpgs = vma_pages(vma);
	int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT;
	int rc;

	if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
		return;

	rc = xen_unmap_domain_gfn_range(vma, numgfns, pages);
	if (rc == 0)
		xen_free_unpopulated_pages(numpgs, pages);
	else
		pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n",
			numpgs, rc);
	kvfree(pages);
}

static vm_fault_t privcmd_fault(struct vm_fault *vmf)
{
	printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
	       vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end,
	       vmf->pgoff, (void *)vmf->address);

	return VM_FAULT_SIGBUS;
}

static const struct vm_operations_struct privcmd_vm_ops = {
	.close = privcmd_close,
	.fault = privcmd_fault
};

static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* DONTCOPY is essential for Xen because copy_page_range doesn't know
	 * how to recreate these mappings */
	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTCOPY |
			  VM_DONTEXPAND | VM_DONTDUMP);
	vma->vm_ops = &privcmd_vm_ops;
	vma->vm_private_data = NULL;

	return 0;
}

/*
 * For MMAPBATCH*. This allows asserting the singleshot mapping
 * on a per pfn/pte basis. Mapping calls that fail with ENOENT
 * can then be retried until success.
 */
static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data)
{
	return pte_none(ptep_get(pte)) ?
		0 : -EBUSY;
}

static int privcmd_vma_range_is_mapped(
	       struct vm_area_struct *vma,
	       unsigned long addr,
	       unsigned long nr_pages)
{
	return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT,
				   is_mapped_fn, NULL) != 0;
}

const struct file_operations xen_privcmd_fops = {
	.owner = THIS_MODULE,
	.unlocked_ioctl = privcmd_ioctl,
	.open = privcmd_open,
	.release = privcmd_release,
	.mmap = privcmd_mmap,
};
EXPORT_SYMBOL_GPL(xen_privcmd_fops);

static struct miscdevice privcmd_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "xen/privcmd",
	.fops = &xen_privcmd_fops,
};

static int __init privcmd_init(void)
{
	int err;

	if (!xen_domain())
		return -ENODEV;

	err = misc_register(&privcmd_dev);
	if (err != 0) {
		pr_err("Could not register Xen privcmd device\n");
		return err;
	}

	err = misc_register(&xen_privcmdbuf_dev);
	if (err != 0) {
		pr_err("Could not register Xen hypercall-buf device\n");
		goto err_privcmdbuf;
	}

	err = privcmd_irqfd_init();
	if (err != 0) {
		pr_err("irqfd init failed\n");
		goto err_irqfd;
	}

	return 0;

err_irqfd:
	misc_deregister(&xen_privcmdbuf_dev);
err_privcmdbuf:
	misc_deregister(&privcmd_dev);
	return err;
}

static void __exit privcmd_exit(void)
{
	privcmd_irqfd_exit();
	misc_deregister(&privcmd_dev);
	misc_deregister(&xen_privcmdbuf_dev);
}

module_init(privcmd_init);
module_exit(privcmd_exit);