// SPDX-License-Identifier: GPL-2.0-only
/******************************************************************************
 * privcmd.c
 *
 * Interface to privileged domain-0 commands.
 *
 * Copyright (c) 2002-2004, K A Fraser, B Dragovic
 */

#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt

#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/string.h>
#include <linux/workqueue.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/uaccess.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/seq_file.h>
#include <linux/miscdevice.h>
#include <linux/moduleparam.h>

#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/privcmd.h>
#include <xen/interface/xen.h>
#include <xen/interface/memory.h>
#include <xen/interface/hvm/dm_op.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/xen-ops.h>
#include <xen/balloon.h>

#include "privcmd.h"

MODULE_LICENSE("GPL");

#define PRIV_VMA_LOCKED ((void *)1)

static unsigned int privcmd_dm_op_max_num = 16;
module_param_named(dm_op_max_nr_bufs, privcmd_dm_op_max_num, uint, 0644);
MODULE_PARM_DESC(dm_op_max_nr_bufs,
		 "Maximum number of buffers per dm_op hypercall");

static unsigned int privcmd_dm_op_buf_max_size = 4096;
module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint,
		   0644);
MODULE_PARM_DESC(dm_op_buf_max_size,
		 "Maximum size of a dm_op hypercall buffer");

struct privcmd_data {
	domid_t domid;
};

static int privcmd_vma_range_is_mapped(
	       struct vm_area_struct *vma,
	       unsigned long addr,
	       unsigned long nr_pages);

static long privcmd_ioctl_hypercall(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_hypercall hypercall;
	long ret;

	/* Disallow arbitrary hypercalls if restricted */
	if (data->domid != DOMID_INVALID)
		return -EPERM;

	if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
		return -EFAULT;

	xen_preemptible_hcall_begin();
	ret = privcmd_call(hypercall.op,
			   hypercall.arg[0], hypercall.arg[1],
			   hypercall.arg[2], hypercall.arg[3],
			   hypercall.arg[4]);
	xen_preemptible_hcall_end();

	return ret;
}

static void free_page_list(struct list_head *pages)
{
	struct page *p, *n;

	list_for_each_entry_safe(p, n, pages, lru)
		__free_page(p);

	INIT_LIST_HEAD(pages);
}

/*
 * Given an array of items in userspace, return a list of pages
 * containing the data.  If copying fails, either because of memory
 * allocation failure or a problem reading user memory, return an
 * error code; it's up to the caller to dispose of any partial list.
 */
static int gather_array(struct list_head *pagelist,
			unsigned nelem, size_t size,
			const void __user *data)
{
	unsigned pageidx;
	void *pagedata;
	int ret;

	if (size > PAGE_SIZE)
		return 0;

	pageidx = PAGE_SIZE;
	pagedata = NULL;	/* quiet, gcc */
	while (nelem--) {
		if (pageidx > PAGE_SIZE-size) {
			struct page *page = alloc_page(GFP_KERNEL);

			ret = -ENOMEM;
			if (page == NULL)
				goto fail;

			pagedata = page_address(page);

			list_add_tail(&page->lru, pagelist);
			pageidx = 0;
		}

		ret = -EFAULT;
		if (copy_from_user(pagedata + pageidx, data, size))
			goto fail;

		data += size;
		pageidx += size;
	}

	ret = 0;

fail:
	return ret;
}

/*
 * Call function "fn" on each element of the array fragmented
 * over a list of pages.
 */
static int traverse_pages(unsigned nelem, size_t size,
			  struct list_head *pos,
			  int (*fn)(void *data, void *state),
			  void *state)
{
	void *pagedata;
	unsigned pageidx;
	int ret = 0;

	BUG_ON(size > PAGE_SIZE);

	pageidx = PAGE_SIZE;
	pagedata = NULL;	/* hush, gcc */

	while (nelem--) {
		if (pageidx > PAGE_SIZE-size) {
			struct page *page;
			pos = pos->next;
			page = list_entry(pos, struct page, lru);
			pagedata = page_address(page);
			pageidx = 0;
		}

		ret = (*fn)(pagedata + pageidx, state);
		if (ret)
			break;
		pageidx += size;
	}

	return ret;
}

/*
 * Similar to traverse_pages, but use each page as a "block" of
 * data to be processed as one unit.
 */
static int traverse_pages_block(unsigned nelem, size_t size,
				struct list_head *pos,
				int (*fn)(void *data, int nr, void *state),
				void *state)
{
	void *pagedata;
	int ret = 0;

	BUG_ON(size > PAGE_SIZE);

	while (nelem) {
		int nr = (PAGE_SIZE/size);
		struct page *page;
		if (nr > nelem)
			nr = nelem;
		pos = pos->next;
		page = list_entry(pos, struct page, lru);
		pagedata = page_address(page);
		ret = (*fn)(pagedata, nr, state);
		if (ret)
			break;
		nelem -= nr;
	}

	return ret;
}

struct mmap_gfn_state {
	unsigned long va;
	struct vm_area_struct *vma;
	domid_t domain;
};

static int mmap_gfn_range(void *data, void *state)
{
	struct privcmd_mmap_entry *msg = data;
	struct mmap_gfn_state *st = state;
	struct vm_area_struct *vma = st->vma;
	int rc;

	/* Do not allow range to wrap the address space. */
	if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
	    ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
		return -EINVAL;

	/* Range chunks must be contiguous in va space. */
	if ((msg->va != st->va) ||
	    ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
		return -EINVAL;

	rc = xen_remap_domain_gfn_range(vma,
					msg->va & PAGE_MASK,
					msg->mfn, msg->npages,
					vma->vm_page_prot,
					st->domain, NULL);
	if (rc < 0)
		return rc;

	st->va += msg->npages << PAGE_SHIFT;

	return 0;
}

static long privcmd_ioctl_mmap(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_mmap mmapcmd;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	int rc;
	LIST_HEAD(pagelist);
	struct mmap_gfn_state state;

	/* Only privcmd_ioctl_mmap_batch is supported for auto-translated guests. */
	if (xen_feature(XENFEAT_auto_translated_physmap))
		return -ENOSYS;

	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
		return -EFAULT;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != mmapcmd.dom)
		return -EPERM;

	rc = gather_array(&pagelist,
			  mmapcmd.num, sizeof(struct privcmd_mmap_entry),
			  mmapcmd.entry);

	if (rc || list_empty(&pagelist))
		goto out;

	mmap_write_lock(mm);

	{
		struct page *page = list_first_entry(&pagelist,
						     struct page, lru);
		struct privcmd_mmap_entry *msg = page_address(page);

		vma = vma_lookup(mm, msg->va);
		rc = -EINVAL;

		if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data)
			goto out_up;
		vma->vm_private_data = PRIV_VMA_LOCKED;
	}

	state.va = vma->vm_start;
	state.vma = vma;
	state.domain = mmapcmd.dom;

	rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
			    &pagelist,
			    mmap_gfn_range, &state);


out_up:
	mmap_write_unlock(mm);

out:
	free_page_list(&pagelist);

	return rc;
}

struct mmap_batch_state {
	domid_t domain;
	unsigned long va;
	struct vm_area_struct *vma;
	int index;
	/* A tristate:
	 *      0 for no errors
	 *      1 if at least one error has happened (and no
	 *          -ENOENT errors have happened)
	 *      -ENOENT if at least 1 -ENOENT has happened.
	 */
	int global_error;
	int version;

	/* User-space gfn array to store errors in the second pass for V1. */
	xen_pfn_t __user *user_gfn;
	/* User-space int array to store errors in the second pass for V2. */
	int __user *user_err;
};

/* auto translated dom0 note: if domU being created is PV, then gfn is
 * mfn(addr on bus). If it's auto xlated, then gfn is pfn (input to HAP).
 */
static int mmap_batch_fn(void *data, int nr, void *state)
{
	xen_pfn_t *gfnp = data;
	struct mmap_batch_state *st = state;
	struct vm_area_struct *vma = st->vma;
	struct page **pages = vma->vm_private_data;
	struct page **cur_pages = NULL;
	int ret;

	if (xen_feature(XENFEAT_auto_translated_physmap))
		cur_pages = &pages[st->index];

	BUG_ON(nr < 0);
	ret = xen_remap_domain_gfn_array(st->vma, st->va & PAGE_MASK, gfnp, nr,
					 (int *)gfnp, st->vma->vm_page_prot,
					 st->domain, cur_pages);

	/* Adjust the global_error? */
	if (ret != nr) {
		if (ret == -ENOENT)
			st->global_error = -ENOENT;
		else {
			/* Record that at least one error has happened. */
			if (st->global_error == 0)
				st->global_error = 1;
		}
	}
	st->va += XEN_PAGE_SIZE * nr;
	st->index += nr / XEN_PFN_PER_PAGE;

	return 0;
}

static int mmap_return_error(int err, struct mmap_batch_state *st)
{
	int ret;

	if (st->version == 1) {
		if (err) {
			xen_pfn_t gfn;

			ret = get_user(gfn, st->user_gfn);
			if (ret < 0)
				return ret;
			/*
			 * V1 encodes the error codes in the 32bit top
			 * nibble of the gfn (with its known
			 * limitations vis-a-vis 64 bit callers).
			 */
			gfn |= (err == -ENOENT) ?
				PRIVCMD_MMAPBATCH_PAGED_ERROR :
				PRIVCMD_MMAPBATCH_MFN_ERROR;
			return __put_user(gfn, st->user_gfn++);
		} else
			st->user_gfn++;
	} else { /* st->version == 2 */
		if (err)
			return __put_user(err, st->user_err++);
		else
			st->user_err++;
	}

	return 0;
}

static int mmap_return_errors(void *data, int nr, void *state)
{
	struct mmap_batch_state *st = state;
	int *errs = data;
	int i;
	int ret;

	for (i = 0; i < nr; i++) {
		ret = mmap_return_error(errs[i], st);
		if (ret < 0)
			return ret;
	}
	return 0;
}

/* Allocate pfns that are then mapped with gfns from foreign domid. Update
 * the vma with the page info to use later.
 * Returns: 0 if success, otherwise -errno
 */
static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs)
{
	int rc;
	struct page **pages;

	pages = kvcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL);
	if (pages == NULL)
		return -ENOMEM;

	rc = xen_alloc_unpopulated_pages(numpgs, pages);
	if (rc != 0) {
		pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__,
			numpgs, rc);
		kvfree(pages);
		return -ENOMEM;
	}
	BUG_ON(vma->vm_private_data != NULL);
	vma->vm_private_data = pages;

	return 0;
}

static const struct vm_operations_struct privcmd_vm_ops;

static long privcmd_ioctl_mmap_batch(
	struct file *file, void __user *udata, int version)
{
	struct privcmd_data *data = file->private_data;
	int ret;
	struct privcmd_mmapbatch_v2 m;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long nr_pages;
	LIST_HEAD(pagelist);
	struct mmap_batch_state state;

	switch (version) {
	case 1:
		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch)))
			return -EFAULT;
		/* Returns per-frame error in m.arr. */
		m.err = NULL;
		if (!access_ok(m.arr, m.num * sizeof(*m.arr)))
			return -EFAULT;
		break;
	case 2:
		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2)))
			return -EFAULT;
		/* Returns per-frame error code in m.err. */
		if (!access_ok(m.err, m.num * (sizeof(*m.err))))
			return -EFAULT;
		break;
	default:
		return -EINVAL;
	}

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != m.dom)
		return -EPERM;

	nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE);
	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
		return -EINVAL;

	ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr);

	if (ret)
		goto out;
	if (list_empty(&pagelist)) {
		ret = -EINVAL;
		goto out;
	}

	if (version == 2) {
		/* Zero error array now to only copy back actual errors. */
		if (clear_user(m.err, sizeof(int) * m.num)) {
			ret = -EFAULT;
			goto out;
		}
	}

	mmap_write_lock(mm);

	vma = find_vma(mm, m.addr);
	if (!vma ||
	    vma->vm_ops != &privcmd_vm_ops) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/*
	 * Caller must either:
	 *
	 * Map the whole VMA range, which will also allocate all the
	 * pages required for the auto_translated_physmap case.
	 *
	 * Or
	 *
	 * Map unmapped holes left from a previous map attempt (e.g.,
	 * because those foreign frames were previously paged out).
	 */
	if (vma->vm_private_data == NULL) {
		if (m.addr != vma->vm_start ||
		    m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) {
			ret = -EINVAL;
			goto out_unlock;
		}
		if (xen_feature(XENFEAT_auto_translated_physmap)) {
			ret = alloc_empty_pages(vma, nr_pages);
			if (ret < 0)
				goto out_unlock;
		} else
			vma->vm_private_data = PRIV_VMA_LOCKED;
	} else {
		if (m.addr < vma->vm_start ||
		    m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) {
			ret = -EINVAL;
			goto out_unlock;
		}
		if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) {
			ret = -EINVAL;
			goto out_unlock;
		}
	}

	state.domain = m.dom;
	state.vma = vma;
	state.va = m.addr;
	state.index = 0;
	state.global_error = 0;
	state.version = version;

	BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0);
	/* mmap_batch_fn guarantees ret == 0 */
	BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
				    &pagelist, mmap_batch_fn, &state));

	mmap_write_unlock(mm);

	if (state.global_error) {
		/* Write back errors in second pass. */
		state.user_gfn = (xen_pfn_t *)m.arr;
		state.user_err = m.err;
		ret = traverse_pages_block(m.num, sizeof(xen_pfn_t),
					   &pagelist, mmap_return_errors, &state);
	} else
		ret = 0;

	/* If we have not had any EFAULT-like global errors then set the global
	 * error to -ENOENT if necessary. */
	if ((ret == 0) && (state.global_error == -ENOENT))
		ret = -ENOENT;

out:
	free_page_list(&pagelist);
	return ret;

out_unlock:
	mmap_write_unlock(mm);
	goto out;
}

static int lock_pages(
	struct privcmd_dm_op_buf kbufs[], unsigned int num,
	struct page *pages[], unsigned int nr_pages, unsigned int *pinned)
{
	unsigned int i, off = 0;

	for (i = 0; i < num; ) {
		unsigned int requested;
		int page_count;

		requested = DIV_ROUND_UP(
			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
			PAGE_SIZE) - off;
		if (requested > nr_pages)
			return -ENOSPC;

		page_count = pin_user_pages_fast(
			(unsigned long)kbufs[i].uptr + off * PAGE_SIZE,
			requested, FOLL_WRITE, pages);
		if (page_count <= 0)
			return page_count ? : -EFAULT;

		*pinned += page_count;
		nr_pages -= page_count;
		pages += page_count;

		off = (requested == page_count) ?
			0 : off + page_count;
		i += !off;
	}

	return 0;
}

static void unlock_pages(struct page *pages[], unsigned int nr_pages)
{
	unpin_user_pages_dirty_lock(pages, nr_pages, true);
}

static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_dm_op kdata;
	struct privcmd_dm_op_buf *kbufs;
	unsigned int nr_pages = 0;
	struct page **pages = NULL;
	struct xen_dm_op_buf *xbufs = NULL;
	unsigned int i;
	long rc;
	unsigned int pinned = 0;

	if (copy_from_user(&kdata, udata, sizeof(kdata)))
		return -EFAULT;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
		return -EPERM;

	if (kdata.num == 0)
		return 0;

	if (kdata.num > privcmd_dm_op_max_num)
		return -E2BIG;

	kbufs = kcalloc(kdata.num, sizeof(*kbufs), GFP_KERNEL);
	if (!kbufs)
		return -ENOMEM;

	if (copy_from_user(kbufs, kdata.ubufs,
			   sizeof(*kbufs) * kdata.num)) {
		rc = -EFAULT;
		goto out;
	}

	for (i = 0; i < kdata.num; i++) {
		if (kbufs[i].size > privcmd_dm_op_buf_max_size) {
			rc = -E2BIG;
			goto out;
		}

		if (!access_ok(kbufs[i].uptr,
			       kbufs[i].size)) {
			rc = -EFAULT;
			goto out;
		}

		nr_pages += DIV_ROUND_UP(
			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
			PAGE_SIZE);
	}

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages) {
		rc = -ENOMEM;
		goto out;
	}

	xbufs = kcalloc(kdata.num, sizeof(*xbufs), GFP_KERNEL);
	if (!xbufs) {
		rc = -ENOMEM;
		goto out;
	}

	rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned);
	if (rc < 0)
		goto out;

	for (i = 0; i < kdata.num; i++) {
		set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr);
		xbufs[i].size = kbufs[i].size;
	}

	xen_preemptible_hcall_begin();
	rc = HYPERVISOR_dm_op(kdata.dom, kdata.num, xbufs);
	xen_preemptible_hcall_end();

out:
	unlock_pages(pages, pinned);
	kfree(xbufs);
	kfree(pages);
	kfree(kbufs);

	return rc;
}

static long privcmd_ioctl_restrict(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	domid_t dom;

	if (copy_from_user(&dom, udata, sizeof(dom)))
		return -EFAULT;

	/* Set restriction to the specified domain, or check it matches */
	if (data->domid == DOMID_INVALID)
		data->domid = dom;
	else if (data->domid != dom)
		return -EINVAL;

	return 0;
}

static long privcmd_ioctl_mmap_resource(struct file *file,
					struct privcmd_mmap_resource __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct privcmd_mmap_resource kdata;
	xen_pfn_t *pfns = NULL;
	struct xen_mem_acquire_resource xdata = { };
	int rc;

	if (copy_from_user(&kdata, udata, sizeof(kdata)))
		return -EFAULT;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
		return -EPERM;

	/* Both fields must be set or unset */
	if (!!kdata.addr != !!kdata.num)
		return -EINVAL;

	xdata.domid = kdata.dom;
	xdata.type = kdata.type;
	xdata.id = kdata.id;

	if (!kdata.addr && !kdata.num) {
		/* Query the size of the resource. */
		rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata);
		if (rc)
			return rc;
		return __put_user(xdata.nr_frames, &udata->num);
	}

	mmap_write_lock(mm);

	vma = find_vma(mm, kdata.addr);
	if (!vma || vma->vm_ops != &privcmd_vm_ops) {
		rc = -EINVAL;
		goto out;
	}

	pfns = kcalloc(kdata.num, sizeof(*pfns), GFP_KERNEL | __GFP_NOWARN);
	if (!pfns) {
		rc = -ENOMEM;
		goto out;
	}

	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
	    xen_feature(XENFEAT_auto_translated_physmap)) {
		unsigned int nr = DIV_ROUND_UP(kdata.num, XEN_PFN_PER_PAGE);
		struct page **pages;
		unsigned int i;

		rc = alloc_empty_pages(vma, nr);
		if (rc < 0)
			goto out;

		pages = vma->vm_private_data;
		for (i = 0; i < kdata.num; i++) {
			xen_pfn_t pfn =
				page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]);

			pfns[i] = pfn + (i % XEN_PFN_PER_PAGE);
		}
	} else
		vma->vm_private_data = PRIV_VMA_LOCKED;

	xdata.frame = kdata.idx;
	xdata.nr_frames = kdata.num;
	set_xen_guest_handle(xdata.frame_list, pfns);

	xen_preemptible_hcall_begin();
	rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata);
	xen_preemptible_hcall_end();

	if (rc)
		goto out;

	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
	    xen_feature(XENFEAT_auto_translated_physmap)) {
		rc = xen_remap_vma_range(vma, kdata.addr, kdata.num << PAGE_SHIFT);
	} else {
		unsigned int domid =
			(xdata.flags & XENMEM_rsrc_acq_caller_owned) ?
			DOMID_SELF : kdata.dom;
		int num, *errs = (int *)pfns;

		BUILD_BUG_ON(sizeof(*errs) > sizeof(*pfns));
		num = xen_remap_domain_mfn_array(vma,
						 kdata.addr & PAGE_MASK,
						 pfns, kdata.num, errs,
						 vma->vm_page_prot,
						 domid);
		if (num < 0)
			rc = num;
		else if (num != kdata.num) {
			unsigned int i;

			for (i = 0; i < num; i++) {
				rc = errs[i];
				if (rc < 0)
					break;
			}
		} else
			rc = 0;
	}

out:
	mmap_write_unlock(mm);
	kfree(pfns);

	return rc;
}

#ifdef CONFIG_XEN_PRIVCMD_IRQFD
/* Irqfd support */
static struct workqueue_struct *irqfd_cleanup_wq;
static DEFINE_SPINLOCK(irqfds_lock);
DEFINE_STATIC_SRCU(irqfds_srcu);
static LIST_HEAD(irqfds_list);

struct privcmd_kernel_irqfd {
	struct xen_dm_op_buf xbufs;
	domid_t dom;
	bool error;
	struct eventfd_ctx *eventfd;
	struct work_struct shutdown;
	wait_queue_entry_t wait;
	struct list_head list;
	poll_table pt;
};

static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd)
{
	lockdep_assert_held(&irqfds_lock);

	list_del_init(&kirqfd->list);
	queue_work(irqfd_cleanup_wq, &kirqfd->shutdown);
}

static void irqfd_shutdown(struct work_struct *work)
{
	struct privcmd_kernel_irqfd *kirqfd =
		container_of(work, struct privcmd_kernel_irqfd, shutdown);
	u64 cnt;

	/* Make sure irqfd has been initialized in assign path */
	synchronize_srcu(&irqfds_srcu);

	eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt);
	eventfd_ctx_put(kirqfd->eventfd);
	kfree(kirqfd);
}

static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd)
{
	u64 cnt;
	long rc;

	eventfd_ctx_do_read(kirqfd->eventfd, &cnt);

	xen_preemptible_hcall_begin();
	rc = HYPERVISOR_dm_op(kirqfd->dom, 1, &kirqfd->xbufs);
	xen_preemptible_hcall_end();

	/* Don't repeat the error message for consecutive failures */
	if (rc && !kirqfd->error) {
		pr_err("Failed to configure irq for guest domain: %d\n",
		       kirqfd->dom);
	}

	kirqfd->error = rc;
}

static int
irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
{
	struct privcmd_kernel_irqfd *kirqfd =
		container_of(wait, struct privcmd_kernel_irqfd, wait);
	__poll_t flags = key_to_poll(key);

	if (flags & EPOLLIN)
		irqfd_inject(kirqfd);

	if (flags & EPOLLHUP) {
		unsigned long flags;

		spin_lock_irqsave(&irqfds_lock, flags);
		irqfd_deactivate(kirqfd);
		spin_unlock_irqrestore(&irqfds_lock, flags);
	}

	return 0;
}

static void
irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
{
	struct privcmd_kernel_irqfd *kirqfd =
		container_of(pt, struct privcmd_kernel_irqfd, pt);

	add_wait_queue_priority(wqh, &kirqfd->wait);
}

static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
{
	struct privcmd_kernel_irqfd *kirqfd, *tmp;
	unsigned long flags;
	__poll_t events;
	struct fd f;
	void *dm_op;
	int ret, idx;

	kirqfd = kzalloc(sizeof(*kirqfd) + irqfd->size, GFP_KERNEL);
	if (!kirqfd)
		return -ENOMEM;
	dm_op = kirqfd + 1;

	if (copy_from_user(dm_op, u64_to_user_ptr(irqfd->dm_op), irqfd->size)) {
		ret = -EFAULT;
		goto error_kfree;
	}

	kirqfd->xbufs.size = irqfd->size;
	set_xen_guest_handle(kirqfd->xbufs.h, dm_op);
	kirqfd->dom = irqfd->dom;
	INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);

	f = fdget(irqfd->fd);
	if (!f.file) {
		ret = -EBADF;
		goto error_kfree;
	}

	kirqfd->eventfd = eventfd_ctx_fileget(f.file);
	if (IS_ERR(kirqfd->eventfd)) {
		ret = PTR_ERR(kirqfd->eventfd);
		goto error_fd_put;
	}

	/*
	 * Install our own custom wake-up handling so we are notified via a
	 * callback whenever someone signals the underlying eventfd.
	 */
	init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup);
	init_poll_funcptr(&kirqfd->pt, irqfd_poll_func);

	spin_lock_irqsave(&irqfds_lock, flags);

	list_for_each_entry(tmp, &irqfds_list, list) {
		if (kirqfd->eventfd == tmp->eventfd) {
			ret = -EBUSY;
			spin_unlock_irqrestore(&irqfds_lock, flags);
			goto error_eventfd;
		}
	}

	idx = srcu_read_lock(&irqfds_srcu);
	list_add_tail(&kirqfd->list, &irqfds_list);
	spin_unlock_irqrestore(&irqfds_lock, flags);

	/*
	 * Check if there was an event already pending on the eventfd before we
	 * registered, and trigger it as if we didn't miss it.
	 */
	events = vfs_poll(f.file, &kirqfd->pt);
	if (events & EPOLLIN)
		irqfd_inject(kirqfd);

	srcu_read_unlock(&irqfds_srcu, idx);

	/*
	 * Do not drop the file until the kirqfd is fully initialized, otherwise
	 * we might race against the EPOLLHUP.
	 */
	fdput(f);
	return 0;

error_eventfd:
	eventfd_ctx_put(kirqfd->eventfd);

error_fd_put:
	fdput(f);

error_kfree:
	kfree(kirqfd);
	return ret;
}

static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd)
{
	struct privcmd_kernel_irqfd *kirqfd;
	struct eventfd_ctx *eventfd;
	unsigned long flags;

	eventfd = eventfd_ctx_fdget(irqfd->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	spin_lock_irqsave(&irqfds_lock, flags);

	list_for_each_entry(kirqfd, &irqfds_list, list) {
		if (kirqfd->eventfd == eventfd) {
			irqfd_deactivate(kirqfd);
			break;
		}
	}

	spin_unlock_irqrestore(&irqfds_lock, flags);

	eventfd_ctx_put(eventfd);

	/*
	 * Block until we know all outstanding shutdown jobs have completed so
	 * that we guarantee there will not be any more interrupts once this
	 * deassign function returns.
	 */
	flush_workqueue(irqfd_cleanup_wq);

	return 0;
}

static long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_irqfd irqfd;

	if (copy_from_user(&irqfd, udata, sizeof(irqfd)))
		return -EFAULT;

	/* No other flags should be set */
	if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN)
		return -EINVAL;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != irqfd.dom)
		return -EPERM;

	if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN)
		return privcmd_irqfd_deassign(&irqfd);

	return privcmd_irqfd_assign(&irqfd);
}

static int privcmd_irqfd_init(void)
{
	irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", 0, 0);
	if (!irqfd_cleanup_wq)
		return -ENOMEM;

	return 0;
}

static void privcmd_irqfd_exit(void)
{
	struct privcmd_kernel_irqfd *kirqfd, *tmp;
	unsigned long flags;

	spin_lock_irqsave(&irqfds_lock, flags);

	list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list)
		irqfd_deactivate(kirqfd);

	spin_unlock_irqrestore(&irqfds_lock, flags);

	destroy_workqueue(irqfd_cleanup_wq);
}
#else
static inline long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
{
	return -EOPNOTSUPP;
}

static inline int privcmd_irqfd_init(void)
{
	return 0;
}

static inline void privcmd_irqfd_exit(void)
{
}
#endif /* CONFIG_XEN_PRIVCMD_IRQFD */

static long privcmd_ioctl(struct file *file,
			  unsigned int cmd, unsigned long data)
{
	int ret = -ENOTTY;
	void __user *udata = (void __user *) data;

	switch (cmd) {
	case IOCTL_PRIVCMD_HYPERCALL:
		ret = privcmd_ioctl_hypercall(file, udata);
		break;

	case IOCTL_PRIVCMD_MMAP:
		ret = privcmd_ioctl_mmap(file, udata);
		break;

	case IOCTL_PRIVCMD_MMAPBATCH:
		ret = privcmd_ioctl_mmap_batch(file, udata, 1);
		break;

	case IOCTL_PRIVCMD_MMAPBATCH_V2:
		ret = privcmd_ioctl_mmap_batch(file, udata, 2);
		break;

	case IOCTL_PRIVCMD_DM_OP:
		ret = privcmd_ioctl_dm_op(file, udata);
		break;

	case IOCTL_PRIVCMD_RESTRICT:
		ret = privcmd_ioctl_restrict(file, udata);
		break;

	case IOCTL_PRIVCMD_MMAP_RESOURCE:
		ret = privcmd_ioctl_mmap_resource(file, udata);
		break;

	case IOCTL_PRIVCMD_IRQFD:
		ret = privcmd_ioctl_irqfd(file, udata);
		break;

	default:
		break;
	}

	return ret;
}

static int privcmd_open(struct inode *ino, struct file *file)
{
	struct privcmd_data *data = kzalloc(sizeof(*data), GFP_KERNEL);

	if (!data)
		return -ENOMEM;

	/* DOMID_INVALID implies no restriction */
	data->domid = DOMID_INVALID;

	file->private_data = data;
	return 0;
}

static int privcmd_release(struct inode *ino, struct file *file)
{
	struct privcmd_data *data = file->private_data;

	kfree(data);
	return 0;
}

static void privcmd_close(struct vm_area_struct *vma)
{
	struct page **pages = vma->vm_private_data;
	int numpgs = vma_pages(vma);
	int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT;
	int rc;

	if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
		return;

	rc = xen_unmap_domain_gfn_range(vma, numgfns, pages);
	if (rc == 0)
		xen_free_unpopulated_pages(numpgs, pages);
	else
		pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n",
			numpgs, rc);
	kvfree(pages);
}

static vm_fault_t privcmd_fault(struct vm_fault *vmf)
{
	printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
	       vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end,
	       vmf->pgoff, (void *)vmf->address);

	return VM_FAULT_SIGBUS;
}

static const struct vm_operations_struct privcmd_vm_ops = {
	.close = privcmd_close,
	.fault = privcmd_fault
};

static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* DONTCOPY is essential for Xen because copy_page_range doesn't know
	 * how to recreate these mappings */
	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTCOPY |
			  VM_DONTEXPAND | VM_DONTDUMP);
	vma->vm_ops = &privcmd_vm_ops;
	vma->vm_private_data = NULL;

	return 0;
}

/*
 * For MMAPBATCH*. This allows asserting the singleshot mapping
 * on a per pfn/pte basis. Mapping calls that fail with ENOENT
 * can be then retried until success.
 */
static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data)
{
	return pte_none(ptep_get(pte)) ? 0 : -EBUSY;
}

static int privcmd_vma_range_is_mapped(
	       struct vm_area_struct *vma,
	       unsigned long addr,
	       unsigned long nr_pages)
{
	return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT,
				   is_mapped_fn, NULL) != 0;
}

const struct file_operations xen_privcmd_fops = {
	.owner = THIS_MODULE,
	.unlocked_ioctl = privcmd_ioctl,
	.open = privcmd_open,
	.release = privcmd_release,
	.mmap = privcmd_mmap,
};
EXPORT_SYMBOL_GPL(xen_privcmd_fops);

static struct miscdevice privcmd_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "xen/privcmd",
	.fops = &xen_privcmd_fops,
};

static int __init privcmd_init(void)
{
	int err;

	if (!xen_domain())
		return -ENODEV;

	err = misc_register(&privcmd_dev);
	if (err != 0) {
		pr_err("Could not register Xen privcmd device\n");
		return err;
	}

	err = misc_register(&xen_privcmdbuf_dev);
	if (err != 0) {
		pr_err("Could not register Xen hypercall-buf device\n");
		goto err_privcmdbuf;
	}

	err = privcmd_irqfd_init();
	if (err != 0) {
		pr_err("irqfd init failed\n");
		goto err_irqfd;
	}

	return 0;

err_irqfd:
	misc_deregister(&xen_privcmdbuf_dev);
err_privcmdbuf:
	misc_deregister(&privcmd_dev);
	return err;
}

static void __exit privcmd_exit(void)
{
	privcmd_irqfd_exit();
	misc_deregister(&privcmd_dev);
	misc_deregister(&xen_privcmdbuf_dev);
}

module_init(privcmd_init);
module_exit(privcmd_exit);
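
/*
 * Editor's usage sketch, not part of the driver: the ioctls above are
 * normally reached through libxenctrl, but they can also be issued directly
 * against the /dev/xen/privcmd node registered by this file.  The
 * hypothetical userspace program below assumes the uapi definitions from
 * <xen/privcmd.h> and uses a purely illustrative domid of 1.  It shows the
 * restriction logic implemented by privcmd_ioctl_restrict() and
 * privcmd_ioctl_hypercall(): once a file descriptor has been restricted to a
 * domid, further IOCTL_PRIVCMD_HYPERCALL requests on it fail with EPERM.
 *
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <xen/privcmd.h>	// IOCTL_PRIVCMD_* and struct privcmd_hypercall
 *
 *	int main(void)
 *	{
 *		uint16_t domid = 1;	// illustrative domain to restrict to
 *		struct privcmd_hypercall call;
 *		int fd = open("/dev/xen/privcmd", O_RDWR | O_CLOEXEC);
 *
 *		if (fd < 0) {
 *			perror("open /dev/xen/privcmd");
 *			return 1;
 *		}
 *
 *		// Restrict this fd; an unrestricted fd keeps domid == DOMID_INVALID.
 *		if (ioctl(fd, IOCTL_PRIVCMD_RESTRICT, &domid))
 *			perror("IOCTL_PRIVCMD_RESTRICT");
 *
 *		// Arbitrary hypercalls are now rejected by privcmd_ioctl_hypercall().
 *		memset(&call, 0, sizeof(call));
 *		if (ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call) < 0)
 *			perror("IOCTL_PRIVCMD_HYPERCALL (EPERM expected)");
 *
 *		close(fd);
 *		return 0;
 *	}
 */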