xref: /openbmc/linux/drivers/xen/privcmd.c (revision 6fa24b41)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /******************************************************************************
3  * privcmd.c
4  *
5  * Interface to privileged domain-0 commands.
6  *
7  * Copyright (c) 2002-2004, K A Fraser, B Dragovic
8  */
9 
10 #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
11 
12 #include <linux/eventfd.h>
13 #include <linux/file.h>
14 #include <linux/kernel.h>
15 #include <linux/module.h>
16 #include <linux/mutex.h>
17 #include <linux/poll.h>
18 #include <linux/sched.h>
19 #include <linux/slab.h>
20 #include <linux/srcu.h>
21 #include <linux/string.h>
22 #include <linux/workqueue.h>
23 #include <linux/errno.h>
24 #include <linux/mm.h>
25 #include <linux/mman.h>
26 #include <linux/uaccess.h>
27 #include <linux/swap.h>
28 #include <linux/highmem.h>
29 #include <linux/pagemap.h>
30 #include <linux/seq_file.h>
31 #include <linux/miscdevice.h>
32 #include <linux/moduleparam.h>
33 
34 #include <asm/xen/hypervisor.h>
35 #include <asm/xen/hypercall.h>
36 
37 #include <xen/xen.h>
38 #include <xen/privcmd.h>
39 #include <xen/interface/xen.h>
40 #include <xen/interface/memory.h>
41 #include <xen/interface/hvm/dm_op.h>
42 #include <xen/features.h>
43 #include <xen/page.h>
44 #include <xen/xen-ops.h>
45 #include <xen/balloon.h>
46 
47 #include "privcmd.h"
48 
49 MODULE_LICENSE("GPL");
50 
51 #define PRIV_VMA_LOCKED ((void *)1)
52 
53 static unsigned int privcmd_dm_op_max_num = 16;
54 module_param_named(dm_op_max_nr_bufs, privcmd_dm_op_max_num, uint, 0644);
55 MODULE_PARM_DESC(dm_op_max_nr_bufs,
56 		 "Maximum number of buffers per dm_op hypercall");
57 
58 static unsigned int privcmd_dm_op_buf_max_size = 4096;
59 module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint,
60 		   0644);
61 MODULE_PARM_DESC(dm_op_buf_max_size,
62 		 "Maximum size of a dm_op hypercall buffer");
63 
64 struct privcmd_data {
65 	domid_t domid;
66 };
67 
/* Defined near the end of the file; needed earlier by the batch mmap ioctl. */
static int privcmd_vma_range_is_mapped(struct vm_area_struct *vma,
				       unsigned long addr,
				       unsigned long nr_pages);
72 
73 static long privcmd_ioctl_hypercall(struct file *file, void __user *udata)
74 {
75 	struct privcmd_data *data = file->private_data;
76 	struct privcmd_hypercall hypercall;
77 	long ret;
78 
79 	/* Disallow arbitrary hypercalls if restricted */
80 	if (data->domid != DOMID_INVALID)
81 		return -EPERM;
82 
83 	if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
84 		return -EFAULT;
85 
86 	xen_preemptible_hcall_begin();
87 	ret = privcmd_call(hypercall.op,
88 			   hypercall.arg[0], hypercall.arg[1],
89 			   hypercall.arg[2], hypercall.arg[3],
90 			   hypercall.arg[4]);
91 	xen_preemptible_hcall_end();
92 
93 	return ret;
94 }
95 
96 static void free_page_list(struct list_head *pages)
97 {
98 	struct page *p, *n;
99 
100 	list_for_each_entry_safe(p, n, pages, lru)
101 		__free_page(p);
102 
103 	INIT_LIST_HEAD(pages);
104 }
105 
106 /*
107  * Given an array of items in userspace, return a list of pages
108  * containing the data.  If copying fails, either because of memory
109  * allocation failure or a problem reading user memory, return an
110  * error code; its up to the caller to dispose of any partial list.
111  */
112 static int gather_array(struct list_head *pagelist,
113 			unsigned nelem, size_t size,
114 			const void __user *data)
115 {
116 	unsigned pageidx;
117 	void *pagedata;
118 	int ret;
119 
120 	if (size > PAGE_SIZE)
121 		return 0;
122 
123 	pageidx = PAGE_SIZE;
124 	pagedata = NULL;	/* quiet, gcc */
125 	while (nelem--) {
126 		if (pageidx > PAGE_SIZE-size) {
127 			struct page *page = alloc_page(GFP_KERNEL);
128 
129 			ret = -ENOMEM;
130 			if (page == NULL)
131 				goto fail;
132 
133 			pagedata = page_address(page);
134 
135 			list_add_tail(&page->lru, pagelist);
136 			pageidx = 0;
137 		}
138 
139 		ret = -EFAULT;
140 		if (copy_from_user(pagedata + pageidx, data, size))
141 			goto fail;
142 
143 		data += size;
144 		pageidx += size;
145 	}
146 
147 	ret = 0;
148 
149 fail:
150 	return ret;
151 }
152 
153 /*
154  * Call function "fn" on each element of the array fragmented
155  * over a list of pages.
156  */
157 static int traverse_pages(unsigned nelem, size_t size,
158 			  struct list_head *pos,
159 			  int (*fn)(void *data, void *state),
160 			  void *state)
161 {
162 	void *pagedata;
163 	unsigned pageidx;
164 	int ret = 0;
165 
166 	BUG_ON(size > PAGE_SIZE);
167 
168 	pageidx = PAGE_SIZE;
169 	pagedata = NULL;	/* hush, gcc */
170 
171 	while (nelem--) {
172 		if (pageidx > PAGE_SIZE-size) {
173 			struct page *page;
174 			pos = pos->next;
175 			page = list_entry(pos, struct page, lru);
176 			pagedata = page_address(page);
177 			pageidx = 0;
178 		}
179 
180 		ret = (*fn)(pagedata + pageidx, state);
181 		if (ret)
182 			break;
183 		pageidx += size;
184 	}
185 
186 	return ret;
187 }
188 
189 /*
190  * Similar to traverse_pages, but use each page as a "block" of
191  * data to be processed as one unit.
192  */
193 static int traverse_pages_block(unsigned nelem, size_t size,
194 				struct list_head *pos,
195 				int (*fn)(void *data, int nr, void *state),
196 				void *state)
197 {
198 	void *pagedata;
199 	int ret = 0;
200 
201 	BUG_ON(size > PAGE_SIZE);
202 
203 	while (nelem) {
204 		int nr = (PAGE_SIZE/size);
205 		struct page *page;
206 		if (nr > nelem)
207 			nr = nelem;
208 		pos = pos->next;
209 		page = list_entry(pos, struct page, lru);
210 		pagedata = page_address(page);
211 		ret = (*fn)(pagedata, nr, state);
212 		if (ret)
213 			break;
214 		nelem -= nr;
215 	}
216 
217 	return ret;
218 }
219 
220 struct mmap_gfn_state {
221 	unsigned long va;
222 	struct vm_area_struct *vma;
223 	domid_t domain;
224 };
225 
226 static int mmap_gfn_range(void *data, void *state)
227 {
228 	struct privcmd_mmap_entry *msg = data;
229 	struct mmap_gfn_state *st = state;
230 	struct vm_area_struct *vma = st->vma;
231 	int rc;
232 
233 	/* Do not allow range to wrap the address space. */
234 	if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
235 	    ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
236 		return -EINVAL;
237 
238 	/* Range chunks must be contiguous in va space. */
239 	if ((msg->va != st->va) ||
240 	    ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
241 		return -EINVAL;
242 
243 	rc = xen_remap_domain_gfn_range(vma,
244 					msg->va & PAGE_MASK,
245 					msg->mfn, msg->npages,
246 					vma->vm_page_prot,
247 					st->domain, NULL);
248 	if (rc < 0)
249 		return rc;
250 
251 	st->va += msg->npages << PAGE_SHIFT;
252 
253 	return 0;
254 }
255 
256 static long privcmd_ioctl_mmap(struct file *file, void __user *udata)
257 {
258 	struct privcmd_data *data = file->private_data;
259 	struct privcmd_mmap mmapcmd;
260 	struct mm_struct *mm = current->mm;
261 	struct vm_area_struct *vma;
262 	int rc;
263 	LIST_HEAD(pagelist);
264 	struct mmap_gfn_state state;
265 
266 	/* We only support privcmd_ioctl_mmap_batch for non-auto-translated. */
267 	if (xen_feature(XENFEAT_auto_translated_physmap))
268 		return -ENOSYS;
269 
270 	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
271 		return -EFAULT;
272 
273 	/* If restriction is in place, check the domid matches */
274 	if (data->domid != DOMID_INVALID && data->domid != mmapcmd.dom)
275 		return -EPERM;
276 
277 	rc = gather_array(&pagelist,
278 			  mmapcmd.num, sizeof(struct privcmd_mmap_entry),
279 			  mmapcmd.entry);
280 
281 	if (rc || list_empty(&pagelist))
282 		goto out;
283 
284 	mmap_write_lock(mm);
285 
286 	{
287 		struct page *page = list_first_entry(&pagelist,
288 						     struct page, lru);
289 		struct privcmd_mmap_entry *msg = page_address(page);
290 
291 		vma = vma_lookup(mm, msg->va);
292 		rc = -EINVAL;
293 
294 		if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data)
295 			goto out_up;
296 		vma->vm_private_data = PRIV_VMA_LOCKED;
297 	}
298 
299 	state.va = vma->vm_start;
300 	state.vma = vma;
301 	state.domain = mmapcmd.dom;
302 
303 	rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
304 			    &pagelist,
305 			    mmap_gfn_range, &state);
306 
307 
308 out_up:
309 	mmap_write_unlock(mm);
310 
311 out:
312 	free_page_list(&pagelist);
313 
314 	return rc;
315 }
316 
317 struct mmap_batch_state {
318 	domid_t domain;
319 	unsigned long va;
320 	struct vm_area_struct *vma;
321 	int index;
322 	/* A tristate:
323 	 *      0 for no errors
324 	 *      1 if at least one error has happened (and no
325 	 *          -ENOENT errors have happened)
326 	 *      -ENOENT if at least 1 -ENOENT has happened.
327 	 */
328 	int global_error;
329 	int version;
330 
331 	/* User-space gfn array to store errors in the second pass for V1. */
332 	xen_pfn_t __user *user_gfn;
333 	/* User-space int array to store errors in the second pass for V2. */
334 	int __user *user_err;
335 };
336 
337 /* auto translated dom0 note: if domU being created is PV, then gfn is
338  * mfn(addr on bus). If it's auto xlated, then gfn is pfn (input to HAP).
339  */
340 static int mmap_batch_fn(void *data, int nr, void *state)
341 {
342 	xen_pfn_t *gfnp = data;
343 	struct mmap_batch_state *st = state;
344 	struct vm_area_struct *vma = st->vma;
345 	struct page **pages = vma->vm_private_data;
346 	struct page **cur_pages = NULL;
347 	int ret;
348 
349 	if (xen_feature(XENFEAT_auto_translated_physmap))
350 		cur_pages = &pages[st->index];
351 
352 	BUG_ON(nr < 0);
353 	ret = xen_remap_domain_gfn_array(st->vma, st->va & PAGE_MASK, gfnp, nr,
354 					 (int *)gfnp, st->vma->vm_page_prot,
355 					 st->domain, cur_pages);
356 
357 	/* Adjust the global_error? */
358 	if (ret != nr) {
359 		if (ret == -ENOENT)
360 			st->global_error = -ENOENT;
361 		else {
362 			/* Record that at least one error has happened. */
363 			if (st->global_error == 0)
364 				st->global_error = 1;
365 		}
366 	}
367 	st->va += XEN_PAGE_SIZE * nr;
368 	st->index += nr / XEN_PFN_PER_PAGE;
369 
370 	return 0;
371 }
372 
373 static int mmap_return_error(int err, struct mmap_batch_state *st)
374 {
375 	int ret;
376 
377 	if (st->version == 1) {
378 		if (err) {
379 			xen_pfn_t gfn;
380 
381 			ret = get_user(gfn, st->user_gfn);
382 			if (ret < 0)
383 				return ret;
384 			/*
385 			 * V1 encodes the error codes in the 32bit top
386 			 * nibble of the gfn (with its known
387 			 * limitations vis-a-vis 64 bit callers).
388 			 */
389 			gfn |= (err == -ENOENT) ?
390 				PRIVCMD_MMAPBATCH_PAGED_ERROR :
391 				PRIVCMD_MMAPBATCH_MFN_ERROR;
392 			return __put_user(gfn, st->user_gfn++);
393 		} else
394 			st->user_gfn++;
395 	} else { /* st->version == 2 */
396 		if (err)
397 			return __put_user(err, st->user_err++);
398 		else
399 			st->user_err++;
400 	}
401 
402 	return 0;
403 }
404 
/*
 * traverse_pages_block() callback for the second pass of the batch mmap
 * ioctl: report each of the @nr per-frame results stored at @data.
 *
 * Returns 0, or the first negative value from mmap_return_error().
 */
static int mmap_return_errors(void *data, int nr, void *state)
{
	struct mmap_batch_state *st = state;
	int *cur = data;

	while (nr-- > 0) {
		int ret = mmap_return_error(*cur++, st);

		if (ret < 0)
			return ret;
	}

	return 0;
}
419 
420 /* Allocate pfns that are then mapped with gfns from foreign domid. Update
421  * the vma with the page info to use later.
422  * Returns: 0 if success, otherwise -errno
423  */
424 static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs)
425 {
426 	int rc;
427 	struct page **pages;
428 
429 	pages = kvcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL);
430 	if (pages == NULL)
431 		return -ENOMEM;
432 
433 	rc = xen_alloc_unpopulated_pages(numpgs, pages);
434 	if (rc != 0) {
435 		pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__,
436 			numpgs, rc);
437 		kvfree(pages);
438 		return -ENOMEM;
439 	}
440 	BUG_ON(vma->vm_private_data != NULL);
441 	vma->vm_private_data = pages;
442 
443 	return 0;
444 }
445 
446 static const struct vm_operations_struct privcmd_vm_ops;
447 
448 static long privcmd_ioctl_mmap_batch(
449 	struct file *file, void __user *udata, int version)
450 {
451 	struct privcmd_data *data = file->private_data;
452 	int ret;
453 	struct privcmd_mmapbatch_v2 m;
454 	struct mm_struct *mm = current->mm;
455 	struct vm_area_struct *vma;
456 	unsigned long nr_pages;
457 	LIST_HEAD(pagelist);
458 	struct mmap_batch_state state;
459 
460 	switch (version) {
461 	case 1:
462 		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch)))
463 			return -EFAULT;
464 		/* Returns per-frame error in m.arr. */
465 		m.err = NULL;
466 		if (!access_ok(m.arr, m.num * sizeof(*m.arr)))
467 			return -EFAULT;
468 		break;
469 	case 2:
470 		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2)))
471 			return -EFAULT;
472 		/* Returns per-frame error code in m.err. */
473 		if (!access_ok(m.err, m.num * (sizeof(*m.err))))
474 			return -EFAULT;
475 		break;
476 	default:
477 		return -EINVAL;
478 	}
479 
480 	/* If restriction is in place, check the domid matches */
481 	if (data->domid != DOMID_INVALID && data->domid != m.dom)
482 		return -EPERM;
483 
484 	nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE);
485 	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
486 		return -EINVAL;
487 
488 	ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr);
489 
490 	if (ret)
491 		goto out;
492 	if (list_empty(&pagelist)) {
493 		ret = -EINVAL;
494 		goto out;
495 	}
496 
497 	if (version == 2) {
498 		/* Zero error array now to only copy back actual errors. */
499 		if (clear_user(m.err, sizeof(int) * m.num)) {
500 			ret = -EFAULT;
501 			goto out;
502 		}
503 	}
504 
505 	mmap_write_lock(mm);
506 
507 	vma = find_vma(mm, m.addr);
508 	if (!vma ||
509 	    vma->vm_ops != &privcmd_vm_ops) {
510 		ret = -EINVAL;
511 		goto out_unlock;
512 	}
513 
514 	/*
515 	 * Caller must either:
516 	 *
517 	 * Map the whole VMA range, which will also allocate all the
518 	 * pages required for the auto_translated_physmap case.
519 	 *
520 	 * Or
521 	 *
522 	 * Map unmapped holes left from a previous map attempt (e.g.,
523 	 * because those foreign frames were previously paged out).
524 	 */
525 	if (vma->vm_private_data == NULL) {
526 		if (m.addr != vma->vm_start ||
527 		    m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) {
528 			ret = -EINVAL;
529 			goto out_unlock;
530 		}
531 		if (xen_feature(XENFEAT_auto_translated_physmap)) {
532 			ret = alloc_empty_pages(vma, nr_pages);
533 			if (ret < 0)
534 				goto out_unlock;
535 		} else
536 			vma->vm_private_data = PRIV_VMA_LOCKED;
537 	} else {
538 		if (m.addr < vma->vm_start ||
539 		    m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) {
540 			ret = -EINVAL;
541 			goto out_unlock;
542 		}
543 		if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) {
544 			ret = -EINVAL;
545 			goto out_unlock;
546 		}
547 	}
548 
549 	state.domain        = m.dom;
550 	state.vma           = vma;
551 	state.va            = m.addr;
552 	state.index         = 0;
553 	state.global_error  = 0;
554 	state.version       = version;
555 
556 	BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0);
557 	/* mmap_batch_fn guarantees ret == 0 */
558 	BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
559 				    &pagelist, mmap_batch_fn, &state));
560 
561 	mmap_write_unlock(mm);
562 
563 	if (state.global_error) {
564 		/* Write back errors in second pass. */
565 		state.user_gfn = (xen_pfn_t *)m.arr;
566 		state.user_err = m.err;
567 		ret = traverse_pages_block(m.num, sizeof(xen_pfn_t),
568 					   &pagelist, mmap_return_errors, &state);
569 	} else
570 		ret = 0;
571 
572 	/* If we have not had any EFAULT-like global errors then set the global
573 	 * error to -ENOENT if necessary. */
574 	if ((ret == 0) && (state.global_error == -ENOENT))
575 		ret = -ENOENT;
576 
577 out:
578 	free_page_list(&pagelist);
579 	return ret;
580 
581 out_unlock:
582 	mmap_write_unlock(mm);
583 	goto out;
584 }
585 
586 static int lock_pages(
587 	struct privcmd_dm_op_buf kbufs[], unsigned int num,
588 	struct page *pages[], unsigned int nr_pages, unsigned int *pinned)
589 {
590 	unsigned int i, off = 0;
591 
592 	for (i = 0; i < num; ) {
593 		unsigned int requested;
594 		int page_count;
595 
596 		requested = DIV_ROUND_UP(
597 			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
598 			PAGE_SIZE) - off;
599 		if (requested > nr_pages)
600 			return -ENOSPC;
601 
602 		page_count = pin_user_pages_fast(
603 			(unsigned long)kbufs[i].uptr + off * PAGE_SIZE,
604 			requested, FOLL_WRITE, pages);
605 		if (page_count <= 0)
606 			return page_count ? : -EFAULT;
607 
608 		*pinned += page_count;
609 		nr_pages -= page_count;
610 		pages += page_count;
611 
612 		off = (requested == page_count) ? 0 : off + page_count;
613 		i += !off;
614 	}
615 
616 	return 0;
617 }
618 
619 static void unlock_pages(struct page *pages[], unsigned int nr_pages)
620 {
621 	unpin_user_pages_dirty_lock(pages, nr_pages, true);
622 }
623 
624 static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
625 {
626 	struct privcmd_data *data = file->private_data;
627 	struct privcmd_dm_op kdata;
628 	struct privcmd_dm_op_buf *kbufs;
629 	unsigned int nr_pages = 0;
630 	struct page **pages = NULL;
631 	struct xen_dm_op_buf *xbufs = NULL;
632 	unsigned int i;
633 	long rc;
634 	unsigned int pinned = 0;
635 
636 	if (copy_from_user(&kdata, udata, sizeof(kdata)))
637 		return -EFAULT;
638 
639 	/* If restriction is in place, check the domid matches */
640 	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
641 		return -EPERM;
642 
643 	if (kdata.num == 0)
644 		return 0;
645 
646 	if (kdata.num > privcmd_dm_op_max_num)
647 		return -E2BIG;
648 
649 	kbufs = kcalloc(kdata.num, sizeof(*kbufs), GFP_KERNEL);
650 	if (!kbufs)
651 		return -ENOMEM;
652 
653 	if (copy_from_user(kbufs, kdata.ubufs,
654 			   sizeof(*kbufs) * kdata.num)) {
655 		rc = -EFAULT;
656 		goto out;
657 	}
658 
659 	for (i = 0; i < kdata.num; i++) {
660 		if (kbufs[i].size > privcmd_dm_op_buf_max_size) {
661 			rc = -E2BIG;
662 			goto out;
663 		}
664 
665 		if (!access_ok(kbufs[i].uptr,
666 			       kbufs[i].size)) {
667 			rc = -EFAULT;
668 			goto out;
669 		}
670 
671 		nr_pages += DIV_ROUND_UP(
672 			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
673 			PAGE_SIZE);
674 	}
675 
676 	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
677 	if (!pages) {
678 		rc = -ENOMEM;
679 		goto out;
680 	}
681 
682 	xbufs = kcalloc(kdata.num, sizeof(*xbufs), GFP_KERNEL);
683 	if (!xbufs) {
684 		rc = -ENOMEM;
685 		goto out;
686 	}
687 
688 	rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned);
689 	if (rc < 0)
690 		goto out;
691 
692 	for (i = 0; i < kdata.num; i++) {
693 		set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr);
694 		xbufs[i].size = kbufs[i].size;
695 	}
696 
697 	xen_preemptible_hcall_begin();
698 	rc = HYPERVISOR_dm_op(kdata.dom, kdata.num, xbufs);
699 	xen_preemptible_hcall_end();
700 
701 out:
702 	unlock_pages(pages, pinned);
703 	kfree(xbufs);
704 	kfree(pages);
705 	kfree(kbufs);
706 
707 	return rc;
708 }
709 
710 static long privcmd_ioctl_restrict(struct file *file, void __user *udata)
711 {
712 	struct privcmd_data *data = file->private_data;
713 	domid_t dom;
714 
715 	if (copy_from_user(&dom, udata, sizeof(dom)))
716 		return -EFAULT;
717 
718 	/* Set restriction to the specified domain, or check it matches */
719 	if (data->domid == DOMID_INVALID)
720 		data->domid = dom;
721 	else if (data->domid != dom)
722 		return -EINVAL;
723 
724 	return 0;
725 }
726 
727 static long privcmd_ioctl_mmap_resource(struct file *file,
728 				struct privcmd_mmap_resource __user *udata)
729 {
730 	struct privcmd_data *data = file->private_data;
731 	struct mm_struct *mm = current->mm;
732 	struct vm_area_struct *vma;
733 	struct privcmd_mmap_resource kdata;
734 	xen_pfn_t *pfns = NULL;
735 	struct xen_mem_acquire_resource xdata = { };
736 	int rc;
737 
738 	if (copy_from_user(&kdata, udata, sizeof(kdata)))
739 		return -EFAULT;
740 
741 	/* If restriction is in place, check the domid matches */
742 	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
743 		return -EPERM;
744 
745 	/* Both fields must be set or unset */
746 	if (!!kdata.addr != !!kdata.num)
747 		return -EINVAL;
748 
749 	xdata.domid = kdata.dom;
750 	xdata.type = kdata.type;
751 	xdata.id = kdata.id;
752 
753 	if (!kdata.addr && !kdata.num) {
754 		/* Query the size of the resource. */
755 		rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata);
756 		if (rc)
757 			return rc;
758 		return __put_user(xdata.nr_frames, &udata->num);
759 	}
760 
761 	mmap_write_lock(mm);
762 
763 	vma = find_vma(mm, kdata.addr);
764 	if (!vma || vma->vm_ops != &privcmd_vm_ops) {
765 		rc = -EINVAL;
766 		goto out;
767 	}
768 
769 	pfns = kcalloc(kdata.num, sizeof(*pfns), GFP_KERNEL | __GFP_NOWARN);
770 	if (!pfns) {
771 		rc = -ENOMEM;
772 		goto out;
773 	}
774 
775 	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
776 	    xen_feature(XENFEAT_auto_translated_physmap)) {
777 		unsigned int nr = DIV_ROUND_UP(kdata.num, XEN_PFN_PER_PAGE);
778 		struct page **pages;
779 		unsigned int i;
780 
781 		rc = alloc_empty_pages(vma, nr);
782 		if (rc < 0)
783 			goto out;
784 
785 		pages = vma->vm_private_data;
786 		for (i = 0; i < kdata.num; i++) {
787 			xen_pfn_t pfn =
788 				page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]);
789 
790 			pfns[i] = pfn + (i % XEN_PFN_PER_PAGE);
791 		}
792 	} else
793 		vma->vm_private_data = PRIV_VMA_LOCKED;
794 
795 	xdata.frame = kdata.idx;
796 	xdata.nr_frames = kdata.num;
797 	set_xen_guest_handle(xdata.frame_list, pfns);
798 
799 	xen_preemptible_hcall_begin();
800 	rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata);
801 	xen_preemptible_hcall_end();
802 
803 	if (rc)
804 		goto out;
805 
806 	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
807 	    xen_feature(XENFEAT_auto_translated_physmap)) {
808 		rc = xen_remap_vma_range(vma, kdata.addr, kdata.num << PAGE_SHIFT);
809 	} else {
810 		unsigned int domid =
811 			(xdata.flags & XENMEM_rsrc_acq_caller_owned) ?
812 			DOMID_SELF : kdata.dom;
813 		int num, *errs = (int *)pfns;
814 
815 		BUILD_BUG_ON(sizeof(*errs) > sizeof(*pfns));
816 		num = xen_remap_domain_mfn_array(vma,
817 						 kdata.addr & PAGE_MASK,
818 						 pfns, kdata.num, errs,
819 						 vma->vm_page_prot,
820 						 domid);
821 		if (num < 0)
822 			rc = num;
823 		else if (num != kdata.num) {
824 			unsigned int i;
825 
826 			for (i = 0; i < num; i++) {
827 				rc = errs[i];
828 				if (rc < 0)
829 					break;
830 			}
831 		} else
832 			rc = 0;
833 	}
834 
835 out:
836 	mmap_write_unlock(mm);
837 	kfree(pfns);
838 
839 	return rc;
840 }
841 
842 #ifdef CONFIG_XEN_PRIVCMD_IRQFD
/* Irqfd support */

/* Workqueue on which deactivated irqfds are torn down (irqfd_shutdown()). */
static struct workqueue_struct *irqfd_cleanup_wq;
/* Protects irqfds_list. */
static DEFINE_SPINLOCK(irqfds_lock);
/* Held for reading while a new irqfd is still being set up. */
DEFINE_STATIC_SRCU(irqfds_srcu);
/* All currently assigned irqfds. */
static LIST_HEAD(irqfds_list);
848 
849 struct privcmd_kernel_irqfd {
850 	struct xen_dm_op_buf xbufs;
851 	domid_t dom;
852 	bool error;
853 	struct eventfd_ctx *eventfd;
854 	struct work_struct shutdown;
855 	wait_queue_entry_t wait;
856 	struct list_head list;
857 	poll_table pt;
858 };
859 
860 static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd)
861 {
862 	lockdep_assert_held(&irqfds_lock);
863 
864 	list_del_init(&kirqfd->list);
865 	queue_work(irqfd_cleanup_wq, &kirqfd->shutdown);
866 }
867 
868 static void irqfd_shutdown(struct work_struct *work)
869 {
870 	struct privcmd_kernel_irqfd *kirqfd =
871 		container_of(work, struct privcmd_kernel_irqfd, shutdown);
872 	u64 cnt;
873 
874 	/* Make sure irqfd has been initialized in assign path */
875 	synchronize_srcu(&irqfds_srcu);
876 
877 	eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt);
878 	eventfd_ctx_put(kirqfd->eventfd);
879 	kfree(kirqfd);
880 }
881 
882 static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd)
883 {
884 	u64 cnt;
885 	long rc;
886 
887 	eventfd_ctx_do_read(kirqfd->eventfd, &cnt);
888 
889 	xen_preemptible_hcall_begin();
890 	rc = HYPERVISOR_dm_op(kirqfd->dom, 1, &kirqfd->xbufs);
891 	xen_preemptible_hcall_end();
892 
893 	/* Don't repeat the error message for consecutive failures */
894 	if (rc && !kirqfd->error) {
895 		pr_err("Failed to configure irq for guest domain: %d\n",
896 		       kirqfd->dom);
897 	}
898 
899 	kirqfd->error = rc;
900 }
901 
902 static int
903 irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
904 {
905 	struct privcmd_kernel_irqfd *kirqfd =
906 		container_of(wait, struct privcmd_kernel_irqfd, wait);
907 	__poll_t flags = key_to_poll(key);
908 
909 	if (flags & EPOLLIN)
910 		irqfd_inject(kirqfd);
911 
912 	if (flags & EPOLLHUP) {
913 		unsigned long flags;
914 
915 		spin_lock_irqsave(&irqfds_lock, flags);
916 		irqfd_deactivate(kirqfd);
917 		spin_unlock_irqrestore(&irqfds_lock, flags);
918 	}
919 
920 	return 0;
921 }
922 
923 static void
924 irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
925 {
926 	struct privcmd_kernel_irqfd *kirqfd =
927 		container_of(pt, struct privcmd_kernel_irqfd, pt);
928 
929 	add_wait_queue_priority(wqh, &kirqfd->wait);
930 }
931 
932 static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
933 {
934 	struct privcmd_kernel_irqfd *kirqfd, *tmp;
935 	unsigned long flags;
936 	__poll_t events;
937 	struct fd f;
938 	void *dm_op;
939 	int ret, idx;
940 
941 	kirqfd = kzalloc(sizeof(*kirqfd) + irqfd->size, GFP_KERNEL);
942 	if (!kirqfd)
943 		return -ENOMEM;
944 	dm_op = kirqfd + 1;
945 
946 	if (copy_from_user(dm_op, u64_to_user_ptr(irqfd->dm_op), irqfd->size)) {
947 		ret = -EFAULT;
948 		goto error_kfree;
949 	}
950 
951 	kirqfd->xbufs.size = irqfd->size;
952 	set_xen_guest_handle(kirqfd->xbufs.h, dm_op);
953 	kirqfd->dom = irqfd->dom;
954 	INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);
955 
956 	f = fdget(irqfd->fd);
957 	if (!f.file) {
958 		ret = -EBADF;
959 		goto error_kfree;
960 	}
961 
962 	kirqfd->eventfd = eventfd_ctx_fileget(f.file);
963 	if (IS_ERR(kirqfd->eventfd)) {
964 		ret = PTR_ERR(kirqfd->eventfd);
965 		goto error_fd_put;
966 	}
967 
968 	/*
969 	 * Install our own custom wake-up handling so we are notified via a
970 	 * callback whenever someone signals the underlying eventfd.
971 	 */
972 	init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup);
973 	init_poll_funcptr(&kirqfd->pt, irqfd_poll_func);
974 
975 	spin_lock_irqsave(&irqfds_lock, flags);
976 
977 	list_for_each_entry(tmp, &irqfds_list, list) {
978 		if (kirqfd->eventfd == tmp->eventfd) {
979 			ret = -EBUSY;
980 			spin_unlock_irqrestore(&irqfds_lock, flags);
981 			goto error_eventfd;
982 		}
983 	}
984 
985 	idx = srcu_read_lock(&irqfds_srcu);
986 	list_add_tail(&kirqfd->list, &irqfds_list);
987 	spin_unlock_irqrestore(&irqfds_lock, flags);
988 
989 	/*
990 	 * Check if there was an event already pending on the eventfd before we
991 	 * registered, and trigger it as if we didn't miss it.
992 	 */
993 	events = vfs_poll(f.file, &kirqfd->pt);
994 	if (events & EPOLLIN)
995 		irqfd_inject(kirqfd);
996 
997 	srcu_read_unlock(&irqfds_srcu, idx);
998 
999 	/*
1000 	 * Do not drop the file until the kirqfd is fully initialized, otherwise
1001 	 * we might race against the EPOLLHUP.
1002 	 */
1003 	fdput(f);
1004 	return 0;
1005 
1006 error_eventfd:
1007 	eventfd_ctx_put(kirqfd->eventfd);
1008 
1009 error_fd_put:
1010 	fdput(f);
1011 
1012 error_kfree:
1013 	kfree(kirqfd);
1014 	return ret;
1015 }
1016 
1017 static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd)
1018 {
1019 	struct privcmd_kernel_irqfd *kirqfd;
1020 	struct eventfd_ctx *eventfd;
1021 	unsigned long flags;
1022 
1023 	eventfd = eventfd_ctx_fdget(irqfd->fd);
1024 	if (IS_ERR(eventfd))
1025 		return PTR_ERR(eventfd);
1026 
1027 	spin_lock_irqsave(&irqfds_lock, flags);
1028 
1029 	list_for_each_entry(kirqfd, &irqfds_list, list) {
1030 		if (kirqfd->eventfd == eventfd) {
1031 			irqfd_deactivate(kirqfd);
1032 			break;
1033 		}
1034 	}
1035 
1036 	spin_unlock_irqrestore(&irqfds_lock, flags);
1037 
1038 	eventfd_ctx_put(eventfd);
1039 
1040 	/*
1041 	 * Block until we know all outstanding shutdown jobs have completed so
1042 	 * that we guarantee there will not be any more interrupts once this
1043 	 * deassign function returns.
1044 	 */
1045 	flush_workqueue(irqfd_cleanup_wq);
1046 
1047 	return 0;
1048 }
1049 
1050 static long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
1051 {
1052 	struct privcmd_data *data = file->private_data;
1053 	struct privcmd_irqfd irqfd;
1054 
1055 	if (copy_from_user(&irqfd, udata, sizeof(irqfd)))
1056 		return -EFAULT;
1057 
1058 	/* No other flags should be set */
1059 	if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN)
1060 		return -EINVAL;
1061 
1062 	/* If restriction is in place, check the domid matches */
1063 	if (data->domid != DOMID_INVALID && data->domid != irqfd.dom)
1064 		return -EPERM;
1065 
1066 	if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN)
1067 		return privcmd_irqfd_deassign(&irqfd);
1068 
1069 	return privcmd_irqfd_assign(&irqfd);
1070 }
1071 
1072 static int privcmd_irqfd_init(void)
1073 {
1074 	irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", 0, 0);
1075 	if (!irqfd_cleanup_wq)
1076 		return -ENOMEM;
1077 
1078 	return 0;
1079 }
1080 
1081 static void privcmd_irqfd_exit(void)
1082 {
1083 	struct privcmd_kernel_irqfd *kirqfd, *tmp;
1084 	unsigned long flags;
1085 
1086 	spin_lock_irqsave(&irqfds_lock, flags);
1087 
1088 	list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list)
1089 		irqfd_deactivate(kirqfd);
1090 
1091 	spin_unlock_irqrestore(&irqfds_lock, flags);
1092 
1093 	destroy_workqueue(irqfd_cleanup_wq);
1094 }
1095 #else
1096 static inline long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
1097 {
1098 	return -EOPNOTSUPP;
1099 }
1100 
/* Irqfd support compiled out: nothing to set up, always succeeds. */
static inline int privcmd_irqfd_init(void)
{
	const int ok = 0;

	return ok;
}
1105 
/* Irqfd support compiled out: nothing to tear down. */
static inline void privcmd_irqfd_exit(void)
{
	return;
}
1109 #endif /* CONFIG_XEN_PRIVCMD_IRQFD */
1110 
1111 static long privcmd_ioctl(struct file *file,
1112 			  unsigned int cmd, unsigned long data)
1113 {
1114 	int ret = -ENOTTY;
1115 	void __user *udata = (void __user *) data;
1116 
1117 	switch (cmd) {
1118 	case IOCTL_PRIVCMD_HYPERCALL:
1119 		ret = privcmd_ioctl_hypercall(file, udata);
1120 		break;
1121 
1122 	case IOCTL_PRIVCMD_MMAP:
1123 		ret = privcmd_ioctl_mmap(file, udata);
1124 		break;
1125 
1126 	case IOCTL_PRIVCMD_MMAPBATCH:
1127 		ret = privcmd_ioctl_mmap_batch(file, udata, 1);
1128 		break;
1129 
1130 	case IOCTL_PRIVCMD_MMAPBATCH_V2:
1131 		ret = privcmd_ioctl_mmap_batch(file, udata, 2);
1132 		break;
1133 
1134 	case IOCTL_PRIVCMD_DM_OP:
1135 		ret = privcmd_ioctl_dm_op(file, udata);
1136 		break;
1137 
1138 	case IOCTL_PRIVCMD_RESTRICT:
1139 		ret = privcmd_ioctl_restrict(file, udata);
1140 		break;
1141 
1142 	case IOCTL_PRIVCMD_MMAP_RESOURCE:
1143 		ret = privcmd_ioctl_mmap_resource(file, udata);
1144 		break;
1145 
1146 	case IOCTL_PRIVCMD_IRQFD:
1147 		ret = privcmd_ioctl_irqfd(file, udata);
1148 		break;
1149 
1150 	default:
1151 		break;
1152 	}
1153 
1154 	return ret;
1155 }
1156 
1157 static int privcmd_open(struct inode *ino, struct file *file)
1158 {
1159 	struct privcmd_data *data = kzalloc(sizeof(*data), GFP_KERNEL);
1160 
1161 	if (!data)
1162 		return -ENOMEM;
1163 
1164 	/* DOMID_INVALID implies no restriction */
1165 	data->domid = DOMID_INVALID;
1166 
1167 	file->private_data = data;
1168 	return 0;
1169 }
1170 
1171 static int privcmd_release(struct inode *ino, struct file *file)
1172 {
1173 	struct privcmd_data *data = file->private_data;
1174 
1175 	kfree(data);
1176 	return 0;
1177 }
1178 
1179 static void privcmd_close(struct vm_area_struct *vma)
1180 {
1181 	struct page **pages = vma->vm_private_data;
1182 	int numpgs = vma_pages(vma);
1183 	int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT;
1184 	int rc;
1185 
1186 	if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
1187 		return;
1188 
1189 	rc = xen_unmap_domain_gfn_range(vma, numgfns, pages);
1190 	if (rc == 0)
1191 		xen_free_unpopulated_pages(numpgs, pages);
1192 	else
1193 		pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n",
1194 			numpgs, rc);
1195 	kvfree(pages);
1196 }
1197 
1198 static vm_fault_t privcmd_fault(struct vm_fault *vmf)
1199 {
1200 	printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
1201 	       vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end,
1202 	       vmf->pgoff, (void *)vmf->address);
1203 
1204 	return VM_FAULT_SIGBUS;
1205 }
1206 
1207 static const struct vm_operations_struct privcmd_vm_ops = {
1208 	.close = privcmd_close,
1209 	.fault = privcmd_fault
1210 };
1211 
1212 static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
1213 {
1214 	/* DONTCOPY is essential for Xen because copy_page_range doesn't know
1215 	 * how to recreate these mappings */
1216 	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTCOPY |
1217 			 VM_DONTEXPAND | VM_DONTDUMP);
1218 	vma->vm_ops = &privcmd_vm_ops;
1219 	vma->vm_private_data = NULL;
1220 
1221 	return 0;
1222 }
1223 
1224 /*
1225  * For MMAPBATCH*. This allows asserting the singleshot mapping
1226  * on a per pfn/pte basis. Mapping calls that fail with ENOENT
1227  * can be then retried until success.
1228  */
1229 static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data)
1230 {
1231 	return pte_none(ptep_get(pte)) ? 0 : -EBUSY;
1232 }
1233 
1234 static int privcmd_vma_range_is_mapped(
1235 	           struct vm_area_struct *vma,
1236 	           unsigned long addr,
1237 	           unsigned long nr_pages)
1238 {
1239 	return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT,
1240 				   is_mapped_fn, NULL) != 0;
1241 }
1242 
1243 const struct file_operations xen_privcmd_fops = {
1244 	.owner = THIS_MODULE,
1245 	.unlocked_ioctl = privcmd_ioctl,
1246 	.open = privcmd_open,
1247 	.release = privcmd_release,
1248 	.mmap = privcmd_mmap,
1249 };
1250 EXPORT_SYMBOL_GPL(xen_privcmd_fops);
1251 
1252 static struct miscdevice privcmd_dev = {
1253 	.minor = MISC_DYNAMIC_MINOR,
1254 	.name = "xen/privcmd",
1255 	.fops = &xen_privcmd_fops,
1256 };
1257 
1258 static int __init privcmd_init(void)
1259 {
1260 	int err;
1261 
1262 	if (!xen_domain())
1263 		return -ENODEV;
1264 
1265 	err = misc_register(&privcmd_dev);
1266 	if (err != 0) {
1267 		pr_err("Could not register Xen privcmd device\n");
1268 		return err;
1269 	}
1270 
1271 	err = misc_register(&xen_privcmdbuf_dev);
1272 	if (err != 0) {
1273 		pr_err("Could not register Xen hypercall-buf device\n");
1274 		goto err_privcmdbuf;
1275 	}
1276 
1277 	err = privcmd_irqfd_init();
1278 	if (err != 0) {
1279 		pr_err("irqfd init failed\n");
1280 		goto err_irqfd;
1281 	}
1282 
1283 	return 0;
1284 
1285 err_irqfd:
1286 	misc_deregister(&xen_privcmdbuf_dev);
1287 err_privcmdbuf:
1288 	misc_deregister(&privcmd_dev);
1289 	return err;
1290 }
1291 
1292 static void __exit privcmd_exit(void)
1293 {
1294 	privcmd_irqfd_exit();
1295 	misc_deregister(&privcmd_dev);
1296 	misc_deregister(&xen_privcmdbuf_dev);
1297 }
1298 
/* Module entry and exit points. */
module_init(privcmd_init);
module_exit(privcmd_exit);
1301