xref: /openbmc/linux/drivers/gpu/drm/i915/gvt/kvmgt.c (revision c0ecca6604b80e438b032578634c6e133c7028f6)
1 /*
2  * KVMGT - the implementation of Intel mediated pass-through framework for KVM
3  *
4  * Copyright(c) 2014-2016 Intel Corporation. All rights reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23  * SOFTWARE.
24  *
25  * Authors:
26  *    Kevin Tian <kevin.tian@intel.com>
27  *    Jike Song <jike.song@intel.com>
28  *    Xiaoguang Chen <xiaoguang.chen@intel.com>
29  */
30 
31 #include <linux/init.h>
32 #include <linux/device.h>
33 #include <linux/mm.h>
34 #include <linux/kthread.h>
35 #include <linux/sched/mm.h>
36 #include <linux/types.h>
37 #include <linux/list.h>
38 #include <linux/rbtree.h>
39 #include <linux/spinlock.h>
40 #include <linux/eventfd.h>
41 #include <linux/uuid.h>
42 #include <linux/kvm_host.h>
43 #include <linux/vfio.h>
44 #include <linux/mdev.h>
45 #include <linux/debugfs.h>
46 
47 #include <linux/nospec.h>
48 
49 #include "i915_drv.h"
50 #include "gvt.h"
51 
/*
 * GVT core callbacks used throughout this file (vgpu_create, vgpu_destroy,
 * vgpu_activate, vgpu_release, emulate_hotplug, emulate_cfg_read/write and
 * emulate_mmio_read/write). The code that assigns this pointer is not in
 * the visible part of the file.
 */
static const struct intel_gvt_ops *intel_gvt_ops;
53 
/* helper macros copied from vfio-pci */
/* A file offset carries the region index in the bits at and above this shift. */
#define VFIO_PCI_OFFSET_SHIFT   40
/* Argument is parenthesised so compound expressions are shifted as a whole. */
#define VFIO_PCI_OFFSET_TO_INDEX(off)   ((off) >> VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
/* Mask selecting the offset-within-region part of a file offset. */
#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)

/* Offset of the EDID blob inside the EDID region (half a page). */
#define EDID_BLOB_OFFSET (PAGE_SIZE/2)

/* kvmgt_set_opregion() compares the first 16 bytes of the OpRegion to this. */
#define OPREGION_SIGNATURE "IntelGraphicsMem"
63 
struct vfio_region;
/*
 * Callbacks attached to each device-specific region registered through
 * intel_vgpu_register_reg().
 */
struct intel_vgpu_regops {
	/*
	 * Read from or write to the region. *ppos is the full file offset;
	 * the implementations in this file extract the region index and the
	 * in-region offset from it themselves. They return the byte count,
	 * or a negative errno that is implicitly converted to size_t.
	 */
	size_t (*rw)(struct intel_vgpu *vgpu, char *buf,
			size_t count, loff_t *ppos, bool iswrite);
	/*
	 * Tear-down hook for the region. The EDID implementation frees
	 * region->data; the OpRegion implementation does nothing.
	 */
	void (*release)(struct intel_vgpu *vgpu,
			struct vfio_region *region);
};
71 
/*
 * One device-specific region. Every field is filled in verbatim from the
 * arguments of intel_vgpu_register_reg().
 */
struct vfio_region {
	u32				type;
	u32				subtype;
	size_t				size;
	u32				flags;
	/* rw/release callbacks for this region */
	const struct intel_vgpu_regops	*ops;
	/* opaque per-region pointer, interpreted by ops */
	void				*data;
};
80 
/* Backing data of the EDID region created by kvmgt_set_edid(). */
struct vfio_edid_region {
	/* register block served for offsets below edid_offset */
	struct vfio_region_gfx_edid vfio_edid_regs;
	/* EDID bytes; kvmgt_set_edid() points this at the port's edid_block */
	void *edid_blob;
};
85 
/* One entry of kvmgt_guest_info.ptable: a guest frame number being tracked. */
struct kvmgt_pgfn {
	gfn_t gfn;
	/* link in the ptable hash bucket */
	struct hlist_node hnode;
};
90 
/*
 * Per-guest state. __intel_vgpu_release() recovers a pointer to this
 * structure by casting vgpu->handle.
 */
struct kvmgt_guest_info {
	struct kvm *kvm;
	struct intel_vgpu *vgpu;
	struct kvm_page_track_notifier_node track_node;
	/*
	 * Hash table of struct kvmgt_pgfn entries, managed by the
	 * kvmgt_protect_table_*() helpers. NR_BKT only sizes this array and
	 * is undefined again right after.
	 */
#define NR_BKT (1 << 18)
	struct hlist_head ptable[NR_BKT];
#undef NR_BKT
	struct dentry *debugfs_cache_entries;
};
100 
/*
 * One cached DMA mapping of a guest page. Every entry is linked into both
 * rb-trees of struct kvmgt_vdev: gfn_cache (ordered by gfn) and
 * dma_addr_cache (ordered by dma_addr).
 */
struct gvt_dma {
	struct intel_vgpu *vgpu;
	/* link in kvmgt_vdev.gfn_cache */
	struct rb_node gfn_node;
	/* link in kvmgt_vdev.dma_addr_cache */
	struct rb_node dma_addr_node;
	gfn_t gfn;
	dma_addr_t dma_addr;
	/* size given to __gvt_cache_add(); reused when unmapping */
	unsigned long size;
	/* initialised by kref_init() in __gvt_cache_add() */
	struct kref ref;
};
110 
/* KVMGT-private per-vGPU state, obtained through kvmgt_vdev(vgpu). */
struct kvmgt_vdev {
	struct intel_vgpu *vgpu;
	struct mdev_device *mdev;
	/* array of device-specific regions, grown by intel_vgpu_register_reg() */
	struct vfio_region *region;
	/* number of valid entries in region[] */
	int num_regions;
	struct eventfd_ctx *intx_trigger;
	struct eventfd_ctx *msi_trigger;

	/*
	 * Two caches are used to avoid mapping duplicated pages (e.g.
	 * scratch pages). This helps to reduce DMA setup overhead.
	 */
	struct rb_root gfn_cache;
	struct rb_root dma_addr_cache;
	/* number of struct gvt_dma entries currently held in the caches */
	unsigned long nr_cache_entries;
	/* taken around cache lookups/removals in this file */
	struct mutex cache_lock;

	/* registered with VFIO in intel_vgpu_open() */
	struct notifier_block iommu_notifier;
	struct notifier_block group_notifier;
	/* set by the group notifier on VFIO_GROUP_NOTIFY_SET_KVM */
	struct kvm *kvm;
	/* runs intel_vgpu_release_work(); scheduled when kvm is cleared */
	struct work_struct release_work;
	/* 0 after open; flipped to 1 by the first __intel_vgpu_release() */
	atomic_t released;
	struct vfio_device *vfio_device;
	/* external-user group reference taken in intel_vgpu_open() */
	struct vfio_group *vfio_group;
};
136 
/* Return what intel_vgpu_vdev() yields for @vgpu, typed as struct kvmgt_vdev. */
static inline struct kvmgt_vdev *kvmgt_vdev(struct intel_vgpu *vgpu)
{
	struct kvmgt_vdev *vdev = intel_vgpu_vdev(vgpu);

	return vdev;
}
141 
/*
 * A handle counts as valid when any bit above the low eight is set, i.e.
 * values 0..0xff are treated as "no handle".
 */
static inline bool handle_valid(unsigned long handle)
{
	return (handle & ~0xffUL) != 0;
}
146 
147 static ssize_t available_instances_show(struct mdev_type *mtype,
148 					struct mdev_type_attribute *attr,
149 					char *buf)
150 {
151 	struct intel_vgpu_type *type;
152 	unsigned int num = 0;
153 	struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt;
154 
155 	type = &gvt->types[mtype_get_type_group_id(mtype)];
156 	if (!type)
157 		num = 0;
158 	else
159 		num = type->avail_instance;
160 
161 	return sprintf(buf, "%u\n", num);
162 }
163 
164 static ssize_t device_api_show(struct mdev_type *mtype,
165 			       struct mdev_type_attribute *attr, char *buf)
166 {
167 	return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
168 }
169 
170 static ssize_t description_show(struct mdev_type *mtype,
171 				struct mdev_type_attribute *attr, char *buf)
172 {
173 	struct intel_vgpu_type *type;
174 	struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt;
175 
176 	type = &gvt->types[mtype_get_type_group_id(mtype)];
177 	if (!type)
178 		return 0;
179 
180 	return sprintf(buf, "low_gm_size: %dMB\nhigh_gm_size: %dMB\n"
181 		       "fence: %d\nresolution: %s\n"
182 		       "weight: %d\n",
183 		       BYTES_TO_MB(type->low_gm_size),
184 		       BYTES_TO_MB(type->high_gm_size),
185 		       type->fence, vgpu_edid_str(type->resolution),
186 		       type->weight);
187 }
188 
/*
 * Read-only mdev type attributes. Presumably MDEV_TYPE_ATTR_RO(name) binds
 * each attribute to the matching name##_show() handler defined above —
 * confirm against the macro definition.
 */
static MDEV_TYPE_ATTR_RO(available_instances);
static MDEV_TYPE_ATTR_RO(device_api);
static MDEV_TYPE_ATTR_RO(description);

/* NULL-terminated attribute list shared by every vGPU type group. */
static struct attribute *gvt_type_attrs[] = {
	&mdev_type_attr_available_instances.attr,
	&mdev_type_attr_device_api.attr,
	&mdev_type_attr_description.attr,
	NULL,
};

/*
 * One attribute group per vGPU type, all NULL until
 * intel_gvt_init_vgpu_type_groups() allocates them.
 */
static struct attribute_group *gvt_vgpu_type_groups[] = {
	[0 ... NR_MAX_INTEL_VGPU_TYPES - 1] = NULL,
};
203 
204 static int intel_gvt_init_vgpu_type_groups(struct intel_gvt *gvt)
205 {
206 	int i, j;
207 	struct intel_vgpu_type *type;
208 	struct attribute_group *group;
209 
210 	for (i = 0; i < gvt->num_types; i++) {
211 		type = &gvt->types[i];
212 
213 		group = kzalloc(sizeof(struct attribute_group), GFP_KERNEL);
214 		if (!group)
215 			goto unwind;
216 
217 		group->name = type->name;
218 		group->attrs = gvt_type_attrs;
219 		gvt_vgpu_type_groups[i] = group;
220 	}
221 
222 	return 0;
223 
224 unwind:
225 	for (j = 0; j < i; j++) {
226 		group = gvt_vgpu_type_groups[j];
227 		kfree(group);
228 	}
229 
230 	return -ENOMEM;
231 }
232 
233 static void intel_gvt_cleanup_vgpu_type_groups(struct intel_gvt *gvt)
234 {
235 	int i;
236 	struct attribute_group *group;
237 
238 	for (i = 0; i < gvt->num_types; i++) {
239 		group = gvt_vgpu_type_groups[i];
240 		gvt_vgpu_type_groups[i] = NULL;
241 		kfree(group);
242 	}
243 }
244 
/* Forward declarations for helpers defined later in this file. */
static int kvmgt_guest_init(struct mdev_device *mdev);
static void intel_vgpu_release_work(struct work_struct *work);
static bool kvmgt_guest_exit(struct kvmgt_guest_info *info);
248 
249 static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
250 		unsigned long size)
251 {
252 	struct drm_i915_private *i915 = vgpu->gvt->gt->i915;
253 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
254 	int total_pages;
255 	int npage;
256 	int ret;
257 
258 	total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
259 
260 	for (npage = 0; npage < total_pages; npage++) {
261 		unsigned long cur_gfn = gfn + npage;
262 
263 		ret = vfio_group_unpin_pages(vdev->vfio_group, &cur_gfn, 1);
264 		drm_WARN_ON(&i915->drm, ret != 1);
265 	}
266 }
267 
268 /* Pin a normal or compound guest page for dma. */
269 static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
270 		unsigned long size, struct page **page)
271 {
272 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
273 	unsigned long base_pfn = 0;
274 	int total_pages;
275 	int npage;
276 	int ret;
277 
278 	total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
279 	/*
280 	 * We pin the pages one-by-one to avoid allocating a big arrary
281 	 * on stack to hold pfns.
282 	 */
283 	for (npage = 0; npage < total_pages; npage++) {
284 		unsigned long cur_gfn = gfn + npage;
285 		unsigned long pfn;
286 
287 		ret = vfio_group_pin_pages(vdev->vfio_group, &cur_gfn, 1,
288 					   IOMMU_READ | IOMMU_WRITE, &pfn);
289 		if (ret != 1) {
290 			gvt_vgpu_err("vfio_pin_pages failed for gfn 0x%lx, ret %d\n",
291 				     cur_gfn, ret);
292 			goto err;
293 		}
294 
295 		if (!pfn_valid(pfn)) {
296 			gvt_vgpu_err("pfn 0x%lx is not mem backed\n", pfn);
297 			npage++;
298 			ret = -EFAULT;
299 			goto err;
300 		}
301 
302 		if (npage == 0)
303 			base_pfn = pfn;
304 		else if (base_pfn + npage != pfn) {
305 			gvt_vgpu_err("The pages are not continuous\n");
306 			ret = -EINVAL;
307 			npage++;
308 			goto err;
309 		}
310 	}
311 
312 	*page = pfn_to_page(base_pfn);
313 	return 0;
314 err:
315 	gvt_unpin_guest_page(vgpu, gfn, npage * PAGE_SIZE);
316 	return ret;
317 }
318 
319 static int gvt_dma_map_page(struct intel_vgpu *vgpu, unsigned long gfn,
320 		dma_addr_t *dma_addr, unsigned long size)
321 {
322 	struct device *dev = vgpu->gvt->gt->i915->drm.dev;
323 	struct page *page = NULL;
324 	int ret;
325 
326 	ret = gvt_pin_guest_page(vgpu, gfn, size, &page);
327 	if (ret)
328 		return ret;
329 
330 	/* Setup DMA mapping. */
331 	*dma_addr = dma_map_page(dev, page, 0, size, PCI_DMA_BIDIRECTIONAL);
332 	if (dma_mapping_error(dev, *dma_addr)) {
333 		gvt_vgpu_err("DMA mapping failed for pfn 0x%lx, ret %d\n",
334 			     page_to_pfn(page), ret);
335 		gvt_unpin_guest_page(vgpu, gfn, size);
336 		return -ENOMEM;
337 	}
338 
339 	return 0;
340 }
341 
342 static void gvt_dma_unmap_page(struct intel_vgpu *vgpu, unsigned long gfn,
343 		dma_addr_t dma_addr, unsigned long size)
344 {
345 	struct device *dev = vgpu->gvt->gt->i915->drm.dev;
346 
347 	dma_unmap_page(dev, dma_addr, size, PCI_DMA_BIDIRECTIONAL);
348 	gvt_unpin_guest_page(vgpu, gfn, size);
349 }
350 
351 static struct gvt_dma *__gvt_cache_find_dma_addr(struct intel_vgpu *vgpu,
352 		dma_addr_t dma_addr)
353 {
354 	struct rb_node *node = kvmgt_vdev(vgpu)->dma_addr_cache.rb_node;
355 	struct gvt_dma *itr;
356 
357 	while (node) {
358 		itr = rb_entry(node, struct gvt_dma, dma_addr_node);
359 
360 		if (dma_addr < itr->dma_addr)
361 			node = node->rb_left;
362 		else if (dma_addr > itr->dma_addr)
363 			node = node->rb_right;
364 		else
365 			return itr;
366 	}
367 	return NULL;
368 }
369 
370 static struct gvt_dma *__gvt_cache_find_gfn(struct intel_vgpu *vgpu, gfn_t gfn)
371 {
372 	struct rb_node *node = kvmgt_vdev(vgpu)->gfn_cache.rb_node;
373 	struct gvt_dma *itr;
374 
375 	while (node) {
376 		itr = rb_entry(node, struct gvt_dma, gfn_node);
377 
378 		if (gfn < itr->gfn)
379 			node = node->rb_left;
380 		else if (gfn > itr->gfn)
381 			node = node->rb_right;
382 		else
383 			return itr;
384 	}
385 	return NULL;
386 }
387 
388 static int __gvt_cache_add(struct intel_vgpu *vgpu, gfn_t gfn,
389 		dma_addr_t dma_addr, unsigned long size)
390 {
391 	struct gvt_dma *new, *itr;
392 	struct rb_node **link, *parent = NULL;
393 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
394 
395 	new = kzalloc(sizeof(struct gvt_dma), GFP_KERNEL);
396 	if (!new)
397 		return -ENOMEM;
398 
399 	new->vgpu = vgpu;
400 	new->gfn = gfn;
401 	new->dma_addr = dma_addr;
402 	new->size = size;
403 	kref_init(&new->ref);
404 
405 	/* gfn_cache maps gfn to struct gvt_dma. */
406 	link = &vdev->gfn_cache.rb_node;
407 	while (*link) {
408 		parent = *link;
409 		itr = rb_entry(parent, struct gvt_dma, gfn_node);
410 
411 		if (gfn < itr->gfn)
412 			link = &parent->rb_left;
413 		else
414 			link = &parent->rb_right;
415 	}
416 	rb_link_node(&new->gfn_node, parent, link);
417 	rb_insert_color(&new->gfn_node, &vdev->gfn_cache);
418 
419 	/* dma_addr_cache maps dma addr to struct gvt_dma. */
420 	parent = NULL;
421 	link = &vdev->dma_addr_cache.rb_node;
422 	while (*link) {
423 		parent = *link;
424 		itr = rb_entry(parent, struct gvt_dma, dma_addr_node);
425 
426 		if (dma_addr < itr->dma_addr)
427 			link = &parent->rb_left;
428 		else
429 			link = &parent->rb_right;
430 	}
431 	rb_link_node(&new->dma_addr_node, parent, link);
432 	rb_insert_color(&new->dma_addr_node, &vdev->dma_addr_cache);
433 
434 	vdev->nr_cache_entries++;
435 	return 0;
436 }
437 
438 static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu,
439 				struct gvt_dma *entry)
440 {
441 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
442 
443 	rb_erase(&entry->gfn_node, &vdev->gfn_cache);
444 	rb_erase(&entry->dma_addr_node, &vdev->dma_addr_cache);
445 	kfree(entry);
446 	vdev->nr_cache_entries--;
447 }
448 
449 static void gvt_cache_destroy(struct intel_vgpu *vgpu)
450 {
451 	struct gvt_dma *dma;
452 	struct rb_node *node = NULL;
453 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
454 
455 	for (;;) {
456 		mutex_lock(&vdev->cache_lock);
457 		node = rb_first(&vdev->gfn_cache);
458 		if (!node) {
459 			mutex_unlock(&vdev->cache_lock);
460 			break;
461 		}
462 		dma = rb_entry(node, struct gvt_dma, gfn_node);
463 		gvt_dma_unmap_page(vgpu, dma->gfn, dma->dma_addr, dma->size);
464 		__gvt_cache_remove_entry(vgpu, dma);
465 		mutex_unlock(&vdev->cache_lock);
466 	}
467 }
468 
469 static void gvt_cache_init(struct intel_vgpu *vgpu)
470 {
471 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
472 
473 	vdev->gfn_cache = RB_ROOT;
474 	vdev->dma_addr_cache = RB_ROOT;
475 	vdev->nr_cache_entries = 0;
476 	mutex_init(&vdev->cache_lock);
477 }
478 
/* Initialise the gfn hash table (info->ptable) of @info. */
static void kvmgt_protect_table_init(struct kvmgt_guest_info *info)
{
	hash_init(info->ptable);
}
483 
484 static void kvmgt_protect_table_destroy(struct kvmgt_guest_info *info)
485 {
486 	struct kvmgt_pgfn *p;
487 	struct hlist_node *tmp;
488 	int i;
489 
490 	hash_for_each_safe(info->ptable, i, tmp, p, hnode) {
491 		hash_del(&p->hnode);
492 		kfree(p);
493 	}
494 }
495 
496 static struct kvmgt_pgfn *
497 __kvmgt_protect_table_find(struct kvmgt_guest_info *info, gfn_t gfn)
498 {
499 	struct kvmgt_pgfn *p, *res = NULL;
500 
501 	hash_for_each_possible(info->ptable, p, hnode, gfn) {
502 		if (gfn == p->gfn) {
503 			res = p;
504 			break;
505 		}
506 	}
507 
508 	return res;
509 }
510 
511 static bool kvmgt_gfn_is_write_protected(struct kvmgt_guest_info *info,
512 				gfn_t gfn)
513 {
514 	struct kvmgt_pgfn *p;
515 
516 	p = __kvmgt_protect_table_find(info, gfn);
517 	return !!p;
518 }
519 
520 static void kvmgt_protect_table_add(struct kvmgt_guest_info *info, gfn_t gfn)
521 {
522 	struct kvmgt_pgfn *p;
523 
524 	if (kvmgt_gfn_is_write_protected(info, gfn))
525 		return;
526 
527 	p = kzalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC);
528 	if (WARN(!p, "gfn: 0x%llx\n", gfn))
529 		return;
530 
531 	p->gfn = gfn;
532 	hash_add(info->ptable, &p->hnode, gfn);
533 }
534 
535 static void kvmgt_protect_table_del(struct kvmgt_guest_info *info,
536 				gfn_t gfn)
537 {
538 	struct kvmgt_pgfn *p;
539 
540 	p = __kvmgt_protect_table_find(info, gfn);
541 	if (p) {
542 		hash_del(&p->hnode);
543 		kfree(p);
544 	}
545 }
546 
547 static size_t intel_vgpu_reg_rw_opregion(struct intel_vgpu *vgpu, char *buf,
548 		size_t count, loff_t *ppos, bool iswrite)
549 {
550 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
551 	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
552 			VFIO_PCI_NUM_REGIONS;
553 	void *base = vdev->region[i].data;
554 	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
555 
556 
557 	if (pos >= vdev->region[i].size || iswrite) {
558 		gvt_vgpu_err("invalid op or offset for Intel vgpu OpRegion\n");
559 		return -EINVAL;
560 	}
561 	count = min(count, (size_t)(vdev->region[i].size - pos));
562 	memcpy(buf, base + pos, count);
563 
564 	return count;
565 }
566 
/* release callback of the OpRegion region: intentionally does nothing. */
static void intel_vgpu_reg_release_opregion(struct intel_vgpu *vgpu,
		struct vfio_region *region)
{
}
571 
/* Callbacks for the OpRegion region registered by kvmgt_set_opregion(). */
static const struct intel_vgpu_regops intel_vgpu_regops_opregion = {
	.rw = intel_vgpu_reg_rw_opregion,
	.release = intel_vgpu_reg_release_opregion,
};
576 
577 static int handle_edid_regs(struct intel_vgpu *vgpu,
578 			struct vfio_edid_region *region, char *buf,
579 			size_t count, u16 offset, bool is_write)
580 {
581 	struct vfio_region_gfx_edid *regs = &region->vfio_edid_regs;
582 	unsigned int data;
583 
584 	if (offset + count > sizeof(*regs))
585 		return -EINVAL;
586 
587 	if (count != 4)
588 		return -EINVAL;
589 
590 	if (is_write) {
591 		data = *((unsigned int *)buf);
592 		switch (offset) {
593 		case offsetof(struct vfio_region_gfx_edid, link_state):
594 			if (data == VFIO_DEVICE_GFX_LINK_STATE_UP) {
595 				if (!drm_edid_block_valid(
596 					(u8 *)region->edid_blob,
597 					0,
598 					true,
599 					NULL)) {
600 					gvt_vgpu_err("invalid EDID blob\n");
601 					return -EINVAL;
602 				}
603 				intel_gvt_ops->emulate_hotplug(vgpu, true);
604 			} else if (data == VFIO_DEVICE_GFX_LINK_STATE_DOWN)
605 				intel_gvt_ops->emulate_hotplug(vgpu, false);
606 			else {
607 				gvt_vgpu_err("invalid EDID link state %d\n",
608 					regs->link_state);
609 				return -EINVAL;
610 			}
611 			regs->link_state = data;
612 			break;
613 		case offsetof(struct vfio_region_gfx_edid, edid_size):
614 			if (data > regs->edid_max_size) {
615 				gvt_vgpu_err("EDID size is bigger than %d!\n",
616 					regs->edid_max_size);
617 				return -EINVAL;
618 			}
619 			regs->edid_size = data;
620 			break;
621 		default:
622 			/* read-only regs */
623 			gvt_vgpu_err("write read-only EDID region at offset %d\n",
624 				offset);
625 			return -EPERM;
626 		}
627 	} else {
628 		memcpy(buf, (char *)regs + offset, count);
629 	}
630 
631 	return count;
632 }
633 
634 static int handle_edid_blob(struct vfio_edid_region *region, char *buf,
635 			size_t count, u16 offset, bool is_write)
636 {
637 	if (offset + count > region->vfio_edid_regs.edid_size)
638 		return -EINVAL;
639 
640 	if (is_write)
641 		memcpy(region->edid_blob + offset, buf, count);
642 	else
643 		memcpy(buf, region->edid_blob + offset, count);
644 
645 	return count;
646 }
647 
648 static size_t intel_vgpu_reg_rw_edid(struct intel_vgpu *vgpu, char *buf,
649 		size_t count, loff_t *ppos, bool iswrite)
650 {
651 	int ret;
652 	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
653 			VFIO_PCI_NUM_REGIONS;
654 	struct vfio_edid_region *region =
655 		(struct vfio_edid_region *)kvmgt_vdev(vgpu)->region[i].data;
656 	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
657 
658 	if (pos < region->vfio_edid_regs.edid_offset) {
659 		ret = handle_edid_regs(vgpu, region, buf, count, pos, iswrite);
660 	} else {
661 		pos -= EDID_BLOB_OFFSET;
662 		ret = handle_edid_blob(region, buf, count, pos, iswrite);
663 	}
664 
665 	if (ret < 0)
666 		gvt_vgpu_err("failed to access EDID region\n");
667 
668 	return ret;
669 }
670 
/*
 * release callback of the EDID region: frees region->data, which
 * kvmgt_set_edid() allocated as a struct vfio_edid_region.
 */
static void intel_vgpu_reg_release_edid(struct intel_vgpu *vgpu,
					struct vfio_region *region)
{
	kfree(region->data);
}
676 
/* Callbacks for the EDID region registered by kvmgt_set_edid(). */
static const struct intel_vgpu_regops intel_vgpu_regops_edid = {
	.rw = intel_vgpu_reg_rw_edid,
	.release = intel_vgpu_reg_release_edid,
};
681 
682 static int intel_vgpu_register_reg(struct intel_vgpu *vgpu,
683 		unsigned int type, unsigned int subtype,
684 		const struct intel_vgpu_regops *ops,
685 		size_t size, u32 flags, void *data)
686 {
687 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
688 	struct vfio_region *region;
689 
690 	region = krealloc(vdev->region,
691 			(vdev->num_regions + 1) * sizeof(*region),
692 			GFP_KERNEL);
693 	if (!region)
694 		return -ENOMEM;
695 
696 	vdev->region = region;
697 	vdev->region[vdev->num_regions].type = type;
698 	vdev->region[vdev->num_regions].subtype = subtype;
699 	vdev->region[vdev->num_regions].ops = ops;
700 	vdev->region[vdev->num_regions].size = size;
701 	vdev->region[vdev->num_regions].flags = flags;
702 	vdev->region[vdev->num_regions].data = data;
703 	vdev->num_regions++;
704 	return 0;
705 }
706 
707 static int kvmgt_get_vfio_device(void *p_vgpu)
708 {
709 	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
710 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
711 
712 	vdev->vfio_device = vfio_device_get_from_dev(
713 		mdev_dev(vdev->mdev));
714 	if (!vdev->vfio_device) {
715 		gvt_vgpu_err("failed to get vfio device\n");
716 		return -ENODEV;
717 	}
718 	return 0;
719 }
720 
721 
722 static int kvmgt_set_opregion(void *p_vgpu)
723 {
724 	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
725 	void *base;
726 	int ret;
727 
728 	/* Each vgpu has its own opregion, although VFIO would create another
729 	 * one later. This one is used to expose opregion to VFIO. And the
730 	 * other one created by VFIO later, is used by guest actually.
731 	 */
732 	base = vgpu_opregion(vgpu)->va;
733 	if (!base)
734 		return -ENOMEM;
735 
736 	if (memcmp(base, OPREGION_SIGNATURE, 16)) {
737 		memunmap(base);
738 		return -EINVAL;
739 	}
740 
741 	ret = intel_vgpu_register_reg(vgpu,
742 			PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
743 			VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
744 			&intel_vgpu_regops_opregion, OPREGION_SIZE,
745 			VFIO_REGION_INFO_FLAG_READ, base);
746 
747 	return ret;
748 }
749 
750 static int kvmgt_set_edid(void *p_vgpu, int port_num)
751 {
752 	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
753 	struct intel_vgpu_port *port = intel_vgpu_port(vgpu, port_num);
754 	struct vfio_edid_region *base;
755 	int ret;
756 
757 	base = kzalloc(sizeof(*base), GFP_KERNEL);
758 	if (!base)
759 		return -ENOMEM;
760 
761 	/* TODO: Add multi-port and EDID extension block support */
762 	base->vfio_edid_regs.edid_offset = EDID_BLOB_OFFSET;
763 	base->vfio_edid_regs.edid_max_size = EDID_SIZE;
764 	base->vfio_edid_regs.edid_size = EDID_SIZE;
765 	base->vfio_edid_regs.max_xres = vgpu_edid_xres(port->id);
766 	base->vfio_edid_regs.max_yres = vgpu_edid_yres(port->id);
767 	base->edid_blob = port->edid->edid_block;
768 
769 	ret = intel_vgpu_register_reg(vgpu,
770 			VFIO_REGION_TYPE_GFX,
771 			VFIO_REGION_SUBTYPE_GFX_EDID,
772 			&intel_vgpu_regops_edid, EDID_SIZE,
773 			VFIO_REGION_INFO_FLAG_READ |
774 			VFIO_REGION_INFO_FLAG_WRITE |
775 			VFIO_REGION_INFO_FLAG_CAPS, base);
776 
777 	return ret;
778 }
779 
780 static void kvmgt_put_vfio_device(void *vgpu)
781 {
782 	struct kvmgt_vdev *vdev = kvmgt_vdev((struct intel_vgpu *)vgpu);
783 
784 	if (WARN_ON(!vdev->vfio_device))
785 		return;
786 
787 	vfio_device_put(vdev->vfio_device);
788 }
789 
790 static int intel_vgpu_create(struct mdev_device *mdev)
791 {
792 	struct intel_vgpu *vgpu = NULL;
793 	struct intel_vgpu_type *type;
794 	struct device *pdev;
795 	struct intel_gvt *gvt;
796 	int ret;
797 
798 	pdev = mdev_parent_dev(mdev);
799 	gvt = kdev_to_i915(pdev)->gvt;
800 
801 	type = &gvt->types[mdev_get_type_group_id(mdev)];
802 	if (!type) {
803 		ret = -EINVAL;
804 		goto out;
805 	}
806 
807 	vgpu = intel_gvt_ops->vgpu_create(gvt, type);
808 	if (IS_ERR_OR_NULL(vgpu)) {
809 		ret = vgpu == NULL ? -EFAULT : PTR_ERR(vgpu);
810 		gvt_err("failed to create intel vgpu: %d\n", ret);
811 		goto out;
812 	}
813 
814 	INIT_WORK(&kvmgt_vdev(vgpu)->release_work, intel_vgpu_release_work);
815 
816 	kvmgt_vdev(vgpu)->mdev = mdev;
817 	mdev_set_drvdata(mdev, vgpu);
818 
819 	gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n",
820 		     dev_name(mdev_dev(mdev)));
821 	ret = 0;
822 
823 out:
824 	return ret;
825 }
826 
827 static int intel_vgpu_remove(struct mdev_device *mdev)
828 {
829 	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
830 
831 	if (handle_valid(vgpu->handle))
832 		return -EBUSY;
833 
834 	intel_gvt_ops->vgpu_destroy(vgpu);
835 	return 0;
836 }
837 
838 static int intel_vgpu_iommu_notifier(struct notifier_block *nb,
839 				     unsigned long action, void *data)
840 {
841 	struct kvmgt_vdev *vdev = container_of(nb,
842 					       struct kvmgt_vdev,
843 					       iommu_notifier);
844 	struct intel_vgpu *vgpu = vdev->vgpu;
845 
846 	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
847 		struct vfio_iommu_type1_dma_unmap *unmap = data;
848 		struct gvt_dma *entry;
849 		unsigned long iov_pfn, end_iov_pfn;
850 
851 		iov_pfn = unmap->iova >> PAGE_SHIFT;
852 		end_iov_pfn = iov_pfn + unmap->size / PAGE_SIZE;
853 
854 		mutex_lock(&vdev->cache_lock);
855 		for (; iov_pfn < end_iov_pfn; iov_pfn++) {
856 			entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
857 			if (!entry)
858 				continue;
859 
860 			gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr,
861 					   entry->size);
862 			__gvt_cache_remove_entry(vgpu, entry);
863 		}
864 		mutex_unlock(&vdev->cache_lock);
865 	}
866 
867 	return NOTIFY_OK;
868 }
869 
870 static int intel_vgpu_group_notifier(struct notifier_block *nb,
871 				     unsigned long action, void *data)
872 {
873 	struct kvmgt_vdev *vdev = container_of(nb,
874 					       struct kvmgt_vdev,
875 					       group_notifier);
876 
877 	/* the only action we care about */
878 	if (action == VFIO_GROUP_NOTIFY_SET_KVM) {
879 		vdev->kvm = data;
880 
881 		if (!data)
882 			schedule_work(&vdev->release_work);
883 	}
884 
885 	return NOTIFY_OK;
886 }
887 
888 static int intel_vgpu_open(struct mdev_device *mdev)
889 {
890 	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
891 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
892 	unsigned long events;
893 	int ret;
894 	struct vfio_group *vfio_group;
895 
896 	vdev->iommu_notifier.notifier_call = intel_vgpu_iommu_notifier;
897 	vdev->group_notifier.notifier_call = intel_vgpu_group_notifier;
898 
899 	events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
900 	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, &events,
901 				&vdev->iommu_notifier);
902 	if (ret != 0) {
903 		gvt_vgpu_err("vfio_register_notifier for iommu failed: %d\n",
904 			ret);
905 		goto out;
906 	}
907 
908 	events = VFIO_GROUP_NOTIFY_SET_KVM;
909 	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, &events,
910 				&vdev->group_notifier);
911 	if (ret != 0) {
912 		gvt_vgpu_err("vfio_register_notifier for group failed: %d\n",
913 			ret);
914 		goto undo_iommu;
915 	}
916 
917 	vfio_group = vfio_group_get_external_user_from_dev(mdev_dev(mdev));
918 	if (IS_ERR_OR_NULL(vfio_group)) {
919 		ret = !vfio_group ? -EFAULT : PTR_ERR(vfio_group);
920 		gvt_vgpu_err("vfio_group_get_external_user_from_dev failed\n");
921 		goto undo_register;
922 	}
923 	vdev->vfio_group = vfio_group;
924 
925 	/* Take a module reference as mdev core doesn't take
926 	 * a reference for vendor driver.
927 	 */
928 	if (!try_module_get(THIS_MODULE)) {
929 		ret = -ENODEV;
930 		goto undo_group;
931 	}
932 
933 	ret = kvmgt_guest_init(mdev);
934 	if (ret)
935 		goto undo_group;
936 
937 	intel_gvt_ops->vgpu_activate(vgpu);
938 
939 	atomic_set(&vdev->released, 0);
940 	return ret;
941 
942 undo_group:
943 	vfio_group_put_external_user(vdev->vfio_group);
944 	vdev->vfio_group = NULL;
945 
946 undo_register:
947 	vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
948 					&vdev->group_notifier);
949 
950 undo_iommu:
951 	vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
952 					&vdev->iommu_notifier);
953 out:
954 	return ret;
955 }
956 
957 static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu)
958 {
959 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
960 	struct eventfd_ctx *trigger;
961 
962 	trigger = vdev->msi_trigger;
963 	if (trigger) {
964 		eventfd_ctx_put(trigger);
965 		vdev->msi_trigger = NULL;
966 	}
967 }
968 
969 static void __intel_vgpu_release(struct intel_vgpu *vgpu)
970 {
971 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
972 	struct drm_i915_private *i915 = vgpu->gvt->gt->i915;
973 	struct kvmgt_guest_info *info;
974 	int ret;
975 
976 	if (!handle_valid(vgpu->handle))
977 		return;
978 
979 	if (atomic_cmpxchg(&vdev->released, 0, 1))
980 		return;
981 
982 	intel_gvt_ops->vgpu_release(vgpu);
983 
984 	ret = vfio_unregister_notifier(mdev_dev(vdev->mdev), VFIO_IOMMU_NOTIFY,
985 					&vdev->iommu_notifier);
986 	drm_WARN(&i915->drm, ret,
987 		 "vfio_unregister_notifier for iommu failed: %d\n", ret);
988 
989 	ret = vfio_unregister_notifier(mdev_dev(vdev->mdev), VFIO_GROUP_NOTIFY,
990 					&vdev->group_notifier);
991 	drm_WARN(&i915->drm, ret,
992 		 "vfio_unregister_notifier for group failed: %d\n", ret);
993 
994 	/* dereference module reference taken at open */
995 	module_put(THIS_MODULE);
996 
997 	info = (struct kvmgt_guest_info *)vgpu->handle;
998 	kvmgt_guest_exit(info);
999 
1000 	intel_vgpu_release_msi_eventfd_ctx(vgpu);
1001 	vfio_group_put_external_user(vdev->vfio_group);
1002 
1003 	vdev->kvm = NULL;
1004 	vgpu->handle = 0;
1005 }
1006 
/* mdev release callback: run the common release path for @mdev's vGPU. */
static void intel_vgpu_release(struct mdev_device *mdev)
{
	__intel_vgpu_release(mdev_get_drvdata(mdev));
}
1013 
1014 static void intel_vgpu_release_work(struct work_struct *work)
1015 {
1016 	struct kvmgt_vdev *vdev = container_of(work, struct kvmgt_vdev,
1017 					       release_work);
1018 
1019 	__intel_vgpu_release(vdev->vgpu);
1020 }
1021 
1022 static u64 intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar)
1023 {
1024 	u32 start_lo, start_hi;
1025 	u32 mem_type;
1026 
1027 	start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
1028 			PCI_BASE_ADDRESS_MEM_MASK;
1029 	mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
1030 			PCI_BASE_ADDRESS_MEM_TYPE_MASK;
1031 
1032 	switch (mem_type) {
1033 	case PCI_BASE_ADDRESS_MEM_TYPE_64:
1034 		start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space
1035 						+ bar + 4));
1036 		break;
1037 	case PCI_BASE_ADDRESS_MEM_TYPE_32:
1038 	case PCI_BASE_ADDRESS_MEM_TYPE_1M:
1039 		/* 1M mem BAR treated as 32-bit BAR */
1040 	default:
1041 		/* mem unknown type treated as 32-bit BAR */
1042 		start_hi = 0;
1043 		break;
1044 	}
1045 
1046 	return ((u64)start_hi << 32) | start_lo;
1047 }
1048 
1049 static int intel_vgpu_bar_rw(struct intel_vgpu *vgpu, int bar, u64 off,
1050 			     void *buf, unsigned int count, bool is_write)
1051 {
1052 	u64 bar_start = intel_vgpu_get_bar_addr(vgpu, bar);
1053 	int ret;
1054 
1055 	if (is_write)
1056 		ret = intel_gvt_ops->emulate_mmio_write(vgpu,
1057 					bar_start + off, buf, count);
1058 	else
1059 		ret = intel_gvt_ops->emulate_mmio_read(vgpu,
1060 					bar_start + off, buf, count);
1061 	return ret;
1062 }
1063 
1064 static inline bool intel_vgpu_in_aperture(struct intel_vgpu *vgpu, u64 off)
1065 {
1066 	return off >= vgpu_aperture_offset(vgpu) &&
1067 	       off < vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu);
1068 }
1069 
1070 static int intel_vgpu_aperture_rw(struct intel_vgpu *vgpu, u64 off,
1071 		void *buf, unsigned long count, bool is_write)
1072 {
1073 	void __iomem *aperture_va;
1074 
1075 	if (!intel_vgpu_in_aperture(vgpu, off) ||
1076 	    !intel_vgpu_in_aperture(vgpu, off + count)) {
1077 		gvt_vgpu_err("Invalid aperture offset %llu\n", off);
1078 		return -EINVAL;
1079 	}
1080 
1081 	aperture_va = io_mapping_map_wc(&vgpu->gvt->gt->ggtt->iomap,
1082 					ALIGN_DOWN(off, PAGE_SIZE),
1083 					count + offset_in_page(off));
1084 	if (!aperture_va)
1085 		return -EIO;
1086 
1087 	if (is_write)
1088 		memcpy_toio(aperture_va + offset_in_page(off), buf, count);
1089 	else
1090 		memcpy_fromio(buf, aperture_va + offset_in_page(off), count);
1091 
1092 	io_mapping_unmap(aperture_va);
1093 
1094 	return 0;
1095 }
1096 
1097 static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf,
1098 			size_t count, loff_t *ppos, bool is_write)
1099 {
1100 	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
1101 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
1102 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
1103 	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
1104 	int ret = -EINVAL;
1105 
1106 
1107 	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) {
1108 		gvt_vgpu_err("invalid index: %u\n", index);
1109 		return -EINVAL;
1110 	}
1111 
1112 	switch (index) {
1113 	case VFIO_PCI_CONFIG_REGION_INDEX:
1114 		if (is_write)
1115 			ret = intel_gvt_ops->emulate_cfg_write(vgpu, pos,
1116 						buf, count);
1117 		else
1118 			ret = intel_gvt_ops->emulate_cfg_read(vgpu, pos,
1119 						buf, count);
1120 		break;
1121 	case VFIO_PCI_BAR0_REGION_INDEX:
1122 		ret = intel_vgpu_bar_rw(vgpu, PCI_BASE_ADDRESS_0, pos,
1123 					buf, count, is_write);
1124 		break;
1125 	case VFIO_PCI_BAR2_REGION_INDEX:
1126 		ret = intel_vgpu_aperture_rw(vgpu, pos, buf, count, is_write);
1127 		break;
1128 	case VFIO_PCI_BAR1_REGION_INDEX:
1129 	case VFIO_PCI_BAR3_REGION_INDEX:
1130 	case VFIO_PCI_BAR4_REGION_INDEX:
1131 	case VFIO_PCI_BAR5_REGION_INDEX:
1132 	case VFIO_PCI_VGA_REGION_INDEX:
1133 	case VFIO_PCI_ROM_REGION_INDEX:
1134 		break;
1135 	default:
1136 		if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
1137 			return -EINVAL;
1138 
1139 		index -= VFIO_PCI_NUM_REGIONS;
1140 		return vdev->region[index].ops->rw(vgpu, buf, count,
1141 				ppos, is_write);
1142 	}
1143 
1144 	return ret == 0 ? count : ret;
1145 }
1146 
1147 static bool gtt_entry(struct mdev_device *mdev, loff_t *ppos)
1148 {
1149 	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
1150 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
1151 	struct intel_gvt *gvt = vgpu->gvt;
1152 	int offset;
1153 
1154 	/* Only allow MMIO GGTT entry access */
1155 	if (index != PCI_BASE_ADDRESS_0)
1156 		return false;
1157 
1158 	offset = (u64)(*ppos & VFIO_PCI_OFFSET_MASK) -
1159 		intel_vgpu_get_bar_gpa(vgpu, PCI_BASE_ADDRESS_0);
1160 
1161 	return (offset >= gvt->device_info.gtt_start_offset &&
1162 		offset < gvt->device_info.gtt_start_offset + gvt_ggtt_sz(gvt)) ?
1163 			true : false;
1164 }
1165 
1166 static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf,
1167 			size_t count, loff_t *ppos)
1168 {
1169 	unsigned int done = 0;
1170 	int ret;
1171 
1172 	while (count) {
1173 		size_t filled;
1174 
1175 		/* Only support GGTT entry 8 bytes read */
1176 		if (count >= 8 && !(*ppos % 8) &&
1177 			gtt_entry(mdev, ppos)) {
1178 			u64 val;
1179 
1180 			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1181 					ppos, false);
1182 			if (ret <= 0)
1183 				goto read_err;
1184 
1185 			if (copy_to_user(buf, &val, sizeof(val)))
1186 				goto read_err;
1187 
1188 			filled = 8;
1189 		} else if (count >= 4 && !(*ppos % 4)) {
1190 			u32 val;
1191 
1192 			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1193 					ppos, false);
1194 			if (ret <= 0)
1195 				goto read_err;
1196 
1197 			if (copy_to_user(buf, &val, sizeof(val)))
1198 				goto read_err;
1199 
1200 			filled = 4;
1201 		} else if (count >= 2 && !(*ppos % 2)) {
1202 			u16 val;
1203 
1204 			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1205 					ppos, false);
1206 			if (ret <= 0)
1207 				goto read_err;
1208 
1209 			if (copy_to_user(buf, &val, sizeof(val)))
1210 				goto read_err;
1211 
1212 			filled = 2;
1213 		} else {
1214 			u8 val;
1215 
1216 			ret = intel_vgpu_rw(mdev, &val, sizeof(val), ppos,
1217 					false);
1218 			if (ret <= 0)
1219 				goto read_err;
1220 
1221 			if (copy_to_user(buf, &val, sizeof(val)))
1222 				goto read_err;
1223 
1224 			filled = 1;
1225 		}
1226 
1227 		count -= filled;
1228 		done += filled;
1229 		*ppos += filled;
1230 		buf += filled;
1231 	}
1232 
1233 	return done;
1234 
1235 read_err:
1236 	return -EFAULT;
1237 }
1238 
1239 static ssize_t intel_vgpu_write(struct mdev_device *mdev,
1240 				const char __user *buf,
1241 				size_t count, loff_t *ppos)
1242 {
1243 	unsigned int done = 0;
1244 	int ret;
1245 
1246 	while (count) {
1247 		size_t filled;
1248 
1249 		/* Only support GGTT entry 8 bytes write */
1250 		if (count >= 8 && !(*ppos % 8) &&
1251 			gtt_entry(mdev, ppos)) {
1252 			u64 val;
1253 
1254 			if (copy_from_user(&val, buf, sizeof(val)))
1255 				goto write_err;
1256 
1257 			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1258 					ppos, true);
1259 			if (ret <= 0)
1260 				goto write_err;
1261 
1262 			filled = 8;
1263 		} else if (count >= 4 && !(*ppos % 4)) {
1264 			u32 val;
1265 
1266 			if (copy_from_user(&val, buf, sizeof(val)))
1267 				goto write_err;
1268 
1269 			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1270 					ppos, true);
1271 			if (ret <= 0)
1272 				goto write_err;
1273 
1274 			filled = 4;
1275 		} else if (count >= 2 && !(*ppos % 2)) {
1276 			u16 val;
1277 
1278 			if (copy_from_user(&val, buf, sizeof(val)))
1279 				goto write_err;
1280 
1281 			ret = intel_vgpu_rw(mdev, (char *)&val,
1282 					sizeof(val), ppos, true);
1283 			if (ret <= 0)
1284 				goto write_err;
1285 
1286 			filled = 2;
1287 		} else {
1288 			u8 val;
1289 
1290 			if (copy_from_user(&val, buf, sizeof(val)))
1291 				goto write_err;
1292 
1293 			ret = intel_vgpu_rw(mdev, &val, sizeof(val),
1294 					ppos, true);
1295 			if (ret <= 0)
1296 				goto write_err;
1297 
1298 			filled = 1;
1299 		}
1300 
1301 		count -= filled;
1302 		done += filled;
1303 		*ppos += filled;
1304 		buf += filled;
1305 	}
1306 
1307 	return done;
1308 write_err:
1309 	return -EFAULT;
1310 }
1311 
1312 static int intel_vgpu_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
1313 {
1314 	unsigned int index;
1315 	u64 virtaddr;
1316 	unsigned long req_size, pgoff, req_start;
1317 	pgprot_t pg_prot;
1318 	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
1319 
1320 	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
1321 	if (index >= VFIO_PCI_ROM_REGION_INDEX)
1322 		return -EINVAL;
1323 
1324 	if (vma->vm_end < vma->vm_start)
1325 		return -EINVAL;
1326 	if ((vma->vm_flags & VM_SHARED) == 0)
1327 		return -EINVAL;
1328 	if (index != VFIO_PCI_BAR2_REGION_INDEX)
1329 		return -EINVAL;
1330 
1331 	pg_prot = vma->vm_page_prot;
1332 	virtaddr = vma->vm_start;
1333 	req_size = vma->vm_end - vma->vm_start;
1334 	pgoff = vma->vm_pgoff &
1335 		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
1336 	req_start = pgoff << PAGE_SHIFT;
1337 
1338 	if (!intel_vgpu_in_aperture(vgpu, req_start))
1339 		return -EINVAL;
1340 	if (req_start + req_size >
1341 	    vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu))
1342 		return -EINVAL;
1343 
1344 	pgoff = (gvt_aperture_pa_base(vgpu->gvt) >> PAGE_SHIFT) + pgoff;
1345 
1346 	return remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
1347 }
1348 
1349 static int intel_vgpu_get_irq_count(struct intel_vgpu *vgpu, int type)
1350 {
1351 	if (type == VFIO_PCI_INTX_IRQ_INDEX || type == VFIO_PCI_MSI_IRQ_INDEX)
1352 		return 1;
1353 
1354 	return 0;
1355 }
1356 
1357 static int intel_vgpu_set_intx_mask(struct intel_vgpu *vgpu,
1358 			unsigned int index, unsigned int start,
1359 			unsigned int count, u32 flags,
1360 			void *data)
1361 {
1362 	return 0;
1363 }
1364 
1365 static int intel_vgpu_set_intx_unmask(struct intel_vgpu *vgpu,
1366 			unsigned int index, unsigned int start,
1367 			unsigned int count, u32 flags, void *data)
1368 {
1369 	return 0;
1370 }
1371 
1372 static int intel_vgpu_set_intx_trigger(struct intel_vgpu *vgpu,
1373 		unsigned int index, unsigned int start, unsigned int count,
1374 		u32 flags, void *data)
1375 {
1376 	return 0;
1377 }
1378 
1379 static int intel_vgpu_set_msi_trigger(struct intel_vgpu *vgpu,
1380 		unsigned int index, unsigned int start, unsigned int count,
1381 		u32 flags, void *data)
1382 {
1383 	struct eventfd_ctx *trigger;
1384 
1385 	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
1386 		int fd = *(int *)data;
1387 
1388 		trigger = eventfd_ctx_fdget(fd);
1389 		if (IS_ERR(trigger)) {
1390 			gvt_vgpu_err("eventfd_ctx_fdget failed\n");
1391 			return PTR_ERR(trigger);
1392 		}
1393 		kvmgt_vdev(vgpu)->msi_trigger = trigger;
1394 	} else if ((flags & VFIO_IRQ_SET_DATA_NONE) && !count)
1395 		intel_vgpu_release_msi_eventfd_ctx(vgpu);
1396 
1397 	return 0;
1398 }
1399 
1400 static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags,
1401 		unsigned int index, unsigned int start, unsigned int count,
1402 		void *data)
1403 {
1404 	int (*func)(struct intel_vgpu *vgpu, unsigned int index,
1405 			unsigned int start, unsigned int count, u32 flags,
1406 			void *data) = NULL;
1407 
1408 	switch (index) {
1409 	case VFIO_PCI_INTX_IRQ_INDEX:
1410 		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
1411 		case VFIO_IRQ_SET_ACTION_MASK:
1412 			func = intel_vgpu_set_intx_mask;
1413 			break;
1414 		case VFIO_IRQ_SET_ACTION_UNMASK:
1415 			func = intel_vgpu_set_intx_unmask;
1416 			break;
1417 		case VFIO_IRQ_SET_ACTION_TRIGGER:
1418 			func = intel_vgpu_set_intx_trigger;
1419 			break;
1420 		}
1421 		break;
1422 	case VFIO_PCI_MSI_IRQ_INDEX:
1423 		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
1424 		case VFIO_IRQ_SET_ACTION_MASK:
1425 		case VFIO_IRQ_SET_ACTION_UNMASK:
1426 			/* XXX Need masking support exported */
1427 			break;
1428 		case VFIO_IRQ_SET_ACTION_TRIGGER:
1429 			func = intel_vgpu_set_msi_trigger;
1430 			break;
1431 		}
1432 		break;
1433 	}
1434 
1435 	if (!func)
1436 		return -ENOTTY;
1437 
1438 	return func(vgpu, index, start, count, flags, data);
1439 }
1440 
/*
 * mdev ->ioctl() handler for the vGPU.
 *
 * Handles VFIO_DEVICE_GET_INFO, VFIO_DEVICE_GET_REGION_INFO,
 * VFIO_DEVICE_GET_IRQ_INFO, VFIO_DEVICE_SET_IRQS, VFIO_DEVICE_RESET,
 * VFIO_DEVICE_QUERY_GFX_PLANE and VFIO_DEVICE_GET_GFX_DMABUF.  @arg is the
 * user-space pointer (or value) supplied with the command.
 *
 * Returns 0 or a command-specific value on success, a negative errno on
 * failure, and -ENOTTY for any command not listed above.
 */
static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd,
			     unsigned long arg)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
	unsigned long minsz;

	gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd);

	if (cmd == VFIO_DEVICE_GET_INFO) {
		struct vfio_device_info info;

		/* Only the fields up to and including num_irqs are exchanged. */
		minsz = offsetofend(struct vfio_device_info, num_irqs);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = VFIO_DEVICE_FLAGS_PCI;
		info.flags |= VFIO_DEVICE_FLAGS_RESET;
		/* Fixed PCI regions plus the device-specific ones. */
		info.num_regions = VFIO_PCI_NUM_REGIONS +
				vdev->num_regions;
		info.num_irqs = VFIO_PCI_NUM_IRQS;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
		struct vfio_region_info info;
		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
		unsigned int i;
		int ret;
		/* Allocated only for BAR2; freed on every exit path below. */
		struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
		int nr_areas = 1;
		int cap_type_id;

		minsz = offsetofend(struct vfio_region_info, offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_CONFIG_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = vgpu->gvt->device_info.cfg_space_size;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR0_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = vgpu->cfg_space.bar[info.index].size;
			/* A zero-sized BAR is reported with no access flags. */
			if (!info.size) {
				info.flags = 0;
				break;
			}

			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR1_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;
			break;
		case VFIO_PCI_BAR2_REGION_INDEX:
			/*
			 * The region is sized from gvt_aperture_sz(), while the
			 * single sparse-mmap area below is limited to this
			 * vGPU's own aperture offset and size.
			 */
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.flags = VFIO_REGION_INFO_FLAG_CAPS |
					VFIO_REGION_INFO_FLAG_MMAP |
					VFIO_REGION_INFO_FLAG_READ |
					VFIO_REGION_INFO_FLAG_WRITE;
			info.size = gvt_aperture_sz(vgpu->gvt);

			sparse = kzalloc(struct_size(sparse, areas, nr_areas),
					 GFP_KERNEL);
			if (!sparse)
				return -ENOMEM;

			sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
			sparse->header.version = 1;
			sparse->nr_areas = nr_areas;
			cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
			sparse->areas[0].offset =
					PAGE_ALIGN(vgpu_aperture_offset(vgpu));
			sparse->areas[0].size = vgpu_aperture_sz(vgpu);
			break;

		case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;

			gvt_dbg_core("get region info bar:%d\n", info.index);
			break;

		case VFIO_PCI_ROM_REGION_INDEX:
		case VFIO_PCI_VGA_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;

			gvt_dbg_core("get region info index:%d\n", info.index);
			break;
		default:
			/* Device-specific region registered in vdev->region[]. */
			{
				struct vfio_region_info_cap_type cap_type = {
					.header.id = VFIO_REGION_INFO_CAP_TYPE,
					.header.version = 1 };

				if (info.index >= VFIO_PCI_NUM_REGIONS +
						vdev->num_regions)
					return -EINVAL;
				/* Clamp the user-supplied index under speculation. */
				info.index =
					array_index_nospec(info.index,
							VFIO_PCI_NUM_REGIONS +
							vdev->num_regions);

				i = info.index - VFIO_PCI_NUM_REGIONS;

				info.offset =
					VFIO_PCI_INDEX_TO_OFFSET(info.index);
				info.size = vdev->region[i].size;
				info.flags = vdev->region[i].flags;

				cap_type.type = vdev->region[i].type;
				cap_type.subtype = vdev->region[i].subtype;

				/* sparse is NULL on this path, nothing to free. */
				ret = vfio_info_add_capability(&caps,
							&cap_type.header,
							sizeof(cap_type));
				if (ret)
					return ret;
			}
		}

		/* Append the sparse-mmap capability prepared for BAR2. */
		if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
			switch (cap_type_id) {
			case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
				ret = vfio_info_add_capability(&caps,
					&sparse->header,
					struct_size(sparse, areas,
						    sparse->nr_areas));
				if (ret) {
					kfree(sparse);
					return ret;
				}
				break;
			default:
				kfree(sparse);
				return -EINVAL;
			}
		}

		if (caps.size) {
			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
			if (info.argsz < sizeof(info) + caps.size) {
				/*
				 * User buffer too small for the capability
				 * chain: report the required size instead.
				 */
				info.argsz = sizeof(info) + caps.size;
				info.cap_offset = 0;
			} else {
				/* Capabilities are placed right after info. */
				vfio_info_cap_shift(&caps, sizeof(info));
				if (copy_to_user((void __user *)arg +
						  sizeof(info), caps.buf,
						  caps.size)) {
					kfree(caps.buf);
					kfree(sparse);
					return -EFAULT;
				}
				info.cap_offset = sizeof(info);
			}

			kfree(caps.buf);
		}

		kfree(sparse);
		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
		struct vfio_irq_info info;

		minsz = offsetofend(struct vfio_irq_info, count);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
			return -EINVAL;

		/* Only INTx and MSI are described. */
		switch (info.index) {
		case VFIO_PCI_INTX_IRQ_INDEX:
		case VFIO_PCI_MSI_IRQ_INDEX:
			break;
		default:
			return -EINVAL;
		}

		info.flags = VFIO_IRQ_INFO_EVENTFD;

		info.count = intel_vgpu_get_irq_count(vgpu, info.index);

		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
				       VFIO_IRQ_INFO_AUTOMASKED);
		else
			info.flags |= VFIO_IRQ_INFO_NORESIZE;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
		struct vfio_irq_set hdr;
		/* Stays NULL unless a non-empty payload is copied in below. */
		u8 *data = NULL;
		int ret = 0;
		size_t data_size = 0;

		minsz = offsetofend(struct vfio_irq_set, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		/*
		 * NOTE(review): the header is validated only when the
		 * DATA_NONE bit is clear.  A request that sets DATA_NONE
		 * (possibly together with other data-type bits) reaches
		 * intel_vgpu_set_irqs() unvalidated and with data == NULL;
		 * likewise data stays NULL when data_size comes back as 0.
		 * Confirm the handlers cope with a NULL payload.
		 */
		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
			int max = intel_vgpu_get_irq_count(vgpu, hdr.index);

			ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
						VFIO_PCI_NUM_IRQS, &data_size);
			if (ret) {
				gvt_vgpu_err("intel:vfio_set_irqs_validate_and_prepare failed\n");
				return -EINVAL;
			}
			if (data_size) {
				/* The payload follows the fixed header in user memory. */
				data = memdup_user((void __user *)(arg + minsz),
						   data_size);
				if (IS_ERR(data))
					return PTR_ERR(data);
			}
		}

		ret = intel_vgpu_set_irqs(vgpu, hdr.flags, hdr.index,
					hdr.start, hdr.count, data);
		kfree(data);

		return ret;
	} else if (cmd == VFIO_DEVICE_RESET) {
		intel_gvt_ops->vgpu_reset(vgpu);
		return 0;
	} else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) {
		struct vfio_device_gfx_plane_info dmabuf;
		int ret = 0;

		minsz = offsetofend(struct vfio_device_gfx_plane_info,
				    dmabuf_id);
		if (copy_from_user(&dmabuf, (void __user *)arg, minsz))
			return -EFAULT;
		if (dmabuf.argsz < minsz)
			return -EINVAL;

		ret = intel_gvt_ops->vgpu_query_plane(vgpu, &dmabuf);
		if (ret != 0)
			return ret;

		return copy_to_user((void __user *)arg, &dmabuf, minsz) ?
								-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_GET_GFX_DMABUF) {
		__u32 dmabuf_id;
		__s32 dmabuf_fd;

		/* @arg points at the 32-bit dmabuf id to look up. */
		if (get_user(dmabuf_id, (__u32 __user *)arg))
			return -EFAULT;

		/* The hook's return value is passed straight back to the caller. */
		dmabuf_fd = intel_gvt_ops->vgpu_get_dmabuf(vgpu, dmabuf_id);
		return dmabuf_fd;

	}

	return -ENOTTY;
}
1719 
1720 static ssize_t
1721 vgpu_id_show(struct device *dev, struct device_attribute *attr,
1722 	     char *buf)
1723 {
1724 	struct mdev_device *mdev = mdev_from_dev(dev);
1725 
1726 	if (mdev) {
1727 		struct intel_vgpu *vgpu = (struct intel_vgpu *)
1728 			mdev_get_drvdata(mdev);
1729 		return sprintf(buf, "%d\n", vgpu->id);
1730 	}
1731 	return sprintf(buf, "\n");
1732 }
1733 
1734 static DEVICE_ATTR_RO(vgpu_id);
1735 
1736 static struct attribute *intel_vgpu_attrs[] = {
1737 	&dev_attr_vgpu_id.attr,
1738 	NULL
1739 };
1740 
1741 static const struct attribute_group intel_vgpu_group = {
1742 	.name = "intel_vgpu",
1743 	.attrs = intel_vgpu_attrs,
1744 };
1745 
1746 static const struct attribute_group *intel_vgpu_groups[] = {
1747 	&intel_vgpu_group,
1748 	NULL,
1749 };
1750 
1751 static struct mdev_parent_ops intel_vgpu_ops = {
1752 	.mdev_attr_groups       = intel_vgpu_groups,
1753 	.create			= intel_vgpu_create,
1754 	.remove			= intel_vgpu_remove,
1755 
1756 	.open			= intel_vgpu_open,
1757 	.release		= intel_vgpu_release,
1758 
1759 	.read			= intel_vgpu_read,
1760 	.write			= intel_vgpu_write,
1761 	.mmap			= intel_vgpu_mmap,
1762 	.ioctl			= intel_vgpu_ioctl,
1763 };
1764 
1765 static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops)
1766 {
1767 	int ret;
1768 
1769 	ret = intel_gvt_init_vgpu_type_groups((struct intel_gvt *)gvt);
1770 	if (ret)
1771 		return ret;
1772 
1773 	intel_gvt_ops = ops;
1774 	intel_vgpu_ops.supported_type_groups = gvt_vgpu_type_groups;
1775 
1776 	ret = mdev_register_device(dev, &intel_vgpu_ops);
1777 	if (ret)
1778 		intel_gvt_cleanup_vgpu_type_groups((struct intel_gvt *)gvt);
1779 
1780 	return ret;
1781 }
1782 
/*
 * Undo kvmgt_host_init(): unregister the mdev parent first, then release
 * the vGPU type groups.
 */
static void kvmgt_host_exit(struct device *dev, void *gvt)
{
	struct intel_gvt *gvt_dev = (struct intel_gvt *)gvt;

	mdev_unregister_device(dev);
	intel_gvt_cleanup_vgpu_type_groups(gvt_dev);
}
1788 
1789 static int kvmgt_page_track_add(unsigned long handle, u64 gfn)
1790 {
1791 	struct kvmgt_guest_info *info;
1792 	struct kvm *kvm;
1793 	struct kvm_memory_slot *slot;
1794 	int idx;
1795 
1796 	if (!handle_valid(handle))
1797 		return -ESRCH;
1798 
1799 	info = (struct kvmgt_guest_info *)handle;
1800 	kvm = info->kvm;
1801 
1802 	idx = srcu_read_lock(&kvm->srcu);
1803 	slot = gfn_to_memslot(kvm, gfn);
1804 	if (!slot) {
1805 		srcu_read_unlock(&kvm->srcu, idx);
1806 		return -EINVAL;
1807 	}
1808 
1809 	write_lock(&kvm->mmu_lock);
1810 
1811 	if (kvmgt_gfn_is_write_protected(info, gfn))
1812 		goto out;
1813 
1814 	kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1815 	kvmgt_protect_table_add(info, gfn);
1816 
1817 out:
1818 	write_unlock(&kvm->mmu_lock);
1819 	srcu_read_unlock(&kvm->srcu, idx);
1820 	return 0;
1821 }
1822 
1823 static int kvmgt_page_track_remove(unsigned long handle, u64 gfn)
1824 {
1825 	struct kvmgt_guest_info *info;
1826 	struct kvm *kvm;
1827 	struct kvm_memory_slot *slot;
1828 	int idx;
1829 
1830 	if (!handle_valid(handle))
1831 		return 0;
1832 
1833 	info = (struct kvmgt_guest_info *)handle;
1834 	kvm = info->kvm;
1835 
1836 	idx = srcu_read_lock(&kvm->srcu);
1837 	slot = gfn_to_memslot(kvm, gfn);
1838 	if (!slot) {
1839 		srcu_read_unlock(&kvm->srcu, idx);
1840 		return -EINVAL;
1841 	}
1842 
1843 	write_lock(&kvm->mmu_lock);
1844 
1845 	if (!kvmgt_gfn_is_write_protected(info, gfn))
1846 		goto out;
1847 
1848 	kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1849 	kvmgt_protect_table_del(info, gfn);
1850 
1851 out:
1852 	write_unlock(&kvm->mmu_lock);
1853 	srcu_read_unlock(&kvm->srcu, idx);
1854 	return 0;
1855 }
1856 
1857 static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1858 		const u8 *val, int len,
1859 		struct kvm_page_track_notifier_node *node)
1860 {
1861 	struct kvmgt_guest_info *info = container_of(node,
1862 					struct kvmgt_guest_info, track_node);
1863 
1864 	if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
1865 		intel_gvt_ops->write_protect_handler(info->vgpu, gpa,
1866 						     (void *)val, len);
1867 }
1868 
1869 static void kvmgt_page_track_flush_slot(struct kvm *kvm,
1870 		struct kvm_memory_slot *slot,
1871 		struct kvm_page_track_notifier_node *node)
1872 {
1873 	int i;
1874 	gfn_t gfn;
1875 	struct kvmgt_guest_info *info = container_of(node,
1876 					struct kvmgt_guest_info, track_node);
1877 
1878 	write_lock(&kvm->mmu_lock);
1879 	for (i = 0; i < slot->npages; i++) {
1880 		gfn = slot->base_gfn + i;
1881 		if (kvmgt_gfn_is_write_protected(info, gfn)) {
1882 			kvm_slot_page_track_remove_page(kvm, slot, gfn,
1883 						KVM_PAGE_TRACK_WRITE);
1884 			kvmgt_protect_table_del(info, gfn);
1885 		}
1886 	}
1887 	write_unlock(&kvm->mmu_lock);
1888 }
1889 
1890 static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm)
1891 {
1892 	struct intel_vgpu *itr;
1893 	struct kvmgt_guest_info *info;
1894 	int id;
1895 	bool ret = false;
1896 
1897 	mutex_lock(&vgpu->gvt->lock);
1898 	for_each_active_vgpu(vgpu->gvt, itr, id) {
1899 		if (!handle_valid(itr->handle))
1900 			continue;
1901 
1902 		info = (struct kvmgt_guest_info *)itr->handle;
1903 		if (kvm && kvm == info->kvm) {
1904 			ret = true;
1905 			goto out;
1906 		}
1907 	}
1908 out:
1909 	mutex_unlock(&vgpu->gvt->lock);
1910 	return ret;
1911 }
1912 
1913 static int kvmgt_guest_init(struct mdev_device *mdev)
1914 {
1915 	struct kvmgt_guest_info *info;
1916 	struct intel_vgpu *vgpu;
1917 	struct kvmgt_vdev *vdev;
1918 	struct kvm *kvm;
1919 
1920 	vgpu = mdev_get_drvdata(mdev);
1921 	if (handle_valid(vgpu->handle))
1922 		return -EEXIST;
1923 
1924 	vdev = kvmgt_vdev(vgpu);
1925 	kvm = vdev->kvm;
1926 	if (!kvm || kvm->mm != current->mm) {
1927 		gvt_vgpu_err("KVM is required to use Intel vGPU\n");
1928 		return -ESRCH;
1929 	}
1930 
1931 	if (__kvmgt_vgpu_exist(vgpu, kvm))
1932 		return -EEXIST;
1933 
1934 	info = vzalloc(sizeof(struct kvmgt_guest_info));
1935 	if (!info)
1936 		return -ENOMEM;
1937 
1938 	vgpu->handle = (unsigned long)info;
1939 	info->vgpu = vgpu;
1940 	info->kvm = kvm;
1941 	kvm_get_kvm(info->kvm);
1942 
1943 	kvmgt_protect_table_init(info);
1944 	gvt_cache_init(vgpu);
1945 
1946 	info->track_node.track_write = kvmgt_page_track_write;
1947 	info->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
1948 	kvm_page_track_register_notifier(kvm, &info->track_node);
1949 
1950 	info->debugfs_cache_entries = debugfs_create_ulong(
1951 						"kvmgt_nr_cache_entries",
1952 						0444, vgpu->debugfs,
1953 						&vdev->nr_cache_entries);
1954 	return 0;
1955 }
1956 
1957 static bool kvmgt_guest_exit(struct kvmgt_guest_info *info)
1958 {
1959 	debugfs_remove(info->debugfs_cache_entries);
1960 
1961 	kvm_page_track_unregister_notifier(info->kvm, &info->track_node);
1962 	kvm_put_kvm(info->kvm);
1963 	kvmgt_protect_table_destroy(info);
1964 	gvt_cache_destroy(info->vgpu);
1965 	vfree(info);
1966 
1967 	return true;
1968 }
1969 
1970 static int kvmgt_attach_vgpu(void *p_vgpu, unsigned long *handle)
1971 {
1972 	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
1973 
1974 	vgpu->vdev = kzalloc(sizeof(struct kvmgt_vdev), GFP_KERNEL);
1975 
1976 	if (!vgpu->vdev)
1977 		return -ENOMEM;
1978 
1979 	kvmgt_vdev(vgpu)->vgpu = vgpu;
1980 
1981 	return 0;
1982 }
1983 
1984 static void kvmgt_detach_vgpu(void *p_vgpu)
1985 {
1986 	int i;
1987 	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
1988 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
1989 
1990 	if (!vdev->region)
1991 		return;
1992 
1993 	for (i = 0; i < vdev->num_regions; i++)
1994 		if (vdev->region[i].ops->release)
1995 			vdev->region[i].ops->release(vgpu,
1996 					&vdev->region[i]);
1997 	vdev->num_regions = 0;
1998 	kfree(vdev->region);
1999 	vdev->region = NULL;
2000 
2001 	kfree(vdev);
2002 }
2003 
2004 static int kvmgt_inject_msi(unsigned long handle, u32 addr, u16 data)
2005 {
2006 	struct kvmgt_guest_info *info;
2007 	struct intel_vgpu *vgpu;
2008 	struct kvmgt_vdev *vdev;
2009 
2010 	if (!handle_valid(handle))
2011 		return -ESRCH;
2012 
2013 	info = (struct kvmgt_guest_info *)handle;
2014 	vgpu = info->vgpu;
2015 	vdev = kvmgt_vdev(vgpu);
2016 
2017 	/*
2018 	 * When guest is poweroff, msi_trigger is set to NULL, but vgpu's
2019 	 * config and mmio register isn't restored to default during guest
2020 	 * poweroff. If this vgpu is still used in next vm, this vgpu's pipe
2021 	 * may be enabled, then once this vgpu is active, it will get inject
2022 	 * vblank interrupt request. But msi_trigger is null until msi is
2023 	 * enabled by guest. so if msi_trigger is null, success is still
2024 	 * returned and don't inject interrupt into guest.
2025 	 */
2026 	if (vdev->msi_trigger == NULL)
2027 		return 0;
2028 
2029 	if (eventfd_signal(vdev->msi_trigger, 1) == 1)
2030 		return 0;
2031 
2032 	return -EFAULT;
2033 }
2034 
2035 static unsigned long kvmgt_gfn_to_pfn(unsigned long handle, unsigned long gfn)
2036 {
2037 	struct kvmgt_guest_info *info;
2038 	kvm_pfn_t pfn;
2039 
2040 	if (!handle_valid(handle))
2041 		return INTEL_GVT_INVALID_ADDR;
2042 
2043 	info = (struct kvmgt_guest_info *)handle;
2044 
2045 	pfn = gfn_to_pfn(info->kvm, gfn);
2046 	if (is_error_noslot_pfn(pfn))
2047 		return INTEL_GVT_INVALID_ADDR;
2048 
2049 	return pfn;
2050 }
2051 
2052 static int kvmgt_dma_map_guest_page(unsigned long handle, unsigned long gfn,
2053 		unsigned long size, dma_addr_t *dma_addr)
2054 {
2055 	struct intel_vgpu *vgpu;
2056 	struct kvmgt_vdev *vdev;
2057 	struct gvt_dma *entry;
2058 	int ret;
2059 
2060 	if (!handle_valid(handle))
2061 		return -EINVAL;
2062 
2063 	vgpu = ((struct kvmgt_guest_info *)handle)->vgpu;
2064 	vdev = kvmgt_vdev(vgpu);
2065 
2066 	mutex_lock(&vdev->cache_lock);
2067 
2068 	entry = __gvt_cache_find_gfn(vgpu, gfn);
2069 	if (!entry) {
2070 		ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
2071 		if (ret)
2072 			goto err_unlock;
2073 
2074 		ret = __gvt_cache_add(vgpu, gfn, *dma_addr, size);
2075 		if (ret)
2076 			goto err_unmap;
2077 	} else if (entry->size != size) {
2078 		/* the same gfn with different size: unmap and re-map */
2079 		gvt_dma_unmap_page(vgpu, gfn, entry->dma_addr, entry->size);
2080 		__gvt_cache_remove_entry(vgpu, entry);
2081 
2082 		ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
2083 		if (ret)
2084 			goto err_unlock;
2085 
2086 		ret = __gvt_cache_add(vgpu, gfn, *dma_addr, size);
2087 		if (ret)
2088 			goto err_unmap;
2089 	} else {
2090 		kref_get(&entry->ref);
2091 		*dma_addr = entry->dma_addr;
2092 	}
2093 
2094 	mutex_unlock(&vdev->cache_lock);
2095 	return 0;
2096 
2097 err_unmap:
2098 	gvt_dma_unmap_page(vgpu, gfn, *dma_addr, size);
2099 err_unlock:
2100 	mutex_unlock(&vdev->cache_lock);
2101 	return ret;
2102 }
2103 
2104 static int kvmgt_dma_pin_guest_page(unsigned long handle, dma_addr_t dma_addr)
2105 {
2106 	struct kvmgt_guest_info *info;
2107 	struct kvmgt_vdev *vdev;
2108 	struct gvt_dma *entry;
2109 	int ret = 0;
2110 
2111 	if (!handle_valid(handle))
2112 		return -ENODEV;
2113 
2114 	info = (struct kvmgt_guest_info *)handle;
2115 	vdev = kvmgt_vdev(info->vgpu);
2116 
2117 	mutex_lock(&vdev->cache_lock);
2118 	entry = __gvt_cache_find_dma_addr(info->vgpu, dma_addr);
2119 	if (entry)
2120 		kref_get(&entry->ref);
2121 	else
2122 		ret = -ENOMEM;
2123 	mutex_unlock(&vdev->cache_lock);
2124 
2125 	return ret;
2126 }
2127 
2128 static void __gvt_dma_release(struct kref *ref)
2129 {
2130 	struct gvt_dma *entry = container_of(ref, typeof(*entry), ref);
2131 
2132 	gvt_dma_unmap_page(entry->vgpu, entry->gfn, entry->dma_addr,
2133 			   entry->size);
2134 	__gvt_cache_remove_entry(entry->vgpu, entry);
2135 }
2136 
2137 static void kvmgt_dma_unmap_guest_page(unsigned long handle, dma_addr_t dma_addr)
2138 {
2139 	struct intel_vgpu *vgpu;
2140 	struct kvmgt_vdev *vdev;
2141 	struct gvt_dma *entry;
2142 
2143 	if (!handle_valid(handle))
2144 		return;
2145 
2146 	vgpu = ((struct kvmgt_guest_info *)handle)->vgpu;
2147 	vdev = kvmgt_vdev(vgpu);
2148 
2149 	mutex_lock(&vdev->cache_lock);
2150 	entry = __gvt_cache_find_dma_addr(vgpu, dma_addr);
2151 	if (entry)
2152 		kref_put(&entry->ref, __gvt_dma_release);
2153 	mutex_unlock(&vdev->cache_lock);
2154 }
2155 
2156 static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
2157 			void *buf, unsigned long len, bool write)
2158 {
2159 	struct kvmgt_guest_info *info;
2160 
2161 	if (!handle_valid(handle))
2162 		return -ESRCH;
2163 
2164 	info = (struct kvmgt_guest_info *)handle;
2165 
2166 	return vfio_dma_rw(kvmgt_vdev(info->vgpu)->vfio_group,
2167 			   gpa, buf, len, write);
2168 }
2169 
2170 static int kvmgt_read_gpa(unsigned long handle, unsigned long gpa,
2171 			void *buf, unsigned long len)
2172 {
2173 	return kvmgt_rw_gpa(handle, gpa, buf, len, false);
2174 }
2175 
2176 static int kvmgt_write_gpa(unsigned long handle, unsigned long gpa,
2177 			void *buf, unsigned long len)
2178 {
2179 	return kvmgt_rw_gpa(handle, gpa, buf, len, true);
2180 }
2181 
/* Page frame number of the physical address that __pa() yields for @addr. */
static unsigned long kvmgt_virt_to_pfn(void *addr)
{
	unsigned long pfn = PFN_DOWN(__pa(addr));

	return pfn;
}
2186 
2187 static bool kvmgt_is_valid_gfn(unsigned long handle, unsigned long gfn)
2188 {
2189 	struct kvmgt_guest_info *info;
2190 	struct kvm *kvm;
2191 	int idx;
2192 	bool ret;
2193 
2194 	if (!handle_valid(handle))
2195 		return false;
2196 
2197 	info = (struct kvmgt_guest_info *)handle;
2198 	kvm = info->kvm;
2199 
2200 	idx = srcu_read_lock(&kvm->srcu);
2201 	ret = kvm_is_visible_gfn(kvm, gfn);
2202 	srcu_read_unlock(&kvm->srcu, idx);
2203 
2204 	return ret;
2205 }
2206 
2207 static const struct intel_gvt_mpt kvmgt_mpt = {
2208 	.type = INTEL_GVT_HYPERVISOR_KVM,
2209 	.host_init = kvmgt_host_init,
2210 	.host_exit = kvmgt_host_exit,
2211 	.attach_vgpu = kvmgt_attach_vgpu,
2212 	.detach_vgpu = kvmgt_detach_vgpu,
2213 	.inject_msi = kvmgt_inject_msi,
2214 	.from_virt_to_mfn = kvmgt_virt_to_pfn,
2215 	.enable_page_track = kvmgt_page_track_add,
2216 	.disable_page_track = kvmgt_page_track_remove,
2217 	.read_gpa = kvmgt_read_gpa,
2218 	.write_gpa = kvmgt_write_gpa,
2219 	.gfn_to_mfn = kvmgt_gfn_to_pfn,
2220 	.dma_map_guest_page = kvmgt_dma_map_guest_page,
2221 	.dma_unmap_guest_page = kvmgt_dma_unmap_guest_page,
2222 	.dma_pin_guest_page = kvmgt_dma_pin_guest_page,
2223 	.set_opregion = kvmgt_set_opregion,
2224 	.set_edid = kvmgt_set_edid,
2225 	.get_vfio_device = kvmgt_get_vfio_device,
2226 	.put_vfio_device = kvmgt_put_vfio_device,
2227 	.is_valid_gfn = kvmgt_is_valid_gfn,
2228 };
2229 
2230 static int __init kvmgt_init(void)
2231 {
2232 	if (intel_gvt_register_hypervisor(&kvmgt_mpt) < 0)
2233 		return -ENODEV;
2234 	return 0;
2235 }
2236 
2237 static void __exit kvmgt_exit(void)
2238 {
2239 	intel_gvt_unregister_hypervisor();
2240 }
2241 
2242 module_init(kvmgt_init);
2243 module_exit(kvmgt_exit);
2244 
2245 MODULE_LICENSE("GPL and additional rights");
2246 MODULE_AUTHOR("Intel Corporation");
2247