xref: /openbmc/linux/drivers/gpu/drm/i915/gvt/kvmgt.c (revision cef69974)
1 /*
2  * KVMGT - the implementation of Intel mediated pass-through framework for KVM
3  *
4  * Copyright(c) 2014-2016 Intel Corporation. All rights reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23  * SOFTWARE.
24  *
25  * Authors:
26  *    Kevin Tian <kevin.tian@intel.com>
27  *    Jike Song <jike.song@intel.com>
28  *    Xiaoguang Chen <xiaoguang.chen@intel.com>
29  */
30 
31 #include <linux/init.h>
32 #include <linux/device.h>
33 #include <linux/mm.h>
34 #include <linux/kthread.h>
35 #include <linux/sched/mm.h>
36 #include <linux/types.h>
37 #include <linux/list.h>
38 #include <linux/rbtree.h>
39 #include <linux/spinlock.h>
40 #include <linux/eventfd.h>
41 #include <linux/uuid.h>
42 #include <linux/kvm_host.h>
43 #include <linux/vfio.h>
44 #include <linux/mdev.h>
45 #include <linux/debugfs.h>
46 
47 #include <linux/nospec.h>
48 
49 #include <drm/drm_edid.h>
50 
51 #include "i915_drv.h"
52 #include "gvt.h"
53 
54 static const struct intel_gvt_ops *intel_gvt_ops;
55 
56 /* helper macros copied from vfio-pci */
57 #define VFIO_PCI_OFFSET_SHIFT   40
58 #define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
59 #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
60 #define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
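/*
 * A worked example of the layout above: for region index 2 (BAR2) and an
 * offset of 0x1000 within that region, the vfio offset is
 * (2ULL << VFIO_PCI_OFFSET_SHIFT) | 0x1000; VFIO_PCI_OFFSET_TO_INDEX()
 * recovers the index (2) and masking with VFIO_PCI_OFFSET_MASK recovers
 * the in-region offset (0x1000).
 */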
61 
62 #define EDID_BLOB_OFFSET (PAGE_SIZE/2)
63 
64 #define OPREGION_SIGNATURE "IntelGraphicsMem"
65 
66 struct vfio_region;
67 struct intel_vgpu_regops {
68 	size_t (*rw)(struct intel_vgpu *vgpu, char *buf,
69 			size_t count, loff_t *ppos, bool iswrite);
70 	void (*release)(struct intel_vgpu *vgpu,
71 			struct vfio_region *region);
72 };
73 
74 struct vfio_region {
75 	u32				type;
76 	u32				subtype;
77 	size_t				size;
78 	u32				flags;
79 	const struct intel_vgpu_regops	*ops;
80 	void				*data;
81 };
82 
83 struct vfio_edid_region {
84 	struct vfio_region_gfx_edid vfio_edid_regs;
85 	void *edid_blob;
86 };
87 
88 struct kvmgt_pgfn {
89 	gfn_t gfn;
90 	struct hlist_node hnode;
91 };
92 
93 #define KVMGT_DEBUGFS_FILENAME "kvmgt_nr_cache_entries"
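/*
 * Per-guest state established at open time: the associated kvm instance,
 * the owning vgpu, the KVM page-track notifier node, and a hash table
 * (ptable) of the gfns that are currently write-protected for this guest.
 */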
94 struct kvmgt_guest_info {
95 	struct kvm *kvm;
96 	struct intel_vgpu *vgpu;
97 	struct kvm_page_track_notifier_node track_node;
98 #define NR_BKT (1 << 18)
99 	struct hlist_head ptable[NR_BKT];
100 #undef NR_BKT
101 };
102 
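/*
 * One cached guest-page -> DMA mapping. Each entry is indexed in both the
 * gfn_cache and dma_addr_cache rb-trees and carries a kref so the mapping
 * can be reused and torn down only when its last user drops it.
 */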
103 struct gvt_dma {
104 	struct intel_vgpu *vgpu;
105 	struct rb_node gfn_node;
106 	struct rb_node dma_addr_node;
107 	gfn_t gfn;
108 	dma_addr_t dma_addr;
109 	unsigned long size;
110 	struct kref ref;
111 };
112 
113 struct kvmgt_vdev {
114 	struct intel_vgpu *vgpu;
115 	struct mdev_device *mdev;
116 	struct vfio_region *region;
117 	int num_regions;
118 	struct eventfd_ctx *intx_trigger;
119 	struct eventfd_ctx *msi_trigger;
120 
121 	/*
122 	 * Two caches are used to avoid mapping duplicated pages (e.g.
123 	 * scratch pages). This helps to reduce DMA setup overhead.
124 	 */
125 	struct rb_root gfn_cache;
126 	struct rb_root dma_addr_cache;
127 	unsigned long nr_cache_entries;
128 	struct mutex cache_lock;
129 
130 	struct notifier_block iommu_notifier;
131 	struct notifier_block group_notifier;
132 	struct kvm *kvm;
133 	struct work_struct release_work;
134 	atomic_t released;
135 	struct vfio_device *vfio_device;
136 	struct vfio_group *vfio_group;
137 };
138 
139 static inline struct kvmgt_vdev *kvmgt_vdev(struct intel_vgpu *vgpu)
140 {
141 	return intel_vgpu_vdev(vgpu);
142 }
143 
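/*
 * vgpu->handle stores a pointer to the kvmgt_guest_info once the guest has
 * been initialized; any value that fits in the low byte is treated as
 * "no guest attached".
 */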
144 static inline bool handle_valid(unsigned long handle)
145 {
146 	return !!(handle & ~0xff);
147 }
148 
149 static ssize_t available_instances_show(struct mdev_type *mtype,
150 					struct mdev_type_attribute *attr,
151 					char *buf)
152 {
153 	struct intel_vgpu_type *type;
154 	unsigned int num = 0;
155 	struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt;
156 
157 	type = &gvt->types[mtype_get_type_group_id(mtype)];
158 	if (!type)
159 		num = 0;
160 	else
161 		num = type->avail_instance;
162 
163 	return sprintf(buf, "%u\n", num);
164 }
165 
166 static ssize_t device_api_show(struct mdev_type *mtype,
167 			       struct mdev_type_attribute *attr, char *buf)
168 {
169 	return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
170 }
171 
172 static ssize_t description_show(struct mdev_type *mtype,
173 				struct mdev_type_attribute *attr, char *buf)
174 {
175 	struct intel_vgpu_type *type;
176 	struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt;
177 
178 	type = &gvt->types[mtype_get_type_group_id(mtype)];
179 	if (!type)
180 		return 0;
181 
182 	return sprintf(buf, "low_gm_size: %dMB\nhigh_gm_size: %dMB\n"
183 		       "fence: %d\nresolution: %s\n"
184 		       "weight: %d\n",
185 		       BYTES_TO_MB(type->low_gm_size),
186 		       BYTES_TO_MB(type->high_gm_size),
187 		       type->fence, vgpu_edid_str(type->resolution),
188 		       type->weight);
189 }
190 
191 static ssize_t name_show(struct mdev_type *mtype,
192 			 struct mdev_type_attribute *attr, char *buf)
193 {
194 	struct intel_vgpu_type *type;
195 	struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt;
196 
197 	type = &gvt->types[mtype_get_type_group_id(mtype)];
198 	if (!type)
199 		return 0;
200 
201 	return sprintf(buf, "%s\n", type->name);
202 }
203 
204 static MDEV_TYPE_ATTR_RO(available_instances);
205 static MDEV_TYPE_ATTR_RO(device_api);
206 static MDEV_TYPE_ATTR_RO(description);
207 static MDEV_TYPE_ATTR_RO(name);
208 
209 static struct attribute *gvt_type_attrs[] = {
210 	&mdev_type_attr_available_instances.attr,
211 	&mdev_type_attr_device_api.attr,
212 	&mdev_type_attr_description.attr,
213 	&mdev_type_attr_name.attr,
214 	NULL,
215 };
216 
217 static struct attribute_group *gvt_vgpu_type_groups[] = {
218 	[0 ... NR_MAX_INTEL_VGPU_TYPES - 1] = NULL,
219 };
220 
221 static int intel_gvt_init_vgpu_type_groups(struct intel_gvt *gvt)
222 {
223 	int i, j;
224 	struct intel_vgpu_type *type;
225 	struct attribute_group *group;
226 
227 	for (i = 0; i < gvt->num_types; i++) {
228 		type = &gvt->types[i];
229 
230 		group = kzalloc(sizeof(struct attribute_group), GFP_KERNEL);
231 		if (!group)
232 			goto unwind;
233 
234 		group->name = type->name;
235 		group->attrs = gvt_type_attrs;
236 		gvt_vgpu_type_groups[i] = group;
237 	}
238 
239 	return 0;
240 
241 unwind:
242 	for (j = 0; j < i; j++) {
243 		group = gvt_vgpu_type_groups[j];
244 		kfree(group);
245 	}
246 
247 	return -ENOMEM;
248 }
249 
250 static void intel_gvt_cleanup_vgpu_type_groups(struct intel_gvt *gvt)
251 {
252 	int i;
253 	struct attribute_group *group;
254 
255 	for (i = 0; i < gvt->num_types; i++) {
256 		group = gvt_vgpu_type_groups[i];
257 		gvt_vgpu_type_groups[i] = NULL;
258 		kfree(group);
259 	}
260 }
261 
262 static int kvmgt_guest_init(struct mdev_device *mdev);
263 static void intel_vgpu_release_work(struct work_struct *work);
264 static bool kvmgt_guest_exit(struct kvmgt_guest_info *info);
265 
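/*
 * Unpin the guest pages backing [gfn, gfn + size), one page at a time,
 * mirroring how gvt_pin_guest_page() pinned them.
 */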
266 static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
267 		unsigned long size)
268 {
269 	struct drm_i915_private *i915 = vgpu->gvt->gt->i915;
270 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
271 	int total_pages;
272 	int npage;
273 	int ret;
274 
275 	total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
276 
277 	for (npage = 0; npage < total_pages; npage++) {
278 		unsigned long cur_gfn = gfn + npage;
279 
280 		ret = vfio_group_unpin_pages(vdev->vfio_group, &cur_gfn, 1);
281 		drm_WARN_ON(&i915->drm, ret != 1);
282 	}
283 }
284 
285 /* Pin a normal or compound guest page for dma. */
286 static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
287 		unsigned long size, struct page **page)
288 {
289 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
290 	unsigned long base_pfn = 0;
291 	int total_pages;
292 	int npage;
293 	int ret;
294 
295 	total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
296 	/*
297 	 * We pin the pages one by one to avoid allocating a big array
298 	 * on the stack to hold pfns.
299 	 */
300 	for (npage = 0; npage < total_pages; npage++) {
301 		unsigned long cur_gfn = gfn + npage;
302 		unsigned long pfn;
303 
304 		ret = vfio_group_pin_pages(vdev->vfio_group, &cur_gfn, 1,
305 					   IOMMU_READ | IOMMU_WRITE, &pfn);
306 		if (ret != 1) {
307 			gvt_vgpu_err("vfio_pin_pages failed for gfn 0x%lx, ret %d\n",
308 				     cur_gfn, ret);
309 			goto err;
310 		}
311 
312 		if (!pfn_valid(pfn)) {
313 			gvt_vgpu_err("pfn 0x%lx is not mem backed\n", pfn);
314 			npage++;
315 			ret = -EFAULT;
316 			goto err;
317 		}
318 
319 		if (npage == 0)
320 			base_pfn = pfn;
321 		else if (base_pfn + npage != pfn) {
322 			gvt_vgpu_err("The pages are not contiguous\n");
323 			ret = -EINVAL;
324 			npage++;
325 			goto err;
326 		}
327 	}
328 
329 	*page = pfn_to_page(base_pfn);
330 	return 0;
331 err:
332 	gvt_unpin_guest_page(vgpu, gfn, npage * PAGE_SIZE);
333 	return ret;
334 }
335 
336 static int gvt_dma_map_page(struct intel_vgpu *vgpu, unsigned long gfn,
337 		dma_addr_t *dma_addr, unsigned long size)
338 {
339 	struct device *dev = vgpu->gvt->gt->i915->drm.dev;
340 	struct page *page = NULL;
341 	int ret;
342 
343 	ret = gvt_pin_guest_page(vgpu, gfn, size, &page);
344 	if (ret)
345 		return ret;
346 
347 	/* Setup DMA mapping. */
348 	*dma_addr = dma_map_page(dev, page, 0, size, DMA_BIDIRECTIONAL);
349 	if (dma_mapping_error(dev, *dma_addr)) {
350 		gvt_vgpu_err("DMA mapping failed for pfn 0x%lx, ret %d\n",
351 			     page_to_pfn(page), ret);
352 		gvt_unpin_guest_page(vgpu, gfn, size);
353 		return -ENOMEM;
354 	}
355 
356 	return 0;
357 }
358 
359 static void gvt_dma_unmap_page(struct intel_vgpu *vgpu, unsigned long gfn,
360 		dma_addr_t dma_addr, unsigned long size)
361 {
362 	struct device *dev = vgpu->gvt->gt->i915->drm.dev;
363 
364 	dma_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL);
365 	gvt_unpin_guest_page(vgpu, gfn, size);
366 }
367 
368 static struct gvt_dma *__gvt_cache_find_dma_addr(struct intel_vgpu *vgpu,
369 		dma_addr_t dma_addr)
370 {
371 	struct rb_node *node = kvmgt_vdev(vgpu)->dma_addr_cache.rb_node;
372 	struct gvt_dma *itr;
373 
374 	while (node) {
375 		itr = rb_entry(node, struct gvt_dma, dma_addr_node);
376 
377 		if (dma_addr < itr->dma_addr)
378 			node = node->rb_left;
379 		else if (dma_addr > itr->dma_addr)
380 			node = node->rb_right;
381 		else
382 			return itr;
383 	}
384 	return NULL;
385 }
386 
387 static struct gvt_dma *__gvt_cache_find_gfn(struct intel_vgpu *vgpu, gfn_t gfn)
388 {
389 	struct rb_node *node = kvmgt_vdev(vgpu)->gfn_cache.rb_node;
390 	struct gvt_dma *itr;
391 
392 	while (node) {
393 		itr = rb_entry(node, struct gvt_dma, gfn_node);
394 
395 		if (gfn < itr->gfn)
396 			node = node->rb_left;
397 		else if (gfn > itr->gfn)
398 			node = node->rb_right;
399 		else
400 			return itr;
401 	}
402 	return NULL;
403 }
404 
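/*
 * Insert a new mapping into the per-vgpu cache. The entry is linked into
 * both rb-trees so it can later be looked up either by gfn (gfn_cache) or
 * by DMA address (dma_addr_cache). Callers are expected to hold
 * vdev->cache_lock.
 */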
405 static int __gvt_cache_add(struct intel_vgpu *vgpu, gfn_t gfn,
406 		dma_addr_t dma_addr, unsigned long size)
407 {
408 	struct gvt_dma *new, *itr;
409 	struct rb_node **link, *parent = NULL;
410 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
411 
412 	new = kzalloc(sizeof(struct gvt_dma), GFP_KERNEL);
413 	if (!new)
414 		return -ENOMEM;
415 
416 	new->vgpu = vgpu;
417 	new->gfn = gfn;
418 	new->dma_addr = dma_addr;
419 	new->size = size;
420 	kref_init(&new->ref);
421 
422 	/* gfn_cache maps gfn to struct gvt_dma. */
423 	link = &vdev->gfn_cache.rb_node;
424 	while (*link) {
425 		parent = *link;
426 		itr = rb_entry(parent, struct gvt_dma, gfn_node);
427 
428 		if (gfn < itr->gfn)
429 			link = &parent->rb_left;
430 		else
431 			link = &parent->rb_right;
432 	}
433 	rb_link_node(&new->gfn_node, parent, link);
434 	rb_insert_color(&new->gfn_node, &vdev->gfn_cache);
435 
436 	/* dma_addr_cache maps dma addr to struct gvt_dma. */
437 	parent = NULL;
438 	link = &vdev->dma_addr_cache.rb_node;
439 	while (*link) {
440 		parent = *link;
441 		itr = rb_entry(parent, struct gvt_dma, dma_addr_node);
442 
443 		if (dma_addr < itr->dma_addr)
444 			link = &parent->rb_left;
445 		else
446 			link = &parent->rb_right;
447 	}
448 	rb_link_node(&new->dma_addr_node, parent, link);
449 	rb_insert_color(&new->dma_addr_node, &vdev->dma_addr_cache);
450 
451 	vdev->nr_cache_entries++;
452 	return 0;
453 }
454 
455 static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu,
456 				struct gvt_dma *entry)
457 {
458 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
459 
460 	rb_erase(&entry->gfn_node, &vdev->gfn_cache);
461 	rb_erase(&entry->dma_addr_node, &vdev->dma_addr_cache);
462 	kfree(entry);
463 	vdev->nr_cache_entries--;
464 }
465 
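/*
 * Drop every cached mapping for this vgpu: walk gfn_cache, DMA-unmap and
 * unpin each entry, then remove it from both trees. cache_lock is re-taken
 * for each entry rather than held across the whole walk.
 */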
466 static void gvt_cache_destroy(struct intel_vgpu *vgpu)
467 {
468 	struct gvt_dma *dma;
469 	struct rb_node *node = NULL;
470 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
471 
472 	for (;;) {
473 		mutex_lock(&vdev->cache_lock);
474 		node = rb_first(&vdev->gfn_cache);
475 		if (!node) {
476 			mutex_unlock(&vdev->cache_lock);
477 			break;
478 		}
479 		dma = rb_entry(node, struct gvt_dma, gfn_node);
480 		gvt_dma_unmap_page(vgpu, dma->gfn, dma->dma_addr, dma->size);
481 		__gvt_cache_remove_entry(vgpu, dma);
482 		mutex_unlock(&vdev->cache_lock);
483 	}
484 }
485 
486 static void gvt_cache_init(struct intel_vgpu *vgpu)
487 {
488 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
489 
490 	vdev->gfn_cache = RB_ROOT;
491 	vdev->dma_addr_cache = RB_ROOT;
492 	vdev->nr_cache_entries = 0;
493 	mutex_init(&vdev->cache_lock);
494 }
495 
496 static void kvmgt_protect_table_init(struct kvmgt_guest_info *info)
497 {
498 	hash_init(info->ptable);
499 }
500 
501 static void kvmgt_protect_table_destroy(struct kvmgt_guest_info *info)
502 {
503 	struct kvmgt_pgfn *p;
504 	struct hlist_node *tmp;
505 	int i;
506 
507 	hash_for_each_safe(info->ptable, i, tmp, p, hnode) {
508 		hash_del(&p->hnode);
509 		kfree(p);
510 	}
511 }
512 
513 static struct kvmgt_pgfn *
514 __kvmgt_protect_table_find(struct kvmgt_guest_info *info, gfn_t gfn)
515 {
516 	struct kvmgt_pgfn *p, *res = NULL;
517 
518 	hash_for_each_possible(info->ptable, p, hnode, gfn) {
519 		if (gfn == p->gfn) {
520 			res = p;
521 			break;
522 		}
523 	}
524 
525 	return res;
526 }
527 
528 static bool kvmgt_gfn_is_write_protected(struct kvmgt_guest_info *info,
529 				gfn_t gfn)
530 {
531 	struct kvmgt_pgfn *p;
532 
533 	p = __kvmgt_protect_table_find(info, gfn);
534 	return !!p;
535 }
536 
537 static void kvmgt_protect_table_add(struct kvmgt_guest_info *info, gfn_t gfn)
538 {
539 	struct kvmgt_pgfn *p;
540 
541 	if (kvmgt_gfn_is_write_protected(info, gfn))
542 		return;
543 
544 	p = kzalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC);
545 	if (WARN(!p, "gfn: 0x%llx\n", gfn))
546 		return;
547 
548 	p->gfn = gfn;
549 	hash_add(info->ptable, &p->hnode, gfn);
550 }
551 
552 static void kvmgt_protect_table_del(struct kvmgt_guest_info *info,
553 				gfn_t gfn)
554 {
555 	struct kvmgt_pgfn *p;
556 
557 	p = __kvmgt_protect_table_find(info, gfn);
558 	if (p) {
559 		hash_del(&p->hnode);
560 		kfree(p);
561 	}
562 }
563 
564 static size_t intel_vgpu_reg_rw_opregion(struct intel_vgpu *vgpu, char *buf,
565 		size_t count, loff_t *ppos, bool iswrite)
566 {
567 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
568 	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
569 			VFIO_PCI_NUM_REGIONS;
570 	void *base = vdev->region[i].data;
571 	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
572 
573 
574 	if (pos >= vdev->region[i].size || iswrite) {
575 		gvt_vgpu_err("invalid op or offset for Intel vgpu OpRegion\n");
576 		return -EINVAL;
577 	}
578 	count = min(count, (size_t)(vdev->region[i].size - pos));
579 	memcpy(buf, base + pos, count);
580 
581 	return count;
582 }
583 
584 static void intel_vgpu_reg_release_opregion(struct intel_vgpu *vgpu,
585 		struct vfio_region *region)
586 {
587 }
588 
589 static const struct intel_vgpu_regops intel_vgpu_regops_opregion = {
590 	.rw = intel_vgpu_reg_rw_opregion,
591 	.release = intel_vgpu_reg_release_opregion,
592 };
593 
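/*
 * The EDID region is split in two: the vfio_region_gfx_edid control
 * registers sit at the start of the region and the EDID blob itself starts
 * at EDID_BLOB_OFFSET (half a page in). Writing link_state with
 * VFIO_DEVICE_GFX_LINK_STATE_UP validates the blob and emulates a hotplug
 * event.
 */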
594 static int handle_edid_regs(struct intel_vgpu *vgpu,
595 			struct vfio_edid_region *region, char *buf,
596 			size_t count, u16 offset, bool is_write)
597 {
598 	struct vfio_region_gfx_edid *regs = &region->vfio_edid_regs;
599 	unsigned int data;
600 
601 	if (offset + count > sizeof(*regs))
602 		return -EINVAL;
603 
604 	if (count != 4)
605 		return -EINVAL;
606 
607 	if (is_write) {
608 		data = *((unsigned int *)buf);
609 		switch (offset) {
610 		case offsetof(struct vfio_region_gfx_edid, link_state):
611 			if (data == VFIO_DEVICE_GFX_LINK_STATE_UP) {
612 				if (!drm_edid_block_valid(
613 					(u8 *)region->edid_blob,
614 					0,
615 					true,
616 					NULL)) {
617 					gvt_vgpu_err("invalid EDID blob\n");
618 					return -EINVAL;
619 				}
620 				intel_gvt_ops->emulate_hotplug(vgpu, true);
621 			} else if (data == VFIO_DEVICE_GFX_LINK_STATE_DOWN)
622 				intel_gvt_ops->emulate_hotplug(vgpu, false);
623 			else {
624 				gvt_vgpu_err("invalid EDID link state %d\n",
625 					regs->link_state);
626 				return -EINVAL;
627 			}
628 			regs->link_state = data;
629 			break;
630 		case offsetof(struct vfio_region_gfx_edid, edid_size):
631 			if (data > regs->edid_max_size) {
632 				gvt_vgpu_err("EDID size is bigger than %d!\n",
633 					regs->edid_max_size);
634 				return -EINVAL;
635 			}
636 			regs->edid_size = data;
637 			break;
638 		default:
639 			/* read-only regs */
640 			gvt_vgpu_err("write read-only EDID region at offset %d\n",
641 				offset);
642 			return -EPERM;
643 		}
644 	} else {
645 		memcpy(buf, (char *)regs + offset, count);
646 	}
647 
648 	return count;
649 }
650 
651 static int handle_edid_blob(struct vfio_edid_region *region, char *buf,
652 			size_t count, u16 offset, bool is_write)
653 {
654 	if (offset + count > region->vfio_edid_regs.edid_size)
655 		return -EINVAL;
656 
657 	if (is_write)
658 		memcpy(region->edid_blob + offset, buf, count);
659 	else
660 		memcpy(buf, region->edid_blob + offset, count);
661 
662 	return count;
663 }
664 
665 static size_t intel_vgpu_reg_rw_edid(struct intel_vgpu *vgpu, char *buf,
666 		size_t count, loff_t *ppos, bool iswrite)
667 {
668 	int ret;
669 	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
670 			VFIO_PCI_NUM_REGIONS;
671 	struct vfio_edid_region *region =
672 		(struct vfio_edid_region *)kvmgt_vdev(vgpu)->region[i].data;
673 	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
674 
675 	if (pos < region->vfio_edid_regs.edid_offset) {
676 		ret = handle_edid_regs(vgpu, region, buf, count, pos, iswrite);
677 	} else {
678 		pos -= EDID_BLOB_OFFSET;
679 		ret = handle_edid_blob(region, buf, count, pos, iswrite);
680 	}
681 
682 	if (ret < 0)
683 		gvt_vgpu_err("failed to access EDID region\n");
684 
685 	return ret;
686 }
687 
688 static void intel_vgpu_reg_release_edid(struct intel_vgpu *vgpu,
689 					struct vfio_region *region)
690 {
691 	kfree(region->data);
692 }
693 
694 static const struct intel_vgpu_regops intel_vgpu_regops_edid = {
695 	.rw = intel_vgpu_reg_rw_edid,
696 	.release = intel_vgpu_reg_release_edid,
697 };
698 
699 static int intel_vgpu_register_reg(struct intel_vgpu *vgpu,
700 		unsigned int type, unsigned int subtype,
701 		const struct intel_vgpu_regops *ops,
702 		size_t size, u32 flags, void *data)
703 {
704 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
705 	struct vfio_region *region;
706 
707 	region = krealloc(vdev->region,
708 			(vdev->num_regions + 1) * sizeof(*region),
709 			GFP_KERNEL);
710 	if (!region)
711 		return -ENOMEM;
712 
713 	vdev->region = region;
714 	vdev->region[vdev->num_regions].type = type;
715 	vdev->region[vdev->num_regions].subtype = subtype;
716 	vdev->region[vdev->num_regions].ops = ops;
717 	vdev->region[vdev->num_regions].size = size;
718 	vdev->region[vdev->num_regions].flags = flags;
719 	vdev->region[vdev->num_regions].data = data;
720 	vdev->num_regions++;
721 	return 0;
722 }
723 
724 static int kvmgt_get_vfio_device(void *p_vgpu)
725 {
726 	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
727 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
728 
729 	vdev->vfio_device = vfio_device_get_from_dev(
730 		mdev_dev(vdev->mdev));
731 	if (!vdev->vfio_device) {
732 		gvt_vgpu_err("failed to get vfio device\n");
733 		return -ENODEV;
734 	}
735 	return 0;
736 }
737 
738 
739 static int kvmgt_set_opregion(void *p_vgpu)
740 {
741 	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
742 	void *base;
743 	int ret;
744 
745 	/* Each vgpu has its own opregion, although VFIO will create another
746 	 * one later. This one is used to expose the opregion to VFIO; the
747 	 * one VFIO creates later is the one the guest actually uses.
748 	 */
749 	base = vgpu_opregion(vgpu)->va;
750 	if (!base)
751 		return -ENOMEM;
752 
753 	if (memcmp(base, OPREGION_SIGNATURE, 16)) {
754 		memunmap(base);
755 		return -EINVAL;
756 	}
757 
758 	ret = intel_vgpu_register_reg(vgpu,
759 			PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
760 			VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
761 			&intel_vgpu_regops_opregion, OPREGION_SIZE,
762 			VFIO_REGION_INFO_FLAG_READ, base);
763 
764 	return ret;
765 }
766 
767 static int kvmgt_set_edid(void *p_vgpu, int port_num)
768 {
769 	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
770 	struct intel_vgpu_port *port = intel_vgpu_port(vgpu, port_num);
771 	struct vfio_edid_region *base;
772 	int ret;
773 
774 	base = kzalloc(sizeof(*base), GFP_KERNEL);
775 	if (!base)
776 		return -ENOMEM;
777 
778 	/* TODO: Add multi-port and EDID extension block support */
779 	base->vfio_edid_regs.edid_offset = EDID_BLOB_OFFSET;
780 	base->vfio_edid_regs.edid_max_size = EDID_SIZE;
781 	base->vfio_edid_regs.edid_size = EDID_SIZE;
782 	base->vfio_edid_regs.max_xres = vgpu_edid_xres(port->id);
783 	base->vfio_edid_regs.max_yres = vgpu_edid_yres(port->id);
784 	base->edid_blob = port->edid->edid_block;
785 
786 	ret = intel_vgpu_register_reg(vgpu,
787 			VFIO_REGION_TYPE_GFX,
788 			VFIO_REGION_SUBTYPE_GFX_EDID,
789 			&intel_vgpu_regops_edid, EDID_SIZE,
790 			VFIO_REGION_INFO_FLAG_READ |
791 			VFIO_REGION_INFO_FLAG_WRITE |
792 			VFIO_REGION_INFO_FLAG_CAPS, base);
793 
794 	return ret;
795 }
796 
797 static void kvmgt_put_vfio_device(void *vgpu)
798 {
799 	struct kvmgt_vdev *vdev = kvmgt_vdev((struct intel_vgpu *)vgpu);
800 
801 	if (WARN_ON(!vdev->vfio_device))
802 		return;
803 
804 	vfio_device_put(vdev->vfio_device);
805 }
806 
807 static int intel_vgpu_create(struct mdev_device *mdev)
808 {
809 	struct intel_vgpu *vgpu = NULL;
810 	struct intel_vgpu_type *type;
811 	struct device *pdev;
812 	struct intel_gvt *gvt;
813 	int ret;
814 
815 	pdev = mdev_parent_dev(mdev);
816 	gvt = kdev_to_i915(pdev)->gvt;
817 
818 	type = &gvt->types[mdev_get_type_group_id(mdev)];
819 	if (!type) {
820 		ret = -EINVAL;
821 		goto out;
822 	}
823 
824 	vgpu = intel_gvt_ops->vgpu_create(gvt, type);
825 	if (IS_ERR_OR_NULL(vgpu)) {
826 		ret = vgpu == NULL ? -EFAULT : PTR_ERR(vgpu);
827 		gvt_err("failed to create intel vgpu: %d\n", ret);
828 		goto out;
829 	}
830 
831 	INIT_WORK(&kvmgt_vdev(vgpu)->release_work, intel_vgpu_release_work);
832 
833 	kvmgt_vdev(vgpu)->mdev = mdev;
834 	mdev_set_drvdata(mdev, vgpu);
835 
836 	gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n",
837 		     dev_name(mdev_dev(mdev)));
838 	ret = 0;
839 
840 out:
841 	return ret;
842 }
843 
844 static int intel_vgpu_remove(struct mdev_device *mdev)
845 {
846 	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
847 
848 	if (handle_valid(vgpu->handle))
849 		return -EBUSY;
850 
851 	intel_gvt_ops->vgpu_destroy(vgpu);
852 	return 0;
853 }
854 
855 static int intel_vgpu_iommu_notifier(struct notifier_block *nb,
856 				     unsigned long action, void *data)
857 {
858 	struct kvmgt_vdev *vdev = container_of(nb,
859 					       struct kvmgt_vdev,
860 					       iommu_notifier);
861 	struct intel_vgpu *vgpu = vdev->vgpu;
862 
863 	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
864 		struct vfio_iommu_type1_dma_unmap *unmap = data;
865 		struct gvt_dma *entry;
866 		unsigned long iov_pfn, end_iov_pfn;
867 
868 		iov_pfn = unmap->iova >> PAGE_SHIFT;
869 		end_iov_pfn = iov_pfn + unmap->size / PAGE_SIZE;
870 
871 		mutex_lock(&vdev->cache_lock);
872 		for (; iov_pfn < end_iov_pfn; iov_pfn++) {
873 			entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
874 			if (!entry)
875 				continue;
876 
877 			gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr,
878 					   entry->size);
879 			__gvt_cache_remove_entry(vgpu, entry);
880 		}
881 		mutex_unlock(&vdev->cache_lock);
882 	}
883 
884 	return NOTIFY_OK;
885 }
886 
887 static int intel_vgpu_group_notifier(struct notifier_block *nb,
888 				     unsigned long action, void *data)
889 {
890 	struct kvmgt_vdev *vdev = container_of(nb,
891 					       struct kvmgt_vdev,
892 					       group_notifier);
893 
894 	/* the only action we care about */
895 	if (action == VFIO_GROUP_NOTIFY_SET_KVM) {
896 		vdev->kvm = data;
897 
898 		if (!data)
899 			schedule_work(&vdev->release_work);
900 	}
901 
902 	return NOTIFY_OK;
903 }
904 
905 static int intel_vgpu_open_device(struct mdev_device *mdev)
906 {
907 	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
908 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
909 	unsigned long events;
910 	int ret;
911 	struct vfio_group *vfio_group;
912 
913 	vdev->iommu_notifier.notifier_call = intel_vgpu_iommu_notifier;
914 	vdev->group_notifier.notifier_call = intel_vgpu_group_notifier;
915 
916 	events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
917 	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, &events,
918 				&vdev->iommu_notifier);
919 	if (ret != 0) {
920 		gvt_vgpu_err("vfio_register_notifier for iommu failed: %d\n",
921 			ret);
922 		goto out;
923 	}
924 
925 	events = VFIO_GROUP_NOTIFY_SET_KVM;
926 	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, &events,
927 				&vdev->group_notifier);
928 	if (ret != 0) {
929 		gvt_vgpu_err("vfio_register_notifier for group failed: %d\n",
930 			ret);
931 		goto undo_iommu;
932 	}
933 
934 	vfio_group = vfio_group_get_external_user_from_dev(mdev_dev(mdev));
935 	if (IS_ERR_OR_NULL(vfio_group)) {
936 		ret = !vfio_group ? -EFAULT : PTR_ERR(vfio_group);
937 		gvt_vgpu_err("vfio_group_get_external_user_from_dev failed\n");
938 		goto undo_register;
939 	}
940 	vdev->vfio_group = vfio_group;
941 
942 	/* Take a module reference, as the mdev core doesn't take
943 	 * a reference on behalf of the vendor driver.
944 	 */
945 	if (!try_module_get(THIS_MODULE)) {
946 		ret = -ENODEV;
947 		goto undo_group;
948 	}
949 
950 	ret = kvmgt_guest_init(mdev);
951 	if (ret)
952 		goto undo_group;
953 
954 	intel_gvt_ops->vgpu_activate(vgpu);
955 
956 	atomic_set(&vdev->released, 0);
957 	return ret;
958 
959 undo_group:
960 	vfio_group_put_external_user(vdev->vfio_group);
961 	vdev->vfio_group = NULL;
962 
963 undo_register:
964 	vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
965 					&vdev->group_notifier);
966 
967 undo_iommu:
968 	vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
969 					&vdev->iommu_notifier);
970 out:
971 	return ret;
972 }
973 
974 static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu)
975 {
976 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
977 	struct eventfd_ctx *trigger;
978 
979 	trigger = vdev->msi_trigger;
980 	if (trigger) {
981 		eventfd_ctx_put(trigger);
982 		vdev->msi_trigger = NULL;
983 	}
984 }
985 
986 static void __intel_vgpu_release(struct intel_vgpu *vgpu)
987 {
988 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
989 	struct drm_i915_private *i915 = vgpu->gvt->gt->i915;
990 	struct kvmgt_guest_info *info;
991 	int ret;
992 
993 	if (!handle_valid(vgpu->handle))
994 		return;
995 
996 	if (atomic_cmpxchg(&vdev->released, 0, 1))
997 		return;
998 
999 	intel_gvt_ops->vgpu_release(vgpu);
1000 
1001 	ret = vfio_unregister_notifier(mdev_dev(vdev->mdev), VFIO_IOMMU_NOTIFY,
1002 					&vdev->iommu_notifier);
1003 	drm_WARN(&i915->drm, ret,
1004 		 "vfio_unregister_notifier for iommu failed: %d\n", ret);
1005 
1006 	ret = vfio_unregister_notifier(mdev_dev(vdev->mdev), VFIO_GROUP_NOTIFY,
1007 					&vdev->group_notifier);
1008 	drm_WARN(&i915->drm, ret,
1009 		 "vfio_unregister_notifier for group failed: %d\n", ret);
1010 
1011 	/* dereference module reference taken at open */
1012 	module_put(THIS_MODULE);
1013 
1014 	info = (struct kvmgt_guest_info *)vgpu->handle;
1015 	kvmgt_guest_exit(info);
1016 
1017 	intel_vgpu_release_msi_eventfd_ctx(vgpu);
1018 	vfio_group_put_external_user(vdev->vfio_group);
1019 
1020 	vdev->kvm = NULL;
1021 	vgpu->handle = 0;
1022 }
1023 
1024 static void intel_vgpu_close_device(struct mdev_device *mdev)
1025 {
1026 	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
1027 
1028 	__intel_vgpu_release(vgpu);
1029 }
1030 
1031 static void intel_vgpu_release_work(struct work_struct *work)
1032 {
1033 	struct kvmgt_vdev *vdev = container_of(work, struct kvmgt_vdev,
1034 					       release_work);
1035 
1036 	__intel_vgpu_release(vdev->vgpu);
1037 }
1038 
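/*
 * Read the guest-programmed base address of a BAR from the virtual config
 * space, using the next dword as the upper 32 bits for 64-bit memory BARs
 * and treating every other memory type as a 32-bit BAR.
 */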
1039 static u64 intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar)
1040 {
1041 	u32 start_lo, start_hi;
1042 	u32 mem_type;
1043 
1044 	start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
1045 			PCI_BASE_ADDRESS_MEM_MASK;
1046 	mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
1047 			PCI_BASE_ADDRESS_MEM_TYPE_MASK;
1048 
1049 	switch (mem_type) {
1050 	case PCI_BASE_ADDRESS_MEM_TYPE_64:
1051 		start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space
1052 						+ bar + 4));
1053 		break;
1054 	case PCI_BASE_ADDRESS_MEM_TYPE_32:
1055 	case PCI_BASE_ADDRESS_MEM_TYPE_1M:
1056 		/* 1M mem BAR treated as 32-bit BAR */
1057 	default:
1058 		/* unknown mem type treated as 32-bit BAR */
1059 		start_hi = 0;
1060 		break;
1061 	}
1062 
1063 	return ((u64)start_hi << 32) | start_lo;
1064 }
1065 
1066 static int intel_vgpu_bar_rw(struct intel_vgpu *vgpu, int bar, u64 off,
1067 			     void *buf, unsigned int count, bool is_write)
1068 {
1069 	u64 bar_start = intel_vgpu_get_bar_addr(vgpu, bar);
1070 	int ret;
1071 
1072 	if (is_write)
1073 		ret = intel_gvt_ops->emulate_mmio_write(vgpu,
1074 					bar_start + off, buf, count);
1075 	else
1076 		ret = intel_gvt_ops->emulate_mmio_read(vgpu,
1077 					bar_start + off, buf, count);
1078 	return ret;
1079 }
1080 
1081 static inline bool intel_vgpu_in_aperture(struct intel_vgpu *vgpu, u64 off)
1082 {
1083 	return off >= vgpu_aperture_offset(vgpu) &&
1084 	       off < vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu);
1085 }
1086 
1087 static int intel_vgpu_aperture_rw(struct intel_vgpu *vgpu, u64 off,
1088 		void *buf, unsigned long count, bool is_write)
1089 {
1090 	void __iomem *aperture_va;
1091 
1092 	if (!intel_vgpu_in_aperture(vgpu, off) ||
1093 	    !intel_vgpu_in_aperture(vgpu, off + count)) {
1094 		gvt_vgpu_err("Invalid aperture offset %llu\n", off);
1095 		return -EINVAL;
1096 	}
1097 
1098 	aperture_va = io_mapping_map_wc(&vgpu->gvt->gt->ggtt->iomap,
1099 					ALIGN_DOWN(off, PAGE_SIZE),
1100 					count + offset_in_page(off));
1101 	if (!aperture_va)
1102 		return -EIO;
1103 
1104 	if (is_write)
1105 		memcpy_toio(aperture_va + offset_in_page(off), buf, count);
1106 	else
1107 		memcpy_fromio(buf, aperture_va + offset_in_page(off), count);
1108 
1109 	io_mapping_unmap(aperture_va);
1110 
1111 	return 0;
1112 }
1113 
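/*
 * Central read/write dispatcher: the VFIO offset selects the region.
 * Config space and BAR0 accesses go through the GVT emulation hooks, BAR2
 * accesses are serviced directly from the vgpu's aperture, and indices
 * beyond the fixed VFIO PCI regions are handed to the device-specific
 * regions (OpRegion, EDID) registered via intel_vgpu_register_reg().
 */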
1114 static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf,
1115 			size_t count, loff_t *ppos, bool is_write)
1116 {
1117 	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
1118 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
1119 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
1120 	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
1121 	int ret = -EINVAL;
1122 
1123 
1124 	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) {
1125 		gvt_vgpu_err("invalid index: %u\n", index);
1126 		return -EINVAL;
1127 	}
1128 
1129 	switch (index) {
1130 	case VFIO_PCI_CONFIG_REGION_INDEX:
1131 		if (is_write)
1132 			ret = intel_gvt_ops->emulate_cfg_write(vgpu, pos,
1133 						buf, count);
1134 		else
1135 			ret = intel_gvt_ops->emulate_cfg_read(vgpu, pos,
1136 						buf, count);
1137 		break;
1138 	case VFIO_PCI_BAR0_REGION_INDEX:
1139 		ret = intel_vgpu_bar_rw(vgpu, PCI_BASE_ADDRESS_0, pos,
1140 					buf, count, is_write);
1141 		break;
1142 	case VFIO_PCI_BAR2_REGION_INDEX:
1143 		ret = intel_vgpu_aperture_rw(vgpu, pos, buf, count, is_write);
1144 		break;
1145 	case VFIO_PCI_BAR1_REGION_INDEX:
1146 	case VFIO_PCI_BAR3_REGION_INDEX:
1147 	case VFIO_PCI_BAR4_REGION_INDEX:
1148 	case VFIO_PCI_BAR5_REGION_INDEX:
1149 	case VFIO_PCI_VGA_REGION_INDEX:
1150 	case VFIO_PCI_ROM_REGION_INDEX:
1151 		break;
1152 	default:
1153 		if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
1154 			return -EINVAL;
1155 
1156 		index -= VFIO_PCI_NUM_REGIONS;
1157 		return vdev->region[index].ops->rw(vgpu, buf, count,
1158 				ppos, is_write);
1159 	}
1160 
1161 	return ret == 0 ? count : ret;
1162 }
1163 
1164 static bool gtt_entry(struct mdev_device *mdev, loff_t *ppos)
1165 {
1166 	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
1167 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
1168 	struct intel_gvt *gvt = vgpu->gvt;
1169 	int offset;
1170 
1171 	/* Only allow MMIO GGTT entry access */
1172 	if (index != PCI_BASE_ADDRESS_0)
1173 		return false;
1174 
1175 	offset = (u64)(*ppos & VFIO_PCI_OFFSET_MASK) -
1176 		intel_vgpu_get_bar_gpa(vgpu, PCI_BASE_ADDRESS_0);
1177 
1178 	return (offset >= gvt->device_info.gtt_start_offset &&
1179 		offset < gvt->device_info.gtt_start_offset + gvt_ggtt_sz(gvt)) ?
1180 			true : false;
1181 }
1182 
1183 static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf,
1184 			size_t count, loff_t *ppos)
1185 {
1186 	unsigned int done = 0;
1187 	int ret;
1188 
1189 	while (count) {
1190 		size_t filled;
1191 
1192 		/* 8-byte reads are only supported for GGTT entries */
1193 		if (count >= 8 && !(*ppos % 8) &&
1194 			gtt_entry(mdev, ppos)) {
1195 			u64 val;
1196 
1197 			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1198 					ppos, false);
1199 			if (ret <= 0)
1200 				goto read_err;
1201 
1202 			if (copy_to_user(buf, &val, sizeof(val)))
1203 				goto read_err;
1204 
1205 			filled = 8;
1206 		} else if (count >= 4 && !(*ppos % 4)) {
1207 			u32 val;
1208 
1209 			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1210 					ppos, false);
1211 			if (ret <= 0)
1212 				goto read_err;
1213 
1214 			if (copy_to_user(buf, &val, sizeof(val)))
1215 				goto read_err;
1216 
1217 			filled = 4;
1218 		} else if (count >= 2 && !(*ppos % 2)) {
1219 			u16 val;
1220 
1221 			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1222 					ppos, false);
1223 			if (ret <= 0)
1224 				goto read_err;
1225 
1226 			if (copy_to_user(buf, &val, sizeof(val)))
1227 				goto read_err;
1228 
1229 			filled = 2;
1230 		} else {
1231 			u8 val;
1232 
1233 			ret = intel_vgpu_rw(mdev, &val, sizeof(val), ppos,
1234 					false);
1235 			if (ret <= 0)
1236 				goto read_err;
1237 
1238 			if (copy_to_user(buf, &val, sizeof(val)))
1239 				goto read_err;
1240 
1241 			filled = 1;
1242 		}
1243 
1244 		count -= filled;
1245 		done += filled;
1246 		*ppos += filled;
1247 		buf += filled;
1248 	}
1249 
1250 	return done;
1251 
1252 read_err:
1253 	return -EFAULT;
1254 }
1255 
1256 static ssize_t intel_vgpu_write(struct mdev_device *mdev,
1257 				const char __user *buf,
1258 				size_t count, loff_t *ppos)
1259 {
1260 	unsigned int done = 0;
1261 	int ret;
1262 
1263 	while (count) {
1264 		size_t filled;
1265 
1266 		/* 8-byte writes are only supported for GGTT entries */
1267 		if (count >= 8 && !(*ppos % 8) &&
1268 			gtt_entry(mdev, ppos)) {
1269 			u64 val;
1270 
1271 			if (copy_from_user(&val, buf, sizeof(val)))
1272 				goto write_err;
1273 
1274 			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1275 					ppos, true);
1276 			if (ret <= 0)
1277 				goto write_err;
1278 
1279 			filled = 8;
1280 		} else if (count >= 4 && !(*ppos % 4)) {
1281 			u32 val;
1282 
1283 			if (copy_from_user(&val, buf, sizeof(val)))
1284 				goto write_err;
1285 
1286 			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1287 					ppos, true);
1288 			if (ret <= 0)
1289 				goto write_err;
1290 
1291 			filled = 4;
1292 		} else if (count >= 2 && !(*ppos % 2)) {
1293 			u16 val;
1294 
1295 			if (copy_from_user(&val, buf, sizeof(val)))
1296 				goto write_err;
1297 
1298 			ret = intel_vgpu_rw(mdev, (char *)&val,
1299 					sizeof(val), ppos, true);
1300 			if (ret <= 0)
1301 				goto write_err;
1302 
1303 			filled = 2;
1304 		} else {
1305 			u8 val;
1306 
1307 			if (copy_from_user(&val, buf, sizeof(val)))
1308 				goto write_err;
1309 
1310 			ret = intel_vgpu_rw(mdev, &val, sizeof(val),
1311 					ppos, true);
1312 			if (ret <= 0)
1313 				goto write_err;
1314 
1315 			filled = 1;
1316 		}
1317 
1318 		count -= filled;
1319 		done += filled;
1320 		*ppos += filled;
1321 		buf += filled;
1322 	}
1323 
1324 	return done;
1325 write_err:
1326 	return -EFAULT;
1327 }
1328 
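/*
 * Only BAR2 (the GGTT aperture) may be mmapped. The requested range must
 * fall entirely inside the vgpu's slice of the aperture; it is then
 * remapped to the corresponding host aperture pages.
 */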
1329 static int intel_vgpu_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
1330 {
1331 	unsigned int index;
1332 	u64 virtaddr;
1333 	unsigned long req_size, pgoff, req_start;
1334 	pgprot_t pg_prot;
1335 	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
1336 
1337 	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
1338 	if (index >= VFIO_PCI_ROM_REGION_INDEX)
1339 		return -EINVAL;
1340 
1341 	if (vma->vm_end < vma->vm_start)
1342 		return -EINVAL;
1343 	if ((vma->vm_flags & VM_SHARED) == 0)
1344 		return -EINVAL;
1345 	if (index != VFIO_PCI_BAR2_REGION_INDEX)
1346 		return -EINVAL;
1347 
1348 	pg_prot = vma->vm_page_prot;
1349 	virtaddr = vma->vm_start;
1350 	req_size = vma->vm_end - vma->vm_start;
1351 	pgoff = vma->vm_pgoff &
1352 		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
1353 	req_start = pgoff << PAGE_SHIFT;
1354 
1355 	if (!intel_vgpu_in_aperture(vgpu, req_start))
1356 		return -EINVAL;
1357 	if (req_start + req_size >
1358 	    vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu))
1359 		return -EINVAL;
1360 
1361 	pgoff = (gvt_aperture_pa_base(vgpu->gvt) >> PAGE_SHIFT) + pgoff;
1362 
1363 	return remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
1364 }
1365 
1366 static int intel_vgpu_get_irq_count(struct intel_vgpu *vgpu, int type)
1367 {
1368 	if (type == VFIO_PCI_INTX_IRQ_INDEX || type == VFIO_PCI_MSI_IRQ_INDEX)
1369 		return 1;
1370 
1371 	return 0;
1372 }
1373 
1374 static int intel_vgpu_set_intx_mask(struct intel_vgpu *vgpu,
1375 			unsigned int index, unsigned int start,
1376 			unsigned int count, u32 flags,
1377 			void *data)
1378 {
1379 	return 0;
1380 }
1381 
1382 static int intel_vgpu_set_intx_unmask(struct intel_vgpu *vgpu,
1383 			unsigned int index, unsigned int start,
1384 			unsigned int count, u32 flags, void *data)
1385 {
1386 	return 0;
1387 }
1388 
1389 static int intel_vgpu_set_intx_trigger(struct intel_vgpu *vgpu,
1390 		unsigned int index, unsigned int start, unsigned int count,
1391 		u32 flags, void *data)
1392 {
1393 	return 0;
1394 }
1395 
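/*
 * VFIO_IRQ_SET_DATA_EVENTFD installs the eventfd used to inject MSIs into
 * the guest (see kvmgt_inject_msi()); VFIO_IRQ_SET_DATA_NONE with a zero
 * count tears it down again.
 */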
1396 static int intel_vgpu_set_msi_trigger(struct intel_vgpu *vgpu,
1397 		unsigned int index, unsigned int start, unsigned int count,
1398 		u32 flags, void *data)
1399 {
1400 	struct eventfd_ctx *trigger;
1401 
1402 	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
1403 		int fd = *(int *)data;
1404 
1405 		trigger = eventfd_ctx_fdget(fd);
1406 		if (IS_ERR(trigger)) {
1407 			gvt_vgpu_err("eventfd_ctx_fdget failed\n");
1408 			return PTR_ERR(trigger);
1409 		}
1410 		kvmgt_vdev(vgpu)->msi_trigger = trigger;
1411 	} else if ((flags & VFIO_IRQ_SET_DATA_NONE) && !count)
1412 		intel_vgpu_release_msi_eventfd_ctx(vgpu);
1413 
1414 	return 0;
1415 }
1416 
1417 static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags,
1418 		unsigned int index, unsigned int start, unsigned int count,
1419 		void *data)
1420 {
1421 	int (*func)(struct intel_vgpu *vgpu, unsigned int index,
1422 			unsigned int start, unsigned int count, u32 flags,
1423 			void *data) = NULL;
1424 
1425 	switch (index) {
1426 	case VFIO_PCI_INTX_IRQ_INDEX:
1427 		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
1428 		case VFIO_IRQ_SET_ACTION_MASK:
1429 			func = intel_vgpu_set_intx_mask;
1430 			break;
1431 		case VFIO_IRQ_SET_ACTION_UNMASK:
1432 			func = intel_vgpu_set_intx_unmask;
1433 			break;
1434 		case VFIO_IRQ_SET_ACTION_TRIGGER:
1435 			func = intel_vgpu_set_intx_trigger;
1436 			break;
1437 		}
1438 		break;
1439 	case VFIO_PCI_MSI_IRQ_INDEX:
1440 		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
1441 		case VFIO_IRQ_SET_ACTION_MASK:
1442 		case VFIO_IRQ_SET_ACTION_UNMASK:
1443 			/* XXX Need masking support exported */
1444 			break;
1445 		case VFIO_IRQ_SET_ACTION_TRIGGER:
1446 			func = intel_vgpu_set_msi_trigger;
1447 			break;
1448 		}
1449 		break;
1450 	}
1451 
1452 	if (!func)
1453 		return -ENOTTY;
1454 
1455 	return func(vgpu, index, start, count, flags, data);
1456 }
1457 
1458 static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd,
1459 			     unsigned long arg)
1460 {
1461 	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
1462 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
1463 	unsigned long minsz;
1464 
1465 	gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd);
1466 
1467 	if (cmd == VFIO_DEVICE_GET_INFO) {
1468 		struct vfio_device_info info;
1469 
1470 		minsz = offsetofend(struct vfio_device_info, num_irqs);
1471 
1472 		if (copy_from_user(&info, (void __user *)arg, minsz))
1473 			return -EFAULT;
1474 
1475 		if (info.argsz < minsz)
1476 			return -EINVAL;
1477 
1478 		info.flags = VFIO_DEVICE_FLAGS_PCI;
1479 		info.flags |= VFIO_DEVICE_FLAGS_RESET;
1480 		info.num_regions = VFIO_PCI_NUM_REGIONS +
1481 				vdev->num_regions;
1482 		info.num_irqs = VFIO_PCI_NUM_IRQS;
1483 
1484 		return copy_to_user((void __user *)arg, &info, minsz) ?
1485 			-EFAULT : 0;
1486 
1487 	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
1488 		struct vfio_region_info info;
1489 		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
1490 		unsigned int i;
1491 		int ret;
1492 		struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
1493 		int nr_areas = 1;
1494 		int cap_type_id;
1495 
1496 		minsz = offsetofend(struct vfio_region_info, offset);
1497 
1498 		if (copy_from_user(&info, (void __user *)arg, minsz))
1499 			return -EFAULT;
1500 
1501 		if (info.argsz < minsz)
1502 			return -EINVAL;
1503 
1504 		switch (info.index) {
1505 		case VFIO_PCI_CONFIG_REGION_INDEX:
1506 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1507 			info.size = vgpu->gvt->device_info.cfg_space_size;
1508 			info.flags = VFIO_REGION_INFO_FLAG_READ |
1509 				     VFIO_REGION_INFO_FLAG_WRITE;
1510 			break;
1511 		case VFIO_PCI_BAR0_REGION_INDEX:
1512 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1513 			info.size = vgpu->cfg_space.bar[info.index].size;
1514 			if (!info.size) {
1515 				info.flags = 0;
1516 				break;
1517 			}
1518 
1519 			info.flags = VFIO_REGION_INFO_FLAG_READ |
1520 				     VFIO_REGION_INFO_FLAG_WRITE;
1521 			break;
1522 		case VFIO_PCI_BAR1_REGION_INDEX:
1523 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1524 			info.size = 0;
1525 			info.flags = 0;
1526 			break;
1527 		case VFIO_PCI_BAR2_REGION_INDEX:
1528 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1529 			info.flags = VFIO_REGION_INFO_FLAG_CAPS |
1530 					VFIO_REGION_INFO_FLAG_MMAP |
1531 					VFIO_REGION_INFO_FLAG_READ |
1532 					VFIO_REGION_INFO_FLAG_WRITE;
1533 			info.size = gvt_aperture_sz(vgpu->gvt);
1534 
1535 			sparse = kzalloc(struct_size(sparse, areas, nr_areas),
1536 					 GFP_KERNEL);
1537 			if (!sparse)
1538 				return -ENOMEM;
1539 
1540 			sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
1541 			sparse->header.version = 1;
1542 			sparse->nr_areas = nr_areas;
1543 			cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
1544 			sparse->areas[0].offset =
1545 					PAGE_ALIGN(vgpu_aperture_offset(vgpu));
1546 			sparse->areas[0].size = vgpu_aperture_sz(vgpu);
1547 			break;
1548 
1549 		case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
1550 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1551 			info.size = 0;
1552 			info.flags = 0;
1553 
1554 			gvt_dbg_core("get region info bar:%d\n", info.index);
1555 			break;
1556 
1557 		case VFIO_PCI_ROM_REGION_INDEX:
1558 		case VFIO_PCI_VGA_REGION_INDEX:
1559 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1560 			info.size = 0;
1561 			info.flags = 0;
1562 
1563 			gvt_dbg_core("get region info index:%d\n", info.index);
1564 			break;
1565 		default:
1566 			{
1567 				struct vfio_region_info_cap_type cap_type = {
1568 					.header.id = VFIO_REGION_INFO_CAP_TYPE,
1569 					.header.version = 1 };
1570 
1571 				if (info.index >= VFIO_PCI_NUM_REGIONS +
1572 						vdev->num_regions)
1573 					return -EINVAL;
1574 				info.index =
1575 					array_index_nospec(info.index,
1576 							VFIO_PCI_NUM_REGIONS +
1577 							vdev->num_regions);
1578 
1579 				i = info.index - VFIO_PCI_NUM_REGIONS;
1580 
1581 				info.offset =
1582 					VFIO_PCI_INDEX_TO_OFFSET(info.index);
1583 				info.size = vdev->region[i].size;
1584 				info.flags = vdev->region[i].flags;
1585 
1586 				cap_type.type = vdev->region[i].type;
1587 				cap_type.subtype = vdev->region[i].subtype;
1588 
1589 				ret = vfio_info_add_capability(&caps,
1590 							&cap_type.header,
1591 							sizeof(cap_type));
1592 				if (ret)
1593 					return ret;
1594 			}
1595 		}
1596 
1597 		if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
1598 			switch (cap_type_id) {
1599 			case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
1600 				ret = vfio_info_add_capability(&caps,
1601 					&sparse->header,
1602 					struct_size(sparse, areas,
1603 						    sparse->nr_areas));
1604 				if (ret) {
1605 					kfree(sparse);
1606 					return ret;
1607 				}
1608 				break;
1609 			default:
1610 				kfree(sparse);
1611 				return -EINVAL;
1612 			}
1613 		}
1614 
1615 		if (caps.size) {
1616 			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
1617 			if (info.argsz < sizeof(info) + caps.size) {
1618 				info.argsz = sizeof(info) + caps.size;
1619 				info.cap_offset = 0;
1620 			} else {
1621 				vfio_info_cap_shift(&caps, sizeof(info));
1622 				if (copy_to_user((void __user *)arg +
1623 						  sizeof(info), caps.buf,
1624 						  caps.size)) {
1625 					kfree(caps.buf);
1626 					kfree(sparse);
1627 					return -EFAULT;
1628 				}
1629 				info.cap_offset = sizeof(info);
1630 			}
1631 
1632 			kfree(caps.buf);
1633 		}
1634 
1635 		kfree(sparse);
1636 		return copy_to_user((void __user *)arg, &info, minsz) ?
1637 			-EFAULT : 0;
1638 	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
1639 		struct vfio_irq_info info;
1640 
1641 		minsz = offsetofend(struct vfio_irq_info, count);
1642 
1643 		if (copy_from_user(&info, (void __user *)arg, minsz))
1644 			return -EFAULT;
1645 
1646 		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
1647 			return -EINVAL;
1648 
1649 		switch (info.index) {
1650 		case VFIO_PCI_INTX_IRQ_INDEX:
1651 		case VFIO_PCI_MSI_IRQ_INDEX:
1652 			break;
1653 		default:
1654 			return -EINVAL;
1655 		}
1656 
1657 		info.flags = VFIO_IRQ_INFO_EVENTFD;
1658 
1659 		info.count = intel_vgpu_get_irq_count(vgpu, info.index);
1660 
1661 		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
1662 			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
1663 				       VFIO_IRQ_INFO_AUTOMASKED);
1664 		else
1665 			info.flags |= VFIO_IRQ_INFO_NORESIZE;
1666 
1667 		return copy_to_user((void __user *)arg, &info, minsz) ?
1668 			-EFAULT : 0;
1669 	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
1670 		struct vfio_irq_set hdr;
1671 		u8 *data = NULL;
1672 		int ret = 0;
1673 		size_t data_size = 0;
1674 
1675 		minsz = offsetofend(struct vfio_irq_set, count);
1676 
1677 		if (copy_from_user(&hdr, (void __user *)arg, minsz))
1678 			return -EFAULT;
1679 
1680 		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
1681 			int max = intel_vgpu_get_irq_count(vgpu, hdr.index);
1682 
1683 			ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
1684 						VFIO_PCI_NUM_IRQS, &data_size);
1685 			if (ret) {
1686 				gvt_vgpu_err("intel:vfio_set_irqs_validate_and_prepare failed\n");
1687 				return -EINVAL;
1688 			}
1689 			if (data_size) {
1690 				data = memdup_user((void __user *)(arg + minsz),
1691 						   data_size);
1692 				if (IS_ERR(data))
1693 					return PTR_ERR(data);
1694 			}
1695 		}
1696 
1697 		ret = intel_vgpu_set_irqs(vgpu, hdr.flags, hdr.index,
1698 					hdr.start, hdr.count, data);
1699 		kfree(data);
1700 
1701 		return ret;
1702 	} else if (cmd == VFIO_DEVICE_RESET) {
1703 		intel_gvt_ops->vgpu_reset(vgpu);
1704 		return 0;
1705 	} else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) {
1706 		struct vfio_device_gfx_plane_info dmabuf;
1707 		int ret = 0;
1708 
1709 		minsz = offsetofend(struct vfio_device_gfx_plane_info,
1710 				    dmabuf_id);
1711 		if (copy_from_user(&dmabuf, (void __user *)arg, minsz))
1712 			return -EFAULT;
1713 		if (dmabuf.argsz < minsz)
1714 			return -EINVAL;
1715 
1716 		ret = intel_gvt_ops->vgpu_query_plane(vgpu, &dmabuf);
1717 		if (ret != 0)
1718 			return ret;
1719 
1720 		return copy_to_user((void __user *)arg, &dmabuf, minsz) ?
1721 								-EFAULT : 0;
1722 	} else if (cmd == VFIO_DEVICE_GET_GFX_DMABUF) {
1723 		__u32 dmabuf_id;
1724 		__s32 dmabuf_fd;
1725 
1726 		if (get_user(dmabuf_id, (__u32 __user *)arg))
1727 			return -EFAULT;
1728 
1729 		dmabuf_fd = intel_gvt_ops->vgpu_get_dmabuf(vgpu, dmabuf_id);
1730 		return dmabuf_fd;
1731 
1732 	}
1733 
1734 	return -ENOTTY;
1735 }
1736 
1737 static ssize_t
1738 vgpu_id_show(struct device *dev, struct device_attribute *attr,
1739 	     char *buf)
1740 {
1741 	struct mdev_device *mdev = mdev_from_dev(dev);
1742 
1743 	if (mdev) {
1744 		struct intel_vgpu *vgpu = (struct intel_vgpu *)
1745 			mdev_get_drvdata(mdev);
1746 		return sprintf(buf, "%d\n", vgpu->id);
1747 	}
1748 	return sprintf(buf, "\n");
1749 }
1750 
1751 static DEVICE_ATTR_RO(vgpu_id);
1752 
1753 static struct attribute *intel_vgpu_attrs[] = {
1754 	&dev_attr_vgpu_id.attr,
1755 	NULL
1756 };
1757 
1758 static const struct attribute_group intel_vgpu_group = {
1759 	.name = "intel_vgpu",
1760 	.attrs = intel_vgpu_attrs,
1761 };
1762 
1763 static const struct attribute_group *intel_vgpu_groups[] = {
1764 	&intel_vgpu_group,
1765 	NULL,
1766 };
1767 
1768 static struct mdev_parent_ops intel_vgpu_ops = {
1769 	.mdev_attr_groups       = intel_vgpu_groups,
1770 	.create			= intel_vgpu_create,
1771 	.remove			= intel_vgpu_remove,
1772 
1773 	.open_device		= intel_vgpu_open_device,
1774 	.close_device		= intel_vgpu_close_device,
1775 
1776 	.read			= intel_vgpu_read,
1777 	.write			= intel_vgpu_write,
1778 	.mmap			= intel_vgpu_mmap,
1779 	.ioctl			= intel_vgpu_ioctl,
1780 };
1781 
1782 static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops)
1783 {
1784 	int ret;
1785 
1786 	ret = intel_gvt_init_vgpu_type_groups((struct intel_gvt *)gvt);
1787 	if (ret)
1788 		return ret;
1789 
1790 	intel_gvt_ops = ops;
1791 	intel_vgpu_ops.supported_type_groups = gvt_vgpu_type_groups;
1792 
1793 	ret = mdev_register_device(dev, &intel_vgpu_ops);
1794 	if (ret)
1795 		intel_gvt_cleanup_vgpu_type_groups((struct intel_gvt *)gvt);
1796 
1797 	return ret;
1798 }
1799 
1800 static void kvmgt_host_exit(struct device *dev, void *gvt)
1801 {
1802 	mdev_unregister_device(dev);
1803 	intel_gvt_cleanup_vgpu_type_groups((struct intel_gvt *)gvt);
1804 }
1805 
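/*
 * Ask KVM to write-protect a guest page and remember the gfn in the
 * protect table; subsequent guest writes to that page are delivered to
 * kvmgt_page_track_write() and forwarded to the GVT write-protect handler.
 */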
1806 static int kvmgt_page_track_add(unsigned long handle, u64 gfn)
1807 {
1808 	struct kvmgt_guest_info *info;
1809 	struct kvm *kvm;
1810 	struct kvm_memory_slot *slot;
1811 	int idx;
1812 
1813 	if (!handle_valid(handle))
1814 		return -ESRCH;
1815 
1816 	info = (struct kvmgt_guest_info *)handle;
1817 	kvm = info->kvm;
1818 
1819 	idx = srcu_read_lock(&kvm->srcu);
1820 	slot = gfn_to_memslot(kvm, gfn);
1821 	if (!slot) {
1822 		srcu_read_unlock(&kvm->srcu, idx);
1823 		return -EINVAL;
1824 	}
1825 
1826 	write_lock(&kvm->mmu_lock);
1827 
1828 	if (kvmgt_gfn_is_write_protected(info, gfn))
1829 		goto out;
1830 
1831 	kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1832 	kvmgt_protect_table_add(info, gfn);
1833 
1834 out:
1835 	write_unlock(&kvm->mmu_lock);
1836 	srcu_read_unlock(&kvm->srcu, idx);
1837 	return 0;
1838 }
1839 
1840 static int kvmgt_page_track_remove(unsigned long handle, u64 gfn)
1841 {
1842 	struct kvmgt_guest_info *info;
1843 	struct kvm *kvm;
1844 	struct kvm_memory_slot *slot;
1845 	int idx;
1846 
1847 	if (!handle_valid(handle))
1848 		return 0;
1849 
1850 	info = (struct kvmgt_guest_info *)handle;
1851 	kvm = info->kvm;
1852 
1853 	idx = srcu_read_lock(&kvm->srcu);
1854 	slot = gfn_to_memslot(kvm, gfn);
1855 	if (!slot) {
1856 		srcu_read_unlock(&kvm->srcu, idx);
1857 		return -EINVAL;
1858 	}
1859 
1860 	write_lock(&kvm->mmu_lock);
1861 
1862 	if (!kvmgt_gfn_is_write_protected(info, gfn))
1863 		goto out;
1864 
1865 	kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1866 	kvmgt_protect_table_del(info, gfn);
1867 
1868 out:
1869 	write_unlock(&kvm->mmu_lock);
1870 	srcu_read_unlock(&kvm->srcu, idx);
1871 	return 0;
1872 }
1873 
1874 static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1875 		const u8 *val, int len,
1876 		struct kvm_page_track_notifier_node *node)
1877 {
1878 	struct kvmgt_guest_info *info = container_of(node,
1879 					struct kvmgt_guest_info, track_node);
1880 
1881 	if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
1882 		intel_gvt_ops->write_protect_handler(info->vgpu, gpa,
1883 						     (void *)val, len);
1884 }
1885 
1886 static void kvmgt_page_track_flush_slot(struct kvm *kvm,
1887 		struct kvm_memory_slot *slot,
1888 		struct kvm_page_track_notifier_node *node)
1889 {
1890 	int i;
1891 	gfn_t gfn;
1892 	struct kvmgt_guest_info *info = container_of(node,
1893 					struct kvmgt_guest_info, track_node);
1894 
1895 	write_lock(&kvm->mmu_lock);
1896 	for (i = 0; i < slot->npages; i++) {
1897 		gfn = slot->base_gfn + i;
1898 		if (kvmgt_gfn_is_write_protected(info, gfn)) {
1899 			kvm_slot_page_track_remove_page(kvm, slot, gfn,
1900 						KVM_PAGE_TRACK_WRITE);
1901 			kvmgt_protect_table_del(info, gfn);
1902 		}
1903 	}
1904 	write_unlock(&kvm->mmu_lock);
1905 }
1906 
1907 static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm)
1908 {
1909 	struct intel_vgpu *itr;
1910 	struct kvmgt_guest_info *info;
1911 	int id;
1912 	bool ret = false;
1913 
1914 	mutex_lock(&vgpu->gvt->lock);
1915 	for_each_active_vgpu(vgpu->gvt, itr, id) {
1916 		if (!handle_valid(itr->handle))
1917 			continue;
1918 
1919 		info = (struct kvmgt_guest_info *)itr->handle;
1920 		if (kvm && kvm == info->kvm) {
1921 			ret = true;
1922 			goto out;
1923 		}
1924 	}
1925 out:
1926 	mutex_unlock(&vgpu->gvt->lock);
1927 	return ret;
1928 }
1929 
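/*
 * Called at device-open time once the KVM association is known: allocate
 * the kvmgt_guest_info, take a reference on the kvm, initialize the
 * protect table and DMA cache, register the page-track notifier and expose
 * the cache-entry count in debugfs.
 */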
1930 static int kvmgt_guest_init(struct mdev_device *mdev)
1931 {
1932 	struct kvmgt_guest_info *info;
1933 	struct intel_vgpu *vgpu;
1934 	struct kvmgt_vdev *vdev;
1935 	struct kvm *kvm;
1936 
1937 	vgpu = mdev_get_drvdata(mdev);
1938 	if (handle_valid(vgpu->handle))
1939 		return -EEXIST;
1940 
1941 	vdev = kvmgt_vdev(vgpu);
1942 	kvm = vdev->kvm;
1943 	if (!kvm || kvm->mm != current->mm) {
1944 		gvt_vgpu_err("KVM is required to use Intel vGPU\n");
1945 		return -ESRCH;
1946 	}
1947 
1948 	if (__kvmgt_vgpu_exist(vgpu, kvm))
1949 		return -EEXIST;
1950 
1951 	info = vzalloc(sizeof(struct kvmgt_guest_info));
1952 	if (!info)
1953 		return -ENOMEM;
1954 
1955 	vgpu->handle = (unsigned long)info;
1956 	info->vgpu = vgpu;
1957 	info->kvm = kvm;
1958 	kvm_get_kvm(info->kvm);
1959 
1960 	kvmgt_protect_table_init(info);
1961 	gvt_cache_init(vgpu);
1962 
1963 	info->track_node.track_write = kvmgt_page_track_write;
1964 	info->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
1965 	kvm_page_track_register_notifier(kvm, &info->track_node);
1966 
1967 	debugfs_create_ulong(KVMGT_DEBUGFS_FILENAME, 0444, vgpu->debugfs,
1968 			     &vdev->nr_cache_entries);
1969 	return 0;
1970 }
1971 
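/* Tear down everything set up by kvmgt_guest_init() for this guest. */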
1972 static bool kvmgt_guest_exit(struct kvmgt_guest_info *info)
1973 {
1974 	debugfs_remove(debugfs_lookup(KVMGT_DEBUGFS_FILENAME,
1975 				      info->vgpu->debugfs));
1976 
1977 	kvm_page_track_unregister_notifier(info->kvm, &info->track_node);
1978 	kvm_put_kvm(info->kvm);
1979 	kvmgt_protect_table_destroy(info);
1980 	gvt_cache_destroy(info->vgpu);
1981 	vfree(info);
1982 
1983 	return true;
1984 }
1985 
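/* Allocate and link the KVMGT-private per-vGPU state (struct kvmgt_vdev). */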
1986 static int kvmgt_attach_vgpu(void *p_vgpu, unsigned long *handle)
1987 {
1988 	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
1989 
1990 	vgpu->vdev = kzalloc(sizeof(struct kvmgt_vdev), GFP_KERNEL);
1991 
1992 	if (!vgpu->vdev)
1993 		return -ENOMEM;
1994 
1995 	kvmgt_vdev(vgpu)->vgpu = vgpu;
1996 
1997 	return 0;
1998 }
1999 
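/*
 * Counterpart of kvmgt_attach_vgpu(): release any VFIO device regions,
 * then free the per-vGPU state.
 */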
2000 static void kvmgt_detach_vgpu(void *p_vgpu)
2001 {
2002 	int i;
2003 	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
2004 	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
2005 
2006 	if (vdev->region) {
2007 		for (i = 0; i < vdev->num_regions; i++)
2008 			if (vdev->region[i].ops->release)
2009 				vdev->region[i].ops->release(vgpu,
2010 						&vdev->region[i]);
2011 		vdev->num_regions = 0;
2012 		kfree(vdev->region);
2013 		vdev->region = NULL;
2014 	}
2015 
2016 	/* always free the per-vGPU state allocated by kvmgt_attach_vgpu() */
2017 	kfree(vdev);
2018 }
2019 
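/*
 * Inject an MSI into the guest by signalling the eventfd registered as the
 * vGPU's MSI trigger.
 */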
2020 static int kvmgt_inject_msi(unsigned long handle, u32 addr, u16 data)
2021 {
2022 	struct kvmgt_guest_info *info;
2023 	struct intel_vgpu *vgpu;
2024 	struct kvmgt_vdev *vdev;
2025 
2026 	if (!handle_valid(handle))
2027 		return -ESRCH;
2028 
2029 	info = (struct kvmgt_guest_info *)handle;
2030 	vgpu = info->vgpu;
2031 	vdev = kvmgt_vdev(vgpu);
2032 
2033 	/*
2034 	 * When the guest powers off, msi_trigger is set to NULL, but the
2035 	 * vGPU's config space and MMIO registers are not restored to their
2036 	 * defaults. If this vGPU is reused by the next VM, its pipes may
2037 	 * still be enabled, so the vGPU will receive vblank interrupt
2038 	 * requests as soon as it becomes active. msi_trigger, however,
2039 	 * stays NULL until the guest enables MSI, so in that case return
2040 	 * success without injecting an interrupt into the guest.
2041 	 */
2042 	if (vdev->msi_trigger == NULL)
2043 		return 0;
2044 
2045 	if (eventfd_signal(vdev->msi_trigger, 1) == 1)
2046 		return 0;
2047 
2048 	return -EFAULT;
2049 }
2050 
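/* Translate a guest frame number to a host pfn through KVM's memslots. */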
2051 static unsigned long kvmgt_gfn_to_pfn(unsigned long handle, unsigned long gfn)
2052 {
2053 	struct kvmgt_guest_info *info;
2054 	kvm_pfn_t pfn;
2055 
2056 	if (!handle_valid(handle))
2057 		return INTEL_GVT_INVALID_ADDR;
2058 
2059 	info = (struct kvmgt_guest_info *)handle;
2060 
2061 	pfn = gfn_to_pfn(info->kvm, gfn);
2062 	if (is_error_noslot_pfn(pfn))
2063 		return INTEL_GVT_INVALID_ADDR;
2064 
2065 	return pfn;
2066 }
2067 
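/*
 * Map a guest page for DMA and cache the mapping. A cache hit just takes
 * an extra reference; a hit with a different size is unmapped and
 * re-mapped at the new size.
 */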
2068 static int kvmgt_dma_map_guest_page(unsigned long handle, unsigned long gfn,
2069 		unsigned long size, dma_addr_t *dma_addr)
2070 {
2071 	struct intel_vgpu *vgpu;
2072 	struct kvmgt_vdev *vdev;
2073 	struct gvt_dma *entry;
2074 	int ret;
2075 
2076 	if (!handle_valid(handle))
2077 		return -EINVAL;
2078 
2079 	vgpu = ((struct kvmgt_guest_info *)handle)->vgpu;
2080 	vdev = kvmgt_vdev(vgpu);
2081 
2082 	mutex_lock(&vdev->cache_lock);
2083 
2084 	entry = __gvt_cache_find_gfn(vgpu, gfn);
2085 	if (!entry) {
2086 		ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
2087 		if (ret)
2088 			goto err_unlock;
2089 
2090 		ret = __gvt_cache_add(vgpu, gfn, *dma_addr, size);
2091 		if (ret)
2092 			goto err_unmap;
2093 	} else if (entry->size != size) {
2094 		/* same gfn mapped with a different size: unmap and re-map */
2095 		gvt_dma_unmap_page(vgpu, gfn, entry->dma_addr, entry->size);
2096 		__gvt_cache_remove_entry(vgpu, entry);
2097 
2098 		ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
2099 		if (ret)
2100 			goto err_unlock;
2101 
2102 		ret = __gvt_cache_add(vgpu, gfn, *dma_addr, size);
2103 		if (ret)
2104 			goto err_unmap;
2105 	} else {
2106 		kref_get(&entry->ref);
2107 		*dma_addr = entry->dma_addr;
2108 	}
2109 
2110 	mutex_unlock(&vdev->cache_lock);
2111 	return 0;
2112 
2113 err_unmap:
2114 	gvt_dma_unmap_page(vgpu, gfn, *dma_addr, size);
2115 err_unlock:
2116 	mutex_unlock(&vdev->cache_lock);
2117 	return ret;
2118 }
2119 
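/* Pin an existing DMA mapping, looked up by dma_addr, by taking an extra reference. */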
2120 static int kvmgt_dma_pin_guest_page(unsigned long handle, dma_addr_t dma_addr)
2121 {
2122 	struct kvmgt_guest_info *info;
2123 	struct kvmgt_vdev *vdev;
2124 	struct gvt_dma *entry;
2125 	int ret = 0;
2126 
2127 	if (!handle_valid(handle))
2128 		return -ENODEV;
2129 
2130 	info = (struct kvmgt_guest_info *)handle;
2131 	vdev = kvmgt_vdev(info->vgpu);
2132 
2133 	mutex_lock(&vdev->cache_lock);
2134 	entry = __gvt_cache_find_dma_addr(info->vgpu, dma_addr);
2135 	if (entry)
2136 		kref_get(&entry->ref);
2137 	else
2138 		ret = -ENOMEM;
2139 	mutex_unlock(&vdev->cache_lock);
2140 
2141 	return ret;
2142 }
2143 
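/* kref release callback: unmap the page and remove its cache entry. */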
2144 static void __gvt_dma_release(struct kref *ref)
2145 {
2146 	struct gvt_dma *entry = container_of(ref, typeof(*entry), ref);
2147 
2148 	gvt_dma_unmap_page(entry->vgpu, entry->gfn, entry->dma_addr,
2149 			   entry->size);
2150 	__gvt_cache_remove_entry(entry->vgpu, entry);
2151 }
2152 
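/* Drop one reference on the cached DMA mapping; the final put unmaps the page. */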
2153 static void kvmgt_dma_unmap_guest_page(unsigned long handle, dma_addr_t dma_addr)
2154 {
2155 	struct intel_vgpu *vgpu;
2156 	struct kvmgt_vdev *vdev;
2157 	struct gvt_dma *entry;
2158 
2159 	if (!handle_valid(handle))
2160 		return;
2161 
2162 	vgpu = ((struct kvmgt_guest_info *)handle)->vgpu;
2163 	vdev = kvmgt_vdev(vgpu);
2164 
2165 	mutex_lock(&vdev->cache_lock);
2166 	entry = __gvt_cache_find_dma_addr(vgpu, dma_addr);
2167 	if (entry)
2168 		kref_put(&entry->ref, __gvt_dma_release);
2169 	mutex_unlock(&vdev->cache_lock);
2170 }
2171 
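/* Access guest physical memory through vfio_dma_rw() on the vGPU's VFIO group. */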
2172 static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
2173 			void *buf, unsigned long len, bool write)
2174 {
2175 	struct kvmgt_guest_info *info;
2176 
2177 	if (!handle_valid(handle))
2178 		return -ESRCH;
2179 
2180 	info = (struct kvmgt_guest_info *)handle;
2181 
2182 	return vfio_dma_rw(kvmgt_vdev(info->vgpu)->vfio_group,
2183 			   gpa, buf, len, write);
2184 }
2185 
2186 static int kvmgt_read_gpa(unsigned long handle, unsigned long gpa,
2187 			void *buf, unsigned long len)
2188 {
2189 	return kvmgt_rw_gpa(handle, gpa, buf, len, false);
2190 }
2191 
2192 static int kvmgt_write_gpa(unsigned long handle, unsigned long gpa,
2193 			void *buf, unsigned long len)
2194 {
2195 	return kvmgt_rw_gpa(handle, gpa, buf, len, true);
2196 }
2197 
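/* Translate a host kernel virtual address into its page frame number. */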
2198 static unsigned long kvmgt_virt_to_pfn(void *addr)
2199 {
2200 	return PFN_DOWN(__pa(addr));
2201 }
2202 
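/* Check, under SRCU, whether the gfn is backed by a memslot visible to the guest. */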
2203 static bool kvmgt_is_valid_gfn(unsigned long handle, unsigned long gfn)
2204 {
2205 	struct kvmgt_guest_info *info;
2206 	struct kvm *kvm;
2207 	int idx;
2208 	bool ret;
2209 
2210 	if (!handle_valid(handle))
2211 		return false;
2212 
2213 	info = (struct kvmgt_guest_info *)handle;
2214 	kvm = info->kvm;
2215 
2216 	idx = srcu_read_lock(&kvm->srcu);
2217 	ret = kvm_is_visible_gfn(kvm, gfn);
2218 	srcu_read_unlock(&kvm->srcu, idx);
2219 
2220 	return ret;
2221 }
2222 
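/*
 * The mediated pass-through (MPT) hooks that this module registers with
 * the GVT core as its KVM backend.
 */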
2223 static const struct intel_gvt_mpt kvmgt_mpt = {
2224 	.type = INTEL_GVT_HYPERVISOR_KVM,
2225 	.host_init = kvmgt_host_init,
2226 	.host_exit = kvmgt_host_exit,
2227 	.attach_vgpu = kvmgt_attach_vgpu,
2228 	.detach_vgpu = kvmgt_detach_vgpu,
2229 	.inject_msi = kvmgt_inject_msi,
2230 	.from_virt_to_mfn = kvmgt_virt_to_pfn,
2231 	.enable_page_track = kvmgt_page_track_add,
2232 	.disable_page_track = kvmgt_page_track_remove,
2233 	.read_gpa = kvmgt_read_gpa,
2234 	.write_gpa = kvmgt_write_gpa,
2235 	.gfn_to_mfn = kvmgt_gfn_to_pfn,
2236 	.dma_map_guest_page = kvmgt_dma_map_guest_page,
2237 	.dma_unmap_guest_page = kvmgt_dma_unmap_guest_page,
2238 	.dma_pin_guest_page = kvmgt_dma_pin_guest_page,
2239 	.set_opregion = kvmgt_set_opregion,
2240 	.set_edid = kvmgt_set_edid,
2241 	.get_vfio_device = kvmgt_get_vfio_device,
2242 	.put_vfio_device = kvmgt_put_vfio_device,
2243 	.is_valid_gfn = kvmgt_is_valid_gfn,
2244 };
2245 
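/* Module init: register the MPT ops above as the GVT hypervisor backend. */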
2246 static int __init kvmgt_init(void)
2247 {
2248 	if (intel_gvt_register_hypervisor(&kvmgt_mpt) < 0)
2249 		return -ENODEV;
2250 	return 0;
2251 }
2252 
2253 static void __exit kvmgt_exit(void)
2254 {
2255 	intel_gvt_unregister_hypervisor();
2256 }
2257 
2258 module_init(kvmgt_init);
2259 module_exit(kvmgt_exit);
2260 
2261 MODULE_LICENSE("GPL and additional rights");
2262 MODULE_AUTHOR("Intel Corporation");
2263