xref: /openbmc/linux/drivers/gpu/drm/i915/gvt/kvmgt.c (revision 6f0c460f)
1 /*
2  * KVMGT - the implementation of Intel mediated pass-through framework for KVM
3  *
4  * Copyright(c) 2011-2016 Intel Corporation. All rights reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23  * SOFTWARE.
24  *
25  * Authors:
26  *    Kevin Tian <kevin.tian@intel.com>
27  *    Jike Song <jike.song@intel.com>
28  *    Xiaoguang Chen <xiaoguang.chen@intel.com>
29  *    Eddie Dong <eddie.dong@intel.com>
30  *
31  * Contributors:
32  *    Niu Bing <bing.niu@intel.com>
33  *    Zhi Wang <zhi.a.wang@intel.com>
34  */
35 
36 #include <linux/init.h>
37 #include <linux/device.h>
38 #include <linux/mm.h>
39 #include <linux/kthread.h>
40 #include <linux/sched/mm.h>
41 #include <linux/types.h>
42 #include <linux/list.h>
43 #include <linux/rbtree.h>
44 #include <linux/spinlock.h>
45 #include <linux/eventfd.h>
46 #include <linux/uuid.h>
47 #include <linux/mdev.h>
48 #include <linux/debugfs.h>
49 
50 #include <linux/nospec.h>
51 
52 #include <drm/drm_edid.h>
53 
54 #include "i915_drv.h"
55 #include "intel_gvt.h"
56 #include "gvt.h"
57 
58 MODULE_IMPORT_NS(DMA_BUF);
59 MODULE_IMPORT_NS(I915_GVT);
60 
61 /* helper macros copied from vfio-pci */
62 #define VFIO_PCI_OFFSET_SHIFT   40
63 #define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
64 #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
65 #define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
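/*
 * For example (an illustrative note on the encoding above, not taken from
 * vfio-pci itself): accesses to BAR2 use the device file offset
 * VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR2_REGION_INDEX) == 2ULL << 40 ==
 * 0x20000000000, and the low 40 bits of an offset (VFIO_PCI_OFFSET_MASK ==
 * 0xffffffffff) carry the byte offset within that region.
 */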
66 
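/*
 * Layout of the vendor EDID region registered below: the
 * struct vfio_region_gfx_edid control registers sit at offset 0 and the
 * EDID blob itself is exposed starting at EDID_BLOB_OFFSET, i.e. half a
 * page into the region (2KiB when PAGE_SIZE is 4KiB).
 */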
67 #define EDID_BLOB_OFFSET (PAGE_SIZE/2)
68 
69 #define OPREGION_SIGNATURE "IntelGraphicsMem"
70 
71 struct vfio_region;
72 struct intel_vgpu_regops {
73 	size_t (*rw)(struct intel_vgpu *vgpu, char *buf,
74 			size_t count, loff_t *ppos, bool iswrite);
75 	void (*release)(struct intel_vgpu *vgpu,
76 			struct vfio_region *region);
77 };
78 
79 struct vfio_region {
80 	u32				type;
81 	u32				subtype;
82 	size_t				size;
83 	u32				flags;
84 	const struct intel_vgpu_regops	*ops;
85 	void				*data;
86 };
87 
88 struct vfio_edid_region {
89 	struct vfio_region_gfx_edid vfio_edid_regs;
90 	void *edid_blob;
91 };
92 
93 struct kvmgt_pgfn {
94 	gfn_t gfn;
95 	struct hlist_node hnode;
96 };
97 
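/*
 * Each pinned guest page with a live DMA mapping is tracked by a gvt_dma
 * entry that is indexed twice: by gfn in vgpu->gfn_cache and by dma_addr
 * in vgpu->dma_addr_cache (see __gvt_cache_add() below), so lookups from
 * either direction stay O(log n).
 */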
98 struct gvt_dma {
99 	struct intel_vgpu *vgpu;
100 	struct rb_node gfn_node;
101 	struct rb_node dma_addr_node;
102 	gfn_t gfn;
103 	dma_addr_t dma_addr;
104 	unsigned long size;
105 	struct kref ref;
106 };
107 
108 #define vfio_dev_to_vgpu(vfio_dev) \
109 	container_of((vfio_dev), struct intel_vgpu, vfio_device)
110 
111 static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
112 		const u8 *val, int len,
113 		struct kvm_page_track_notifier_node *node);
114 static void kvmgt_page_track_flush_slot(struct kvm *kvm,
115 		struct kvm_memory_slot *slot,
116 		struct kvm_page_track_notifier_node *node);
117 
118 static ssize_t available_instances_show(struct mdev_type *mtype,
119 					struct mdev_type_attribute *attr,
120 					char *buf)
121 {
122 	struct intel_vgpu_type *type;
123 	unsigned int num = 0;
124 	struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt;
125 
126 	type = &gvt->types[mtype_get_type_group_id(mtype)];
127 	if (!type)
128 		num = 0;
129 	else
130 		num = type->avail_instance;
131 
132 	return sprintf(buf, "%u\n", num);
133 }
134 
135 static ssize_t device_api_show(struct mdev_type *mtype,
136 			       struct mdev_type_attribute *attr, char *buf)
137 {
138 	return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
139 }
140 
141 static ssize_t description_show(struct mdev_type *mtype,
142 				struct mdev_type_attribute *attr, char *buf)
143 {
144 	struct intel_vgpu_type *type;
145 	struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt;
146 
147 	type = &gvt->types[mtype_get_type_group_id(mtype)];
148 	if (!type)
149 		return 0;
150 
151 	return sprintf(buf, "low_gm_size: %dMB\nhigh_gm_size: %dMB\n"
152 		       "fence: %d\nresolution: %s\n"
153 		       "weight: %d\n",
154 		       BYTES_TO_MB(type->low_gm_size),
155 		       BYTES_TO_MB(type->high_gm_size),
156 		       type->fence, vgpu_edid_str(type->resolution),
157 		       type->weight);
158 }
159 
160 static ssize_t name_show(struct mdev_type *mtype,
161 			 struct mdev_type_attribute *attr, char *buf)
162 {
163 	struct intel_vgpu_type *type;
164 	struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt;
165 
166 	type = &gvt->types[mtype_get_type_group_id(mtype)];
167 	if (!type)
168 		return 0;
169 
170 	return sprintf(buf, "%s\n", type->name);
171 }
172 
173 static MDEV_TYPE_ATTR_RO(available_instances);
174 static MDEV_TYPE_ATTR_RO(device_api);
175 static MDEV_TYPE_ATTR_RO(description);
176 static MDEV_TYPE_ATTR_RO(name);
177 
178 static struct attribute *gvt_type_attrs[] = {
179 	&mdev_type_attr_available_instances.attr,
180 	&mdev_type_attr_device_api.attr,
181 	&mdev_type_attr_description.attr,
182 	&mdev_type_attr_name.attr,
183 	NULL,
184 };
185 
186 static struct attribute_group *gvt_vgpu_type_groups[] = {
187 	[0 ... NR_MAX_INTEL_VGPU_TYPES - 1] = NULL,
188 };
189 
190 static int intel_gvt_init_vgpu_type_groups(struct intel_gvt *gvt)
191 {
192 	int i, j;
193 	struct intel_vgpu_type *type;
194 	struct attribute_group *group;
195 
196 	for (i = 0; i < gvt->num_types; i++) {
197 		type = &gvt->types[i];
198 
199 		group = kzalloc(sizeof(struct attribute_group), GFP_KERNEL);
200 		if (!group)
201 			goto unwind;
202 
203 		group->name = type->name;
204 		group->attrs = gvt_type_attrs;
205 		gvt_vgpu_type_groups[i] = group;
206 	}
207 
208 	return 0;
209 
210 unwind:
211 	for (j = 0; j < i; j++) {
212 		group = gvt_vgpu_type_groups[j];
213 		kfree(group);
214 	}
215 
216 	return -ENOMEM;
217 }
218 
219 static void intel_gvt_cleanup_vgpu_type_groups(struct intel_gvt *gvt)
220 {
221 	int i;
222 	struct attribute_group *group;
223 
224 	for (i = 0; i < gvt->num_types; i++) {
225 		group = gvt_vgpu_type_groups[i];
226 		gvt_vgpu_type_groups[i] = NULL;
227 		kfree(group);
228 	}
229 }
230 
231 static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
232 		unsigned long size)
233 {
234 	struct drm_i915_private *i915 = vgpu->gvt->gt->i915;
235 	int total_pages;
236 	int npage;
237 	int ret;
238 
239 	total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
240 
241 	for (npage = 0; npage < total_pages; npage++) {
242 		unsigned long cur_gfn = gfn + npage;
243 
244 		ret = vfio_unpin_pages(&vgpu->vfio_device, &cur_gfn, 1);
245 		drm_WARN_ON(&i915->drm, ret != 1);
246 	}
247 }
248 
249 /* Pin a normal or compound guest page for DMA. */
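/*
 * For example, with 4KiB base pages a 2MiB compound guest page is pinned
 * as 2MiB / 4KiB = 512 consecutive gfns in the loop below, while a normal
 * page needs a single vfio_pin_pages() call.
 */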
250 static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
251 		unsigned long size, struct page **page)
252 {
253 	unsigned long base_pfn = 0;
254 	int total_pages;
255 	int npage;
256 	int ret;
257 
258 	total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
259 	/*
260 	 * We pin the pages one by one to avoid allocating a big array
261 	 * on the stack to hold pfns.
262 	 */
263 	for (npage = 0; npage < total_pages; npage++) {
264 		unsigned long cur_gfn = gfn + npage;
265 		unsigned long pfn;
266 
267 		ret = vfio_pin_pages(&vgpu->vfio_device, &cur_gfn, 1,
268 				     IOMMU_READ | IOMMU_WRITE, &pfn);
269 		if (ret != 1) {
270 			gvt_vgpu_err("vfio_pin_pages failed for gfn 0x%lx, ret %d\n",
271 				     cur_gfn, ret);
272 			goto err;
273 		}
274 
275 		if (!pfn_valid(pfn)) {
276 			gvt_vgpu_err("pfn 0x%lx is not mem backed\n", pfn);
277 			npage++;
278 			ret = -EFAULT;
279 			goto err;
280 		}
281 
282 		if (npage == 0)
283 			base_pfn = pfn;
284 		else if (base_pfn + npage != pfn) {
285 			gvt_vgpu_err("The pages are not contiguous\n");
286 			ret = -EINVAL;
287 			npage++;
288 			goto err;
289 		}
290 	}
291 
292 	*page = pfn_to_page(base_pfn);
293 	return 0;
294 err:
295 	gvt_unpin_guest_page(vgpu, gfn, npage * PAGE_SIZE);
296 	return ret;
297 }
298 
299 static int gvt_dma_map_page(struct intel_vgpu *vgpu, unsigned long gfn,
300 		dma_addr_t *dma_addr, unsigned long size)
301 {
302 	struct device *dev = vgpu->gvt->gt->i915->drm.dev;
303 	struct page *page = NULL;
304 	int ret;
305 
306 	ret = gvt_pin_guest_page(vgpu, gfn, size, &page);
307 	if (ret)
308 		return ret;
309 
310 	/* Set up the DMA mapping. */
311 	*dma_addr = dma_map_page(dev, page, 0, size, DMA_BIDIRECTIONAL);
312 	if (dma_mapping_error(dev, *dma_addr)) {
313 		gvt_vgpu_err("DMA mapping failed for pfn 0x%lx, ret %d\n",
314 			     page_to_pfn(page), ret);
315 		gvt_unpin_guest_page(vgpu, gfn, size);
316 		return -ENOMEM;
317 	}
318 
319 	return 0;
320 }
321 
322 static void gvt_dma_unmap_page(struct intel_vgpu *vgpu, unsigned long gfn,
323 		dma_addr_t dma_addr, unsigned long size)
324 {
325 	struct device *dev = vgpu->gvt->gt->i915->drm.dev;
326 
327 	dma_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL);
328 	gvt_unpin_guest_page(vgpu, gfn, size);
329 }
330 
331 static struct gvt_dma *__gvt_cache_find_dma_addr(struct intel_vgpu *vgpu,
332 		dma_addr_t dma_addr)
333 {
334 	struct rb_node *node = vgpu->dma_addr_cache.rb_node;
335 	struct gvt_dma *itr;
336 
337 	while (node) {
338 		itr = rb_entry(node, struct gvt_dma, dma_addr_node);
339 
340 		if (dma_addr < itr->dma_addr)
341 			node = node->rb_left;
342 		else if (dma_addr > itr->dma_addr)
343 			node = node->rb_right;
344 		else
345 			return itr;
346 	}
347 	return NULL;
348 }
349 
350 static struct gvt_dma *__gvt_cache_find_gfn(struct intel_vgpu *vgpu, gfn_t gfn)
351 {
352 	struct rb_node *node = vgpu->gfn_cache.rb_node;
353 	struct gvt_dma *itr;
354 
355 	while (node) {
356 		itr = rb_entry(node, struct gvt_dma, gfn_node);
357 
358 		if (gfn < itr->gfn)
359 			node = node->rb_left;
360 		else if (gfn > itr->gfn)
361 			node = node->rb_right;
362 		else
363 			return itr;
364 	}
365 	return NULL;
366 }
367 
368 static int __gvt_cache_add(struct intel_vgpu *vgpu, gfn_t gfn,
369 		dma_addr_t dma_addr, unsigned long size)
370 {
371 	struct gvt_dma *new, *itr;
372 	struct rb_node **link, *parent = NULL;
373 
374 	new = kzalloc(sizeof(struct gvt_dma), GFP_KERNEL);
375 	if (!new)
376 		return -ENOMEM;
377 
378 	new->vgpu = vgpu;
379 	new->gfn = gfn;
380 	new->dma_addr = dma_addr;
381 	new->size = size;
382 	kref_init(&new->ref);
383 
384 	/* gfn_cache maps gfn to struct gvt_dma. */
385 	link = &vgpu->gfn_cache.rb_node;
386 	while (*link) {
387 		parent = *link;
388 		itr = rb_entry(parent, struct gvt_dma, gfn_node);
389 
390 		if (gfn < itr->gfn)
391 			link = &parent->rb_left;
392 		else
393 			link = &parent->rb_right;
394 	}
395 	rb_link_node(&new->gfn_node, parent, link);
396 	rb_insert_color(&new->gfn_node, &vgpu->gfn_cache);
397 
398 	/* dma_addr_cache maps dma addr to struct gvt_dma. */
399 	parent = NULL;
400 	link = &vgpu->dma_addr_cache.rb_node;
401 	while (*link) {
402 		parent = *link;
403 		itr = rb_entry(parent, struct gvt_dma, dma_addr_node);
404 
405 		if (dma_addr < itr->dma_addr)
406 			link = &parent->rb_left;
407 		else
408 			link = &parent->rb_right;
409 	}
410 	rb_link_node(&new->dma_addr_node, parent, link);
411 	rb_insert_color(&new->dma_addr_node, &vgpu->dma_addr_cache);
412 
413 	vgpu->nr_cache_entries++;
414 	return 0;
415 }
416 
417 static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu,
418 				struct gvt_dma *entry)
419 {
420 	rb_erase(&entry->gfn_node, &vgpu->gfn_cache);
421 	rb_erase(&entry->dma_addr_node, &vgpu->dma_addr_cache);
422 	kfree(entry);
423 	vgpu->nr_cache_entries--;
424 }
425 
426 static void gvt_cache_destroy(struct intel_vgpu *vgpu)
427 {
428 	struct gvt_dma *dma;
429 	struct rb_node *node = NULL;
430 
431 	for (;;) {
432 		mutex_lock(&vgpu->cache_lock);
433 		node = rb_first(&vgpu->gfn_cache);
434 		if (!node) {
435 			mutex_unlock(&vgpu->cache_lock);
436 			break;
437 		}
438 		dma = rb_entry(node, struct gvt_dma, gfn_node);
439 		gvt_dma_unmap_page(vgpu, dma->gfn, dma->dma_addr, dma->size);
440 		__gvt_cache_remove_entry(vgpu, dma);
441 		mutex_unlock(&vgpu->cache_lock);
442 	}
443 }
444 
445 static void gvt_cache_init(struct intel_vgpu *vgpu)
446 {
447 	vgpu->gfn_cache = RB_ROOT;
448 	vgpu->dma_addr_cache = RB_ROOT;
449 	vgpu->nr_cache_entries = 0;
450 	mutex_init(&vgpu->cache_lock);
451 }
452 
453 static void kvmgt_protect_table_init(struct intel_vgpu *info)
454 {
455 	hash_init(info->ptable);
456 }
457 
458 static void kvmgt_protect_table_destroy(struct intel_vgpu *info)
459 {
460 	struct kvmgt_pgfn *p;
461 	struct hlist_node *tmp;
462 	int i;
463 
464 	hash_for_each_safe(info->ptable, i, tmp, p, hnode) {
465 		hash_del(&p->hnode);
466 		kfree(p);
467 	}
468 }
469 
470 static struct kvmgt_pgfn *
471 __kvmgt_protect_table_find(struct intel_vgpu *info, gfn_t gfn)
472 {
473 	struct kvmgt_pgfn *p, *res = NULL;
474 
475 	hash_for_each_possible(info->ptable, p, hnode, gfn) {
476 		if (gfn == p->gfn) {
477 			res = p;
478 			break;
479 		}
480 	}
481 
482 	return res;
483 }
484 
485 static bool kvmgt_gfn_is_write_protected(struct intel_vgpu *info, gfn_t gfn)
486 {
487 	struct kvmgt_pgfn *p;
488 
489 	p = __kvmgt_protect_table_find(info, gfn);
490 	return !!p;
491 }
492 
493 static void kvmgt_protect_table_add(struct intel_vgpu *info, gfn_t gfn)
494 {
495 	struct kvmgt_pgfn *p;
496 
497 	if (kvmgt_gfn_is_write_protected(info, gfn))
498 		return;
499 
500 	p = kzalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC);
501 	if (WARN(!p, "gfn: 0x%llx\n", gfn))
502 		return;
503 
504 	p->gfn = gfn;
505 	hash_add(info->ptable, &p->hnode, gfn);
506 }
507 
508 static void kvmgt_protect_table_del(struct intel_vgpu *info, gfn_t gfn)
509 {
510 	struct kvmgt_pgfn *p;
511 
512 	p = __kvmgt_protect_table_find(info, gfn);
513 	if (p) {
514 		hash_del(&p->hnode);
515 		kfree(p);
516 	}
517 }
518 
519 static size_t intel_vgpu_reg_rw_opregion(struct intel_vgpu *vgpu, char *buf,
520 		size_t count, loff_t *ppos, bool iswrite)
521 {
522 	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
523 			VFIO_PCI_NUM_REGIONS;
524 	void *base = vgpu->region[i].data;
525 	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
526 
527 
528 	if (pos >= vgpu->region[i].size || iswrite) {
529 		gvt_vgpu_err("invalid op or offset for Intel vgpu OpRegion\n");
530 		return -EINVAL;
531 	}
532 	count = min(count, (size_t)(vgpu->region[i].size - pos));
533 	memcpy(buf, base + pos, count);
534 
535 	return count;
536 }
537 
538 static void intel_vgpu_reg_release_opregion(struct intel_vgpu *vgpu,
539 		struct vfio_region *region)
540 {
541 }
542 
543 static const struct intel_vgpu_regops intel_vgpu_regops_opregion = {
544 	.rw = intel_vgpu_reg_rw_opregion,
545 	.release = intel_vgpu_reg_release_opregion,
546 };
547 
548 static int handle_edid_regs(struct intel_vgpu *vgpu,
549 			struct vfio_edid_region *region, char *buf,
550 			size_t count, u16 offset, bool is_write)
551 {
552 	struct vfio_region_gfx_edid *regs = &region->vfio_edid_regs;
553 	unsigned int data;
554 
555 	if (offset + count > sizeof(*regs))
556 		return -EINVAL;
557 
558 	if (count != 4)
559 		return -EINVAL;
560 
561 	if (is_write) {
562 		data = *((unsigned int *)buf);
563 		switch (offset) {
564 		case offsetof(struct vfio_region_gfx_edid, link_state):
565 			if (data == VFIO_DEVICE_GFX_LINK_STATE_UP) {
566 				if (!drm_edid_block_valid(
567 					(u8 *)region->edid_blob,
568 					0,
569 					true,
570 					NULL)) {
571 					gvt_vgpu_err("invalid EDID blob\n");
572 					return -EINVAL;
573 				}
574 				intel_vgpu_emulate_hotplug(vgpu, true);
575 			} else if (data == VFIO_DEVICE_GFX_LINK_STATE_DOWN)
576 				intel_vgpu_emulate_hotplug(vgpu, false);
577 			else {
578 				gvt_vgpu_err("invalid EDID link state %d\n",
579 					regs->link_state);
580 				return -EINVAL;
581 			}
582 			regs->link_state = data;
583 			break;
584 		case offsetof(struct vfio_region_gfx_edid, edid_size):
585 			if (data > regs->edid_max_size) {
586 				gvt_vgpu_err("EDID size is bigger than %d!\n",
587 					regs->edid_max_size);
588 				return -EINVAL;
589 			}
590 			regs->edid_size = data;
591 			break;
592 		default:
593 			/* read-only regs */
594 			gvt_vgpu_err("write read-only EDID region at offset %d\n",
595 				offset);
596 			return -EPERM;
597 		}
598 	} else {
599 		memcpy(buf, (char *)regs + offset, count);
600 	}
601 
602 	return count;
603 }
604 
605 static int handle_edid_blob(struct vfio_edid_region *region, char *buf,
606 			size_t count, u16 offset, bool is_write)
607 {
608 	if (offset + count > region->vfio_edid_regs.edid_size)
609 		return -EINVAL;
610 
611 	if (is_write)
612 		memcpy(region->edid_blob + offset, buf, count);
613 	else
614 		memcpy(buf, region->edid_blob + offset, count);
615 
616 	return count;
617 }
618 
619 static size_t intel_vgpu_reg_rw_edid(struct intel_vgpu *vgpu, char *buf,
620 		size_t count, loff_t *ppos, bool iswrite)
621 {
622 	int ret;
623 	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
624 			VFIO_PCI_NUM_REGIONS;
625 	struct vfio_edid_region *region = vgpu->region[i].data;
626 	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
627 
628 	if (pos < region->vfio_edid_regs.edid_offset) {
629 		ret = handle_edid_regs(vgpu, region, buf, count, pos, iswrite);
630 	} else {
631 		pos -= EDID_BLOB_OFFSET;
632 		ret = handle_edid_blob(region, buf, count, pos, iswrite);
633 	}
634 
635 	if (ret < 0)
636 		gvt_vgpu_err("failed to access EDID region\n");
637 
638 	return ret;
639 }
640 
641 static void intel_vgpu_reg_release_edid(struct intel_vgpu *vgpu,
642 					struct vfio_region *region)
643 {
644 	kfree(region->data);
645 }
646 
647 static const struct intel_vgpu_regops intel_vgpu_regops_edid = {
648 	.rw = intel_vgpu_reg_rw_edid,
649 	.release = intel_vgpu_reg_release_edid,
650 };
651 
652 static int intel_vgpu_register_reg(struct intel_vgpu *vgpu,
653 		unsigned int type, unsigned int subtype,
654 		const struct intel_vgpu_regops *ops,
655 		size_t size, u32 flags, void *data)
656 {
657 	struct vfio_region *region;
658 
659 	region = krealloc(vgpu->region,
660 			(vgpu->num_regions + 1) * sizeof(*region),
661 			GFP_KERNEL);
662 	if (!region)
663 		return -ENOMEM;
664 
665 	vgpu->region = region;
666 	vgpu->region[vgpu->num_regions].type = type;
667 	vgpu->region[vgpu->num_regions].subtype = subtype;
668 	vgpu->region[vgpu->num_regions].ops = ops;
669 	vgpu->region[vgpu->num_regions].size = size;
670 	vgpu->region[vgpu->num_regions].flags = flags;
671 	vgpu->region[vgpu->num_regions].data = data;
672 	vgpu->num_regions++;
673 	return 0;
674 }
675 
676 int intel_gvt_set_opregion(struct intel_vgpu *vgpu)
677 {
678 	void *base;
679 	int ret;
680 
681 	/* Each vGPU has its own OpRegion, although VFIO will create another
682 	 * one later. This one is used to expose the OpRegion to VFIO; the
683 	 * other one, created by VFIO later, is the one the guest actually uses.
684 	 */
685 	base = vgpu_opregion(vgpu)->va;
686 	if (!base)
687 		return -ENOMEM;
688 
689 	if (memcmp(base, OPREGION_SIGNATURE, 16)) {
690 		memunmap(base);
691 		return -EINVAL;
692 	}
693 
694 	ret = intel_vgpu_register_reg(vgpu,
695 			PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
696 			VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
697 			&intel_vgpu_regops_opregion, OPREGION_SIZE,
698 			VFIO_REGION_INFO_FLAG_READ, base);
699 
700 	return ret;
701 }
702 
703 int intel_gvt_set_edid(struct intel_vgpu *vgpu, int port_num)
704 {
705 	struct intel_vgpu_port *port = intel_vgpu_port(vgpu, port_num);
706 	struct vfio_edid_region *base;
707 	int ret;
708 
709 	base = kzalloc(sizeof(*base), GFP_KERNEL);
710 	if (!base)
711 		return -ENOMEM;
712 
713 	/* TODO: Add multi-port and EDID extension block support */
714 	base->vfio_edid_regs.edid_offset = EDID_BLOB_OFFSET;
715 	base->vfio_edid_regs.edid_max_size = EDID_SIZE;
716 	base->vfio_edid_regs.edid_size = EDID_SIZE;
717 	base->vfio_edid_regs.max_xres = vgpu_edid_xres(port->id);
718 	base->vfio_edid_regs.max_yres = vgpu_edid_yres(port->id);
719 	base->edid_blob = port->edid->edid_block;
720 
721 	ret = intel_vgpu_register_reg(vgpu,
722 			VFIO_REGION_TYPE_GFX,
723 			VFIO_REGION_SUBTYPE_GFX_EDID,
724 			&intel_vgpu_regops_edid, EDID_SIZE,
725 			VFIO_REGION_INFO_FLAG_READ |
726 			VFIO_REGION_INFO_FLAG_WRITE |
727 			VFIO_REGION_INFO_FLAG_CAPS, base);
728 
729 	return ret;
730 }
731 
732 static int intel_vgpu_iommu_notifier(struct notifier_block *nb,
733 				     unsigned long action, void *data)
734 {
735 	struct intel_vgpu *vgpu =
736 		container_of(nb, struct intel_vgpu, iommu_notifier);
737 
738 	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
739 		struct vfio_iommu_type1_dma_unmap *unmap = data;
740 		struct gvt_dma *entry;
741 		unsigned long iov_pfn, end_iov_pfn;
742 
743 		iov_pfn = unmap->iova >> PAGE_SHIFT;
744 		end_iov_pfn = iov_pfn + unmap->size / PAGE_SIZE;
745 
746 		mutex_lock(&vgpu->cache_lock);
747 		for (; iov_pfn < end_iov_pfn; iov_pfn++) {
748 			entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
749 			if (!entry)
750 				continue;
751 
752 			gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr,
753 					   entry->size);
754 			__gvt_cache_remove_entry(vgpu, entry);
755 		}
756 		mutex_unlock(&vgpu->cache_lock);
757 	}
758 
759 	return NOTIFY_OK;
760 }
761 
762 static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu)
763 {
764 	struct intel_vgpu *itr;
765 	int id;
766 	bool ret = false;
767 
768 	mutex_lock(&vgpu->gvt->lock);
769 	for_each_active_vgpu(vgpu->gvt, itr, id) {
770 		if (!itr->attached)
771 			continue;
772 
773 		if (vgpu->vfio_device.kvm == itr->vfio_device.kvm) {
774 			ret = true;
775 			goto out;
776 		}
777 	}
778 out:
779 	mutex_unlock(&vgpu->gvt->lock);
780 	return ret;
781 }
782 
783 static int intel_vgpu_open_device(struct vfio_device *vfio_dev)
784 {
785 	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
786 	unsigned long events;
787 	int ret;
788 
789 	vgpu->iommu_notifier.notifier_call = intel_vgpu_iommu_notifier;
790 
791 	events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
792 	ret = vfio_register_notifier(vfio_dev, VFIO_IOMMU_NOTIFY, &events,
793 				     &vgpu->iommu_notifier);
794 	if (ret != 0) {
795 		gvt_vgpu_err("vfio_register_notifier for iommu failed: %d\n",
796 			ret);
797 		goto out;
798 	}
799 
800 	ret = -EEXIST;
801 	if (vgpu->attached)
802 		goto undo_iommu;
803 
804 	ret = -ESRCH;
805 	if (!vgpu->vfio_device.kvm ||
806 	    vgpu->vfio_device.kvm->mm != current->mm) {
807 		gvt_vgpu_err("KVM is required to use Intel vGPU\n");
808 		goto undo_iommu;
809 	}
810 
811 	kvm_get_kvm(vgpu->vfio_device.kvm);
812 
813 	ret = -EEXIST;
814 	if (__kvmgt_vgpu_exist(vgpu))
815 		goto undo_iommu;
816 
817 	vgpu->attached = true;
818 
819 	kvmgt_protect_table_init(vgpu);
820 	gvt_cache_init(vgpu);
821 
822 	vgpu->track_node.track_write = kvmgt_page_track_write;
823 	vgpu->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
824 	kvm_page_track_register_notifier(vgpu->vfio_device.kvm,
825 					 &vgpu->track_node);
826 
827 	debugfs_create_ulong(KVMGT_DEBUGFS_FILENAME, 0444, vgpu->debugfs,
828 			     &vgpu->nr_cache_entries);
829 
830 	intel_gvt_activate_vgpu(vgpu);
831 
832 	atomic_set(&vgpu->released, 0);
833 	return 0;
834 
835 undo_iommu:
836 	vfio_unregister_notifier(vfio_dev, VFIO_IOMMU_NOTIFY,
837 				 &vgpu->iommu_notifier);
838 out:
839 	return ret;
840 }
841 
842 static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu)
843 {
844 	struct eventfd_ctx *trigger;
845 
846 	trigger = vgpu->msi_trigger;
847 	if (trigger) {
848 		eventfd_ctx_put(trigger);
849 		vgpu->msi_trigger = NULL;
850 	}
851 }
852 
853 static void intel_vgpu_close_device(struct vfio_device *vfio_dev)
854 {
855 	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
856 	struct drm_i915_private *i915 = vgpu->gvt->gt->i915;
857 	int ret;
858 
859 	if (!vgpu->attached)
860 		return;
861 
862 	if (atomic_cmpxchg(&vgpu->released, 0, 1))
863 		return;
864 
865 	intel_gvt_release_vgpu(vgpu);
866 
867 	ret = vfio_unregister_notifier(&vgpu->vfio_device, VFIO_IOMMU_NOTIFY,
868 				       &vgpu->iommu_notifier);
869 	drm_WARN(&i915->drm, ret,
870 		 "vfio_unregister_notifier for iommu failed: %d\n", ret);
871 
872 	debugfs_remove(debugfs_lookup(KVMGT_DEBUGFS_FILENAME, vgpu->debugfs));
873 
874 	kvm_page_track_unregister_notifier(vgpu->vfio_device.kvm,
875 					   &vgpu->track_node);
876 	kvmgt_protect_table_destroy(vgpu);
877 	gvt_cache_destroy(vgpu);
878 
879 	intel_vgpu_release_msi_eventfd_ctx(vgpu);
880 
881 	vgpu->attached = false;
882 
883 	if (vgpu->vfio_device.kvm)
884 		kvm_put_kvm(vgpu->vfio_device.kvm);
885 }
886 
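/*
 * Decode the guest-visible base address of a memory BAR from the virtual
 * config space. As a worked example: if the BAR dword reads 0xd0000004
 * (PCI_BASE_ADDRESS_MEM_TYPE_64) and the following dword reads 0xff, the
 * returned address is 0xffd0000000.
 */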
887 static u64 intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar)
888 {
889 	u32 start_lo, start_hi;
890 	u32 mem_type;
891 
892 	start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
893 			PCI_BASE_ADDRESS_MEM_MASK;
894 	mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
895 			PCI_BASE_ADDRESS_MEM_TYPE_MASK;
896 
897 	switch (mem_type) {
898 	case PCI_BASE_ADDRESS_MEM_TYPE_64:
899 		start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space
900 						+ bar + 4));
901 		break;
902 	case PCI_BASE_ADDRESS_MEM_TYPE_32:
903 	case PCI_BASE_ADDRESS_MEM_TYPE_1M:
904 		/* 1M mem BAR treated as 32-bit BAR */
905 	default:
906 		/* mem unknown type treated as 32-bit BAR */
907 		start_hi = 0;
908 		break;
909 	}
910 
911 	return ((u64)start_hi << 32) | start_lo;
912 }
913 
914 static int intel_vgpu_bar_rw(struct intel_vgpu *vgpu, int bar, u64 off,
915 			     void *buf, unsigned int count, bool is_write)
916 {
917 	u64 bar_start = intel_vgpu_get_bar_addr(vgpu, bar);
918 	int ret;
919 
920 	if (is_write)
921 		ret = intel_vgpu_emulate_mmio_write(vgpu,
922 					bar_start + off, buf, count);
923 	else
924 		ret = intel_vgpu_emulate_mmio_read(vgpu,
925 					bar_start + off, buf, count);
926 	return ret;
927 }
928 
929 static inline bool intel_vgpu_in_aperture(struct intel_vgpu *vgpu, u64 off)
930 {
931 	return off >= vgpu_aperture_offset(vgpu) &&
932 	       off < vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu);
933 }
934 
935 static int intel_vgpu_aperture_rw(struct intel_vgpu *vgpu, u64 off,
936 		void *buf, unsigned long count, bool is_write)
937 {
938 	void __iomem *aperture_va;
939 
940 	if (!intel_vgpu_in_aperture(vgpu, off) ||
941 	    !intel_vgpu_in_aperture(vgpu, off + count)) {
942 		gvt_vgpu_err("Invalid aperture offset %llu\n", off);
943 		return -EINVAL;
944 	}
945 
946 	aperture_va = io_mapping_map_wc(&vgpu->gvt->gt->ggtt->iomap,
947 					ALIGN_DOWN(off, PAGE_SIZE),
948 					count + offset_in_page(off));
949 	if (!aperture_va)
950 		return -EIO;
951 
952 	if (is_write)
953 		memcpy_toio(aperture_va + offset_in_page(off), buf, count);
954 	else
955 		memcpy_fromio(buf, aperture_va + offset_in_page(off), count);
956 
957 	io_mapping_unmap(aperture_va);
958 
959 	return 0;
960 }
961 
962 static ssize_t intel_vgpu_rw(struct intel_vgpu *vgpu, char *buf,
963 			size_t count, loff_t *ppos, bool is_write)
964 {
965 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
966 	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
967 	int ret = -EINVAL;
968 
969 
970 	if (index >= VFIO_PCI_NUM_REGIONS + vgpu->num_regions) {
971 		gvt_vgpu_err("invalid index: %u\n", index);
972 		return -EINVAL;
973 	}
974 
975 	switch (index) {
976 	case VFIO_PCI_CONFIG_REGION_INDEX:
977 		if (is_write)
978 			ret = intel_vgpu_emulate_cfg_write(vgpu, pos,
979 						buf, count);
980 		else
981 			ret = intel_vgpu_emulate_cfg_read(vgpu, pos,
982 						buf, count);
983 		break;
984 	case VFIO_PCI_BAR0_REGION_INDEX:
985 		ret = intel_vgpu_bar_rw(vgpu, PCI_BASE_ADDRESS_0, pos,
986 					buf, count, is_write);
987 		break;
988 	case VFIO_PCI_BAR2_REGION_INDEX:
989 		ret = intel_vgpu_aperture_rw(vgpu, pos, buf, count, is_write);
990 		break;
991 	case VFIO_PCI_BAR1_REGION_INDEX:
992 	case VFIO_PCI_BAR3_REGION_INDEX:
993 	case VFIO_PCI_BAR4_REGION_INDEX:
994 	case VFIO_PCI_BAR5_REGION_INDEX:
995 	case VFIO_PCI_VGA_REGION_INDEX:
996 	case VFIO_PCI_ROM_REGION_INDEX:
997 		break;
998 	default:
999 		if (index >= VFIO_PCI_NUM_REGIONS + vgpu->num_regions)
1000 			return -EINVAL;
1001 
1002 		index -= VFIO_PCI_NUM_REGIONS;
1003 		return vgpu->region[index].ops->rw(vgpu, buf, count,
1004 				ppos, is_write);
1005 	}
1006 
1007 	return ret == 0 ? count : ret;
1008 }
1009 
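/*
 * Helper for the 8-byte fast paths in intel_vgpu_read()/intel_vgpu_write()
 * below: only offsets that fall inside the GGTT range of the MMIO BAR are
 * meant to be read or written as a single 64-bit GGTT entry; everything
 * else is split into 4/2/1-byte emulated accesses.
 */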
1010 static bool gtt_entry(struct intel_vgpu *vgpu, loff_t *ppos)
1011 {
1012 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
1013 	struct intel_gvt *gvt = vgpu->gvt;
1014 	int offset;
1015 
1016 	/* Only allow MMIO GGTT entry access */
1017 	if (index != PCI_BASE_ADDRESS_0)
1018 		return false;
1019 
1020 	offset = (u64)(*ppos & VFIO_PCI_OFFSET_MASK) -
1021 		intel_vgpu_get_bar_gpa(vgpu, PCI_BASE_ADDRESS_0);
1022 
1023 	return (offset >= gvt->device_info.gtt_start_offset &&
1024 		offset < gvt->device_info.gtt_start_offset + gvt_ggtt_sz(gvt)) ?
1025 			true : false;
1026 }
1027 
1028 static ssize_t intel_vgpu_read(struct vfio_device *vfio_dev, char __user *buf,
1029 			size_t count, loff_t *ppos)
1030 {
1031 	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
1032 	unsigned int done = 0;
1033 	int ret;
1034 
1035 	while (count) {
1036 		size_t filled;
1037 
1038 		/* Only 8-byte GGTT entry reads are supported */
1039 		if (count >= 8 && !(*ppos % 8) &&
1040 			gtt_entry(vgpu, ppos)) {
1041 			u64 val;
1042 
1043 			ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
1044 					ppos, false);
1045 			if (ret <= 0)
1046 				goto read_err;
1047 
1048 			if (copy_to_user(buf, &val, sizeof(val)))
1049 				goto read_err;
1050 
1051 			filled = 8;
1052 		} else if (count >= 4 && !(*ppos % 4)) {
1053 			u32 val;
1054 
1055 			ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
1056 					ppos, false);
1057 			if (ret <= 0)
1058 				goto read_err;
1059 
1060 			if (copy_to_user(buf, &val, sizeof(val)))
1061 				goto read_err;
1062 
1063 			filled = 4;
1064 		} else if (count >= 2 && !(*ppos % 2)) {
1065 			u16 val;
1066 
1067 			ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
1068 					ppos, false);
1069 			if (ret <= 0)
1070 				goto read_err;
1071 
1072 			if (copy_to_user(buf, &val, sizeof(val)))
1073 				goto read_err;
1074 
1075 			filled = 2;
1076 		} else {
1077 			u8 val;
1078 
1079 			ret = intel_vgpu_rw(vgpu, &val, sizeof(val), ppos,
1080 					false);
1081 			if (ret <= 0)
1082 				goto read_err;
1083 
1084 			if (copy_to_user(buf, &val, sizeof(val)))
1085 				goto read_err;
1086 
1087 			filled = 1;
1088 		}
1089 
1090 		count -= filled;
1091 		done += filled;
1092 		*ppos += filled;
1093 		buf += filled;
1094 	}
1095 
1096 	return done;
1097 
1098 read_err:
1099 	return -EFAULT;
1100 }
1101 
1102 static ssize_t intel_vgpu_write(struct vfio_device *vfio_dev,
1103 				const char __user *buf,
1104 				size_t count, loff_t *ppos)
1105 {
1106 	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
1107 	unsigned int done = 0;
1108 	int ret;
1109 
1110 	while (count) {
1111 		size_t filled;
1112 
1113 		/* Only 8-byte GGTT entry writes are supported */
1114 		if (count >= 8 && !(*ppos % 8) &&
1115 			gtt_entry(vgpu, ppos)) {
1116 			u64 val;
1117 
1118 			if (copy_from_user(&val, buf, sizeof(val)))
1119 				goto write_err;
1120 
1121 			ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
1122 					ppos, true);
1123 			if (ret <= 0)
1124 				goto write_err;
1125 
1126 			filled = 8;
1127 		} else if (count >= 4 && !(*ppos % 4)) {
1128 			u32 val;
1129 
1130 			if (copy_from_user(&val, buf, sizeof(val)))
1131 				goto write_err;
1132 
1133 			ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
1134 					ppos, true);
1135 			if (ret <= 0)
1136 				goto write_err;
1137 
1138 			filled = 4;
1139 		} else if (count >= 2 && !(*ppos % 2)) {
1140 			u16 val;
1141 
1142 			if (copy_from_user(&val, buf, sizeof(val)))
1143 				goto write_err;
1144 
1145 			ret = intel_vgpu_rw(vgpu, (char *)&val,
1146 					sizeof(val), ppos, true);
1147 			if (ret <= 0)
1148 				goto write_err;
1149 
1150 			filled = 2;
1151 		} else {
1152 			u8 val;
1153 
1154 			if (copy_from_user(&val, buf, sizeof(val)))
1155 				goto write_err;
1156 
1157 			ret = intel_vgpu_rw(vgpu, &val, sizeof(val),
1158 					ppos, true);
1159 			if (ret <= 0)
1160 				goto write_err;
1161 
1162 			filled = 1;
1163 		}
1164 
1165 		count -= filled;
1166 		done += filled;
1167 		*ppos += filled;
1168 		buf += filled;
1169 	}
1170 
1171 	return done;
1172 write_err:
1173 	return -EFAULT;
1174 }
1175 
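/*
 * mmap() offsets use the same encoding as read()/write(): the region index
 * lives in the upper bits of the file offset. For example, mapping the
 * aperture (BAR2) means userspace passes an offset of
 * VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR2_REGION_INDEX), so with 4KiB pages
 * vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT) == 2 below.
 */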
1176 static int intel_vgpu_mmap(struct vfio_device *vfio_dev,
1177 		struct vm_area_struct *vma)
1178 {
1179 	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
1180 	unsigned int index;
1181 	u64 virtaddr;
1182 	unsigned long req_size, pgoff, req_start;
1183 	pgprot_t pg_prot;
1184 
1185 	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
1186 	if (index >= VFIO_PCI_ROM_REGION_INDEX)
1187 		return -EINVAL;
1188 
1189 	if (vma->vm_end < vma->vm_start)
1190 		return -EINVAL;
1191 	if ((vma->vm_flags & VM_SHARED) == 0)
1192 		return -EINVAL;
1193 	if (index != VFIO_PCI_BAR2_REGION_INDEX)
1194 		return -EINVAL;
1195 
1196 	pg_prot = vma->vm_page_prot;
1197 	virtaddr = vma->vm_start;
1198 	req_size = vma->vm_end - vma->vm_start;
1199 	pgoff = vma->vm_pgoff &
1200 		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
1201 	req_start = pgoff << PAGE_SHIFT;
1202 
1203 	if (!intel_vgpu_in_aperture(vgpu, req_start))
1204 		return -EINVAL;
1205 	if (req_start + req_size >
1206 	    vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu))
1207 		return -EINVAL;
1208 
1209 	pgoff = (gvt_aperture_pa_base(vgpu->gvt) >> PAGE_SHIFT) + pgoff;
1210 
1211 	return remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
1212 }
1213 
1214 static int intel_vgpu_get_irq_count(struct intel_vgpu *vgpu, int type)
1215 {
1216 	if (type == VFIO_PCI_INTX_IRQ_INDEX || type == VFIO_PCI_MSI_IRQ_INDEX)
1217 		return 1;
1218 
1219 	return 0;
1220 }
1221 
1222 static int intel_vgpu_set_intx_mask(struct intel_vgpu *vgpu,
1223 			unsigned int index, unsigned int start,
1224 			unsigned int count, u32 flags,
1225 			void *data)
1226 {
1227 	return 0;
1228 }
1229 
1230 static int intel_vgpu_set_intx_unmask(struct intel_vgpu *vgpu,
1231 			unsigned int index, unsigned int start,
1232 			unsigned int count, u32 flags, void *data)
1233 {
1234 	return 0;
1235 }
1236 
1237 static int intel_vgpu_set_intx_trigger(struct intel_vgpu *vgpu,
1238 		unsigned int index, unsigned int start, unsigned int count,
1239 		u32 flags, void *data)
1240 {
1241 	return 0;
1242 }
1243 
1244 static int intel_vgpu_set_msi_trigger(struct intel_vgpu *vgpu,
1245 		unsigned int index, unsigned int start, unsigned int count,
1246 		u32 flags, void *data)
1247 {
1248 	struct eventfd_ctx *trigger;
1249 
1250 	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
1251 		int fd = *(int *)data;
1252 
1253 		trigger = eventfd_ctx_fdget(fd);
1254 		if (IS_ERR(trigger)) {
1255 			gvt_vgpu_err("eventfd_ctx_fdget failed\n");
1256 			return PTR_ERR(trigger);
1257 		}
1258 		vgpu->msi_trigger = trigger;
1259 	} else if ((flags & VFIO_IRQ_SET_DATA_NONE) && !count)
1260 		intel_vgpu_release_msi_eventfd_ctx(vgpu);
1261 
1262 	return 0;
1263 }
1264 
1265 static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags,
1266 		unsigned int index, unsigned int start, unsigned int count,
1267 		void *data)
1268 {
1269 	int (*func)(struct intel_vgpu *vgpu, unsigned int index,
1270 			unsigned int start, unsigned int count, u32 flags,
1271 			void *data) = NULL;
1272 
1273 	switch (index) {
1274 	case VFIO_PCI_INTX_IRQ_INDEX:
1275 		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
1276 		case VFIO_IRQ_SET_ACTION_MASK:
1277 			func = intel_vgpu_set_intx_mask;
1278 			break;
1279 		case VFIO_IRQ_SET_ACTION_UNMASK:
1280 			func = intel_vgpu_set_intx_unmask;
1281 			break;
1282 		case VFIO_IRQ_SET_ACTION_TRIGGER:
1283 			func = intel_vgpu_set_intx_trigger;
1284 			break;
1285 		}
1286 		break;
1287 	case VFIO_PCI_MSI_IRQ_INDEX:
1288 		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
1289 		case VFIO_IRQ_SET_ACTION_MASK:
1290 		case VFIO_IRQ_SET_ACTION_UNMASK:
1291 			/* XXX Need masking support exported */
1292 			break;
1293 		case VFIO_IRQ_SET_ACTION_TRIGGER:
1294 			func = intel_vgpu_set_msi_trigger;
1295 			break;
1296 		}
1297 		break;
1298 	}
1299 
1300 	if (!func)
1301 		return -ENOTTY;
1302 
1303 	return func(vgpu, index, start, count, flags, data);
1304 }
1305 
1306 static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
1307 			     unsigned long arg)
1308 {
1309 	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
1310 	unsigned long minsz;
1311 
1312 	gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd);
1313 
1314 	if (cmd == VFIO_DEVICE_GET_INFO) {
1315 		struct vfio_device_info info;
1316 
1317 		minsz = offsetofend(struct vfio_device_info, num_irqs);
1318 
1319 		if (copy_from_user(&info, (void __user *)arg, minsz))
1320 			return -EFAULT;
1321 
1322 		if (info.argsz < minsz)
1323 			return -EINVAL;
1324 
1325 		info.flags = VFIO_DEVICE_FLAGS_PCI;
1326 		info.flags |= VFIO_DEVICE_FLAGS_RESET;
1327 		info.num_regions = VFIO_PCI_NUM_REGIONS +
1328 				vgpu->num_regions;
1329 		info.num_irqs = VFIO_PCI_NUM_IRQS;
1330 
1331 		return copy_to_user((void __user *)arg, &info, minsz) ?
1332 			-EFAULT : 0;
1333 
1334 	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
1335 		struct vfio_region_info info;
1336 		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
1337 		unsigned int i;
1338 		int ret;
1339 		struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
1340 		int nr_areas = 1;
1341 		int cap_type_id;
1342 
1343 		minsz = offsetofend(struct vfio_region_info, offset);
1344 
1345 		if (copy_from_user(&info, (void __user *)arg, minsz))
1346 			return -EFAULT;
1347 
1348 		if (info.argsz < minsz)
1349 			return -EINVAL;
1350 
1351 		switch (info.index) {
1352 		case VFIO_PCI_CONFIG_REGION_INDEX:
1353 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1354 			info.size = vgpu->gvt->device_info.cfg_space_size;
1355 			info.flags = VFIO_REGION_INFO_FLAG_READ |
1356 				     VFIO_REGION_INFO_FLAG_WRITE;
1357 			break;
1358 		case VFIO_PCI_BAR0_REGION_INDEX:
1359 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1360 			info.size = vgpu->cfg_space.bar[info.index].size;
1361 			if (!info.size) {
1362 				info.flags = 0;
1363 				break;
1364 			}
1365 
1366 			info.flags = VFIO_REGION_INFO_FLAG_READ |
1367 				     VFIO_REGION_INFO_FLAG_WRITE;
1368 			break;
1369 		case VFIO_PCI_BAR1_REGION_INDEX:
1370 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1371 			info.size = 0;
1372 			info.flags = 0;
1373 			break;
1374 		case VFIO_PCI_BAR2_REGION_INDEX:
1375 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1376 			info.flags = VFIO_REGION_INFO_FLAG_CAPS |
1377 					VFIO_REGION_INFO_FLAG_MMAP |
1378 					VFIO_REGION_INFO_FLAG_READ |
1379 					VFIO_REGION_INFO_FLAG_WRITE;
1380 			info.size = gvt_aperture_sz(vgpu->gvt);
1381 
1382 			sparse = kzalloc(struct_size(sparse, areas, nr_areas),
1383 					 GFP_KERNEL);
1384 			if (!sparse)
1385 				return -ENOMEM;
1386 
1387 			sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
1388 			sparse->header.version = 1;
1389 			sparse->nr_areas = nr_areas;
1390 			cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
1391 			sparse->areas[0].offset =
1392 					PAGE_ALIGN(vgpu_aperture_offset(vgpu));
1393 			sparse->areas[0].size = vgpu_aperture_sz(vgpu);
1394 			break;
1395 
1396 		case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
1397 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1398 			info.size = 0;
1399 			info.flags = 0;
1400 
1401 			gvt_dbg_core("get region info bar:%d\n", info.index);
1402 			break;
1403 
1404 		case VFIO_PCI_ROM_REGION_INDEX:
1405 		case VFIO_PCI_VGA_REGION_INDEX:
1406 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1407 			info.size = 0;
1408 			info.flags = 0;
1409 
1410 			gvt_dbg_core("get region info index:%d\n", info.index);
1411 			break;
1412 		default:
1413 			{
1414 				struct vfio_region_info_cap_type cap_type = {
1415 					.header.id = VFIO_REGION_INFO_CAP_TYPE,
1416 					.header.version = 1 };
1417 
1418 				if (info.index >= VFIO_PCI_NUM_REGIONS +
1419 						vgpu->num_regions)
1420 					return -EINVAL;
1421 				info.index =
1422 					array_index_nospec(info.index,
1423 							VFIO_PCI_NUM_REGIONS +
1424 							vgpu->num_regions);
1425 
1426 				i = info.index - VFIO_PCI_NUM_REGIONS;
1427 
1428 				info.offset =
1429 					VFIO_PCI_INDEX_TO_OFFSET(info.index);
1430 				info.size = vgpu->region[i].size;
1431 				info.flags = vgpu->region[i].flags;
1432 
1433 				cap_type.type = vgpu->region[i].type;
1434 				cap_type.subtype = vgpu->region[i].subtype;
1435 
1436 				ret = vfio_info_add_capability(&caps,
1437 							&cap_type.header,
1438 							sizeof(cap_type));
1439 				if (ret)
1440 					return ret;
1441 			}
1442 		}
1443 
1444 		if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
1445 			switch (cap_type_id) {
1446 			case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
1447 				ret = vfio_info_add_capability(&caps,
1448 					&sparse->header,
1449 					struct_size(sparse, areas,
1450 						    sparse->nr_areas));
1451 				if (ret) {
1452 					kfree(sparse);
1453 					return ret;
1454 				}
1455 				break;
1456 			default:
1457 				kfree(sparse);
1458 				return -EINVAL;
1459 			}
1460 		}
1461 
1462 		if (caps.size) {
1463 			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
1464 			if (info.argsz < sizeof(info) + caps.size) {
1465 				info.argsz = sizeof(info) + caps.size;
1466 				info.cap_offset = 0;
1467 			} else {
1468 				vfio_info_cap_shift(&caps, sizeof(info));
1469 				if (copy_to_user((void __user *)arg +
1470 						  sizeof(info), caps.buf,
1471 						  caps.size)) {
1472 					kfree(caps.buf);
1473 					kfree(sparse);
1474 					return -EFAULT;
1475 				}
1476 				info.cap_offset = sizeof(info);
1477 			}
1478 
1479 			kfree(caps.buf);
1480 		}
1481 
1482 		kfree(sparse);
1483 		return copy_to_user((void __user *)arg, &info, minsz) ?
1484 			-EFAULT : 0;
1485 	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
1486 		struct vfio_irq_info info;
1487 
1488 		minsz = offsetofend(struct vfio_irq_info, count);
1489 
1490 		if (copy_from_user(&info, (void __user *)arg, minsz))
1491 			return -EFAULT;
1492 
1493 		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
1494 			return -EINVAL;
1495 
1496 		switch (info.index) {
1497 		case VFIO_PCI_INTX_IRQ_INDEX:
1498 		case VFIO_PCI_MSI_IRQ_INDEX:
1499 			break;
1500 		default:
1501 			return -EINVAL;
1502 		}
1503 
1504 		info.flags = VFIO_IRQ_INFO_EVENTFD;
1505 
1506 		info.count = intel_vgpu_get_irq_count(vgpu, info.index);
1507 
1508 		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
1509 			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
1510 				       VFIO_IRQ_INFO_AUTOMASKED);
1511 		else
1512 			info.flags |= VFIO_IRQ_INFO_NORESIZE;
1513 
1514 		return copy_to_user((void __user *)arg, &info, minsz) ?
1515 			-EFAULT : 0;
1516 	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
1517 		struct vfio_irq_set hdr;
1518 		u8 *data = NULL;
1519 		int ret = 0;
1520 		size_t data_size = 0;
1521 
1522 		minsz = offsetofend(struct vfio_irq_set, count);
1523 
1524 		if (copy_from_user(&hdr, (void __user *)arg, minsz))
1525 			return -EFAULT;
1526 
1527 		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
1528 			int max = intel_vgpu_get_irq_count(vgpu, hdr.index);
1529 
1530 			ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
1531 						VFIO_PCI_NUM_IRQS, &data_size);
1532 			if (ret) {
1533 				gvt_vgpu_err("intel:vfio_set_irqs_validate_and_prepare failed\n");
1534 				return -EINVAL;
1535 			}
1536 			if (data_size) {
1537 				data = memdup_user((void __user *)(arg + minsz),
1538 						   data_size);
1539 				if (IS_ERR(data))
1540 					return PTR_ERR(data);
1541 			}
1542 		}
1543 
1544 		ret = intel_vgpu_set_irqs(vgpu, hdr.flags, hdr.index,
1545 					hdr.start, hdr.count, data);
1546 		kfree(data);
1547 
1548 		return ret;
1549 	} else if (cmd == VFIO_DEVICE_RESET) {
1550 		intel_gvt_reset_vgpu(vgpu);
1551 		return 0;
1552 	} else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) {
1553 		struct vfio_device_gfx_plane_info dmabuf;
1554 		int ret = 0;
1555 
1556 		minsz = offsetofend(struct vfio_device_gfx_plane_info,
1557 				    dmabuf_id);
1558 		if (copy_from_user(&dmabuf, (void __user *)arg, minsz))
1559 			return -EFAULT;
1560 		if (dmabuf.argsz < minsz)
1561 			return -EINVAL;
1562 
1563 		ret = intel_vgpu_query_plane(vgpu, &dmabuf);
1564 		if (ret != 0)
1565 			return ret;
1566 
1567 		return copy_to_user((void __user *)arg, &dmabuf, minsz) ?
1568 								-EFAULT : 0;
1569 	} else if (cmd == VFIO_DEVICE_GET_GFX_DMABUF) {
1570 		__u32 dmabuf_id;
1571 
1572 		if (get_user(dmabuf_id, (__u32 __user *)arg))
1573 			return -EFAULT;
1574 		return intel_vgpu_get_dmabuf(vgpu, dmabuf_id);
1575 	}
1576 
1577 	return -ENOTTY;
1578 }
1579 
1580 static ssize_t
1581 vgpu_id_show(struct device *dev, struct device_attribute *attr,
1582 	     char *buf)
1583 {
1584 	struct intel_vgpu *vgpu = dev_get_drvdata(dev);
1585 
1586 	return sprintf(buf, "%d\n", vgpu->id);
1587 }
1588 
1589 static DEVICE_ATTR_RO(vgpu_id);
1590 
1591 static struct attribute *intel_vgpu_attrs[] = {
1592 	&dev_attr_vgpu_id.attr,
1593 	NULL
1594 };
1595 
1596 static const struct attribute_group intel_vgpu_group = {
1597 	.name = "intel_vgpu",
1598 	.attrs = intel_vgpu_attrs,
1599 };
1600 
1601 static const struct attribute_group *intel_vgpu_groups[] = {
1602 	&intel_vgpu_group,
1603 	NULL,
1604 };
1605 
1606 static const struct vfio_device_ops intel_vgpu_dev_ops = {
1607 	.open_device	= intel_vgpu_open_device,
1608 	.close_device	= intel_vgpu_close_device,
1609 	.read		= intel_vgpu_read,
1610 	.write		= intel_vgpu_write,
1611 	.mmap		= intel_vgpu_mmap,
1612 	.ioctl		= intel_vgpu_ioctl,
1613 };
1614 
1615 static int intel_vgpu_probe(struct mdev_device *mdev)
1616 {
1617 	struct device *pdev = mdev_parent_dev(mdev);
1618 	struct intel_gvt *gvt = kdev_to_i915(pdev)->gvt;
1619 	struct intel_vgpu_type *type;
1620 	struct intel_vgpu *vgpu;
1621 	int ret;
1622 
1623 	type = &gvt->types[mdev_get_type_group_id(mdev)];
1624 	if (!type)
1625 		return -EINVAL;
1626 
1627 	vgpu = intel_gvt_create_vgpu(gvt, type);
1628 	if (IS_ERR(vgpu)) {
1629 		gvt_err("failed to create intel vgpu: %ld\n", PTR_ERR(vgpu));
1630 		return PTR_ERR(vgpu);
1631 	}
1632 
1633 	vfio_init_group_dev(&vgpu->vfio_device, &mdev->dev,
1634 			    &intel_vgpu_dev_ops);
1635 
1636 	dev_set_drvdata(&mdev->dev, vgpu);
1637 	ret = vfio_register_emulated_iommu_dev(&vgpu->vfio_device);
1638 	if (ret) {
1639 		intel_gvt_destroy_vgpu(vgpu);
1640 		return ret;
1641 	}
1642 
1643 	gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n",
1644 		     dev_name(mdev_dev(mdev)));
1645 	return 0;
1646 }
1647 
1648 static void intel_vgpu_remove(struct mdev_device *mdev)
1649 {
1650 	struct intel_vgpu *vgpu = dev_get_drvdata(&mdev->dev);
1651 
1652 	if (WARN_ON_ONCE(vgpu->attached))
1653 		return;
1654 	intel_gvt_destroy_vgpu(vgpu);
1655 }
1656 
1657 static struct mdev_driver intel_vgpu_mdev_driver = {
1658 	.driver = {
1659 		.name		= "intel_vgpu_mdev",
1660 		.owner		= THIS_MODULE,
1661 		.dev_groups	= intel_vgpu_groups,
1662 	},
1663 	.probe		= intel_vgpu_probe,
1664 	.remove		= intel_vgpu_remove,
1665 	.supported_type_groups	= gvt_vgpu_type_groups,
1666 };
1667 
1668 int intel_gvt_page_track_add(struct intel_vgpu *info, u64 gfn)
1669 {
1670 	struct kvm *kvm = info->vfio_device.kvm;
1671 	struct kvm_memory_slot *slot;
1672 	int idx;
1673 
1674 	if (!info->attached)
1675 		return -ESRCH;
1676 
1677 	idx = srcu_read_lock(&kvm->srcu);
1678 	slot = gfn_to_memslot(kvm, gfn);
1679 	if (!slot) {
1680 		srcu_read_unlock(&kvm->srcu, idx);
1681 		return -EINVAL;
1682 	}
1683 
1684 	write_lock(&kvm->mmu_lock);
1685 
1686 	if (kvmgt_gfn_is_write_protected(info, gfn))
1687 		goto out;
1688 
1689 	kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1690 	kvmgt_protect_table_add(info, gfn);
1691 
1692 out:
1693 	write_unlock(&kvm->mmu_lock);
1694 	srcu_read_unlock(&kvm->srcu, idx);
1695 	return 0;
1696 }
1697 
1698 int intel_gvt_page_track_remove(struct intel_vgpu *info, u64 gfn)
1699 {
1700 	struct kvm *kvm = info->vfio_device.kvm;
1701 	struct kvm_memory_slot *slot;
1702 	int idx;
1703 
1704 	if (!info->attached)
1705 		return 0;
1706 
1707 	idx = srcu_read_lock(&kvm->srcu);
1708 	slot = gfn_to_memslot(kvm, gfn);
1709 	if (!slot) {
1710 		srcu_read_unlock(&kvm->srcu, idx);
1711 		return -EINVAL;
1712 	}
1713 
1714 	write_lock(&kvm->mmu_lock);
1715 
1716 	if (!kvmgt_gfn_is_write_protected(info, gfn))
1717 		goto out;
1718 
1719 	kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1720 	kvmgt_protect_table_del(info, gfn);
1721 
1722 out:
1723 	write_unlock(&kvm->mmu_lock);
1724 	srcu_read_unlock(&kvm->srcu, idx);
1725 	return 0;
1726 }
1727 
1728 static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1729 		const u8 *val, int len,
1730 		struct kvm_page_track_notifier_node *node)
1731 {
1732 	struct intel_vgpu *info =
1733 		container_of(node, struct intel_vgpu, track_node);
1734 
1735 	if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
1736 		intel_vgpu_page_track_handler(info, gpa,
1737 						     (void *)val, len);
1738 }
1739 
1740 static void kvmgt_page_track_flush_slot(struct kvm *kvm,
1741 		struct kvm_memory_slot *slot,
1742 		struct kvm_page_track_notifier_node *node)
1743 {
1744 	int i;
1745 	gfn_t gfn;
1746 	struct intel_vgpu *info =
1747 		container_of(node, struct intel_vgpu, track_node);
1748 
1749 	write_lock(&kvm->mmu_lock);
1750 	for (i = 0; i < slot->npages; i++) {
1751 		gfn = slot->base_gfn + i;
1752 		if (kvmgt_gfn_is_write_protected(info, gfn)) {
1753 			kvm_slot_page_track_remove_page(kvm, slot, gfn,
1754 						KVM_PAGE_TRACK_WRITE);
1755 			kvmgt_protect_table_del(info, gfn);
1756 		}
1757 	}
1758 	write_unlock(&kvm->mmu_lock);
1759 }
1760 
1761 void intel_vgpu_detach_regions(struct intel_vgpu *vgpu)
1762 {
1763 	int i;
1764 
1765 	if (!vgpu->region)
1766 		return;
1767 
1768 	for (i = 0; i < vgpu->num_regions; i++)
1769 		if (vgpu->region[i].ops->release)
1770 			vgpu->region[i].ops->release(vgpu,
1771 					&vgpu->region[i]);
1772 	vgpu->num_regions = 0;
1773 	kfree(vgpu->region);
1774 	vgpu->region = NULL;
1775 }
1776 
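/*
 * DMA mapping entry points used by the GTT shadowing code. Each successful
 * intel_gvt_dma_map_guest_page() or intel_gvt_dma_pin_guest_page() call
 * takes a reference on the cached gvt_dma entry; the mapping is only torn
 * down (__gvt_dma_release()) once every reference has been dropped via
 * intel_gvt_dma_unmap_guest_page().
 */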
1777 int intel_gvt_dma_map_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
1778 		unsigned long size, dma_addr_t *dma_addr)
1779 {
1780 	struct gvt_dma *entry;
1781 	int ret;
1782 
1783 	if (!vgpu->attached)
1784 		return -EINVAL;
1785 
1786 	mutex_lock(&vgpu->cache_lock);
1787 
1788 	entry = __gvt_cache_find_gfn(vgpu, gfn);
1789 	if (!entry) {
1790 		ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
1791 		if (ret)
1792 			goto err_unlock;
1793 
1794 		ret = __gvt_cache_add(vgpu, gfn, *dma_addr, size);
1795 		if (ret)
1796 			goto err_unmap;
1797 	} else if (entry->size != size) {
1798 		/* the same gfn with different size: unmap and re-map */
1799 		gvt_dma_unmap_page(vgpu, gfn, entry->dma_addr, entry->size);
1800 		__gvt_cache_remove_entry(vgpu, entry);
1801 
1802 		ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
1803 		if (ret)
1804 			goto err_unlock;
1805 
1806 		ret = __gvt_cache_add(vgpu, gfn, *dma_addr, size);
1807 		if (ret)
1808 			goto err_unmap;
1809 	} else {
1810 		kref_get(&entry->ref);
1811 		*dma_addr = entry->dma_addr;
1812 	}
1813 
1814 	mutex_unlock(&vgpu->cache_lock);
1815 	return 0;
1816 
1817 err_unmap:
1818 	gvt_dma_unmap_page(vgpu, gfn, *dma_addr, size);
1819 err_unlock:
1820 	mutex_unlock(&vgpu->cache_lock);
1821 	return ret;
1822 }
1823 
1824 int intel_gvt_dma_pin_guest_page(struct intel_vgpu *vgpu, dma_addr_t dma_addr)
1825 {
1826 	struct gvt_dma *entry;
1827 	int ret = 0;
1828 
1829 	if (!vgpu->attached)
1830 		return -ENODEV;
1831 
1832 	mutex_lock(&vgpu->cache_lock);
1833 	entry = __gvt_cache_find_dma_addr(vgpu, dma_addr);
1834 	if (entry)
1835 		kref_get(&entry->ref);
1836 	else
1837 		ret = -ENOMEM;
1838 	mutex_unlock(&vgpu->cache_lock);
1839 
1840 	return ret;
1841 }
1842 
1843 static void __gvt_dma_release(struct kref *ref)
1844 {
1845 	struct gvt_dma *entry = container_of(ref, typeof(*entry), ref);
1846 
1847 	gvt_dma_unmap_page(entry->vgpu, entry->gfn, entry->dma_addr,
1848 			   entry->size);
1849 	__gvt_cache_remove_entry(entry->vgpu, entry);
1850 }
1851 
1852 void intel_gvt_dma_unmap_guest_page(struct intel_vgpu *vgpu,
1853 		dma_addr_t dma_addr)
1854 {
1855 	struct gvt_dma *entry;
1856 
1857 	if (!vgpu->attached)
1858 		return;
1859 
1860 	mutex_lock(&vgpu->cache_lock);
1861 	entry = __gvt_cache_find_dma_addr(vgpu, dma_addr);
1862 	if (entry)
1863 		kref_put(&entry->ref, __gvt_dma_release);
1864 	mutex_unlock(&vgpu->cache_lock);
1865 }
1866 
1867 static void init_device_info(struct intel_gvt *gvt)
1868 {
1869 	struct intel_gvt_device_info *info = &gvt->device_info;
1870 	struct pci_dev *pdev = to_pci_dev(gvt->gt->i915->drm.dev);
1871 
1872 	info->max_support_vgpus = 8;
1873 	info->cfg_space_size = PCI_CFG_SPACE_EXP_SIZE;
1874 	info->mmio_size = 2 * 1024 * 1024;
1875 	info->mmio_bar = 0;
1876 	info->gtt_start_offset = 8 * 1024 * 1024;
1877 	info->gtt_entry_size = 8;
1878 	info->gtt_entry_size_shift = 3;
1879 	info->gmadr_bytes_in_cmd = 8;
1880 	info->max_surface_size = 36 * 1024 * 1024;
1881 	info->msi_cap_offset = pdev->msi_cap;
1882 }
1883 
1884 static void intel_gvt_test_and_emulate_vblank(struct intel_gvt *gvt)
1885 {
1886 	struct intel_vgpu *vgpu;
1887 	int id;
1888 
1889 	mutex_lock(&gvt->lock);
1890 	idr_for_each_entry((&(gvt)->vgpu_idr), (vgpu), (id)) {
1891 		if (test_and_clear_bit(INTEL_GVT_REQUEST_EMULATE_VBLANK + id,
1892 				       (void *)&gvt->service_request)) {
1893 			if (vgpu->active)
1894 				intel_vgpu_emulate_vblank(vgpu);
1895 		}
1896 	}
1897 	mutex_unlock(&gvt->lock);
1898 }
1899 
1900 static int gvt_service_thread(void *data)
1901 {
1902 	struct intel_gvt *gvt = (struct intel_gvt *)data;
1903 	int ret;
1904 
1905 	gvt_dbg_core("service thread start\n");
1906 
1907 	while (!kthread_should_stop()) {
1908 		ret = wait_event_interruptible(gvt->service_thread_wq,
1909 				kthread_should_stop() || gvt->service_request);
1910 
1911 		if (kthread_should_stop())
1912 			break;
1913 
1914 		if (WARN_ONCE(ret, "service thread was woken up by a signal.\n"))
1915 			continue;
1916 
1917 		intel_gvt_test_and_emulate_vblank(gvt);
1918 
1919 		if (test_bit(INTEL_GVT_REQUEST_SCHED,
1920 				(void *)&gvt->service_request) ||
1921 			test_bit(INTEL_GVT_REQUEST_EVENT_SCHED,
1922 					(void *)&gvt->service_request)) {
1923 			intel_gvt_schedule(gvt);
1924 		}
1925 	}
1926 
1927 	return 0;
1928 }
1929 
1930 static void clean_service_thread(struct intel_gvt *gvt)
1931 {
1932 	kthread_stop(gvt->service_thread);
1933 }
1934 
1935 static int init_service_thread(struct intel_gvt *gvt)
1936 {
1937 	init_waitqueue_head(&gvt->service_thread_wq);
1938 
1939 	gvt->service_thread = kthread_run(gvt_service_thread,
1940 			gvt, "gvt_service_thread");
1941 	if (IS_ERR(gvt->service_thread)) {
1942 		gvt_err("failed to start service thread.\n");
1943 		return PTR_ERR(gvt->service_thread);
1944 	}
1945 	return 0;
1946 }
1947 
1948 /**
1949  * intel_gvt_clean_device - clean a GVT device
1950  * @i915: i915 private
1951  *
1952  * This function is called at the driver unloading stage, to free the
1953  * resources owned by a GVT device.
1954  *
1955  */
1956 static void intel_gvt_clean_device(struct drm_i915_private *i915)
1957 {
1958 	struct intel_gvt *gvt = fetch_and_zero(&i915->gvt);
1959 
1960 	if (drm_WARN_ON(&i915->drm, !gvt))
1961 		return;
1962 
1963 	mdev_unregister_device(i915->drm.dev);
1964 	intel_gvt_cleanup_vgpu_type_groups(gvt);
1965 	intel_gvt_destroy_idle_vgpu(gvt->idle_vgpu);
1966 	intel_gvt_clean_vgpu_types(gvt);
1967 
1968 	intel_gvt_debugfs_clean(gvt);
1969 	clean_service_thread(gvt);
1970 	intel_gvt_clean_cmd_parser(gvt);
1971 	intel_gvt_clean_sched_policy(gvt);
1972 	intel_gvt_clean_workload_scheduler(gvt);
1973 	intel_gvt_clean_gtt(gvt);
1974 	intel_gvt_free_firmware(gvt);
1975 	intel_gvt_clean_mmio_info(gvt);
1976 	idr_destroy(&gvt->vgpu_idr);
1977 
1978 	kfree(i915->gvt);
1979 }
1980 
1981 /**
1982  * intel_gvt_init_device - initialize a GVT device
1983  * @i915: drm i915 private data
1984  *
1985  * This function is called at the initialization stage, to initialize
1986  * necessary GVT components.
1987  *
1988  * Returns:
1989  * Zero on success, negative error code if failed.
1990  *
1991  */
1992 static int intel_gvt_init_device(struct drm_i915_private *i915)
1993 {
1994 	struct intel_gvt *gvt;
1995 	struct intel_vgpu *vgpu;
1996 	int ret;
1997 
1998 	if (drm_WARN_ON(&i915->drm, i915->gvt))
1999 		return -EEXIST;
2000 
2001 	gvt = kzalloc(sizeof(struct intel_gvt), GFP_KERNEL);
2002 	if (!gvt)
2003 		return -ENOMEM;
2004 
2005 	gvt_dbg_core("init gvt device\n");
2006 
2007 	idr_init_base(&gvt->vgpu_idr, 1);
2008 	spin_lock_init(&gvt->scheduler.mmio_context_lock);
2009 	mutex_init(&gvt->lock);
2010 	mutex_init(&gvt->sched_lock);
2011 	gvt->gt = to_gt(i915);
2012 	i915->gvt = gvt;
2013 
2014 	init_device_info(gvt);
2015 
2016 	ret = intel_gvt_setup_mmio_info(gvt);
2017 	if (ret)
2018 		goto out_clean_idr;
2019 
2020 	intel_gvt_init_engine_mmio_context(gvt);
2021 
2022 	ret = intel_gvt_load_firmware(gvt);
2023 	if (ret)
2024 		goto out_clean_mmio_info;
2025 
2026 	ret = intel_gvt_init_irq(gvt);
2027 	if (ret)
2028 		goto out_free_firmware;
2029 
2030 	ret = intel_gvt_init_gtt(gvt);
2031 	if (ret)
2032 		goto out_free_firmware;
2033 
2034 	ret = intel_gvt_init_workload_scheduler(gvt);
2035 	if (ret)
2036 		goto out_clean_gtt;
2037 
2038 	ret = intel_gvt_init_sched_policy(gvt);
2039 	if (ret)
2040 		goto out_clean_workload_scheduler;
2041 
2042 	ret = intel_gvt_init_cmd_parser(gvt);
2043 	if (ret)
2044 		goto out_clean_sched_policy;
2045 
2046 	ret = init_service_thread(gvt);
2047 	if (ret)
2048 		goto out_clean_cmd_parser;
2049 
2050 	ret = intel_gvt_init_vgpu_types(gvt);
2051 	if (ret)
2052 		goto out_clean_thread;
2053 
2054 	vgpu = intel_gvt_create_idle_vgpu(gvt);
2055 	if (IS_ERR(vgpu)) {
2056 		ret = PTR_ERR(vgpu);
2057 		gvt_err("failed to create idle vgpu\n");
2058 		goto out_clean_types;
2059 	}
2060 	gvt->idle_vgpu = vgpu;
2061 
2062 	intel_gvt_debugfs_init(gvt);
2063 
2064 	ret = intel_gvt_init_vgpu_type_groups(gvt);
2065 	if (ret)
2066 		goto out_destroy_idle_vgpu;
2067 
2068 	ret = mdev_register_device(i915->drm.dev, &intel_vgpu_mdev_driver);
2069 	if (ret)
2070 		goto out_cleanup_vgpu_type_groups;
2071 
2072 	gvt_dbg_core("gvt device initialization is done\n");
2073 	return 0;
2074 
2075 out_cleanup_vgpu_type_groups:
2076 	intel_gvt_cleanup_vgpu_type_groups(gvt);
2077 out_destroy_idle_vgpu:
2078 	intel_gvt_destroy_idle_vgpu(gvt->idle_vgpu);
2079 	intel_gvt_debugfs_clean(gvt);
2080 out_clean_types:
2081 	intel_gvt_clean_vgpu_types(gvt);
2082 out_clean_thread:
2083 	clean_service_thread(gvt);
2084 out_clean_cmd_parser:
2085 	intel_gvt_clean_cmd_parser(gvt);
2086 out_clean_sched_policy:
2087 	intel_gvt_clean_sched_policy(gvt);
2088 out_clean_workload_scheduler:
2089 	intel_gvt_clean_workload_scheduler(gvt);
2090 out_clean_gtt:
2091 	intel_gvt_clean_gtt(gvt);
2092 out_free_firmware:
2093 	intel_gvt_free_firmware(gvt);
2094 out_clean_mmio_info:
2095 	intel_gvt_clean_mmio_info(gvt);
2096 out_clean_idr:
2097 	idr_destroy(&gvt->vgpu_idr);
2098 	kfree(gvt);
2099 	i915->gvt = NULL;
2100 	return ret;
2101 }
2102 
2103 static void intel_gvt_pm_resume(struct drm_i915_private *i915)
2104 {
2105 	struct intel_gvt *gvt = i915->gvt;
2106 
2107 	intel_gvt_restore_fence(gvt);
2108 	intel_gvt_restore_mmio(gvt);
2109 	intel_gvt_restore_ggtt(gvt);
2110 }
2111 
2112 static const struct intel_vgpu_ops intel_gvt_vgpu_ops = {
2113 	.init_device	= intel_gvt_init_device,
2114 	.clean_device	= intel_gvt_clean_device,
2115 	.pm_resume	= intel_gvt_pm_resume,
2116 };
2117 
2118 static int __init kvmgt_init(void)
2119 {
2120 	int ret;
2121 
2122 	ret = intel_gvt_set_ops(&intel_gvt_vgpu_ops);
2123 	if (ret)
2124 		return ret;
2125 
2126 	ret = mdev_register_driver(&intel_vgpu_mdev_driver);
2127 	if (ret)
2128 		intel_gvt_clear_ops(&intel_gvt_vgpu_ops);
2129 	return ret;
2130 }
2131 
2132 static void __exit kvmgt_exit(void)
2133 {
2134 	mdev_unregister_driver(&intel_vgpu_mdev_driver);
2135 	intel_gvt_clear_ops(&intel_gvt_vgpu_ops);
2136 }
2137 
2138 module_init(kvmgt_init);
2139 module_exit(kvmgt_exit);
2140 
2141 MODULE_LICENSE("GPL and additional rights");
2142 MODULE_AUTHOR("Intel Corporation");
2143