xref: /openbmc/linux/drivers/gpu/drm/i915/gvt/kvmgt.c (revision 12cecbf9)
1 /*
2  * KVMGT - the implementation of Intel mediated pass-through framework for KVM
3  *
4  * Copyright(c) 2011-2016 Intel Corporation. All rights reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23  * SOFTWARE.
24  *
25  * Authors:
26  *    Kevin Tian <kevin.tian@intel.com>
27  *    Jike Song <jike.song@intel.com>
28  *    Xiaoguang Chen <xiaoguang.chen@intel.com>
29  *    Eddie Dong <eddie.dong@intel.com>
30  *
31  * Contributors:
32  *    Niu Bing <bing.niu@intel.com>
33  *    Zhi Wang <zhi.a.wang@intel.com>
34  */
35 
36 #include <linux/init.h>
37 #include <linux/device.h>
38 #include <linux/mm.h>
39 #include <linux/kthread.h>
40 #include <linux/sched/mm.h>
41 #include <linux/types.h>
42 #include <linux/list.h>
43 #include <linux/rbtree.h>
44 #include <linux/spinlock.h>
45 #include <linux/eventfd.h>
46 #include <linux/uuid.h>
47 #include <linux/mdev.h>
48 #include <linux/debugfs.h>
49 
50 #include <linux/nospec.h>
51 
52 #include <drm/drm_edid.h>
53 
54 #include "i915_drv.h"
55 #include "intel_gvt.h"
56 #include "gvt.h"
57 
58 MODULE_IMPORT_NS(DMA_BUF);
59 MODULE_IMPORT_NS(I915_GVT);
60 
61 /* helper macros copied from vfio-pci */
62 #define VFIO_PCI_OFFSET_SHIFT   40
63 #define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
64 #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
65 #define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
66 
67 #define EDID_BLOB_OFFSET (PAGE_SIZE/2)
68 
69 #define OPREGION_SIGNATURE "IntelGraphicsMem"
70 
71 struct vfio_region;
72 struct intel_vgpu_regops {
73 	ssize_t (*rw)(struct intel_vgpu *vgpu, char *buf,
74 			size_t count, loff_t *ppos, bool iswrite);
75 	void (*release)(struct intel_vgpu *vgpu,
76 			struct vfio_region *region);
77 };
78 
79 struct vfio_region {
80 	u32				type;
81 	u32				subtype;
82 	size_t				size;
83 	u32				flags;
84 	const struct intel_vgpu_regops	*ops;
85 	void				*data;
86 };
87 
88 struct vfio_edid_region {
89 	struct vfio_region_gfx_edid vfio_edid_regs;
90 	void *edid_blob;
91 };
92 
93 struct kvmgt_pgfn {
94 	gfn_t gfn;
95 	struct hlist_node hnode;
96 };
97 
98 struct gvt_dma {
99 	struct intel_vgpu *vgpu;
100 	struct rb_node gfn_node;
101 	struct rb_node dma_addr_node;
102 	gfn_t gfn;
103 	dma_addr_t dma_addr;
104 	unsigned long size;
105 	struct kref ref;
106 };
107 
108 #define vfio_dev_to_vgpu(vfio_dev) \
109 	container_of((vfio_dev), struct intel_vgpu, vfio_device)
110 
111 static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
112 		const u8 *val, int len,
113 		struct kvm_page_track_notifier_node *node);
114 static void kvmgt_page_track_flush_slot(struct kvm *kvm,
115 		struct kvm_memory_slot *slot,
116 		struct kvm_page_track_notifier_node *node);
117 
118 static ssize_t available_instances_show(struct mdev_type *mtype,
119 					struct mdev_type_attribute *attr,
120 					char *buf)
121 {
122 	struct intel_vgpu_type *type;
123 	unsigned int num = 0;
124 	struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt;
125 
126 	type = &gvt->types[mtype_get_type_group_id(mtype)];
127 	if (!type)
128 		num = 0;
129 	else
130 		num = type->avail_instance;
131 
132 	return sprintf(buf, "%u\n", num);
133 }
134 
135 static ssize_t device_api_show(struct mdev_type *mtype,
136 			       struct mdev_type_attribute *attr, char *buf)
137 {
138 	return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
139 }
140 
141 static ssize_t description_show(struct mdev_type *mtype,
142 				struct mdev_type_attribute *attr, char *buf)
143 {
144 	struct intel_vgpu_type *type;
145 	struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt;
146 
147 	type = &gvt->types[mtype_get_type_group_id(mtype)];
148 	if (!type)
149 		return 0;
150 
151 	return sprintf(buf, "low_gm_size: %dMB\nhigh_gm_size: %dMB\n"
152 		       "fence: %d\nresolution: %s\n"
153 		       "weight: %d\n",
154 		       BYTES_TO_MB(type->low_gm_size),
155 		       BYTES_TO_MB(type->high_gm_size),
156 		       type->fence, vgpu_edid_str(type->resolution),
157 		       type->weight);
158 }
159 
160 static ssize_t name_show(struct mdev_type *mtype,
161 			 struct mdev_type_attribute *attr, char *buf)
162 {
163 	struct intel_vgpu_type *type;
164 	struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt;
165 
166 	type = &gvt->types[mtype_get_type_group_id(mtype)];
167 	if (!type)
168 		return 0;
169 
170 	return sprintf(buf, "%s\n", type->name);
171 }
172 
173 static MDEV_TYPE_ATTR_RO(available_instances);
174 static MDEV_TYPE_ATTR_RO(device_api);
175 static MDEV_TYPE_ATTR_RO(description);
176 static MDEV_TYPE_ATTR_RO(name);
177 
178 static struct attribute *gvt_type_attrs[] = {
179 	&mdev_type_attr_available_instances.attr,
180 	&mdev_type_attr_device_api.attr,
181 	&mdev_type_attr_description.attr,
182 	&mdev_type_attr_name.attr,
183 	NULL,
184 };
185 
186 static struct attribute_group *gvt_vgpu_type_groups[] = {
187 	[0 ... NR_MAX_INTEL_VGPU_TYPES - 1] = NULL,
188 };
189 
190 static int intel_gvt_init_vgpu_type_groups(struct intel_gvt *gvt)
191 {
192 	int i, j;
193 	struct intel_vgpu_type *type;
194 	struct attribute_group *group;
195 
196 	for (i = 0; i < gvt->num_types; i++) {
197 		type = &gvt->types[i];
198 
199 		group = kzalloc(sizeof(struct attribute_group), GFP_KERNEL);
200 		if (!group)
201 			goto unwind;
202 
203 		group->name = type->name;
204 		group->attrs = gvt_type_attrs;
205 		gvt_vgpu_type_groups[i] = group;
206 	}
207 
208 	return 0;
209 
210 unwind:
211 	for (j = 0; j < i; j++) {
212 		group = gvt_vgpu_type_groups[j];
213 		kfree(group);
214 	}
215 
216 	return -ENOMEM;
217 }
218 
219 static void intel_gvt_cleanup_vgpu_type_groups(struct intel_gvt *gvt)
220 {
221 	int i;
222 	struct attribute_group *group;
223 
224 	for (i = 0; i < gvt->num_types; i++) {
225 		group = gvt_vgpu_type_groups[i];
226 		gvt_vgpu_type_groups[i] = NULL;
227 		kfree(group);
228 	}
229 }
230 
231 static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
232 		unsigned long size)
233 {
234 	vfio_unpin_pages(&vgpu->vfio_device, gfn << PAGE_SHIFT,
235 			 DIV_ROUND_UP(size, PAGE_SIZE));
236 }
237 
238 /* Pin a normal or compound guest page for dma. */
239 static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
240 		unsigned long size, struct page **page)
241 {
242 	int total_pages = DIV_ROUND_UP(size, PAGE_SIZE);
243 	struct page *base_page = NULL;
244 	int npage;
245 	int ret;
246 
247 	/*
248 	 * We pin the pages one by one to avoid allocating a big array
249 	 * on the stack to hold pfns.
250 	 */
251 	for (npage = 0; npage < total_pages; npage++) {
252 		dma_addr_t cur_iova = (gfn + npage) << PAGE_SHIFT;
253 		struct page *cur_page;
254 
255 		ret = vfio_pin_pages(&vgpu->vfio_device, cur_iova, 1,
256 				     IOMMU_READ | IOMMU_WRITE, &cur_page);
257 		if (ret != 1) {
258 			gvt_vgpu_err("vfio_pin_pages failed for iova %pad, ret %d\n",
259 				     &cur_iova, ret);
260 			goto err;
261 		}
262 
263 		if (npage == 0)
264 			base_page = cur_page;
265 		else if (base_page + npage != cur_page) {
266 			gvt_vgpu_err("The pages are not contiguous\n");
267 			ret = -EINVAL;
268 			npage++;
269 			goto err;
270 		}
271 	}
272 
273 	*page = base_page;
274 	return 0;
275 err:
276 	gvt_unpin_guest_page(vgpu, gfn, npage * PAGE_SIZE);
277 	return ret;
278 }
279 
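/* Pin the guest page(s) backing @gfn and set up a DMA mapping of @size bytes for them. */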
280 static int gvt_dma_map_page(struct intel_vgpu *vgpu, unsigned long gfn,
281 		dma_addr_t *dma_addr, unsigned long size)
282 {
283 	struct device *dev = vgpu->gvt->gt->i915->drm.dev;
284 	struct page *page = NULL;
285 	int ret;
286 
287 	ret = gvt_pin_guest_page(vgpu, gfn, size, &page);
288 	if (ret)
289 		return ret;
290 
291 	/* Setup DMA mapping. */
292 	*dma_addr = dma_map_page(dev, page, 0, size, DMA_BIDIRECTIONAL);
293 	if (dma_mapping_error(dev, *dma_addr)) {
294 		gvt_vgpu_err("DMA mapping failed for pfn 0x%lx\n",
295 			     page_to_pfn(page));
296 		gvt_unpin_guest_page(vgpu, gfn, size);
297 		return -ENOMEM;
298 	}
299 
300 	return 0;
301 }
302 
303 static void gvt_dma_unmap_page(struct intel_vgpu *vgpu, unsigned long gfn,
304 		dma_addr_t dma_addr, unsigned long size)
305 {
306 	struct device *dev = vgpu->gvt->gt->i915->drm.dev;
307 
308 	dma_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL);
309 	gvt_unpin_guest_page(vgpu, gfn, size);
310 }
311 
312 static struct gvt_dma *__gvt_cache_find_dma_addr(struct intel_vgpu *vgpu,
313 		dma_addr_t dma_addr)
314 {
315 	struct rb_node *node = vgpu->dma_addr_cache.rb_node;
316 	struct gvt_dma *itr;
317 
318 	while (node) {
319 		itr = rb_entry(node, struct gvt_dma, dma_addr_node);
320 
321 		if (dma_addr < itr->dma_addr)
322 			node = node->rb_left;
323 		else if (dma_addr > itr->dma_addr)
324 			node = node->rb_right;
325 		else
326 			return itr;
327 	}
328 	return NULL;
329 }
330 
331 static struct gvt_dma *__gvt_cache_find_gfn(struct intel_vgpu *vgpu, gfn_t gfn)
332 {
333 	struct rb_node *node = vgpu->gfn_cache.rb_node;
334 	struct gvt_dma *itr;
335 
336 	while (node) {
337 		itr = rb_entry(node, struct gvt_dma, gfn_node);
338 
339 		if (gfn < itr->gfn)
340 			node = node->rb_left;
341 		else if (gfn > itr->gfn)
342 			node = node->rb_right;
343 		else
344 			return itr;
345 	}
346 	return NULL;
347 }
348 
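/*
 * Insert a new pinned-page mapping into both per-vGPU caches: the rb-tree
 * keyed by gfn and the rb-tree keyed by DMA address. Called with
 * vgpu->cache_lock held.
 */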
349 static int __gvt_cache_add(struct intel_vgpu *vgpu, gfn_t gfn,
350 		dma_addr_t dma_addr, unsigned long size)
351 {
352 	struct gvt_dma *new, *itr;
353 	struct rb_node **link, *parent = NULL;
354 
355 	new = kzalloc(sizeof(struct gvt_dma), GFP_KERNEL);
356 	if (!new)
357 		return -ENOMEM;
358 
359 	new->vgpu = vgpu;
360 	new->gfn = gfn;
361 	new->dma_addr = dma_addr;
362 	new->size = size;
363 	kref_init(&new->ref);
364 
365 	/* gfn_cache maps gfn to struct gvt_dma. */
366 	link = &vgpu->gfn_cache.rb_node;
367 	while (*link) {
368 		parent = *link;
369 		itr = rb_entry(parent, struct gvt_dma, gfn_node);
370 
371 		if (gfn < itr->gfn)
372 			link = &parent->rb_left;
373 		else
374 			link = &parent->rb_right;
375 	}
376 	rb_link_node(&new->gfn_node, parent, link);
377 	rb_insert_color(&new->gfn_node, &vgpu->gfn_cache);
378 
379 	/* dma_addr_cache maps dma addr to struct gvt_dma. */
380 	parent = NULL;
381 	link = &vgpu->dma_addr_cache.rb_node;
382 	while (*link) {
383 		parent = *link;
384 		itr = rb_entry(parent, struct gvt_dma, dma_addr_node);
385 
386 		if (dma_addr < itr->dma_addr)
387 			link = &parent->rb_left;
388 		else
389 			link = &parent->rb_right;
390 	}
391 	rb_link_node(&new->dma_addr_node, parent, link);
392 	rb_insert_color(&new->dma_addr_node, &vgpu->dma_addr_cache);
393 
394 	vgpu->nr_cache_entries++;
395 	return 0;
396 }
397 
398 static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu,
399 				struct gvt_dma *entry)
400 {
401 	rb_erase(&entry->gfn_node, &vgpu->gfn_cache);
402 	rb_erase(&entry->dma_addr_node, &vgpu->dma_addr_cache);
403 	kfree(entry);
404 	vgpu->nr_cache_entries--;
405 }
406 
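/*
 * Drop every cached mapping of a vGPU, unpinning the guest pages and
 * releasing their DMA mappings. Called from intel_vgpu_close_device().
 */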
407 static void gvt_cache_destroy(struct intel_vgpu *vgpu)
408 {
409 	struct gvt_dma *dma;
410 	struct rb_node *node = NULL;
411 
412 	for (;;) {
413 		mutex_lock(&vgpu->cache_lock);
414 		node = rb_first(&vgpu->gfn_cache);
415 		if (!node) {
416 			mutex_unlock(&vgpu->cache_lock);
417 			break;
418 		}
419 		dma = rb_entry(node, struct gvt_dma, gfn_node);
420 		gvt_dma_unmap_page(vgpu, dma->gfn, dma->dma_addr, dma->size);
421 		__gvt_cache_remove_entry(vgpu, dma);
422 		mutex_unlock(&vgpu->cache_lock);
423 	}
424 }
425 
426 static void gvt_cache_init(struct intel_vgpu *vgpu)
427 {
428 	vgpu->gfn_cache = RB_ROOT;
429 	vgpu->dma_addr_cache = RB_ROOT;
430 	vgpu->nr_cache_entries = 0;
431 	mutex_init(&vgpu->cache_lock);
432 }
433 
434 static void kvmgt_protect_table_init(struct intel_vgpu *info)
435 {
436 	hash_init(info->ptable);
437 }
438 
439 static void kvmgt_protect_table_destroy(struct intel_vgpu *info)
440 {
441 	struct kvmgt_pgfn *p;
442 	struct hlist_node *tmp;
443 	int i;
444 
445 	hash_for_each_safe(info->ptable, i, tmp, p, hnode) {
446 		hash_del(&p->hnode);
447 		kfree(p);
448 	}
449 }
450 
451 static struct kvmgt_pgfn *
452 __kvmgt_protect_table_find(struct intel_vgpu *info, gfn_t gfn)
453 {
454 	struct kvmgt_pgfn *p, *res = NULL;
455 
456 	hash_for_each_possible(info->ptable, p, hnode, gfn) {
457 		if (gfn == p->gfn) {
458 			res = p;
459 			break;
460 		}
461 	}
462 
463 	return res;
464 }
465 
466 static bool kvmgt_gfn_is_write_protected(struct intel_vgpu *info, gfn_t gfn)
467 {
468 	struct kvmgt_pgfn *p;
469 
470 	p = __kvmgt_protect_table_find(info, gfn);
471 	return !!p;
472 }
473 
474 static void kvmgt_protect_table_add(struct intel_vgpu *info, gfn_t gfn)
475 {
476 	struct kvmgt_pgfn *p;
477 
478 	if (kvmgt_gfn_is_write_protected(info, gfn))
479 		return;
480 
481 	p = kzalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC);
482 	if (WARN(!p, "gfn: 0x%llx\n", gfn))
483 		return;
484 
485 	p->gfn = gfn;
486 	hash_add(info->ptable, &p->hnode, gfn);
487 }
488 
489 static void kvmgt_protect_table_del(struct intel_vgpu *info, gfn_t gfn)
490 {
491 	struct kvmgt_pgfn *p;
492 
493 	p = __kvmgt_protect_table_find(info, gfn);
494 	if (p) {
495 		hash_del(&p->hnode);
496 		kfree(p);
497 	}
498 }
499 
500 static ssize_t intel_vgpu_reg_rw_opregion(struct intel_vgpu *vgpu, char *buf,
501 		size_t count, loff_t *ppos, bool iswrite)
502 {
503 	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
504 			VFIO_PCI_NUM_REGIONS;
505 	void *base = vgpu->region[i].data;
506 	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
507 
508 
509 	if (pos >= vgpu->region[i].size || iswrite) {
510 		gvt_vgpu_err("invalid op or offset for Intel vgpu OpRegion\n");
511 		return -EINVAL;
512 	}
513 	count = min(count, (size_t)(vgpu->region[i].size - pos));
514 	memcpy(buf, base + pos, count);
515 
516 	return count;
517 }
518 
519 static void intel_vgpu_reg_release_opregion(struct intel_vgpu *vgpu,
520 		struct vfio_region *region)
521 {
522 }
523 
524 static const struct intel_vgpu_regops intel_vgpu_regops_opregion = {
525 	.rw = intel_vgpu_reg_rw_opregion,
526 	.release = intel_vgpu_reg_release_opregion,
527 };
528 
529 static int handle_edid_regs(struct intel_vgpu *vgpu,
530 			struct vfio_edid_region *region, char *buf,
531 			size_t count, u16 offset, bool is_write)
532 {
533 	struct vfio_region_gfx_edid *regs = &region->vfio_edid_regs;
534 	unsigned int data;
535 
536 	if (offset + count > sizeof(*regs))
537 		return -EINVAL;
538 
539 	if (count != 4)
540 		return -EINVAL;
541 
542 	if (is_write) {
543 		data = *((unsigned int *)buf);
544 		switch (offset) {
545 		case offsetof(struct vfio_region_gfx_edid, link_state):
546 			if (data == VFIO_DEVICE_GFX_LINK_STATE_UP) {
547 				if (!drm_edid_block_valid(
548 					(u8 *)region->edid_blob,
549 					0,
550 					true,
551 					NULL)) {
552 					gvt_vgpu_err("invalid EDID blob\n");
553 					return -EINVAL;
554 				}
555 				intel_vgpu_emulate_hotplug(vgpu, true);
556 			} else if (data == VFIO_DEVICE_GFX_LINK_STATE_DOWN)
557 				intel_vgpu_emulate_hotplug(vgpu, false);
558 			else {
559 			gvt_vgpu_err("invalid EDID link state %d\n",
560 				data);
561 				return -EINVAL;
562 			}
563 			regs->link_state = data;
564 			break;
565 		case offsetof(struct vfio_region_gfx_edid, edid_size):
566 			if (data > regs->edid_max_size) {
567 				gvt_vgpu_err("EDID size is bigger than %d!\n",
568 					regs->edid_max_size);
569 				return -EINVAL;
570 			}
571 			regs->edid_size = data;
572 			break;
573 		default:
574 			/* read-only regs */
575 			gvt_vgpu_err("write to read-only EDID region at offset %d\n",
576 				offset);
577 			return -EPERM;
578 		}
579 	} else {
580 		memcpy(buf, (char *)regs + offset, count);
581 	}
582 
583 	return count;
584 }
585 
586 static int handle_edid_blob(struct vfio_edid_region *region, char *buf,
587 			size_t count, u16 offset, bool is_write)
588 {
589 	if (offset + count > region->vfio_edid_regs.edid_size)
590 		return -EINVAL;
591 
592 	if (is_write)
593 		memcpy(region->edid_blob + offset, buf, count);
594 	else
595 		memcpy(buf, region->edid_blob + offset, count);
596 
597 	return count;
598 }
599 
600 static ssize_t intel_vgpu_reg_rw_edid(struct intel_vgpu *vgpu, char *buf,
601 		size_t count, loff_t *ppos, bool iswrite)
602 {
603 	int ret;
604 	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
605 			VFIO_PCI_NUM_REGIONS;
606 	struct vfio_edid_region *region = vgpu->region[i].data;
607 	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
608 
609 	if (pos < region->vfio_edid_regs.edid_offset) {
610 		ret = handle_edid_regs(vgpu, region, buf, count, pos, iswrite);
611 	} else {
612 		pos -= EDID_BLOB_OFFSET;
613 		ret = handle_edid_blob(region, buf, count, pos, iswrite);
614 	}
615 
616 	if (ret < 0)
617 		gvt_vgpu_err("failed to access EDID region\n");
618 
619 	return ret;
620 }
621 
622 static void intel_vgpu_reg_release_edid(struct intel_vgpu *vgpu,
623 					struct vfio_region *region)
624 {
625 	kfree(region->data);
626 }
627 
628 static const struct intel_vgpu_regops intel_vgpu_regops_edid = {
629 	.rw = intel_vgpu_reg_rw_edid,
630 	.release = intel_vgpu_reg_release_edid,
631 };
632 
633 static int intel_vgpu_register_reg(struct intel_vgpu *vgpu,
634 		unsigned int type, unsigned int subtype,
635 		const struct intel_vgpu_regops *ops,
636 		size_t size, u32 flags, void *data)
637 {
638 	struct vfio_region *region;
639 
640 	region = krealloc(vgpu->region,
641 			(vgpu->num_regions + 1) * sizeof(*region),
642 			GFP_KERNEL);
643 	if (!region)
644 		return -ENOMEM;
645 
646 	vgpu->region = region;
647 	vgpu->region[vgpu->num_regions].type = type;
648 	vgpu->region[vgpu->num_regions].subtype = subtype;
649 	vgpu->region[vgpu->num_regions].ops = ops;
650 	vgpu->region[vgpu->num_regions].size = size;
651 	vgpu->region[vgpu->num_regions].flags = flags;
652 	vgpu->region[vgpu->num_regions].data = data;
653 	vgpu->num_regions++;
654 	return 0;
655 }
656 
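/**
 * intel_gvt_set_opregion - expose the vGPU OpRegion as a VFIO device region
 * @vgpu: a vGPU
 *
 * Returns:
 * Zero on success, negative error code if failed.
 */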
657 int intel_gvt_set_opregion(struct intel_vgpu *vgpu)
658 {
659 	void *base;
660 	int ret;
661 
662 	/* Each vgpu has its own opregion, although VFIO would create another
663 	 * one later. This one is used to expose the opregion to VFIO; the one
664 	 * created by VFIO later is the one the guest actually uses.
665 	 */
666 	base = vgpu_opregion(vgpu)->va;
667 	if (!base)
668 		return -ENOMEM;
669 
670 	if (memcmp(base, OPREGION_SIGNATURE, 16)) {
671 		memunmap(base);
672 		return -EINVAL;
673 	}
674 
675 	ret = intel_vgpu_register_reg(vgpu,
676 			PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
677 			VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
678 			&intel_vgpu_regops_opregion, OPREGION_SIZE,
679 			VFIO_REGION_INFO_FLAG_READ, base);
680 
681 	return ret;
682 }
683 
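/**
 * intel_gvt_set_edid - expose the virtual EDID of a display port as a VFIO region
 * @vgpu: a vGPU
 * @port_num: the display port number
 *
 * Returns:
 * Zero on success, negative error code if failed.
 */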
684 int intel_gvt_set_edid(struct intel_vgpu *vgpu, int port_num)
685 {
686 	struct intel_vgpu_port *port = intel_vgpu_port(vgpu, port_num);
687 	struct vfio_edid_region *base;
688 	int ret;
689 
690 	base = kzalloc(sizeof(*base), GFP_KERNEL);
691 	if (!base)
692 		return -ENOMEM;
693 
694 	/* TODO: Add multi-port and EDID extension block support */
695 	base->vfio_edid_regs.edid_offset = EDID_BLOB_OFFSET;
696 	base->vfio_edid_regs.edid_max_size = EDID_SIZE;
697 	base->vfio_edid_regs.edid_size = EDID_SIZE;
698 	base->vfio_edid_regs.max_xres = vgpu_edid_xres(port->id);
699 	base->vfio_edid_regs.max_yres = vgpu_edid_yres(port->id);
700 	base->edid_blob = port->edid->edid_block;
701 
702 	ret = intel_vgpu_register_reg(vgpu,
703 			VFIO_REGION_TYPE_GFX,
704 			VFIO_REGION_SUBTYPE_GFX_EDID,
705 			&intel_vgpu_regops_edid, EDID_SIZE,
706 			VFIO_REGION_INFO_FLAG_READ |
707 			VFIO_REGION_INFO_FLAG_WRITE |
708 			VFIO_REGION_INFO_FLAG_CAPS, base);
709 
710 	return ret;
711 }
712 
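/*
 * VFIO dma_unmap callback: tear down any cached mapping whose gfn falls
 * inside the unmapped IOVA range, so the affected guest pages get unpinned.
 */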
713 static void intel_vgpu_dma_unmap(struct vfio_device *vfio_dev, u64 iova,
714 				 u64 length)
715 {
716 	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
717 	struct gvt_dma *entry;
718 	u64 iov_pfn = iova >> PAGE_SHIFT;
719 	u64 end_iov_pfn = iov_pfn + length / PAGE_SIZE;
720 
721 	mutex_lock(&vgpu->cache_lock);
722 	for (; iov_pfn < end_iov_pfn; iov_pfn++) {
723 		entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
724 		if (!entry)
725 			continue;
726 
727 		gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr,
728 				   entry->size);
729 		__gvt_cache_remove_entry(vgpu, entry);
730 	}
731 	mutex_unlock(&vgpu->cache_lock);
732 }
733 
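/* Return true if another attached vGPU is already bound to the same KVM instance. */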
734 static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu)
735 {
736 	struct intel_vgpu *itr;
737 	int id;
738 	bool ret = false;
739 
740 	mutex_lock(&vgpu->gvt->lock);
741 	for_each_active_vgpu(vgpu->gvt, itr, id) {
742 		if (!itr->attached)
743 			continue;
744 
745 		if (vgpu->vfio_device.kvm == itr->vfio_device.kvm) {
746 			ret = true;
747 			goto out;
748 		}
749 	}
750 out:
751 	mutex_unlock(&vgpu->gvt->lock);
752 	return ret;
753 }
754 
755 static int intel_vgpu_open_device(struct vfio_device *vfio_dev)
756 {
757 	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
758 
759 	if (vgpu->attached)
760 		return -EEXIST;
761 
762 	if (!vgpu->vfio_device.kvm ||
763 	    vgpu->vfio_device.kvm->mm != current->mm) {
764 		gvt_vgpu_err("KVM is required to use Intel vGPU\n");
765 		return -ESRCH;
766 	}
767 
768 	if (__kvmgt_vgpu_exist(vgpu))
769 		return -EEXIST;
770 
771 	kvm_get_kvm(vgpu->vfio_device.kvm);
772 
773 	vgpu->attached = true;
774 
775 	kvmgt_protect_table_init(vgpu);
776 	gvt_cache_init(vgpu);
777 
778 	vgpu->track_node.track_write = kvmgt_page_track_write;
779 	vgpu->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
780 	kvm_page_track_register_notifier(vgpu->vfio_device.kvm,
781 					 &vgpu->track_node);
782 
783 	debugfs_create_ulong(KVMGT_DEBUGFS_FILENAME, 0444, vgpu->debugfs,
784 			     &vgpu->nr_cache_entries);
785 
786 	intel_gvt_activate_vgpu(vgpu);
787 
788 	atomic_set(&vgpu->released, 0);
789 	return 0;
790 }
791 
792 static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu)
793 {
794 	struct eventfd_ctx *trigger;
795 
796 	trigger = vgpu->msi_trigger;
797 	if (trigger) {
798 		eventfd_ctx_put(trigger);
799 		vgpu->msi_trigger = NULL;
800 	}
801 }
802 
803 static void intel_vgpu_close_device(struct vfio_device *vfio_dev)
804 {
805 	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
806 
807 	if (!vgpu->attached)
808 		return;
809 
810 	if (atomic_cmpxchg(&vgpu->released, 0, 1))
811 		return;
812 
813 	intel_gvt_release_vgpu(vgpu);
814 
815 	debugfs_remove(debugfs_lookup(KVMGT_DEBUGFS_FILENAME, vgpu->debugfs));
816 
817 	kvm_page_track_unregister_notifier(vgpu->vfio_device.kvm,
818 					   &vgpu->track_node);
819 	kvmgt_protect_table_destroy(vgpu);
820 	gvt_cache_destroy(vgpu);
821 
822 	intel_vgpu_release_msi_eventfd_ctx(vgpu);
823 
824 	vgpu->attached = false;
825 
826 	if (vgpu->vfio_device.kvm)
827 		kvm_put_kvm(vgpu->vfio_device.kvm);
828 }
829 
830 static u64 intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar)
831 {
832 	u32 start_lo, start_hi;
833 	u32 mem_type;
834 
835 	start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
836 			PCI_BASE_ADDRESS_MEM_MASK;
837 	mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
838 			PCI_BASE_ADDRESS_MEM_TYPE_MASK;
839 
840 	switch (mem_type) {
841 	case PCI_BASE_ADDRESS_MEM_TYPE_64:
842 		start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space
843 						+ bar + 4));
844 		break;
845 	case PCI_BASE_ADDRESS_MEM_TYPE_32:
846 	case PCI_BASE_ADDRESS_MEM_TYPE_1M:
847 		/* 1M mem BAR treated as 32-bit BAR */
848 	default:
849 		/* unknown mem type treated as 32-bit BAR */
850 		start_hi = 0;
851 		break;
852 	}
853 
854 	return ((u64)start_hi << 32) | start_lo;
855 }
856 
857 static int intel_vgpu_bar_rw(struct intel_vgpu *vgpu, int bar, u64 off,
858 			     void *buf, unsigned int count, bool is_write)
859 {
860 	u64 bar_start = intel_vgpu_get_bar_addr(vgpu, bar);
861 	int ret;
862 
863 	if (is_write)
864 		ret = intel_vgpu_emulate_mmio_write(vgpu,
865 					bar_start + off, buf, count);
866 	else
867 		ret = intel_vgpu_emulate_mmio_read(vgpu,
868 					bar_start + off, buf, count);
869 	return ret;
870 }
871 
872 static inline bool intel_vgpu_in_aperture(struct intel_vgpu *vgpu, u64 off)
873 {
874 	return off >= vgpu_aperture_offset(vgpu) &&
875 	       off < vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu);
876 }
877 
878 static int intel_vgpu_aperture_rw(struct intel_vgpu *vgpu, u64 off,
879 		void *buf, unsigned long count, bool is_write)
880 {
881 	void __iomem *aperture_va;
882 
883 	if (!intel_vgpu_in_aperture(vgpu, off) ||
884 	    !intel_vgpu_in_aperture(vgpu, off + count)) {
885 		gvt_vgpu_err("Invalid aperture offset %llu\n", off);
886 		return -EINVAL;
887 	}
888 
889 	aperture_va = io_mapping_map_wc(&vgpu->gvt->gt->ggtt->iomap,
890 					ALIGN_DOWN(off, PAGE_SIZE),
891 					count + offset_in_page(off));
892 	if (!aperture_va)
893 		return -EIO;
894 
895 	if (is_write)
896 		memcpy_toio(aperture_va + offset_in_page(off), buf, count);
897 	else
898 		memcpy_fromio(buf, aperture_va + offset_in_page(off), count);
899 
900 	io_mapping_unmap(aperture_va);
901 
902 	return 0;
903 }
904 
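/*
 * Dispatch a read or write on the vGPU to the proper emulation path, based
 * on the VFIO region index encoded in *ppos: PCI config space, BAR0 MMIO,
 * the BAR2 aperture, or one of the device-specific regions.
 */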
905 static ssize_t intel_vgpu_rw(struct intel_vgpu *vgpu, char *buf,
906 			size_t count, loff_t *ppos, bool is_write)
907 {
908 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
909 	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
910 	int ret = -EINVAL;
911 
912 
913 	if (index >= VFIO_PCI_NUM_REGIONS + vgpu->num_regions) {
914 		gvt_vgpu_err("invalid index: %u\n", index);
915 		return -EINVAL;
916 	}
917 
918 	switch (index) {
919 	case VFIO_PCI_CONFIG_REGION_INDEX:
920 		if (is_write)
921 			ret = intel_vgpu_emulate_cfg_write(vgpu, pos,
922 						buf, count);
923 		else
924 			ret = intel_vgpu_emulate_cfg_read(vgpu, pos,
925 						buf, count);
926 		break;
927 	case VFIO_PCI_BAR0_REGION_INDEX:
928 		ret = intel_vgpu_bar_rw(vgpu, PCI_BASE_ADDRESS_0, pos,
929 					buf, count, is_write);
930 		break;
931 	case VFIO_PCI_BAR2_REGION_INDEX:
932 		ret = intel_vgpu_aperture_rw(vgpu, pos, buf, count, is_write);
933 		break;
934 	case VFIO_PCI_BAR1_REGION_INDEX:
935 	case VFIO_PCI_BAR3_REGION_INDEX:
936 	case VFIO_PCI_BAR4_REGION_INDEX:
937 	case VFIO_PCI_BAR5_REGION_INDEX:
938 	case VFIO_PCI_VGA_REGION_INDEX:
939 	case VFIO_PCI_ROM_REGION_INDEX:
940 		break;
941 	default:
942 		if (index >= VFIO_PCI_NUM_REGIONS + vgpu->num_regions)
943 			return -EINVAL;
944 
945 		index -= VFIO_PCI_NUM_REGIONS;
946 		return vgpu->region[index].ops->rw(vgpu, buf, count,
947 				ppos, is_write);
948 	}
949 
950 	return ret == 0 ? count : ret;
951 }
952 
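/* Return true if the access at *ppos hits a GGTT entry in the MMIO BAR. */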
953 static bool gtt_entry(struct intel_vgpu *vgpu, loff_t *ppos)
954 {
955 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
956 	struct intel_gvt *gvt = vgpu->gvt;
957 	int offset;
958 
959 	/* Only allow MMIO GGTT entry access */
960 	if (index != VFIO_PCI_BAR0_REGION_INDEX)
961 		return false;
962 
963 	offset = (u64)(*ppos & VFIO_PCI_OFFSET_MASK) -
964 		intel_vgpu_get_bar_gpa(vgpu, PCI_BASE_ADDRESS_0);
965 
966 	return (offset >= gvt->device_info.gtt_start_offset &&
967 		offset < gvt->device_info.gtt_start_offset +
968 			gvt_ggtt_sz(gvt));
969 }
970 
971 static ssize_t intel_vgpu_read(struct vfio_device *vfio_dev, char __user *buf,
972 			size_t count, loff_t *ppos)
973 {
974 	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
975 	unsigned int done = 0;
976 	int ret;
977 
978 	while (count) {
979 		size_t filled;
980 
981 		/* Only 8-byte GGTT entry reads are supported */
982 		if (count >= 8 && !(*ppos % 8) &&
983 			gtt_entry(vgpu, ppos)) {
984 			u64 val;
985 
986 			ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
987 					ppos, false);
988 			if (ret <= 0)
989 				goto read_err;
990 
991 			if (copy_to_user(buf, &val, sizeof(val)))
992 				goto read_err;
993 
994 			filled = 8;
995 		} else if (count >= 4 && !(*ppos % 4)) {
996 			u32 val;
997 
998 			ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
999 					ppos, false);
1000 			if (ret <= 0)
1001 				goto read_err;
1002 
1003 			if (copy_to_user(buf, &val, sizeof(val)))
1004 				goto read_err;
1005 
1006 			filled = 4;
1007 		} else if (count >= 2 && !(*ppos % 2)) {
1008 			u16 val;
1009 
1010 			ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
1011 					ppos, false);
1012 			if (ret <= 0)
1013 				goto read_err;
1014 
1015 			if (copy_to_user(buf, &val, sizeof(val)))
1016 				goto read_err;
1017 
1018 			filled = 2;
1019 		} else {
1020 			u8 val;
1021 
1022 			ret = intel_vgpu_rw(vgpu, &val, sizeof(val), ppos,
1023 					false);
1024 			if (ret <= 0)
1025 				goto read_err;
1026 
1027 			if (copy_to_user(buf, &val, sizeof(val)))
1028 				goto read_err;
1029 
1030 			filled = 1;
1031 		}
1032 
1033 		count -= filled;
1034 		done += filled;
1035 		*ppos += filled;
1036 		buf += filled;
1037 	}
1038 
1039 	return done;
1040 
1041 read_err:
1042 	return -EFAULT;
1043 }
1044 
1045 static ssize_t intel_vgpu_write(struct vfio_device *vfio_dev,
1046 				const char __user *buf,
1047 				size_t count, loff_t *ppos)
1048 {
1049 	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
1050 	unsigned int done = 0;
1051 	int ret;
1052 
1053 	while (count) {
1054 		size_t filled;
1055 
1056 		/* Only 8-byte GGTT entry writes are supported */
1057 		if (count >= 8 && !(*ppos % 8) &&
1058 			gtt_entry(vgpu, ppos)) {
1059 			u64 val;
1060 
1061 			if (copy_from_user(&val, buf, sizeof(val)))
1062 				goto write_err;
1063 
1064 			ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
1065 					ppos, true);
1066 			if (ret <= 0)
1067 				goto write_err;
1068 
1069 			filled = 8;
1070 		} else if (count >= 4 && !(*ppos % 4)) {
1071 			u32 val;
1072 
1073 			if (copy_from_user(&val, buf, sizeof(val)))
1074 				goto write_err;
1075 
1076 			ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
1077 					ppos, true);
1078 			if (ret <= 0)
1079 				goto write_err;
1080 
1081 			filled = 4;
1082 		} else if (count >= 2 && !(*ppos % 2)) {
1083 			u16 val;
1084 
1085 			if (copy_from_user(&val, buf, sizeof(val)))
1086 				goto write_err;
1087 
1088 			ret = intel_vgpu_rw(vgpu, (char *)&val,
1089 					sizeof(val), ppos, true);
1090 			if (ret <= 0)
1091 				goto write_err;
1092 
1093 			filled = 2;
1094 		} else {
1095 			u8 val;
1096 
1097 			if (copy_from_user(&val, buf, sizeof(val)))
1098 				goto write_err;
1099 
1100 			ret = intel_vgpu_rw(vgpu, &val, sizeof(val),
1101 					ppos, true);
1102 			if (ret <= 0)
1103 				goto write_err;
1104 
1105 			filled = 1;
1106 		}
1107 
1108 		count -= filled;
1109 		done += filled;
1110 		*ppos += filled;
1111 		buf += filled;
1112 	}
1113 
1114 	return done;
1115 write_err:
1116 	return -EFAULT;
1117 }
1118 
1119 static int intel_vgpu_mmap(struct vfio_device *vfio_dev,
1120 		struct vm_area_struct *vma)
1121 {
1122 	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
1123 	unsigned int index;
1124 	u64 virtaddr;
1125 	unsigned long req_size, pgoff, req_start;
1126 	pgprot_t pg_prot;
1127 
1128 	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
1129 	if (index >= VFIO_PCI_ROM_REGION_INDEX)
1130 		return -EINVAL;
1131 
1132 	if (vma->vm_end < vma->vm_start)
1133 		return -EINVAL;
1134 	if ((vma->vm_flags & VM_SHARED) == 0)
1135 		return -EINVAL;
1136 	if (index != VFIO_PCI_BAR2_REGION_INDEX)
1137 		return -EINVAL;
1138 
1139 	pg_prot = vma->vm_page_prot;
1140 	virtaddr = vma->vm_start;
1141 	req_size = vma->vm_end - vma->vm_start;
1142 	pgoff = vma->vm_pgoff &
1143 		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
1144 	req_start = pgoff << PAGE_SHIFT;
1145 
1146 	if (!intel_vgpu_in_aperture(vgpu, req_start))
1147 		return -EINVAL;
1148 	if (req_start + req_size >
1149 	    vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu))
1150 		return -EINVAL;
1151 
1152 	pgoff = (gvt_aperture_pa_base(vgpu->gvt) >> PAGE_SHIFT) + pgoff;
1153 
1154 	return remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
1155 }
1156 
1157 static int intel_vgpu_get_irq_count(struct intel_vgpu *vgpu, int type)
1158 {
1159 	if (type == VFIO_PCI_INTX_IRQ_INDEX || type == VFIO_PCI_MSI_IRQ_INDEX)
1160 		return 1;
1161 
1162 	return 0;
1163 }
1164 
1165 static int intel_vgpu_set_intx_mask(struct intel_vgpu *vgpu,
1166 			unsigned int index, unsigned int start,
1167 			unsigned int count, u32 flags,
1168 			void *data)
1169 {
1170 	return 0;
1171 }
1172 
1173 static int intel_vgpu_set_intx_unmask(struct intel_vgpu *vgpu,
1174 			unsigned int index, unsigned int start,
1175 			unsigned int count, u32 flags, void *data)
1176 {
1177 	return 0;
1178 }
1179 
1180 static int intel_vgpu_set_intx_trigger(struct intel_vgpu *vgpu,
1181 		unsigned int index, unsigned int start, unsigned int count,
1182 		u32 flags, void *data)
1183 {
1184 	return 0;
1185 }
1186 
1187 static int intel_vgpu_set_msi_trigger(struct intel_vgpu *vgpu,
1188 		unsigned int index, unsigned int start, unsigned int count,
1189 		u32 flags, void *data)
1190 {
1191 	struct eventfd_ctx *trigger;
1192 
1193 	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
1194 		int fd = *(int *)data;
1195 
1196 		trigger = eventfd_ctx_fdget(fd);
1197 		if (IS_ERR(trigger)) {
1198 			gvt_vgpu_err("eventfd_ctx_fdget failed\n");
1199 			return PTR_ERR(trigger);
1200 		}
1201 		vgpu->msi_trigger = trigger;
1202 	} else if ((flags & VFIO_IRQ_SET_DATA_NONE) && !count)
1203 		intel_vgpu_release_msi_eventfd_ctx(vgpu);
1204 
1205 	return 0;
1206 }
1207 
1208 static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags,
1209 		unsigned int index, unsigned int start, unsigned int count,
1210 		void *data)
1211 {
1212 	int (*func)(struct intel_vgpu *vgpu, unsigned int index,
1213 			unsigned int start, unsigned int count, u32 flags,
1214 			void *data) = NULL;
1215 
1216 	switch (index) {
1217 	case VFIO_PCI_INTX_IRQ_INDEX:
1218 		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
1219 		case VFIO_IRQ_SET_ACTION_MASK:
1220 			func = intel_vgpu_set_intx_mask;
1221 			break;
1222 		case VFIO_IRQ_SET_ACTION_UNMASK:
1223 			func = intel_vgpu_set_intx_unmask;
1224 			break;
1225 		case VFIO_IRQ_SET_ACTION_TRIGGER:
1226 			func = intel_vgpu_set_intx_trigger;
1227 			break;
1228 		}
1229 		break;
1230 	case VFIO_PCI_MSI_IRQ_INDEX:
1231 		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
1232 		case VFIO_IRQ_SET_ACTION_MASK:
1233 		case VFIO_IRQ_SET_ACTION_UNMASK:
1234 			/* XXX Need masking support exported */
1235 			break;
1236 		case VFIO_IRQ_SET_ACTION_TRIGGER:
1237 			func = intel_vgpu_set_msi_trigger;
1238 			break;
1239 		}
1240 		break;
1241 	}
1242 
1243 	if (!func)
1244 		return -ENOTTY;
1245 
1246 	return func(vgpu, index, start, count, flags, data);
1247 }
1248 
1249 static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
1250 			     unsigned long arg)
1251 {
1252 	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
1253 	unsigned long minsz;
1254 
1255 	gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd);
1256 
1257 	if (cmd == VFIO_DEVICE_GET_INFO) {
1258 		struct vfio_device_info info;
1259 
1260 		minsz = offsetofend(struct vfio_device_info, num_irqs);
1261 
1262 		if (copy_from_user(&info, (void __user *)arg, minsz))
1263 			return -EFAULT;
1264 
1265 		if (info.argsz < minsz)
1266 			return -EINVAL;
1267 
1268 		info.flags = VFIO_DEVICE_FLAGS_PCI;
1269 		info.flags |= VFIO_DEVICE_FLAGS_RESET;
1270 		info.num_regions = VFIO_PCI_NUM_REGIONS +
1271 				vgpu->num_regions;
1272 		info.num_irqs = VFIO_PCI_NUM_IRQS;
1273 
1274 		return copy_to_user((void __user *)arg, &info, minsz) ?
1275 			-EFAULT : 0;
1276 
1277 	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
1278 		struct vfio_region_info info;
1279 		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
1280 		unsigned int i;
1281 		int ret;
1282 		struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
1283 		int nr_areas = 1;
1284 		int cap_type_id;
1285 
1286 		minsz = offsetofend(struct vfio_region_info, offset);
1287 
1288 		if (copy_from_user(&info, (void __user *)arg, minsz))
1289 			return -EFAULT;
1290 
1291 		if (info.argsz < minsz)
1292 			return -EINVAL;
1293 
1294 		switch (info.index) {
1295 		case VFIO_PCI_CONFIG_REGION_INDEX:
1296 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1297 			info.size = vgpu->gvt->device_info.cfg_space_size;
1298 			info.flags = VFIO_REGION_INFO_FLAG_READ |
1299 				     VFIO_REGION_INFO_FLAG_WRITE;
1300 			break;
1301 		case VFIO_PCI_BAR0_REGION_INDEX:
1302 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1303 			info.size = vgpu->cfg_space.bar[info.index].size;
1304 			if (!info.size) {
1305 				info.flags = 0;
1306 				break;
1307 			}
1308 
1309 			info.flags = VFIO_REGION_INFO_FLAG_READ |
1310 				     VFIO_REGION_INFO_FLAG_WRITE;
1311 			break;
1312 		case VFIO_PCI_BAR1_REGION_INDEX:
1313 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1314 			info.size = 0;
1315 			info.flags = 0;
1316 			break;
1317 		case VFIO_PCI_BAR2_REGION_INDEX:
1318 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1319 			info.flags = VFIO_REGION_INFO_FLAG_CAPS |
1320 					VFIO_REGION_INFO_FLAG_MMAP |
1321 					VFIO_REGION_INFO_FLAG_READ |
1322 					VFIO_REGION_INFO_FLAG_WRITE;
1323 			info.size = gvt_aperture_sz(vgpu->gvt);
1324 
1325 			sparse = kzalloc(struct_size(sparse, areas, nr_areas),
1326 					 GFP_KERNEL);
1327 			if (!sparse)
1328 				return -ENOMEM;
1329 
1330 			sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
1331 			sparse->header.version = 1;
1332 			sparse->nr_areas = nr_areas;
1333 			cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
1334 			sparse->areas[0].offset =
1335 					PAGE_ALIGN(vgpu_aperture_offset(vgpu));
1336 			sparse->areas[0].size = vgpu_aperture_sz(vgpu);
1337 			break;
1338 
1339 		case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
1340 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1341 			info.size = 0;
1342 			info.flags = 0;
1343 
1344 			gvt_dbg_core("get region info bar:%d\n", info.index);
1345 			break;
1346 
1347 		case VFIO_PCI_ROM_REGION_INDEX:
1348 		case VFIO_PCI_VGA_REGION_INDEX:
1349 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1350 			info.size = 0;
1351 			info.flags = 0;
1352 
1353 			gvt_dbg_core("get region info index:%d\n", info.index);
1354 			break;
1355 		default:
1356 			{
1357 				struct vfio_region_info_cap_type cap_type = {
1358 					.header.id = VFIO_REGION_INFO_CAP_TYPE,
1359 					.header.version = 1 };
1360 
1361 				if (info.index >= VFIO_PCI_NUM_REGIONS +
1362 						vgpu->num_regions)
1363 					return -EINVAL;
1364 				info.index =
1365 					array_index_nospec(info.index,
1366 							VFIO_PCI_NUM_REGIONS +
1367 							vgpu->num_regions);
1368 
1369 				i = info.index - VFIO_PCI_NUM_REGIONS;
1370 
1371 				info.offset =
1372 					VFIO_PCI_INDEX_TO_OFFSET(info.index);
1373 				info.size = vgpu->region[i].size;
1374 				info.flags = vgpu->region[i].flags;
1375 
1376 				cap_type.type = vgpu->region[i].type;
1377 				cap_type.subtype = vgpu->region[i].subtype;
1378 
1379 				ret = vfio_info_add_capability(&caps,
1380 							&cap_type.header,
1381 							sizeof(cap_type));
1382 				if (ret)
1383 					return ret;
1384 			}
1385 		}
1386 
1387 		if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
1388 			switch (cap_type_id) {
1389 			case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
1390 				ret = vfio_info_add_capability(&caps,
1391 					&sparse->header,
1392 					struct_size(sparse, areas,
1393 						    sparse->nr_areas));
1394 				if (ret) {
1395 					kfree(sparse);
1396 					return ret;
1397 				}
1398 				break;
1399 			default:
1400 				kfree(sparse);
1401 				return -EINVAL;
1402 			}
1403 		}
1404 
1405 		if (caps.size) {
1406 			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
1407 			if (info.argsz < sizeof(info) + caps.size) {
1408 				info.argsz = sizeof(info) + caps.size;
1409 				info.cap_offset = 0;
1410 			} else {
1411 				vfio_info_cap_shift(&caps, sizeof(info));
1412 				if (copy_to_user((void __user *)arg +
1413 						  sizeof(info), caps.buf,
1414 						  caps.size)) {
1415 					kfree(caps.buf);
1416 					kfree(sparse);
1417 					return -EFAULT;
1418 				}
1419 				info.cap_offset = sizeof(info);
1420 			}
1421 
1422 			kfree(caps.buf);
1423 		}
1424 
1425 		kfree(sparse);
1426 		return copy_to_user((void __user *)arg, &info, minsz) ?
1427 			-EFAULT : 0;
1428 	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
1429 		struct vfio_irq_info info;
1430 
1431 		minsz = offsetofend(struct vfio_irq_info, count);
1432 
1433 		if (copy_from_user(&info, (void __user *)arg, minsz))
1434 			return -EFAULT;
1435 
1436 		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
1437 			return -EINVAL;
1438 
1439 		switch (info.index) {
1440 		case VFIO_PCI_INTX_IRQ_INDEX:
1441 		case VFIO_PCI_MSI_IRQ_INDEX:
1442 			break;
1443 		default:
1444 			return -EINVAL;
1445 		}
1446 
1447 		info.flags = VFIO_IRQ_INFO_EVENTFD;
1448 
1449 		info.count = intel_vgpu_get_irq_count(vgpu, info.index);
1450 
1451 		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
1452 			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
1453 				       VFIO_IRQ_INFO_AUTOMASKED);
1454 		else
1455 			info.flags |= VFIO_IRQ_INFO_NORESIZE;
1456 
1457 		return copy_to_user((void __user *)arg, &info, minsz) ?
1458 			-EFAULT : 0;
1459 	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
1460 		struct vfio_irq_set hdr;
1461 		u8 *data = NULL;
1462 		int ret = 0;
1463 		size_t data_size = 0;
1464 
1465 		minsz = offsetofend(struct vfio_irq_set, count);
1466 
1467 		if (copy_from_user(&hdr, (void __user *)arg, minsz))
1468 			return -EFAULT;
1469 
1470 		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
1471 			int max = intel_vgpu_get_irq_count(vgpu, hdr.index);
1472 
1473 			ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
1474 						VFIO_PCI_NUM_IRQS, &data_size);
1475 			if (ret) {
1476 				gvt_vgpu_err("intel:vfio_set_irqs_validate_and_prepare failed\n");
1477 				return -EINVAL;
1478 			}
1479 			if (data_size) {
1480 				data = memdup_user((void __user *)(arg + minsz),
1481 						   data_size);
1482 				if (IS_ERR(data))
1483 					return PTR_ERR(data);
1484 			}
1485 		}
1486 
1487 		ret = intel_vgpu_set_irqs(vgpu, hdr.flags, hdr.index,
1488 					hdr.start, hdr.count, data);
1489 		kfree(data);
1490 
1491 		return ret;
1492 	} else if (cmd == VFIO_DEVICE_RESET) {
1493 		intel_gvt_reset_vgpu(vgpu);
1494 		return 0;
1495 	} else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) {
1496 		struct vfio_device_gfx_plane_info dmabuf;
1497 		int ret = 0;
1498 
1499 		minsz = offsetofend(struct vfio_device_gfx_plane_info,
1500 				    dmabuf_id);
1501 		if (copy_from_user(&dmabuf, (void __user *)arg, minsz))
1502 			return -EFAULT;
1503 		if (dmabuf.argsz < minsz)
1504 			return -EINVAL;
1505 
1506 		ret = intel_vgpu_query_plane(vgpu, &dmabuf);
1507 		if (ret != 0)
1508 			return ret;
1509 
1510 		return copy_to_user((void __user *)arg, &dmabuf, minsz) ?
1511 								-EFAULT : 0;
1512 	} else if (cmd == VFIO_DEVICE_GET_GFX_DMABUF) {
1513 		__u32 dmabuf_id;
1514 
1515 		if (get_user(dmabuf_id, (__u32 __user *)arg))
1516 			return -EFAULT;
1517 		return intel_vgpu_get_dmabuf(vgpu, dmabuf_id);
1518 	}
1519 
1520 	return -ENOTTY;
1521 }
1522 
1523 static ssize_t
1524 vgpu_id_show(struct device *dev, struct device_attribute *attr,
1525 	     char *buf)
1526 {
1527 	struct intel_vgpu *vgpu = dev_get_drvdata(dev);
1528 
1529 	return sprintf(buf, "%d\n", vgpu->id);
1530 }
1531 
1532 static DEVICE_ATTR_RO(vgpu_id);
1533 
1534 static struct attribute *intel_vgpu_attrs[] = {
1535 	&dev_attr_vgpu_id.attr,
1536 	NULL
1537 };
1538 
1539 static const struct attribute_group intel_vgpu_group = {
1540 	.name = "intel_vgpu",
1541 	.attrs = intel_vgpu_attrs,
1542 };
1543 
1544 static const struct attribute_group *intel_vgpu_groups[] = {
1545 	&intel_vgpu_group,
1546 	NULL,
1547 };
1548 
1549 static const struct vfio_device_ops intel_vgpu_dev_ops = {
1550 	.open_device	= intel_vgpu_open_device,
1551 	.close_device	= intel_vgpu_close_device,
1552 	.read		= intel_vgpu_read,
1553 	.write		= intel_vgpu_write,
1554 	.mmap		= intel_vgpu_mmap,
1555 	.ioctl		= intel_vgpu_ioctl,
1556 	.dma_unmap	= intel_vgpu_dma_unmap,
1557 };
1558 
1559 static int intel_vgpu_probe(struct mdev_device *mdev)
1560 {
1561 	struct device *pdev = mdev_parent_dev(mdev);
1562 	struct intel_gvt *gvt = kdev_to_i915(pdev)->gvt;
1563 	struct intel_vgpu_type *type;
1564 	struct intel_vgpu *vgpu;
1565 	int ret;
1566 
1567 	type = &gvt->types[mdev_get_type_group_id(mdev)];
1568 	if (!type)
1569 		return -EINVAL;
1570 
1571 	vgpu = intel_gvt_create_vgpu(gvt, type);
1572 	if (IS_ERR(vgpu)) {
1573 		gvt_err("failed to create intel vgpu: %ld\n", PTR_ERR(vgpu));
1574 		return PTR_ERR(vgpu);
1575 	}
1576 
1577 	vfio_init_group_dev(&vgpu->vfio_device, &mdev->dev,
1578 			    &intel_vgpu_dev_ops);
1579 
1580 	dev_set_drvdata(&mdev->dev, vgpu);
1581 	ret = vfio_register_emulated_iommu_dev(&vgpu->vfio_device);
1582 	if (ret) {
1583 		intel_gvt_destroy_vgpu(vgpu);
1584 		return ret;
1585 	}
1586 
1587 	gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n",
1588 		     dev_name(mdev_dev(mdev)));
1589 	return 0;
1590 }
1591 
1592 static void intel_vgpu_remove(struct mdev_device *mdev)
1593 {
1594 	struct intel_vgpu *vgpu = dev_get_drvdata(&mdev->dev);
1595 
1596 	if (WARN_ON_ONCE(vgpu->attached))
1597 		return;
1598 	intel_gvt_destroy_vgpu(vgpu);
1599 }
1600 
1601 static struct mdev_driver intel_vgpu_mdev_driver = {
1602 	.driver = {
1603 		.name		= "intel_vgpu_mdev",
1604 		.owner		= THIS_MODULE,
1605 		.dev_groups	= intel_vgpu_groups,
1606 	},
1607 	.probe		= intel_vgpu_probe,
1608 	.remove		= intel_vgpu_remove,
1609 	.supported_type_groups	= gvt_vgpu_type_groups,
1610 };
1611 
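/**
 * intel_gvt_page_track_add - write protect a guest page for page tracking
 * @info: a vGPU
 * @gfn: the guest page frame number to be protected
 *
 * Returns:
 * Zero on success, negative error code if failed.
 */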
1612 int intel_gvt_page_track_add(struct intel_vgpu *info, u64 gfn)
1613 {
1614 	struct kvm *kvm = info->vfio_device.kvm;
1615 	struct kvm_memory_slot *slot;
1616 	int idx;
1617 
1618 	if (!info->attached)
1619 		return -ESRCH;
1620 
1621 	idx = srcu_read_lock(&kvm->srcu);
1622 	slot = gfn_to_memslot(kvm, gfn);
1623 	if (!slot) {
1624 		srcu_read_unlock(&kvm->srcu, idx);
1625 		return -EINVAL;
1626 	}
1627 
1628 	write_lock(&kvm->mmu_lock);
1629 
1630 	if (kvmgt_gfn_is_write_protected(info, gfn))
1631 		goto out;
1632 
1633 	kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1634 	kvmgt_protect_table_add(info, gfn);
1635 
1636 out:
1637 	write_unlock(&kvm->mmu_lock);
1638 	srcu_read_unlock(&kvm->srcu, idx);
1639 	return 0;
1640 }
1641 
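/**
 * intel_gvt_page_track_remove - remove the write protection of a guest page
 * @info: a vGPU
 * @gfn: the guest page frame number to be unprotected
 *
 * Returns:
 * Zero on success, negative error code if failed.
 */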
1642 int intel_gvt_page_track_remove(struct intel_vgpu *info, u64 gfn)
1643 {
1644 	struct kvm *kvm = info->vfio_device.kvm;
1645 	struct kvm_memory_slot *slot;
1646 	int idx;
1647 
1648 	if (!info->attached)
1649 		return 0;
1650 
1651 	idx = srcu_read_lock(&kvm->srcu);
1652 	slot = gfn_to_memslot(kvm, gfn);
1653 	if (!slot) {
1654 		srcu_read_unlock(&kvm->srcu, idx);
1655 		return -EINVAL;
1656 	}
1657 
1658 	write_lock(&kvm->mmu_lock);
1659 
1660 	if (!kvmgt_gfn_is_write_protected(info, gfn))
1661 		goto out;
1662 
1663 	kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1664 	kvmgt_protect_table_del(info, gfn);
1665 
1666 out:
1667 	write_unlock(&kvm->mmu_lock);
1668 	srcu_read_unlock(&kvm->srcu, idx);
1669 	return 0;
1670 }
1671 
1672 static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1673 		const u8 *val, int len,
1674 		struct kvm_page_track_notifier_node *node)
1675 {
1676 	struct intel_vgpu *info =
1677 		container_of(node, struct intel_vgpu, track_node);
1678 
1679 	if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
1680 		intel_vgpu_page_track_handler(info, gpa,
1681 						     (void *)val, len);
1682 }
1683 
1684 static void kvmgt_page_track_flush_slot(struct kvm *kvm,
1685 		struct kvm_memory_slot *slot,
1686 		struct kvm_page_track_notifier_node *node)
1687 {
1688 	int i;
1689 	gfn_t gfn;
1690 	struct intel_vgpu *info =
1691 		container_of(node, struct intel_vgpu, track_node);
1692 
1693 	write_lock(&kvm->mmu_lock);
1694 	for (i = 0; i < slot->npages; i++) {
1695 		gfn = slot->base_gfn + i;
1696 		if (kvmgt_gfn_is_write_protected(info, gfn)) {
1697 			kvm_slot_page_track_remove_page(kvm, slot, gfn,
1698 						KVM_PAGE_TRACK_WRITE);
1699 			kvmgt_protect_table_del(info, gfn);
1700 		}
1701 	}
1702 	write_unlock(&kvm->mmu_lock);
1703 }
1704 
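/**
 * intel_vgpu_detach_regions - release the device-specific VFIO regions
 * @vgpu: a vGPU
 *
 * Release every region registered through intel_vgpu_register_reg() and
 * free the region array.
 */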
1705 void intel_vgpu_detach_regions(struct intel_vgpu *vgpu)
1706 {
1707 	int i;
1708 
1709 	if (!vgpu->region)
1710 		return;
1711 
1712 	for (i = 0; i < vgpu->num_regions; i++)
1713 		if (vgpu->region[i].ops->release)
1714 			vgpu->region[i].ops->release(vgpu,
1715 					&vgpu->region[i]);
1716 	vgpu->num_regions = 0;
1717 	kfree(vgpu->region);
1718 	vgpu->region = NULL;
1719 }
1720 
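/**
 * intel_gvt_dma_map_guest_page - pin a guest page and map it for DMA
 * @vgpu: a vGPU
 * @gfn: the guest page frame number
 * @size: size of the mapping in bytes
 * @dma_addr: where the DMA address of the mapping is returned
 *
 * A cached mapping for the same gfn and size is reused when available;
 * otherwise the page is pinned through VFIO and a new mapping is created.
 *
 * Returns:
 * Zero on success, negative error code if failed.
 */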
1721 int intel_gvt_dma_map_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
1722 		unsigned long size, dma_addr_t *dma_addr)
1723 {
1724 	struct gvt_dma *entry;
1725 	int ret;
1726 
1727 	if (!vgpu->attached)
1728 		return -EINVAL;
1729 
1730 	mutex_lock(&vgpu->cache_lock);
1731 
1732 	entry = __gvt_cache_find_gfn(vgpu, gfn);
1733 	if (!entry) {
1734 		ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
1735 		if (ret)
1736 			goto err_unlock;
1737 
1738 		ret = __gvt_cache_add(vgpu, gfn, *dma_addr, size);
1739 		if (ret)
1740 			goto err_unmap;
1741 	} else if (entry->size != size) {
1742 		/* the same gfn with different size: unmap and re-map */
1743 		gvt_dma_unmap_page(vgpu, gfn, entry->dma_addr, entry->size);
1744 		__gvt_cache_remove_entry(vgpu, entry);
1745 
1746 		ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
1747 		if (ret)
1748 			goto err_unlock;
1749 
1750 		ret = __gvt_cache_add(vgpu, gfn, *dma_addr, size);
1751 		if (ret)
1752 			goto err_unmap;
1753 	} else {
1754 		kref_get(&entry->ref);
1755 		*dma_addr = entry->dma_addr;
1756 	}
1757 
1758 	mutex_unlock(&vgpu->cache_lock);
1759 	return 0;
1760 
1761 err_unmap:
1762 	gvt_dma_unmap_page(vgpu, gfn, *dma_addr, size);
1763 err_unlock:
1764 	mutex_unlock(&vgpu->cache_lock);
1765 	return ret;
1766 }
1767 
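/**
 * intel_gvt_dma_pin_guest_page - get an extra reference on a cached mapping
 * @vgpu: a vGPU
 * @dma_addr: the DMA address of an existing mapping
 *
 * Returns:
 * Zero on success, negative error code if the vGPU is not attached or no
 * cached mapping exists for @dma_addr.
 */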
1768 int intel_gvt_dma_pin_guest_page(struct intel_vgpu *vgpu, dma_addr_t dma_addr)
1769 {
1770 	struct gvt_dma *entry;
1771 	int ret = 0;
1772 
1773 	if (!vgpu->attached)
1774 		return -ENODEV;
1775 
1776 	mutex_lock(&vgpu->cache_lock);
1777 	entry = __gvt_cache_find_dma_addr(vgpu, dma_addr);
1778 	if (entry)
1779 		kref_get(&entry->ref);
1780 	else
1781 		ret = -ENOMEM;
1782 	mutex_unlock(&vgpu->cache_lock);
1783 
1784 	return ret;
1785 }
1786 
1787 static void __gvt_dma_release(struct kref *ref)
1788 {
1789 	struct gvt_dma *entry = container_of(ref, typeof(*entry), ref);
1790 
1791 	gvt_dma_unmap_page(entry->vgpu, entry->gfn, entry->dma_addr,
1792 			   entry->size);
1793 	__gvt_cache_remove_entry(entry->vgpu, entry);
1794 }
1795 
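/**
 * intel_gvt_dma_unmap_guest_page - put a reference on a cached mapping
 * @vgpu: a vGPU
 * @dma_addr: the DMA address of the mapping
 *
 * The page is unmapped, unpinned and dropped from the cache when the last
 * reference goes away.
 */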
1796 void intel_gvt_dma_unmap_guest_page(struct intel_vgpu *vgpu,
1797 		dma_addr_t dma_addr)
1798 {
1799 	struct gvt_dma *entry;
1800 
1801 	if (!vgpu->attached)
1802 		return;
1803 
1804 	mutex_lock(&vgpu->cache_lock);
1805 	entry = __gvt_cache_find_dma_addr(vgpu, dma_addr);
1806 	if (entry)
1807 		kref_put(&entry->ref, __gvt_dma_release);
1808 	mutex_unlock(&vgpu->cache_lock);
1809 }
1810 
1811 static void init_device_info(struct intel_gvt *gvt)
1812 {
1813 	struct intel_gvt_device_info *info = &gvt->device_info;
1814 	struct pci_dev *pdev = to_pci_dev(gvt->gt->i915->drm.dev);
1815 
1816 	info->max_support_vgpus = 8;
1817 	info->cfg_space_size = PCI_CFG_SPACE_EXP_SIZE;
1818 	info->mmio_size = 2 * 1024 * 1024;
1819 	info->mmio_bar = 0;
1820 	info->gtt_start_offset = 8 * 1024 * 1024;
1821 	info->gtt_entry_size = 8;
1822 	info->gtt_entry_size_shift = 3;
1823 	info->gmadr_bytes_in_cmd = 8;
1824 	info->max_surface_size = 36 * 1024 * 1024;
1825 	info->msi_cap_offset = pdev->msi_cap;
1826 }
1827 
1828 static void intel_gvt_test_and_emulate_vblank(struct intel_gvt *gvt)
1829 {
1830 	struct intel_vgpu *vgpu;
1831 	int id;
1832 
1833 	mutex_lock(&gvt->lock);
1834 	idr_for_each_entry((&(gvt)->vgpu_idr), (vgpu), (id)) {
1835 		if (test_and_clear_bit(INTEL_GVT_REQUEST_EMULATE_VBLANK + id,
1836 				       (void *)&gvt->service_request)) {
1837 			if (vgpu->active)
1838 				intel_vgpu_emulate_vblank(vgpu);
1839 		}
1840 	}
1841 	mutex_unlock(&gvt->lock);
1842 }
1843 
1844 static int gvt_service_thread(void *data)
1845 {
1846 	struct intel_gvt *gvt = (struct intel_gvt *)data;
1847 	int ret;
1848 
1849 	gvt_dbg_core("service thread start\n");
1850 
1851 	while (!kthread_should_stop()) {
1852 		ret = wait_event_interruptible(gvt->service_thread_wq,
1853 				kthread_should_stop() || gvt->service_request);
1854 
1855 		if (kthread_should_stop())
1856 			break;
1857 
1858 		if (WARN_ONCE(ret, "service thread is woken up by a signal.\n"))
1859 			continue;
1860 
1861 		intel_gvt_test_and_emulate_vblank(gvt);
1862 
1863 		if (test_bit(INTEL_GVT_REQUEST_SCHED,
1864 				(void *)&gvt->service_request) ||
1865 			test_bit(INTEL_GVT_REQUEST_EVENT_SCHED,
1866 					(void *)&gvt->service_request)) {
1867 			intel_gvt_schedule(gvt);
1868 		}
1869 	}
1870 
1871 	return 0;
1872 }
1873 
1874 static void clean_service_thread(struct intel_gvt *gvt)
1875 {
1876 	kthread_stop(gvt->service_thread);
1877 }
1878 
1879 static int init_service_thread(struct intel_gvt *gvt)
1880 {
1881 	init_waitqueue_head(&gvt->service_thread_wq);
1882 
1883 	gvt->service_thread = kthread_run(gvt_service_thread,
1884 			gvt, "gvt_service_thread");
1885 	if (IS_ERR(gvt->service_thread)) {
1886 		gvt_err("failed to start service thread.\n");
1887 		return PTR_ERR(gvt->service_thread);
1888 	}
1889 	return 0;
1890 }
1891 
1892 /**
1893  * intel_gvt_clean_device - clean a GVT device
1894  * @i915: i915 private
1895  *
1896  * This function is called at the driver unloading stage, to free the
1897  * resources owned by a GVT device.
1898  *
1899  */
1900 static void intel_gvt_clean_device(struct drm_i915_private *i915)
1901 {
1902 	struct intel_gvt *gvt = fetch_and_zero(&i915->gvt);
1903 
1904 	if (drm_WARN_ON(&i915->drm, !gvt))
1905 		return;
1906 
1907 	mdev_unregister_device(i915->drm.dev);
1908 	intel_gvt_cleanup_vgpu_type_groups(gvt);
1909 	intel_gvt_destroy_idle_vgpu(gvt->idle_vgpu);
1910 	intel_gvt_clean_vgpu_types(gvt);
1911 
1912 	intel_gvt_debugfs_clean(gvt);
1913 	clean_service_thread(gvt);
1914 	intel_gvt_clean_cmd_parser(gvt);
1915 	intel_gvt_clean_sched_policy(gvt);
1916 	intel_gvt_clean_workload_scheduler(gvt);
1917 	intel_gvt_clean_gtt(gvt);
1918 	intel_gvt_free_firmware(gvt);
1919 	intel_gvt_clean_mmio_info(gvt);
1920 	idr_destroy(&gvt->vgpu_idr);
1921 
1922 	kfree(i915->gvt);
1923 }
1924 
1925 /**
1926  * intel_gvt_init_device - initialize a GVT device
1927  * @i915: drm i915 private data
1928  *
1929  * This function is called at the initialization stage, to initialize
1930  * necessary GVT components.
1931  *
1932  * Returns:
1933  * Zero on success, negative error code if failed.
1934  *
1935  */
1936 static int intel_gvt_init_device(struct drm_i915_private *i915)
1937 {
1938 	struct intel_gvt *gvt;
1939 	struct intel_vgpu *vgpu;
1940 	int ret;
1941 
1942 	if (drm_WARN_ON(&i915->drm, i915->gvt))
1943 		return -EEXIST;
1944 
1945 	gvt = kzalloc(sizeof(struct intel_gvt), GFP_KERNEL);
1946 	if (!gvt)
1947 		return -ENOMEM;
1948 
1949 	gvt_dbg_core("init gvt device\n");
1950 
1951 	idr_init_base(&gvt->vgpu_idr, 1);
1952 	spin_lock_init(&gvt->scheduler.mmio_context_lock);
1953 	mutex_init(&gvt->lock);
1954 	mutex_init(&gvt->sched_lock);
1955 	gvt->gt = to_gt(i915);
1956 	i915->gvt = gvt;
1957 
1958 	init_device_info(gvt);
1959 
1960 	ret = intel_gvt_setup_mmio_info(gvt);
1961 	if (ret)
1962 		goto out_clean_idr;
1963 
1964 	intel_gvt_init_engine_mmio_context(gvt);
1965 
1966 	ret = intel_gvt_load_firmware(gvt);
1967 	if (ret)
1968 		goto out_clean_mmio_info;
1969 
1970 	ret = intel_gvt_init_irq(gvt);
1971 	if (ret)
1972 		goto out_free_firmware;
1973 
1974 	ret = intel_gvt_init_gtt(gvt);
1975 	if (ret)
1976 		goto out_free_firmware;
1977 
1978 	ret = intel_gvt_init_workload_scheduler(gvt);
1979 	if (ret)
1980 		goto out_clean_gtt;
1981 
1982 	ret = intel_gvt_init_sched_policy(gvt);
1983 	if (ret)
1984 		goto out_clean_workload_scheduler;
1985 
1986 	ret = intel_gvt_init_cmd_parser(gvt);
1987 	if (ret)
1988 		goto out_clean_sched_policy;
1989 
1990 	ret = init_service_thread(gvt);
1991 	if (ret)
1992 		goto out_clean_cmd_parser;
1993 
1994 	ret = intel_gvt_init_vgpu_types(gvt);
1995 	if (ret)
1996 		goto out_clean_thread;
1997 
1998 	vgpu = intel_gvt_create_idle_vgpu(gvt);
1999 	if (IS_ERR(vgpu)) {
2000 		ret = PTR_ERR(vgpu);
2001 		gvt_err("failed to create idle vgpu\n");
2002 		goto out_clean_types;
2003 	}
2004 	gvt->idle_vgpu = vgpu;
2005 
2006 	intel_gvt_debugfs_init(gvt);
2007 
2008 	ret = intel_gvt_init_vgpu_type_groups(gvt);
2009 	if (ret)
2010 		goto out_destroy_idle_vgpu;
2011 
2012 	ret = mdev_register_device(i915->drm.dev, &intel_vgpu_mdev_driver);
2013 	if (ret)
2014 		goto out_cleanup_vgpu_type_groups;
2015 
2016 	gvt_dbg_core("gvt device initialization is done\n");
2017 	return 0;
2018 
2019 out_cleanup_vgpu_type_groups:
2020 	intel_gvt_cleanup_vgpu_type_groups(gvt);
2021 out_destroy_idle_vgpu:
2022 	intel_gvt_destroy_idle_vgpu(gvt->idle_vgpu);
2023 	intel_gvt_debugfs_clean(gvt);
2024 out_clean_types:
2025 	intel_gvt_clean_vgpu_types(gvt);
2026 out_clean_thread:
2027 	clean_service_thread(gvt);
2028 out_clean_cmd_parser:
2029 	intel_gvt_clean_cmd_parser(gvt);
2030 out_clean_sched_policy:
2031 	intel_gvt_clean_sched_policy(gvt);
2032 out_clean_workload_scheduler:
2033 	intel_gvt_clean_workload_scheduler(gvt);
2034 out_clean_gtt:
2035 	intel_gvt_clean_gtt(gvt);
2036 out_free_firmware:
2037 	intel_gvt_free_firmware(gvt);
2038 out_clean_mmio_info:
2039 	intel_gvt_clean_mmio_info(gvt);
2040 out_clean_idr:
2041 	idr_destroy(&gvt->vgpu_idr);
2042 	kfree(gvt);
2043 	i915->gvt = NULL;
2044 	return ret;
2045 }
2046 
2047 static void intel_gvt_pm_resume(struct drm_i915_private *i915)
2048 {
2049 	struct intel_gvt *gvt = i915->gvt;
2050 
2051 	intel_gvt_restore_fence(gvt);
2052 	intel_gvt_restore_mmio(gvt);
2053 	intel_gvt_restore_ggtt(gvt);
2054 }
2055 
2056 static const struct intel_vgpu_ops intel_gvt_vgpu_ops = {
2057 	.init_device	= intel_gvt_init_device,
2058 	.clean_device	= intel_gvt_clean_device,
2059 	.pm_resume	= intel_gvt_pm_resume,
2060 };
2061 
2062 static int __init kvmgt_init(void)
2063 {
2064 	int ret;
2065 
2066 	ret = intel_gvt_set_ops(&intel_gvt_vgpu_ops);
2067 	if (ret)
2068 		return ret;
2069 
2070 	ret = mdev_register_driver(&intel_vgpu_mdev_driver);
2071 	if (ret)
2072 		intel_gvt_clear_ops(&intel_gvt_vgpu_ops);
2073 	return ret;
2074 }
2075 
2076 static void __exit kvmgt_exit(void)
2077 {
2078 	mdev_unregister_driver(&intel_vgpu_mdev_driver);
2079 	intel_gvt_clear_ops(&intel_gvt_vgpu_ops);
2080 }
2081 
2082 module_init(kvmgt_init);
2083 module_exit(kvmgt_exit);
2084 
2085 MODULE_LICENSE("GPL and additional rights");
2086 MODULE_AUTHOR("Intel Corporation");
2087