xref: /openbmc/linux/drivers/gpu/drm/i915/gvt/scheduler.c (revision de167752a889d19b9bb018f8eecbc1ebbfe07b2f)
1 /*
2  * Copyright(c) 2011-2016 Intel Corporation. All rights reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors:
24  *    Zhi Wang <zhi.a.wang@intel.com>
25  *
26  * Contributors:
27  *    Ping Gao <ping.a.gao@intel.com>
28  *    Tina Zhang <tina.zhang@intel.com>
29  *    Chanbin Du <changbin.du@intel.com>
30  *    Min He <min.he@intel.com>
31  *    Bing Niu <bing.niu@intel.com>
32  *    Zhenyu Wang <zhenyuw@linux.intel.com>
33  *
34  */
35 
36 #include <linux/kthread.h>
37 
38 #include "i915_drv.h"
39 #include "gvt.h"
40 
/* Byte offset of member @x inside struct execlist_ring_context. */
#define RING_CTX_OFF(x) \
	offsetof(struct execlist_ring_context, x)
43 
44 static void set_context_pdp_root_pointer(
45 		struct execlist_ring_context *ring_context,
46 		u32 pdp[8])
47 {
48 	struct execlist_mmio_pair *pdp_pair = &ring_context->pdp3_UDW;
49 	int i;
50 
51 	for (i = 0; i < 8; i++)
52 		pdp_pair[i].val = pdp[7 - i];
53 }
54 
55 static void update_shadow_pdps(struct intel_vgpu_workload *workload)
56 {
57 	struct intel_vgpu *vgpu = workload->vgpu;
58 	int ring_id = workload->ring_id;
59 	struct i915_gem_context *shadow_ctx = vgpu->submission.shadow_ctx;
60 	struct drm_i915_gem_object *ctx_obj =
61 		shadow_ctx->__engine[ring_id].state->obj;
62 	struct execlist_ring_context *shadow_ring_context;
63 	struct page *page;
64 
65 	if (WARN_ON(!workload->shadow_mm))
66 		return;
67 
68 	if (WARN_ON(!atomic_read(&workload->shadow_mm->pincount)))
69 		return;
70 
71 	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
72 	shadow_ring_context = kmap(page);
73 	set_context_pdp_root_pointer(shadow_ring_context,
74 			(void *)workload->shadow_mm->ppgtt_mm.shadow_pdps);
75 	kunmap(page);
76 }
77 
78 /*
79  * when populating shadow ctx from guest, we should not overrride oa related
80  * registers, so that they will not be overlapped by guest oa configs. Thus
81  * made it possible to capture oa data from host for both host and guests.
82  */
83 static void sr_oa_regs(struct intel_vgpu_workload *workload,
84 		u32 *reg_state, bool save)
85 {
86 	struct drm_i915_private *dev_priv = workload->vgpu->gvt->dev_priv;
87 	u32 ctx_oactxctrl = dev_priv->perf.oa.ctx_oactxctrl_offset;
88 	u32 ctx_flexeu0 = dev_priv->perf.oa.ctx_flexeu0_offset;
89 	int i = 0;
90 	u32 flex_mmio[] = {
91 		i915_mmio_reg_offset(EU_PERF_CNTL0),
92 		i915_mmio_reg_offset(EU_PERF_CNTL1),
93 		i915_mmio_reg_offset(EU_PERF_CNTL2),
94 		i915_mmio_reg_offset(EU_PERF_CNTL3),
95 		i915_mmio_reg_offset(EU_PERF_CNTL4),
96 		i915_mmio_reg_offset(EU_PERF_CNTL5),
97 		i915_mmio_reg_offset(EU_PERF_CNTL6),
98 	};
99 
100 	if (workload->ring_id != RCS)
101 		return;
102 
103 	if (save) {
104 		workload->oactxctrl = reg_state[ctx_oactxctrl + 1];
105 
106 		for (i = 0; i < ARRAY_SIZE(workload->flex_mmio); i++) {
107 			u32 state_offset = ctx_flexeu0 + i * 2;
108 
109 			workload->flex_mmio[i] = reg_state[state_offset + 1];
110 		}
111 	} else {
112 		reg_state[ctx_oactxctrl] =
113 			i915_mmio_reg_offset(GEN8_OACTXCONTROL);
114 		reg_state[ctx_oactxctrl + 1] = workload->oactxctrl;
115 
116 		for (i = 0; i < ARRAY_SIZE(workload->flex_mmio); i++) {
117 			u32 state_offset = ctx_flexeu0 + i * 2;
118 			u32 mmio = flex_mmio[i];
119 
120 			reg_state[state_offset] = mmio;
121 			reg_state[state_offset + 1] = workload->flex_mmio[i];
122 		}
123 	}
124 }
125 
126 static int populate_shadow_context(struct intel_vgpu_workload *workload)
127 {
128 	struct intel_vgpu *vgpu = workload->vgpu;
129 	struct intel_gvt *gvt = vgpu->gvt;
130 	int ring_id = workload->ring_id;
131 	struct i915_gem_context *shadow_ctx = vgpu->submission.shadow_ctx;
132 	struct drm_i915_gem_object *ctx_obj =
133 		shadow_ctx->__engine[ring_id].state->obj;
134 	struct execlist_ring_context *shadow_ring_context;
135 	struct page *page;
136 	void *dst;
137 	unsigned long context_gpa, context_page_num;
138 	int i;
139 
140 	gvt_dbg_sched("ring id %d workload lrca %x", ring_id,
141 			workload->ctx_desc.lrca);
142 
143 	context_page_num = gvt->dev_priv->engine[ring_id]->context_size;
144 
145 	context_page_num = context_page_num >> PAGE_SHIFT;
146 
147 	if (IS_BROADWELL(gvt->dev_priv) && ring_id == RCS)
148 		context_page_num = 19;
149 
150 	i = 2;
151 
152 	while (i < context_page_num) {
153 		context_gpa = intel_vgpu_gma_to_gpa(vgpu->gtt.ggtt_mm,
154 				(u32)((workload->ctx_desc.lrca + i) <<
155 				I915_GTT_PAGE_SHIFT));
156 		if (context_gpa == INTEL_GVT_INVALID_ADDR) {
157 			gvt_vgpu_err("Invalid guest context descriptor\n");
158 			return -EFAULT;
159 		}
160 
161 		page = i915_gem_object_get_page(ctx_obj, LRC_HEADER_PAGES + i);
162 		dst = kmap(page);
163 		intel_gvt_hypervisor_read_gpa(vgpu, context_gpa, dst,
164 				I915_GTT_PAGE_SIZE);
165 		kunmap(page);
166 		i++;
167 	}
168 
169 	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
170 	shadow_ring_context = kmap(page);
171 
172 	sr_oa_regs(workload, (u32 *)shadow_ring_context, true);
173 #define COPY_REG(name) \
174 	intel_gvt_hypervisor_read_gpa(vgpu, workload->ring_context_gpa \
175 		+ RING_CTX_OFF(name.val), &shadow_ring_context->name.val, 4)
176 #define COPY_REG_MASKED(name) {\
177 		intel_gvt_hypervisor_read_gpa(vgpu, workload->ring_context_gpa \
178 					      + RING_CTX_OFF(name.val),\
179 					      &shadow_ring_context->name.val, 4);\
180 		shadow_ring_context->name.val |= 0xffff << 16;\
181 	}
182 
183 	COPY_REG_MASKED(ctx_ctrl);
184 	COPY_REG(ctx_timestamp);
185 
186 	if (ring_id == RCS) {
187 		COPY_REG(bb_per_ctx_ptr);
188 		COPY_REG(rcs_indirect_ctx);
189 		COPY_REG(rcs_indirect_ctx_offset);
190 	}
191 #undef COPY_REG
192 #undef COPY_REG_MASKED
193 
194 	intel_gvt_hypervisor_read_gpa(vgpu,
195 			workload->ring_context_gpa +
196 			sizeof(*shadow_ring_context),
197 			(void *)shadow_ring_context +
198 			sizeof(*shadow_ring_context),
199 			I915_GTT_PAGE_SIZE - sizeof(*shadow_ring_context));
200 
201 	sr_oa_regs(workload, (u32 *)shadow_ring_context, false);
202 	kunmap(page);
203 	return 0;
204 }
205 
206 static inline bool is_gvt_request(struct i915_request *req)
207 {
208 	return i915_gem_context_force_single_submission(req->ctx);
209 }
210 
211 static void save_ring_hw_state(struct intel_vgpu *vgpu, int ring_id)
212 {
213 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
214 	u32 ring_base = dev_priv->engine[ring_id]->mmio_base;
215 	i915_reg_t reg;
216 
217 	reg = RING_INSTDONE(ring_base);
218 	vgpu_vreg(vgpu, i915_mmio_reg_offset(reg)) = I915_READ_FW(reg);
219 	reg = RING_ACTHD(ring_base);
220 	vgpu_vreg(vgpu, i915_mmio_reg_offset(reg)) = I915_READ_FW(reg);
221 	reg = RING_ACTHD_UDW(ring_base);
222 	vgpu_vreg(vgpu, i915_mmio_reg_offset(reg)) = I915_READ_FW(reg);
223 }
224 
225 static int shadow_context_status_change(struct notifier_block *nb,
226 		unsigned long action, void *data)
227 {
228 	struct i915_request *req = data;
229 	struct intel_gvt *gvt = container_of(nb, struct intel_gvt,
230 				shadow_ctx_notifier_block[req->engine->id]);
231 	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
232 	enum intel_engine_id ring_id = req->engine->id;
233 	struct intel_vgpu_workload *workload;
234 	unsigned long flags;
235 
236 	if (!is_gvt_request(req)) {
237 		spin_lock_irqsave(&scheduler->mmio_context_lock, flags);
238 		if (action == INTEL_CONTEXT_SCHEDULE_IN &&
239 		    scheduler->engine_owner[ring_id]) {
240 			/* Switch ring from vGPU to host. */
241 			intel_gvt_switch_mmio(scheduler->engine_owner[ring_id],
242 					      NULL, ring_id);
243 			scheduler->engine_owner[ring_id] = NULL;
244 		}
245 		spin_unlock_irqrestore(&scheduler->mmio_context_lock, flags);
246 
247 		return NOTIFY_OK;
248 	}
249 
250 	workload = scheduler->current_workload[ring_id];
251 	if (unlikely(!workload))
252 		return NOTIFY_OK;
253 
254 	switch (action) {
255 	case INTEL_CONTEXT_SCHEDULE_IN:
256 		spin_lock_irqsave(&scheduler->mmio_context_lock, flags);
257 		if (workload->vgpu != scheduler->engine_owner[ring_id]) {
258 			/* Switch ring from host to vGPU or vGPU to vGPU. */
259 			intel_gvt_switch_mmio(scheduler->engine_owner[ring_id],
260 					      workload->vgpu, ring_id);
261 			scheduler->engine_owner[ring_id] = workload->vgpu;
262 		} else
263 			gvt_dbg_sched("skip ring %d mmio switch for vgpu%d\n",
264 				      ring_id, workload->vgpu->id);
265 		spin_unlock_irqrestore(&scheduler->mmio_context_lock, flags);
266 		atomic_set(&workload->shadow_ctx_active, 1);
267 		break;
268 	case INTEL_CONTEXT_SCHEDULE_OUT:
269 		save_ring_hw_state(workload->vgpu, ring_id);
270 		atomic_set(&workload->shadow_ctx_active, 0);
271 		break;
272 	case INTEL_CONTEXT_SCHEDULE_PREEMPTED:
273 		save_ring_hw_state(workload->vgpu, ring_id);
274 		break;
275 	default:
276 		WARN_ON(1);
277 		return NOTIFY_OK;
278 	}
279 	wake_up(&workload->shadow_ctx_status_wq);
280 	return NOTIFY_OK;
281 }
282 
283 static void shadow_context_descriptor_update(struct i915_gem_context *ctx,
284 		struct intel_engine_cs *engine)
285 {
286 	struct intel_context *ce = to_intel_context(ctx, engine);
287 	u64 desc = 0;
288 
289 	desc = ce->lrc_desc;
290 
291 	/* Update bits 0-11 of the context descriptor which includes flags
292 	 * like GEN8_CTX_* cached in desc_template
293 	 */
294 	desc &= U64_MAX << 12;
295 	desc |= ctx->desc_template & ((1ULL << 12) - 1);
296 
297 	ce->lrc_desc = desc;
298 }
299 
300 static int copy_workload_to_ring_buffer(struct intel_vgpu_workload *workload)
301 {
302 	struct intel_vgpu *vgpu = workload->vgpu;
303 	void *shadow_ring_buffer_va;
304 	u32 *cs;
305 	struct i915_request *req = workload->req;
306 
307 	if (IS_KABYLAKE(req->i915) &&
308 	    is_inhibit_context(req->ctx, req->engine->id))
309 		intel_vgpu_restore_inhibit_context(vgpu, req);
310 
311 	/* allocate shadow ring buffer */
312 	cs = intel_ring_begin(workload->req, workload->rb_len / sizeof(u32));
313 	if (IS_ERR(cs)) {
314 		gvt_vgpu_err("fail to alloc size =%ld shadow  ring buffer\n",
315 			workload->rb_len);
316 		return PTR_ERR(cs);
317 	}
318 
319 	shadow_ring_buffer_va = workload->shadow_ring_buffer_va;
320 
321 	/* get shadow ring buffer va */
322 	workload->shadow_ring_buffer_va = cs;
323 
324 	memcpy(cs, shadow_ring_buffer_va,
325 			workload->rb_len);
326 
327 	cs += workload->rb_len / sizeof(u32);
328 	intel_ring_advance(workload->req, cs);
329 
330 	return 0;
331 }
332 
333 static void release_shadow_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx)
334 {
335 	if (!wa_ctx->indirect_ctx.obj)
336 		return;
337 
338 	i915_gem_object_unpin_map(wa_ctx->indirect_ctx.obj);
339 	i915_gem_object_put(wa_ctx->indirect_ctx.obj);
340 }
341 
342 /**
343  * intel_gvt_scan_and_shadow_workload - audit the workload by scanning and
344  * shadow it as well, include ringbuffer,wa_ctx and ctx.
345  * @workload: an abstract entity for each execlist submission.
346  *
347  * This function is called before the workload submitting to i915, to make
348  * sure the content of the workload is valid.
349  */
350 int intel_gvt_scan_and_shadow_workload(struct intel_vgpu_workload *workload)
351 {
352 	struct intel_vgpu *vgpu = workload->vgpu;
353 	struct intel_vgpu_submission *s = &vgpu->submission;
354 	struct i915_gem_context *shadow_ctx = s->shadow_ctx;
355 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
356 	int ring_id = workload->ring_id;
357 	struct intel_engine_cs *engine = dev_priv->engine[ring_id];
358 	struct intel_ring *ring;
359 	int ret;
360 
361 	lockdep_assert_held(&dev_priv->drm.struct_mutex);
362 
363 	if (workload->shadowed)
364 		return 0;
365 
366 	shadow_ctx->desc_template &= ~(0x3 << GEN8_CTX_ADDRESSING_MODE_SHIFT);
367 	shadow_ctx->desc_template |= workload->ctx_desc.addressing_mode <<
368 				    GEN8_CTX_ADDRESSING_MODE_SHIFT;
369 
370 	if (!test_and_set_bit(ring_id, s->shadow_ctx_desc_updated))
371 		shadow_context_descriptor_update(shadow_ctx,
372 					dev_priv->engine[ring_id]);
373 
374 	ret = intel_gvt_scan_and_shadow_ringbuffer(workload);
375 	if (ret)
376 		goto err_scan;
377 
378 	if ((workload->ring_id == RCS) &&
379 	    (workload->wa_ctx.indirect_ctx.size != 0)) {
380 		ret = intel_gvt_scan_and_shadow_wa_ctx(&workload->wa_ctx);
381 		if (ret)
382 			goto err_scan;
383 	}
384 
385 	/* pin shadow context by gvt even the shadow context will be pinned
386 	 * when i915 alloc request. That is because gvt will update the guest
387 	 * context from shadow context when workload is completed, and at that
388 	 * moment, i915 may already unpined the shadow context to make the
389 	 * shadow_ctx pages invalid. So gvt need to pin itself. After update
390 	 * the guest context, gvt can unpin the shadow_ctx safely.
391 	 */
392 	ring = intel_context_pin(shadow_ctx, engine);
393 	if (IS_ERR(ring)) {
394 		ret = PTR_ERR(ring);
395 		gvt_vgpu_err("fail to pin shadow context\n");
396 		goto err_shadow;
397 	}
398 
399 	ret = populate_shadow_context(workload);
400 	if (ret)
401 		goto err_unpin;
402 	workload->shadowed = true;
403 	return 0;
404 
405 err_unpin:
406 	intel_context_unpin(shadow_ctx, engine);
407 err_shadow:
408 	release_shadow_wa_ctx(&workload->wa_ctx);
409 err_scan:
410 	return ret;
411 }
412 
413 static int intel_gvt_generate_request(struct intel_vgpu_workload *workload)
414 {
415 	int ring_id = workload->ring_id;
416 	struct drm_i915_private *dev_priv = workload->vgpu->gvt->dev_priv;
417 	struct intel_engine_cs *engine = dev_priv->engine[ring_id];
418 	struct i915_request *rq;
419 	struct intel_vgpu *vgpu = workload->vgpu;
420 	struct intel_vgpu_submission *s = &vgpu->submission;
421 	struct i915_gem_context *shadow_ctx = s->shadow_ctx;
422 	int ret;
423 
424 	rq = i915_request_alloc(dev_priv->engine[ring_id], shadow_ctx);
425 	if (IS_ERR(rq)) {
426 		gvt_vgpu_err("fail to allocate gem request\n");
427 		ret = PTR_ERR(rq);
428 		goto err_unpin;
429 	}
430 
431 	gvt_dbg_sched("ring id %d get i915 gem request %p\n", ring_id, rq);
432 
433 	workload->req = i915_request_get(rq);
434 	ret = copy_workload_to_ring_buffer(workload);
435 	if (ret)
436 		goto err_unpin;
437 	return 0;
438 
439 err_unpin:
440 	intel_context_unpin(shadow_ctx, engine);
441 	release_shadow_wa_ctx(&workload->wa_ctx);
442 	return ret;
443 }
444 
445 static void release_shadow_batch_buffer(struct intel_vgpu_workload *workload);
446 
447 static int prepare_shadow_batch_buffer(struct intel_vgpu_workload *workload)
448 {
449 	struct intel_gvt *gvt = workload->vgpu->gvt;
450 	const int gmadr_bytes = gvt->device_info.gmadr_bytes_in_cmd;
451 	struct intel_vgpu_shadow_bb *bb;
452 	int ret;
453 
454 	list_for_each_entry(bb, &workload->shadow_bb, list) {
455 		/* For privilge batch buffer and not wa_ctx, the bb_start_cmd_va
456 		 * is only updated into ring_scan_buffer, not real ring address
457 		 * allocated in later copy_workload_to_ring_buffer. pls be noted
458 		 * shadow_ring_buffer_va is now pointed to real ring buffer va
459 		 * in copy_workload_to_ring_buffer.
460 		 */
461 
462 		if (bb->bb_offset)
463 			bb->bb_start_cmd_va = workload->shadow_ring_buffer_va
464 				+ bb->bb_offset;
465 
466 		if (bb->ppgtt) {
467 			/* for non-priv bb, scan&shadow is only for
468 			 * debugging purpose, so the content of shadow bb
469 			 * is the same as original bb. Therefore,
470 			 * here, rather than switch to shadow bb's gma
471 			 * address, we directly use original batch buffer's
472 			 * gma address, and send original bb to hardware
473 			 * directly
474 			 */
475 			if (bb->clflush & CLFLUSH_AFTER) {
476 				drm_clflush_virt_range(bb->va,
477 						bb->obj->base.size);
478 				bb->clflush &= ~CLFLUSH_AFTER;
479 			}
480 			i915_gem_obj_finish_shmem_access(bb->obj);
481 			bb->accessing = false;
482 
483 		} else {
484 			bb->vma = i915_gem_object_ggtt_pin(bb->obj,
485 					NULL, 0, 0, 0);
486 			if (IS_ERR(bb->vma)) {
487 				ret = PTR_ERR(bb->vma);
488 				goto err;
489 			}
490 
491 			/* relocate shadow batch buffer */
492 			bb->bb_start_cmd_va[1] = i915_ggtt_offset(bb->vma);
493 			if (gmadr_bytes == 8)
494 				bb->bb_start_cmd_va[2] = 0;
495 
496 			/* No one is going to touch shadow bb from now on. */
497 			if (bb->clflush & CLFLUSH_AFTER) {
498 				drm_clflush_virt_range(bb->va,
499 						bb->obj->base.size);
500 				bb->clflush &= ~CLFLUSH_AFTER;
501 			}
502 
503 			ret = i915_gem_object_set_to_gtt_domain(bb->obj,
504 					false);
505 			if (ret)
506 				goto err;
507 
508 			i915_gem_obj_finish_shmem_access(bb->obj);
509 			bb->accessing = false;
510 
511 			i915_vma_move_to_active(bb->vma, workload->req, 0);
512 		}
513 	}
514 	return 0;
515 err:
516 	release_shadow_batch_buffer(workload);
517 	return ret;
518 }
519 
520 static int update_wa_ctx_2_shadow_ctx(struct intel_shadow_wa_ctx *wa_ctx)
521 {
522 	struct intel_vgpu_workload *workload = container_of(wa_ctx,
523 					struct intel_vgpu_workload,
524 					wa_ctx);
525 	int ring_id = workload->ring_id;
526 	struct intel_vgpu_submission *s = &workload->vgpu->submission;
527 	struct i915_gem_context *shadow_ctx = s->shadow_ctx;
528 	struct drm_i915_gem_object *ctx_obj =
529 		shadow_ctx->__engine[ring_id].state->obj;
530 	struct execlist_ring_context *shadow_ring_context;
531 	struct page *page;
532 
533 	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
534 	shadow_ring_context = kmap_atomic(page);
535 
536 	shadow_ring_context->bb_per_ctx_ptr.val =
537 		(shadow_ring_context->bb_per_ctx_ptr.val &
538 		(~PER_CTX_ADDR_MASK)) | wa_ctx->per_ctx.shadow_gma;
539 	shadow_ring_context->rcs_indirect_ctx.val =
540 		(shadow_ring_context->rcs_indirect_ctx.val &
541 		(~INDIRECT_CTX_ADDR_MASK)) | wa_ctx->indirect_ctx.shadow_gma;
542 
543 	kunmap_atomic(shadow_ring_context);
544 	return 0;
545 }
546 
547 static int prepare_shadow_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx)
548 {
549 	struct i915_vma *vma;
550 	unsigned char *per_ctx_va =
551 		(unsigned char *)wa_ctx->indirect_ctx.shadow_va +
552 		wa_ctx->indirect_ctx.size;
553 
554 	if (wa_ctx->indirect_ctx.size == 0)
555 		return 0;
556 
557 	vma = i915_gem_object_ggtt_pin(wa_ctx->indirect_ctx.obj, NULL,
558 				       0, CACHELINE_BYTES, 0);
559 	if (IS_ERR(vma))
560 		return PTR_ERR(vma);
561 
562 	/* FIXME: we are not tracking our pinned VMA leaving it
563 	 * up to the core to fix up the stray pin_count upon
564 	 * free.
565 	 */
566 
567 	wa_ctx->indirect_ctx.shadow_gma = i915_ggtt_offset(vma);
568 
569 	wa_ctx->per_ctx.shadow_gma = *((unsigned int *)per_ctx_va + 1);
570 	memset(per_ctx_va, 0, CACHELINE_BYTES);
571 
572 	update_wa_ctx_2_shadow_ctx(wa_ctx);
573 	return 0;
574 }
575 
576 static void release_shadow_batch_buffer(struct intel_vgpu_workload *workload)
577 {
578 	struct intel_vgpu *vgpu = workload->vgpu;
579 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
580 	struct intel_vgpu_shadow_bb *bb, *pos;
581 
582 	if (list_empty(&workload->shadow_bb))
583 		return;
584 
585 	bb = list_first_entry(&workload->shadow_bb,
586 			struct intel_vgpu_shadow_bb, list);
587 
588 	mutex_lock(&dev_priv->drm.struct_mutex);
589 
590 	list_for_each_entry_safe(bb, pos, &workload->shadow_bb, list) {
591 		if (bb->obj) {
592 			if (bb->accessing)
593 				i915_gem_obj_finish_shmem_access(bb->obj);
594 
595 			if (bb->va && !IS_ERR(bb->va))
596 				i915_gem_object_unpin_map(bb->obj);
597 
598 			if (bb->vma && !IS_ERR(bb->vma)) {
599 				i915_vma_unpin(bb->vma);
600 				i915_vma_close(bb->vma);
601 			}
602 			__i915_gem_object_release_unless_active(bb->obj);
603 		}
604 		list_del(&bb->list);
605 		kfree(bb);
606 	}
607 
608 	mutex_unlock(&dev_priv->drm.struct_mutex);
609 }
610 
611 static int prepare_workload(struct intel_vgpu_workload *workload)
612 {
613 	struct intel_vgpu *vgpu = workload->vgpu;
614 	int ret = 0;
615 
616 	ret = intel_vgpu_pin_mm(workload->shadow_mm);
617 	if (ret) {
618 		gvt_vgpu_err("fail to vgpu pin mm\n");
619 		return ret;
620 	}
621 
622 	update_shadow_pdps(workload);
623 
624 	ret = intel_vgpu_sync_oos_pages(workload->vgpu);
625 	if (ret) {
626 		gvt_vgpu_err("fail to vgpu sync oos pages\n");
627 		goto err_unpin_mm;
628 	}
629 
630 	ret = intel_vgpu_flush_post_shadow(workload->vgpu);
631 	if (ret) {
632 		gvt_vgpu_err("fail to flush post shadow\n");
633 		goto err_unpin_mm;
634 	}
635 
636 	ret = intel_gvt_generate_request(workload);
637 	if (ret) {
638 		gvt_vgpu_err("fail to generate request\n");
639 		goto err_unpin_mm;
640 	}
641 
642 	ret = prepare_shadow_batch_buffer(workload);
643 	if (ret) {
644 		gvt_vgpu_err("fail to prepare_shadow_batch_buffer\n");
645 		goto err_unpin_mm;
646 	}
647 
648 	ret = prepare_shadow_wa_ctx(&workload->wa_ctx);
649 	if (ret) {
650 		gvt_vgpu_err("fail to prepare_shadow_wa_ctx\n");
651 		goto err_shadow_batch;
652 	}
653 
654 	if (workload->prepare) {
655 		ret = workload->prepare(workload);
656 		if (ret)
657 			goto err_shadow_wa_ctx;
658 	}
659 
660 	return 0;
661 err_shadow_wa_ctx:
662 	release_shadow_wa_ctx(&workload->wa_ctx);
663 err_shadow_batch:
664 	release_shadow_batch_buffer(workload);
665 err_unpin_mm:
666 	intel_vgpu_unpin_mm(workload->shadow_mm);
667 	return ret;
668 }
669 
670 static int dispatch_workload(struct intel_vgpu_workload *workload)
671 {
672 	struct intel_vgpu *vgpu = workload->vgpu;
673 	struct intel_vgpu_submission *s = &vgpu->submission;
674 	struct i915_gem_context *shadow_ctx = s->shadow_ctx;
675 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
676 	int ring_id = workload->ring_id;
677 	struct intel_engine_cs *engine = dev_priv->engine[ring_id];
678 	int ret = 0;
679 
680 	gvt_dbg_sched("ring id %d prepare to dispatch workload %p\n",
681 		ring_id, workload);
682 
683 	mutex_lock(&dev_priv->drm.struct_mutex);
684 
685 	ret = intel_gvt_scan_and_shadow_workload(workload);
686 	if (ret)
687 		goto out;
688 
689 	ret = prepare_workload(workload);
690 	if (ret) {
691 		intel_context_unpin(shadow_ctx, engine);
692 		goto out;
693 	}
694 
695 out:
696 	if (ret)
697 		workload->status = ret;
698 
699 	if (!IS_ERR_OR_NULL(workload->req)) {
700 		gvt_dbg_sched("ring id %d submit workload to i915 %p\n",
701 				ring_id, workload->req);
702 		i915_request_add(workload->req);
703 		workload->dispatched = true;
704 	}
705 
706 	mutex_unlock(&dev_priv->drm.struct_mutex);
707 	return ret;
708 }
709 
710 static struct intel_vgpu_workload *pick_next_workload(
711 		struct intel_gvt *gvt, int ring_id)
712 {
713 	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
714 	struct intel_vgpu_workload *workload = NULL;
715 
716 	mutex_lock(&gvt->lock);
717 
718 	/*
719 	 * no current vgpu / will be scheduled out / no workload
720 	 * bail out
721 	 */
722 	if (!scheduler->current_vgpu) {
723 		gvt_dbg_sched("ring id %d stop - no current vgpu\n", ring_id);
724 		goto out;
725 	}
726 
727 	if (scheduler->need_reschedule) {
728 		gvt_dbg_sched("ring id %d stop - will reschedule\n", ring_id);
729 		goto out;
730 	}
731 
732 	if (list_empty(workload_q_head(scheduler->current_vgpu, ring_id)))
733 		goto out;
734 
735 	/*
736 	 * still have current workload, maybe the workload disptacher
737 	 * fail to submit it for some reason, resubmit it.
738 	 */
739 	if (scheduler->current_workload[ring_id]) {
740 		workload = scheduler->current_workload[ring_id];
741 		gvt_dbg_sched("ring id %d still have current workload %p\n",
742 				ring_id, workload);
743 		goto out;
744 	}
745 
746 	/*
747 	 * pick a workload as current workload
748 	 * once current workload is set, schedule policy routines
749 	 * will wait the current workload is finished when trying to
750 	 * schedule out a vgpu.
751 	 */
752 	scheduler->current_workload[ring_id] = container_of(
753 			workload_q_head(scheduler->current_vgpu, ring_id)->next,
754 			struct intel_vgpu_workload, list);
755 
756 	workload = scheduler->current_workload[ring_id];
757 
758 	gvt_dbg_sched("ring id %d pick new workload %p\n", ring_id, workload);
759 
760 	atomic_inc(&workload->vgpu->submission.running_workload_num);
761 out:
762 	mutex_unlock(&gvt->lock);
763 	return workload;
764 }
765 
766 static void update_guest_context(struct intel_vgpu_workload *workload)
767 {
768 	struct intel_vgpu *vgpu = workload->vgpu;
769 	struct intel_gvt *gvt = vgpu->gvt;
770 	struct intel_vgpu_submission *s = &vgpu->submission;
771 	struct i915_gem_context *shadow_ctx = s->shadow_ctx;
772 	int ring_id = workload->ring_id;
773 	struct drm_i915_gem_object *ctx_obj =
774 		shadow_ctx->__engine[ring_id].state->obj;
775 	struct execlist_ring_context *shadow_ring_context;
776 	struct page *page;
777 	void *src;
778 	unsigned long context_gpa, context_page_num;
779 	int i;
780 
781 	gvt_dbg_sched("ring id %d workload lrca %x\n", ring_id,
782 			workload->ctx_desc.lrca);
783 
784 	context_page_num = gvt->dev_priv->engine[ring_id]->context_size;
785 
786 	context_page_num = context_page_num >> PAGE_SHIFT;
787 
788 	if (IS_BROADWELL(gvt->dev_priv) && ring_id == RCS)
789 		context_page_num = 19;
790 
791 	i = 2;
792 
793 	while (i < context_page_num) {
794 		context_gpa = intel_vgpu_gma_to_gpa(vgpu->gtt.ggtt_mm,
795 				(u32)((workload->ctx_desc.lrca + i) <<
796 					I915_GTT_PAGE_SHIFT));
797 		if (context_gpa == INTEL_GVT_INVALID_ADDR) {
798 			gvt_vgpu_err("invalid guest context descriptor\n");
799 			return;
800 		}
801 
802 		page = i915_gem_object_get_page(ctx_obj, LRC_HEADER_PAGES + i);
803 		src = kmap(page);
804 		intel_gvt_hypervisor_write_gpa(vgpu, context_gpa, src,
805 				I915_GTT_PAGE_SIZE);
806 		kunmap(page);
807 		i++;
808 	}
809 
810 	intel_gvt_hypervisor_write_gpa(vgpu, workload->ring_context_gpa +
811 		RING_CTX_OFF(ring_header.val), &workload->rb_tail, 4);
812 
813 	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
814 	shadow_ring_context = kmap(page);
815 
816 #define COPY_REG(name) \
817 	intel_gvt_hypervisor_write_gpa(vgpu, workload->ring_context_gpa + \
818 		RING_CTX_OFF(name.val), &shadow_ring_context->name.val, 4)
819 
820 	COPY_REG(ctx_ctrl);
821 	COPY_REG(ctx_timestamp);
822 
823 #undef COPY_REG
824 
825 	intel_gvt_hypervisor_write_gpa(vgpu,
826 			workload->ring_context_gpa +
827 			sizeof(*shadow_ring_context),
828 			(void *)shadow_ring_context +
829 			sizeof(*shadow_ring_context),
830 			I915_GTT_PAGE_SIZE - sizeof(*shadow_ring_context));
831 
832 	kunmap(page);
833 }
834 
835 static void clean_workloads(struct intel_vgpu *vgpu, unsigned long engine_mask)
836 {
837 	struct intel_vgpu_submission *s = &vgpu->submission;
838 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
839 	struct intel_engine_cs *engine;
840 	struct intel_vgpu_workload *pos, *n;
841 	unsigned int tmp;
842 
843 	/* free the unsubmited workloads in the queues. */
844 	for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
845 		list_for_each_entry_safe(pos, n,
846 			&s->workload_q_head[engine->id], list) {
847 			list_del_init(&pos->list);
848 			intel_vgpu_destroy_workload(pos);
849 		}
850 		clear_bit(engine->id, s->shadow_ctx_desc_updated);
851 	}
852 }
853 
/*
 * Finish the current workload of ring @ring_id.
 *
 * Runs under gvt->lock. If the workload has a request, this waits until the
 * shadow context is no longer active, derives the final workload status,
 * drops the request reference, writes the shadow context back to the guest
 * and triggers pending virtual events (only on success and when the engine
 * is not being reset), and unpins the shadow context. The workload is then
 * removed from the queue and from the scheduler's current slot, its shadow
 * resources are released on success, and queued workloads of the engine are
 * cleaned up on failure or engine reset. Finally the workload's complete()
 * hook is called and waiters on workload_complete_wq are woken.
 */
static void complete_current_workload(struct intel_gvt *gvt, int ring_id)
{
	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
	struct intel_vgpu_workload *workload =
		scheduler->current_workload[ring_id];
	struct intel_vgpu *vgpu = workload->vgpu;
	struct intel_vgpu_submission *s = &vgpu->submission;
	int event;

	mutex_lock(&gvt->lock);

	/* For the workload w/ request, needs to wait for the context
	 * switch to make sure request is completed.
	 * For the workload w/o request, directly complete the workload.
	 */
	if (workload->req) {
		struct drm_i915_private *dev_priv =
			workload->vgpu->gvt->dev_priv;
		struct intel_engine_cs *engine =
			dev_priv->engine[workload->ring_id];
		/* shadow_ctx_active is cleared by the schedule-out notifier. */
		wait_event(workload->shadow_ctx_status_wq,
			   !atomic_read(&workload->shadow_ctx_active));

		/* If this request caused GPU hang, req->fence.error will
		 * be set to -EIO. Use -EIO to set workload status so
		 * that when this request caused GPU hang, didn't trigger
		 * context switch interrupt to guest.
		 */
		if (likely(workload->status == -EINPROGRESS)) {
			if (workload->req->fence.error == -EIO)
				workload->status = -EIO;
			else
				workload->status = 0;
		}

		/* Drop our reference and clear workload->req in one step. */
		i915_request_put(fetch_and_zero(&workload->req));

		if (!workload->status && !(vgpu->resetting_eng &
					   ENGINE_MASK(ring_id))) {
			update_guest_context(workload);

			for_each_set_bit(event, workload->pending_events,
					 INTEL_GVT_EVENT_MAX)
				intel_vgpu_trigger_virtual_event(vgpu, event);
		}
		mutex_lock(&dev_priv->drm.struct_mutex);
		/* unpin shadow ctx as the shadow_ctx update is done */
		intel_context_unpin(s->shadow_ctx, engine);
		mutex_unlock(&dev_priv->drm.struct_mutex);
	}

	gvt_dbg_sched("ring id %d complete workload %p status %d\n",
			ring_id, workload, workload->status);

	scheduler->current_workload[ring_id] = NULL;

	list_del_init(&workload->list);

	/* Shadow resources are released here only for successful workloads. */
	if (!workload->status) {
		release_shadow_batch_buffer(workload);
		release_shadow_wa_ctx(&workload->wa_ctx);
	}

	if (workload->status || (vgpu->resetting_eng & ENGINE_MASK(ring_id))) {
		/* If workload->status is not successful, the HW GPU has
		 * hung or something went wrong in i915/GVT, and GVT won't
		 * inject a context switch interrupt to the guest.
		 * So to the guest this error is actually a vGPU hang.
		 * Accordingly we should emulate a vGPU hang. If there are
		 * pending workloads which were already submitted by the
		 * guest, we should clean them up like the HW GPU does.
		 *
		 * If we are in the middle of an engine reset, the pending
		 * workloads won't be submitted to the HW GPU and will be
		 * cleaned up during the resetting process later, so doing
		 * the workload clean up here doesn't have any impact.
		 **/
		clean_workloads(vgpu, ENGINE_MASK(ring_id));
	}

	workload->complete(workload);

	atomic_dec(&s->running_workload_num);
	wake_up(&scheduler->workload_complete_wq);

	if (gvt->scheduler.need_reschedule)
		intel_gvt_request_service(gvt, INTEL_GVT_REQUEST_EVENT_SCHED);

	mutex_unlock(&gvt->lock);
}
944 
/*
 * Start-up arguments handed to workload_thread(); the thread copies both
 * fields and then frees the structure.
 */
struct workload_thread_param {
	/* GVT device the thread serves. */
	struct intel_gvt *gvt;
	/* Ring whose workloads the thread dispatches. */
	int ring_id;
};
949 
/*
 * Kernel thread body that dispatches workloads for one ring.
 *
 * @priv is a struct workload_thread_param; it is freed here once its fields
 * have been copied. The thread loops until asked to stop: it sleeps on the
 * ring's wait queue until pick_next_workload() returns a workload, dispatches
 * it under gvt->lock, waits for the i915 request to finish, and then
 * completes the workload. A runtime PM reference (and, on Skylake/Kabylake,
 * all forcewake domains) is held while a workload is being processed. If
 * dispatching fails the wait is skipped and the workload is completed
 * directly; when the error indicates an unhealthy VM, the vGPU enters
 * failsafe mode.
 *
 * Always returns 0.
 */
static int workload_thread(void *priv)
{
	struct workload_thread_param *p = (struct workload_thread_param *)priv;
	struct intel_gvt *gvt = p->gvt;
	int ring_id = p->ring_id;
	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
	struct intel_vgpu_workload *workload = NULL;
	struct intel_vgpu *vgpu = NULL;
	int ret;
	bool need_force_wake = IS_SKYLAKE(gvt->dev_priv)
			|| IS_KABYLAKE(gvt->dev_priv);
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	/* gvt and ring_id were copied above; the parameter block is done. */
	kfree(p);

	gvt_dbg_core("workload thread for ring %d started\n", ring_id);

	while (!kthread_should_stop()) {
		/* Sleep until a workload can be picked or we must stop. */
		add_wait_queue(&scheduler->waitq[ring_id], &wait);
		do {
			workload = pick_next_workload(gvt, ring_id);
			if (workload)
				break;
			wait_woken(&wait, TASK_INTERRUPTIBLE,
				   MAX_SCHEDULE_TIMEOUT);
		} while (!kthread_should_stop());
		remove_wait_queue(&scheduler->waitq[ring_id], &wait);

		/* No workload here means the wait loop ended due to stop. */
		if (!workload)
			break;

		gvt_dbg_sched("ring id %d next workload %p vgpu %d\n",
				workload->ring_id, workload,
				workload->vgpu->id);

		intel_runtime_pm_get(gvt->dev_priv);

		gvt_dbg_sched("ring id %d will dispatch workload %p\n",
				workload->ring_id, workload);

		if (need_force_wake)
			intel_uncore_forcewake_get(gvt->dev_priv,
					FORCEWAKE_ALL);

		mutex_lock(&gvt->lock);
		ret = dispatch_workload(workload);
		mutex_unlock(&gvt->lock);

		if (ret) {
			/* gvt_vgpu_err() and the failsafe call below use vgpu. */
			vgpu = workload->vgpu;
			gvt_vgpu_err("fail to dispatch workload, skip\n");
			goto complete;
		}

		gvt_dbg_sched("ring id %d wait workload %p\n",
				workload->ring_id, workload);
		i915_request_wait(workload->req, 0, MAX_SCHEDULE_TIMEOUT);

complete:
		gvt_dbg_sched("will complete workload %p, status: %d\n",
				workload, workload->status);

		complete_current_workload(gvt, ring_id);

		if (need_force_wake)
			intel_uncore_forcewake_put(gvt->dev_priv,
					FORCEWAKE_ALL);

		intel_runtime_pm_put(gvt->dev_priv);
		/* vgpu is only set on the dispatch failure path (ret != 0). */
		if (ret && (vgpu_is_vm_unhealthy(ret)))
			enter_failsafe_mode(vgpu, GVT_FAILSAFE_GUEST_ERR);
	}
	return 0;
}
1024 
1025 void intel_gvt_wait_vgpu_idle(struct intel_vgpu *vgpu)
1026 {
1027 	struct intel_vgpu_submission *s = &vgpu->submission;
1028 	struct intel_gvt *gvt = vgpu->gvt;
1029 	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
1030 
1031 	if (atomic_read(&s->running_workload_num)) {
1032 		gvt_dbg_sched("wait vgpu idle\n");
1033 
1034 		wait_event(scheduler->workload_complete_wq,
1035 				!atomic_read(&s->running_workload_num));
1036 	}
1037 }
1038 
1039 void intel_gvt_clean_workload_scheduler(struct intel_gvt *gvt)
1040 {
1041 	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
1042 	struct intel_engine_cs *engine;
1043 	enum intel_engine_id i;
1044 
1045 	gvt_dbg_core("clean workload scheduler\n");
1046 
1047 	for_each_engine(engine, gvt->dev_priv, i) {
1048 		atomic_notifier_chain_unregister(
1049 					&engine->context_status_notifier,
1050 					&gvt->shadow_ctx_notifier_block[i]);
1051 		kthread_stop(scheduler->thread[i]);
1052 	}
1053 }
1054 
1055 int intel_gvt_init_workload_scheduler(struct intel_gvt *gvt)
1056 {
1057 	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
1058 	struct workload_thread_param *param = NULL;
1059 	struct intel_engine_cs *engine;
1060 	enum intel_engine_id i;
1061 	int ret;
1062 
1063 	gvt_dbg_core("init workload scheduler\n");
1064 
1065 	init_waitqueue_head(&scheduler->workload_complete_wq);
1066 
1067 	for_each_engine(engine, gvt->dev_priv, i) {
1068 		init_waitqueue_head(&scheduler->waitq[i]);
1069 
1070 		param = kzalloc(sizeof(*param), GFP_KERNEL);
1071 		if (!param) {
1072 			ret = -ENOMEM;
1073 			goto err;
1074 		}
1075 
1076 		param->gvt = gvt;
1077 		param->ring_id = i;
1078 
1079 		scheduler->thread[i] = kthread_run(workload_thread, param,
1080 			"gvt workload %d", i);
1081 		if (IS_ERR(scheduler->thread[i])) {
1082 			gvt_err("fail to create workload thread\n");
1083 			ret = PTR_ERR(scheduler->thread[i]);
1084 			goto err;
1085 		}
1086 
1087 		gvt->shadow_ctx_notifier_block[i].notifier_call =
1088 					shadow_context_status_change;
1089 		atomic_notifier_chain_register(&engine->context_status_notifier,
1090 					&gvt->shadow_ctx_notifier_block[i]);
1091 	}
1092 	return 0;
1093 err:
1094 	intel_gvt_clean_workload_scheduler(gvt);
1095 	kfree(param);
1096 	param = NULL;
1097 	return ret;
1098 }
1099 
1100 /**
1101  * intel_vgpu_clean_submission - free submission-related resource for vGPU
1102  * @vgpu: a vGPU
1103  *
1104  * This function is called when a vGPU is being destroyed.
1105  *
1106  */
1107 void intel_vgpu_clean_submission(struct intel_vgpu *vgpu)
1108 {
1109 	struct intel_vgpu_submission *s = &vgpu->submission;
1110 
1111 	intel_vgpu_select_submission_ops(vgpu, ALL_ENGINES, 0);
1112 	i915_gem_context_put(s->shadow_ctx);
1113 	kmem_cache_destroy(s->workloads);
1114 }
1115 
1116 
1117 /**
1118  * intel_vgpu_reset_submission - reset submission-related resource for vGPU
1119  * @vgpu: a vGPU
1120  * @engine_mask: engines expected to be reset
1121  *
1122  * This function is called when a vGPU is being destroyed.
1123  *
1124  */
1125 void intel_vgpu_reset_submission(struct intel_vgpu *vgpu,
1126 		unsigned long engine_mask)
1127 {
1128 	struct intel_vgpu_submission *s = &vgpu->submission;
1129 
1130 	if (!s->active)
1131 		return;
1132 
1133 	clean_workloads(vgpu, engine_mask);
1134 	s->ops->reset(vgpu, engine_mask);
1135 }
1136 
1137 /**
1138  * intel_vgpu_setup_submission - setup submission-related resource for vGPU
1139  * @vgpu: a vGPU
1140  *
1141  * This function is called when a vGPU is being created.
1142  *
1143  * Returns:
1144  * Zero on success, negative error code if failed.
1145  *
1146  */
1147 int intel_vgpu_setup_submission(struct intel_vgpu *vgpu)
1148 {
1149 	struct intel_vgpu_submission *s = &vgpu->submission;
1150 	enum intel_engine_id i;
1151 	struct intel_engine_cs *engine;
1152 	int ret;
1153 
1154 	s->shadow_ctx = i915_gem_context_create_gvt(
1155 			&vgpu->gvt->dev_priv->drm);
1156 	if (IS_ERR(s->shadow_ctx))
1157 		return PTR_ERR(s->shadow_ctx);
1158 
1159 	bitmap_zero(s->shadow_ctx_desc_updated, I915_NUM_ENGINES);
1160 
1161 	s->workloads = kmem_cache_create_usercopy("gvt-g_vgpu_workload",
1162 						  sizeof(struct intel_vgpu_workload), 0,
1163 						  SLAB_HWCACHE_ALIGN,
1164 						  offsetof(struct intel_vgpu_workload, rb_tail),
1165 						  sizeof_field(struct intel_vgpu_workload, rb_tail),
1166 						  NULL);
1167 
1168 	if (!s->workloads) {
1169 		ret = -ENOMEM;
1170 		goto out_shadow_ctx;
1171 	}
1172 
1173 	for_each_engine(engine, vgpu->gvt->dev_priv, i)
1174 		INIT_LIST_HEAD(&s->workload_q_head[i]);
1175 
1176 	atomic_set(&s->running_workload_num, 0);
1177 	bitmap_zero(s->tlb_handle_pending, I915_NUM_ENGINES);
1178 
1179 	return 0;
1180 
1181 out_shadow_ctx:
1182 	i915_gem_context_put(s->shadow_ctx);
1183 	return ret;
1184 }
1185 
1186 /**
1187  * intel_vgpu_select_submission_ops - select virtual submission interface
1188  * @vgpu: a vGPU
1189  * @interface: expected vGPU virtual submission interface
1190  *
1191  * This function is called when guest configures submission interface.
1192  *
1193  * Returns:
1194  * Zero on success, negative error code if failed.
1195  *
1196  */
1197 int intel_vgpu_select_submission_ops(struct intel_vgpu *vgpu,
1198 				     unsigned long engine_mask,
1199 				     unsigned int interface)
1200 {
1201 	struct intel_vgpu_submission *s = &vgpu->submission;
1202 	const struct intel_vgpu_submission_ops *ops[] = {
1203 		[INTEL_VGPU_EXECLIST_SUBMISSION] =
1204 			&intel_vgpu_execlist_submission_ops,
1205 	};
1206 	int ret;
1207 
1208 	if (WARN_ON(interface >= ARRAY_SIZE(ops)))
1209 		return -EINVAL;
1210 
1211 	if (WARN_ON(interface == 0 && engine_mask != ALL_ENGINES))
1212 		return -EINVAL;
1213 
1214 	if (s->active)
1215 		s->ops->clean(vgpu, engine_mask);
1216 
1217 	if (interface == 0) {
1218 		s->ops = NULL;
1219 		s->virtual_submission_interface = 0;
1220 		s->active = false;
1221 		gvt_dbg_core("vgpu%d: remove submission ops\n", vgpu->id);
1222 		return 0;
1223 	}
1224 
1225 	ret = ops[interface]->init(vgpu, engine_mask);
1226 	if (ret)
1227 		return ret;
1228 
1229 	s->ops = ops[interface];
1230 	s->virtual_submission_interface = interface;
1231 	s->active = true;
1232 
1233 	gvt_dbg_core("vgpu%d: activate ops [ %s ]\n",
1234 			vgpu->id, s->ops->name);
1235 
1236 	return 0;
1237 }
1238 
1239 /**
1240  * intel_vgpu_destroy_workload - destroy a vGPU workload
1241  * @vgpu: a vGPU
1242  *
1243  * This function is called when destroy a vGPU workload.
1244  *
1245  */
1246 void intel_vgpu_destroy_workload(struct intel_vgpu_workload *workload)
1247 {
1248 	struct intel_vgpu_submission *s = &workload->vgpu->submission;
1249 
1250 	if (workload->shadow_mm)
1251 		intel_vgpu_mm_put(workload->shadow_mm);
1252 
1253 	kmem_cache_free(s->workloads, workload);
1254 }
1255 
1256 static struct intel_vgpu_workload *
1257 alloc_workload(struct intel_vgpu *vgpu)
1258 {
1259 	struct intel_vgpu_submission *s = &vgpu->submission;
1260 	struct intel_vgpu_workload *workload;
1261 
1262 	workload = kmem_cache_zalloc(s->workloads, GFP_KERNEL);
1263 	if (!workload)
1264 		return ERR_PTR(-ENOMEM);
1265 
1266 	INIT_LIST_HEAD(&workload->list);
1267 	INIT_LIST_HEAD(&workload->shadow_bb);
1268 
1269 	init_waitqueue_head(&workload->shadow_ctx_status_wq);
1270 	atomic_set(&workload->shadow_ctx_active, 0);
1271 
1272 	workload->status = -EINPROGRESS;
1273 	workload->shadowed = false;
1274 	workload->vgpu = vgpu;
1275 
1276 	return workload;
1277 }
1278 
/*
 * Byte offset of member @x inside struct execlist_ring_context.
 * NOTE(review): this repeats the identical definition near the top of the
 * file.
 */
#define RING_CTX_OFF(x) \
	offsetof(struct execlist_ring_context, x)
1281 
1282 static void read_guest_pdps(struct intel_vgpu *vgpu,
1283 		u64 ring_context_gpa, u32 pdp[8])
1284 {
1285 	u64 gpa;
1286 	int i;
1287 
1288 	gpa = ring_context_gpa + RING_CTX_OFF(pdp3_UDW.val);
1289 
1290 	for (i = 0; i < 8; i++)
1291 		intel_gvt_hypervisor_read_gpa(vgpu,
1292 				gpa + i * 8, &pdp[7 - i], 4);
1293 }
1294 
1295 static int prepare_mm(struct intel_vgpu_workload *workload)
1296 {
1297 	struct execlist_ctx_descriptor_format *desc = &workload->ctx_desc;
1298 	struct intel_vgpu_mm *mm;
1299 	struct intel_vgpu *vgpu = workload->vgpu;
1300 	intel_gvt_gtt_type_t root_entry_type;
1301 	u64 pdps[GVT_RING_CTX_NR_PDPS];
1302 
1303 	switch (desc->addressing_mode) {
1304 	case 1: /* legacy 32-bit */
1305 		root_entry_type = GTT_TYPE_PPGTT_ROOT_L3_ENTRY;
1306 		break;
1307 	case 3: /* legacy 64-bit */
1308 		root_entry_type = GTT_TYPE_PPGTT_ROOT_L4_ENTRY;
1309 		break;
1310 	default:
1311 		gvt_vgpu_err("Advanced Context mode(SVM) is not supported!\n");
1312 		return -EINVAL;
1313 	}
1314 
1315 	read_guest_pdps(workload->vgpu, workload->ring_context_gpa, (void *)pdps);
1316 
1317 	mm = intel_vgpu_get_ppgtt_mm(workload->vgpu, root_entry_type, pdps);
1318 	if (IS_ERR(mm))
1319 		return PTR_ERR(mm);
1320 
1321 	workload->shadow_mm = mm;
1322 	return 0;
1323 }
1324 
/* True when two context descriptors have the same context ID and LRCA. */
#define same_context(a, b) (((a)->context_id == (b)->context_id) && \
		((a)->lrca == (b)->lrca))

/*
 * Last workload on list head @q, or NULL when the list is empty.  The
 * argument is parenthesized so that any pointer expression may be passed.
 */
#define get_last_workload(q) \
	(list_empty(q) ? NULL : \
	 list_last_entry((q), struct intel_vgpu_workload, list))
1331 /**
1332  * intel_vgpu_create_workload - create a vGPU workload
1333  * @vgpu: a vGPU
1334  * @desc: a guest context descriptor
1335  *
1336  * This function is called when creating a vGPU workload.
1337  *
1338  * Returns:
1339  * struct intel_vgpu_workload * on success, negative error code in
1340  * pointer if failed.
1341  *
1342  */
1343 struct intel_vgpu_workload *
1344 intel_vgpu_create_workload(struct intel_vgpu *vgpu, int ring_id,
1345 			   struct execlist_ctx_descriptor_format *desc)
1346 {
1347 	struct intel_vgpu_submission *s = &vgpu->submission;
1348 	struct list_head *q = workload_q_head(vgpu, ring_id);
1349 	struct intel_vgpu_workload *last_workload = get_last_workload(q);
1350 	struct intel_vgpu_workload *workload = NULL;
1351 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
1352 	u64 ring_context_gpa;
1353 	u32 head, tail, start, ctl, ctx_ctl, per_ctx, indirect_ctx;
1354 	int ret;
1355 
1356 	ring_context_gpa = intel_vgpu_gma_to_gpa(vgpu->gtt.ggtt_mm,
1357 			(u32)((desc->lrca + 1) << I915_GTT_PAGE_SHIFT));
1358 	if (ring_context_gpa == INTEL_GVT_INVALID_ADDR) {
1359 		gvt_vgpu_err("invalid guest context LRCA: %x\n", desc->lrca);
1360 		return ERR_PTR(-EINVAL);
1361 	}
1362 
1363 	intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
1364 			RING_CTX_OFF(ring_header.val), &head, 4);
1365 
1366 	intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
1367 			RING_CTX_OFF(ring_tail.val), &tail, 4);
1368 
1369 	head &= RB_HEAD_OFF_MASK;
1370 	tail &= RB_TAIL_OFF_MASK;
1371 
1372 	if (last_workload && same_context(&last_workload->ctx_desc, desc)) {
1373 		gvt_dbg_el("ring id %d cur workload == last\n", ring_id);
1374 		gvt_dbg_el("ctx head %x real head %lx\n", head,
1375 				last_workload->rb_tail);
1376 		/*
1377 		 * cannot use guest context head pointer here,
1378 		 * as it might not be updated at this time
1379 		 */
1380 		head = last_workload->rb_tail;
1381 	}
1382 
1383 	gvt_dbg_el("ring id %d begin a new workload\n", ring_id);
1384 
1385 	/* record some ring buffer register values for scan and shadow */
1386 	intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
1387 			RING_CTX_OFF(rb_start.val), &start, 4);
1388 	intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
1389 			RING_CTX_OFF(rb_ctrl.val), &ctl, 4);
1390 	intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
1391 			RING_CTX_OFF(ctx_ctrl.val), &ctx_ctl, 4);
1392 
1393 	workload = alloc_workload(vgpu);
1394 	if (IS_ERR(workload))
1395 		return workload;
1396 
1397 	workload->ring_id = ring_id;
1398 	workload->ctx_desc = *desc;
1399 	workload->ring_context_gpa = ring_context_gpa;
1400 	workload->rb_head = head;
1401 	workload->rb_tail = tail;
1402 	workload->rb_start = start;
1403 	workload->rb_ctl = ctl;
1404 
1405 	if (ring_id == RCS) {
1406 		intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
1407 			RING_CTX_OFF(bb_per_ctx_ptr.val), &per_ctx, 4);
1408 		intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
1409 			RING_CTX_OFF(rcs_indirect_ctx.val), &indirect_ctx, 4);
1410 
1411 		workload->wa_ctx.indirect_ctx.guest_gma =
1412 			indirect_ctx & INDIRECT_CTX_ADDR_MASK;
1413 		workload->wa_ctx.indirect_ctx.size =
1414 			(indirect_ctx & INDIRECT_CTX_SIZE_MASK) *
1415 			CACHELINE_BYTES;
1416 		workload->wa_ctx.per_ctx.guest_gma =
1417 			per_ctx & PER_CTX_ADDR_MASK;
1418 		workload->wa_ctx.per_ctx.valid = per_ctx & 1;
1419 	}
1420 
1421 	gvt_dbg_el("workload %p ring id %d head %x tail %x start %x ctl %x\n",
1422 			workload, ring_id, head, tail, start, ctl);
1423 
1424 	ret = prepare_mm(workload);
1425 	if (ret) {
1426 		kmem_cache_free(s->workloads, workload);
1427 		return ERR_PTR(ret);
1428 	}
1429 
1430 	/* Only scan and shadow the first workload in the queue
1431 	 * as there is only one pre-allocated buf-obj for shadow.
1432 	 */
1433 	if (list_empty(workload_q_head(vgpu, ring_id))) {
1434 		intel_runtime_pm_get(dev_priv);
1435 		mutex_lock(&dev_priv->drm.struct_mutex);
1436 		ret = intel_gvt_scan_and_shadow_workload(workload);
1437 		mutex_unlock(&dev_priv->drm.struct_mutex);
1438 		intel_runtime_pm_put(dev_priv);
1439 	}
1440 
1441 	if (ret && (vgpu_is_vm_unhealthy(ret))) {
1442 		enter_failsafe_mode(vgpu, GVT_FAILSAFE_GUEST_ERR);
1443 		intel_vgpu_destroy_workload(workload);
1444 		return ERR_PTR(ret);
1445 	}
1446 
1447 	return workload;
1448 }
1449 
1450 /**
1451  * intel_vgpu_queue_workload - Qeue a vGPU workload
1452  * @workload: the workload to queue in
1453  */
1454 void intel_vgpu_queue_workload(struct intel_vgpu_workload *workload)
1455 {
1456 	list_add_tail(&workload->list,
1457 		workload_q_head(workload->vgpu, workload->ring_id));
1458 	intel_gvt_kick_schedule(workload->vgpu->gvt);
1459 	wake_up(&workload->vgpu->gvt->scheduler.waitq[workload->ring_id]);
1460 }
1461