xref: /openbmc/linux/drivers/gpu/drm/i915/gvt/scheduler.c (revision fb960bd2)
1 /*
2  * Copyright(c) 2011-2016 Intel Corporation. All rights reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors:
24  *    Zhi Wang <zhi.a.wang@intel.com>
25  *
26  * Contributors:
27  *    Ping Gao <ping.a.gao@intel.com>
28  *    Tina Zhang <tina.zhang@intel.com>
29  *    Changbin Du <changbin.du@intel.com>
30  *    Min He <min.he@intel.com>
31  *    Bing Niu <bing.niu@intel.com>
32  *    Zhenyu Wang <zhenyuw@linux.intel.com>
33  *
34  */
35 
36 #include <linux/kthread.h>
37 
38 #include "i915_drv.h"
39 #include "gvt.h"
40 
41 #define RING_CTX_OFF(x) \
42 	offsetof(struct execlist_ring_context, x)
43 
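/*
 * The execlist ring context stores the PDP root pointers as MMIO pairs
 * laid out from pdp3_UDW down to pdp0_LDW, so the eight 32-bit values
 * handed in through @pdp are written back in reverse order.
 */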
44 static void set_context_pdp_root_pointer(
45 		struct execlist_ring_context *ring_context,
46 		u32 pdp[8])
47 {
48 	struct execlist_mmio_pair *pdp_pair = &ring_context->pdp3_UDW;
49 	int i;
50 
51 	for (i = 0; i < 8; i++)
52 		pdp_pair[i].val = pdp[7 - i];
53 }
54 
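/*
 * Copy the guest logical ring context into the shadow context object:
 * every context page past the header is read from guest memory, then a
 * few selected registers plus the remainder of the ring-context page are
 * copied, with the PDP root pointers replaced by the shadow page table.
 */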
55 static int populate_shadow_context(struct intel_vgpu_workload *workload)
56 {
57 	struct intel_vgpu *vgpu = workload->vgpu;
58 	struct intel_gvt *gvt = vgpu->gvt;
59 	int ring_id = workload->ring_id;
60 	struct i915_gem_context *shadow_ctx = vgpu->submission.shadow_ctx;
61 	struct drm_i915_gem_object *ctx_obj =
62 		shadow_ctx->engine[ring_id].state->obj;
63 	struct execlist_ring_context *shadow_ring_context;
64 	struct page *page;
65 	void *dst;
66 	unsigned long context_gpa, context_page_num;
67 	int i;
68 
69 	gvt_dbg_sched("ring id %d workload lrca %x\n", ring_id,
70 			workload->ctx_desc.lrca);
71 
72 	context_page_num = gvt->dev_priv->engine[ring_id]->context_size;
73 
74 	context_page_num = context_page_num >> PAGE_SHIFT;
75 
76 	if (IS_BROADWELL(gvt->dev_priv) && ring_id == RCS)
77 		context_page_num = 19;
78 
79 	i = 2;
80 
81 	while (i < context_page_num) {
82 		context_gpa = intel_vgpu_gma_to_gpa(vgpu->gtt.ggtt_mm,
83 				(u32)((workload->ctx_desc.lrca + i) <<
84 				I915_GTT_PAGE_SHIFT));
85 		if (context_gpa == INTEL_GVT_INVALID_ADDR) {
86 			gvt_vgpu_err("Invalid guest context descriptor\n");
87 			return -EFAULT;
88 		}
89 
90 		page = i915_gem_object_get_page(ctx_obj, LRC_HEADER_PAGES + i);
91 		dst = kmap(page);
92 		intel_gvt_hypervisor_read_gpa(vgpu, context_gpa, dst,
93 				I915_GTT_PAGE_SIZE);
94 		kunmap(page);
95 		i++;
96 	}
97 
98 	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
99 	shadow_ring_context = kmap(page);
100 
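	/*
	 * COPY_REG() reads the 32-bit .val half of one execlist_mmio_pair
	 * from the guest ring context into the shadow ring context at the
	 * matching RING_CTX_OFF() offset.
	 */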
101 #define COPY_REG(name) \
102 	intel_gvt_hypervisor_read_gpa(vgpu, workload->ring_context_gpa \
103 		+ RING_CTX_OFF(name.val), &shadow_ring_context->name.val, 4)
104 
105 	COPY_REG(ctx_ctrl);
106 	COPY_REG(ctx_timestamp);
107 
108 	if (ring_id == RCS) {
109 		COPY_REG(bb_per_ctx_ptr);
110 		COPY_REG(rcs_indirect_ctx);
111 		COPY_REG(rcs_indirect_ctx_offset);
112 	}
113 #undef COPY_REG
114 
115 	set_context_pdp_root_pointer(shadow_ring_context,
116 				     workload->shadow_mm->shadow_page_table);
117 
118 	intel_gvt_hypervisor_read_gpa(vgpu,
119 			workload->ring_context_gpa +
120 			sizeof(*shadow_ring_context),
121 			(void *)shadow_ring_context +
122 			sizeof(*shadow_ring_context),
123 			I915_GTT_PAGE_SIZE - sizeof(*shadow_ring_context));
124 
125 	kunmap(page);
126 	return 0;
127 }
128 
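/*
 * GVT's shadow context is created with the force-single-submission flag
 * set, which ordinary host contexts do not use, so this is how requests
 * submitted by GVT itself are told apart from host i915 requests.
 */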
129 static inline bool is_gvt_request(struct drm_i915_gem_request *req)
130 {
131 	return i915_gem_context_force_single_submission(req->ctx);
132 }
133 
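/*
 * Snapshot a few ring registers (INSTDONE, ACTHD, ACTHD_UDW) into the
 * vGPU's virtual MMIO space; called when the shadow context is scheduled
 * out so the guest can observe plausible post-run hardware state.
 */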
134 static void save_ring_hw_state(struct intel_vgpu *vgpu, int ring_id)
135 {
136 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
137 	u32 ring_base = dev_priv->engine[ring_id]->mmio_base;
138 	i915_reg_t reg;
139 
140 	reg = RING_INSTDONE(ring_base);
141 	vgpu_vreg(vgpu, i915_mmio_reg_offset(reg)) = I915_READ_FW(reg);
142 	reg = RING_ACTHD(ring_base);
143 	vgpu_vreg(vgpu, i915_mmio_reg_offset(reg)) = I915_READ_FW(reg);
144 	reg = RING_ACTHD_UDW(ring_base);
145 	vgpu_vreg(vgpu, i915_mmio_reg_offset(reg)) = I915_READ_FW(reg);
146 }
147 
148 static int shadow_context_status_change(struct notifier_block *nb,
149 		unsigned long action, void *data)
150 {
151 	struct drm_i915_gem_request *req = (struct drm_i915_gem_request *)data;
152 	struct intel_gvt *gvt = container_of(nb, struct intel_gvt,
153 				shadow_ctx_notifier_block[req->engine->id]);
154 	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
155 	enum intel_engine_id ring_id = req->engine->id;
156 	struct intel_vgpu_workload *workload;
157 	unsigned long flags;
158 
159 	if (!is_gvt_request(req)) {
160 		spin_lock_irqsave(&scheduler->mmio_context_lock, flags);
161 		if (action == INTEL_CONTEXT_SCHEDULE_IN &&
162 		    scheduler->engine_owner[ring_id]) {
163 			/* Switch ring from vGPU to host. */
164 			intel_gvt_switch_mmio(scheduler->engine_owner[ring_id],
165 					      NULL, ring_id);
166 			scheduler->engine_owner[ring_id] = NULL;
167 		}
168 		spin_unlock_irqrestore(&scheduler->mmio_context_lock, flags);
169 
170 		return NOTIFY_OK;
171 	}
172 
173 	workload = scheduler->current_workload[ring_id];
174 	if (unlikely(!workload))
175 		return NOTIFY_OK;
176 
177 	switch (action) {
178 	case INTEL_CONTEXT_SCHEDULE_IN:
179 		spin_lock_irqsave(&scheduler->mmio_context_lock, flags);
180 		if (workload->vgpu != scheduler->engine_owner[ring_id]) {
181 			/* Switch ring from host to vGPU or vGPU to vGPU. */
182 			intel_gvt_switch_mmio(scheduler->engine_owner[ring_id],
183 					      workload->vgpu, ring_id);
184 			scheduler->engine_owner[ring_id] = workload->vgpu;
185 		} else
186 			gvt_dbg_sched("skip ring %d mmio switch for vgpu%d\n",
187 				      ring_id, workload->vgpu->id);
188 		spin_unlock_irqrestore(&scheduler->mmio_context_lock, flags);
189 		atomic_set(&workload->shadow_ctx_active, 1);
190 		break;
191 	case INTEL_CONTEXT_SCHEDULE_OUT:
192 	case INTEL_CONTEXT_SCHEDULE_PREEMPTED:
193 		save_ring_hw_state(workload->vgpu, ring_id);
194 		atomic_set(&workload->shadow_ctx_active, 0);
195 		break;
196 	default:
197 		WARN_ON(1);
198 		return NOTIFY_OK;
199 	}
200 	wake_up(&workload->shadow_ctx_status_wq);
201 	return NOTIFY_OK;
202 }
203 
204 static void shadow_context_descriptor_update(struct i915_gem_context *ctx,
205 		struct intel_engine_cs *engine)
206 {
207 	struct intel_context *ce = &ctx->engine[engine->id];
208 	u64 desc = 0;
209 
210 	desc = ce->lrc_desc;
211 
212 	/* Update bits 0-11 of the context descriptor, which include flags
213 	 * like GEN8_CTX_* cached in desc_template.
214 	 */
215 	desc &= U64_MAX << 12;
216 	desc |= ctx->desc_template & ((1ULL << 12) - 1);
217 
218 	ce->lrc_desc = desc;
219 }
220 
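/*
 * Reserve rb_len bytes in the i915 request's ring via intel_ring_begin()
 * and copy the already-scanned shadow ring buffer contents into it;
 * shadow_ring_buffer_va is then re-pointed at the copy inside the ring.
 */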
221 static int copy_workload_to_ring_buffer(struct intel_vgpu_workload *workload)
222 {
223 	struct intel_vgpu *vgpu = workload->vgpu;
224 	void *shadow_ring_buffer_va;
225 	u32 *cs;
226 
227 	/* allocate shadow ring buffer */
228 	cs = intel_ring_begin(workload->req, workload->rb_len / sizeof(u32));
229 	if (IS_ERR(cs)) {
230 		gvt_vgpu_err("fail to alloc size=%ld shadow ring buffer\n",
231 			workload->rb_len);
232 		return PTR_ERR(cs);
233 	}
234 
235 	shadow_ring_buffer_va = workload->shadow_ring_buffer_va;
236 
237 	/* get shadow ring buffer va */
238 	workload->shadow_ring_buffer_va = cs;
239 
240 	memcpy(cs, shadow_ring_buffer_va,
241 			workload->rb_len);
242 
243 	cs += workload->rb_len / sizeof(u32);
244 	intel_ring_advance(workload->req, cs);
245 
246 	return 0;
247 }
248 
249 void release_shadow_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx)
250 {
251 	if (!wa_ctx->indirect_ctx.obj)
252 		return;
253 
254 	i915_gem_object_unpin_map(wa_ctx->indirect_ctx.obj);
255 	i915_gem_object_put(wa_ctx->indirect_ctx.obj);
256 }
257 
258 /**
259  * intel_gvt_scan_and_shadow_workload - audit the workload by scanning and
260  * shadowing it, including the ring buffer, wa_ctx and context.
261  * @workload: an abstract entity for each execlist submission.
262  *
263  * This function is called before the workload is submitted to i915, to make
264  * sure the content of the workload is valid.
265  */
266 int intel_gvt_scan_and_shadow_workload(struct intel_vgpu_workload *workload)
267 {
268 	struct intel_vgpu *vgpu = workload->vgpu;
269 	struct intel_vgpu_submission *s = &vgpu->submission;
270 	struct i915_gem_context *shadow_ctx = s->shadow_ctx;
271 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
272 	int ring_id = workload->ring_id;
273 	struct intel_engine_cs *engine = dev_priv->engine[ring_id];
274 	struct intel_ring *ring;
275 	int ret;
276 
277 	lockdep_assert_held(&dev_priv->drm.struct_mutex);
278 
279 	if (workload->shadowed)
280 		return 0;
281 
282 	shadow_ctx->desc_template &= ~(0x3 << GEN8_CTX_ADDRESSING_MODE_SHIFT);
283 	shadow_ctx->desc_template |= workload->ctx_desc.addressing_mode <<
284 				    GEN8_CTX_ADDRESSING_MODE_SHIFT;
285 
286 	if (!test_and_set_bit(ring_id, s->shadow_ctx_desc_updated))
287 		shadow_context_descriptor_update(shadow_ctx,
288 					dev_priv->engine[ring_id]);
289 
290 	ret = intel_gvt_scan_and_shadow_ringbuffer(workload);
291 	if (ret)
292 		goto err_scan;
293 
294 	if ((workload->ring_id == RCS) &&
295 	    (workload->wa_ctx.indirect_ctx.size != 0)) {
296 		ret = intel_gvt_scan_and_shadow_wa_ctx(&workload->wa_ctx);
297 		if (ret)
298 			goto err_scan;
299 	}
300 
301 	/* Pin the shadow context by GVT even though it will also be pinned
302 	 * when i915 allocates the request. GVT updates the guest context
303 	 * from the shadow context when the workload is completed, and by
304 	 * that moment i915 may have already unpinned the shadow context,
305 	 * making the shadow_ctx pages invalid. So GVT needs its own pin.
306 	 * After updating the guest context, GVT can unpin shadow_ctx safely.
307 	 */
308 	ring = engine->context_pin(engine, shadow_ctx);
309 	if (IS_ERR(ring)) {
310 		ret = PTR_ERR(ring);
311 		gvt_vgpu_err("fail to pin shadow context\n");
312 		goto err_shadow;
313 	}
314 
315 	ret = populate_shadow_context(workload);
316 	if (ret)
317 		goto err_unpin;
318 	workload->shadowed = true;
319 	return 0;
320 
321 err_unpin:
322 	engine->context_unpin(engine, shadow_ctx);
323 err_shadow:
324 	release_shadow_wa_ctx(&workload->wa_ctx);
325 err_scan:
326 	return ret;
327 }
328 
329 static int intel_gvt_generate_request(struct intel_vgpu_workload *workload)
330 {
331 	int ring_id = workload->ring_id;
332 	struct drm_i915_private *dev_priv = workload->vgpu->gvt->dev_priv;
333 	struct intel_engine_cs *engine = dev_priv->engine[ring_id];
334 	struct drm_i915_gem_request *rq;
335 	struct intel_vgpu *vgpu = workload->vgpu;
336 	struct intel_vgpu_submission *s = &vgpu->submission;
337 	struct i915_gem_context *shadow_ctx = s->shadow_ctx;
338 	int ret;
339 
340 	rq = i915_gem_request_alloc(dev_priv->engine[ring_id], shadow_ctx);
341 	if (IS_ERR(rq)) {
342 		gvt_vgpu_err("fail to allocate gem request\n");
343 		ret = PTR_ERR(rq);
344 		goto err_unpin;
345 	}
346 
347 	gvt_dbg_sched("ring id %d get i915 gem request %p\n", ring_id, rq);
348 
349 	workload->req = i915_gem_request_get(rq);
350 	ret = copy_workload_to_ring_buffer(workload);
351 	if (ret)
352 		goto err_unpin;
353 	return 0;
354 
355 err_unpin:
356 	engine->context_unpin(engine, shadow_ctx);
357 	release_shadow_wa_ctx(&workload->wa_ctx);
358 	return ret;
359 }
360 
361 static void release_shadow_batch_buffer(struct intel_vgpu_workload *workload);
362 
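/*
 * Pin each shadow batch buffer into the GGTT and patch the address
 * operand(s) recorded in bb_start_cmd_va (the batch-start command emitted
 * into the shadow ring) with the new GGTT offset, then flush CPU writes
 * and tie the object to the request.
 */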
363 static int prepare_shadow_batch_buffer(struct intel_vgpu_workload *workload)
364 {
365 	struct intel_gvt *gvt = workload->vgpu->gvt;
366 	const int gmadr_bytes = gvt->device_info.gmadr_bytes_in_cmd;
367 	struct intel_vgpu_shadow_bb *bb;
368 	int ret;
369 
370 	list_for_each_entry(bb, &workload->shadow_bb, list) {
371 		bb->vma = i915_gem_object_ggtt_pin(bb->obj, NULL, 0, 0, 0);
372 		if (IS_ERR(bb->vma)) {
373 			ret = PTR_ERR(bb->vma);
374 			goto err;
375 		}
376 
377 		/* relocate shadow batch buffer */
378 		bb->bb_start_cmd_va[1] = i915_ggtt_offset(bb->vma);
379 		if (gmadr_bytes == 8)
380 			bb->bb_start_cmd_va[2] = 0;
381 
382 		/* No one is going to touch shadow bb from now on. */
383 		if (bb->clflush & CLFLUSH_AFTER) {
384 			drm_clflush_virt_range(bb->va, bb->obj->base.size);
385 			bb->clflush &= ~CLFLUSH_AFTER;
386 		}
387 
388 		ret = i915_gem_object_set_to_gtt_domain(bb->obj, false);
389 		if (ret)
390 			goto err;
391 
392 		i915_gem_obj_finish_shmem_access(bb->obj);
393 		bb->accessing = false;
394 
395 		i915_vma_move_to_active(bb->vma, workload->req, 0);
396 	}
397 	return 0;
398 err:
399 	release_shadow_batch_buffer(workload);
400 	return ret;
401 }
402 
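/*
 * Write the shadow graphics memory addresses of the per-context and
 * indirect workaround batch buffers into bb_per_ctx_ptr and
 * rcs_indirect_ctx of the shadow ring context, keeping the non-address
 * bits of both registers intact.
 */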
403 static int update_wa_ctx_2_shadow_ctx(struct intel_shadow_wa_ctx *wa_ctx)
404 {
405 	struct intel_vgpu_workload *workload = container_of(wa_ctx,
406 					struct intel_vgpu_workload,
407 					wa_ctx);
408 	int ring_id = workload->ring_id;
409 	struct intel_vgpu_submission *s = &workload->vgpu->submission;
410 	struct i915_gem_context *shadow_ctx = s->shadow_ctx;
411 	struct drm_i915_gem_object *ctx_obj =
412 		shadow_ctx->engine[ring_id].state->obj;
413 	struct execlist_ring_context *shadow_ring_context;
414 	struct page *page;
415 
416 	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
417 	shadow_ring_context = kmap_atomic(page);
418 
419 	shadow_ring_context->bb_per_ctx_ptr.val =
420 		(shadow_ring_context->bb_per_ctx_ptr.val &
421 		(~PER_CTX_ADDR_MASK)) | wa_ctx->per_ctx.shadow_gma;
422 	shadow_ring_context->rcs_indirect_ctx.val =
423 		(shadow_ring_context->rcs_indirect_ctx.val &
424 		(~INDIRECT_CTX_ADDR_MASK)) | wa_ctx->indirect_ctx.shadow_gma;
425 
426 	kunmap_atomic(shadow_ring_context);
427 	return 0;
428 }
429 
430 static int prepare_shadow_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx)
431 {
432 	struct i915_vma *vma;
433 	unsigned char *per_ctx_va =
434 		(unsigned char *)wa_ctx->indirect_ctx.shadow_va +
435 		wa_ctx->indirect_ctx.size;
436 
437 	if (wa_ctx->indirect_ctx.size == 0)
438 		return 0;
439 
440 	vma = i915_gem_object_ggtt_pin(wa_ctx->indirect_ctx.obj, NULL,
441 				       0, CACHELINE_BYTES, 0);
442 	if (IS_ERR(vma))
443 		return PTR_ERR(vma);
444 
445 	/* FIXME: we are not tracking our pinned VMA, leaving it
446 	 * up to the core to fix up the stray pin_count upon
447 	 * free.
448 	 */
449 
450 	wa_ctx->indirect_ctx.shadow_gma = i915_ggtt_offset(vma);
451 
452 	wa_ctx->per_ctx.shadow_gma = *((unsigned int *)per_ctx_va + 1);
453 	memset(per_ctx_va, 0, CACHELINE_BYTES);
454 
455 	update_wa_ctx_2_shadow_ctx(wa_ctx);
456 	return 0;
457 }
458 
459 static void release_shadow_batch_buffer(struct intel_vgpu_workload *workload)
460 {
461 	struct intel_vgpu *vgpu = workload->vgpu;
462 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
463 	struct intel_vgpu_shadow_bb *bb, *pos;
464 
465 	if (list_empty(&workload->shadow_bb))
466 		return;
467 
468 	bb = list_first_entry(&workload->shadow_bb,
469 			struct intel_vgpu_shadow_bb, list);
470 
471 	mutex_lock(&dev_priv->drm.struct_mutex);
472 
473 	list_for_each_entry_safe(bb, pos, &workload->shadow_bb, list) {
474 		if (bb->obj) {
475 			if (bb->accessing)
476 				i915_gem_obj_finish_shmem_access(bb->obj);
477 
478 			if (bb->va && !IS_ERR(bb->va))
479 				i915_gem_object_unpin_map(bb->obj);
480 
481 			if (bb->vma && !IS_ERR(bb->vma)) {
482 				i915_vma_unpin(bb->vma);
483 				i915_vma_close(bb->vma);
484 			}
485 			__i915_gem_object_release_unless_active(bb->obj);
486 		}
487 		list_del(&bb->list);
488 		kfree(bb);
489 	}
490 
491 	mutex_unlock(&dev_priv->drm.struct_mutex);
492 }
493 
494 static int prepare_workload(struct intel_vgpu_workload *workload)
495 {
496 	struct intel_vgpu *vgpu = workload->vgpu;
497 	int ret = 0;
498 
499 	ret = intel_vgpu_pin_mm(workload->shadow_mm);
500 	if (ret) {
501 		gvt_vgpu_err("fail to vgpu pin mm\n");
502 		return ret;
503 	}
504 
505 	ret = intel_vgpu_sync_oos_pages(workload->vgpu);
506 	if (ret) {
507 		gvt_vgpu_err("fail to vgpu sync oos pages\n");
508 		goto err_unpin_mm;
509 	}
510 
511 	ret = intel_vgpu_flush_post_shadow(workload->vgpu);
512 	if (ret) {
513 		gvt_vgpu_err("fail to flush post shadow\n");
514 		goto err_unpin_mm;
515 	}
516 
517 	ret = intel_gvt_generate_request(workload);
518 	if (ret) {
519 		gvt_vgpu_err("fail to generate request\n");
520 		goto err_unpin_mm;
521 	}
522 
523 	ret = prepare_shadow_batch_buffer(workload);
524 	if (ret) {
525 		gvt_vgpu_err("fail to prepare_shadow_batch_buffer\n");
526 		goto err_unpin_mm;
527 	}
528 
529 	ret = prepare_shadow_wa_ctx(&workload->wa_ctx);
530 	if (ret) {
531 		gvt_vgpu_err("fail to prepare_shadow_wa_ctx\n");
532 		goto err_shadow_batch;
533 	}
534 
535 	if (workload->prepare) {
536 		ret = workload->prepare(workload);
537 		if (ret)
538 			goto err_shadow_wa_ctx;
539 	}
540 
541 	return 0;
542 err_shadow_wa_ctx:
543 	release_shadow_wa_ctx(&workload->wa_ctx);
544 err_shadow_batch:
545 	release_shadow_batch_buffer(workload);
546 err_unpin_mm:
547 	intel_vgpu_unpin_mm(workload->shadow_mm);
548 	return ret;
549 }
550 
551 static int dispatch_workload(struct intel_vgpu_workload *workload)
552 {
553 	struct intel_vgpu *vgpu = workload->vgpu;
554 	struct intel_vgpu_submission *s = &vgpu->submission;
555 	struct i915_gem_context *shadow_ctx = s->shadow_ctx;
556 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
557 	int ring_id = workload->ring_id;
558 	struct intel_engine_cs *engine = dev_priv->engine[ring_id];
559 	int ret = 0;
560 
561 	gvt_dbg_sched("ring id %d prepare to dispatch workload %p\n",
562 		ring_id, workload);
563 
564 	mutex_lock(&dev_priv->drm.struct_mutex);
565 
566 	ret = intel_gvt_scan_and_shadow_workload(workload);
567 	if (ret)
568 		goto out;
569 
570 	ret = prepare_workload(workload);
571 	if (ret) {
572 		engine->context_unpin(engine, shadow_ctx);
573 		goto out;
574 	}
575 
576 out:
577 	if (ret)
578 		workload->status = ret;
579 
580 	if (!IS_ERR_OR_NULL(workload->req)) {
581 		gvt_dbg_sched("ring id %d submit workload to i915 %p\n",
582 				ring_id, workload->req);
583 		i915_add_request(workload->req);
584 		workload->dispatched = true;
585 	}
586 
587 	mutex_unlock(&dev_priv->drm.struct_mutex);
588 	return ret;
589 }
590 
591 static struct intel_vgpu_workload *pick_next_workload(
592 		struct intel_gvt *gvt, int ring_id)
593 {
594 	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
595 	struct intel_vgpu_workload *workload = NULL;
596 
597 	mutex_lock(&gvt->lock);
598 
599 	/*
600 	 * no current vgpu / will be scheduled out / no workload
601 	 * bail out
602 	 */
603 	if (!scheduler->current_vgpu) {
604 		gvt_dbg_sched("ring id %d stop - no current vgpu\n", ring_id);
605 		goto out;
606 	}
607 
608 	if (scheduler->need_reschedule) {
609 		gvt_dbg_sched("ring id %d stop - will reschedule\n", ring_id);
610 		goto out;
611 	}
612 
613 	if (list_empty(workload_q_head(scheduler->current_vgpu, ring_id)))
614 		goto out;
615 
616 	/*
617 	 * still have a current workload; maybe the workload dispatcher
618 	 * failed to submit it for some reason, so resubmit it.
619 	 */
620 	if (scheduler->current_workload[ring_id]) {
621 		workload = scheduler->current_workload[ring_id];
622 		gvt_dbg_sched("ring id %d still have current workload %p\n",
623 				ring_id, workload);
624 		goto out;
625 	}
626 
627 	/*
628 	 * pick a workload as the current workload.
629 	 * once the current workload is set, the schedule policy routines
630 	 * will wait for the current workload to finish when trying to
631 	 * schedule out a vgpu.
632 	 */
633 	scheduler->current_workload[ring_id] = container_of(
634 			workload_q_head(scheduler->current_vgpu, ring_id)->next,
635 			struct intel_vgpu_workload, list);
636 
637 	workload = scheduler->current_workload[ring_id];
638 
639 	gvt_dbg_sched("ring id %d pick new workload %p\n", ring_id, workload);
640 
641 	atomic_inc(&workload->vgpu->submission.running_workload_num);
642 out:
643 	mutex_unlock(&gvt->lock);
644 	return workload;
645 }
646 
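/*
 * Mirror of populate_shadow_context(): once the workload has completed,
 * write the shadow context pages, the ring head and a few registers back
 * into the guest's context image.
 */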
647 static void update_guest_context(struct intel_vgpu_workload *workload)
648 {
649 	struct intel_vgpu *vgpu = workload->vgpu;
650 	struct intel_gvt *gvt = vgpu->gvt;
651 	struct intel_vgpu_submission *s = &vgpu->submission;
652 	struct i915_gem_context *shadow_ctx = s->shadow_ctx;
653 	int ring_id = workload->ring_id;
654 	struct drm_i915_gem_object *ctx_obj =
655 		shadow_ctx->engine[ring_id].state->obj;
656 	struct execlist_ring_context *shadow_ring_context;
657 	struct page *page;
658 	void *src;
659 	unsigned long context_gpa, context_page_num;
660 	int i;
661 
662 	gvt_dbg_sched("ring id %d workload lrca %x\n", ring_id,
663 			workload->ctx_desc.lrca);
664 
665 	context_page_num = gvt->dev_priv->engine[ring_id]->context_size;
666 
667 	context_page_num = context_page_num >> PAGE_SHIFT;
668 
669 	if (IS_BROADWELL(gvt->dev_priv) && ring_id == RCS)
670 		context_page_num = 19;
671 
672 	i = 2;
673 
674 	while (i < context_page_num) {
675 		context_gpa = intel_vgpu_gma_to_gpa(vgpu->gtt.ggtt_mm,
676 				(u32)((workload->ctx_desc.lrca + i) <<
677 					I915_GTT_PAGE_SHIFT));
678 		if (context_gpa == INTEL_GVT_INVALID_ADDR) {
679 			gvt_vgpu_err("invalid guest context descriptor\n");
680 			return;
681 		}
682 
683 		page = i915_gem_object_get_page(ctx_obj, LRC_HEADER_PAGES + i);
684 		src = kmap(page);
685 		intel_gvt_hypervisor_write_gpa(vgpu, context_gpa, src,
686 				I915_GTT_PAGE_SIZE);
687 		kunmap(page);
688 		i++;
689 	}
690 
691 	intel_gvt_hypervisor_write_gpa(vgpu, workload->ring_context_gpa +
692 		RING_CTX_OFF(ring_header.val), &workload->rb_tail, 4);
693 
694 	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
695 	shadow_ring_context = kmap(page);
696 
697 #define COPY_REG(name) \
698 	intel_gvt_hypervisor_write_gpa(vgpu, workload->ring_context_gpa + \
699 		RING_CTX_OFF(name.val), &shadow_ring_context->name.val, 4)
700 
701 	COPY_REG(ctx_ctrl);
702 	COPY_REG(ctx_timestamp);
703 
704 #undef COPY_REG
705 
706 	intel_gvt_hypervisor_write_gpa(vgpu,
707 			workload->ring_context_gpa +
708 			sizeof(*shadow_ring_context),
709 			(void *)shadow_ring_context +
710 			sizeof(*shadow_ring_context),
711 			I915_GTT_PAGE_SIZE - sizeof(*shadow_ring_context));
712 
713 	kunmap(page);
714 }
715 
716 static void clean_workloads(struct intel_vgpu *vgpu, unsigned long engine_mask)
717 {
718 	struct intel_vgpu_submission *s = &vgpu->submission;
719 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
720 	struct intel_engine_cs *engine;
721 	struct intel_vgpu_workload *pos, *n;
722 	unsigned int tmp;
723 
724 	/* free the unsubmitted workloads in the queues. */
725 	for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
726 		list_for_each_entry_safe(pos, n,
727 			&s->workload_q_head[engine->id], list) {
728 			list_del_init(&pos->list);
729 			intel_vgpu_destroy_workload(pos);
730 		}
731 		clear_bit(engine->id, s->shadow_ctx_desc_updated);
732 	}
733 }
734 
735 static void complete_current_workload(struct intel_gvt *gvt, int ring_id)
736 {
737 	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
738 	struct intel_vgpu_workload *workload =
739 		scheduler->current_workload[ring_id];
740 	struct intel_vgpu *vgpu = workload->vgpu;
741 	struct intel_vgpu_submission *s = &vgpu->submission;
742 	int event;
743 
744 	mutex_lock(&gvt->lock);
745 
746 	/* For a workload with a request, wait for the context switch to
747 	 * make sure the request is completed.
748 	 * For a workload without a request, complete it directly.
749 	 */
750 	if (workload->req) {
751 		struct drm_i915_private *dev_priv =
752 			workload->vgpu->gvt->dev_priv;
753 		struct intel_engine_cs *engine =
754 			dev_priv->engine[workload->ring_id];
755 		wait_event(workload->shadow_ctx_status_wq,
756 			   !atomic_read(&workload->shadow_ctx_active));
757 
758 		/* If this request caused a GPU hang, req->fence.error will
759 		 * be set to -EIO. Propagate -EIO to the workload status so
760 		 * that, when this request caused a GPU hang, no context
761 		 * switch interrupt is triggered to the guest.
762 		 */
763 		if (likely(workload->status == -EINPROGRESS)) {
764 			if (workload->req->fence.error == -EIO)
765 				workload->status = -EIO;
766 			else
767 				workload->status = 0;
768 		}
769 
770 		i915_gem_request_put(fetch_and_zero(&workload->req));
771 
772 		if (!workload->status && !(vgpu->resetting_eng &
773 					   ENGINE_MASK(ring_id))) {
774 			update_guest_context(workload);
775 
776 			for_each_set_bit(event, workload->pending_events,
777 					 INTEL_GVT_EVENT_MAX)
778 				intel_vgpu_trigger_virtual_event(vgpu, event);
779 		}
780 		mutex_lock(&dev_priv->drm.struct_mutex);
781 		/* unpin shadow ctx as the shadow_ctx update is done */
782 		engine->context_unpin(engine, s->shadow_ctx);
783 		mutex_unlock(&dev_priv->drm.struct_mutex);
784 	}
785 
786 	gvt_dbg_sched("ring id %d complete workload %p status %d\n",
787 			ring_id, workload, workload->status);
788 
789 	scheduler->current_workload[ring_id] = NULL;
790 
791 	list_del_init(&workload->list);
792 
793 	if (!workload->status) {
794 		release_shadow_batch_buffer(workload);
795 		release_shadow_wa_ctx(&workload->wa_ctx);
796 	}
797 
798 	if (workload->status || (vgpu->resetting_eng & ENGINE_MASK(ring_id))) {
799 		/* If workload->status is not successful, it means the HW GPU
800 		 * hit a hang or something went wrong with i915/GVT, and GVT
801 		 * won't inject a context switch interrupt to the guest.
802 		 * To the guest this error is therefore effectively a vGPU
803 		 * hang, so we should emulate a vGPU hang. If there are
804 		 * pending workloads which were already submitted from the
805 		 * guest, we should clean them up as the HW GPU would.
806 		 *
807 		 * If we are in the middle of an engine reset, the pending
808 		 * workloads won't be submitted to the HW GPU and will be
809 		 * cleaned up later during the reset process, so doing the
810 		 * workload cleanup here doesn't have any impact.
811 		 */
812 		clean_workloads(vgpu, ENGINE_MASK(ring_id));
813 	}
814 
815 	workload->complete(workload);
816 
817 	atomic_dec(&s->running_workload_num);
818 	wake_up(&scheduler->workload_complete_wq);
819 
820 	if (gvt->scheduler.need_reschedule)
821 		intel_gvt_request_service(gvt, INTEL_GVT_REQUEST_EVENT_SCHED);
822 
823 	mutex_unlock(&gvt->lock);
824 }
825 
826 struct workload_thread_param {
827 	struct intel_gvt *gvt;
828 	int ring_id;
829 };
830 
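/*
 * Per-ring worker: sleeps until a workload is picked for this ring,
 * dispatches it to i915, waits for the request to complete and then
 * finishes the workload. One thread per engine is created by
 * intel_gvt_init_workload_scheduler() below.
 */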
831 static int workload_thread(void *priv)
832 {
833 	struct workload_thread_param *p = (struct workload_thread_param *)priv;
834 	struct intel_gvt *gvt = p->gvt;
835 	int ring_id = p->ring_id;
836 	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
837 	struct intel_vgpu_workload *workload = NULL;
838 	struct intel_vgpu *vgpu = NULL;
839 	int ret;
840 	bool need_force_wake = IS_SKYLAKE(gvt->dev_priv)
841 			|| IS_KABYLAKE(gvt->dev_priv);
842 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
843 
844 	kfree(p);
845 
846 	gvt_dbg_core("workload thread for ring %d started\n", ring_id);
847 
848 	while (!kthread_should_stop()) {
849 		add_wait_queue(&scheduler->waitq[ring_id], &wait);
850 		do {
851 			workload = pick_next_workload(gvt, ring_id);
852 			if (workload)
853 				break;
854 			wait_woken(&wait, TASK_INTERRUPTIBLE,
855 				   MAX_SCHEDULE_TIMEOUT);
856 		} while (!kthread_should_stop());
857 		remove_wait_queue(&scheduler->waitq[ring_id], &wait);
858 
859 		if (!workload)
860 			break;
861 
862 		gvt_dbg_sched("ring id %d next workload %p vgpu %d\n",
863 				workload->ring_id, workload,
864 				workload->vgpu->id);
865 
866 		intel_runtime_pm_get(gvt->dev_priv);
867 
868 		gvt_dbg_sched("ring id %d will dispatch workload %p\n",
869 				workload->ring_id, workload);
870 
871 		if (need_force_wake)
872 			intel_uncore_forcewake_get(gvt->dev_priv,
873 					FORCEWAKE_ALL);
874 
875 		mutex_lock(&gvt->lock);
876 		ret = dispatch_workload(workload);
877 		mutex_unlock(&gvt->lock);
878 
879 		if (ret) {
880 			vgpu = workload->vgpu;
881 			gvt_vgpu_err("fail to dispatch workload, skip\n");
882 			goto complete;
883 		}
884 
885 		gvt_dbg_sched("ring id %d wait workload %p\n",
886 				workload->ring_id, workload);
887 		i915_wait_request(workload->req, 0, MAX_SCHEDULE_TIMEOUT);
888 
889 complete:
890 		gvt_dbg_sched("will complete workload %p, status: %d\n",
891 				workload, workload->status);
892 
893 		complete_current_workload(gvt, ring_id);
894 
895 		if (need_force_wake)
896 			intel_uncore_forcewake_put(gvt->dev_priv,
897 					FORCEWAKE_ALL);
898 
899 		intel_runtime_pm_put(gvt->dev_priv);
900 		if (ret && (vgpu_is_vm_unhealthy(ret)))
901 			enter_failsafe_mode(vgpu, GVT_FAILSAFE_GUEST_ERR);
902 	}
903 	return 0;
904 }
905 
906 void intel_gvt_wait_vgpu_idle(struct intel_vgpu *vgpu)
907 {
908 	struct intel_vgpu_submission *s = &vgpu->submission;
909 	struct intel_gvt *gvt = vgpu->gvt;
910 	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
911 
912 	if (atomic_read(&s->running_workload_num)) {
913 		gvt_dbg_sched("wait vgpu idle\n");
914 
915 		wait_event(scheduler->workload_complete_wq,
916 				!atomic_read(&s->running_workload_num));
917 	}
918 }
919 
920 void intel_gvt_clean_workload_scheduler(struct intel_gvt *gvt)
921 {
922 	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
923 	struct intel_engine_cs *engine;
924 	enum intel_engine_id i;
925 
926 	gvt_dbg_core("clean workload scheduler\n");
927 
928 	for_each_engine(engine, gvt->dev_priv, i) {
929 		atomic_notifier_chain_unregister(
930 					&engine->context_status_notifier,
931 					&gvt->shadow_ctx_notifier_block[i]);
932 		kthread_stop(scheduler->thread[i]);
933 	}
934 }
935 
936 int intel_gvt_init_workload_scheduler(struct intel_gvt *gvt)
937 {
938 	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
939 	struct workload_thread_param *param = NULL;
940 	struct intel_engine_cs *engine;
941 	enum intel_engine_id i;
942 	int ret;
943 
944 	gvt_dbg_core("init workload scheduler\n");
945 
946 	init_waitqueue_head(&scheduler->workload_complete_wq);
947 
948 	for_each_engine(engine, gvt->dev_priv, i) {
949 		init_waitqueue_head(&scheduler->waitq[i]);
950 
951 		param = kzalloc(sizeof(*param), GFP_KERNEL);
952 		if (!param) {
953 			ret = -ENOMEM;
954 			goto err;
955 		}
956 
957 		param->gvt = gvt;
958 		param->ring_id = i;
959 
960 		scheduler->thread[i] = kthread_run(workload_thread, param,
961 			"gvt workload %d", i);
962 		if (IS_ERR(scheduler->thread[i])) {
963 			gvt_err("fail to create workload thread\n");
964 			ret = PTR_ERR(scheduler->thread[i]);
965 			goto err;
966 		}
967 
968 		gvt->shadow_ctx_notifier_block[i].notifier_call =
969 					shadow_context_status_change;
970 		atomic_notifier_chain_register(&engine->context_status_notifier,
971 					&gvt->shadow_ctx_notifier_block[i]);
972 	}
973 	return 0;
974 err:
975 	intel_gvt_clean_workload_scheduler(gvt);
976 	kfree(param);
977 	param = NULL;
978 	return ret;
979 }
980 
981 /**
982  * intel_vgpu_clean_submission - free submission-related resource for vGPU
983  * @vgpu: a vGPU
984  *
985  * This function is called when a vGPU is being destroyed.
986  *
987  */
988 void intel_vgpu_clean_submission(struct intel_vgpu *vgpu)
989 {
990 	struct intel_vgpu_submission *s = &vgpu->submission;
991 
992 	intel_vgpu_select_submission_ops(vgpu, 0);
993 	i915_gem_context_put(s->shadow_ctx);
994 	kmem_cache_destroy(s->workloads);
995 }
996 
997 
998 /**
999  * intel_vgpu_reset_submission - reset submission-related resource for vGPU
1000  * @vgpu: a vGPU
1001  * @engine_mask: engines expected to be reset
1002  *
1003  * This function is called when a vGPU is being reset.
1004  *
1005  */
1006 void intel_vgpu_reset_submission(struct intel_vgpu *vgpu,
1007 		unsigned long engine_mask)
1008 {
1009 	struct intel_vgpu_submission *s = &vgpu->submission;
1010 
1011 	if (!s->active)
1012 		return;
1013 
1014 	clean_workloads(vgpu, engine_mask);
1015 	s->ops->reset(vgpu, engine_mask);
1016 }
1017 
1018 /**
1019  * intel_vgpu_setup_submission - setup submission-related resource for vGPU
1020  * @vgpu: a vGPU
1021  *
1022  * This function is called when a vGPU is being created.
1023  *
1024  * Returns:
1025  * Zero on success, negative error code if failed.
1026  *
1027  */
1028 int intel_vgpu_setup_submission(struct intel_vgpu *vgpu)
1029 {
1030 	struct intel_vgpu_submission *s = &vgpu->submission;
1031 	enum intel_engine_id i;
1032 	struct intel_engine_cs *engine;
1033 	int ret;
1034 
1035 	s->shadow_ctx = i915_gem_context_create_gvt(
1036 			&vgpu->gvt->dev_priv->drm);
1037 	if (IS_ERR(s->shadow_ctx))
1038 		return PTR_ERR(s->shadow_ctx);
1039 
1040 	bitmap_zero(s->shadow_ctx_desc_updated, I915_NUM_ENGINES);
1041 
1042 	s->workloads = kmem_cache_create("gvt-g_vgpu_workload",
1043 			sizeof(struct intel_vgpu_workload), 0,
1044 			SLAB_HWCACHE_ALIGN,
1045 			NULL);
1046 
1047 	if (!s->workloads) {
1048 		ret = -ENOMEM;
1049 		goto out_shadow_ctx;
1050 	}
1051 
1052 	for_each_engine(engine, vgpu->gvt->dev_priv, i)
1053 		INIT_LIST_HEAD(&s->workload_q_head[i]);
1054 
1055 	atomic_set(&s->running_workload_num, 0);
1056 	bitmap_zero(s->tlb_handle_pending, I915_NUM_ENGINES);
1057 
1058 	return 0;
1059 
1060 out_shadow_ctx:
1061 	i915_gem_context_put(s->shadow_ctx);
1062 	return ret;
1063 }
1064 
1065 /**
1066  * intel_vgpu_select_submission_ops - select virtual submission interface
1067  * @vgpu: a vGPU
1068  * @interface: expected vGPU virtual submission interface
1069  *
1070  * This function is called when guest configures submission interface.
1071  *
1072  * Returns:
1073  * Zero on success, negative error code if failed.
1074  *
1075  */
1076 int intel_vgpu_select_submission_ops(struct intel_vgpu *vgpu,
1077 				     unsigned int interface)
1078 {
1079 	struct intel_vgpu_submission *s = &vgpu->submission;
1080 	const struct intel_vgpu_submission_ops *ops[] = {
1081 		[INTEL_VGPU_EXECLIST_SUBMISSION] =
1082 			&intel_vgpu_execlist_submission_ops,
1083 	};
1084 	int ret;
1085 
1086 	if (WARN_ON(interface >= ARRAY_SIZE(ops)))
1087 		return -EINVAL;
1088 
1089 	if (s->active) {
1090 		s->ops->clean(vgpu);
1091 		s->active = false;
1092 		gvt_dbg_core("vgpu%d: de-select ops [ %s ]\n",
1093 				vgpu->id, s->ops->name);
1094 	}
1095 
1096 	if (interface == 0) {
1097 		s->ops = NULL;
1098 		s->virtual_submission_interface = 0;
1099 		gvt_dbg_core("vgpu%d: no submission ops\n", vgpu->id);
1100 		return 0;
1101 	}
1102 
1103 	ret = ops[interface]->init(vgpu);
1104 	if (ret)
1105 		return ret;
1106 
1107 	s->ops = ops[interface];
1108 	s->virtual_submission_interface = interface;
1109 	s->active = true;
1110 
1111 	gvt_dbg_core("vgpu%d: activate ops [ %s ]\n",
1112 			vgpu->id, s->ops->name);
1113 
1114 	return 0;
1115 }
1116 
1117 /**
1118  * intel_vgpu_destroy_workload - destroy a vGPU workload
1119  * @workload: the workload to destroy
1120  *
1121  * This function is called to destroy a vGPU workload.
1122  *
1123  */
1124 void intel_vgpu_destroy_workload(struct intel_vgpu_workload *workload)
1125 {
1126 	struct intel_vgpu_submission *s = &workload->vgpu->submission;
1127 
1128 	if (workload->shadow_mm)
1129 		intel_gvt_mm_unreference(workload->shadow_mm);
1130 
1131 	kmem_cache_free(s->workloads, workload);
1132 }
1133 
1134 static struct intel_vgpu_workload *
1135 alloc_workload(struct intel_vgpu *vgpu)
1136 {
1137 	struct intel_vgpu_submission *s = &vgpu->submission;
1138 	struct intel_vgpu_workload *workload;
1139 
1140 	workload = kmem_cache_zalloc(s->workloads, GFP_KERNEL);
1141 	if (!workload)
1142 		return ERR_PTR(-ENOMEM);
1143 
1144 	INIT_LIST_HEAD(&workload->list);
1145 	INIT_LIST_HEAD(&workload->shadow_bb);
1146 
1147 	init_waitqueue_head(&workload->shadow_ctx_status_wq);
1148 	atomic_set(&workload->shadow_ctx_active, 0);
1149 
1150 	workload->status = -EINPROGRESS;
1151 	workload->shadowed = false;
1152 	workload->vgpu = vgpu;
1153 
1154 	return workload;
1155 }
1156 
1157 #define RING_CTX_OFF(x) \
1158 	offsetof(struct execlist_ring_context, x)
1159 
1160 static void read_guest_pdps(struct intel_vgpu *vgpu,
1161 		u64 ring_context_gpa, u32 pdp[8])
1162 {
1163 	u64 gpa;
1164 	int i;
1165 
1166 	gpa = ring_context_gpa + RING_CTX_OFF(pdp3_UDW.val);
1167 
1168 	for (i = 0; i < 8; i++)
1169 		intel_gvt_hypervisor_read_gpa(vgpu,
1170 				gpa + i * 8, &pdp[7 - i], 4);
1171 }
1172 
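/*
 * Translate the context descriptor's addressing mode into a page table
 * level (3 for legacy 32-bit, 4 for legacy 64-bit), read the guest PDPs
 * from the ring context and look up or create the matching shadow PPGTT
 * mm for the workload.
 */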
1173 static int prepare_mm(struct intel_vgpu_workload *workload)
1174 {
1175 	struct execlist_ctx_descriptor_format *desc = &workload->ctx_desc;
1176 	struct intel_vgpu_mm *mm;
1177 	struct intel_vgpu *vgpu = workload->vgpu;
1178 	int page_table_level;
1179 	u32 pdp[8];
1180 
1181 	if (desc->addressing_mode == 1) { /* legacy 32-bit */
1182 		page_table_level = 3;
1183 	} else if (desc->addressing_mode == 3) { /* legacy 64-bit */
1184 		page_table_level = 4;
1185 	} else {
1186 		gvt_vgpu_err("Advanced Context mode(SVM) is not supported!\n");
1187 		return -EINVAL;
1188 	}
1189 
1190 	read_guest_pdps(workload->vgpu, workload->ring_context_gpa, pdp);
1191 
1192 	mm = intel_vgpu_find_ppgtt_mm(workload->vgpu, page_table_level, pdp);
1193 	if (mm) {
1194 		intel_gvt_mm_reference(mm);
1195 	} else {
1196 
1197 		mm = intel_vgpu_create_mm(workload->vgpu, INTEL_GVT_MM_PPGTT,
1198 				pdp, page_table_level, 0);
1199 		if (IS_ERR(mm)) {
1200 			gvt_vgpu_err("fail to create mm object.\n");
1201 			return PTR_ERR(mm);
1202 		}
1203 	}
1204 	workload->shadow_mm = mm;
1205 	return 0;
1206 }
1207 
1208 #define same_context(a, b) (((a)->context_id == (b)->context_id) && \
1209 		((a)->lrca == (b)->lrca))
1210 
1211 #define get_last_workload(q) \
1212 	(list_empty(q) ? NULL : container_of(q->prev, \
1213 	struct intel_vgpu_workload, list))
1214 /**
1215  * intel_vgpu_create_workload - create a vGPU workload
1216  * @vgpu: a vGPU
1217  * @desc: a guest context descriptor
 * @ring_id: ring ID of the workload
1218  * @desc: a guest context descriptor
1219  * This function is called when creating a vGPU workload.
1220  *
1221  * Returns:
1222  * struct intel_vgpu_workload * on success, or an ERR_PTR-encoded
1223  * negative error code on failure.
1224  *
1225  */
1226 struct intel_vgpu_workload *
1227 intel_vgpu_create_workload(struct intel_vgpu *vgpu, int ring_id,
1228 			   struct execlist_ctx_descriptor_format *desc)
1229 {
1230 	struct intel_vgpu_submission *s = &vgpu->submission;
1231 	struct list_head *q = workload_q_head(vgpu, ring_id);
1232 	struct intel_vgpu_workload *last_workload = get_last_workload(q);
1233 	struct intel_vgpu_workload *workload = NULL;
1234 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
1235 	u64 ring_context_gpa;
1236 	u32 head, tail, start, ctl, ctx_ctl, per_ctx, indirect_ctx;
1237 	int ret;
1238 
1239 	ring_context_gpa = intel_vgpu_gma_to_gpa(vgpu->gtt.ggtt_mm,
1240 			(u32)((desc->lrca + 1) << I915_GTT_PAGE_SHIFT));
1241 	if (ring_context_gpa == INTEL_GVT_INVALID_ADDR) {
1242 		gvt_vgpu_err("invalid guest context LRCA: %x\n", desc->lrca);
1243 		return ERR_PTR(-EINVAL);
1244 	}
1245 
1246 	intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
1247 			RING_CTX_OFF(ring_header.val), &head, 4);
1248 
1249 	intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
1250 			RING_CTX_OFF(ring_tail.val), &tail, 4);
1251 
1252 	head &= RB_HEAD_OFF_MASK;
1253 	tail &= RB_TAIL_OFF_MASK;
1254 
1255 	if (last_workload && same_context(&last_workload->ctx_desc, desc)) {
1256 		gvt_dbg_el("ring id %d cur workload == last\n", ring_id);
1257 		gvt_dbg_el("ctx head %x real head %lx\n", head,
1258 				last_workload->rb_tail);
1259 		/*
1260 		 * cannot use guest context head pointer here,
1261 		 * as it might not be updated at this time
1262 		 */
1263 		head = last_workload->rb_tail;
1264 	}
1265 
1266 	gvt_dbg_el("ring id %d begin a new workload\n", ring_id);
1267 
1268 	/* record some ring buffer register values for scan and shadow */
1269 	intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
1270 			RING_CTX_OFF(rb_start.val), &start, 4);
1271 	intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
1272 			RING_CTX_OFF(rb_ctrl.val), &ctl, 4);
1273 	intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
1274 			RING_CTX_OFF(ctx_ctrl.val), &ctx_ctl, 4);
1275 
1276 	workload = alloc_workload(vgpu);
1277 	if (IS_ERR(workload))
1278 		return workload;
1279 
1280 	workload->ring_id = ring_id;
1281 	workload->ctx_desc = *desc;
1282 	workload->ring_context_gpa = ring_context_gpa;
1283 	workload->rb_head = head;
1284 	workload->rb_tail = tail;
1285 	workload->rb_start = start;
1286 	workload->rb_ctl = ctl;
1287 
1288 	if (ring_id == RCS) {
1289 		intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
1290 			RING_CTX_OFF(bb_per_ctx_ptr.val), &per_ctx, 4);
1291 		intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
1292 			RING_CTX_OFF(rcs_indirect_ctx.val), &indirect_ctx, 4);
1293 
1294 		workload->wa_ctx.indirect_ctx.guest_gma =
1295 			indirect_ctx & INDIRECT_CTX_ADDR_MASK;
1296 		workload->wa_ctx.indirect_ctx.size =
1297 			(indirect_ctx & INDIRECT_CTX_SIZE_MASK) *
1298 			CACHELINE_BYTES;
1299 		workload->wa_ctx.per_ctx.guest_gma =
1300 			per_ctx & PER_CTX_ADDR_MASK;
1301 		workload->wa_ctx.per_ctx.valid = per_ctx & 1;
1302 	}
1303 
1304 	gvt_dbg_el("workload %p ring id %d head %x tail %x start %x ctl %x\n",
1305 			workload, ring_id, head, tail, start, ctl);
1306 
1307 	ret = prepare_mm(workload);
1308 	if (ret) {
1309 		kmem_cache_free(s->workloads, workload);
1310 		return ERR_PTR(ret);
1311 	}
1312 
1313 	/* Only scan and shadow the first workload in the queue
1314 	 * as there is only one pre-allocated buf-obj for shadow.
1315 	 */
1316 	if (list_empty(workload_q_head(vgpu, ring_id))) {
1317 		intel_runtime_pm_get(dev_priv);
1318 		mutex_lock(&dev_priv->drm.struct_mutex);
1319 		ret = intel_gvt_scan_and_shadow_workload(workload);
1320 		mutex_unlock(&dev_priv->drm.struct_mutex);
1321 		intel_runtime_pm_put(dev_priv);
1322 	}
1323 
1324 	if (ret && (vgpu_is_vm_unhealthy(ret))) {
1325 		enter_failsafe_mode(vgpu, GVT_FAILSAFE_GUEST_ERR);
1326 		intel_vgpu_destroy_workload(workload);
1327 		return ERR_PTR(ret);
1328 	}
1329 
1330 	return workload;
1331 }
1332