xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_migrate.c (revision 19dc81b4017baffd6e919fd71cfc8dcbd5442e15)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2020 Intel Corporation
4  */
5 
6 #include "i915_drv.h"
7 #include "intel_context.h"
8 #include "intel_gpu_commands.h"
9 #include "intel_gt.h"
10 #include "intel_gtt.h"
11 #include "intel_migrate.h"
12 #include "intel_ring.h"
13 
14 struct insert_pte_data {
15 	u64 offset;
16 };
17 
/* Size of one migration window: roughly 1ms of preemption delay at 8GiB/s */
#define CHUNK_SZ SZ_8M
19 
20 static bool engine_supports_migration(struct intel_engine_cs *engine)
21 {
22 	if (!engine)
23 		return false;
24 
25 	/*
26 	 * We need the ability to prevent aribtration (MI_ARB_ON_OFF),
27 	 * the ability to write PTE using inline data (MI_STORE_DATA)
28 	 * and of course the ability to do the block transfer (blits).
29 	 */
30 	GEM_BUG_ON(engine->class != COPY_ENGINE_CLASS);
31 
32 	return true;
33 }
34 
35 static void xehpsdv_toggle_pdes(struct i915_address_space *vm,
36 				struct i915_page_table *pt,
37 				void *data)
38 {
39 	struct insert_pte_data *d = data;
40 
41 	/*
42 	 * Insert a dummy PTE into every PT that will map to LMEM to ensure
43 	 * we have a correctly setup PDE structure for later use.
44 	 */
45 	vm->insert_page(vm, 0, d->offset, I915_CACHE_NONE, PTE_LM);
46 	GEM_BUG_ON(!pt->is_compact);
47 	d->offset += SZ_2M;
48 }
49 
50 static void xehpsdv_insert_pte(struct i915_address_space *vm,
51 			       struct i915_page_table *pt,
52 			       void *data)
53 {
54 	struct insert_pte_data *d = data;
55 
56 	/*
57 	 * We are playing tricks here, since the actual pt, from the hw
58 	 * pov, is only 256bytes with 32 entries, or 4096bytes with 512
59 	 * entries, but we are still guaranteed that the physical
60 	 * alignment is 64K underneath for the pt, and we are careful
61 	 * not to access the space in the void.
62 	 */
63 	vm->insert_page(vm, px_dma(pt), d->offset, I915_CACHE_NONE, PTE_LM);
64 	d->offset += SZ_64K;
65 }
66 
67 static void insert_pte(struct i915_address_space *vm,
68 		       struct i915_page_table *pt,
69 		       void *data)
70 {
71 	struct insert_pte_data *d = data;
72 
73 	vm->insert_page(vm, px_dma(pt), d->offset, I915_CACHE_NONE,
74 			i915_gem_object_is_lmem(pt->base) ? PTE_LM : 0);
75 	d->offset += PAGE_SIZE;
76 }
77 
78 static struct i915_address_space *migrate_vm(struct intel_gt *gt)
79 {
80 	struct i915_vm_pt_stash stash = {};
81 	struct i915_ppgtt *vm;
82 	int err;
83 	int i;
84 
85 	/*
86 	 * We construct a very special VM for use by all migration contexts,
87 	 * it is kept pinned so that it can be used at any time. As we need
88 	 * to pre-allocate the page directories for the migration VM, this
89 	 * limits us to only using a small number of prepared vma.
90 	 *
91 	 * To be able to pipeline and reschedule migration operations while
92 	 * avoiding unnecessary contention on the vm itself, the PTE updates
93 	 * are inline with the blits. All the blits use the same fixed
94 	 * addresses, with the backing store redirection being updated on the
95 	 * fly. Only 2 implicit vma are used for all migration operations.
96 	 *
97 	 * We lay the ppGTT out as:
98 	 *
99 	 *	[0, CHUNK_SZ) -> first object
100 	 *	[CHUNK_SZ, 2 * CHUNK_SZ) -> second object
101 	 *	[2 * CHUNK_SZ, 2 * CHUNK_SZ + 2 * CHUNK_SZ >> 9] -> PTE
102 	 *
103 	 * By exposing the dma addresses of the page directories themselves
104 	 * within the ppGTT, we are then able to rewrite the PTE prior to use.
105 	 * But the PTE update and subsequent migration operation must be atomic,
106 	 * i.e. within the same non-preemptible window so that we do not switch
107 	 * to another migration context that overwrites the PTE.
108 	 *
109 	 * This changes quite a bit on platforms with HAS_64K_PAGES support,
110 	 * where we instead have three windows, each CHUNK_SIZE in size. The
111 	 * first is reserved for mapping system-memory, and that just uses the
112 	 * 512 entry layout using 4K GTT pages. The other two windows just map
113 	 * lmem pages and must use the new compact 32 entry layout using 64K GTT
114 	 * pages, which ensures we can address any lmem object that the user
115 	 * throws at us. We then also use the xehpsdv_toggle_pdes as a way of
116 	 * just toggling the PDE bit(GEN12_PDE_64K) for us, to enable the
117 	 * compact layout for each of these page-tables, that fall within the
118 	 * [CHUNK_SIZE, 3 * CHUNK_SIZE) range.
119 	 *
120 	 * We lay the ppGTT out as:
121 	 *
122 	 * [0, CHUNK_SZ) -> first window/object, maps smem
123 	 * [CHUNK_SZ, 2 * CHUNK_SZ) -> second window/object, maps lmem src
124 	 * [2 * CHUNK_SZ, 3 * CHUNK_SZ) -> third window/object, maps lmem dst
125 	 *
126 	 * For the PTE window it's also quite different, since each PTE must
127 	 * point to some 64K page, one for each PT(since it's in lmem), and yet
128 	 * each is only <= 4096bytes, but since the unused space within that PTE
129 	 * range is never touched, this should be fine.
130 	 *
131 	 * So basically each PT now needs 64K of virtual memory, instead of 4K,
132 	 * which looks like:
133 	 *
134 	 * [3 * CHUNK_SZ, 3 * CHUNK_SZ + ((3 * CHUNK_SZ / SZ_2M) * SZ_64K)] -> PTE
135 	 */
136 
137 	vm = i915_ppgtt_create(gt, I915_BO_ALLOC_PM_EARLY);
138 	if (IS_ERR(vm))
139 		return ERR_CAST(vm);
140 
141 	if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
142 		err = -ENODEV;
143 		goto err_vm;
144 	}
145 
146 	if (HAS_64K_PAGES(gt->i915))
147 		stash.pt_sz = I915_GTT_PAGE_SIZE_64K;
148 
149 	/*
150 	 * Each engine instance is assigned its own chunk in the VM, so
151 	 * that we can run multiple instances concurrently
152 	 */
153 	for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
154 		struct intel_engine_cs *engine;
155 		u64 base = (u64)i << 32;
156 		struct insert_pte_data d = {};
157 		struct i915_gem_ww_ctx ww;
158 		u64 sz;
159 
160 		engine = gt->engine_class[COPY_ENGINE_CLASS][i];
161 		if (!engine_supports_migration(engine))
162 			continue;
163 
164 		/*
165 		 * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
166 		 * 4x2 page directories for source/destination.
167 		 */
168 		if (HAS_64K_PAGES(gt->i915))
169 			sz = 3 * CHUNK_SZ;
170 		else
171 			sz = 2 * CHUNK_SZ;
172 		d.offset = base + sz;
173 
174 		/*
175 		 * We need another page directory setup so that we can write
176 		 * the 8x512 PTE in each chunk.
177 		 */
178 		if (HAS_64K_PAGES(gt->i915))
179 			sz += (sz / SZ_2M) * SZ_64K;
180 		else
181 			sz += (sz >> 12) * sizeof(u64);
182 
183 		err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
184 		if (err)
185 			goto err_vm;
186 
187 		for_i915_gem_ww(&ww, err, true) {
188 			err = i915_vm_lock_objects(&vm->vm, &ww);
189 			if (err)
190 				continue;
191 			err = i915_vm_map_pt_stash(&vm->vm, &stash);
192 			if (err)
193 				continue;
194 
195 			vm->vm.allocate_va_range(&vm->vm, &stash, base, sz);
196 		}
197 		i915_vm_free_pt_stash(&vm->vm, &stash);
198 		if (err)
199 			goto err_vm;
200 
201 		/* Now allow the GPU to rewrite the PTE via its own ppGTT */
202 		if (HAS_64K_PAGES(gt->i915)) {
203 			vm->vm.foreach(&vm->vm, base, d.offset - base,
204 				       xehpsdv_insert_pte, &d);
205 			d.offset = base + CHUNK_SZ;
206 			vm->vm.foreach(&vm->vm,
207 				       d.offset,
208 				       2 * CHUNK_SZ,
209 				       xehpsdv_toggle_pdes, &d);
210 		} else {
211 			vm->vm.foreach(&vm->vm, base, d.offset - base,
212 				       insert_pte, &d);
213 		}
214 	}
215 
216 	return &vm->vm;
217 
218 err_vm:
219 	i915_vm_put(&vm->vm);
220 	return ERR_PTR(err);
221 }
222 
223 static struct intel_engine_cs *first_copy_engine(struct intel_gt *gt)
224 {
225 	struct intel_engine_cs *engine;
226 	int i;
227 
228 	for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
229 		engine = gt->engine_class[COPY_ENGINE_CLASS][i];
230 		if (engine_supports_migration(engine))
231 			return engine;
232 	}
233 
234 	return NULL;
235 }
236 
237 static struct intel_context *pinned_context(struct intel_gt *gt)
238 {
239 	static struct lock_class_key key;
240 	struct intel_engine_cs *engine;
241 	struct i915_address_space *vm;
242 	struct intel_context *ce;
243 
244 	engine = first_copy_engine(gt);
245 	if (!engine)
246 		return ERR_PTR(-ENODEV);
247 
248 	vm = migrate_vm(gt);
249 	if (IS_ERR(vm))
250 		return ERR_CAST(vm);
251 
252 	ce = intel_engine_create_pinned_context(engine, vm, SZ_512K,
253 						I915_GEM_HWS_MIGRATE,
254 						&key, "migrate");
255 	i915_vm_put(vm);
256 	return ce;
257 }
258 
259 int intel_migrate_init(struct intel_migrate *m, struct intel_gt *gt)
260 {
261 	struct intel_context *ce;
262 
263 	memset(m, 0, sizeof(*m));
264 
265 	ce = pinned_context(gt);
266 	if (IS_ERR(ce))
267 		return PTR_ERR(ce);
268 
269 	m->context = ce;
270 	return 0;
271 }
272 
273 static int random_index(unsigned int max)
274 {
275 	return upper_32_bits(mul_u32_u32(get_random_u32(), max));
276 }
277 
278 static struct intel_context *__migrate_engines(struct intel_gt *gt)
279 {
280 	struct intel_engine_cs *engines[MAX_ENGINE_INSTANCE];
281 	struct intel_engine_cs *engine;
282 	unsigned int count, i;
283 
284 	count = 0;
285 	for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
286 		engine = gt->engine_class[COPY_ENGINE_CLASS][i];
287 		if (engine_supports_migration(engine))
288 			engines[count++] = engine;
289 	}
290 
291 	return intel_context_create(engines[random_index(count)]);
292 }
293 
294 struct intel_context *intel_migrate_create_context(struct intel_migrate *m)
295 {
296 	struct intel_context *ce;
297 
298 	/*
299 	 * We randomly distribute contexts across the engines upon constrction,
300 	 * as they all share the same pinned vm, and so in order to allow
301 	 * multiple blits to run in parallel, we must construct each blit
302 	 * to use a different range of the vm for its GTT. This has to be
303 	 * known at construction, so we can not use the late greedy load
304 	 * balancing of the virtual-engine.
305 	 */
306 	ce = __migrate_engines(m->context->engine->gt);
307 	if (IS_ERR(ce))
308 		return ce;
309 
310 	ce->ring = NULL;
311 	ce->ring_size = SZ_256K;
312 
313 	i915_vm_put(ce->vm);
314 	ce->vm = i915_vm_get(m->context->vm);
315 
316 	return ce;
317 }
318 
319 static inline struct sgt_dma sg_sgt(struct scatterlist *sg)
320 {
321 	dma_addr_t addr = sg_dma_address(sg);
322 
323 	return (struct sgt_dma){ sg, addr, addr + sg_dma_len(sg) };
324 }
325 
326 static int emit_no_arbitration(struct i915_request *rq)
327 {
328 	u32 *cs;
329 
330 	cs = intel_ring_begin(rq, 2);
331 	if (IS_ERR(cs))
332 		return PTR_ERR(cs);
333 
334 	/* Explicitly disable preemption for this request. */
335 	*cs++ = MI_ARB_ON_OFF;
336 	*cs++ = MI_NOOP;
337 	intel_ring_advance(rq, cs);
338 
339 	return 0;
340 }
341 
342 static int emit_pte(struct i915_request *rq,
343 		    struct sgt_dma *it,
344 		    enum i915_cache_level cache_level,
345 		    bool is_lmem,
346 		    u64 offset,
347 		    int length)
348 {
349 	bool has_64K_pages = HAS_64K_PAGES(rq->engine->i915);
350 	const u64 encode = rq->context->vm->pte_encode(0, cache_level,
351 						       is_lmem ? PTE_LM : 0);
352 	struct intel_ring *ring = rq->ring;
353 	int pkt, dword_length;
354 	u32 total = 0;
355 	u32 page_size;
356 	u32 *hdr, *cs;
357 
358 	GEM_BUG_ON(GRAPHICS_VER(rq->engine->i915) < 8);
359 
360 	page_size = I915_GTT_PAGE_SIZE;
361 	dword_length = 0x400;
362 
363 	/* Compute the page directory offset for the target address range */
364 	if (has_64K_pages) {
365 		GEM_BUG_ON(!IS_ALIGNED(offset, SZ_2M));
366 
367 		offset /= SZ_2M;
368 		offset *= SZ_64K;
369 		offset += 3 * CHUNK_SZ;
370 
371 		if (is_lmem) {
372 			page_size = I915_GTT_PAGE_SIZE_64K;
373 			dword_length = 0x40;
374 		}
375 	} else {
376 		offset >>= 12;
377 		offset *= sizeof(u64);
378 		offset += 2 * CHUNK_SZ;
379 	}
380 
381 	offset += (u64)rq->engine->instance << 32;
382 
383 	cs = intel_ring_begin(rq, 6);
384 	if (IS_ERR(cs))
385 		return PTR_ERR(cs);
386 
387 	/* Pack as many PTE updates as possible into a single MI command */
388 	pkt = min_t(int, dword_length, ring->space / sizeof(u32) + 5);
389 	pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5);
390 
391 	hdr = cs;
392 	*cs++ = MI_STORE_DATA_IMM | REG_BIT(21); /* as qword elements */
393 	*cs++ = lower_32_bits(offset);
394 	*cs++ = upper_32_bits(offset);
395 
396 	do {
397 		if (cs - hdr >= pkt) {
398 			int dword_rem;
399 
400 			*hdr += cs - hdr - 2;
401 			*cs++ = MI_NOOP;
402 
403 			ring->emit = (void *)cs - ring->vaddr;
404 			intel_ring_advance(rq, cs);
405 			intel_ring_update_space(ring);
406 
407 			cs = intel_ring_begin(rq, 6);
408 			if (IS_ERR(cs))
409 				return PTR_ERR(cs);
410 
411 			dword_rem = dword_length;
412 			if (has_64K_pages) {
413 				if (IS_ALIGNED(total, SZ_2M)) {
414 					offset = round_up(offset, SZ_64K);
415 				} else {
416 					dword_rem = SZ_2M - (total & (SZ_2M - 1));
417 					dword_rem /= page_size;
418 					dword_rem *= 2;
419 				}
420 			}
421 
422 			pkt = min_t(int, dword_rem, ring->space / sizeof(u32) + 5);
423 			pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5);
424 
425 			hdr = cs;
426 			*cs++ = MI_STORE_DATA_IMM | REG_BIT(21);
427 			*cs++ = lower_32_bits(offset);
428 			*cs++ = upper_32_bits(offset);
429 		}
430 
431 		GEM_BUG_ON(!IS_ALIGNED(it->dma, page_size));
432 
433 		*cs++ = lower_32_bits(encode | it->dma);
434 		*cs++ = upper_32_bits(encode | it->dma);
435 
436 		offset += 8;
437 		total += page_size;
438 
439 		it->dma += page_size;
440 		if (it->dma >= it->max) {
441 			it->sg = __sg_next(it->sg);
442 			if (!it->sg || sg_dma_len(it->sg) == 0)
443 				break;
444 
445 			it->dma = sg_dma_address(it->sg);
446 			it->max = it->dma + sg_dma_len(it->sg);
447 		}
448 	} while (total < length);
449 
450 	*hdr += cs - hdr - 2;
451 	*cs++ = MI_NOOP;
452 
453 	ring->emit = (void *)cs - ring->vaddr;
454 	intel_ring_advance(rq, cs);
455 	intel_ring_update_space(ring);
456 
457 	return total;
458 }
459 
460 static bool wa_1209644611_applies(int ver, u32 size)
461 {
462 	u32 height = size >> PAGE_SHIFT;
463 
464 	if (ver != 11)
465 		return false;
466 
467 	return height % 4 == 3 && height <= 8;
468 }
469 
470 static int emit_copy(struct i915_request *rq,
471 		     u32 dst_offset, u32 src_offset, int size)
472 {
473 	const int ver = GRAPHICS_VER(rq->engine->i915);
474 	u32 instance = rq->engine->instance;
475 	u32 *cs;
476 
477 	cs = intel_ring_begin(rq, ver >= 8 ? 10 : 6);
478 	if (IS_ERR(cs))
479 		return PTR_ERR(cs);
480 
481 	if (ver >= 9 && !wa_1209644611_applies(ver, size)) {
482 		*cs++ = GEN9_XY_FAST_COPY_BLT_CMD | (10 - 2);
483 		*cs++ = BLT_DEPTH_32 | PAGE_SIZE;
484 		*cs++ = 0;
485 		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
486 		*cs++ = dst_offset;
487 		*cs++ = instance;
488 		*cs++ = 0;
489 		*cs++ = PAGE_SIZE;
490 		*cs++ = src_offset;
491 		*cs++ = instance;
492 	} else if (ver >= 8) {
493 		*cs++ = XY_SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (10 - 2);
494 		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
495 		*cs++ = 0;
496 		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
497 		*cs++ = dst_offset;
498 		*cs++ = instance;
499 		*cs++ = 0;
500 		*cs++ = PAGE_SIZE;
501 		*cs++ = src_offset;
502 		*cs++ = instance;
503 	} else {
504 		GEM_BUG_ON(instance);
505 		*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
506 		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
507 		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE;
508 		*cs++ = dst_offset;
509 		*cs++ = PAGE_SIZE;
510 		*cs++ = src_offset;
511 	}
512 
513 	intel_ring_advance(rq, cs);
514 	return 0;
515 }
516 
517 int
518 intel_context_migrate_copy(struct intel_context *ce,
519 			   const struct i915_deps *deps,
520 			   struct scatterlist *src,
521 			   enum i915_cache_level src_cache_level,
522 			   bool src_is_lmem,
523 			   struct scatterlist *dst,
524 			   enum i915_cache_level dst_cache_level,
525 			   bool dst_is_lmem,
526 			   struct i915_request **out)
527 {
528 	struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst);
529 	struct i915_request *rq;
530 	int err;
531 
532 	GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
533 	*out = NULL;
534 
535 	GEM_BUG_ON(ce->ring->size < SZ_64K);
536 
537 	do {
538 		u32 src_offset, dst_offset;
539 		int len;
540 
541 		rq = i915_request_create(ce);
542 		if (IS_ERR(rq)) {
543 			err = PTR_ERR(rq);
544 			goto out_ce;
545 		}
546 
547 		if (deps) {
548 			err = i915_request_await_deps(rq, deps);
549 			if (err)
550 				goto out_rq;
551 
552 			if (rq->engine->emit_init_breadcrumb) {
553 				err = rq->engine->emit_init_breadcrumb(rq);
554 				if (err)
555 					goto out_rq;
556 			}
557 
558 			deps = NULL;
559 		}
560 
561 		/* The PTE updates + copy must not be interrupted. */
562 		err = emit_no_arbitration(rq);
563 		if (err)
564 			goto out_rq;
565 
566 		src_offset = 0;
567 		dst_offset = CHUNK_SZ;
568 		if (HAS_64K_PAGES(ce->engine->i915)) {
569 			GEM_BUG_ON(!src_is_lmem && !dst_is_lmem);
570 
571 			src_offset = 0;
572 			dst_offset = 0;
573 			if (src_is_lmem)
574 				src_offset = CHUNK_SZ;
575 			if (dst_is_lmem)
576 				dst_offset = 2 * CHUNK_SZ;
577 		}
578 
579 		len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem,
580 			       src_offset, CHUNK_SZ);
581 		if (len <= 0) {
582 			err = len;
583 			goto out_rq;
584 		}
585 
586 		err = emit_pte(rq, &it_dst, dst_cache_level, dst_is_lmem,
587 			       dst_offset, len);
588 		if (err < 0)
589 			goto out_rq;
590 		if (err < len) {
591 			err = -EINVAL;
592 			goto out_rq;
593 		}
594 
595 		err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
596 		if (err)
597 			goto out_rq;
598 
599 		err = emit_copy(rq, dst_offset, src_offset, len);
600 
601 		/* Arbitration is re-enabled between requests. */
602 out_rq:
603 		if (*out)
604 			i915_request_put(*out);
605 		*out = i915_request_get(rq);
606 		i915_request_add(rq);
607 		if (err || !it_src.sg || !sg_dma_len(it_src.sg))
608 			break;
609 
610 		cond_resched();
611 	} while (1);
612 
613 out_ce:
614 	return err;
615 }
616 
617 static int emit_clear(struct i915_request *rq, u64 offset, int size, u32 value)
618 {
619 	const int ver = GRAPHICS_VER(rq->engine->i915);
620 	u32 *cs;
621 
622 	GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);
623 
624 	offset += (u64)rq->engine->instance << 32;
625 
626 	cs = intel_ring_begin(rq, ver >= 8 ? 8 : 6);
627 	if (IS_ERR(cs))
628 		return PTR_ERR(cs);
629 
630 	if (ver >= 8) {
631 		*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (7 - 2);
632 		*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
633 		*cs++ = 0;
634 		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
635 		*cs++ = lower_32_bits(offset);
636 		*cs++ = upper_32_bits(offset);
637 		*cs++ = value;
638 		*cs++ = MI_NOOP;
639 	} else {
640 		GEM_BUG_ON(upper_32_bits(offset));
641 		*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
642 		*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
643 		*cs++ = 0;
644 		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
645 		*cs++ = lower_32_bits(offset);
646 		*cs++ = value;
647 	}
648 
649 	intel_ring_advance(rq, cs);
650 	return 0;
651 }
652 
653 int
654 intel_context_migrate_clear(struct intel_context *ce,
655 			    const struct i915_deps *deps,
656 			    struct scatterlist *sg,
657 			    enum i915_cache_level cache_level,
658 			    bool is_lmem,
659 			    u32 value,
660 			    struct i915_request **out)
661 {
662 	struct sgt_dma it = sg_sgt(sg);
663 	struct i915_request *rq;
664 	int err;
665 
666 	GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
667 	*out = NULL;
668 
669 	GEM_BUG_ON(ce->ring->size < SZ_64K);
670 
671 	do {
672 		u32 offset;
673 		int len;
674 
675 		rq = i915_request_create(ce);
676 		if (IS_ERR(rq)) {
677 			err = PTR_ERR(rq);
678 			goto out_ce;
679 		}
680 
681 		if (deps) {
682 			err = i915_request_await_deps(rq, deps);
683 			if (err)
684 				goto out_rq;
685 
686 			if (rq->engine->emit_init_breadcrumb) {
687 				err = rq->engine->emit_init_breadcrumb(rq);
688 				if (err)
689 					goto out_rq;
690 			}
691 
692 			deps = NULL;
693 		}
694 
695 		/* The PTE updates + clear must not be interrupted. */
696 		err = emit_no_arbitration(rq);
697 		if (err)
698 			goto out_rq;
699 
700 		offset = 0;
701 		if (HAS_64K_PAGES(ce->engine->i915) && is_lmem)
702 			offset = CHUNK_SZ;
703 
704 		len = emit_pte(rq, &it, cache_level, is_lmem, offset, CHUNK_SZ);
705 		if (len <= 0) {
706 			err = len;
707 			goto out_rq;
708 		}
709 
710 		err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
711 		if (err)
712 			goto out_rq;
713 
714 		err = emit_clear(rq, offset, len, value);
715 
716 		/* Arbitration is re-enabled between requests. */
717 out_rq:
718 		if (*out)
719 			i915_request_put(*out);
720 		*out = i915_request_get(rq);
721 		i915_request_add(rq);
722 		if (err || !it.sg || !sg_dma_len(it.sg))
723 			break;
724 
725 		cond_resched();
726 	} while (1);
727 
728 out_ce:
729 	return err;
730 }
731 
732 int intel_migrate_copy(struct intel_migrate *m,
733 		       struct i915_gem_ww_ctx *ww,
734 		       const struct i915_deps *deps,
735 		       struct scatterlist *src,
736 		       enum i915_cache_level src_cache_level,
737 		       bool src_is_lmem,
738 		       struct scatterlist *dst,
739 		       enum i915_cache_level dst_cache_level,
740 		       bool dst_is_lmem,
741 		       struct i915_request **out)
742 {
743 	struct intel_context *ce;
744 	int err;
745 
746 	*out = NULL;
747 	if (!m->context)
748 		return -ENODEV;
749 
750 	ce = intel_migrate_create_context(m);
751 	if (IS_ERR(ce))
752 		ce = intel_context_get(m->context);
753 	GEM_BUG_ON(IS_ERR(ce));
754 
755 	err = intel_context_pin_ww(ce, ww);
756 	if (err)
757 		goto out;
758 
759 	err = intel_context_migrate_copy(ce, deps,
760 					 src, src_cache_level, src_is_lmem,
761 					 dst, dst_cache_level, dst_is_lmem,
762 					 out);
763 
764 	intel_context_unpin(ce);
765 out:
766 	intel_context_put(ce);
767 	return err;
768 }
769 
770 int
771 intel_migrate_clear(struct intel_migrate *m,
772 		    struct i915_gem_ww_ctx *ww,
773 		    const struct i915_deps *deps,
774 		    struct scatterlist *sg,
775 		    enum i915_cache_level cache_level,
776 		    bool is_lmem,
777 		    u32 value,
778 		    struct i915_request **out)
779 {
780 	struct intel_context *ce;
781 	int err;
782 
783 	*out = NULL;
784 	if (!m->context)
785 		return -ENODEV;
786 
787 	ce = intel_migrate_create_context(m);
788 	if (IS_ERR(ce))
789 		ce = intel_context_get(m->context);
790 	GEM_BUG_ON(IS_ERR(ce));
791 
792 	err = intel_context_pin_ww(ce, ww);
793 	if (err)
794 		goto out;
795 
796 	err = intel_context_migrate_clear(ce, deps, sg, cache_level,
797 					  is_lmem, value, out);
798 
799 	intel_context_unpin(ce);
800 out:
801 	intel_context_put(ce);
802 	return err;
803 }
804 
805 void intel_migrate_fini(struct intel_migrate *m)
806 {
807 	struct intel_context *ce;
808 
809 	ce = fetch_and_zero(&m->context);
810 	if (!ce)
811 		return;
812 
813 	intel_engine_destroy_pinned_context(ce);
814 }
815 
816 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
817 #include "selftest_migrate.c"
818 #endif
819