xref: /openbmc/linux/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c (revision ea47eed33a3fe3d919e6e3cf4e4eb5507b817188)
1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/dma-fence-array.h>
29 #include <linux/interval_tree_generic.h>
30 #include <linux/idr.h>
31 #include <drm/drmP.h>
32 #include <drm/amdgpu_drm.h>
33 #include "amdgpu.h"
34 #include "amdgpu_trace.h"
35 #include "amdgpu_amdkfd.h"
36 
37 /*
38  * GPUVM
39  * GPUVM is similar to the legacy GART on older ASICs; however,
40  * rather than there being a single global GART table
41  * for the entire GPU, there are multiple VM page tables active
42  * at any given time.  The VM page tables can contain a mix of
43  * VRAM pages and system memory pages, and system memory pages
44  * can be mapped as snooped (cached system pages) or unsnooped
45  * (uncached system pages).
46  * Each VM has an ID associated with it and there is a page table
47  * associated with each VMID.  When executing a command buffer,
48  * the kernel tells the ring what VMID to use for that command
49  * buffer.  VMIDs are allocated dynamically as commands are submitted.
50  * The userspace drivers maintain their own address space and the kernel
51  * sets up their page tables accordingly when they submit their
52  * command buffers and a VMID is assigned.
53  * Cayman/Trinity support up to 8 active VMs at any given time;
54  * SI supports 16.
55  */
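/*
 * Editor's sketch, not part of the driver: a rough per-VM lifecycle as
 * implemented by the functions in this file, assuming the usual GEM path
 * (ordering is an approximation, not a normative contract):
 *
 *	amdgpu_vm_init()              create the root PD and per-VM lists
 *	amdgpu_vm_bo_add()/_bo_map()  attach a BO and record a VA mapping
 *	amdgpu_vm_alloc_pts()         allocate the PDs/PTs covering the range
 *	amdgpu_vm_bo_update()         write the PTEs (via SDMA or the CPU)
 *	amdgpu_vm_flush()             flush/switch the VMID when a job runs
 */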
56 
57 #define START(node) ((node)->start)
58 #define LAST(node) ((node)->last)
59 
60 INTERVAL_TREE_DEFINE(struct amdgpu_bo_va_mapping, rb, uint64_t, __subtree_last,
61 		     START, LAST, static, amdgpu_vm_it)
62 
63 #undef START
64 #undef LAST
65 
66 /* Local structure. Encapsulate some VM table update parameters to reduce
67  * the number of function parameters
68  */
69 struct amdgpu_pte_update_params {
70 	/* amdgpu device we do this update for */
71 	struct amdgpu_device *adev;
72 	/* optional amdgpu_vm we do this update for */
73 	struct amdgpu_vm *vm;
74 	/* address where to copy page table entries from */
75 	uint64_t src;
76 	/* indirect buffer to fill with commands */
77 	struct amdgpu_ib *ib;
78 	/* Function which actually does the update */
79 	void (*func)(struct amdgpu_pte_update_params *params,
80 		     struct amdgpu_bo *bo, uint64_t pe,
81 		     uint64_t addr, unsigned count, uint32_t incr,
82 		     uint64_t flags);
83 	/* The next two are used during VM update by CPU
84 	 *  DMA addresses to use for mapping
85 	 *  Kernel pointer of PD/PT BO that needs to be updated
86 	 */
87 	dma_addr_t *pages_addr;
88 	void *kptr;
89 };
90 
91 /* Helper to disable partial resident texture feature from a fence callback */
92 struct amdgpu_prt_cb {
93 	struct amdgpu_device *adev;
94 	struct dma_fence_cb cb;
95 };
96 
97 static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
98 				   struct amdgpu_vm *vm,
99 				   struct amdgpu_bo *bo)
100 {
101 	base->vm = vm;
102 	base->bo = bo;
103 	INIT_LIST_HEAD(&base->bo_list);
104 	INIT_LIST_HEAD(&base->vm_status);
105 
106 	if (!bo)
107 		return;
108 	list_add_tail(&base->bo_list, &bo->va);
109 
110 	if (bo->tbo.resv != vm->root.base.bo->tbo.resv)
111 		return;
112 
113 	if (bo->preferred_domains &
114 	    amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type))
115 		return;
116 
117 	/*
118 	 * We checked all the prerequisites, but it looks like this per-VM BO
119 	 * is currently evicted. Add the BO to the evicted list to make sure it
120 	 * is validated on next VM use to avoid a fault.
121 	 */
122 	list_move_tail(&base->vm_status, &vm->evicted);
123 }
124 
125 /**
126  * amdgpu_vm_level_shift - return the addr shift for each level
127  *
128  * @adev: amdgpu_device pointer
 * @level: VMPT level
129  *
130  * Returns the number of bits the pfn needs to be right shifted for a level.
131  */
132 static unsigned amdgpu_vm_level_shift(struct amdgpu_device *adev,
133 				      unsigned level)
134 {
135 	unsigned shift = 0xff;
136 
137 	switch (level) {
138 	case AMDGPU_VM_PDB2:
139 	case AMDGPU_VM_PDB1:
140 	case AMDGPU_VM_PDB0:
141 		shift = 9 * (AMDGPU_VM_PDB0 - level) +
142 			adev->vm_manager.block_size;
143 		break;
144 	case AMDGPU_VM_PTB:
145 		shift = 0;
146 		break;
147 	default:
148 		dev_err(adev->dev, "the level%d isn't supported.\n", level);
149 	}
150 
151 	return shift;
152 }
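/*
 * Editor's worked example for the shift above, assuming block_size = 9
 * (the real value comes from adev->vm_manager.block_size) and
 * pfn = va >> AMDGPU_GPU_PAGE_SHIFT:
 *
 *	PTB:  shift = 0                 pte index = pfn & 0x1ff
 *	PDB0: shift = 9 * 0 + 9 = 9     pde index = (pfn >> 9) & 0x1ff
 *	PDB1: shift = 9 * 1 + 9 = 18    pde index = (pfn >> 18) & 0x1ff
 *	PDB2: shift = 9 * 2 + 9 = 27    pde index = (pfn >> 27) & 0x1ff
 *
 * The 0x1ff mask reflects the 512 entries per intermediate level; the
 * root's entry count actually depends on max_pfn, see
 * amdgpu_vm_num_entries() below.
 */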
153 
154 /**
155  * amdgpu_vm_num_entries - return the number of entries in a PD/PT
156  *
157  * @adev: amdgpu_device pointer
 * @level: VMPT level
158  *
159  * Calculate the number of entries in a page directory or page table.
160  */
161 static unsigned amdgpu_vm_num_entries(struct amdgpu_device *adev,
162 				      unsigned level)
163 {
164 	unsigned shift = amdgpu_vm_level_shift(adev,
165 					       adev->vm_manager.root_level);
166 
167 	if (level == adev->vm_manager.root_level)
168 		/* For the root directory */
169 		return round_up(adev->vm_manager.max_pfn, 1 << shift) >> shift;
170 	else if (level != AMDGPU_VM_PTB)
171 		/* Everything in between */
172 		return 512;
173 	else
174 		/* For the page tables on the leaves */
175 		return AMDGPU_VM_PTE_COUNT(adev);
176 }
177 
178 /**
179  * amdgpu_vm_bo_size - returns the size of the BOs in bytes
180  *
181  * @adev: amdgpu_device pointer
 * @level: VMPT level
182  *
183  * Calculate the size of the BO for a page directory or page table in bytes.
184  */
185 static unsigned amdgpu_vm_bo_size(struct amdgpu_device *adev, unsigned level)
186 {
187 	return AMDGPU_GPU_PAGE_ALIGN(amdgpu_vm_num_entries(adev, level) * 8);
188 }
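/*
 * Editor's note: with 512 entries of 8 bytes each, an interior PD/PT BO is
 * AMDGPU_GPU_PAGE_ALIGN(512 * 8) = one 4KB GPU page; only the root can
 * differ, since its entry count is derived from max_pfn in
 * amdgpu_vm_num_entries() above.
 */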
189 
190 /**
191  * amdgpu_vm_get_pd_bo - add the VM PD to a validation list
192  *
193  * @vm: vm providing the BOs
194  * @validated: head of validation list
195  * @entry: entry to add
196  *
197  * Add the page directory to the list of BOs to
198  * validate for command submission.
199  */
200 void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
201 			 struct list_head *validated,
202 			 struct amdgpu_bo_list_entry *entry)
203 {
204 	entry->robj = vm->root.base.bo;
205 	entry->priority = 0;
206 	entry->tv.bo = &entry->robj->tbo;
207 	entry->tv.shared = true;
208 	entry->user_pages = NULL;
209 	list_add(&entry->tv.head, validated);
210 }
211 
212 /**
213  * amdgpu_vm_validate_pt_bos - validate the page table BOs
214  *
215  * @adev: amdgpu device pointer
216  * @vm: vm providing the BOs
217  * @validate: callback to do the validation
218  * @param: parameter for the validation callback
219  *
220  * Validate the page table BOs on command submission if neccessary.
221  */
222 int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
223 			      int (*validate)(void *p, struct amdgpu_bo *bo),
224 			      void *param)
225 {
226 	struct ttm_bo_global *glob = adev->mman.bdev.glob;
227 	struct amdgpu_vm_bo_base *bo_base, *tmp;
228 	int r = 0;
229 
230 	list_for_each_entry_safe(bo_base, tmp, &vm->evicted, vm_status) {
231 		struct amdgpu_bo *bo = bo_base->bo;
232 
233 		if (bo->parent) {
234 			r = validate(param, bo);
235 			if (r)
236 				break;
237 
238 			spin_lock(&glob->lru_lock);
239 			ttm_bo_move_to_lru_tail(&bo->tbo);
240 			if (bo->shadow)
241 				ttm_bo_move_to_lru_tail(&bo->shadow->tbo);
242 			spin_unlock(&glob->lru_lock);
243 		}
244 
245 		if (bo->tbo.type != ttm_bo_type_kernel) {
246 			spin_lock(&vm->moved_lock);
247 			list_move(&bo_base->vm_status, &vm->moved);
248 			spin_unlock(&vm->moved_lock);
249 		} else {
250 			list_move(&bo_base->vm_status, &vm->relocated);
251 		}
252 	}
253 
254 	spin_lock(&glob->lru_lock);
255 	list_for_each_entry(bo_base, &vm->idle, vm_status) {
256 		struct amdgpu_bo *bo = bo_base->bo;
257 
258 		if (!bo->parent)
259 			continue;
260 
261 		ttm_bo_move_to_lru_tail(&bo->tbo);
262 		if (bo->shadow)
263 			ttm_bo_move_to_lru_tail(&bo->shadow->tbo);
264 	}
265 	spin_unlock(&glob->lru_lock);
266 
267 	return r;
268 }
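/*
 * Editor's sketch of a caller, hedged: validate_cb and param are
 * hypothetical stand-ins, the real command-submission code lives outside
 * this file.
 *
 *	struct amdgpu_bo_list_entry pd;
 *	LIST_HEAD(validated);
 *	int r;
 *
 *	amdgpu_vm_get_pd_bo(vm, &validated, &pd);
 *	... reserve the BOs on the "validated" list ...
 *	r = amdgpu_vm_validate_pt_bos(adev, vm, validate_cb, param);
 *	if (!r && amdgpu_vm_ready(vm))
 *		... page tables may be updated and the job submitted ...
 */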
269 
270 /**
271  * amdgpu_vm_ready - check VM is ready for updates
272  *
273  * @vm: VM to check
274  *
275  * Check if all VM PDs/PTs are ready for updates
276  */
277 bool amdgpu_vm_ready(struct amdgpu_vm *vm)
278 {
279 	return list_empty(&vm->evicted);
280 }
281 
282 /**
283  * amdgpu_vm_clear_bo - initially clear the PDs/PTs
284  *
285  * @adev: amdgpu_device pointer
 * @vm: VM to clear BO from
286  * @bo: BO to clear
287  * @level: level this BO is at
 * @pte_support_ats: indicate ATS support from PTE
288  *
289  * Root PD needs to be reserved when calling this.
290  */
291 static int amdgpu_vm_clear_bo(struct amdgpu_device *adev,
292 			      struct amdgpu_vm *vm, struct amdgpu_bo *bo,
293 			      unsigned level, bool pte_support_ats)
294 {
295 	struct ttm_operation_ctx ctx = { true, false };
296 	struct dma_fence *fence = NULL;
297 	unsigned entries, ats_entries;
298 	struct amdgpu_ring *ring;
299 	struct amdgpu_job *job;
300 	uint64_t addr;
301 	int r;
302 
303 	addr = amdgpu_bo_gpu_offset(bo);
304 	entries = amdgpu_bo_size(bo) / 8;
305 
306 	if (pte_support_ats) {
307 		if (level == adev->vm_manager.root_level) {
308 			ats_entries = amdgpu_vm_level_shift(adev, level);
309 			ats_entries += AMDGPU_GPU_PAGE_SHIFT;
310 			ats_entries = AMDGPU_VA_HOLE_START >> ats_entries;
311 			ats_entries = min(ats_entries, entries);
312 			entries -= ats_entries;
313 		} else {
314 			ats_entries = entries;
315 			entries = 0;
316 		}
317 	} else {
318 		ats_entries = 0;
319 	}
320 
321 	ring = container_of(vm->entity.sched, struct amdgpu_ring, sched);
322 
323 	r = reservation_object_reserve_shared(bo->tbo.resv);
324 	if (r)
325 		return r;
326 
327 	r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
328 	if (r)
329 		goto error;
330 
331 	r = amdgpu_job_alloc_with_ib(adev, 64, &job);
332 	if (r)
333 		goto error;
334 
335 	if (ats_entries) {
336 		uint64_t ats_value;
337 
338 		ats_value = AMDGPU_PTE_DEFAULT_ATC;
339 		if (level != AMDGPU_VM_PTB)
340 			ats_value |= AMDGPU_PDE_PTE;
341 
342 		amdgpu_vm_set_pte_pde(adev, &job->ibs[0], addr, 0,
343 				      ats_entries, 0, ats_value);
344 		addr += ats_entries * 8;
345 	}
346 
347 	if (entries)
348 		amdgpu_vm_set_pte_pde(adev, &job->ibs[0], addr, 0,
349 				      entries, 0, 0);
350 
351 	amdgpu_ring_pad_ib(ring, &job->ibs[0]);
352 
353 	WARN_ON(job->ibs[0].length_dw > 64);
354 	r = amdgpu_sync_resv(adev, &job->sync, bo->tbo.resv,
355 			     AMDGPU_FENCE_OWNER_UNDEFINED, false);
356 	if (r)
357 		goto error_free;
358 
359 	r = amdgpu_job_submit(job, ring, &vm->entity,
360 			      AMDGPU_FENCE_OWNER_UNDEFINED, &fence);
361 	if (r)
362 		goto error_free;
363 
364 	amdgpu_bo_fence(bo, fence, true);
365 	dma_fence_put(fence);
366 
367 	if (bo->shadow)
368 		return amdgpu_vm_clear_bo(adev, vm, bo->shadow,
369 					  level, pte_support_ats);
370 
371 	return 0;
372 
373 error_free:
374 	amdgpu_job_free(job);
375 
376 error:
377 	return r;
378 }
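/*
 * Editor's worked example for the ATS split above, assuming a root at
 * PDB2 (shift 27), 4KB GPU pages and AMDGPU_VA_HOLE_START at bit 47:
 *
 *	ats_entries = 27 + 12 = 39
 *	ats_entries = (1ULL << 47) >> 39 = 256
 *
 * i.e. the lower 256 of the root PD's 512 entries cover the ATS aperture
 * and are initialized to AMDGPU_PTE_DEFAULT_ATC, the remaining entries
 * to 0.
 */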
379 
380 /**
381  * amdgpu_vm_alloc_levels - allocate the PD/PT levels
382  *
383  * @adev: amdgpu_device pointer
384  * @vm: requested vm
 * @parent: parent PT
385  * @saddr: start of the address range
386  * @eaddr: end of the address range
 * @level: level this PD/PT is at
 * @ats: indicate ATS support from PTE
387  *
388  * Make sure the page directories and page tables are allocated
389  */
390 static int amdgpu_vm_alloc_levels(struct amdgpu_device *adev,
391 				  struct amdgpu_vm *vm,
392 				  struct amdgpu_vm_pt *parent,
393 				  uint64_t saddr, uint64_t eaddr,
394 				  unsigned level, bool ats)
395 {
396 	unsigned shift = amdgpu_vm_level_shift(adev, level);
397 	unsigned pt_idx, from, to;
398 	u64 flags;
399 	int r;
400 
401 	if (!parent->entries) {
402 		unsigned num_entries = amdgpu_vm_num_entries(adev, level);
403 
404 		parent->entries = kvmalloc_array(num_entries,
405 						   sizeof(struct amdgpu_vm_pt),
406 						   GFP_KERNEL | __GFP_ZERO);
407 		if (!parent->entries)
408 			return -ENOMEM;
409 		memset(parent->entries, 0, sizeof(struct amdgpu_vm_pt));
410 	}
411 
412 	from = saddr >> shift;
413 	to = eaddr >> shift;
414 	if (from >= amdgpu_vm_num_entries(adev, level) ||
415 	    to >= amdgpu_vm_num_entries(adev, level))
416 		return -EINVAL;
417 
418 	++level;
419 	saddr = saddr & ((1 << shift) - 1);
420 	eaddr = eaddr & ((1 << shift) - 1);
421 
422 	flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
423 	if (vm->use_cpu_for_update)
424 		flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
425 	else
426 		flags |= (AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
427 				AMDGPU_GEM_CREATE_SHADOW);
428 
429 	/* walk over the address space and allocate the page tables */
430 	for (pt_idx = from; pt_idx <= to; ++pt_idx) {
431 		struct reservation_object *resv = vm->root.base.bo->tbo.resv;
432 		struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
433 		struct amdgpu_bo *pt;
434 
435 		if (!entry->base.bo) {
436 			struct amdgpu_bo_param bp;
437 
438 			memset(&bp, 0, sizeof(bp));
439 			bp.size = amdgpu_vm_bo_size(adev, level);
440 			bp.byte_align = AMDGPU_GPU_PAGE_SIZE;
441 			bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
442 			bp.flags = flags;
443 			bp.type = ttm_bo_type_kernel;
444 			bp.resv = resv;
445 			r = amdgpu_bo_create(adev, &bp, &pt);
446 			if (r)
447 				return r;
448 
449 			r = amdgpu_vm_clear_bo(adev, vm, pt, level, ats);
450 			if (r) {
451 				amdgpu_bo_unref(&pt->shadow);
452 				amdgpu_bo_unref(&pt);
453 				return r;
454 			}
455 
456 			if (vm->use_cpu_for_update) {
457 				r = amdgpu_bo_kmap(pt, NULL);
458 				if (r) {
459 					amdgpu_bo_unref(&pt->shadow);
460 					amdgpu_bo_unref(&pt);
461 					return r;
462 				}
463 			}
464 
465 			/* Keep a reference to the root directory to avoid
466 			 * freeing them up in the wrong order.
467 			 */
468 			pt->parent = amdgpu_bo_ref(parent->base.bo);
469 
470 			amdgpu_vm_bo_base_init(&entry->base, vm, pt);
471 			list_move(&entry->base.vm_status, &vm->relocated);
472 		}
473 
474 		if (level < AMDGPU_VM_PTB) {
475 			uint64_t sub_saddr = (pt_idx == from) ? saddr : 0;
476 			uint64_t sub_eaddr = (pt_idx == to) ? eaddr :
477 				((1 << shift) - 1);
478 			r = amdgpu_vm_alloc_levels(adev, vm, entry, sub_saddr,
479 						   sub_eaddr, level, ats);
480 			if (r)
481 				return r;
482 		}
483 	}
484 
485 	return 0;
486 }
487 
488 /**
489  * amdgpu_vm_alloc_pts - Allocate page tables.
490  *
491  * @adev: amdgpu_device pointer
492  * @vm: VM to allocate page tables for
493  * @saddr: Start address which needs to be allocated
494  * @size: Size from start address we need.
495  *
496  * Make sure the page tables are allocated.
497  */
498 int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
499 			struct amdgpu_vm *vm,
500 			uint64_t saddr, uint64_t size)
501 {
502 	uint64_t eaddr;
503 	bool ats = false;
504 
505 	/* validate the parameters */
506 	if (saddr & AMDGPU_GPU_PAGE_MASK || size & AMDGPU_GPU_PAGE_MASK)
507 		return -EINVAL;
508 
509 	eaddr = saddr + size - 1;
510 
511 	if (vm->pte_support_ats)
512 		ats = saddr < AMDGPU_VA_HOLE_START;
513 
514 	saddr /= AMDGPU_GPU_PAGE_SIZE;
515 	eaddr /= AMDGPU_GPU_PAGE_SIZE;
516 
517 	if (eaddr >= adev->vm_manager.max_pfn) {
518 		dev_err(adev->dev, "va above limit (0x%08llX >= 0x%08llX)\n",
519 			eaddr, adev->vm_manager.max_pfn);
520 		return -EINVAL;
521 	}
522 
523 	return amdgpu_vm_alloc_levels(adev, vm, &vm->root, saddr, eaddr,
524 				      adev->vm_manager.root_level, ats);
525 }
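/*
 * Editor's sketch of a caller, hedged (the actual callers live outside
 * this file); saddr and size are byte values and must be GPU page
 * aligned, matching the check above:
 *
 *	r = amdgpu_vm_alloc_pts(adev, vm, saddr, size);
 *	if (r)
 *		return r;
 *	... amdgpu_vm_bo_update() can now fill the PTEs for this range ...
 */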
526 
527 /**
528  * amdgpu_vm_check_compute_bug - check whether asic has compute vm bug
529  *
530  * @adev: amdgpu_device pointer
531  */
532 void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev)
533 {
534 	const struct amdgpu_ip_block *ip_block;
535 	bool has_compute_vm_bug;
536 	struct amdgpu_ring *ring;
537 	int i;
538 
539 	has_compute_vm_bug = false;
540 
541 	ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
542 	if (ip_block) {
543 		/* Compute has a VM bug for GFX version < 7.
544 		 * Compute has a VM bug for GFX 8 MEC firmware version < 673. */
545 		if (ip_block->version->major <= 7)
546 			has_compute_vm_bug = true;
547 		else if (ip_block->version->major == 8)
548 			if (adev->gfx.mec_fw_version < 673)
549 				has_compute_vm_bug = true;
550 	}
551 
552 	for (i = 0; i < adev->num_rings; i++) {
553 		ring = adev->rings[i];
554 		if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
555 			/* only compute rings */
556 			ring->has_compute_vm_bug = has_compute_vm_bug;
557 		else
558 			ring->has_compute_vm_bug = false;
559 	}
560 }
561 
562 bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
563 				  struct amdgpu_job *job)
564 {
565 	struct amdgpu_device *adev = ring->adev;
566 	unsigned vmhub = ring->funcs->vmhub;
567 	struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
568 	struct amdgpu_vmid *id;
569 	bool gds_switch_needed;
570 	bool vm_flush_needed = job->vm_needs_flush || ring->has_compute_vm_bug;
571 
572 	if (job->vmid == 0)
573 		return false;
574 	id = &id_mgr->ids[job->vmid];
575 	gds_switch_needed = ring->funcs->emit_gds_switch && (
576 		id->gds_base != job->gds_base ||
577 		id->gds_size != job->gds_size ||
578 		id->gws_base != job->gws_base ||
579 		id->gws_size != job->gws_size ||
580 		id->oa_base != job->oa_base ||
581 		id->oa_size != job->oa_size);
582 
583 	if (amdgpu_vmid_had_gpu_reset(adev, id))
584 		return true;
585 
586 	return vm_flush_needed || gds_switch_needed;
587 }
588 
589 static bool amdgpu_vm_is_large_bar(struct amdgpu_device *adev)
590 {
591 	return (adev->gmc.real_vram_size == adev->gmc.visible_vram_size);
592 }
593 
594 /**
595  * amdgpu_vm_flush - hardware flush the vm
596  *
597  * @ring: ring to use for flush
598  * @job: related job with the VM information
599  * @need_pipe_sync: is pipe sync needed
600  *
601  * Emit a VM flush when it is necessary.
602  */
603 int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_pipe_sync)
604 {
605 	struct amdgpu_device *adev = ring->adev;
606 	unsigned vmhub = ring->funcs->vmhub;
607 	struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
608 	struct amdgpu_vmid *id = &id_mgr->ids[job->vmid];
609 	bool gds_switch_needed = ring->funcs->emit_gds_switch && (
610 		id->gds_base != job->gds_base ||
611 		id->gds_size != job->gds_size ||
612 		id->gws_base != job->gws_base ||
613 		id->gws_size != job->gws_size ||
614 		id->oa_base != job->oa_base ||
615 		id->oa_size != job->oa_size);
616 	bool vm_flush_needed = job->vm_needs_flush;
617 	bool pasid_mapping_needed = id->pasid != job->pasid ||
618 		!id->pasid_mapping ||
619 		!dma_fence_is_signaled(id->pasid_mapping);
620 	struct dma_fence *fence = NULL;
621 	unsigned patch_offset = 0;
622 	int r;
623 
624 	if (amdgpu_vmid_had_gpu_reset(adev, id)) {
625 		gds_switch_needed = true;
626 		vm_flush_needed = true;
627 		pasid_mapping_needed = true;
628 	}
629 
630 	gds_switch_needed &= !!ring->funcs->emit_gds_switch;
631 	vm_flush_needed &= !!ring->funcs->emit_vm_flush;
632 	pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
633 		ring->funcs->emit_wreg;
634 
635 	if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
636 		return 0;
637 
638 	if (ring->funcs->init_cond_exec)
639 		patch_offset = amdgpu_ring_init_cond_exec(ring);
640 
641 	if (need_pipe_sync)
642 		amdgpu_ring_emit_pipeline_sync(ring);
643 
644 	if (vm_flush_needed) {
645 		trace_amdgpu_vm_flush(ring, job->vmid, job->vm_pd_addr);
646 		amdgpu_ring_emit_vm_flush(ring, job->vmid, job->vm_pd_addr);
647 	}
648 
649 	if (pasid_mapping_needed)
650 		amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
651 
652 	if (vm_flush_needed || pasid_mapping_needed) {
653 		r = amdgpu_fence_emit(ring, &fence, 0);
654 		if (r)
655 			return r;
656 	}
657 
658 	if (vm_flush_needed) {
659 		mutex_lock(&id_mgr->lock);
660 		dma_fence_put(id->last_flush);
661 		id->last_flush = dma_fence_get(fence);
662 		id->current_gpu_reset_count =
663 			atomic_read(&adev->gpu_reset_counter);
664 		mutex_unlock(&id_mgr->lock);
665 	}
666 
667 	if (pasid_mapping_needed) {
668 		id->pasid = job->pasid;
669 		dma_fence_put(id->pasid_mapping);
670 		id->pasid_mapping = dma_fence_get(fence);
671 	}
672 	dma_fence_put(fence);
673 
674 	if (ring->funcs->emit_gds_switch && gds_switch_needed) {
675 		id->gds_base = job->gds_base;
676 		id->gds_size = job->gds_size;
677 		id->gws_base = job->gws_base;
678 		id->gws_size = job->gws_size;
679 		id->oa_base = job->oa_base;
680 		id->oa_size = job->oa_size;
681 		amdgpu_ring_emit_gds_switch(ring, job->vmid, job->gds_base,
682 					    job->gds_size, job->gws_base,
683 					    job->gws_size, job->oa_base,
684 					    job->oa_size);
685 	}
686 
687 	if (ring->funcs->patch_cond_exec)
688 		amdgpu_ring_patch_cond_exec(ring, patch_offset);
689 
690 	/* the double SWITCH_BUFFER here *cannot* be skipped by COND_EXEC */
691 	if (ring->funcs->emit_switch_buffer) {
692 		amdgpu_ring_emit_switch_buffer(ring);
693 		amdgpu_ring_emit_switch_buffer(ring);
694 	}
695 	return 0;
696 }
697 
698 /**
699  * amdgpu_vm_bo_find - find the bo_va for a specific vm & bo
700  *
701  * @vm: requested vm
702  * @bo: requested buffer object
703  *
704  * Find @bo inside the requested vm.
705  * Search inside the @bo's VM list for the requested vm.
706  * Returns the found bo_va or NULL if none is found.
707  *
708  * Object has to be reserved!
709  */
710 struct amdgpu_bo_va *amdgpu_vm_bo_find(struct amdgpu_vm *vm,
711 				       struct amdgpu_bo *bo)
712 {
713 	struct amdgpu_bo_va *bo_va;
714 
715 	list_for_each_entry(bo_va, &bo->va, base.bo_list) {
716 		if (bo_va->base.vm == vm) {
717 			return bo_va;
718 		}
719 	}
720 	return NULL;
721 }
722 
723 /**
724  * amdgpu_vm_do_set_ptes - helper to call the right asic function
725  *
726  * @params: see amdgpu_pte_update_params definition
727  * @bo: PD/PT to update
728  * @pe: addr of the page entry
729  * @addr: dst addr to write into pe
730  * @count: number of page entries to update
731  * @incr: increase next addr by incr bytes
732  * @flags: hw access flags
733  *
734  * Traces the parameters and calls the right asic functions
735  * to setup the page table using the DMA.
736  */
737 static void amdgpu_vm_do_set_ptes(struct amdgpu_pte_update_params *params,
738 				  struct amdgpu_bo *bo,
739 				  uint64_t pe, uint64_t addr,
740 				  unsigned count, uint32_t incr,
741 				  uint64_t flags)
742 {
743 	pe += amdgpu_bo_gpu_offset(bo);
744 	trace_amdgpu_vm_set_ptes(pe, addr, count, incr, flags);
745 
746 	if (count < 3) {
747 		amdgpu_vm_write_pte(params->adev, params->ib, pe,
748 				    addr | flags, count, incr);
749 
750 	} else {
751 		amdgpu_vm_set_pte_pde(params->adev, params->ib, pe, addr,
752 				      count, incr, flags);
753 	}
754 }
755 
756 /**
757  * amdgpu_vm_do_copy_ptes - copy the PTEs from the GART
758  *
759  * @params: see amdgpu_pte_update_params definition
760  * @bo: PD/PT to update
761  * @pe: addr of the page entry
762  * @addr: dst addr to write into pe
763  * @count: number of page entries to update
764  * @incr: increase next addr by incr bytes
765  * @flags: hw access flags
766  *
767  * Traces the parameters and calls the DMA function to copy the PTEs.
768  */
769 static void amdgpu_vm_do_copy_ptes(struct amdgpu_pte_update_params *params,
770 				   struct amdgpu_bo *bo,
771 				   uint64_t pe, uint64_t addr,
772 				   unsigned count, uint32_t incr,
773 				   uint64_t flags)
774 {
775 	uint64_t src = (params->src + (addr >> 12) * 8);
776 
777 	pe += amdgpu_bo_gpu_offset(bo);
778 	trace_amdgpu_vm_copy_ptes(pe, src, count);
779 
780 	amdgpu_vm_copy_pte(params->adev, params->ib, pe, src, count);
781 }
782 
783 /**
784  * amdgpu_vm_map_gart - Resolve gart mapping of addr
785  *
786  * @pages_addr: optional DMA address to use for lookup
787  * @addr: the unmapped addr
788  *
789  * Look up the physical address of the page that the pte resolves
790  * to and return the pointer for the page table entry.
791  */
792 static uint64_t amdgpu_vm_map_gart(const dma_addr_t *pages_addr, uint64_t addr)
793 {
794 	uint64_t result;
795 
796 	/* page table offset */
797 	result = pages_addr[addr >> PAGE_SHIFT];
798 
799 	/* in case cpu page size != gpu page size */
800 	result |= addr & (~PAGE_MASK);
801 
802 	result &= 0xFFFFFFFFFFFFF000ULL;
803 
804 	return result;
805 }
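/*
 * Editor's worked example, assuming 64KB CPU pages (PAGE_SHIFT = 16) and
 * 4KB GPU pages, for addr = 0x25000:
 *
 *	result  = pages_addr[0x25000 >> 16]    DMA address of CPU page 2
 *	result |= 0x25000 & ~PAGE_MASK         add offset 0x5000 in that page
 *	result &= ~0xfffULL                    keep it GPU-page aligned
 *
 * so the PTE ends up pointing 0x5000 bytes into the DMA mapping of the
 * second CPU page.
 */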
806 
807 /**
808  * amdgpu_vm_cpu_set_ptes - helper to update page tables via CPU
809  *
810  * @params: see amdgpu_pte_update_params definition
811  * @bo: PD/PT to update
812  * @pe: kmap addr of the page entry
813  * @addr: dst addr to write into pe
814  * @count: number of page entries to update
815  * @incr: increase next addr by incr bytes
816  * @flags: hw access flags
817  *
818  * Write count number of PT/PD entries directly.
819  */
820 static void amdgpu_vm_cpu_set_ptes(struct amdgpu_pte_update_params *params,
821 				   struct amdgpu_bo *bo,
822 				   uint64_t pe, uint64_t addr,
823 				   unsigned count, uint32_t incr,
824 				   uint64_t flags)
825 {
826 	unsigned int i;
827 	uint64_t value;
828 
829 	pe += (unsigned long)amdgpu_bo_kptr(bo);
830 
831 	trace_amdgpu_vm_set_ptes(pe, addr, count, incr, flags);
832 
833 	for (i = 0; i < count; i++) {
834 		value = params->pages_addr ?
835 			amdgpu_vm_map_gart(params->pages_addr, addr) :
836 			addr;
837 		amdgpu_gmc_set_pte_pde(params->adev, (void *)(uintptr_t)pe,
838 				       i, value, flags);
839 		addr += incr;
840 	}
841 }
842 
843 static int amdgpu_vm_wait_pd(struct amdgpu_device *adev, struct amdgpu_vm *vm,
844 			     void *owner)
845 {
846 	struct amdgpu_sync sync;
847 	int r;
848 
849 	amdgpu_sync_create(&sync);
850 	amdgpu_sync_resv(adev, &sync, vm->root.base.bo->tbo.resv, owner, false);
851 	r = amdgpu_sync_wait(&sync, true);
852 	amdgpu_sync_free(&sync);
853 
854 	return r;
855 }
856 
857 /*
858  * amdgpu_vm_update_pde - update a single level in the hierarchy
859  *
860  * @params: parameters for the update
861  * @vm: requested vm
862  * @parent: parent directory
863  * @entry: entry to update
864  *
865  * Makes sure the requested entry in parent is up to date.
866  */
867 static void amdgpu_vm_update_pde(struct amdgpu_pte_update_params *params,
868 				 struct amdgpu_vm *vm,
869 				 struct amdgpu_vm_pt *parent,
870 				 struct amdgpu_vm_pt *entry)
871 {
872 	struct amdgpu_bo *bo = parent->base.bo, *pbo;
873 	uint64_t pde, pt, flags;
874 	unsigned level;
875 
876 	/* Don't update huge pages here */
877 	if (entry->huge)
878 		return;
879 
880 	for (level = 0, pbo = bo->parent; pbo; ++level)
881 		pbo = pbo->parent;
882 
883 	level += params->adev->vm_manager.root_level;
884 	pt = amdgpu_bo_gpu_offset(entry->base.bo);
885 	flags = AMDGPU_PTE_VALID;
886 	amdgpu_gmc_get_vm_pde(params->adev, level, &pt, &flags);
887 	pde = (entry - parent->entries) * 8;
888 	if (bo->shadow)
889 		params->func(params, bo->shadow, pde, pt, 1, 0, flags);
890 	params->func(params, bo, pde, pt, 1, 0, flags);
891 }
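/*
 * Editor's note: the PDE location is a byte offset into the parent PD BO,
 * since every PDE is 8 bytes wide.  E.g. for entry index 5:
 *
 *	pde = (entry - parent->entries) * 8 = 40
 *
 * params->func() then writes a single 64-bit entry (count = 1, incr = 0)
 * at that offset, and into the shadow BO as well when one exists.
 */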
892 
893 /*
894  * amdgpu_vm_invalidate_level - mark all PD levels as invalid
895  *
896  * @adev: amdgpu_device pointer
 * @vm: related vm
 * @parent: parent PD
 * @level: VMPT level
897  *
898  * Mark all PD levels as invalid after an error.
899  */
900 static void amdgpu_vm_invalidate_level(struct amdgpu_device *adev,
901 				       struct amdgpu_vm *vm,
902 				       struct amdgpu_vm_pt *parent,
903 				       unsigned level)
904 {
905 	unsigned pt_idx, num_entries;
906 
907 	/*
908 	 * Recurse into the subdirectories. This recursion is harmless because
909 	 * we only have a maximum of 5 layers.
910 	 */
911 	num_entries = amdgpu_vm_num_entries(adev, level);
912 	for (pt_idx = 0; pt_idx < num_entries; ++pt_idx) {
913 		struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
914 
915 		if (!entry->base.bo)
916 			continue;
917 
918 		if (!entry->base.moved)
919 			list_move(&entry->base.vm_status, &vm->relocated);
920 		amdgpu_vm_invalidate_level(adev, vm, entry, level + 1);
921 	}
922 }
923 
924 /*
925  * amdgpu_vm_update_directories - make sure that all directories are valid
926  *
927  * @adev: amdgpu_device pointer
928  * @vm: requested vm
929  *
930  * Makes sure all directories are up to date.
931  * Returns 0 for success, error for failure.
932  */
933 int amdgpu_vm_update_directories(struct amdgpu_device *adev,
934 				 struct amdgpu_vm *vm)
935 {
936 	struct amdgpu_pte_update_params params;
937 	struct amdgpu_job *job;
938 	unsigned ndw = 0;
939 	int r = 0;
940 
941 	if (list_empty(&vm->relocated))
942 		return 0;
943 
944 restart:
945 	memset(&params, 0, sizeof(params));
946 	params.adev = adev;
947 
948 	if (vm->use_cpu_for_update) {
949 		struct amdgpu_vm_bo_base *bo_base;
950 
951 		list_for_each_entry(bo_base, &vm->relocated, vm_status) {
952 			r = amdgpu_bo_kmap(bo_base->bo, NULL);
953 			if (unlikely(r))
954 				return r;
955 		}
956 
957 		r = amdgpu_vm_wait_pd(adev, vm, AMDGPU_FENCE_OWNER_VM);
958 		if (unlikely(r))
959 			return r;
960 
961 		params.func = amdgpu_vm_cpu_set_ptes;
962 	} else {
963 		ndw = 512 * 8;
964 		r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
965 		if (r)
966 			return r;
967 
968 		params.ib = &job->ibs[0];
969 		params.func = amdgpu_vm_do_set_ptes;
970 	}
971 
972 	while (!list_empty(&vm->relocated)) {
973 		struct amdgpu_vm_bo_base *bo_base, *parent;
974 		struct amdgpu_vm_pt *pt, *entry;
975 		struct amdgpu_bo *bo;
976 
977 		bo_base = list_first_entry(&vm->relocated,
978 					   struct amdgpu_vm_bo_base,
979 					   vm_status);
980 		bo_base->moved = false;
981 		list_move(&bo_base->vm_status, &vm->idle);
982 
983 		bo = bo_base->bo->parent;
984 		if (!bo)
985 			continue;
986 
987 		parent = list_first_entry(&bo->va, struct amdgpu_vm_bo_base,
988 					  bo_list);
989 		pt = container_of(parent, struct amdgpu_vm_pt, base);
990 		entry = container_of(bo_base, struct amdgpu_vm_pt, base);
991 
992 		amdgpu_vm_update_pde(&params, vm, pt, entry);
993 
994 		if (!vm->use_cpu_for_update &&
995 		    (ndw - params.ib->length_dw) < 32)
996 			break;
997 	}
998 
999 	if (vm->use_cpu_for_update) {
1000 		/* Flush HDP */
1001 		mb();
1002 		amdgpu_asic_flush_hdp(adev, NULL);
1003 	} else if (params.ib->length_dw == 0) {
1004 		amdgpu_job_free(job);
1005 	} else {
1006 		struct amdgpu_bo *root = vm->root.base.bo;
1007 		struct amdgpu_ring *ring;
1008 		struct dma_fence *fence;
1009 
1010 		ring = container_of(vm->entity.sched, struct amdgpu_ring,
1011 				    sched);
1012 
1013 		amdgpu_ring_pad_ib(ring, params.ib);
1014 		amdgpu_sync_resv(adev, &job->sync, root->tbo.resv,
1015 				 AMDGPU_FENCE_OWNER_VM, false);
1016 		WARN_ON(params.ib->length_dw > ndw);
1017 		r = amdgpu_job_submit(job, ring, &vm->entity,
1018 				      AMDGPU_FENCE_OWNER_VM, &fence);
1019 		if (r)
1020 			goto error;
1021 
1022 		amdgpu_bo_fence(root, fence, true);
1023 		dma_fence_put(vm->last_update);
1024 		vm->last_update = fence;
1025 	}
1026 
1027 	if (!list_empty(&vm->relocated))
1028 		goto restart;
1029 
1030 	return 0;
1031 
1032 error:
1033 	amdgpu_vm_invalidate_level(adev, vm, &vm->root,
1034 				   adev->vm_manager.root_level);
1035 	amdgpu_job_free(job);
1036 	return r;
1037 }
1038 
1039 /**
1040  * amdgpu_vm_get_entry - find the entry for an address
1041  *
1042  * @p: see amdgpu_pte_update_params definition
1043  * @addr: virtual address in question
1044  * @entry: resulting entry or NULL
1045  * @parent: parent entry
1046  *
1047  * Find the vm_pt entry and its parent for the given address.
1048  */
1049 void amdgpu_vm_get_entry(struct amdgpu_pte_update_params *p, uint64_t addr,
1050 			 struct amdgpu_vm_pt **entry,
1051 			 struct amdgpu_vm_pt **parent)
1052 {
1053 	unsigned level = p->adev->vm_manager.root_level;
1054 
1055 	*parent = NULL;
1056 	*entry = &p->vm->root;
1057 	while ((*entry)->entries) {
1058 		unsigned shift = amdgpu_vm_level_shift(p->adev, level++);
1059 
1060 		*parent = *entry;
1061 		*entry = &(*entry)->entries[addr >> shift];
1062 		addr &= (1ULL << shift) - 1;
1063 	}
1064 
1065 	if (level != AMDGPU_VM_PTB)
1066 		*entry = NULL;
1067 }
1068 
1069 /**
1070  * amdgpu_vm_handle_huge_pages - handle updating the PD with huge pages
1071  *
1072  * @p: see amdgpu_pte_update_params definition
1073  * @entry: vm_pt entry to check
1074  * @parent: parent entry
1075  * @nptes: number of PTEs updated with this operation
1076  * @dst: destination address where the PTEs should point to
1077  * @flags: access flags for the PTEs
1078  *
1079  * Check if we can update the PD with a huge page.
1080  */
1081 static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p,
1082 					struct amdgpu_vm_pt *entry,
1083 					struct amdgpu_vm_pt *parent,
1084 					unsigned nptes, uint64_t dst,
1085 					uint64_t flags)
1086 {
1087 	uint64_t pde;
1088 
1089 	/* In the case of a mixed PT the PDE must point to it */
1090 	if (p->adev->asic_type >= CHIP_VEGA10 && !p->src &&
1091 	    nptes == AMDGPU_VM_PTE_COUNT(p->adev)) {
1092 		/* Set the huge page flag to stop scanning at this PDE */
1093 		flags |= AMDGPU_PDE_PTE;
1094 	}
1095 
1096 	if (!(flags & AMDGPU_PDE_PTE)) {
1097 		if (entry->huge) {
1098 			/* Add the entry to the relocated list to update it. */
1099 			entry->huge = false;
1100 			list_move(&entry->base.vm_status, &p->vm->relocated);
1101 		}
1102 		return;
1103 	}
1104 
1105 	entry->huge = true;
1106 	amdgpu_gmc_get_vm_pde(p->adev, AMDGPU_VM_PDB0, &dst, &flags);
1107 
1108 	pde = (entry - parent->entries) * 8;
1109 	if (parent->base.bo->shadow)
1110 		p->func(p, parent->base.bo->shadow, pde, dst, 1, 0, flags);
1111 	p->func(p, parent->base.bo, pde, dst, 1, 0, flags);
1112 }
1113 
1114 /**
1115  * amdgpu_vm_update_ptes - make sure that page tables are valid
1116  *
1117  * @params: see amdgpu_pte_update_params definition
1119  * @start: start of GPU address range
1120  * @end: end of GPU address range
1121  * @dst: destination address to map to, the next dst inside the function
1122  * @flags: mapping flags
1123  *
1124  * Update the page tables in the range @start - @end.
1125  * Returns 0 for success, -EINVAL for failure.
1126  */
1127 static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
1128 				  uint64_t start, uint64_t end,
1129 				  uint64_t dst, uint64_t flags)
1130 {
1131 	struct amdgpu_device *adev = params->adev;
1132 	const uint64_t mask = AMDGPU_VM_PTE_COUNT(adev) - 1;
1133 
1134 	uint64_t addr, pe_start;
1135 	struct amdgpu_bo *pt;
1136 	unsigned nptes;
1137 
1138 	/* walk over the address space and update the page tables */
1139 	for (addr = start; addr < end; addr += nptes,
1140 	     dst += nptes * AMDGPU_GPU_PAGE_SIZE) {
1141 		struct amdgpu_vm_pt *entry, *parent;
1142 
1143 		amdgpu_vm_get_entry(params, addr, &entry, &parent);
1144 		if (!entry)
1145 			return -ENOENT;
1146 
1147 		if ((addr & ~mask) == (end & ~mask))
1148 			nptes = end - addr;
1149 		else
1150 			nptes = AMDGPU_VM_PTE_COUNT(adev) - (addr & mask);
1151 
1152 		amdgpu_vm_handle_huge_pages(params, entry, parent,
1153 					    nptes, dst, flags);
1154 		/* We don't need to update PTEs for huge pages */
1155 		if (entry->huge)
1156 			continue;
1157 
1158 		pt = entry->base.bo;
1159 		pe_start = (addr & mask) * 8;
1160 		if (pt->shadow)
1161 			params->func(params, pt->shadow, pe_start, dst, nptes,
1162 				     AMDGPU_GPU_PAGE_SIZE, flags);
1163 		params->func(params, pt, pe_start, dst, nptes,
1164 			     AMDGPU_GPU_PAGE_SIZE, flags);
1165 	}
1166 
1167 	return 0;
1168 }
1169 
1170 /*
1171  * amdgpu_vm_frag_ptes - add fragment information to PTEs
1172  *
1173  * @params: see amdgpu_pte_update_params definition
1175  * @start: first PTE to handle
1176  * @end: last PTE to handle
1177  * @dst: addr those PTEs should point to
1178  * @flags: hw mapping flags
1179  * Returns 0 for success, -EINVAL for failure.
1180  */
1181 static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params	*params,
1182 				uint64_t start, uint64_t end,
1183 				uint64_t dst, uint64_t flags)
1184 {
1185 	/**
1186 	 * The MC L1 TLB supports variable sized pages, based on a fragment
1187 	 * field in the PTE. When this field is set to a non-zero value, page
1188 	 * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
1189 	 * flags are considered valid for all PTEs within the fragment range
1190 	 * and corresponding mappings are assumed to be physically contiguous.
1191 	 *
1192 	 * The L1 TLB can store a single PTE for the whole fragment,
1193 	 * significantly increasing the space available for translation
1194 	 * caching. This leads to large improvements in throughput when the
1195 	 * TLB is under pressure.
1196 	 *
1197 	 * The L2 TLB distributes small and large fragments into two
1198 	 * asymmetric partitions. The large fragment cache is significantly
1199 	 * larger. Thus, we try to use large fragments wherever possible.
1200 	 * Userspace can support this by aligning virtual base address and
1201 	 * allocation size to the fragment size.
1202 	 */
1203 	unsigned max_frag = params->adev->vm_manager.fragment_size;
1204 	int r;
1205 
1206 	/* system pages are not continuous */
1207 	if (params->src || !(flags & AMDGPU_PTE_VALID))
1208 		return amdgpu_vm_update_ptes(params, start, end, dst, flags);
1209 
1210 	while (start != end) {
1211 		uint64_t frag_flags, frag_end;
1212 		unsigned frag;
1213 
1214 		/* This intentionally wraps around if no bit is set */
1215 		frag = min((unsigned)ffs(start) - 1,
1216 			   (unsigned)fls64(end - start) - 1);
1217 		if (frag >= max_frag) {
1218 			frag_flags = AMDGPU_PTE_FRAG(max_frag);
1219 			frag_end = end & ~((1ULL << max_frag) - 1);
1220 		} else {
1221 			frag_flags = AMDGPU_PTE_FRAG(frag);
1222 			frag_end = start + (1 << frag);
1223 		}
1224 
1225 		r = amdgpu_vm_update_ptes(params, start, frag_end, dst,
1226 					  flags | frag_flags);
1227 		if (r)
1228 			return r;
1229 
1230 		dst += (frag_end - start) * AMDGPU_GPU_PAGE_SIZE;
1231 		start = frag_end;
1232 	}
1233 
1234 	return 0;
1235 }
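/*
 * Editor's worked example for the fragment selection above, assuming
 * fragment_size (max_frag) = 9, i.e. 2MB fragments with 4KB pages, and a
 * range of page numbers start = 0x200, end = 0x600:
 *
 *	frag     = min(ffs(0x200) - 1, fls64(0x400) - 1) = min(9, 10) = 9
 *	frag >= max_frag, so frag_flags = AMDGPU_PTE_FRAG(9)
 *	frag_end = 0x600 & ~0x1ff = 0x600
 *
 * so the whole [0x200, 0x600) range is written in one pass with 2MB
 * fragment hints; unaligned head/tail ranges would get smaller fragments.
 */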
1236 
1237 /**
1238  * amdgpu_vm_bo_update_mapping - update a mapping in the vm page table
1239  *
1240  * @adev: amdgpu_device pointer
1241  * @exclusive: fence we need to sync to
1242  * @pages_addr: DMA addresses to use for mapping
1243  * @vm: requested vm
1244  * @start: start of mapped range
1245  * @last: last mapped entry
1246  * @flags: flags for the entries
1247  * @addr: addr to set the area to
1248  * @fence: optional resulting fence
1249  *
1250  * Fill in the page table entries between @start and @last.
1251  * Returns 0 for success, -EINVAL for failure.
1252  */
1253 static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
1254 				       struct dma_fence *exclusive,
1255 				       dma_addr_t *pages_addr,
1256 				       struct amdgpu_vm *vm,
1257 				       uint64_t start, uint64_t last,
1258 				       uint64_t flags, uint64_t addr,
1259 				       struct dma_fence **fence)
1260 {
1261 	struct amdgpu_ring *ring;
1262 	void *owner = AMDGPU_FENCE_OWNER_VM;
1263 	unsigned nptes, ncmds, ndw;
1264 	struct amdgpu_job *job;
1265 	struct amdgpu_pte_update_params params;
1266 	struct dma_fence *f = NULL;
1267 	int r;
1268 
1269 	memset(&params, 0, sizeof(params));
1270 	params.adev = adev;
1271 	params.vm = vm;
1272 
1273 	/* sync to everything on unmapping */
1274 	if (!(flags & AMDGPU_PTE_VALID))
1275 		owner = AMDGPU_FENCE_OWNER_UNDEFINED;
1276 
1277 	if (vm->use_cpu_for_update) {
1278 		/* params.src is used as flag to indicate system Memory */
1279 		if (pages_addr)
1280 			params.src = ~0;
1281 
1282 		/* Wait for PT BOs to be free. PTs share the same resv. object
1283 		 * as the root PD BO
1284 		 */
1285 		r = amdgpu_vm_wait_pd(adev, vm, owner);
1286 		if (unlikely(r))
1287 			return r;
1288 
1289 		params.func = amdgpu_vm_cpu_set_ptes;
1290 		params.pages_addr = pages_addr;
1291 		return amdgpu_vm_frag_ptes(&params, start, last + 1,
1292 					   addr, flags);
1293 	}
1294 
1295 	ring = container_of(vm->entity.sched, struct amdgpu_ring, sched);
1296 
1297 	nptes = last - start + 1;
1298 
1299 	/*
1300 	 * reserve space for two commands every (1 << BLOCK_SIZE)
1301 	 * entries or 2k dwords (whatever is smaller)
1302 	 *
1303 	 * The second command is for the shadow pagetables.
1304 	 */
1305 	if (vm->root.base.bo->shadow)
1306 		ncmds = ((nptes >> min(adev->vm_manager.block_size, 11u)) + 1) * 2;
1307 	else
1308 		ncmds = ((nptes >> min(adev->vm_manager.block_size, 11u)) + 1);
1309 
1310 	/* padding, etc. */
1311 	ndw = 64;
1312 
1313 	if (pages_addr) {
1314 		/* copy commands needed */
1315 		ndw += ncmds * adev->vm_manager.vm_pte_funcs->copy_pte_num_dw;
1316 
1317 		/* and also PTEs */
1318 		ndw += nptes * 2;
1319 
1320 		params.func = amdgpu_vm_do_copy_ptes;
1321 
1322 	} else {
1323 		/* set page commands needed */
1324 		ndw += ncmds * 10;
1325 
1326 		/* extra commands for begin/end fragments */
1327 		ndw += 2 * 10 * adev->vm_manager.fragment_size;
1328 
1329 		params.func = amdgpu_vm_do_set_ptes;
1330 	}
1331 
1332 	r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
1333 	if (r)
1334 		return r;
1335 
1336 	params.ib = &job->ibs[0];
1337 
1338 	if (pages_addr) {
1339 		uint64_t *pte;
1340 		unsigned i;
1341 
1342 		/* Put the PTEs at the end of the IB. */
1343 		i = ndw - nptes * 2;
1344 		pte = (uint64_t *)&(job->ibs->ptr[i]);
1345 		params.src = job->ibs->gpu_addr + i * 4;
1346 
1347 		for (i = 0; i < nptes; ++i) {
1348 			pte[i] = amdgpu_vm_map_gart(pages_addr, addr + i *
1349 						    AMDGPU_GPU_PAGE_SIZE);
1350 			pte[i] |= flags;
1351 		}
1352 		addr = 0;
1353 	}
1354 
1355 	r = amdgpu_sync_fence(adev, &job->sync, exclusive, false);
1356 	if (r)
1357 		goto error_free;
1358 
1359 	r = amdgpu_sync_resv(adev, &job->sync, vm->root.base.bo->tbo.resv,
1360 			     owner, false);
1361 	if (r)
1362 		goto error_free;
1363 
1364 	r = reservation_object_reserve_shared(vm->root.base.bo->tbo.resv);
1365 	if (r)
1366 		goto error_free;
1367 
1368 	r = amdgpu_vm_frag_ptes(&params, start, last + 1, addr, flags);
1369 	if (r)
1370 		goto error_free;
1371 
1372 	amdgpu_ring_pad_ib(ring, params.ib);
1373 	WARN_ON(params.ib->length_dw > ndw);
1374 	r = amdgpu_job_submit(job, ring, &vm->entity,
1375 			      AMDGPU_FENCE_OWNER_VM, &f);
1376 	if (r)
1377 		goto error_free;
1378 
1379 	amdgpu_bo_fence(vm->root.base.bo, f, true);
1380 	dma_fence_put(*fence);
1381 	*fence = f;
1382 	return 0;
1383 
1384 error_free:
1385 	amdgpu_job_free(job);
1386 	return r;
1387 }
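/*
 * Editor's note on the IB sizing above (kept symbolic, no asic-specific
 * dword counts assumed): for the copy path the PTE payload itself is
 * stored at the tail of the IB, so
 *
 *	ndw = 64 (padding) + ncmds * copy_pte_num_dw + nptes * 2
 *
 * and params.src points at job->ibs->gpu_addr + (ndw - nptes * 2) * 4,
 * which is exactly where the loop above writes the resolved GART
 * addresses.
 */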
1388 
1389 /**
1390  * amdgpu_vm_bo_split_mapping - split a mapping into smaller chunks
1391  *
1392  * @adev: amdgpu_device pointer
1393  * @exclusive: fence we need to sync to
1394  * @pages_addr: DMA addresses to use for mapping
1395  * @vm: requested vm
1396  * @mapping: mapped range and flags to use for the update
1397  * @flags: HW flags for the mapping
1398  * @nodes: array of drm_mm_nodes with the MC addresses
1399  * @fence: optional resulting fence
1400  *
1401  * Split the mapping into smaller chunks so that each update fits
1402  * into a SDMA IB.
1403  * Returns 0 for success, -EINVAL for failure.
1404  */
1405 static int amdgpu_vm_bo_split_mapping(struct amdgpu_device *adev,
1406 				      struct dma_fence *exclusive,
1407 				      dma_addr_t *pages_addr,
1408 				      struct amdgpu_vm *vm,
1409 				      struct amdgpu_bo_va_mapping *mapping,
1410 				      uint64_t flags,
1411 				      struct drm_mm_node *nodes,
1412 				      struct dma_fence **fence)
1413 {
1414 	unsigned min_linear_pages = 1 << adev->vm_manager.fragment_size;
1415 	uint64_t pfn, start = mapping->start;
1416 	int r;
1417 
1418 	/* Normally, bo_va->flags only contains the READABLE and WRITEABLE bits
1419 	 * here, but just in case we filter the flags anyway.
1420 	 */
1421 	if (!(mapping->flags & AMDGPU_PTE_READABLE))
1422 		flags &= ~AMDGPU_PTE_READABLE;
1423 	if (!(mapping->flags & AMDGPU_PTE_WRITEABLE))
1424 		flags &= ~AMDGPU_PTE_WRITEABLE;
1425 
1426 	flags &= ~AMDGPU_PTE_EXECUTABLE;
1427 	flags |= mapping->flags & AMDGPU_PTE_EXECUTABLE;
1428 
1429 	flags &= ~AMDGPU_PTE_MTYPE_MASK;
1430 	flags |= (mapping->flags & AMDGPU_PTE_MTYPE_MASK);
1431 
1432 	if ((mapping->flags & AMDGPU_PTE_PRT) &&
1433 	    (adev->asic_type >= CHIP_VEGA10)) {
1434 		flags |= AMDGPU_PTE_PRT;
1435 		flags &= ~AMDGPU_PTE_VALID;
1436 	}
1437 
1438 	trace_amdgpu_vm_bo_update(mapping);
1439 
1440 	pfn = mapping->offset >> PAGE_SHIFT;
1441 	if (nodes) {
1442 		while (pfn >= nodes->size) {
1443 			pfn -= nodes->size;
1444 			++nodes;
1445 		}
1446 	}
1447 
1448 	do {
1449 		dma_addr_t *dma_addr = NULL;
1450 		uint64_t max_entries;
1451 		uint64_t addr, last;
1452 
1453 		if (nodes) {
1454 			addr = nodes->start << PAGE_SHIFT;
1455 			max_entries = (nodes->size - pfn) *
1456 				(PAGE_SIZE / AMDGPU_GPU_PAGE_SIZE);
1457 		} else {
1458 			addr = 0;
1459 			max_entries = S64_MAX;
1460 		}
1461 
1462 		if (pages_addr) {
1463 			uint64_t count;
1464 
1465 			max_entries = min(max_entries, 16ull * 1024ull);
1466 			for (count = 1; count < max_entries; ++count) {
1467 				uint64_t idx = pfn + count;
1468 
1469 				if (pages_addr[idx] !=
1470 				    (pages_addr[idx - 1] + PAGE_SIZE))
1471 					break;
1472 			}
1473 
1474 			if (count < min_linear_pages) {
1475 				addr = pfn << PAGE_SHIFT;
1476 				dma_addr = pages_addr;
1477 			} else {
1478 				addr = pages_addr[pfn];
1479 				max_entries = count;
1480 			}
1481 
1482 		} else if (flags & AMDGPU_PTE_VALID) {
1483 			addr += adev->vm_manager.vram_base_offset;
1484 			addr += pfn << PAGE_SHIFT;
1485 		}
1486 
1487 		last = min((uint64_t)mapping->last, start + max_entries - 1);
1488 		r = amdgpu_vm_bo_update_mapping(adev, exclusive, dma_addr, vm,
1489 						start, last, flags, addr,
1490 						fence);
1491 		if (r)
1492 			return r;
1493 
1494 		pfn += last - start + 1;
1495 		if (nodes && nodes->size == pfn) {
1496 			pfn = 0;
1497 			++nodes;
1498 		}
1499 		start = last + 1;
1500 
1501 	} while (unlikely(start != mapping->last + 1));
1502 
1503 	return 0;
1504 }
1505 
1506 /**
1507  * amdgpu_vm_bo_update - update all BO mappings in the vm page table
1508  *
1509  * @adev: amdgpu_device pointer
1510  * @bo_va: requested BO and VM object
1511  * @clear: if true clear the entries
1512  *
1513  * Fill in the page table entries for @bo_va.
1514  * Returns 0 for success, -EINVAL for failure.
1515  */
1516 int amdgpu_vm_bo_update(struct amdgpu_device *adev,
1517 			struct amdgpu_bo_va *bo_va,
1518 			bool clear)
1519 {
1520 	struct amdgpu_bo *bo = bo_va->base.bo;
1521 	struct amdgpu_vm *vm = bo_va->base.vm;
1522 	struct amdgpu_bo_va_mapping *mapping;
1523 	dma_addr_t *pages_addr = NULL;
1524 	struct ttm_mem_reg *mem;
1525 	struct drm_mm_node *nodes;
1526 	struct dma_fence *exclusive, **last_update;
1527 	uint64_t flags;
1528 	int r;
1529 
1530 	if (clear || !bo_va->base.bo) {
1531 		mem = NULL;
1532 		nodes = NULL;
1533 		exclusive = NULL;
1534 	} else {
1535 		struct ttm_dma_tt *ttm;
1536 
1537 		mem = &bo_va->base.bo->tbo.mem;
1538 		nodes = mem->mm_node;
1539 		if (mem->mem_type == TTM_PL_TT) {
1540 			ttm = container_of(bo_va->base.bo->tbo.ttm,
1541 					   struct ttm_dma_tt, ttm);
1542 			pages_addr = ttm->dma_address;
1543 		}
1544 		exclusive = reservation_object_get_excl(bo->tbo.resv);
1545 	}
1546 
1547 	if (bo)
1548 		flags = amdgpu_ttm_tt_pte_flags(adev, bo->tbo.ttm, mem);
1549 	else
1550 		flags = 0x0;
1551 
1552 	if (clear || (bo && bo->tbo.resv == vm->root.base.bo->tbo.resv))
1553 		last_update = &vm->last_update;
1554 	else
1555 		last_update = &bo_va->last_pt_update;
1556 
1557 	if (!clear && bo_va->base.moved) {
1558 		bo_va->base.moved = false;
1559 		list_splice_init(&bo_va->valids, &bo_va->invalids);
1560 
1561 	} else if (bo_va->cleared != clear) {
1562 		list_splice_init(&bo_va->valids, &bo_va->invalids);
1563 	}
1564 
1565 	list_for_each_entry(mapping, &bo_va->invalids, list) {
1566 		r = amdgpu_vm_bo_split_mapping(adev, exclusive, pages_addr, vm,
1567 					       mapping, flags, nodes,
1568 					       last_update);
1569 		if (r)
1570 			return r;
1571 	}
1572 
1573 	if (vm->use_cpu_for_update) {
1574 		/* Flush HDP */
1575 		mb();
1576 		amdgpu_asic_flush_hdp(adev, NULL);
1577 	}
1578 
1579 	spin_lock(&vm->moved_lock);
1580 	list_del_init(&bo_va->base.vm_status);
1581 	spin_unlock(&vm->moved_lock);
1582 
1583 	/* If the BO is not in its preferred location add it back to
1584 	 * the evicted list so that it gets validated again on the
1585 	 * next command submission.
1586 	 */
1587 	if (bo && bo->tbo.resv == vm->root.base.bo->tbo.resv) {
1588 		uint32_t mem_type = bo->tbo.mem.mem_type;
1589 
1590 		if (!(bo->preferred_domains & amdgpu_mem_type_to_domain(mem_type)))
1591 			list_add_tail(&bo_va->base.vm_status, &vm->evicted);
1592 		else
1593 			list_add(&bo_va->base.vm_status, &vm->idle);
1594 	}
1595 
1596 	list_splice_init(&bo_va->invalids, &bo_va->valids);
1597 	bo_va->cleared = clear;
1598 
1599 	if (trace_amdgpu_vm_bo_mapping_enabled()) {
1600 		list_for_each_entry(mapping, &bo_va->valids, list)
1601 			trace_amdgpu_vm_bo_mapping(mapping);
1602 	}
1603 
1604 	return 0;
1605 }
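/*
 * Editor's sketch of how the pieces in this file fit together on the
 * command-submission path (an approximation of the ordering, hedged):
 *
 *	amdgpu_vm_validate_pt_bos()      re-validate evicted PDs/PTs
 *	amdgpu_vm_clear_freed()          unmap ranges freed since last submit
 *	amdgpu_vm_handle_moved()         amdgpu_vm_bo_update() for moved BOs
 *	amdgpu_vm_update_directories()   write PDEs for relocated PDs/PTs
 *	amdgpu_vm_flush()                flush the VM hub before the job runs
 */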
1606 
1607 /**
1608  * amdgpu_vm_update_prt_state - update the global PRT state
1609  */
1610 static void amdgpu_vm_update_prt_state(struct amdgpu_device *adev)
1611 {
1612 	unsigned long flags;
1613 	bool enable;
1614 
1615 	spin_lock_irqsave(&adev->vm_manager.prt_lock, flags);
1616 	enable = !!atomic_read(&adev->vm_manager.num_prt_users);
1617 	adev->gmc.gmc_funcs->set_prt(adev, enable);
1618 	spin_unlock_irqrestore(&adev->vm_manager.prt_lock, flags);
1619 }
1620 
1621 /**
1622  * amdgpu_vm_prt_get - add a PRT user
1623  */
1624 static void amdgpu_vm_prt_get(struct amdgpu_device *adev)
1625 {
1626 	if (!adev->gmc.gmc_funcs->set_prt)
1627 		return;
1628 
1629 	if (atomic_inc_return(&adev->vm_manager.num_prt_users) == 1)
1630 		amdgpu_vm_update_prt_state(adev);
1631 }
1632 
1633 /**
1634  * amdgpu_vm_prt_put - drop a PRT user
1635  */
1636 static void amdgpu_vm_prt_put(struct amdgpu_device *adev)
1637 {
1638 	if (atomic_dec_return(&adev->vm_manager.num_prt_users) == 0)
1639 		amdgpu_vm_update_prt_state(adev);
1640 }
1641 
1642 /**
1643  * amdgpu_vm_prt_cb - callback for updating the PRT status
1644  */
1645 static void amdgpu_vm_prt_cb(struct dma_fence *fence, struct dma_fence_cb *_cb)
1646 {
1647 	struct amdgpu_prt_cb *cb = container_of(_cb, struct amdgpu_prt_cb, cb);
1648 
1649 	amdgpu_vm_prt_put(cb->adev);
1650 	kfree(cb);
1651 }
1652 
1653 /**
1654  * amdgpu_vm_add_prt_cb - add callback for updating the PRT status
1655  */
1656 static void amdgpu_vm_add_prt_cb(struct amdgpu_device *adev,
1657 				 struct dma_fence *fence)
1658 {
1659 	struct amdgpu_prt_cb *cb;
1660 
1661 	if (!adev->gmc.gmc_funcs->set_prt)
1662 		return;
1663 
1664 	cb = kmalloc(sizeof(struct amdgpu_prt_cb), GFP_KERNEL);
1665 	if (!cb) {
1666 		/* Last resort when we are OOM */
1667 		if (fence)
1668 			dma_fence_wait(fence, false);
1669 
1670 		amdgpu_vm_prt_put(adev);
1671 	} else {
1672 		cb->adev = adev;
1673 		if (!fence || dma_fence_add_callback(fence, &cb->cb,
1674 						     amdgpu_vm_prt_cb))
1675 			amdgpu_vm_prt_cb(fence, &cb->cb);
1676 	}
1677 }
1678 
1679 /**
1680  * amdgpu_vm_free_mapping - free a mapping
1681  *
1682  * @adev: amdgpu_device pointer
1683  * @vm: requested vm
1684  * @mapping: mapping to be freed
1685  * @fence: fence of the unmap operation
1686  *
1687  * Free a mapping and make sure we decrease the PRT usage count if applicable.
1688  */
1689 static void amdgpu_vm_free_mapping(struct amdgpu_device *adev,
1690 				   struct amdgpu_vm *vm,
1691 				   struct amdgpu_bo_va_mapping *mapping,
1692 				   struct dma_fence *fence)
1693 {
1694 	if (mapping->flags & AMDGPU_PTE_PRT)
1695 		amdgpu_vm_add_prt_cb(adev, fence);
1696 	kfree(mapping);
1697 }
1698 
1699 /**
1700  * amdgpu_vm_prt_fini - finish all prt mappings
1701  *
1702  * @adev: amdgpu_device pointer
1703  * @vm: requested vm
1704  *
1705  * Register a cleanup callback to disable PRT support after VM dies.
1706  */
1707 static void amdgpu_vm_prt_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
1708 {
1709 	struct reservation_object *resv = vm->root.base.bo->tbo.resv;
1710 	struct dma_fence *excl, **shared;
1711 	unsigned i, shared_count;
1712 	int r;
1713 
1714 	r = reservation_object_get_fences_rcu(resv, &excl,
1715 					      &shared_count, &shared);
1716 	if (r) {
1717 		/* Not enough memory to grab the fence list, as last resort
1718 		 * block for all the fences to complete.
1719 		 */
1720 		reservation_object_wait_timeout_rcu(resv, true, false,
1721 						    MAX_SCHEDULE_TIMEOUT);
1722 		return;
1723 	}
1724 
1725 	/* Add a callback for each fence in the reservation object */
1726 	amdgpu_vm_prt_get(adev);
1727 	amdgpu_vm_add_prt_cb(adev, excl);
1728 
1729 	for (i = 0; i < shared_count; ++i) {
1730 		amdgpu_vm_prt_get(adev);
1731 		amdgpu_vm_add_prt_cb(adev, shared[i]);
1732 	}
1733 
1734 	kfree(shared);
1735 }
1736 
1737 /**
1738  * amdgpu_vm_clear_freed - clear freed BOs in the PT
1739  *
1740  * @adev: amdgpu_device pointer
1741  * @vm: requested vm
1742  * @fence: optional resulting fence (unchanged if no work needed to be done
1743  * or if an error occurred)
1744  *
1745  * Make sure all freed BOs are cleared in the PT.
1746  * Returns 0 for success.
1747  *
1748  * PTs have to be reserved and mutex must be locked!
1749  */
1750 int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
1751 			  struct amdgpu_vm *vm,
1752 			  struct dma_fence **fence)
1753 {
1754 	struct amdgpu_bo_va_mapping *mapping;
1755 	uint64_t init_pte_value = 0;
1756 	struct dma_fence *f = NULL;
1757 	int r;
1758 
1759 	while (!list_empty(&vm->freed)) {
1760 		mapping = list_first_entry(&vm->freed,
1761 			struct amdgpu_bo_va_mapping, list);
1762 		list_del(&mapping->list);
1763 
1764 		if (vm->pte_support_ats && mapping->start < AMDGPU_VA_HOLE_START)
1765 			init_pte_value = AMDGPU_PTE_DEFAULT_ATC;
1766 
1767 		r = amdgpu_vm_bo_update_mapping(adev, NULL, NULL, vm,
1768 						mapping->start, mapping->last,
1769 						init_pte_value, 0, &f);
1770 		amdgpu_vm_free_mapping(adev, vm, mapping, f);
1771 		if (r) {
1772 			dma_fence_put(f);
1773 			return r;
1774 		}
1775 	}
1776 
1777 	if (fence && f) {
1778 		dma_fence_put(*fence);
1779 		*fence = f;
1780 	} else {
1781 		dma_fence_put(f);
1782 	}
1783 
1784 	return 0;
1785 
1786 }
1787 
1788 /**
1789  * amdgpu_vm_handle_moved - handle moved BOs in the PT
1790  *
1791  * @adev: amdgpu_device pointer
1792  * @vm: requested vm
1794  *
1795  * Make sure all BOs which are moved are updated in the PTs.
1796  * Returns 0 for success.
1797  *
1798  * PTs have to be reserved!
1799  */
1800 int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
1801 			   struct amdgpu_vm *vm)
1802 {
1803 	struct amdgpu_bo_va *bo_va, *tmp;
1804 	struct list_head moved;
1805 	bool clear;
1806 	int r;
1807 
1808 	INIT_LIST_HEAD(&moved);
1809 	spin_lock(&vm->moved_lock);
1810 	list_splice_init(&vm->moved, &moved);
1811 	spin_unlock(&vm->moved_lock);
1812 
1813 	list_for_each_entry_safe(bo_va, tmp, &moved, base.vm_status) {
1814 		struct reservation_object *resv = bo_va->base.bo->tbo.resv;
1815 
1816 		/* Per VM BOs never need to be cleared in the page tables */
1817 		if (resv == vm->root.base.bo->tbo.resv)
1818 			clear = false;
1819 		/* Try to reserve the BO to avoid clearing its ptes */
1820 		else if (!amdgpu_vm_debug && reservation_object_trylock(resv))
1821 			clear = false;
1822 		/* Somebody else is using the BO right now */
1823 		else
1824 			clear = true;
1825 
1826 		r = amdgpu_vm_bo_update(adev, bo_va, clear);
1827 		if (r) {
1828 			spin_lock(&vm->moved_lock);
1829 			list_splice(&moved, &vm->moved);
1830 			spin_unlock(&vm->moved_lock);
1831 			return r;
1832 		}
1833 
1834 		if (!clear && resv != vm->root.base.bo->tbo.resv)
1835 			reservation_object_unlock(resv);
1836 
1837 	}
1838 
1839 	return 0;
1840 }
1841 
1842 /**
1843  * amdgpu_vm_bo_add - add a bo to a specific vm
1844  *
1845  * @adev: amdgpu_device pointer
1846  * @vm: requested vm
1847  * @bo: amdgpu buffer object
1848  *
1849  * Add @bo into the requested vm and to the list of BOs associated with
1850  * the vm.
1851  * Returns newly added bo_va or NULL for failure
1852  *
1853  * Object has to be reserved!
1854  */
1855 struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev,
1856 				      struct amdgpu_vm *vm,
1857 				      struct amdgpu_bo *bo)
1858 {
1859 	struct amdgpu_bo_va *bo_va;
1860 
1861 	bo_va = kzalloc(sizeof(struct amdgpu_bo_va), GFP_KERNEL);
1862 	if (bo_va == NULL) {
1863 		return NULL;
1864 	}
1865 	amdgpu_vm_bo_base_init(&bo_va->base, vm, bo);
1866 
1867 	bo_va->ref_count = 1;
1868 	INIT_LIST_HEAD(&bo_va->valids);
1869 	INIT_LIST_HEAD(&bo_va->invalids);
1870 
1871 	return bo_va;
1872 }
1873 
1874 
1875 /**
1876  * amdgpu_vm_bo_insert_map - insert a new mapping
1877  *
1878  * @adev: amdgpu_device pointer
1879  * @bo_va: bo_va to store the address
1880  * @mapping: the mapping to insert
1881  *
1882  * Insert a new mapping into all structures.
1883  */
1884 static void amdgpu_vm_bo_insert_map(struct amdgpu_device *adev,
1885 				    struct amdgpu_bo_va *bo_va,
1886 				    struct amdgpu_bo_va_mapping *mapping)
1887 {
1888 	struct amdgpu_vm *vm = bo_va->base.vm;
1889 	struct amdgpu_bo *bo = bo_va->base.bo;
1890 
1891 	mapping->bo_va = bo_va;
1892 	list_add(&mapping->list, &bo_va->invalids);
1893 	amdgpu_vm_it_insert(mapping, &vm->va);
1894 
1895 	if (mapping->flags & AMDGPU_PTE_PRT)
1896 		amdgpu_vm_prt_get(adev);
1897 
1898 	if (bo && bo->tbo.resv == vm->root.base.bo->tbo.resv &&
1899 	    !bo_va->base.moved) {
1900 		spin_lock(&vm->moved_lock);
1901 		list_move(&bo_va->base.vm_status, &vm->moved);
1902 		spin_unlock(&vm->moved_lock);
1903 	}
1904 	trace_amdgpu_vm_bo_map(bo_va, mapping);
1905 }
1906 
1907 /**
1908  * amdgpu_vm_bo_map - map bo inside a vm
1909  *
1910  * @adev: amdgpu_device pointer
1911  * @bo_va: bo_va to store the address
1912  * @saddr: where to map the BO
1913  * @offset: requested offset in the BO
1914  * @flags: attributes of pages (read/write/valid/etc.)
1915  *
1916  * Add a mapping of the BO at the specified addr into the VM.
1917  * Returns 0 for success, error for failure.
1918  *
1919  * Object has to be reserved and unreserved outside!
1920  */
1921 int amdgpu_vm_bo_map(struct amdgpu_device *adev,
1922 		     struct amdgpu_bo_va *bo_va,
1923 		     uint64_t saddr, uint64_t offset,
1924 		     uint64_t size, uint64_t flags)
1925 {
1926 	struct amdgpu_bo_va_mapping *mapping, *tmp;
1927 	struct amdgpu_bo *bo = bo_va->base.bo;
1928 	struct amdgpu_vm *vm = bo_va->base.vm;
1929 	uint64_t eaddr;
1930 
1931 	/* validate the parameters */
1932 	if (saddr & AMDGPU_GPU_PAGE_MASK || offset & AMDGPU_GPU_PAGE_MASK ||
1933 	    size == 0 || size & AMDGPU_GPU_PAGE_MASK)
1934 		return -EINVAL;
1935 
1936 	/* make sure object fit at this offset */
1937 	eaddr = saddr + size - 1;
1938 	if (saddr >= eaddr ||
1939 	    (bo && offset + size > amdgpu_bo_size(bo)))
1940 		return -EINVAL;
1941 
1942 	saddr /= AMDGPU_GPU_PAGE_SIZE;
1943 	eaddr /= AMDGPU_GPU_PAGE_SIZE;
1944 
1945 	tmp = amdgpu_vm_it_iter_first(&vm->va, saddr, eaddr);
1946 	if (tmp) {
1947 		/* bo and tmp overlap, invalid addr */
1948 		dev_err(adev->dev, "bo %p va 0x%010Lx-0x%010Lx conflict with "
1949 			"0x%010Lx-0x%010Lx\n", bo, saddr, eaddr,
1950 			tmp->start, tmp->last + 1);
1951 		return -EINVAL;
1952 	}
1953 
1954 	mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
1955 	if (!mapping)
1956 		return -ENOMEM;
1957 
1958 	mapping->start = saddr;
1959 	mapping->last = eaddr;
1960 	mapping->offset = offset;
1961 	mapping->flags = flags;
1962 
1963 	amdgpu_vm_bo_insert_map(adev, bo_va, mapping);
1964 
1965 	return 0;
1966 }
1967 
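/*
 * Usage sketch (illustrative): mapping a whole, reserved BO at a fixed GPU
 * virtual address.  The VA and flags below are made up for the example;
 * saddr, offset and size must be AMDGPU_GPU_PAGE_SIZE aligned or -EINVAL
 * is returned.
 *
 *	struct amdgpu_bo_va *bo_va;
 *	int r;
 *
 *	bo_va = amdgpu_vm_bo_add(adev, vm, bo);
 *	if (!bo_va)
 *		return -ENOMEM;
 *
 *	r = amdgpu_vm_bo_map(adev, bo_va, 0x400000, 0, amdgpu_bo_size(bo),
 *			     AMDGPU_PTE_READABLE | AMDGPU_PTE_WRITEABLE);
 *	if (r)
 *		return r;
 */
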
1968 /**
1969  * amdgpu_vm_bo_replace_map - map bo inside a vm, replacing existing mappings
1970  *
1971  * @adev: amdgpu_device pointer
1972  * @bo_va: bo_va to store the address
1973  * @saddr: where to map the BO
1974  * @offset: requested offset in the BO
1975  * @flags: attributes of pages (read/write/valid/etc.)
1976  *
1977  * Add a mapping of the BO at the specified addr into the VM. Replace existing
1978  * mappings as we do so.
1979  * Returns 0 for success, error for failure.
1980  *
1981  * Object has to be reserved and unreserved outside!
1982  */
1983 int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev,
1984 			     struct amdgpu_bo_va *bo_va,
1985 			     uint64_t saddr, uint64_t offset,
1986 			     uint64_t size, uint64_t flags)
1987 {
1988 	struct amdgpu_bo_va_mapping *mapping;
1989 	struct amdgpu_bo *bo = bo_va->base.bo;
1990 	uint64_t eaddr;
1991 	int r;
1992 
1993 	/* validate the parameters */
1994 	if (saddr & AMDGPU_GPU_PAGE_MASK || offset & AMDGPU_GPU_PAGE_MASK ||
1995 	    size == 0 || size & AMDGPU_GPU_PAGE_MASK)
1996 		return -EINVAL;
1997 
1998 	/* make sure object fit at this offset */
1999 	eaddr = saddr + size - 1;
2000 	if (saddr >= eaddr ||
2001 	    (bo && offset + size > amdgpu_bo_size(bo)))
2002 		return -EINVAL;
2003 
2004 	/* Allocate all the needed memory */
2005 	mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
2006 	if (!mapping)
2007 		return -ENOMEM;
2008 
2009 	r = amdgpu_vm_bo_clear_mappings(adev, bo_va->base.vm, saddr, size);
2010 	if (r) {
2011 		kfree(mapping);
2012 		return r;
2013 	}
2014 
2015 	saddr /= AMDGPU_GPU_PAGE_SIZE;
2016 	eaddr /= AMDGPU_GPU_PAGE_SIZE;
2017 
2018 	mapping->start = saddr;
2019 	mapping->last = eaddr;
2020 	mapping->offset = offset;
2021 	mapping->flags = flags;
2022 
2023 	amdgpu_vm_bo_insert_map(adev, bo_va, mapping);
2024 
2025 	return 0;
2026 }
2027 
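/*
 * Illustrative contrast with amdgpu_vm_bo_map() (addresses are made up):
 * amdgpu_vm_bo_map() returns -EINVAL if the range overlaps an existing
 * mapping, while this function first punches a hole with
 * amdgpu_vm_bo_clear_mappings() and then inserts the new mapping, so the
 * second call below is legal even though it overlaps the first.
 *
 *	amdgpu_vm_bo_replace_map(adev, bo_va, 0x400000, 0, 0x200000, flags);
 *	amdgpu_vm_bo_replace_map(adev, bo_va, 0x500000, 0, 0x200000, flags);
 */
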
2028 /**
2029  * amdgpu_vm_bo_unmap - remove bo mapping from vm
2030  *
2031  * @adev: amdgpu_device pointer
2032  * @bo_va: bo_va to remove the address from
2033  * @saddr: where the BO is mapped
2034  *
2035  * Remove a mapping of the BO at the specified addr from the VM.
2036  * Returns 0 for success, error for failure.
2037  *
2038  * Object has to be reserved and unreserved outside!
2039  */
2040 int amdgpu_vm_bo_unmap(struct amdgpu_device *adev,
2041 		       struct amdgpu_bo_va *bo_va,
2042 		       uint64_t saddr)
2043 {
2044 	struct amdgpu_bo_va_mapping *mapping;
2045 	struct amdgpu_vm *vm = bo_va->base.vm;
2046 	bool valid = true;
2047 
2048 	saddr /= AMDGPU_GPU_PAGE_SIZE;
2049 
2050 	list_for_each_entry(mapping, &bo_va->valids, list) {
2051 		if (mapping->start == saddr)
2052 			break;
2053 	}
2054 
2055 	if (&mapping->list == &bo_va->valids) {
2056 		valid = false;
2057 
2058 		list_for_each_entry(mapping, &bo_va->invalids, list) {
2059 			if (mapping->start == saddr)
2060 				break;
2061 		}
2062 
2063 		if (&mapping->list == &bo_va->invalids)
2064 			return -ENOENT;
2065 	}
2066 
2067 	list_del(&mapping->list);
2068 	amdgpu_vm_it_remove(mapping, &vm->va);
2069 	mapping->bo_va = NULL;
2070 	trace_amdgpu_vm_bo_unmap(bo_va, mapping);
2071 
2072 	if (valid)
2073 		list_add(&mapping->list, &vm->freed);
2074 	else
2075 		amdgpu_vm_free_mapping(adev, vm, mapping,
2076 				       bo_va->last_pt_update);
2077 
2078 	return 0;
2079 }
2080 
2081 /**
2082  * amdgpu_vm_bo_clear_mappings - remove all mappings in a specific range
2083  *
2084  * @adev: amdgpu_device pointer
2085  * @vm: VM structure to use
2086  * @saddr: start of the range
2087  * @size: size of the range
2088  *
2089  * Remove all mappings in a range, split them as appropriate.
2090  * Returns 0 for success, error for failure.
2091  */
2092 int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev,
2093 				struct amdgpu_vm *vm,
2094 				uint64_t saddr, uint64_t size)
2095 {
2096 	struct amdgpu_bo_va_mapping *before, *after, *tmp, *next;
2097 	LIST_HEAD(removed);
2098 	uint64_t eaddr;
2099 
2100 	eaddr = saddr + size - 1;
2101 	saddr /= AMDGPU_GPU_PAGE_SIZE;
2102 	eaddr /= AMDGPU_GPU_PAGE_SIZE;
2103 
2104 	/* Allocate all the needed memory */
2105 	before = kzalloc(sizeof(*before), GFP_KERNEL);
2106 	if (!before)
2107 		return -ENOMEM;
2108 	INIT_LIST_HEAD(&before->list);
2109 
2110 	after = kzalloc(sizeof(*after), GFP_KERNEL);
2111 	if (!after) {
2112 		kfree(before);
2113 		return -ENOMEM;
2114 	}
2115 	INIT_LIST_HEAD(&after->list);
2116 
2117 	/* Now gather all removed mappings */
2118 	tmp = amdgpu_vm_it_iter_first(&vm->va, saddr, eaddr);
2119 	while (tmp) {
2120 		/* Remember mapping split at the start */
2121 		if (tmp->start < saddr) {
2122 			before->start = tmp->start;
2123 			before->last = saddr - 1;
2124 			before->offset = tmp->offset;
2125 			before->flags = tmp->flags;
2126 			before->bo_va = tmp->bo_va;
2127 			list_add(&before->list, &tmp->bo_va->invalids);
2128 		}
2129 
2130 		/* Remember mapping split at the end */
2131 		if (tmp->last > eaddr) {
2132 			after->start = eaddr + 1;
2133 			after->last = tmp->last;
2134 			after->offset = tmp->offset;
2135 			after->offset += (after->start - tmp->start) * AMDGPU_GPU_PAGE_SIZE;
2136 			after->flags = tmp->flags;
2137 			after->bo_va = tmp->bo_va;
2138 			list_add(&after->list, &tmp->bo_va->invalids);
2139 		}
2140 
2141 		list_del(&tmp->list);
2142 		list_add(&tmp->list, &removed);
2143 
2144 		tmp = amdgpu_vm_it_iter_next(tmp, saddr, eaddr);
2145 	}
2146 
2147 	/* And free them up */
2148 	list_for_each_entry_safe(tmp, next, &removed, list) {
2149 		amdgpu_vm_it_remove(tmp, &vm->va);
2150 		list_del(&tmp->list);
2151 
2152 		if (tmp->start < saddr)
2153 			tmp->start = saddr;
2154 		if (tmp->last > eaddr)
2155 			tmp->last = eaddr;
2156 
2157 		tmp->bo_va = NULL;
2158 		list_add(&tmp->list, &vm->freed);
2159 		trace_amdgpu_vm_bo_unmap(NULL, tmp);
2160 	}
2161 
2162 	/* Insert partial mapping before the range */
2163 	if (!list_empty(&before->list)) {
2164 		amdgpu_vm_it_insert(before, &vm->va);
2165 		if (before->flags & AMDGPU_PTE_PRT)
2166 			amdgpu_vm_prt_get(adev);
2167 	} else {
2168 		kfree(before);
2169 	}
2170 
2171 	/* Insert partial mapping after the range */
2172 	if (!list_empty(&after->list)) {
2173 		amdgpu_vm_it_insert(after, &vm->va);
2174 		if (after->flags & AMDGPU_PTE_PRT)
2175 			amdgpu_vm_prt_get(adev);
2176 	} else {
2177 		kfree(after);
2178 	}
2179 
2180 	return 0;
2181 }
2182 
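/*
 * Worked example (informal): assume a single mapping covering GPU VA
 * 0x400000..0x7fffff.  Clearing saddr = 0x500000, size = 0x100000 leaves a
 * "before" remainder at 0x400000..0x4fffff with the original offset and an
 * "after" remainder at 0x600000..0x7fffff with the offset advanced by
 * 0x200000, while the pages in between are queued on vm->freed and cleared
 * later by amdgpu_vm_clear_freed().
 */
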
2183 /**
2184  * amdgpu_vm_bo_lookup_mapping - find mapping by address
2185  *
2186  * @vm: the requested VM
2187  *
2188  * Find a mapping by its address.
2189  */
2190 struct amdgpu_bo_va_mapping *amdgpu_vm_bo_lookup_mapping(struct amdgpu_vm *vm,
2191 							 uint64_t addr)
2192 {
2193 	return amdgpu_vm_it_iter_first(&vm->va, addr, addr);
2194 }
2195 
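/*
 * Usage sketch (illustrative): the interval tree is indexed in GPU pages,
 * so callers convert a byte address first.
 *
 *	addr /= AMDGPU_GPU_PAGE_SIZE;
 *	mapping = amdgpu_vm_bo_lookup_mapping(vm, addr);
 *	if (mapping && mapping->bo_va)
 *		bo = mapping->bo_va->base.bo;
 */
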
2196 /**
2197  * amdgpu_vm_bo_rmv - remove a bo from a specific vm
2198  *
2199  * @adev: amdgpu_device pointer
2200  * @bo_va: requested bo_va
2201  *
2202  * Remove @bo_va->bo from the requested vm.
2203  *
2204  * Object has to be reserved!
2205  */
2206 void amdgpu_vm_bo_rmv(struct amdgpu_device *adev,
2207 		      struct amdgpu_bo_va *bo_va)
2208 {
2209 	struct amdgpu_bo_va_mapping *mapping, *next;
2210 	struct amdgpu_vm *vm = bo_va->base.vm;
2211 
2212 	list_del(&bo_va->base.bo_list);
2213 
2214 	spin_lock(&vm->moved_lock);
2215 	list_del(&bo_va->base.vm_status);
2216 	spin_unlock(&vm->moved_lock);
2217 
2218 	list_for_each_entry_safe(mapping, next, &bo_va->valids, list) {
2219 		list_del(&mapping->list);
2220 		amdgpu_vm_it_remove(mapping, &vm->va);
2221 		mapping->bo_va = NULL;
2222 		trace_amdgpu_vm_bo_unmap(bo_va, mapping);
2223 		list_add(&mapping->list, &vm->freed);
2224 	}
2225 	list_for_each_entry_safe(mapping, next, &bo_va->invalids, list) {
2226 		list_del(&mapping->list);
2227 		amdgpu_vm_it_remove(mapping, &vm->va);
2228 		amdgpu_vm_free_mapping(adev, vm, mapping,
2229 				       bo_va->last_pt_update);
2230 	}
2231 
2232 	dma_fence_put(bo_va->last_pt_update);
2233 	kfree(bo_va);
2234 }
2235 
2236 /**
2237  * amdgpu_vm_bo_invalidate - mark the bo as invalid
2238  *
2239  * @adev: amdgpu_device pointer
2240  * @bo: amdgpu buffer object
2241  * @evicted: is the BO evicted
2242  *
2243  * Mark @bo as invalid.
2244  */
2245 void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
2246 			     struct amdgpu_bo *bo, bool evicted)
2247 {
2248 	struct amdgpu_vm_bo_base *bo_base;
2249 
2250 	/* shadow bo doesn't have bo base, its validation needs its parent */
2251 	if (bo->parent && bo->parent->shadow == bo)
2252 		bo = bo->parent;
2253 
2254 	list_for_each_entry(bo_base, &bo->va, bo_list) {
2255 		struct amdgpu_vm *vm = bo_base->vm;
2256 		bool was_moved = bo_base->moved;
2257 
2258 		bo_base->moved = true;
2259 		if (evicted && bo->tbo.resv == vm->root.base.bo->tbo.resv) {
2260 			if (bo->tbo.type == ttm_bo_type_kernel)
2261 				list_move(&bo_base->vm_status, &vm->evicted);
2262 			else
2263 				list_move_tail(&bo_base->vm_status,
2264 					       &vm->evicted);
2265 			continue;
2266 		}
2267 
2268 		if (was_moved)
2269 			continue;
2270 
2271 		if (bo->tbo.type == ttm_bo_type_kernel) {
2272 			list_move(&bo_base->vm_status, &vm->relocated);
2273 		} else {
2274 			spin_lock(&bo_base->vm->moved_lock);
2275 			list_move(&bo_base->vm_status, &vm->moved);
2276 			spin_unlock(&bo_base->vm->moved_lock);
2277 		}
2278 	}
2279 }
2280 
2281 static uint32_t amdgpu_vm_get_block_size(uint64_t vm_size)
2282 {
2283 	/* Total bits covered by PD + PTs */
2284 	unsigned bits = ilog2(vm_size) + 18;
2285 
2286 	/* Make sure the PD is 4K in size up to 8GB address space.
2287 	   Above that split equally between PD and PTs */
2288 	if (vm_size <= 8)
2289 		return (bits - 9);
2290 	else
2291 		return ((bits + 3) / 2);
2292 }
2293 
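/*
 * Worked example (informal): for an 8 GB address space, bits =
 * ilog2(8) + 18 = 21 (2^21 4K pages).  A 4K page directory covers 9 of
 * those bits, leaving 21 - 9 = 12 bits per page table, which is what
 * "bits - 9" returns.  For 64 GB, bits = 24 and the split is
 * (24 + 3) / 2 = 13 bits for the page tables, the rest for the directory.
 */
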
2294 /**
2295  * amdgpu_vm_adjust_size - adjust vm size, block size and fragment size
2296  *
2297  * @adev: amdgpu_device pointer
2298  * @vm_size: the default vm size if it's set auto
2299  * @vm_size: the default vm size if it's set to auto
2300 void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t vm_size,
2301 			   uint32_t fragment_size_default, unsigned max_level,
2302 			   unsigned max_bits)
2303 {
2304 	uint64_t tmp;
2305 
2306 	/* adjust vm size first */
2307 	if (amdgpu_vm_size != -1) {
2308 		unsigned max_size = 1 << (max_bits - 30);
2309 
2310 		vm_size = amdgpu_vm_size;
2311 		if (vm_size > max_size) {
2312 			dev_warn(adev->dev, "VM size (%d) too large, max is %u GB\n",
2313 				 amdgpu_vm_size, max_size);
2314 			vm_size = max_size;
2315 		}
2316 	}
2317 
2318 	adev->vm_manager.max_pfn = (uint64_t)vm_size << 18;
2319 
2320 	tmp = roundup_pow_of_two(adev->vm_manager.max_pfn);
2321 	if (amdgpu_vm_block_size != -1)
2322 		tmp >>= amdgpu_vm_block_size - 9;
2323 	tmp = DIV_ROUND_UP(fls64(tmp) - 1, 9) - 1;
2324 	adev->vm_manager.num_level = min(max_level, (unsigned)tmp);
2325 	switch (adev->vm_manager.num_level) {
2326 	case 3:
2327 		adev->vm_manager.root_level = AMDGPU_VM_PDB2;
2328 		break;
2329 	case 2:
2330 		adev->vm_manager.root_level = AMDGPU_VM_PDB1;
2331 		break;
2332 	case 1:
2333 		adev->vm_manager.root_level = AMDGPU_VM_PDB0;
2334 		break;
2335 	default:
2336 		dev_err(adev->dev, "VMPT only supports 2~4+1 levels\n");
2337 	}
2338 	/* block size depends on vm size and hw setup */
2339 	if (amdgpu_vm_block_size != -1)
2340 		adev->vm_manager.block_size =
2341 			min((unsigned)amdgpu_vm_block_size, max_bits
2342 			    - AMDGPU_GPU_PAGE_SHIFT
2343 			    - 9 * adev->vm_manager.num_level);
2344 	else if (adev->vm_manager.num_level > 1)
2345 		adev->vm_manager.block_size = 9;
2346 	else
2347 		adev->vm_manager.block_size = amdgpu_vm_get_block_size(tmp);
2348 
2349 	if (amdgpu_vm_fragment_size == -1)
2350 		adev->vm_manager.fragment_size = fragment_size_default;
2351 	else
2352 		adev->vm_manager.fragment_size = amdgpu_vm_fragment_size;
2353 
2354 	DRM_INFO("vm size is %u GB, %u levels, block size is %u-bit, fragment size is %u-bit\n",
2355 		 vm_size, adev->vm_manager.num_level + 1,
2356 		 adev->vm_manager.block_size,
2357 		 adev->vm_manager.fragment_size);
2358 }
2359 
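/*
 * Worked example (informal, made-up inputs): with vm_size = 256 (GB),
 * max_level = 3 and no module parameter overrides, max_pfn becomes
 * 256 << 18 = 2^26 pages, fls64(2^26) - 1 = 26 bits of page index and
 * DIV_ROUND_UP(26, 9) - 1 = 2, so num_level = 2 (three levels counting the
 * root, root_level = AMDGPU_VM_PDB1) with a 9-bit block size and the
 * default fragment size.
 */
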
2360 /**
2361  * amdgpu_vm_init - initialize a vm instance
2362  *
2363  * @adev: amdgpu_device pointer
2364  * @vm: requested vm
2365  * @vm_context: Indicates whether it is a GFX or Compute context
2366  *
2367  * Init @vm fields.
2368  */
2369 int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
2370 		   int vm_context, unsigned int pasid)
2371 {
2372 	struct amdgpu_bo_param bp;
2373 	struct amdgpu_bo *root;
2374 	const unsigned align = min(AMDGPU_VM_PTB_ALIGN_SIZE,
2375 		AMDGPU_VM_PTE_COUNT(adev) * 8);
2376 	unsigned ring_instance;
2377 	struct amdgpu_ring *ring;
2378 	struct drm_sched_rq *rq;
2379 	unsigned long size;
2380 	uint64_t flags;
2381 	int r, i;
2382 
2383 	vm->va = RB_ROOT_CACHED;
2384 	for (i = 0; i < AMDGPU_MAX_VMHUBS; i++)
2385 		vm->reserved_vmid[i] = NULL;
2386 	INIT_LIST_HEAD(&vm->evicted);
2387 	INIT_LIST_HEAD(&vm->relocated);
2388 	spin_lock_init(&vm->moved_lock);
2389 	INIT_LIST_HEAD(&vm->moved);
2390 	INIT_LIST_HEAD(&vm->idle);
2391 	INIT_LIST_HEAD(&vm->freed);
2392 
2393 	/* create scheduler entity for page table updates */
2394 
2395 	ring_instance = atomic_inc_return(&adev->vm_manager.vm_pte_next_ring);
2396 	ring_instance %= adev->vm_manager.vm_pte_num_rings;
2397 	ring = adev->vm_manager.vm_pte_rings[ring_instance];
2398 	rq = &ring->sched.sched_rq[DRM_SCHED_PRIORITY_KERNEL];
2399 	r = drm_sched_entity_init(&ring->sched, &vm->entity,
2400 				  rq, NULL);
2401 	if (r)
2402 		return r;
2403 
2404 	vm->pte_support_ats = false;
2405 
2406 	if (vm_context == AMDGPU_VM_CONTEXT_COMPUTE) {
2407 		vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
2408 						AMDGPU_VM_USE_CPU_FOR_COMPUTE);
2409 
2410 		if (adev->asic_type == CHIP_RAVEN)
2411 			vm->pte_support_ats = true;
2412 	} else {
2413 		vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
2414 						AMDGPU_VM_USE_CPU_FOR_GFX);
2415 	}
2416 	DRM_DEBUG_DRIVER("VM update mode is %s\n",
2417 			 vm->use_cpu_for_update ? "CPU" : "SDMA");
2418 	WARN_ONCE((vm->use_cpu_for_update && !amdgpu_vm_is_large_bar(adev)),
2419 		  "CPU update of VM recommended only for large BAR system\n");
2420 	vm->last_update = NULL;
2421 
2422 	flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
2423 	if (vm->use_cpu_for_update)
2424 		flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
2425 	else
2426 		flags |= AMDGPU_GEM_CREATE_SHADOW;
2427 
2428 	size = amdgpu_vm_bo_size(adev, adev->vm_manager.root_level);
2429 	memset(&bp, 0, sizeof(bp));
2430 	bp.size = size;
2431 	bp.byte_align = align;
2432 	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
2433 	bp.flags = flags;
2434 	bp.type = ttm_bo_type_kernel;
2435 	bp.resv = NULL;
2436 	r = amdgpu_bo_create(adev, &bp, &root);
2437 	if (r)
2438 		goto error_free_sched_entity;
2439 
2440 	r = amdgpu_bo_reserve(root, true);
2441 	if (r)
2442 		goto error_free_root;
2443 
2444 	r = amdgpu_vm_clear_bo(adev, vm, root,
2445 			       adev->vm_manager.root_level,
2446 			       vm->pte_support_ats);
2447 	if (r)
2448 		goto error_unreserve;
2449 
2450 	amdgpu_vm_bo_base_init(&vm->root.base, vm, root);
2451 	amdgpu_bo_unreserve(vm->root.base.bo);
2452 
2453 	if (pasid) {
2454 		unsigned long flags;
2455 
2456 		spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
2457 		r = idr_alloc(&adev->vm_manager.pasid_idr, vm, pasid, pasid + 1,
2458 			      GFP_ATOMIC);
2459 		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
2460 		if (r < 0)
2461 			goto error_free_root;
2462 
2463 		vm->pasid = pasid;
2464 	}
2465 
2466 	INIT_KFIFO(vm->faults);
2467 	vm->fault_credit = 16;
2468 
2469 	return 0;
2470 
2471 error_unreserve:
2472 	amdgpu_bo_unreserve(vm->root.base.bo);
2473 
2474 error_free_root:
2475 	amdgpu_bo_unref(&vm->root.base.bo->shadow);
2476 	amdgpu_bo_unref(&vm->root.base.bo);
2477 	vm->root.base.bo = NULL;
2478 
2479 error_free_sched_entity:
2480 	drm_sched_entity_fini(&ring->sched, &vm->entity);
2481 
2482 	return r;
2483 }
2484 
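/*
 * Usage sketch (illustrative): a GFX VM is typically created per DRM file
 * descriptor, e.g. from the driver open path (error handling trimmed,
 * pasid may be 0 if none was allocated):
 *
 *	r = amdgpu_vm_init(adev, &fpriv->vm, AMDGPU_VM_CONTEXT_GFX, pasid);
 *	if (r)
 *		return r;
 */
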
2485 /**
2486  * amdgpu_vm_make_compute - Turn a GFX VM into a compute VM
2487  *
2488  * This only works on GFX VMs that don't have any BOs added and no
2489  * page tables allocated yet.
2490  *
2491  * Changes the following VM parameters:
2492  * - use_cpu_for_update
2493  * - pte_supports_ats
2494  * - pasid (old PASID is released, because compute manages its own PASIDs)
2495  *
2496  * Reinitializes the page directory to reflect the changed ATS
2497  * setting. May leave behind an unused shadow BO for the page
2498  * directory when switching from SDMA updates to CPU updates.
2499  *
2500  * Returns 0 for success, -errno for errors.
2501  */
2502 int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm)
2503 {
2504 	bool pte_support_ats = (adev->asic_type == CHIP_RAVEN);
2505 	int r;
2506 
2507 	r = amdgpu_bo_reserve(vm->root.base.bo, true);
2508 	if (r)
2509 		return r;
2510 
2511 	/* Sanity checks */
2512 	if (!RB_EMPTY_ROOT(&vm->va.rb_root) || vm->root.entries) {
2513 		r = -EINVAL;
2514 		goto error;
2515 	}
2516 
2517 	/* Check if PD needs to be reinitialized and do it before
2518 	 * changing any other state, in case it fails.
2519 	 */
2520 	if (pte_support_ats != vm->pte_support_ats) {
2521 		r = amdgpu_vm_clear_bo(adev, vm, vm->root.base.bo,
2522 			       adev->vm_manager.root_level,
2523 			       pte_support_ats);
2524 		if (r)
2525 			goto error;
2526 	}
2527 
2528 	/* Update VM state */
2529 	vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
2530 				    AMDGPU_VM_USE_CPU_FOR_COMPUTE);
2531 	vm->pte_support_ats = pte_support_ats;
2532 	DRM_DEBUG_DRIVER("VM update mode is %s\n",
2533 			 vm->use_cpu_for_update ? "CPU" : "SDMA");
2534 	WARN_ONCE((vm->use_cpu_for_update && !amdgpu_vm_is_large_bar(adev)),
2535 		  "CPU update of VM recommended only for large BAR system\n");
2536 
2537 	if (vm->pasid) {
2538 		unsigned long flags;
2539 
2540 		spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
2541 		idr_remove(&adev->vm_manager.pasid_idr, vm->pasid);
2542 		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
2543 
2544 		vm->pasid = 0;
2545 	}
2546 
2547 error:
2548 	amdgpu_bo_unreserve(vm->root.base.bo);
2549 	return r;
2550 }
2551 
2552 /**
2553  * amdgpu_vm_free_levels - free PD/PT levels
2554  *
2555  * @adev: amdgpu device structure
2556  * @parent: PD/PT starting level to free
2557  * @level: level of parent structure
2558  *
2559  * Free the page directory or page table level and all sub levels.
2560  */
2561 static void amdgpu_vm_free_levels(struct amdgpu_device *adev,
2562 				  struct amdgpu_vm_pt *parent,
2563 				  unsigned level)
2564 {
2565 	unsigned i, num_entries = amdgpu_vm_num_entries(adev, level);
2566 
2567 	if (parent->base.bo) {
2568 		list_del(&parent->base.bo_list);
2569 		list_del(&parent->base.vm_status);
2570 		amdgpu_bo_unref(&parent->base.bo->shadow);
2571 		amdgpu_bo_unref(&parent->base.bo);
2572 	}
2573 
2574 	if (parent->entries)
2575 		for (i = 0; i < num_entries; i++)
2576 			amdgpu_vm_free_levels(adev, &parent->entries[i],
2577 					      level + 1);
2578 
2579 	kvfree(parent->entries);
2580 }
2581 
2582 /**
2583  * amdgpu_vm_fini - tear down a vm instance
2584  *
2585  * @adev: amdgpu_device pointer
2586  * @vm: requested vm
2587  *
2588  * Tear down @vm.
2589  * Unbind the VM and remove all bos from the vm bo list
2590  */
2591 void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
2592 {
2593 	struct amdgpu_bo_va_mapping *mapping, *tmp;
2594 	bool prt_fini_needed = !!adev->gmc.gmc_funcs->set_prt;
2595 	struct amdgpu_bo *root;
2596 	u64 fault;
2597 	int i, r;
2598 
2599 	amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
2600 
2601 	/* Clear pending page faults from IH when the VM is destroyed */
2602 	while (kfifo_get(&vm->faults, &fault))
2603 		amdgpu_ih_clear_fault(adev, fault);
2604 
2605 	if (vm->pasid) {
2606 		unsigned long flags;
2607 
2608 		spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
2609 		idr_remove(&adev->vm_manager.pasid_idr, vm->pasid);
2610 		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
2611 	}
2612 
2613 	drm_sched_entity_fini(vm->entity.sched, &vm->entity);
2614 
2615 	if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
2616 		dev_err(adev->dev, "still active bo inside vm\n");
2617 	}
2618 	rbtree_postorder_for_each_entry_safe(mapping, tmp,
2619 					     &vm->va.rb_root, rb) {
2620 		list_del(&mapping->list);
2621 		amdgpu_vm_it_remove(mapping, &vm->va);
2622 		kfree(mapping);
2623 	}
2624 	list_for_each_entry_safe(mapping, tmp, &vm->freed, list) {
2625 		if (mapping->flags & AMDGPU_PTE_PRT && prt_fini_needed) {
2626 			amdgpu_vm_prt_fini(adev, vm);
2627 			prt_fini_needed = false;
2628 		}
2629 
2630 		list_del(&mapping->list);
2631 		amdgpu_vm_free_mapping(adev, vm, mapping, NULL);
2632 	}
2633 
2634 	root = amdgpu_bo_ref(vm->root.base.bo);
2635 	r = amdgpu_bo_reserve(root, true);
2636 	if (r) {
2637 		dev_err(adev->dev, "Leaking page tables because BO reservation failed\n");
2638 	} else {
2639 		amdgpu_vm_free_levels(adev, &vm->root,
2640 				      adev->vm_manager.root_level);
2641 		amdgpu_bo_unreserve(root);
2642 	}
2643 	amdgpu_bo_unref(&root);
2644 	dma_fence_put(vm->last_update);
2645 	for (i = 0; i < AMDGPU_MAX_VMHUBS; i++)
2646 		amdgpu_vmid_free_reserved(adev, vm, i);
2647 }
2648 
2649 /**
2650  * amdgpu_vm_pasid_fault_credit - Check fault credit for given PASID
2651  *
2652  * @adev: amdgpu_device pointer
2653  * @pasid: PASID to identify the VM
2654  *
2655  * This function is expected to be called in interrupt context. Returns
2656  * true if there was fault credit, false otherwise
2657  */
2658 bool amdgpu_vm_pasid_fault_credit(struct amdgpu_device *adev,
2659 				  unsigned int pasid)
2660 {
2661 	struct amdgpu_vm *vm;
2662 
2663 	spin_lock(&adev->vm_manager.pasid_lock);
2664 	vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
2665 	if (!vm) {
2666 		/* VM not found, can't track fault credit */
2667 		spin_unlock(&adev->vm_manager.pasid_lock);
2668 		return true;
2669 	}
2670 
2671 	/* No lock needed. Only accessed by IRQ handler */
2672 	if (!vm->fault_credit) {
2673 		/* Too many faults in this VM */
2674 		spin_unlock(&adev->vm_manager.pasid_lock);
2675 		return false;
2676 	}
2677 
2678 	vm->fault_credit--;
2679 	spin_unlock(&adev->vm_manager.pasid_lock);
2680 	return true;
2681 }
2682 
2683 /**
2684  * amdgpu_vm_manager_init - init the VM manager
2685  *
2686  * @adev: amdgpu_device pointer
2687  *
2688  * Initialize the VM manager structures
2689  */
2690 void amdgpu_vm_manager_init(struct amdgpu_device *adev)
2691 {
2692 	unsigned i;
2693 
2694 	amdgpu_vmid_mgr_init(adev);
2695 
2696 	adev->vm_manager.fence_context =
2697 		dma_fence_context_alloc(AMDGPU_MAX_RINGS);
2698 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i)
2699 		adev->vm_manager.seqno[i] = 0;
2700 
2701 	atomic_set(&adev->vm_manager.vm_pte_next_ring, 0);
2702 	spin_lock_init(&adev->vm_manager.prt_lock);
2703 	atomic_set(&adev->vm_manager.num_prt_users, 0);
2704 
2705 	/* Unless overridden by the user, compute VM page tables are updated
2706 	 * by the CPU only on large BAR systems.
2707 	 */
2708 #ifdef CONFIG_X86_64
2709 	if (amdgpu_vm_update_mode == -1) {
2710 		if (amdgpu_vm_is_large_bar(adev))
2711 			adev->vm_manager.vm_update_mode =
2712 				AMDGPU_VM_USE_CPU_FOR_COMPUTE;
2713 		else
2714 			adev->vm_manager.vm_update_mode = 0;
2715 	} else
2716 		adev->vm_manager.vm_update_mode = amdgpu_vm_update_mode;
2717 #else
2718 	adev->vm_manager.vm_update_mode = 0;
2719 #endif
2720 
2721 	idr_init(&adev->vm_manager.pasid_idr);
2722 	spin_lock_init(&adev->vm_manager.pasid_lock);
2723 }
2724 
2725 /**
2726  * amdgpu_vm_manager_fini - cleanup VM manager
2727  *
2728  * @adev: amdgpu_device pointer
2729  *
2730  * Cleanup the VM manager and free resources.
2731  */
2732 void amdgpu_vm_manager_fini(struct amdgpu_device *adev)
2733 {
2734 	WARN_ON(!idr_is_empty(&adev->vm_manager.pasid_idr));
2735 	idr_destroy(&adev->vm_manager.pasid_idr);
2736 
2737 	amdgpu_vmid_mgr_fini(adev);
2738 }
2739 
2740 int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
2741 {
2742 	union drm_amdgpu_vm *args = data;
2743 	struct amdgpu_device *adev = dev->dev_private;
2744 	struct amdgpu_fpriv *fpriv = filp->driver_priv;
2745 	int r;
2746 
2747 	switch (args->in.op) {
2748 	case AMDGPU_VM_OP_RESERVE_VMID:
2749 		/* currently, we only have the requirement to reserve vmid from gfxhub */
2750 		r = amdgpu_vmid_alloc_reserved(adev, &fpriv->vm, AMDGPU_GFXHUB);
2751 		if (r)
2752 			return r;
2753 		break;
2754 	case AMDGPU_VM_OP_UNRESERVE_VMID:
2755 		amdgpu_vmid_free_reserved(adev, &fpriv->vm, AMDGPU_GFXHUB);
2756 		break;
2757 	default:
2758 		return -EINVAL;
2759 	}
2760 
2761 	return 0;
2762 }
2763
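/*
 * Userspace-side sketch (illustrative, assumes libdrm's drmIoctl() helper):
 * reserving a VMID for the calling process.  DRM_IOCTL_AMDGPU_VM and
 * union drm_amdgpu_vm come from amdgpu_drm.h.
 *
 *	union drm_amdgpu_vm args = {
 *		.in.op = AMDGPU_VM_OP_RESERVE_VMID,
 *	};
 *
 *	if (drmIoctl(fd, DRM_IOCTL_AMDGPU_VM, &args))
 *		return -errno;
 */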