xref: /openbmc/linux/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c (revision a0ae2562c6c4b2721d9fddba63b7286c13517d9f)
1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/dma-fence-array.h>
29 #include <linux/interval_tree_generic.h>
30 #include <linux/idr.h>
31 #include <drm/drmP.h>
32 #include <drm/amdgpu_drm.h>
33 #include "amdgpu.h"
34 #include "amdgpu_trace.h"
35 #include "amdgpu_amdkfd.h"
36 
37 /*
38  * GPUVM
39  * GPUVM is similar to the legacy gart on older asics, however
40  * rather than there being a single global gart table
41  * for the entire GPU, there are multiple VM page tables active
42  * at any given time.  The VM page tables can contain a mix of
43  * vram pages and system memory pages, and system memory pages
44  * can be mapped as snooped (cached system pages) or unsnooped
45  * (uncached system pages).
46  * Each VM has an ID associated with it and there is a page table
47  * associated with each VMID.  When executing a command buffer,
48  * the kernel tells the ring what VMID to use for that command
49  * buffer.  VMIDs are allocated dynamically as commands are submitted.
50  * The userspace drivers maintain their own address space and the kernel
51  * sets up their page tables accordingly when they submit their
52  * command buffers and a VMID is assigned.
53  * Cayman/Trinity support up to 8 active VMs at any given time;
54  * SI supports 16.
55  */
56 
57 #define START(node) ((node)->start)
58 #define LAST(node) ((node)->last)
59 
60 INTERVAL_TREE_DEFINE(struct amdgpu_bo_va_mapping, rb, uint64_t, __subtree_last,
61 		     START, LAST, static, amdgpu_vm_it)
62 
63 #undef START
64 #undef LAST
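
/*
 * INTERVAL_TREE_DEFINE() above generates the static amdgpu_vm_it_*()
 * helpers (insert, remove, iter_first, iter_next) used to track which
 * parts of a VM address space are covered by bo_va mappings.
 */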
65 
66 /* Local structure. Encapsulate some VM table update parameters to reduce
67  * the number of function parameters
68  */
69 struct amdgpu_pte_update_params {
70 	/* amdgpu device we do this update for */
71 	struct amdgpu_device *adev;
72 	/* optional amdgpu_vm we do this update for */
73 	struct amdgpu_vm *vm;
74 	/* address where to copy page table entries from */
75 	uint64_t src;
76 	/* indirect buffer to fill with commands */
77 	struct amdgpu_ib *ib;
78 	/* Function which actually does the update */
79 	void (*func)(struct amdgpu_pte_update_params *params,
80 		     struct amdgpu_bo *bo, uint64_t pe,
81 		     uint64_t addr, unsigned count, uint32_t incr,
82 		     uint64_t flags);
83 	/* The next two are used during VM update by CPU
84 	 *  DMA addresses to use for mapping
85 	 *  Kernel pointer of PD/PT BO that needs to be updated
86 	 */
87 	dma_addr_t *pages_addr;
88 	void *kptr;
89 };
90 
91 /* Helper to disable partial resident texture feature from a fence callback */
92 struct amdgpu_prt_cb {
93 	struct amdgpu_device *adev;
94 	struct dma_fence_cb cb;
95 };
96 
97 static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
98 				   struct amdgpu_vm *vm,
99 				   struct amdgpu_bo *bo)
100 {
101 	base->vm = vm;
102 	base->bo = bo;
103 	INIT_LIST_HEAD(&base->bo_list);
104 	INIT_LIST_HEAD(&base->vm_status);
105 
106 	if (!bo)
107 		return;
108 	list_add_tail(&base->bo_list, &bo->va);
109 
110 	if (bo->tbo.resv != vm->root.base.bo->tbo.resv)
111 		return;
112 
113 	if (bo->preferred_domains &
114 	    amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type))
115 		return;
116 
117 	/*
118 	 * We checked all the prerequisites, but it looks like this per VM BO
119 	 * is currently evicted. Add the BO to the evicted list to make sure
120 	 * it is validated on next VM use to avoid a fault.
121 	 */
122 	list_move_tail(&base->vm_status, &vm->evicted);
123 }
124 
125 /**
126  * amdgpu_vm_level_shift - return the addr shift for each level
127  *
128  * @adev: amdgpu_device pointer
129  *
130  * Returns the number of bits the pfn needs to be right shifted for a level.
131  */
132 static unsigned amdgpu_vm_level_shift(struct amdgpu_device *adev,
133 				      unsigned level)
134 {
135 	unsigned shift = 0xff;
136 
137 	switch (level) {
138 	case AMDGPU_VM_PDB2:
139 	case AMDGPU_VM_PDB1:
140 	case AMDGPU_VM_PDB0:
141 		shift = 9 * (AMDGPU_VM_PDB0 - level) +
142 			adev->vm_manager.block_size;
143 		break;
144 	case AMDGPU_VM_PTB:
145 		shift = 0;
146 		break;
147 	default:
148 		dev_err(adev->dev, "the level%d isn't supported.\n", level);
149 	}
150 
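	/*
	 * Example, assuming the default 9-bit block size: the resulting
	 * shift is 0 for the PTB (each entry maps one 4KB page), 9 for
	 * PDB0 (2MB per entry), 18 for PDB1 (1GB) and 27 for PDB2 (512GB).
	 */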
151 	return shift;
152 }
153 
154 /**
155  * amdgpu_vm_num_entries - return the number of entries in a PD/PT
156  *
157  * @adev: amdgpu_device pointer
158  *
159  * Calculate the number of entries in a page directory or page table.
160  */
161 static unsigned amdgpu_vm_num_entries(struct amdgpu_device *adev,
162 				      unsigned level)
163 {
164 	unsigned shift = amdgpu_vm_level_shift(adev,
165 					       adev->vm_manager.root_level);
166 
167 	if (level == adev->vm_manager.root_level)
168 		/* For the root directory */
169 		return round_up(adev->vm_manager.max_pfn, 1 << shift) >> shift;
170 	else if (level != AMDGPU_VM_PTB)
171 		/* Everything in between */
172 		return 512;
173 	else
174 		/* For the page tables on the leaves */
175 		return AMDGPU_VM_PTE_COUNT(adev);
176 }
177 
178 /**
179  * amdgpu_vm_bo_size - returns the size of the BOs in bytes
180  *
181  * @adev: amdgpu_device pointer
182  *
183  * Calculate the size of the BO for a page directory or page table in bytes.
184  */
185 static unsigned amdgpu_vm_bo_size(struct amdgpu_device *adev, unsigned level)
186 {
187 	return AMDGPU_GPU_PAGE_ALIGN(amdgpu_vm_num_entries(adev, level) * 8);
188 }
189 
190 /**
191  * amdgpu_vm_get_pd_bo - add the VM PD to a validation list
192  *
193  * @vm: vm providing the BOs
194  * @validated: head of validation list
195  * @entry: entry to add
196  *
197  * Add the page directory to the list of BOs to
198  * validate for command submission.
199  */
200 void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
201 			 struct list_head *validated,
202 			 struct amdgpu_bo_list_entry *entry)
203 {
204 	entry->robj = vm->root.base.bo;
205 	entry->priority = 0;
206 	entry->tv.bo = &entry->robj->tbo;
207 	entry->tv.shared = true;
208 	entry->user_pages = NULL;
209 	list_add(&entry->tv.head, validated);
210 }
211 
212 /**
213  * amdgpu_vm_validate_pt_bos - validate the page table BOs
214  *
215  * @adev: amdgpu device pointer
216  * @vm: vm providing the BOs
217  * @validate: callback to do the validation
218  * @param: parameter for the validation callback
219  *
220  * Validate the page table BOs on command submission if necessary.
221  */
222 int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
223 			      int (*validate)(void *p, struct amdgpu_bo *bo),
224 			      void *param)
225 {
226 	struct ttm_bo_global *glob = adev->mman.bdev.glob;
227 	struct amdgpu_vm_bo_base *bo_base, *tmp;
228 	int r = 0;
229 
230 	list_for_each_entry_safe(bo_base, tmp, &vm->evicted, vm_status) {
231 		struct amdgpu_bo *bo = bo_base->bo;
232 
233 		if (bo->parent) {
234 			r = validate(param, bo);
235 			if (r)
236 				break;
237 
238 			spin_lock(&glob->lru_lock);
239 			ttm_bo_move_to_lru_tail(&bo->tbo);
240 			if (bo->shadow)
241 				ttm_bo_move_to_lru_tail(&bo->shadow->tbo);
242 			spin_unlock(&glob->lru_lock);
243 		}
244 
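		/*
		 * Per VM BOs go to the moved list so their mappings get
		 * updated again, while page table/directory BOs
		 * (ttm_bo_type_kernel) go to the relocated list so their
		 * parent directory entries get rewritten.
		 */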
245 		if (bo->tbo.type != ttm_bo_type_kernel) {
246 			spin_lock(&vm->moved_lock);
247 			list_move(&bo_base->vm_status, &vm->moved);
248 			spin_unlock(&vm->moved_lock);
249 		} else {
250 			list_move(&bo_base->vm_status, &vm->relocated);
251 		}
252 	}
253 
254 	spin_lock(&glob->lru_lock);
255 	list_for_each_entry(bo_base, &vm->idle, vm_status) {
256 		struct amdgpu_bo *bo = bo_base->bo;
257 
258 		if (!bo->parent)
259 			continue;
260 
261 		ttm_bo_move_to_lru_tail(&bo->tbo);
262 		if (bo->shadow)
263 			ttm_bo_move_to_lru_tail(&bo->shadow->tbo);
264 	}
265 	spin_unlock(&glob->lru_lock);
266 
267 	return r;
268 }
269 
270 /**
271  * amdgpu_vm_ready - check VM is ready for updates
272  *
273  * @vm: VM to check
274  *
275  * Check if all VM PDs/PTs are ready for updates
276  */
277 bool amdgpu_vm_ready(struct amdgpu_vm *vm)
278 {
279 	return list_empty(&vm->evicted);
280 }
281 
282 /**
283  * amdgpu_vm_clear_bo - initially clear the PDs/PTs
284  *
285  * @adev: amdgpu_device pointer
286  * @bo: BO to clear
287  * @level: level this BO is at
288  *
289  * Root PD needs to be reserved when calling this.
290  */
291 static int amdgpu_vm_clear_bo(struct amdgpu_device *adev,
292 			      struct amdgpu_vm *vm, struct amdgpu_bo *bo,
293 			      unsigned level, bool pte_support_ats)
294 {
295 	struct ttm_operation_ctx ctx = { true, false };
296 	struct dma_fence *fence = NULL;
297 	unsigned entries, ats_entries;
298 	struct amdgpu_ring *ring;
299 	struct amdgpu_job *job;
300 	uint64_t addr;
301 	int r;
302 
303 	addr = amdgpu_bo_gpu_offset(bo);
304 	entries = amdgpu_bo_size(bo) / 8;
305 
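	/*
	 * When ATS is supported the low part of the address space (below
	 * AMDGPU_VA_HOLE_START) is translated through ATS instead of these
	 * page tables, so its entries are initialized with the default ATC
	 * value rather than zero. Work out how many entries of this PD/PT
	 * fall into that range.
	 */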
306 	if (pte_support_ats) {
307 		if (level == adev->vm_manager.root_level) {
308 			ats_entries = amdgpu_vm_level_shift(adev, level);
309 			ats_entries += AMDGPU_GPU_PAGE_SHIFT;
310 			ats_entries = AMDGPU_VA_HOLE_START >> ats_entries;
311 			ats_entries = min(ats_entries, entries);
312 			entries -= ats_entries;
313 		} else {
314 			ats_entries = entries;
315 			entries = 0;
316 		}
317 	} else {
318 		ats_entries = 0;
319 	}
320 
321 	ring = container_of(vm->entity.sched, struct amdgpu_ring, sched);
322 
323 	r = reservation_object_reserve_shared(bo->tbo.resv);
324 	if (r)
325 		return r;
326 
327 	r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
328 	if (r)
329 		goto error;
330 
331 	r = amdgpu_job_alloc_with_ib(adev, 64, &job);
332 	if (r)
333 		goto error;
334 
335 	if (ats_entries) {
336 		uint64_t ats_value;
337 
338 		ats_value = AMDGPU_PTE_DEFAULT_ATC;
339 		if (level != AMDGPU_VM_PTB)
340 			ats_value |= AMDGPU_PDE_PTE;
341 
342 		amdgpu_vm_set_pte_pde(adev, &job->ibs[0], addr, 0,
343 				      ats_entries, 0, ats_value);
344 		addr += ats_entries * 8;
345 	}
346 
347 	if (entries)
348 		amdgpu_vm_set_pte_pde(adev, &job->ibs[0], addr, 0,
349 				      entries, 0, 0);
350 
351 	amdgpu_ring_pad_ib(ring, &job->ibs[0]);
352 
353 	WARN_ON(job->ibs[0].length_dw > 64);
354 	r = amdgpu_sync_resv(adev, &job->sync, bo->tbo.resv,
355 			     AMDGPU_FENCE_OWNER_UNDEFINED, false);
356 	if (r)
357 		goto error_free;
358 
359 	r = amdgpu_job_submit(job, ring, &vm->entity,
360 			      AMDGPU_FENCE_OWNER_UNDEFINED, &fence);
361 	if (r)
362 		goto error_free;
363 
364 	amdgpu_bo_fence(bo, fence, true);
365 	dma_fence_put(fence);
366 
367 	if (bo->shadow)
368 		return amdgpu_vm_clear_bo(adev, vm, bo->shadow,
369 					  level, pte_support_ats);
370 
371 	return 0;
372 
373 error_free:
374 	amdgpu_job_free(job);
375 
376 error:
377 	return r;
378 }
379 
380 /**
381  * amdgpu_vm_alloc_levels - allocate the PD/PT levels
382  *
383  * @adev: amdgpu_device pointer
384  * @vm: requested vm
385  * @saddr: start of the address range
386  * @eaddr: end of the address range
387  *
388  * Make sure the page directories and page tables are allocated
389  */
390 static int amdgpu_vm_alloc_levels(struct amdgpu_device *adev,
391 				  struct amdgpu_vm *vm,
392 				  struct amdgpu_vm_pt *parent,
393 				  uint64_t saddr, uint64_t eaddr,
394 				  unsigned level, bool ats)
395 {
396 	unsigned shift = amdgpu_vm_level_shift(adev, level);
397 	unsigned pt_idx, from, to;
398 	u64 flags;
399 	int r;
400 
401 	if (!parent->entries) {
402 		unsigned num_entries = amdgpu_vm_num_entries(adev, level);
403 
404 		parent->entries = kvmalloc_array(num_entries,
405 						   sizeof(struct amdgpu_vm_pt),
406 						   GFP_KERNEL | __GFP_ZERO);
407 		if (!parent->entries)
408 			return -ENOMEM;
409 		memset(parent->entries, 0, sizeof(struct amdgpu_vm_pt));
410 	}
411 
412 	from = saddr >> shift;
413 	to = eaddr >> shift;
414 	if (from >= amdgpu_vm_num_entries(adev, level) ||
415 	    to >= amdgpu_vm_num_entries(adev, level))
416 		return -EINVAL;
417 
418 	++level;
419 	saddr = saddr & ((1 << shift) - 1);
420 	eaddr = eaddr & ((1 << shift) - 1);
421 
422 	flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
423 	if (vm->use_cpu_for_update)
424 		flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
425 	else
426 		flags |= (AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
427 				AMDGPU_GEM_CREATE_SHADOW);
428 
429 	/* walk over the address space and allocate the page tables */
430 	for (pt_idx = from; pt_idx <= to; ++pt_idx) {
431 		struct reservation_object *resv = vm->root.base.bo->tbo.resv;
432 		struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
433 		struct amdgpu_bo *pt;
434 
435 		if (!entry->base.bo) {
436 			struct amdgpu_bo_param bp;
437 
438 			memset(&bp, 0, sizeof(bp));
439 			bp.size = amdgpu_vm_bo_size(adev, level);
440 			bp.byte_align = AMDGPU_GPU_PAGE_SIZE;
441 			bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
442 			bp.flags = flags;
443 			bp.type = ttm_bo_type_kernel;
444 			bp.resv = resv;
445 			r = amdgpu_bo_create(adev, &bp, &pt);
446 			if (r)
447 				return r;
448 
449 			r = amdgpu_vm_clear_bo(adev, vm, pt, level, ats);
450 			if (r) {
451 				amdgpu_bo_unref(&pt->shadow);
452 				amdgpu_bo_unref(&pt);
453 				return r;
454 			}
455 
456 			if (vm->use_cpu_for_update) {
457 				r = amdgpu_bo_kmap(pt, NULL);
458 				if (r) {
459 					amdgpu_bo_unref(&pt->shadow);
460 					amdgpu_bo_unref(&pt);
461 					return r;
462 				}
463 			}
464 
465 			/* Keep a reference to the root directory to avoid
466 			 * freeing them up in the wrong order.
467 			 */
468 			pt->parent = amdgpu_bo_ref(parent->base.bo);
469 
470 			amdgpu_vm_bo_base_init(&entry->base, vm, pt);
471 			list_move(&entry->base.vm_status, &vm->relocated);
472 		}
473 
474 		if (level < AMDGPU_VM_PTB) {
475 			uint64_t sub_saddr = (pt_idx == from) ? saddr : 0;
476 			uint64_t sub_eaddr = (pt_idx == to) ? eaddr :
477 				((1 << shift) - 1);
478 			r = amdgpu_vm_alloc_levels(adev, vm, entry, sub_saddr,
479 						   sub_eaddr, level, ats);
480 			if (r)
481 				return r;
482 		}
483 	}
484 
485 	return 0;
486 }
487 
488 /**
489  * amdgpu_vm_alloc_pts - Allocate page tables.
490  *
491  * @adev: amdgpu_device pointer
492  * @vm: VM to allocate page tables for
493  * @saddr: Start address which needs to be allocated
494  * @size: Size from start address we need.
495  *
496  * Make sure the page tables are allocated.
497  */
498 int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
499 			struct amdgpu_vm *vm,
500 			uint64_t saddr, uint64_t size)
501 {
502 	uint64_t eaddr;
503 	bool ats = false;
504 
505 	/* validate the parameters */
506 	if (saddr & AMDGPU_GPU_PAGE_MASK || size & AMDGPU_GPU_PAGE_MASK)
507 		return -EINVAL;
508 
509 	eaddr = saddr + size - 1;
510 
511 	if (vm->pte_support_ats)
512 		ats = saddr < AMDGPU_VA_HOLE_START;
513 
514 	saddr /= AMDGPU_GPU_PAGE_SIZE;
515 	eaddr /= AMDGPU_GPU_PAGE_SIZE;
516 
517 	if (eaddr >= adev->vm_manager.max_pfn) {
518 		dev_err(adev->dev, "va above limit (0x%08llX >= 0x%08llX)\n",
519 			eaddr, adev->vm_manager.max_pfn);
520 		return -EINVAL;
521 	}
522 
523 	return amdgpu_vm_alloc_levels(adev, vm, &vm->root, saddr, eaddr,
524 				      adev->vm_manager.root_level, ats);
525 }
526 
527 /**
528  * amdgpu_vm_check_compute_bug - check whether asic has compute vm bug
529  *
530  * @adev: amdgpu_device pointer
531  */
532 void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev)
533 {
534 	const struct amdgpu_ip_block *ip_block;
535 	bool has_compute_vm_bug;
536 	struct amdgpu_ring *ring;
537 	int i;
538 
539 	has_compute_vm_bug = false;
540 
541 	ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
542 	if (ip_block) {
543 		/* Compute has a VM bug for GFX version < 7.
544 		 * Compute has a VM bug for GFX 8 MEC firmware version < 673. */
545 		if (ip_block->version->major <= 7)
546 			has_compute_vm_bug = true;
547 		else if (ip_block->version->major == 8)
548 			if (adev->gfx.mec_fw_version < 673)
549 				has_compute_vm_bug = true;
550 	}
551 
552 	for (i = 0; i < adev->num_rings; i++) {
553 		ring = adev->rings[i];
554 		if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
555 			/* only compute rings */
556 			ring->has_compute_vm_bug = has_compute_vm_bug;
557 		else
558 			ring->has_compute_vm_bug = false;
559 	}
560 }
561 
562 bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
563 				  struct amdgpu_job *job)
564 {
565 	struct amdgpu_device *adev = ring->adev;
566 	unsigned vmhub = ring->funcs->vmhub;
567 	struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
568 	struct amdgpu_vmid *id;
569 	bool gds_switch_needed;
570 	bool vm_flush_needed = job->vm_needs_flush || ring->has_compute_vm_bug;
571 
572 	if (job->vmid == 0)
573 		return false;
574 	id = &id_mgr->ids[job->vmid];
575 	gds_switch_needed = ring->funcs->emit_gds_switch && (
576 		id->gds_base != job->gds_base ||
577 		id->gds_size != job->gds_size ||
578 		id->gws_base != job->gws_base ||
579 		id->gws_size != job->gws_size ||
580 		id->oa_base != job->oa_base ||
581 		id->oa_size != job->oa_size);
582 
583 	if (amdgpu_vmid_had_gpu_reset(adev, id))
584 		return true;
585 
586 	return vm_flush_needed || gds_switch_needed;
587 }
588 
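/* "Large BAR" means the CPU visible window covers all of VRAM */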
589 static bool amdgpu_vm_is_large_bar(struct amdgpu_device *adev)
590 {
591 	return (adev->gmc.real_vram_size == adev->gmc.visible_vram_size);
592 }
593 
594 /**
595  * amdgpu_vm_flush - hardware flush the vm
596  *
597  * @ring: ring to use for flush
598  * @job: related job
599  * @need_pipe_sync: is pipe sync needed
600  *
601  * Emit a VM flush when it is necessary.
602  */
603 int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_pipe_sync)
604 {
605 	struct amdgpu_device *adev = ring->adev;
606 	unsigned vmhub = ring->funcs->vmhub;
607 	struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
608 	struct amdgpu_vmid *id = &id_mgr->ids[job->vmid];
609 	bool gds_switch_needed = ring->funcs->emit_gds_switch && (
610 		id->gds_base != job->gds_base ||
611 		id->gds_size != job->gds_size ||
612 		id->gws_base != job->gws_base ||
613 		id->gws_size != job->gws_size ||
614 		id->oa_base != job->oa_base ||
615 		id->oa_size != job->oa_size);
616 	bool vm_flush_needed = job->vm_needs_flush;
617 	bool pasid_mapping_needed = id->pasid != job->pasid ||
618 		!id->pasid_mapping ||
619 		!dma_fence_is_signaled(id->pasid_mapping);
620 	struct dma_fence *fence = NULL;
621 	unsigned patch_offset = 0;
622 	int r;
623 
624 	if (amdgpu_vmid_had_gpu_reset(adev, id)) {
625 		gds_switch_needed = true;
626 		vm_flush_needed = true;
627 		pasid_mapping_needed = true;
628 	}
629 
630 	gds_switch_needed &= !!ring->funcs->emit_gds_switch;
631 	vm_flush_needed &= !!ring->funcs->emit_vm_flush;
632 	pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
633 		ring->funcs->emit_wreg;
634 
635 	if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
636 		return 0;
637 
638 	if (ring->funcs->init_cond_exec)
639 		patch_offset = amdgpu_ring_init_cond_exec(ring);
640 
641 	if (need_pipe_sync)
642 		amdgpu_ring_emit_pipeline_sync(ring);
643 
644 	if (vm_flush_needed) {
645 		trace_amdgpu_vm_flush(ring, job->vmid, job->vm_pd_addr);
646 		amdgpu_ring_emit_vm_flush(ring, job->vmid, job->vm_pd_addr);
647 	}
648 
649 	if (pasid_mapping_needed)
650 		amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
651 
652 	if (vm_flush_needed || pasid_mapping_needed) {
653 		r = amdgpu_fence_emit(ring, &fence, 0);
654 		if (r)
655 			return r;
656 	}
657 
658 	if (vm_flush_needed) {
659 		mutex_lock(&id_mgr->lock);
660 		dma_fence_put(id->last_flush);
661 		id->last_flush = dma_fence_get(fence);
662 		id->current_gpu_reset_count =
663 			atomic_read(&adev->gpu_reset_counter);
664 		mutex_unlock(&id_mgr->lock);
665 	}
666 
667 	if (pasid_mapping_needed) {
668 		id->pasid = job->pasid;
669 		dma_fence_put(id->pasid_mapping);
670 		id->pasid_mapping = dma_fence_get(fence);
671 	}
672 	dma_fence_put(fence);
673 
674 	if (ring->funcs->emit_gds_switch && gds_switch_needed) {
675 		id->gds_base = job->gds_base;
676 		id->gds_size = job->gds_size;
677 		id->gws_base = job->gws_base;
678 		id->gws_size = job->gws_size;
679 		id->oa_base = job->oa_base;
680 		id->oa_size = job->oa_size;
681 		amdgpu_ring_emit_gds_switch(ring, job->vmid, job->gds_base,
682 					    job->gds_size, job->gws_base,
683 					    job->gws_size, job->oa_base,
684 					    job->oa_size);
685 	}
686 
687 	if (ring->funcs->patch_cond_exec)
688 		amdgpu_ring_patch_cond_exec(ring, patch_offset);
689 
690 	/* the double SWITCH_BUFFER here *cannot* be skipped by COND_EXEC */
691 	if (ring->funcs->emit_switch_buffer) {
692 		amdgpu_ring_emit_switch_buffer(ring);
693 		amdgpu_ring_emit_switch_buffer(ring);
694 	}
695 	return 0;
696 }
697 
698 /**
699  * amdgpu_vm_bo_find - find the bo_va for a specific vm & bo
700  *
701  * @vm: requested vm
702  * @bo: requested buffer object
703  *
704  * Find @bo inside the requested vm.
705  * Search inside the @bo's vm list for the requested vm
706  * Returns the found bo_va or NULL if none is found
707  *
708  * Object has to be reserved!
709  */
710 struct amdgpu_bo_va *amdgpu_vm_bo_find(struct amdgpu_vm *vm,
711 				       struct amdgpu_bo *bo)
712 {
713 	struct amdgpu_bo_va *bo_va;
714 
715 	list_for_each_entry(bo_va, &bo->va, base.bo_list) {
716 		if (bo_va->base.vm == vm) {
717 			return bo_va;
718 		}
719 	}
720 	return NULL;
721 }
722 
723 /**
724  * amdgpu_vm_do_set_ptes - helper to call the right asic function
725  *
726  * @params: see amdgpu_pte_update_params definition
727  * @bo: PD/PT to update
728  * @pe: addr of the page entry
729  * @addr: dst addr to write into pe
730  * @count: number of page entries to update
731  * @incr: increase next addr by incr bytes
732  * @flags: hw access flags
733  *
734  * Traces the parameters and calls the right asic functions
735  * to setup the page table using the DMA.
736  */
737 static void amdgpu_vm_do_set_ptes(struct amdgpu_pte_update_params *params,
738 				  struct amdgpu_bo *bo,
739 				  uint64_t pe, uint64_t addr,
740 				  unsigned count, uint32_t incr,
741 				  uint64_t flags)
742 {
743 	pe += amdgpu_bo_gpu_offset(bo);
744 	trace_amdgpu_vm_set_ptes(pe, addr, count, incr, flags);
745 
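	/*
	 * For very small updates it is cheaper to write the entries inline
	 * into the IB with amdgpu_vm_write_pte(); larger runs use
	 * amdgpu_vm_set_pte_pde() and let the engine generate the entries.
	 */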
746 	if (count < 3) {
747 		amdgpu_vm_write_pte(params->adev, params->ib, pe,
748 				    addr | flags, count, incr);
749 
750 	} else {
751 		amdgpu_vm_set_pte_pde(params->adev, params->ib, pe, addr,
752 				      count, incr, flags);
753 	}
754 }
755 
756 /**
757  * amdgpu_vm_do_copy_ptes - copy the PTEs from the GART
758  *
759  * @params: see amdgpu_pte_update_params definition
760  * @bo: PD/PT to update
761  * @pe: addr of the page entry
762  * @addr: dst addr to write into pe
763  * @count: number of page entries to update
764  * @incr: increase next addr by incr bytes
765  * @flags: hw access flags
766  *
767  * Traces the parameters and calls the DMA function to copy the PTEs.
768  */
769 static void amdgpu_vm_do_copy_ptes(struct amdgpu_pte_update_params *params,
770 				   struct amdgpu_bo *bo,
771 				   uint64_t pe, uint64_t addr,
772 				   unsigned count, uint32_t incr,
773 				   uint64_t flags)
774 {
775 	uint64_t src = (params->src + (addr >> 12) * 8);
776 
777 	pe += amdgpu_bo_gpu_offset(bo);
778 	trace_amdgpu_vm_copy_ptes(pe, src, count);
779 
780 	amdgpu_vm_copy_pte(params->adev, params->ib, pe, src, count);
781 }
782 
783 /**
784  * amdgpu_vm_map_gart - Resolve gart mapping of addr
785  *
786  * @pages_addr: optional DMA address to use for lookup
787  * @addr: the unmapped addr
788  *
789  * Look up the physical address of the page that the pte resolves
790  * to and return the pointer for the page table entry.
791  */
792 static uint64_t amdgpu_vm_map_gart(const dma_addr_t *pages_addr, uint64_t addr)
793 {
794 	uint64_t result;
795 
796 	/* page table offset */
797 	result = pages_addr[addr >> PAGE_SHIFT];
798 
799 	/* in case cpu page size != gpu page size */
800 	result |= addr & (~PAGE_MASK);
801 
802 	result &= 0xFFFFFFFFFFFFF000ULL;
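
	/*
	 * E.g. with 64KB kernel pages and 4KB GPU pages the array lookup
	 * returns the DMA address of the 64KB page, and the OR/AND above
	 * keep the 4KB aligned offset of the GPU page within it.
	 */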
803 
804 	return result;
805 }
806 
807 /**
808  * amdgpu_vm_cpu_set_ptes - helper to update page tables via CPU
809  *
810  * @params: see amdgpu_pte_update_params definition
811  * @bo: PD/PT to update
812  * @pe: kmap addr of the page entry
813  * @addr: dst addr to write into pe
814  * @count: number of page entries to update
815  * @incr: increase next addr by incr bytes
816  * @flags: hw access flags
817  *
818  * Write count number of PT/PD entries directly.
819  */
820 static void amdgpu_vm_cpu_set_ptes(struct amdgpu_pte_update_params *params,
821 				   struct amdgpu_bo *bo,
822 				   uint64_t pe, uint64_t addr,
823 				   unsigned count, uint32_t incr,
824 				   uint64_t flags)
825 {
826 	unsigned int i;
827 	uint64_t value;
828 
829 	pe += (unsigned long)amdgpu_bo_kptr(bo);
830 
831 	trace_amdgpu_vm_set_ptes(pe, addr, count, incr, flags);
832 
833 	for (i = 0; i < count; i++) {
834 		value = params->pages_addr ?
835 			amdgpu_vm_map_gart(params->pages_addr, addr) :
836 			addr;
837 		amdgpu_gmc_set_pte_pde(params->adev, (void *)(uintptr_t)pe,
838 				       i, value, flags);
839 		addr += incr;
840 	}
841 }
842 
843 static int amdgpu_vm_wait_pd(struct amdgpu_device *adev, struct amdgpu_vm *vm,
844 			     void *owner)
845 {
846 	struct amdgpu_sync sync;
847 	int r;
848 
849 	amdgpu_sync_create(&sync);
850 	amdgpu_sync_resv(adev, &sync, vm->root.base.bo->tbo.resv, owner, false);
851 	r = amdgpu_sync_wait(&sync, true);
852 	amdgpu_sync_free(&sync);
853 
854 	return r;
855 }
856 
857 /*
858  * amdgpu_vm_update_pde - update a single level in the hierarchy
859  *
860  * @params: parameters for the update
861  * @vm: requested vm
862  * @parent: parent directory
863  * @entry: entry to update
864  *
865  * Makes sure the requested entry in parent is up to date.
866  */
867 static void amdgpu_vm_update_pde(struct amdgpu_pte_update_params *params,
868 				 struct amdgpu_vm *vm,
869 				 struct amdgpu_vm_pt *parent,
870 				 struct amdgpu_vm_pt *entry)
871 {
872 	struct amdgpu_bo *bo = parent->base.bo, *pbo;
873 	uint64_t pde, pt, flags;
874 	unsigned level;
875 
876 	/* Don't update huge pages here */
877 	if (entry->huge)
878 		return;
879 
880 	for (level = 0, pbo = bo->parent; pbo; ++level)
881 		pbo = pbo->parent;
882 
883 	level += params->adev->vm_manager.root_level;
884 	pt = amdgpu_bo_gpu_offset(entry->base.bo);
885 	flags = AMDGPU_PTE_VALID;
886 	amdgpu_gmc_get_vm_pde(params->adev, level, &pt, &flags);
887 	pde = (entry - parent->entries) * 8;
888 	if (bo->shadow)
889 		params->func(params, bo->shadow, pde, pt, 1, 0, flags);
890 	params->func(params, bo, pde, pt, 1, 0, flags);
891 }
892 
893 /*
894  * amdgpu_vm_invalidate_level - mark all PD levels as invalid
895  *
896  * @parent: parent PD
897  *
898  * Mark all PD levels as invalid after an error.
899  */
900 static void amdgpu_vm_invalidate_level(struct amdgpu_device *adev,
901 				       struct amdgpu_vm *vm,
902 				       struct amdgpu_vm_pt *parent,
903 				       unsigned level)
904 {
905 	unsigned pt_idx, num_entries;
906 
907 	/*
908 	 * Recurse into the subdirectories. This recursion is harmless because
909 	 * we only have a maximum of 5 layers.
910 	 */
911 	num_entries = amdgpu_vm_num_entries(adev, level);
912 	for (pt_idx = 0; pt_idx < num_entries; ++pt_idx) {
913 		struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
914 
915 		if (!entry->base.bo)
916 			continue;
917 
918 		if (!entry->base.moved)
919 			list_move(&entry->base.vm_status, &vm->relocated);
920 		amdgpu_vm_invalidate_level(adev, vm, entry, level + 1);
921 	}
922 }
923 
924 /*
925  * amdgpu_vm_update_directories - make sure that all directories are valid
926  *
927  * @adev: amdgpu_device pointer
928  * @vm: requested vm
929  *
930  * Makes sure all directories are up to date.
931  * Returns 0 for success, error for failure.
932  */
933 int amdgpu_vm_update_directories(struct amdgpu_device *adev,
934 				 struct amdgpu_vm *vm)
935 {
936 	struct amdgpu_pte_update_params params;
937 	struct amdgpu_job *job;
938 	unsigned ndw = 0;
939 	int r = 0;
940 
941 	if (list_empty(&vm->relocated))
942 		return 0;
943 
944 restart:
945 	memset(&params, 0, sizeof(params));
946 	params.adev = adev;
947 
948 	if (vm->use_cpu_for_update) {
949 		struct amdgpu_vm_bo_base *bo_base;
950 
951 		list_for_each_entry(bo_base, &vm->relocated, vm_status) {
952 			r = amdgpu_bo_kmap(bo_base->bo, NULL);
953 			if (unlikely(r))
954 				return r;
955 		}
956 
957 		r = amdgpu_vm_wait_pd(adev, vm, AMDGPU_FENCE_OWNER_VM);
958 		if (unlikely(r))
959 			return r;
960 
961 		params.func = amdgpu_vm_cpu_set_ptes;
962 	} else {
963 		ndw = 512 * 8;
964 		r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
965 		if (r)
966 			return r;
967 
968 		params.ib = &job->ibs[0];
969 		params.func = amdgpu_vm_do_set_ptes;
970 	}
971 
972 	while (!list_empty(&vm->relocated)) {
973 		struct amdgpu_vm_bo_base *bo_base, *parent;
974 		struct amdgpu_vm_pt *pt, *entry;
975 		struct amdgpu_bo *bo;
976 
977 		bo_base = list_first_entry(&vm->relocated,
978 					   struct amdgpu_vm_bo_base,
979 					   vm_status);
980 		bo_base->moved = false;
981 		list_move(&bo_base->vm_status, &vm->idle);
982 
983 		bo = bo_base->bo->parent;
984 		if (!bo)
985 			continue;
986 
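		/*
		 * PD/PT BOs belong to exactly one VM, so the first (and only)
		 * entry on the parent BO's va list is the base of the parent
		 * directory.
		 */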
987 		parent = list_first_entry(&bo->va, struct amdgpu_vm_bo_base,
988 					  bo_list);
989 		pt = container_of(parent, struct amdgpu_vm_pt, base);
990 		entry = container_of(bo_base, struct amdgpu_vm_pt, base);
991 
992 		amdgpu_vm_update_pde(&params, vm, pt, entry);
993 
994 		if (!vm->use_cpu_for_update &&
995 		    (ndw - params.ib->length_dw) < 32)
996 			break;
997 	}
998 
999 	if (vm->use_cpu_for_update) {
1000 		/* Flush HDP */
1001 		mb();
1002 		amdgpu_asic_flush_hdp(adev, NULL);
1003 	} else if (params.ib->length_dw == 0) {
1004 		amdgpu_job_free(job);
1005 	} else {
1006 		struct amdgpu_bo *root = vm->root.base.bo;
1007 		struct amdgpu_ring *ring;
1008 		struct dma_fence *fence;
1009 
1010 		ring = container_of(vm->entity.sched, struct amdgpu_ring,
1011 				    sched);
1012 
1013 		amdgpu_ring_pad_ib(ring, params.ib);
1014 		amdgpu_sync_resv(adev, &job->sync, root->tbo.resv,
1015 				 AMDGPU_FENCE_OWNER_VM, false);
1016 		WARN_ON(params.ib->length_dw > ndw);
1017 		r = amdgpu_job_submit(job, ring, &vm->entity,
1018 				      AMDGPU_FENCE_OWNER_VM, &fence);
1019 		if (r)
1020 			goto error;
1021 
1022 		amdgpu_bo_fence(root, fence, true);
1023 		dma_fence_put(vm->last_update);
1024 		vm->last_update = fence;
1025 	}
1026 
1027 	if (!list_empty(&vm->relocated))
1028 		goto restart;
1029 
1030 	return 0;
1031 
1032 error:
1033 	amdgpu_vm_invalidate_level(adev, vm, &vm->root,
1034 				   adev->vm_manager.root_level);
1035 	amdgpu_job_free(job);
1036 	return r;
1037 }
1038 
1039 /**
1040  * amdgpu_vm_get_entry - find the entry for an address
1041  *
1042  * @p: see amdgpu_pte_update_params definition
1043  * @addr: virtual address in question
1044  * @entry: resulting entry or NULL
1045  * @parent: parent entry
1046  *
1047  * Find the vm_pt entry and its parent for the given address.
1048  */
1049 void amdgpu_vm_get_entry(struct amdgpu_pte_update_params *p, uint64_t addr,
1050 			 struct amdgpu_vm_pt **entry,
1051 			 struct amdgpu_vm_pt **parent)
1052 {
1053 	unsigned level = p->adev->vm_manager.root_level;
1054 
1055 	*parent = NULL;
1056 	*entry = &p->vm->root;
1057 	while ((*entry)->entries) {
1058 		unsigned shift = amdgpu_vm_level_shift(p->adev, level++);
1059 
1060 		*parent = *entry;
1061 		*entry = &(*entry)->entries[addr >> shift];
1062 		addr &= (1ULL << shift) - 1;
1063 	}
1064 
1065 	if (level != AMDGPU_VM_PTB)
1066 		*entry = NULL;
1067 }
1068 
1069 /**
1070  * amdgpu_vm_handle_huge_pages - handle updating the PD with huge pages
1071  *
1072  * @p: see amdgpu_pte_update_params definition
1073  * @entry: vm_pt entry to check
1074  * @parent: parent entry
1075  * @nptes: number of PTEs updated with this operation
1076  * @dst: destination address where the PTEs should point to
1077  * @flags: access flags for the PTEs
1078  *
1079  * Check if we can update the PD with a huge page.
1080  */
1081 static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p,
1082 					struct amdgpu_vm_pt *entry,
1083 					struct amdgpu_vm_pt *parent,
1084 					unsigned nptes, uint64_t dst,
1085 					uint64_t flags)
1086 {
1087 	uint64_t pde;
1088 
1089 	/* In the case of a mixed PT the PDE must point to it */
1090 	if (p->adev->asic_type >= CHIP_VEGA10 && !p->src &&
1091 	    nptes == AMDGPU_VM_PTE_COUNT(p->adev)) {
1092 		/* Set the huge page flag to stop scanning at this PDE */
1093 		flags |= AMDGPU_PDE_PTE;
1094 	}
1095 
1096 	if (!(flags & AMDGPU_PDE_PTE)) {
1097 		if (entry->huge) {
1098 			/* Add the entry to the relocated list to update it. */
1099 			entry->huge = false;
1100 			list_move(&entry->base.vm_status, &p->vm->relocated);
1101 		}
1102 		return;
1103 	}
1104 
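	/*
	 * Write the huge page mapping into the PDE itself; AMDGPU_PDE_PTE
	 * makes the walker treat it as a PTE covering the whole block
	 * (2MB with the default 9-bit block size) instead of descending
	 * into a page table.
	 */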
1105 	entry->huge = true;
1106 	amdgpu_gmc_get_vm_pde(p->adev, AMDGPU_VM_PDB0, &dst, &flags);
1107 
1108 	pde = (entry - parent->entries) * 8;
1109 	if (parent->base.bo->shadow)
1110 		p->func(p, parent->base.bo->shadow, pde, dst, 1, 0, flags);
1111 	p->func(p, parent->base.bo, pde, dst, 1, 0, flags);
1112 }
1113 
1114 /**
1115  * amdgpu_vm_update_ptes - make sure that page tables are valid
1116  *
1117  * @params: see amdgpu_pte_update_params definition
1118  * @vm: requested vm
1119  * @start: start of GPU address range
1120  * @end: end of GPU address range
1121  * @dst: destination address to map to, the next dst inside the function
1122  * @flags: mapping flags
1123  *
1124  * Update the page tables in the range @start - @end.
1125  * Returns 0 for success, -EINVAL for failure.
1126  */
1127 static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
1128 				  uint64_t start, uint64_t end,
1129 				  uint64_t dst, uint64_t flags)
1130 {
1131 	struct amdgpu_device *adev = params->adev;
1132 	const uint64_t mask = AMDGPU_VM_PTE_COUNT(adev) - 1;
1133 
1134 	uint64_t addr, pe_start;
1135 	struct amdgpu_bo *pt;
1136 	unsigned nptes;
1137 
1138 	/* walk over the address space and update the page tables */
1139 	for (addr = start; addr < end; addr += nptes,
1140 	     dst += nptes * AMDGPU_GPU_PAGE_SIZE) {
1141 		struct amdgpu_vm_pt *entry, *parent;
1142 
1143 		amdgpu_vm_get_entry(params, addr, &entry, &parent);
1144 		if (!entry)
1145 			return -ENOENT;
1146 
1147 		if ((addr & ~mask) == (end & ~mask))
1148 			nptes = end - addr;
1149 		else
1150 			nptes = AMDGPU_VM_PTE_COUNT(adev) - (addr & mask);
1151 
1152 		amdgpu_vm_handle_huge_pages(params, entry, parent,
1153 					    nptes, dst, flags);
1154 		/* We don't need to update PTEs for huge pages */
1155 		if (entry->huge)
1156 			continue;
1157 
1158 		pt = entry->base.bo;
1159 		pe_start = (addr & mask) * 8;
1160 		if (pt->shadow)
1161 			params->func(params, pt->shadow, pe_start, dst, nptes,
1162 				     AMDGPU_GPU_PAGE_SIZE, flags);
1163 		params->func(params, pt, pe_start, dst, nptes,
1164 			     AMDGPU_GPU_PAGE_SIZE, flags);
1165 	}
1166 
1167 	return 0;
1168 }
1169 
1170 /*
1171  * amdgpu_vm_frag_ptes - add fragment information to PTEs
1172  *
1173  * @params: see amdgpu_pte_update_params definition
1174  * @vm: requested vm
1175  * @start: first PTE to handle
1176  * @end: last PTE to handle
1177  * @dst: addr those PTEs should point to
1178  * @flags: hw mapping flags
1179  * Returns 0 for success, -EINVAL for failure.
1180  */
1181 static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params	*params,
1182 				uint64_t start, uint64_t end,
1183 				uint64_t dst, uint64_t flags)
1184 {
1185 	/*
1186 	 * The MC L1 TLB supports variable sized pages, based on a fragment
1187 	 * field in the PTE. When this field is set to a non-zero value, page
1188 	 * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
1189 	 * flags are considered valid for all PTEs within the fragment range
1190 	 * and corresponding mappings are assumed to be physically contiguous.
1191 	 *
1192 	 * The L1 TLB can store a single PTE for the whole fragment,
1193 	 * significantly increasing the space available for translation
1194 	 * caching. This leads to large improvements in throughput when the
1195 	 * TLB is under pressure.
1196 	 *
1197 	 * The L2 TLB distributes small and large fragments into two
1198 	 * asymmetric partitions. The large fragment cache is significantly
1199 	 * larger. Thus, we try to use large fragments wherever possible.
1200 	 * Userspace can support this by aligning virtual base address and
1201 	 * allocation size to the fragment size.
1202 	 */
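	/*
	 * Worked example, assuming a fragment_size of 9: for a mapping that
	 * starts at GPU page 0x200 (2MB aligned) and is 512 pages long,
	 * ffs(start) - 1 and fls64(end - start) - 1 are both 9, so the whole
	 * range is written with AMDGPU_PTE_FRAG(9) and the L1 TLB can cache
	 * it with a single entry.
	 */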
1203 	unsigned max_frag = params->adev->vm_manager.fragment_size;
1204 	int r;
1205 
1206 	/* system pages are not contiguous */
1207 	if (params->src || !(flags & AMDGPU_PTE_VALID))
1208 		return amdgpu_vm_update_ptes(params, start, end, dst, flags);
1209 
1210 	while (start != end) {
1211 		uint64_t frag_flags, frag_end;
1212 		unsigned frag;
1213 
1214 		/* This intentionally wraps around if no bit is set */
1215 		frag = min((unsigned)ffs(start) - 1,
1216 			   (unsigned)fls64(end - start) - 1);
1217 		if (frag >= max_frag) {
1218 			frag_flags = AMDGPU_PTE_FRAG(max_frag);
1219 			frag_end = end & ~((1ULL << max_frag) - 1);
1220 		} else {
1221 			frag_flags = AMDGPU_PTE_FRAG(frag);
1222 			frag_end = start + (1 << frag);
1223 		}
1224 
1225 		r = amdgpu_vm_update_ptes(params, start, frag_end, dst,
1226 					  flags | frag_flags);
1227 		if (r)
1228 			return r;
1229 
1230 		dst += (frag_end - start) * AMDGPU_GPU_PAGE_SIZE;
1231 		start = frag_end;
1232 	}
1233 
1234 	return 0;
1235 }
1236 
1237 /**
1238  * amdgpu_vm_bo_update_mapping - update a mapping in the vm page table
1239  *
1240  * @adev: amdgpu_device pointer
1241  * @exclusive: fence we need to sync to
1242  * @pages_addr: DMA addresses to use for mapping
1243  * @vm: requested vm
1244  * @start: start of mapped range
1245  * @last: last mapped entry
1246  * @flags: flags for the entries
1247  * @addr: addr to set the area to
1248  * @fence: optional resulting fence
1249  *
1250  * Fill in the page table entries between @start and @last.
1251  * Returns 0 for success, -EINVAL for failure.
1252  */
1253 static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
1254 				       struct dma_fence *exclusive,
1255 				       dma_addr_t *pages_addr,
1256 				       struct amdgpu_vm *vm,
1257 				       uint64_t start, uint64_t last,
1258 				       uint64_t flags, uint64_t addr,
1259 				       struct dma_fence **fence)
1260 {
1261 	struct amdgpu_ring *ring;
1262 	void *owner = AMDGPU_FENCE_OWNER_VM;
1263 	unsigned nptes, ncmds, ndw;
1264 	struct amdgpu_job *job;
1265 	struct amdgpu_pte_update_params params;
1266 	struct dma_fence *f = NULL;
1267 	int r;
1268 
1269 	memset(&params, 0, sizeof(params));
1270 	params.adev = adev;
1271 	params.vm = vm;
1272 
1273 	/* sync to everything on unmapping */
1274 	if (!(flags & AMDGPU_PTE_VALID))
1275 		owner = AMDGPU_FENCE_OWNER_UNDEFINED;
1276 
1277 	if (vm->use_cpu_for_update) {
1278 		/* params.src is used as flag to indicate system Memory */
1279 		if (pages_addr)
1280 			params.src = ~0;
1281 
1282 		/* Wait for PT BOs to be free. PTs share the same resv. object
1283 		 * as the root PD BO
1284 		 */
1285 		r = amdgpu_vm_wait_pd(adev, vm, owner);
1286 		if (unlikely(r))
1287 			return r;
1288 
1289 		params.func = amdgpu_vm_cpu_set_ptes;
1290 		params.pages_addr = pages_addr;
1291 		return amdgpu_vm_frag_ptes(&params, start, last + 1,
1292 					   addr, flags);
1293 	}
1294 
1295 	ring = container_of(vm->entity.sched, struct amdgpu_ring, sched);
1296 
1297 	nptes = last - start + 1;
1298 
1299 	/*
1300 	 * reserve space for two commands every (1 << BLOCK_SIZE)
1301 	 * entries or 2k dwords (whatever is smaller)
1302 	 *
1303 	 * The second command is for the shadow pagetables.
1304 	 */
1305 	if (vm->root.base.bo->shadow)
1306 		ncmds = ((nptes >> min(adev->vm_manager.block_size, 11u)) + 1) * 2;
1307 	else
1308 		ncmds = ((nptes >> min(adev->vm_manager.block_size, 11u)) + 1);
1309 
1310 	/* padding, etc. */
1311 	ndw = 64;
1312 
1313 	if (pages_addr) {
1314 		/* copy commands needed */
1315 		ndw += ncmds * adev->vm_manager.vm_pte_funcs->copy_pte_num_dw;
1316 
1317 		/* and also PTEs */
1318 		ndw += nptes * 2;
1319 
1320 		params.func = amdgpu_vm_do_copy_ptes;
1321 
1322 	} else {
1323 		/* set page commands needed */
1324 		ndw += ncmds * 10;
1325 
1326 		/* extra commands for begin/end fragments */
1327 		ndw += 2 * 10 * adev->vm_manager.fragment_size;
1328 
1329 		params.func = amdgpu_vm_do_set_ptes;
1330 	}
1331 
1332 	r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
1333 	if (r)
1334 		return r;
1335 
1336 	params.ib = &job->ibs[0];
1337 
1338 	if (pages_addr) {
1339 		uint64_t *pte;
1340 		unsigned i;
1341 
1342 		/* Put the PTEs at the end of the IB. */
1343 		i = ndw - nptes * 2;
1344 		pte = (uint64_t *)&(job->ibs->ptr[i]);
1345 		params.src = job->ibs->gpu_addr + i * 4;
1346 
1347 		for (i = 0; i < nptes; ++i) {
1348 			pte[i] = amdgpu_vm_map_gart(pages_addr, addr + i *
1349 						    AMDGPU_GPU_PAGE_SIZE);
1350 			pte[i] |= flags;
1351 		}
1352 		addr = 0;
1353 	}
1354 
1355 	r = amdgpu_sync_fence(adev, &job->sync, exclusive, false);
1356 	if (r)
1357 		goto error_free;
1358 
1359 	r = amdgpu_sync_resv(adev, &job->sync, vm->root.base.bo->tbo.resv,
1360 			     owner, false);
1361 	if (r)
1362 		goto error_free;
1363 
1364 	r = reservation_object_reserve_shared(vm->root.base.bo->tbo.resv);
1365 	if (r)
1366 		goto error_free;
1367 
1368 	r = amdgpu_vm_frag_ptes(&params, start, last + 1, addr, flags);
1369 	if (r)
1370 		goto error_free;
1371 
1372 	amdgpu_ring_pad_ib(ring, params.ib);
1373 	WARN_ON(params.ib->length_dw > ndw);
1374 	r = amdgpu_job_submit(job, ring, &vm->entity,
1375 			      AMDGPU_FENCE_OWNER_VM, &f);
1376 	if (r)
1377 		goto error_free;
1378 
1379 	amdgpu_bo_fence(vm->root.base.bo, f, true);
1380 	dma_fence_put(*fence);
1381 	*fence = f;
1382 	return 0;
1383 
1384 error_free:
1385 	amdgpu_job_free(job);
1386 	return r;
1387 }
1388 
1389 /**
1390  * amdgpu_vm_bo_split_mapping - split a mapping into smaller chunks
1391  *
1392  * @adev: amdgpu_device pointer
1393  * @exclusive: fence we need to sync to
1394  * @pages_addr: DMA addresses to use for mapping
1395  * @vm: requested vm
1396  * @mapping: mapped range and flags to use for the update
1397  * @flags: HW flags for the mapping
1398  * @nodes: array of drm_mm_nodes with the MC addresses
1399  * @fence: optional resulting fence
1400  *
1401  * Split the mapping into smaller chunks so that each update fits
1402  * into a SDMA IB.
1403  * Returns 0 for success, -EINVAL for failure.
1404  */
1405 static int amdgpu_vm_bo_split_mapping(struct amdgpu_device *adev,
1406 				      struct dma_fence *exclusive,
1407 				      dma_addr_t *pages_addr,
1408 				      struct amdgpu_vm *vm,
1409 				      struct amdgpu_bo_va_mapping *mapping,
1410 				      uint64_t flags,
1411 				      struct drm_mm_node *nodes,
1412 				      struct dma_fence **fence)
1413 {
1414 	unsigned min_linear_pages = 1 << adev->vm_manager.fragment_size;
1415 	uint64_t pfn, start = mapping->start;
1416 	int r;
1417 
1418 	/* Normally, bo_va->flags only contains the READABLE and WRITEABLE bits,
1419 	 * but just in case, filter the flags here anyway
1420 	 */
1421 	if (!(mapping->flags & AMDGPU_PTE_READABLE))
1422 		flags &= ~AMDGPU_PTE_READABLE;
1423 	if (!(mapping->flags & AMDGPU_PTE_WRITEABLE))
1424 		flags &= ~AMDGPU_PTE_WRITEABLE;
1425 
1426 	flags &= ~AMDGPU_PTE_EXECUTABLE;
1427 	flags |= mapping->flags & AMDGPU_PTE_EXECUTABLE;
1428 
1429 	flags &= ~AMDGPU_PTE_MTYPE_MASK;
1430 	flags |= (mapping->flags & AMDGPU_PTE_MTYPE_MASK);
1431 
1432 	if ((mapping->flags & AMDGPU_PTE_PRT) &&
1433 	    (adev->asic_type >= CHIP_VEGA10)) {
1434 		flags |= AMDGPU_PTE_PRT;
1435 		flags &= ~AMDGPU_PTE_VALID;
1436 	}
1437 
1438 	trace_amdgpu_vm_bo_update(mapping);
1439 
1440 	pfn = mapping->offset >> PAGE_SHIFT;
1441 	if (nodes) {
1442 		while (pfn >= nodes->size) {
1443 			pfn -= nodes->size;
1444 			++nodes;
1445 		}
1446 	}
1447 
1448 	do {
1449 		dma_addr_t *dma_addr = NULL;
1450 		uint64_t max_entries;
1451 		uint64_t addr, last;
1452 
1453 		if (nodes) {
1454 			addr = nodes->start << PAGE_SHIFT;
1455 			max_entries = (nodes->size - pfn) *
1456 				(PAGE_SIZE / AMDGPU_GPU_PAGE_SIZE);
1457 		} else {
1458 			addr = 0;
1459 			max_entries = S64_MAX;
1460 		}
1461 
1462 		if (pages_addr) {
1463 			uint64_t count;
1464 
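			/*
			 * Count how many system pages are physically
			 * contiguous. Short runs are mapped through the DMA
			 * address array one page at a time, longer runs are
			 * mapped directly by their bus address.
			 */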
1465 			max_entries = min(max_entries, 16ull * 1024ull);
1466 			for (count = 1;
1467 			     count < max_entries / (PAGE_SIZE / AMDGPU_GPU_PAGE_SIZE);
1468 			     ++count) {
1469 				uint64_t idx = pfn + count;
1470 
1471 				if (pages_addr[idx] !=
1472 				    (pages_addr[idx - 1] + PAGE_SIZE))
1473 					break;
1474 			}
1475 
1476 			if (count < min_linear_pages) {
1477 				addr = pfn << PAGE_SHIFT;
1478 				dma_addr = pages_addr;
1479 			} else {
1480 				addr = pages_addr[pfn];
1481 				max_entries = count * (PAGE_SIZE / AMDGPU_GPU_PAGE_SIZE);
1482 			}
1483 
1484 		} else if (flags & AMDGPU_PTE_VALID) {
1485 			addr += adev->vm_manager.vram_base_offset;
1486 			addr += pfn << PAGE_SHIFT;
1487 		}
1488 
1489 		last = min((uint64_t)mapping->last, start + max_entries - 1);
1490 		r = amdgpu_vm_bo_update_mapping(adev, exclusive, dma_addr, vm,
1491 						start, last, flags, addr,
1492 						fence);
1493 		if (r)
1494 			return r;
1495 
1496 		pfn += (last - start + 1) / (PAGE_SIZE / AMDGPU_GPU_PAGE_SIZE);
1497 		if (nodes && nodes->size == pfn) {
1498 			pfn = 0;
1499 			++nodes;
1500 		}
1501 		start = last + 1;
1502 
1503 	} while (unlikely(start != mapping->last + 1));
1504 
1505 	return 0;
1506 }
1507 
1508 /**
1509  * amdgpu_vm_bo_update - update all BO mappings in the vm page table
1510  *
1511  * @adev: amdgpu_device pointer
1512  * @bo_va: requested BO and VM object
1513  * @clear: if true clear the entries
1514  *
1515  * Fill in the page table entries for @bo_va.
1516  * Returns 0 for success, -EINVAL for failure.
1517  */
1518 int amdgpu_vm_bo_update(struct amdgpu_device *adev,
1519 			struct amdgpu_bo_va *bo_va,
1520 			bool clear)
1521 {
1522 	struct amdgpu_bo *bo = bo_va->base.bo;
1523 	struct amdgpu_vm *vm = bo_va->base.vm;
1524 	struct amdgpu_bo_va_mapping *mapping;
1525 	dma_addr_t *pages_addr = NULL;
1526 	struct ttm_mem_reg *mem;
1527 	struct drm_mm_node *nodes;
1528 	struct dma_fence *exclusive, **last_update;
1529 	uint64_t flags;
1530 	int r;
1531 
1532 	if (clear || !bo_va->base.bo) {
1533 		mem = NULL;
1534 		nodes = NULL;
1535 		exclusive = NULL;
1536 	} else {
1537 		struct ttm_dma_tt *ttm;
1538 
1539 		mem = &bo_va->base.bo->tbo.mem;
1540 		nodes = mem->mm_node;
1541 		if (mem->mem_type == TTM_PL_TT) {
1542 			ttm = container_of(bo_va->base.bo->tbo.ttm,
1543 					   struct ttm_dma_tt, ttm);
1544 			pages_addr = ttm->dma_address;
1545 		}
1546 		exclusive = reservation_object_get_excl(bo->tbo.resv);
1547 	}
1548 
1549 	if (bo)
1550 		flags = amdgpu_ttm_tt_pte_flags(adev, bo->tbo.ttm, mem);
1551 	else
1552 		flags = 0x0;
1553 
1554 	if (clear || (bo && bo->tbo.resv == vm->root.base.bo->tbo.resv))
1555 		last_update = &vm->last_update;
1556 	else
1557 		last_update = &bo_va->last_pt_update;
1558 
1559 	if (!clear && bo_va->base.moved) {
1560 		bo_va->base.moved = false;
1561 		list_splice_init(&bo_va->valids, &bo_va->invalids);
1562 
1563 	} else if (bo_va->cleared != clear) {
1564 		list_splice_init(&bo_va->valids, &bo_va->invalids);
1565 	}
1566 
1567 	list_for_each_entry(mapping, &bo_va->invalids, list) {
1568 		r = amdgpu_vm_bo_split_mapping(adev, exclusive, pages_addr, vm,
1569 					       mapping, flags, nodes,
1570 					       last_update);
1571 		if (r)
1572 			return r;
1573 	}
1574 
1575 	if (vm->use_cpu_for_update) {
1576 		/* Flush HDP */
1577 		mb();
1578 		amdgpu_asic_flush_hdp(adev, NULL);
1579 	}
1580 
1581 	spin_lock(&vm->moved_lock);
1582 	list_del_init(&bo_va->base.vm_status);
1583 	spin_unlock(&vm->moved_lock);
1584 
1585 	/* If the BO is not in its preferred location add it back to
1586 	 * the evicted list so that it gets validated again on the
1587 	 * next command submission.
1588 	 */
1589 	if (bo && bo->tbo.resv == vm->root.base.bo->tbo.resv) {
1590 		uint32_t mem_type = bo->tbo.mem.mem_type;
1591 
1592 		if (!(bo->preferred_domains & amdgpu_mem_type_to_domain(mem_type)))
1593 			list_add_tail(&bo_va->base.vm_status, &vm->evicted);
1594 		else
1595 			list_add(&bo_va->base.vm_status, &vm->idle);
1596 	}
1597 
1598 	list_splice_init(&bo_va->invalids, &bo_va->valids);
1599 	bo_va->cleared = clear;
1600 
1601 	if (trace_amdgpu_vm_bo_mapping_enabled()) {
1602 		list_for_each_entry(mapping, &bo_va->valids, list)
1603 			trace_amdgpu_vm_bo_mapping(mapping);
1604 	}
1605 
1606 	return 0;
1607 }
1608 
1609 /**
1610  * amdgpu_vm_update_prt_state - update the global PRT state
1611  */
1612 static void amdgpu_vm_update_prt_state(struct amdgpu_device *adev)
1613 {
1614 	unsigned long flags;
1615 	bool enable;
1616 
1617 	spin_lock_irqsave(&adev->vm_manager.prt_lock, flags);
1618 	enable = !!atomic_read(&adev->vm_manager.num_prt_users);
1619 	adev->gmc.gmc_funcs->set_prt(adev, enable);
1620 	spin_unlock_irqrestore(&adev->vm_manager.prt_lock, flags);
1621 }
1622 
1623 /**
1624  * amdgpu_vm_prt_get - add a PRT user
1625  */
1626 static void amdgpu_vm_prt_get(struct amdgpu_device *adev)
1627 {
1628 	if (!adev->gmc.gmc_funcs->set_prt)
1629 		return;
1630 
1631 	if (atomic_inc_return(&adev->vm_manager.num_prt_users) == 1)
1632 		amdgpu_vm_update_prt_state(adev);
1633 }
1634 
1635 /**
1636  * amdgpu_vm_prt_put - drop a PRT user
1637  */
1638 static void amdgpu_vm_prt_put(struct amdgpu_device *adev)
1639 {
1640 	if (atomic_dec_return(&adev->vm_manager.num_prt_users) == 0)
1641 		amdgpu_vm_update_prt_state(adev);
1642 }
1643 
1644 /**
1645  * amdgpu_vm_prt_cb - callback for updating the PRT status
1646  */
1647 static void amdgpu_vm_prt_cb(struct dma_fence *fence, struct dma_fence_cb *_cb)
1648 {
1649 	struct amdgpu_prt_cb *cb = container_of(_cb, struct amdgpu_prt_cb, cb);
1650 
1651 	amdgpu_vm_prt_put(cb->adev);
1652 	kfree(cb);
1653 }
1654 
1655 /**
1656  * amdgpu_vm_add_prt_cb - add callback for updating the PRT status
1657  */
1658 static void amdgpu_vm_add_prt_cb(struct amdgpu_device *adev,
1659 				 struct dma_fence *fence)
1660 {
1661 	struct amdgpu_prt_cb *cb;
1662 
1663 	if (!adev->gmc.gmc_funcs->set_prt)
1664 		return;
1665 
1666 	cb = kmalloc(sizeof(struct amdgpu_prt_cb), GFP_KERNEL);
1667 	if (!cb) {
1668 		/* Last resort when we are OOM */
1669 		if (fence)
1670 			dma_fence_wait(fence, false);
1671 
1672 		amdgpu_vm_prt_put(adev);
1673 	} else {
1674 		cb->adev = adev;
1675 		if (!fence || dma_fence_add_callback(fence, &cb->cb,
1676 						     amdgpu_vm_prt_cb))
1677 			amdgpu_vm_prt_cb(fence, &cb->cb);
1678 	}
1679 }
1680 
1681 /**
1682  * amdgpu_vm_free_mapping - free a mapping
1683  *
1684  * @adev: amdgpu_device pointer
1685  * @vm: requested vm
1686  * @mapping: mapping to be freed
1687  * @fence: fence of the unmap operation
1688  *
1689  * Free a mapping and make sure we decrease the PRT usage count if applicable.
1690  */
1691 static void amdgpu_vm_free_mapping(struct amdgpu_device *adev,
1692 				   struct amdgpu_vm *vm,
1693 				   struct amdgpu_bo_va_mapping *mapping,
1694 				   struct dma_fence *fence)
1695 {
1696 	if (mapping->flags & AMDGPU_PTE_PRT)
1697 		amdgpu_vm_add_prt_cb(adev, fence);
1698 	kfree(mapping);
1699 }
1700 
1701 /**
1702  * amdgpu_vm_prt_fini - finish all prt mappings
1703  *
1704  * @adev: amdgpu_device pointer
1705  * @vm: requested vm
1706  *
1707  * Register a cleanup callback to disable PRT support after VM dies.
1708  */
1709 static void amdgpu_vm_prt_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
1710 {
1711 	struct reservation_object *resv = vm->root.base.bo->tbo.resv;
1712 	struct dma_fence *excl, **shared;
1713 	unsigned i, shared_count;
1714 	int r;
1715 
1716 	r = reservation_object_get_fences_rcu(resv, &excl,
1717 					      &shared_count, &shared);
1718 	if (r) {
1719 		/* Not enough memory to grab the fence list, as last resort
1720 		 * block for all the fences to complete.
1721 		 */
1722 		reservation_object_wait_timeout_rcu(resv, true, false,
1723 						    MAX_SCHEDULE_TIMEOUT);
1724 		return;
1725 	}
1726 
1727 	/* Add a callback for each fence in the reservation object */
1728 	amdgpu_vm_prt_get(adev);
1729 	amdgpu_vm_add_prt_cb(adev, excl);
1730 
1731 	for (i = 0; i < shared_count; ++i) {
1732 		amdgpu_vm_prt_get(adev);
1733 		amdgpu_vm_add_prt_cb(adev, shared[i]);
1734 	}
1735 
1736 	kfree(shared);
1737 }
1738 
1739 /**
1740  * amdgpu_vm_clear_freed - clear freed BOs in the PT
1741  *
1742  * @adev: amdgpu_device pointer
1743  * @vm: requested vm
1744  * @fence: optional resulting fence (unchanged if no work needed to be done
1745  * or if an error occurred)
1746  *
1747  * Make sure all freed BOs are cleared in the PT.
1748  * Returns 0 for success.
1749  *
1750  * PTs have to be reserved and mutex must be locked!
1751  */
1752 int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
1753 			  struct amdgpu_vm *vm,
1754 			  struct dma_fence **fence)
1755 {
1756 	struct amdgpu_bo_va_mapping *mapping;
1757 	uint64_t init_pte_value = 0;
1758 	struct dma_fence *f = NULL;
1759 	int r;
1760 
1761 	while (!list_empty(&vm->freed)) {
1762 		mapping = list_first_entry(&vm->freed,
1763 			struct amdgpu_bo_va_mapping, list);
1764 		list_del(&mapping->list);
1765 
1766 		if (vm->pte_support_ats && mapping->start < AMDGPU_VA_HOLE_START)
1767 			init_pte_value = AMDGPU_PTE_DEFAULT_ATC;
1768 
1769 		r = amdgpu_vm_bo_update_mapping(adev, NULL, NULL, vm,
1770 						mapping->start, mapping->last,
1771 						init_pte_value, 0, &f);
1772 		amdgpu_vm_free_mapping(adev, vm, mapping, f);
1773 		if (r) {
1774 			dma_fence_put(f);
1775 			return r;
1776 		}
1777 	}
1778 
1779 	if (fence && f) {
1780 		dma_fence_put(*fence);
1781 		*fence = f;
1782 	} else {
1783 		dma_fence_put(f);
1784 	}
1785 
1786 	return 0;
1787 
1788 }
1789 
1790 /**
1791  * amdgpu_vm_handle_moved - handle moved BOs in the PT
1792  *
1793  * @adev: amdgpu_device pointer
1794  * @vm: requested vm
1795  * @sync: sync object to add fences to
1796  *
1797  * Make sure all BOs which are moved are updated in the PTs.
1798  * Returns 0 for success.
1799  *
1800  * PTs have to be reserved!
1801  */
1802 int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
1803 			   struct amdgpu_vm *vm)
1804 {
1805 	struct amdgpu_bo_va *bo_va, *tmp;
1806 	struct list_head moved;
1807 	bool clear;
1808 	int r;
1809 
1810 	INIT_LIST_HEAD(&moved);
1811 	spin_lock(&vm->moved_lock);
1812 	list_splice_init(&vm->moved, &moved);
1813 	spin_unlock(&vm->moved_lock);
1814 
1815 	list_for_each_entry_safe(bo_va, tmp, &moved, base.vm_status) {
1816 		struct reservation_object *resv = bo_va->base.bo->tbo.resv;
1817 
1818 		/* Per VM BOs never need to be cleared in the page tables */
1819 		if (resv == vm->root.base.bo->tbo.resv)
1820 			clear = false;
1821 		/* Try to reserve the BO to avoid clearing its ptes */
1822 		else if (!amdgpu_vm_debug && reservation_object_trylock(resv))
1823 			clear = false;
1824 		/* Somebody else is using the BO right now */
1825 		else
1826 			clear = true;
1827 
1828 		r = amdgpu_vm_bo_update(adev, bo_va, clear);
1829 		if (r) {
1830 			spin_lock(&vm->moved_lock);
1831 			list_splice(&moved, &vm->moved);
1832 			spin_unlock(&vm->moved_lock);
1833 			return r;
1834 		}
1835 
1836 		if (!clear && resv != vm->root.base.bo->tbo.resv)
1837 			reservation_object_unlock(resv);
1838 
1839 	}
1840 
1841 	return 0;
1842 }
1843 
1844 /**
1845  * amdgpu_vm_bo_add - add a bo to a specific vm
1846  *
1847  * @adev: amdgpu_device pointer
1848  * @vm: requested vm
1849  * @bo: amdgpu buffer object
1850  *
1851  * Add @bo into the requested vm and to the list of BOs associated
1852  * with that vm.
1853  * Returns the newly added bo_va or NULL on failure
1854  *
1855  * Object has to be reserved!
1856  */
1857 struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev,
1858 				      struct amdgpu_vm *vm,
1859 				      struct amdgpu_bo *bo)
1860 {
1861 	struct amdgpu_bo_va *bo_va;
1862 
1863 	bo_va = kzalloc(sizeof(struct amdgpu_bo_va), GFP_KERNEL);
1864 	if (bo_va == NULL)
1865 		return NULL;
1867 	amdgpu_vm_bo_base_init(&bo_va->base, vm, bo);
1868 
1869 	bo_va->ref_count = 1;
1870 	INIT_LIST_HEAD(&bo_va->valids);
1871 	INIT_LIST_HEAD(&bo_va->invalids);
1872 
1873 	return bo_va;
1874 }
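
/*
 * Usage sketch (illustrative): with @bo reserved, a bo_va is created once per
 * BO and VM pair and then reused for all mappings of that BO in the VM.
 *
 *	struct amdgpu_bo_va *bo_va;
 *
 *	bo_va = amdgpu_vm_bo_add(adev, vm, bo);
 *	if (!bo_va)
 *		return -ENOMEM;
 */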
1875 
1876 
1877 /**
1878  * amdgpu_vm_bo_insert_map - insert a new mapping
1879  *
1880  * @adev: amdgpu_device pointer
1881  * @bo_va: bo_va to store the address
1882  * @mapping: the mapping to insert
1883  *
1884  * Insert a new mapping into all structures.
1885  */
1886 static void amdgpu_vm_bo_insert_map(struct amdgpu_device *adev,
1887 				    struct amdgpu_bo_va *bo_va,
1888 				    struct amdgpu_bo_va_mapping *mapping)
1889 {
1890 	struct amdgpu_vm *vm = bo_va->base.vm;
1891 	struct amdgpu_bo *bo = bo_va->base.bo;
1892 
1893 	mapping->bo_va = bo_va;
1894 	list_add(&mapping->list, &bo_va->invalids);
1895 	amdgpu_vm_it_insert(mapping, &vm->va);
1896 
1897 	if (mapping->flags & AMDGPU_PTE_PRT)
1898 		amdgpu_vm_prt_get(adev);
1899 
1900 	if (bo && bo->tbo.resv == vm->root.base.bo->tbo.resv &&
1901 	    !bo_va->base.moved) {
1902 		spin_lock(&vm->moved_lock);
1903 		list_move(&bo_va->base.vm_status, &vm->moved);
1904 		spin_unlock(&vm->moved_lock);
1905 	}
1906 	trace_amdgpu_vm_bo_map(bo_va, mapping);
1907 }
1908 
1909 /**
1910  * amdgpu_vm_bo_map - map bo inside a vm
1911  *
1912  * @adev: amdgpu_device pointer
1913  * @bo_va: bo_va to store the address
1914  * @saddr: where to map the BO
1915  * @offset: requested offset in the BO
 * @size: size of the mapping in bytes
1916  * @flags: attributes of pages (read/write/valid/etc.)
1917  *
1918  * Add a mapping of the BO at the specified addr into the VM.
1919  * Returns 0 for success, error for failure.
1920  *
1921  * Object has to be reserved and unreserved outside!
1922  */
1923 int amdgpu_vm_bo_map(struct amdgpu_device *adev,
1924 		     struct amdgpu_bo_va *bo_va,
1925 		     uint64_t saddr, uint64_t offset,
1926 		     uint64_t size, uint64_t flags)
1927 {
1928 	struct amdgpu_bo_va_mapping *mapping, *tmp;
1929 	struct amdgpu_bo *bo = bo_va->base.bo;
1930 	struct amdgpu_vm *vm = bo_va->base.vm;
1931 	uint64_t eaddr;
1932 
1933 	/* validate the parameters */
1934 	if (saddr & AMDGPU_GPU_PAGE_MASK || offset & AMDGPU_GPU_PAGE_MASK ||
1935 	    size == 0 || size & AMDGPU_GPU_PAGE_MASK)
1936 		return -EINVAL;
1937 
1938 	/* make sure object fits at this offset */
1939 	eaddr = saddr + size - 1;
1940 	if (saddr >= eaddr ||
1941 	    (bo && offset + size > amdgpu_bo_size(bo)))
1942 		return -EINVAL;
1943 
1944 	saddr /= AMDGPU_GPU_PAGE_SIZE;
1945 	eaddr /= AMDGPU_GPU_PAGE_SIZE;
1946 
1947 	tmp = amdgpu_vm_it_iter_first(&vm->va, saddr, eaddr);
1948 	if (tmp) {
1949 		/* bo and tmp overlap, invalid addr */
1950 		dev_err(adev->dev, "bo %p va 0x%010Lx-0x%010Lx conflict with "
1951 			"0x%010Lx-0x%010Lx\n", bo, saddr, eaddr,
1952 			tmp->start, tmp->last + 1);
1953 		return -EINVAL;
1954 	}
1955 
1956 	mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
1957 	if (!mapping)
1958 		return -ENOMEM;
1959 
1960 	mapping->start = saddr;
1961 	mapping->last = eaddr;
1962 	mapping->offset = offset;
1963 	mapping->flags = flags;
1964 
1965 	amdgpu_vm_bo_insert_map(adev, bo_va, mapping);
1966 
1967 	return 0;
1968 }
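
/*
 * Usage sketch (illustrative): map the first megabyte of a BO read/write at a
 * fixed GPU virtual address.  Address, offset and size must be GPU page
 * aligned, matching the checks above; the VA value is a placeholder and the
 * BO is assumed to be at least 1 MB in size.
 *
 *	r = amdgpu_vm_bo_map(adev, bo_va, 0x100000000ULL, 0, 0x100000,
 *			     AMDGPU_PTE_READABLE | AMDGPU_PTE_WRITEABLE);
 *	if (r == -EINVAL)
 *		(the range is misaligned or overlaps an existing mapping)
 */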
1969 
1970 /**
1971  * amdgpu_vm_bo_replace_map - map bo inside a vm, replacing existing mappings
1972  *
1973  * @adev: amdgpu_device pointer
1974  * @bo_va: bo_va to store the address
1975  * @saddr: where to map the BO
1976  * @offset: requested offset in the BO
 * @size: size of the mapping in bytes
1977  * @flags: attributes of pages (read/write/valid/etc.)
1978  *
1979  * Add a mapping of the BO at the specified addr into the VM. Replace existing
1980  * mappings as we do so.
1981  * Returns 0 for success, error for failure.
1982  *
1983  * Object has to be reserved and unreserved outside!
1984  */
1985 int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev,
1986 			     struct amdgpu_bo_va *bo_va,
1987 			     uint64_t saddr, uint64_t offset,
1988 			     uint64_t size, uint64_t flags)
1989 {
1990 	struct amdgpu_bo_va_mapping *mapping;
1991 	struct amdgpu_bo *bo = bo_va->base.bo;
1992 	uint64_t eaddr;
1993 	int r;
1994 
1995 	/* validate the parameters */
1996 	if (saddr & AMDGPU_GPU_PAGE_MASK || offset & AMDGPU_GPU_PAGE_MASK ||
1997 	    size == 0 || size & AMDGPU_GPU_PAGE_MASK)
1998 		return -EINVAL;
1999 
2000 	/* make sure object fits at this offset */
2001 	eaddr = saddr + size - 1;
2002 	if (saddr >= eaddr ||
2003 	    (bo && offset + size > amdgpu_bo_size(bo)))
2004 		return -EINVAL;
2005 
2006 	/* Allocate all the needed memory */
2007 	mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
2008 	if (!mapping)
2009 		return -ENOMEM;
2010 
2011 	r = amdgpu_vm_bo_clear_mappings(adev, bo_va->base.vm, saddr, size);
2012 	if (r) {
2013 		kfree(mapping);
2014 		return r;
2015 	}
2016 
2017 	saddr /= AMDGPU_GPU_PAGE_SIZE;
2018 	eaddr /= AMDGPU_GPU_PAGE_SIZE;
2019 
2020 	mapping->start = saddr;
2021 	mapping->last = eaddr;
2022 	mapping->offset = offset;
2023 	mapping->flags = flags;
2024 
2025 	amdgpu_vm_bo_insert_map(adev, bo_va, mapping);
2026 
2027 	return 0;
2028 }
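
/*
 * Usage sketch (illustrative): same arguments as amdgpu_vm_bo_map(), but any
 * mapping already covering the range is clipped away first via
 * amdgpu_vm_bo_clear_mappings(), so an overlap does not cause -EINVAL.
 * addr and size are caller-provided, page-aligned placeholders.
 *
 *	r = amdgpu_vm_bo_replace_map(adev, bo_va, addr, 0, size,
 *				     AMDGPU_PTE_READABLE);
 */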
2029 
2030 /**
2031  * amdgpu_vm_bo_unmap - remove bo mapping from vm
2032  *
2033  * @adev: amdgpu_device pointer
2034  * @bo_va: bo_va to remove the address from
2035  * @saddr: where the BO is mapped
2036  *
2037  * Remove a mapping of the BO at the specified addr from the VM.
2038  * Returns 0 for success, error for failure.
2039  *
2040  * Object has to be reserved and unreserved outside!
2041  */
2042 int amdgpu_vm_bo_unmap(struct amdgpu_device *adev,
2043 		       struct amdgpu_bo_va *bo_va,
2044 		       uint64_t saddr)
2045 {
2046 	struct amdgpu_bo_va_mapping *mapping;
2047 	struct amdgpu_vm *vm = bo_va->base.vm;
2048 	bool valid = true;
2049 
2050 	saddr /= AMDGPU_GPU_PAGE_SIZE;
2051 
2052 	list_for_each_entry(mapping, &bo_va->valids, list) {
2053 		if (mapping->start == saddr)
2054 			break;
2055 	}
2056 
2057 	if (&mapping->list == &bo_va->valids) {
2058 		valid = false;
2059 
2060 		list_for_each_entry(mapping, &bo_va->invalids, list) {
2061 			if (mapping->start == saddr)
2062 				break;
2063 		}
2064 
2065 		if (&mapping->list == &bo_va->invalids)
2066 			return -ENOENT;
2067 	}
2068 
2069 	list_del(&mapping->list);
2070 	amdgpu_vm_it_remove(mapping, &vm->va);
2071 	mapping->bo_va = NULL;
2072 	trace_amdgpu_vm_bo_unmap(bo_va, mapping);
2073 
2074 	if (valid)
2075 		list_add(&mapping->list, &vm->freed);
2076 	else
2077 		amdgpu_vm_free_mapping(adev, vm, mapping,
2078 				       bo_va->last_pt_update);
2079 
2080 	return 0;
2081 }
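
/*
 * Usage sketch (illustrative): unmapping uses the same start address that was
 * passed to amdgpu_vm_bo_map(); the PTEs themselves are only cleared later by
 * amdgpu_vm_clear_freed().
 *
 *	r = amdgpu_vm_bo_unmap(adev, bo_va, addr);
 *	if (r == -ENOENT)
 *		(no mapping of this bo_va starts at addr)
 */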
2082 
2083 /**
2084  * amdgpu_vm_bo_clear_mappings - remove all mappings in a specific range
2085  *
2086  * @adev: amdgpu_device pointer
2087  * @vm: VM structure to use
2088  * @saddr: start of the range
2089  * @size: size of the range
2090  *
2091  * Remove all mappings in a range, split them as appropriate.
2092  * Returns 0 for success, error for failure.
2093  */
2094 int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev,
2095 				struct amdgpu_vm *vm,
2096 				uint64_t saddr, uint64_t size)
2097 {
2098 	struct amdgpu_bo_va_mapping *before, *after, *tmp, *next;
2099 	LIST_HEAD(removed);
2100 	uint64_t eaddr;
2101 
2102 	eaddr = saddr + size - 1;
2103 	saddr /= AMDGPU_GPU_PAGE_SIZE;
2104 	eaddr /= AMDGPU_GPU_PAGE_SIZE;
2105 
2106 	/* Allocate all the needed memory */
2107 	before = kzalloc(sizeof(*before), GFP_KERNEL);
2108 	if (!before)
2109 		return -ENOMEM;
2110 	INIT_LIST_HEAD(&before->list);
2111 
2112 	after = kzalloc(sizeof(*after), GFP_KERNEL);
2113 	if (!after) {
2114 		kfree(before);
2115 		return -ENOMEM;
2116 	}
2117 	INIT_LIST_HEAD(&after->list);
2118 
2119 	/* Now gather all removed mappings */
2120 	tmp = amdgpu_vm_it_iter_first(&vm->va, saddr, eaddr);
2121 	while (tmp) {
2122 		/* Remember mapping split at the start */
2123 		if (tmp->start < saddr) {
2124 			before->start = tmp->start;
2125 			before->last = saddr - 1;
2126 			before->offset = tmp->offset;
2127 			before->flags = tmp->flags;
2128 			before->bo_va = tmp->bo_va;
2129 			list_add(&before->list, &tmp->bo_va->invalids);
2130 		}
2131 
2132 		/* Remember mapping split at the end */
2133 		if (tmp->last > eaddr) {
2134 			after->start = eaddr + 1;
2135 			after->last = tmp->last;
2136 			after->offset = tmp->offset;
2137 			after->offset += after->start - tmp->start;
2138 			after->flags = tmp->flags;
2139 			after->bo_va = tmp->bo_va;
2140 			list_add(&after->list, &tmp->bo_va->invalids);
2141 		}
2142 
2143 		list_del(&tmp->list);
2144 		list_add(&tmp->list, &removed);
2145 
2146 		tmp = amdgpu_vm_it_iter_next(tmp, saddr, eaddr);
2147 	}
2148 
2149 	/* And free them up */
2150 	list_for_each_entry_safe(tmp, next, &removed, list) {
2151 		amdgpu_vm_it_remove(tmp, &vm->va);
2152 		list_del(&tmp->list);
2153 
2154 		if (tmp->start < saddr)
2155 			tmp->start = saddr;
2156 		if (tmp->last > eaddr)
2157 			tmp->last = eaddr;
2158 
2159 		tmp->bo_va = NULL;
2160 		list_add(&tmp->list, &vm->freed);
2161 		trace_amdgpu_vm_bo_unmap(NULL, tmp);
2162 	}
2163 
2164 	/* Insert partial mapping before the range */
2165 	if (!list_empty(&before->list)) {
2166 		amdgpu_vm_it_insert(before, &vm->va);
2167 		if (before->flags & AMDGPU_PTE_PRT)
2168 			amdgpu_vm_prt_get(adev);
2169 	} else {
2170 		kfree(before);
2171 	}
2172 
2173 	/* Insert partial mapping after the range */
2174 	if (!list_empty(&after->list)) {
2175 		amdgpu_vm_it_insert(after, &vm->va);
2176 		if (after->flags & AMDGPU_PTE_PRT)
2177 			amdgpu_vm_prt_get(adev);
2178 	} else {
2179 		kfree(after);
2180 	}
2181 
2182 	return 0;
2183 }
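
/*
 * Example (illustrative): if a single mapping covers GPU pages [0x100, 0x1ff]
 * and the byte range corresponding to pages [0x140, 0x17f] is cleared, the
 * overlapping middle part lands on the freed list while two new mappings for
 * [0x100, 0x13f] and [0x180, 0x1ff] keep the untouched ends alive:
 *
 *	r = amdgpu_vm_bo_clear_mappings(adev, vm,
 *					0x140ULL * AMDGPU_GPU_PAGE_SIZE,
 *					0x40ULL * AMDGPU_GPU_PAGE_SIZE);
 */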
2184 
2185 /**
2186  * amdgpu_vm_bo_lookup_mapping - find mapping by address
2187  *
2188  * @vm: the requested VM
 * @addr: the address to look up (in GPU pages)
2189  *
2190  * Find a mapping by its address.
2191  */
2192 struct amdgpu_bo_va_mapping *amdgpu_vm_bo_lookup_mapping(struct amdgpu_vm *vm,
2193 							 uint64_t addr)
2194 {
2195 	return amdgpu_vm_it_iter_first(&vm->va, addr, addr);
2196 }
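
/*
 * Usage sketch (illustrative): the lookup works on GPU page numbers, so a
 * byte address has to be shifted down first.
 *
 *	mapping = amdgpu_vm_bo_lookup_mapping(vm,
 *					      addr >> AMDGPU_GPU_PAGE_SHIFT);
 *	if (mapping && mapping->bo_va)
 *		bo = mapping->bo_va->base.bo;
 */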
2197 
2198 /**
2199  * amdgpu_vm_bo_rmv - remove a bo from a specific vm
2200  *
2201  * @adev: amdgpu_device pointer
2202  * @bo_va: requested bo_va
2203  *
2204  * Remove @bo_va->bo from the requested vm.
2205  *
2206  * Object has to be reserved!
2207  */
2208 void amdgpu_vm_bo_rmv(struct amdgpu_device *adev,
2209 		      struct amdgpu_bo_va *bo_va)
2210 {
2211 	struct amdgpu_bo_va_mapping *mapping, *next;
2212 	struct amdgpu_vm *vm = bo_va->base.vm;
2213 
2214 	list_del(&bo_va->base.bo_list);
2215 
2216 	spin_lock(&vm->moved_lock);
2217 	list_del(&bo_va->base.vm_status);
2218 	spin_unlock(&vm->moved_lock);
2219 
2220 	list_for_each_entry_safe(mapping, next, &bo_va->valids, list) {
2221 		list_del(&mapping->list);
2222 		amdgpu_vm_it_remove(mapping, &vm->va);
2223 		mapping->bo_va = NULL;
2224 		trace_amdgpu_vm_bo_unmap(bo_va, mapping);
2225 		list_add(&mapping->list, &vm->freed);
2226 	}
2227 	list_for_each_entry_safe(mapping, next, &bo_va->invalids, list) {
2228 		list_del(&mapping->list);
2229 		amdgpu_vm_it_remove(mapping, &vm->va);
2230 		amdgpu_vm_free_mapping(adev, vm, mapping,
2231 				       bo_va->last_pt_update);
2232 	}
2233 
2234 	dma_fence_put(bo_va->last_pt_update);
2235 	kfree(bo_va);
2236 }
2237 
2238 /**
2239  * amdgpu_vm_bo_invalidate - mark the bo as invalid
2240  *
2241  * @adev: amdgpu_device pointer
2242  * @bo: amdgpu buffer object
2243  * @evicted: is the BO evicted
2244  *
2245  * Mark @bo as invalid.
2246  */
2247 void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
2248 			     struct amdgpu_bo *bo, bool evicted)
2249 {
2250 	struct amdgpu_vm_bo_base *bo_base;
2251 
2252 	/* shadow bo doesn't have bo base, its validation needs its parent */
2253 	if (bo->parent && bo->parent->shadow == bo)
2254 		bo = bo->parent;
2255 
2256 	list_for_each_entry(bo_base, &bo->va, bo_list) {
2257 		struct amdgpu_vm *vm = bo_base->vm;
2258 		bool was_moved = bo_base->moved;
2259 
2260 		bo_base->moved = true;
2261 		if (evicted && bo->tbo.resv == vm->root.base.bo->tbo.resv) {
2262 			if (bo->tbo.type == ttm_bo_type_kernel)
2263 				list_move(&bo_base->vm_status, &vm->evicted);
2264 			else
2265 				list_move_tail(&bo_base->vm_status,
2266 					       &vm->evicted);
2267 			continue;
2268 		}
2269 
2270 		if (was_moved)
2271 			continue;
2272 
2273 		if (bo->tbo.type == ttm_bo_type_kernel) {
2274 			list_move(&bo_base->vm_status, &vm->relocated);
2275 		} else {
2276 			spin_lock(&bo_base->vm->moved_lock);
2277 			list_move(&bo_base->vm_status, &vm->moved);
2278 			spin_unlock(&bo_base->vm->moved_lock);
2279 		}
2280 	}
2281 }
2282 
2283 static uint32_t amdgpu_vm_get_block_size(uint64_t vm_size)
2284 {
2285 	/* Total bits covered by PD + PTs */
2286 	unsigned bits = ilog2(vm_size) + 18;
2287 
2288 	/* Make sure the PD is 4K in size up to 8GB address space.
2289 	   Above that, split equally between PD and PTs */
2290 	if (vm_size <= 8)
2291 		return (bits - 9);
2292 	else
2293 		return ((bits + 3) / 2);
2294 }
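
/*
 * Worked example: for a 64 GB address space, bits = ilog2(64) + 18 = 24.
 * Since 64 > 8 the bits are split between PD and PTs, giving a block size of
 * (24 + 3) / 2 = 13, i.e. 8K PTEs per page table and 11 bits left for the PD.
 */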
2295 
2296 /**
2297  * amdgpu_vm_adjust_size - adjust vm size, block size and fragment size
2298  *
2299  * @adev: amdgpu_device pointer
2300  * @vm_size: the default VM size if it's set auto
 * @fragment_size_default: Default PTE fragment size
 * @max_level: max VMPT level
 * @max_bits: max address space size in bits
2301  */
2302 void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t vm_size,
2303 			   uint32_t fragment_size_default, unsigned max_level,
2304 			   unsigned max_bits)
2305 {
2306 	uint64_t tmp;
2307 
2308 	/* adjust vm size first */
2309 	if (amdgpu_vm_size != -1) {
2310 		unsigned max_size = 1 << (max_bits - 30);
2311 
2312 		vm_size = amdgpu_vm_size;
2313 		if (vm_size > max_size) {
2314 			dev_warn(adev->dev, "VM size (%d) too large, max is %u GB\n",
2315 				 amdgpu_vm_size, max_size);
2316 			vm_size = max_size;
2317 		}
2318 	}
2319 
2320 	adev->vm_manager.max_pfn = (uint64_t)vm_size << 18;
2321 
2322 	tmp = roundup_pow_of_two(adev->vm_manager.max_pfn);
2323 	if (amdgpu_vm_block_size != -1)
2324 		tmp >>= amdgpu_vm_block_size - 9;
2325 	tmp = DIV_ROUND_UP(fls64(tmp) - 1, 9) - 1;
2326 	adev->vm_manager.num_level = min(max_level, (unsigned)tmp);
2327 	switch (adev->vm_manager.num_level) {
2328 	case 3:
2329 		adev->vm_manager.root_level = AMDGPU_VM_PDB2;
2330 		break;
2331 	case 2:
2332 		adev->vm_manager.root_level = AMDGPU_VM_PDB1;
2333 		break;
2334 	case 1:
2335 		adev->vm_manager.root_level = AMDGPU_VM_PDB0;
2336 		break;
2337 	default:
2338 		dev_err(adev->dev, "VMPT only supports 2~4+1 levels\n");
2339 	}
2340 	/* block size depends on vm size and hw setup */
2341 	if (amdgpu_vm_block_size != -1)
2342 		adev->vm_manager.block_size =
2343 			min((unsigned)amdgpu_vm_block_size, max_bits
2344 			    - AMDGPU_GPU_PAGE_SHIFT
2345 			    - 9 * adev->vm_manager.num_level);
2346 	else if (adev->vm_manager.num_level > 1)
2347 		adev->vm_manager.block_size = 9;
2348 	else
2349 		adev->vm_manager.block_size = amdgpu_vm_get_block_size(tmp);
2350 
2351 	if (amdgpu_vm_fragment_size == -1)
2352 		adev->vm_manager.fragment_size = fragment_size_default;
2353 	else
2354 		adev->vm_manager.fragment_size = amdgpu_vm_fragment_size;
2355 
2356 	DRM_INFO("vm size is %u GB, %u levels, block size is %u-bit, fragment size is %u-bit\n",
2357 		 vm_size, adev->vm_manager.num_level + 1,
2358 		 adev->vm_manager.block_size,
2359 		 adev->vm_manager.fragment_size);
2360 }
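
/*
 * Worked example (one possible configuration, assuming the module parameters
 * are left at their defaults and max_level >= 2): vm_size = 256 GB gives
 * max_pfn = 256 << 18 = 2^26 pages; fls64(2^26) - 1 = 26 and
 * DIV_ROUND_UP(26, 9) - 1 = 2, so num_level is 2, the root level becomes
 * AMDGPU_VM_PDB1 and the block size defaults to 9 bits.
 */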
2361 
2362 /**
2363  * amdgpu_vm_init - initialize a vm instance
2364  *
2365  * @adev: amdgpu_device pointer
2366  * @vm: requested vm
2367  * @vm_context: Indicates if it is a GFX or Compute context
 * @pasid: Process address space identifier
2368  *
2369  * Init @vm fields.
2370  */
2371 int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
2372 		   int vm_context, unsigned int pasid)
2373 {
2374 	struct amdgpu_bo_param bp;
2375 	struct amdgpu_bo *root;
2376 	const unsigned align = min(AMDGPU_VM_PTB_ALIGN_SIZE,
2377 		AMDGPU_VM_PTE_COUNT(adev) * 8);
2378 	unsigned ring_instance;
2379 	struct amdgpu_ring *ring;
2380 	struct drm_sched_rq *rq;
2381 	unsigned long size;
2382 	uint64_t flags;
2383 	int r, i;
2384 
2385 	vm->va = RB_ROOT_CACHED;
2386 	for (i = 0; i < AMDGPU_MAX_VMHUBS; i++)
2387 		vm->reserved_vmid[i] = NULL;
2388 	INIT_LIST_HEAD(&vm->evicted);
2389 	INIT_LIST_HEAD(&vm->relocated);
2390 	spin_lock_init(&vm->moved_lock);
2391 	INIT_LIST_HEAD(&vm->moved);
2392 	INIT_LIST_HEAD(&vm->idle);
2393 	INIT_LIST_HEAD(&vm->freed);
2394 
2395 	/* create scheduler entity for page table updates */
2396 
2397 	ring_instance = atomic_inc_return(&adev->vm_manager.vm_pte_next_ring);
2398 	ring_instance %= adev->vm_manager.vm_pte_num_rings;
2399 	ring = adev->vm_manager.vm_pte_rings[ring_instance];
2400 	rq = &ring->sched.sched_rq[DRM_SCHED_PRIORITY_KERNEL];
2401 	r = drm_sched_entity_init(&ring->sched, &vm->entity,
2402 				  rq, NULL);
2403 	if (r)
2404 		return r;
2405 
2406 	vm->pte_support_ats = false;
2407 
2408 	if (vm_context == AMDGPU_VM_CONTEXT_COMPUTE) {
2409 		vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
2410 						AMDGPU_VM_USE_CPU_FOR_COMPUTE);
2411 
2412 		if (adev->asic_type == CHIP_RAVEN)
2413 			vm->pte_support_ats = true;
2414 	} else {
2415 		vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
2416 						AMDGPU_VM_USE_CPU_FOR_GFX);
2417 	}
2418 	DRM_DEBUG_DRIVER("VM update mode is %s\n",
2419 			 vm->use_cpu_for_update ? "CPU" : "SDMA");
2420 	WARN_ONCE((vm->use_cpu_for_update & !amdgpu_vm_is_large_bar(adev)),
2421 		  "CPU update of VM recommended only for large BAR system\n");
2422 	vm->last_update = NULL;
2423 
2424 	flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
2425 	if (vm->use_cpu_for_update)
2426 		flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
2427 	else
2428 		flags |= AMDGPU_GEM_CREATE_SHADOW;
2429 
2430 	size = amdgpu_vm_bo_size(adev, adev->vm_manager.root_level);
2431 	memset(&bp, 0, sizeof(bp));
2432 	bp.size = size;
2433 	bp.byte_align = align;
2434 	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
2435 	bp.flags = flags;
2436 	bp.type = ttm_bo_type_kernel;
2437 	bp.resv = NULL;
2438 	r = amdgpu_bo_create(adev, &bp, &root);
2439 	if (r)
2440 		goto error_free_sched_entity;
2441 
2442 	r = amdgpu_bo_reserve(root, true);
2443 	if (r)
2444 		goto error_free_root;
2445 
2446 	r = amdgpu_vm_clear_bo(adev, vm, root,
2447 			       adev->vm_manager.root_level,
2448 			       vm->pte_support_ats);
2449 	if (r)
2450 		goto error_unreserve;
2451 
2452 	amdgpu_vm_bo_base_init(&vm->root.base, vm, root);
2453 	amdgpu_bo_unreserve(vm->root.base.bo);
2454 
2455 	if (pasid) {
2456 		unsigned long flags;
2457 
2458 		spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
2459 		r = idr_alloc(&adev->vm_manager.pasid_idr, vm, pasid, pasid + 1,
2460 			      GFP_ATOMIC);
2461 		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
2462 		if (r < 0)
2463 			goto error_free_root;
2464 
2465 		vm->pasid = pasid;
2466 	}
2467 
2468 	INIT_KFIFO(vm->faults);
2469 	vm->fault_credit = 16;
2470 
2471 	return 0;
2472 
2473 error_unreserve:
2474 	amdgpu_bo_unreserve(vm->root.base.bo);
2475 
2476 error_free_root:
2477 	amdgpu_bo_unref(&vm->root.base.bo->shadow);
2478 	amdgpu_bo_unref(&vm->root.base.bo);
2479 	vm->root.base.bo = NULL;
2480 
2481 error_free_sched_entity:
2482 	drm_sched_entity_fini(&ring->sched, &vm->entity);
2483 
2484 	return r;
2485 }
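
/*
 * Usage sketch (illustrative): creating a plain GFX VM without a PASID, e.g.
 * for a new file descriptor; fpriv is a placeholder for wherever the caller
 * keeps its per-file VM.
 *
 *	r = amdgpu_vm_init(adev, &fpriv->vm, AMDGPU_VM_CONTEXT_GFX, 0);
 *	if (r)
 *		return r;
 */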
2486 
2487 /**
2488  * amdgpu_vm_make_compute - Turn a GFX VM into a compute VM
 *
 * @adev: amdgpu_device pointer
 * @vm: requested vm
2489  *
2490  * This only works on GFX VMs that don't have any BOs added and no
2491  * page tables allocated yet.
2492  *
2493  * Changes the following VM parameters:
2494  * - use_cpu_for_update
2495  * - pte_support_ats
2496  * - pasid (old PASID is released, because compute manages its own PASIDs)
2497  *
2498  * Reinitializes the page directory to reflect the changed ATS
2499  * setting. May leave behind an unused shadow BO for the page
2500  * directory when switching from SDMA updates to CPU updates.
2501  *
2502  * Returns 0 for success, -errno for errors.
2503  */
2504 int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm)
2505 {
2506 	bool pte_support_ats = (adev->asic_type == CHIP_RAVEN);
2507 	int r;
2508 
2509 	r = amdgpu_bo_reserve(vm->root.base.bo, true);
2510 	if (r)
2511 		return r;
2512 
2513 	/* Sanity checks */
2514 	if (!RB_EMPTY_ROOT(&vm->va.rb_root) || vm->root.entries) {
2515 		r = -EINVAL;
2516 		goto error;
2517 	}
2518 
2519 	/* Check if PD needs to be reinitialized and do it before
2520 	 * changing any other state, in case it fails.
2521 	 */
2522 	if (pte_support_ats != vm->pte_support_ats) {
2523 		r = amdgpu_vm_clear_bo(adev, vm, vm->root.base.bo,
2524 			       adev->vm_manager.root_level,
2525 			       pte_support_ats);
2526 		if (r)
2527 			goto error;
2528 	}
2529 
2530 	/* Update VM state */
2531 	vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
2532 				    AMDGPU_VM_USE_CPU_FOR_COMPUTE);
2533 	vm->pte_support_ats = pte_support_ats;
2534 	DRM_DEBUG_DRIVER("VM update mode is %s\n",
2535 			 vm->use_cpu_for_update ? "CPU" : "SDMA");
2536 	WARN_ONCE((vm->use_cpu_for_update & !amdgpu_vm_is_large_bar(adev)),
2537 		  "CPU update of VM recommended only for large BAR system\n");
2538 
2539 	if (vm->pasid) {
2540 		unsigned long flags;
2541 
2542 		spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
2543 		idr_remove(&adev->vm_manager.pasid_idr, vm->pasid);
2544 		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
2545 
2546 		vm->pasid = 0;
2547 	}
2548 
2549 error:
2550 	amdgpu_bo_unreserve(vm->root.base.bo);
2551 	return r;
2552 }
2553 
2554 /**
2555  * amdgpu_vm_free_levels - free PD/PT levels
2556  *
2557  * @adev: amdgpu device structure
2558  * @parent: PD/PT starting level to free
2559  * @level: level of parent structure
2560  *
2561  * Free the page directory or page table level and all sub levels.
2562  */
2563 static void amdgpu_vm_free_levels(struct amdgpu_device *adev,
2564 				  struct amdgpu_vm_pt *parent,
2565 				  unsigned level)
2566 {
2567 	unsigned i, num_entries = amdgpu_vm_num_entries(adev, level);
2568 
2569 	if (parent->base.bo) {
2570 		list_del(&parent->base.bo_list);
2571 		list_del(&parent->base.vm_status);
2572 		amdgpu_bo_unref(&parent->base.bo->shadow);
2573 		amdgpu_bo_unref(&parent->base.bo);
2574 	}
2575 
2576 	if (parent->entries)
2577 		for (i = 0; i < num_entries; i++)
2578 			amdgpu_vm_free_levels(adev, &parent->entries[i],
2579 					      level + 1);
2580 
2581 	kvfree(parent->entries);
2582 }
2583 
2584 /**
2585  * amdgpu_vm_fini - tear down a vm instance
2586  *
2587  * @adev: amdgpu_device pointer
2588  * @vm: requested vm
2589  *
2590  * Tear down @vm.
2591  * Unbind the VM and remove all bos from the vm bo list
2592  */
2593 void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
2594 {
2595 	struct amdgpu_bo_va_mapping *mapping, *tmp;
2596 	bool prt_fini_needed = !!adev->gmc.gmc_funcs->set_prt;
2597 	struct amdgpu_bo *root;
2598 	u64 fault;
2599 	int i, r;
2600 
2601 	amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
2602 
2603 	/* Clear pending page faults from IH when the VM is destroyed */
2604 	while (kfifo_get(&vm->faults, &fault))
2605 		amdgpu_ih_clear_fault(adev, fault);
2606 
2607 	if (vm->pasid) {
2608 		unsigned long flags;
2609 
2610 		spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
2611 		idr_remove(&adev->vm_manager.pasid_idr, vm->pasid);
2612 		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
2613 	}
2614 
2615 	drm_sched_entity_fini(vm->entity.sched, &vm->entity);
2616 
2617 	if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
2618 		dev_err(adev->dev, "still active bo inside vm\n");
2619 	}
2620 	rbtree_postorder_for_each_entry_safe(mapping, tmp,
2621 					     &vm->va.rb_root, rb) {
2622 		list_del(&mapping->list);
2623 		amdgpu_vm_it_remove(mapping, &vm->va);
2624 		kfree(mapping);
2625 	}
2626 	list_for_each_entry_safe(mapping, tmp, &vm->freed, list) {
2627 		if (mapping->flags & AMDGPU_PTE_PRT && prt_fini_needed) {
2628 			amdgpu_vm_prt_fini(adev, vm);
2629 			prt_fini_needed = false;
2630 		}
2631 
2632 		list_del(&mapping->list);
2633 		amdgpu_vm_free_mapping(adev, vm, mapping, NULL);
2634 	}
2635 
2636 	root = amdgpu_bo_ref(vm->root.base.bo);
2637 	r = amdgpu_bo_reserve(root, true);
2638 	if (r) {
2639 		dev_err(adev->dev, "Leaking page tables because BO reservation failed\n");
2640 	} else {
2641 		amdgpu_vm_free_levels(adev, &vm->root,
2642 				      adev->vm_manager.root_level);
2643 		amdgpu_bo_unreserve(root);
2644 	}
2645 	amdgpu_bo_unref(&root);
2646 	dma_fence_put(vm->last_update);
2647 	for (i = 0; i < AMDGPU_MAX_VMHUBS; i++)
2648 		amdgpu_vmid_free_reserved(adev, vm, i);
2649 }
2650 
2651 /**
2652  * amdgpu_vm_pasid_fault_credit - Check fault credit for given PASID
2653  *
2654  * @adev: amdgpu_device pointer
2655  * @pasid: PASID to identify the VM
2656  *
2657  * This function is expected to be called in interrupt context. Returns
2658  * true if there was fault credit, false otherwise
2659  */
2660 bool amdgpu_vm_pasid_fault_credit(struct amdgpu_device *adev,
2661 				  unsigned int pasid)
2662 {
2663 	struct amdgpu_vm *vm;
2664 
2665 	spin_lock(&adev->vm_manager.pasid_lock);
2666 	vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
2667 	if (!vm) {
2668 		/* VM not found, can't track fault credit */
2669 		spin_unlock(&adev->vm_manager.pasid_lock);
2670 		return true;
2671 	}
2672 
2673 	/* No lock needed. Only accessed by IRQ handler */
2674 	if (!vm->fault_credit) {
2675 		/* Too many faults in this VM */
2676 		spin_unlock(&adev->vm_manager.pasid_lock);
2677 		return false;
2678 	}
2679 
2680 	vm->fault_credit--;
2681 	spin_unlock(&adev->vm_manager.pasid_lock);
2682 	return true;
2683 }
2684 
2685 /**
2686  * amdgpu_vm_manager_init - init the VM manager
2687  *
2688  * @adev: amdgpu_device pointer
2689  *
2690  * Initialize the VM manager structures
2691  */
2692 void amdgpu_vm_manager_init(struct amdgpu_device *adev)
2693 {
2694 	unsigned i;
2695 
2696 	amdgpu_vmid_mgr_init(adev);
2697 
2698 	adev->vm_manager.fence_context =
2699 		dma_fence_context_alloc(AMDGPU_MAX_RINGS);
2700 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i)
2701 		adev->vm_manager.seqno[i] = 0;
2702 
2703 	atomic_set(&adev->vm_manager.vm_pte_next_ring, 0);
2704 	spin_lock_init(&adev->vm_manager.prt_lock);
2705 	atomic_set(&adev->vm_manager.num_prt_users, 0);
2706 
2707 	/* If not overridden by the user, by default, only in large BAR systems
2708 	 * Compute VM tables will be updated by CPU
2709 	 */
2710 #ifdef CONFIG_X86_64
2711 	if (amdgpu_vm_update_mode == -1) {
2712 		if (amdgpu_vm_is_large_bar(adev))
2713 			adev->vm_manager.vm_update_mode =
2714 				AMDGPU_VM_USE_CPU_FOR_COMPUTE;
2715 		else
2716 			adev->vm_manager.vm_update_mode = 0;
2717 	} else
2718 		adev->vm_manager.vm_update_mode = amdgpu_vm_update_mode;
2719 #else
2720 	adev->vm_manager.vm_update_mode = 0;
2721 #endif
2722 
2723 	idr_init(&adev->vm_manager.pasid_idr);
2724 	spin_lock_init(&adev->vm_manager.pasid_lock);
2725 }
2726 
2727 /**
2728  * amdgpu_vm_manager_fini - cleanup VM manager
2729  *
2730  * @adev: amdgpu_device pointer
2731  *
2732  * Cleanup the VM manager and free resources.
2733  */
2734 void amdgpu_vm_manager_fini(struct amdgpu_device *adev)
2735 {
2736 	WARN_ON(!idr_is_empty(&adev->vm_manager.pasid_idr));
2737 	idr_destroy(&adev->vm_manager.pasid_idr);
2738 
2739 	amdgpu_vmid_mgr_fini(adev);
2740 }
2741 
2742 int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
2743 {
2744 	union drm_amdgpu_vm *args = data;
2745 	struct amdgpu_device *adev = dev->dev_private;
2746 	struct amdgpu_fpriv *fpriv = filp->driver_priv;
2747 	int r;
2748 
2749 	switch (args->in.op) {
2750 	case AMDGPU_VM_OP_RESERVE_VMID:
2751 		/* currently we only need to reserve a vmid from the gfxhub */
2752 		r = amdgpu_vmid_alloc_reserved(adev, &fpriv->vm, AMDGPU_GFXHUB);
2753 		if (r)
2754 			return r;
2755 		break;
2756 	case AMDGPU_VM_OP_UNRESERVE_VMID:
2757 		amdgpu_vmid_free_reserved(adev, &fpriv->vm, AMDGPU_GFXHUB);
2758 		break;
2759 	default:
2760 		return -EINVAL;
2761 	}
2762 
2763 	return 0;
2764 }
2765