// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2020-2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include <linux/types.h>
#include <linux/sched/task.h>
#include "amdgpu_sync.h"
#include "amdgpu_object.h"
#include "amdgpu_vm.h"
#include "amdgpu_mn.h"
#include "amdgpu.h"
#include "amdgpu_xgmi.h"
#include "kfd_priv.h"
#include "kfd_svm.h"
#include "kfd_migrate.h"

#ifdef dev_fmt
#undef dev_fmt
#endif
#define dev_fmt(fmt) "kfd_svm: %s: " fmt, __func__

#define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1

/* Long enough to ensure no retry fault comes after svm range is restored and
 * page table is updated.
 */
#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING	2000

static void svm_range_evict_svm_bo_worker(struct work_struct *work);
static bool
svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
				    const struct mmu_notifier_range *range,
				    unsigned long cur_seq);
static int
svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
		   uint64_t *bo_s, uint64_t *bo_l);
static const struct mmu_interval_notifier_ops svm_range_mn_ops = {
	.invalidate = svm_range_cpu_invalidate_pagetables,
};

/**
 * svm_range_unlink - unlink svm_range from lists and interval tree
 * @prange: svm range structure to be removed
 *
 * Remove the svm_range from the svms and svm_bo lists and the svms
 * interval tree.
 *
 * Context: The caller must hold svms->lock
 */
static void svm_range_unlink(struct svm_range *prange)
{
	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
		 prange, prange->start, prange->last);

	if (prange->svm_bo) {
		spin_lock(&prange->svm_bo->list_lock);
		list_del(&prange->svm_bo_list);
		spin_unlock(&prange->svm_bo->list_lock);
	}

	list_del(&prange->list);
	if (prange->it_node.start != 0 && prange->it_node.last != 0)
		interval_tree_remove(&prange->it_node, &prange->svms->objects);
}

static void
svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange)
{
	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
		 prange, prange->start, prange->last);

	mmu_interval_notifier_insert_locked(&prange->notifier, mm,
					    prange->start << PAGE_SHIFT,
					    prange->npages << PAGE_SHIFT,
					    &svm_range_mn_ops);
}

/**
 * svm_range_add_to_svms - add svm range to svms
 * @prange: svm range structure to be added
 *
 * Add the svm range to svms interval tree and link list
 *
 * Context: The caller must hold svms->lock
 */
static void svm_range_add_to_svms(struct svm_range *prange)
{
	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
		 prange, prange->start, prange->last);

	list_move_tail(&prange->list, &prange->svms->list);
	prange->it_node.start = prange->start;
	prange->it_node.last = prange->last;
	interval_tree_insert(&prange->it_node, &prange->svms->objects);
}

static void svm_range_remove_notifier(struct svm_range *prange)
{
	pr_debug("remove notifier svms 0x%p prange 0x%p [0x%lx 0x%lx]\n",
		 prange->svms, prange,
		 prange->notifier.interval_tree.start >> PAGE_SHIFT,
		 prange->notifier.interval_tree.last >> PAGE_SHIFT);

	if (prange->notifier.interval_tree.start != 0 &&
	    prange->notifier.interval_tree.last != 0)
		mmu_interval_notifier_remove(&prange->notifier);
}

static bool
svm_is_valid_dma_mapping_addr(struct device *dev, dma_addr_t dma_addr)
{
	return dma_addr && !dma_mapping_error(dev, dma_addr) &&
	       !(dma_addr & SVM_RANGE_VRAM_DOMAIN);
}

static int
svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange,
		      unsigned long offset, unsigned long npages,
		      unsigned long *hmm_pfns, uint32_t gpuidx)
{
	enum dma_data_direction dir = DMA_BIDIRECTIONAL;
	dma_addr_t *addr = prange->dma_addr[gpuidx];
	struct device *dev = adev->dev;
	struct page *page;
	int i, r;

	if (!addr) {
		addr = kvmalloc_array(prange->npages, sizeof(*addr),
				      GFP_KERNEL | __GFP_ZERO);
		if (!addr)
			return -ENOMEM;
		prange->dma_addr[gpuidx] = addr;
	}

	addr += offset;
	for (i = 0; i < npages; i++) {
		if (svm_is_valid_dma_mapping_addr(dev, addr[i]))
			dma_unmap_page(dev, addr[i], PAGE_SIZE, dir);

		page = hmm_pfn_to_page(hmm_pfns[i]);
		if (is_zone_device_page(page)) {
			struct amdgpu_device *bo_adev =
				amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);

			addr[i] = (hmm_pfns[i] << PAGE_SHIFT) +
				   bo_adev->vm_manager.vram_base_offset -
				   bo_adev->kfd.dev->pgmap.range.start;
			addr[i] |= SVM_RANGE_VRAM_DOMAIN;
			pr_debug_ratelimited("vram address: 0x%llx\n", addr[i]);
			continue;
		}
		addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
		r = dma_mapping_error(dev, addr[i]);
		if (r) {
			dev_err(dev, "failed %d dma_map_page\n", r);
			return r;
		}
		pr_debug_ratelimited("dma mapping 0x%llx for page addr 0x%lx\n",
				     addr[i] >> PAGE_SHIFT, page_to_pfn(page));
	}
	return 0;
}

static int
svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap,
		  unsigned long offset, unsigned long npages,
		  unsigned long *hmm_pfns)
{
	struct kfd_process *p;
	uint32_t gpuidx;
	int r;

	p = container_of(prange->svms, struct kfd_process, svms);

	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
		struct kfd_process_device *pdd;

		pr_debug("mapping to gpu idx 0x%x\n", gpuidx);
		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			return -EINVAL;
		}

		r = svm_range_dma_map_dev(pdd->dev->adev, prange, offset, npages,
					  hmm_pfns, gpuidx);
		if (r)
			break;
	}

	return r;
}

void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
			 unsigned long offset, unsigned long npages)
{
	enum dma_data_direction dir = DMA_BIDIRECTIONAL;
	int i;

	if (!dma_addr)
		return;

	for (i = offset; i < offset + npages; i++) {
		if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i]))
			continue;
		pr_debug_ratelimited("unmap 0x%llx\n", dma_addr[i] >> PAGE_SHIFT);
		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
		dma_addr[i] = 0;
	}
}

void svm_range_free_dma_mappings(struct svm_range *prange)
{
	struct kfd_process_device *pdd;
	dma_addr_t *dma_addr;
	struct device *dev;
	struct kfd_process *p;
	uint32_t gpuidx;

	p = container_of(prange->svms, struct kfd_process, svms);

	for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) {
		dma_addr = prange->dma_addr[gpuidx];
		if (!dma_addr)
			continue;

		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			continue;
		}
		dev = &pdd->dev->pdev->dev;
		svm_range_dma_unmap(dev, dma_addr, 0, prange->npages);
		kvfree(dma_addr);
		prange->dma_addr[gpuidx] = NULL;
	}
}

static void svm_range_free(struct svm_range *prange)
{
	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange,
		 prange->start, prange->last);

	svm_range_vram_node_free(prange);
	svm_range_free_dma_mappings(prange);
	mutex_destroy(&prange->lock);
	mutex_destroy(&prange->migrate_mutex);
	kfree(prange);
}

static void
svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc,
				 uint8_t *granularity, uint32_t *flags)
{
	*location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
	*prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
	*granularity = 9;
	*flags =
		KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT;
}

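/* Allocate and initialize a new svm_range covering [start, last] (in pages).
 * The range starts with default attributes and, if XNACK is enabled for the
 * process, access to all supported GPUs. Returns NULL on allocation failure.
 */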
static struct
svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
			 uint64_t last)
{
	uint64_t size = last - start + 1;
	struct svm_range *prange;
	struct kfd_process *p;

	prange = kzalloc(sizeof(*prange), GFP_KERNEL);
	if (!prange)
		return NULL;
	prange->npages = size;
	prange->svms = svms;
	prange->start = start;
	prange->last = last;
	INIT_LIST_HEAD(&prange->list);
	INIT_LIST_HEAD(&prange->update_list);
	INIT_LIST_HEAD(&prange->svm_bo_list);
	INIT_LIST_HEAD(&prange->deferred_list);
	INIT_LIST_HEAD(&prange->child_list);
	atomic_set(&prange->invalid, 0);
	prange->validate_timestamp = 0;
	mutex_init(&prange->migrate_mutex);
	mutex_init(&prange->lock);

	p = container_of(svms, struct kfd_process, svms);
	if (p->xnack_enabled)
		bitmap_copy(prange->bitmap_access, svms->bitmap_supported,
			    MAX_GPU_INSTANCE);

	svm_range_set_default_attributes(&prange->preferred_loc,
					 &prange->prefetch_loc,
					 &prange->granularity, &prange->flags);

	pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last);

	return prange;
}

static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo)
{
	if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref))
		return false;

	return true;
}

static void svm_range_bo_release(struct kref *kref)
{
	struct svm_range_bo *svm_bo;

	svm_bo = container_of(kref, struct svm_range_bo, kref);
	pr_debug("svm_bo 0x%p\n", svm_bo);

	spin_lock(&svm_bo->list_lock);
	while (!list_empty(&svm_bo->range_list)) {
		struct svm_range *prange =
				list_first_entry(&svm_bo->range_list,
						 struct svm_range, svm_bo_list);
		/* list_del_init tells a concurrent svm_range_vram_node_new when
		 * it's safe to reuse the svm_bo pointer and svm_bo_list head.
		 */
		list_del_init(&prange->svm_bo_list);
		spin_unlock(&svm_bo->list_lock);

		pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
			 prange->start, prange->last);
		mutex_lock(&prange->lock);
		prange->svm_bo = NULL;
		mutex_unlock(&prange->lock);

		spin_lock(&svm_bo->list_lock);
	}
	spin_unlock(&svm_bo->list_lock);
	if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) {
		/* We're not in the eviction worker.
		 * Signal the fence and synchronize with any
		 * pending eviction work.
		 */
		dma_fence_signal(&svm_bo->eviction_fence->base);
		cancel_work_sync(&svm_bo->eviction_work);
	}
	dma_fence_put(&svm_bo->eviction_fence->base);
	amdgpu_bo_unref(&svm_bo->bo);
	kfree(svm_bo);
}

static void svm_range_bo_wq_release(struct work_struct *work)
{
	struct svm_range_bo *svm_bo;

	svm_bo = container_of(work, struct svm_range_bo, release_work);
	svm_range_bo_release(&svm_bo->kref);
}

static void svm_range_bo_release_async(struct kref *kref)
{
	struct svm_range_bo *svm_bo;

	svm_bo = container_of(kref, struct svm_range_bo, kref);
	pr_debug("svm_bo 0x%p\n", svm_bo);
	INIT_WORK(&svm_bo->release_work, svm_range_bo_wq_release);
	schedule_work(&svm_bo->release_work);
}

void svm_range_bo_unref_async(struct svm_range_bo *svm_bo)
{
	kref_put(&svm_bo->kref, svm_range_bo_release_async);
}

static void svm_range_bo_unref(struct svm_range_bo *svm_bo)
{
	if (svm_bo)
		kref_put(&svm_bo->kref, svm_range_bo_release);
}

static bool
svm_range_validate_svm_bo(struct amdgpu_device *adev, struct svm_range *prange)
{
	struct amdgpu_device *bo_adev;

	mutex_lock(&prange->lock);
	if (!prange->svm_bo) {
		mutex_unlock(&prange->lock);
		return false;
	}
	if (prange->ttm_res) {
		/* We still have a reference, all is well */
		mutex_unlock(&prange->lock);
		return true;
	}
	if (svm_bo_ref_unless_zero(prange->svm_bo)) {
		/*
		 * Migrate from GPU to GPU, remove range from source bo_adev
		 * svm_bo range list, and return false to allocate svm_bo from
		 * destination adev.
		 */
		bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
		if (bo_adev != adev) {
			mutex_unlock(&prange->lock);

			spin_lock(&prange->svm_bo->list_lock);
			list_del_init(&prange->svm_bo_list);
			spin_unlock(&prange->svm_bo->list_lock);

			svm_range_bo_unref(prange->svm_bo);
			return false;
		}
		if (READ_ONCE(prange->svm_bo->evicting)) {
			struct dma_fence *f;
			struct svm_range_bo *svm_bo;
			/* The BO is getting evicted,
			 * we need to get a new one
			 */
			mutex_unlock(&prange->lock);
			svm_bo = prange->svm_bo;
			f = dma_fence_get(&svm_bo->eviction_fence->base);
			svm_range_bo_unref(prange->svm_bo);
			/* wait for the fence to avoid long spin-loop
			 * at list_empty_careful
			 */
			dma_fence_wait(f, false);
			dma_fence_put(f);
		} else {
			/* The BO was still around and we got
			 * a new reference to it
			 */
			mutex_unlock(&prange->lock);
			pr_debug("reuse old bo svms 0x%p [0x%lx 0x%lx]\n",
				 prange->svms, prange->start, prange->last);

			prange->ttm_res = prange->svm_bo->bo->tbo.resource;
			return true;
		}

	} else {
		mutex_unlock(&prange->lock);
	}

	/* We need a new svm_bo. Spin-loop to wait for concurrent
	 * svm_range_bo_release to finish removing this range from
	 * its range list. After this, it is safe to reuse the
	 * svm_bo pointer and svm_bo_list head.
	 */
	while (!list_empty_careful(&prange->svm_bo_list))
		;

	return false;
}

static struct svm_range_bo *svm_range_bo_new(void)
{
	struct svm_range_bo *svm_bo;

	svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL);
	if (!svm_bo)
		return NULL;

	kref_init(&svm_bo->kref);
	INIT_LIST_HEAD(&svm_bo->range_list);
	spin_lock_init(&svm_bo->list_lock);

	return svm_bo;
}

int
svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange,
			bool clear)
{
	struct amdgpu_bo_param bp;
	struct svm_range_bo *svm_bo;
	struct amdgpu_bo_user *ubo;
	struct amdgpu_bo *bo;
	struct kfd_process *p;
	struct mm_struct *mm;
	int r;

	p = container_of(prange->svms, struct kfd_process, svms);
	pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms,
		 prange->start, prange->last);

	if (svm_range_validate_svm_bo(adev, prange))
		return 0;

	svm_bo = svm_range_bo_new();
	if (!svm_bo) {
		pr_debug("failed to alloc svm bo\n");
		return -ENOMEM;
	}
	mm = get_task_mm(p->lead_thread);
	if (!mm) {
		pr_debug("failed to get mm\n");
		kfree(svm_bo);
		return -ESRCH;
	}
	svm_bo->svms = prange->svms;
	svm_bo->eviction_fence =
		amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
					   mm,
					   svm_bo);
	mmput(mm);
	INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker);
	svm_bo->evicting = 0;
	memset(&bp, 0, sizeof(bp));
	bp.size = prange->npages * PAGE_SIZE;
	bp.byte_align = PAGE_SIZE;
	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
	bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
	bp.flags |= clear ? AMDGPU_GEM_CREATE_VRAM_CLEARED : 0;
	bp.flags |= AMDGPU_AMDKFD_CREATE_SVM_BO;
	bp.type = ttm_bo_type_device;
	bp.resv = NULL;

	r = amdgpu_bo_create_user(adev, &bp, &ubo);
	if (r) {
		pr_debug("failed %d to create bo\n", r);
		goto create_bo_failed;
	}
	bo = &ubo->bo;
	r = amdgpu_bo_reserve(bo, true);
	if (r) {
		pr_debug("failed %d to reserve bo\n", r);
		goto reserve_bo_failed;
	}

	r = dma_resv_reserve_shared(bo->tbo.base.resv, 1);
	if (r) {
		pr_debug("failed %d to reserve bo\n", r);
		amdgpu_bo_unreserve(bo);
		goto reserve_bo_failed;
	}
	amdgpu_bo_fence(bo, &svm_bo->eviction_fence->base, true);

	amdgpu_bo_unreserve(bo);

	svm_bo->bo = bo;
	prange->svm_bo = svm_bo;
	prange->ttm_res = bo->tbo.resource;
	prange->offset = 0;

	spin_lock(&svm_bo->list_lock);
	list_add(&prange->svm_bo_list, &svm_bo->range_list);
	spin_unlock(&svm_bo->list_lock);

	return 0;

reserve_bo_failed:
	amdgpu_bo_unref(&bo);
create_bo_failed:
	dma_fence_put(&svm_bo->eviction_fence->base);
	kfree(svm_bo);
	prange->ttm_res = NULL;

	return r;
}

void svm_range_vram_node_free(struct svm_range *prange)
{
	svm_range_bo_unref(prange->svm_bo);
	prange->ttm_res = NULL;
}

struct amdgpu_device *
svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id)
{
	struct kfd_process_device *pdd;
	struct kfd_process *p;
	int32_t gpu_idx;

	p = container_of(prange->svms, struct kfd_process, svms);

	gpu_idx = kfd_process_gpuidx_from_gpuid(p, gpu_id);
	if (gpu_idx < 0) {
		pr_debug("failed to get device by id 0x%x\n", gpu_id);
		return NULL;
	}
	pdd = kfd_process_device_from_gpuidx(p, gpu_idx);
	if (!pdd) {
		pr_debug("failed to get device by idx 0x%x\n", gpu_idx);
		return NULL;
	}

	return pdd->dev->adev;
}

struct kfd_process_device *
svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev)
{
	struct kfd_process *p;
	int32_t gpu_idx, gpuid;
	int r;

	p = container_of(prange->svms, struct kfd_process, svms);

	r = kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpu_idx);
	if (r) {
		pr_debug("failed to get device id by adev %p\n", adev);
		return NULL;
	}

	return kfd_process_device_from_gpuidx(p, gpu_idx);
}

static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo)
{
	struct ttm_operation_ctx ctx = { false, false };

	amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM);

	return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
}

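/* Validate the user-supplied attribute array: location attributes must refer
 * to system memory or to a GPU that exists and supports SVM for this process.
 * Returns 0 on success, -EINVAL for an unknown type or unsupported GPU.
 */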
static int
svm_range_check_attr(struct kfd_process *p,
		     uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
{
	uint32_t i;

	for (i = 0; i < nattr; i++) {
		uint32_t val = attrs[i].value;
		int gpuidx = MAX_GPU_INSTANCE;

		switch (attrs[i].type) {
		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
			if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM &&
			    val != KFD_IOCTL_SVM_LOCATION_UNDEFINED)
				gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
			break;
		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
			if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM)
				gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
			break;
		case KFD_IOCTL_SVM_ATTR_ACCESS:
		case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
		case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
			gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
			break;
		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
			break;
		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
			break;
		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
			break;
		default:
			pr_debug("unknown attr type 0x%x\n", attrs[i].type);
			return -EINVAL;
		}

		if (gpuidx < 0) {
			pr_debug("no GPU 0x%x found\n", val);
			return -EINVAL;
		} else if (gpuidx < MAX_GPU_INSTANCE &&
			   !test_bit(gpuidx, p->svms.bitmap_supported)) {
			pr_debug("GPU 0x%x not supported\n", val);
			return -EINVAL;
		}
	}

	return 0;
}

static void
svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange,
		      uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
{
	uint32_t i;
	int gpuidx;

	for (i = 0; i < nattr; i++) {
		switch (attrs[i].type) {
		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
			prange->preferred_loc = attrs[i].value;
			break;
		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
			prange->prefetch_loc = attrs[i].value;
			break;
		case KFD_IOCTL_SVM_ATTR_ACCESS:
		case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
		case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
			gpuidx = kfd_process_gpuidx_from_gpuid(p,
							       attrs[i].value);
			if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) {
				bitmap_clear(prange->bitmap_access, gpuidx, 1);
				bitmap_clear(prange->bitmap_aip, gpuidx, 1);
			} else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) {
				bitmap_set(prange->bitmap_access, gpuidx, 1);
				bitmap_clear(prange->bitmap_aip, gpuidx, 1);
			} else {
				bitmap_clear(prange->bitmap_access, gpuidx, 1);
				bitmap_set(prange->bitmap_aip, gpuidx, 1);
			}
			break;
		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
			prange->flags |= attrs[i].value;
			break;
		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
			prange->flags &= ~attrs[i].value;
			break;
		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
			prange->granularity = attrs[i].value;
			break;
		default:
			WARN_ONCE(1, "svm_range_check_attrs wasn't called?");
		}
	}
}

static bool
svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange,
			uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
{
	uint32_t i;
	int gpuidx;

	for (i = 0; i < nattr; i++) {
		switch (attrs[i].type) {
		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
			if (prange->preferred_loc != attrs[i].value)
				return false;
			break;
		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
			/* Prefetch should always trigger a migration even
			 * if the value of the attribute didn't change.
			 */
			return false;
		case KFD_IOCTL_SVM_ATTR_ACCESS:
		case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
		case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
			gpuidx = kfd_process_gpuidx_from_gpuid(p,
							       attrs[i].value);
			if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) {
				if (test_bit(gpuidx, prange->bitmap_access) ||
				    test_bit(gpuidx, prange->bitmap_aip))
					return false;
			} else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) {
				if (!test_bit(gpuidx, prange->bitmap_access))
					return false;
			} else {
				if (!test_bit(gpuidx, prange->bitmap_aip))
					return false;
			}
			break;
		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
			if ((prange->flags & attrs[i].value) != attrs[i].value)
				return false;
			break;
		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
			if ((prange->flags & attrs[i].value) != 0)
				return false;
			break;
		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
			if (prange->granularity != attrs[i].value)
				return false;
			break;
		default:
			WARN_ONCE(1, "svm_range_check_attrs wasn't called?");
		}
	}

	return true;
}

/**
 * svm_range_debug_dump - print all range information from svms
 * @svms: svm range list header
 *
 * debug output svm range start, end, prefetch location from svms
 * interval tree and link list
 *
 * Context: The caller must hold svms->lock
 */
static void svm_range_debug_dump(struct svm_range_list *svms)
{
	struct interval_tree_node *node;
	struct svm_range *prange;

	pr_debug("dump svms 0x%p list\n", svms);
	pr_debug("range\tstart\tpage\tend\t\tlocation\n");

	list_for_each_entry(prange, &svms->list, list) {
		pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
			 prange, prange->start, prange->npages,
			 prange->start + prange->npages - 1,
			 prange->actual_loc);
	}

	pr_debug("dump svms 0x%p interval tree\n", svms);
	pr_debug("range\tstart\tpage\tend\t\tlocation\n");
	node = interval_tree_iter_first(&svms->objects, 0, ~0ULL);
	while (node) {
		prange = container_of(node, struct svm_range, it_node);
		pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
			 prange, prange->start, prange->npages,
			 prange->start + prange->npages - 1,
			 prange->actual_loc);
		node = interval_tree_iter_next(node, 0, ~0ULL);
	}
}

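/* Split a per-GPU array (e.g. dma_addr) when a range is split: @new_n
 * elements starting at @new_start are copied into a new allocation for the
 * new range, the remaining @old_n elements are reallocated for the old range.
 */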
static int
svm_range_split_array(void *ppnew, void *ppold, size_t size,
		      uint64_t old_start, uint64_t old_n,
		      uint64_t new_start, uint64_t new_n)
{
	unsigned char *new, *old, *pold;
	uint64_t d;

	if (!ppold)
		return 0;
	pold = *(unsigned char **)ppold;
	if (!pold)
		return 0;

	new = kvmalloc_array(new_n, size, GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	d = (new_start - old_start) * size;
	memcpy(new, pold + d, new_n * size);

	old = kvmalloc_array(old_n, size, GFP_KERNEL);
	if (!old) {
		kvfree(new);
		return -ENOMEM;
	}

	d = (new_start == old_start) ? new_n * size : 0;
	memcpy(old, pold + d, old_n * size);

	kvfree(pold);
	*(void **)ppold = old;
	*(void **)ppnew = new;

	return 0;
}

static int
svm_range_split_pages(struct svm_range *new, struct svm_range *old,
		      uint64_t start, uint64_t last)
{
	uint64_t npages = last - start + 1;
	int i, r;

	for (i = 0; i < MAX_GPU_INSTANCE; i++) {
		r = svm_range_split_array(&new->dma_addr[i], &old->dma_addr[i],
					  sizeof(*old->dma_addr[i]), old->start,
					  npages, new->start, new->npages);
		if (r)
			return r;
	}

	return 0;
}

static int
svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
		      uint64_t start, uint64_t last)
{
	uint64_t npages = last - start + 1;

	pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n",
		 new->svms, new, new->start, start, last);

	if (new->start == old->start) {
		new->offset = old->offset;
		old->offset += new->npages;
	} else {
		new->offset = old->offset + npages;
	}

	new->svm_bo = svm_range_bo_ref(old->svm_bo);
	new->ttm_res = old->ttm_res;

	spin_lock(&new->svm_bo->list_lock);
	list_add(&new->svm_bo_list, &new->svm_bo->range_list);
	spin_unlock(&new->svm_bo->list_lock);

	return 0;
}

/**
 * svm_range_split_adjust - split range and adjust
 *
 * @new: new range
 * @old: the old range
 * @start: the old range adjust to start address in pages
 * @last: the old range adjust to last address in pages
 *
 * Copy system memory dma_addr or vram ttm_res in old range to new
 * range from new_start up to size new->npages, the remaining old range is from
 * start to last
 *
 * Return:
 * 0 - OK, -ENOMEM - out of memory
 */
static int
svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
		       uint64_t start, uint64_t last)
{
	int r;

	pr_debug("svms 0x%p new 0x%lx old [0x%lx 0x%lx] => [0x%llx 0x%llx]\n",
		 new->svms, new->start, old->start, old->last, start, last);

	if (new->start < old->start ||
	    new->last > old->last) {
		WARN_ONCE(1, "invalid new range start or last\n");
		return -EINVAL;
	}

	r = svm_range_split_pages(new, old, start, last);
	if (r)
		return r;

	if (old->actual_loc && old->ttm_res) {
		r = svm_range_split_nodes(new, old, start, last);
		if (r)
			return r;
	}

	old->npages = last - start + 1;
	old->start = start;
	old->last = last;
	new->flags = old->flags;
	new->preferred_loc = old->preferred_loc;
	new->prefetch_loc = old->prefetch_loc;
	new->actual_loc = old->actual_loc;
	new->granularity = old->granularity;
	bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
	bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);

	return 0;
}

/**
 * svm_range_split - split a range in 2 ranges
 *
 * @prange: the svm range to split
 * @start: the remaining range start address in pages
 * @last: the remaining range last address in pages
 * @new: the result new range generated
 *
 * Two cases only:
 * case 1: if start == prange->start
 *         prange ==> prange[start, last]
 *         new range [last + 1, prange->last]
 *
 * case 2: if last == prange->last
 *         prange ==> prange[start, last]
 *         new range [prange->start, start - 1]
 *
 * Return:
 * 0 - OK, -ENOMEM - out of memory, -EINVAL - invalid start, last
 */
static int
svm_range_split(struct svm_range *prange, uint64_t start, uint64_t last,
		struct svm_range **new)
{
	uint64_t old_start = prange->start;
	uint64_t old_last = prange->last;
	struct svm_range_list *svms;
	int r = 0;

	pr_debug("svms 0x%p [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", prange->svms,
		 old_start, old_last, start, last);

	if (old_start != start && old_last != last)
		return -EINVAL;
	if (start < old_start || last > old_last)
		return -EINVAL;

	svms = prange->svms;
	if (old_start == start)
		*new = svm_range_new(svms, last + 1, old_last);
	else
		*new = svm_range_new(svms, old_start, start - 1);
	if (!*new)
		return -ENOMEM;

	r = svm_range_split_adjust(*new, prange, start, last);
	if (r) {
		pr_debug("failed %d split [0x%llx 0x%llx] to [0x%llx 0x%llx]\n",
			 r, old_start, old_last, start, last);
		svm_range_free(*new);
		*new = NULL;
	}

	return r;
}

static int
svm_range_split_tail(struct svm_range *prange,
		     uint64_t new_last, struct list_head *insert_list)
{
	struct svm_range *tail;
	int r = svm_range_split(prange, prange->start, new_last, &tail);

	if (!r)
		list_add(&tail->list, insert_list);
	return r;
}

static int
svm_range_split_head(struct svm_range *prange,
		     uint64_t new_start, struct list_head *insert_list)
{
	struct svm_range *head;
	int r = svm_range_split(prange, new_start, prange->last, &head);

	if (!r)
		list_add(&head->list, insert_list);
	return r;
}

static void
svm_range_add_child(struct svm_range *prange, struct mm_struct *mm,
		    struct svm_range *pchild, enum svm_work_list_ops op)
{
	pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n",
		 pchild, pchild->start, pchild->last, prange, op);

	pchild->work_item.mm = mm;
	pchild->work_item.op = op;
	list_add_tail(&pchild->child_list, &prange->child_list);
}

/**
 * svm_range_split_by_granularity - collect ranges within granularity boundary
 *
 * @p: the process with svms list
 * @mm: mm structure
 * @addr: the vm fault address in pages, to split the prange
 * @parent: parent range if prange is from child list
 * @prange: prange to split
 *
 * Trims @prange to be a single aligned block of prange->granularity if
 * possible. The head and tail are added to the child_list in @parent.
 *
 * Context: caller must hold mmap_read_lock and prange->lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */
int
svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm,
			       unsigned long addr, struct svm_range *parent,
			       struct svm_range *prange)
{
	struct svm_range *head, *tail;
	unsigned long start, last, size;
	int r;

	/* Align the split range start and size to the granularity size, then a
	 * single PTE will be used for the whole range. This reduces the number
	 * of PTEs updated and the L1 TLB space used for translation.
	 */
	size = 1UL << prange->granularity;
	start = ALIGN_DOWN(addr, size);
	last = ALIGN(addr + 1, size) - 1;

	pr_debug("svms 0x%p split [0x%lx 0x%lx] to [0x%lx 0x%lx] size 0x%lx\n",
		 prange->svms, prange->start, prange->last, start, last, size);

	if (start > prange->start) {
		r = svm_range_split(prange, start, prange->last, &head);
		if (r)
			return r;
		svm_range_add_child(parent, mm, head, SVM_OP_ADD_RANGE);
	}

	if (last < prange->last) {
		r = svm_range_split(prange, prange->start, last, &tail);
		if (r)
			return r;
		svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
	}

	/* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */
	if (p->xnack_enabled && prange->work_item.op == SVM_OP_ADD_RANGE) {
		prange->work_item.op = SVM_OP_ADD_RANGE_AND_MAP;
		pr_debug("change prange 0x%p [0x%lx 0x%lx] op %d\n",
			 prange, prange->start, prange->last,
			 SVM_OP_ADD_RANGE_AND_MAP);
	}
	return 0;
}

static uint64_t
svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange,
			int domain)
{
	struct amdgpu_device *bo_adev;
	uint32_t flags = prange->flags;
	uint32_t mapping_flags = 0;
	uint64_t pte_flags;
	bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN);
	bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT;

	if (domain == SVM_RANGE_VRAM_DOMAIN)
		bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);

	switch (KFD_GC_VERSION(adev->kfd.dev)) {
	case IP_VERSION(9, 4, 1):
		if (domain == SVM_RANGE_VRAM_DOMAIN) {
			if (bo_adev == adev) {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
			} else {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
				if (amdgpu_xgmi_same_hive(adev, bo_adev))
					snoop = true;
			}
		} else {
			mapping_flags |= coherent ?
				AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
		}
		break;
	case IP_VERSION(9, 4, 2):
		if (domain == SVM_RANGE_VRAM_DOMAIN) {
			if (bo_adev == adev) {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
				if (adev->gmc.xgmi.connected_to_cpu)
					snoop = true;
			} else {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
				if (amdgpu_xgmi_same_hive(adev, bo_adev))
					snoop = true;
			}
		} else {
			mapping_flags |= coherent ?
				AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
		}
		break;
	default:
		mapping_flags |= coherent ?
			AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
	}

	mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE;

	if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO)
		mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE;
	if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC)
		mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;

	pte_flags = AMDGPU_PTE_VALID;
	pte_flags |= (domain == SVM_RANGE_VRAM_DOMAIN) ? 0 : AMDGPU_PTE_SYSTEM;
	pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0;

	pte_flags |= amdgpu_gem_va_map_flags(adev, mapping_flags);
	return pte_flags;
}

static int
svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
			 uint64_t start, uint64_t last,
			 struct dma_fence **fence)
{
	uint64_t init_pte_value = 0;

	pr_debug("[0x%llx 0x%llx]\n", start, last);

	return amdgpu_vm_bo_update_mapping(adev, adev, vm, false, true, NULL,
					   start, last, init_pte_value, 0,
					   NULL, NULL, fence, NULL);
}

static int
svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
			  unsigned long last)
{
	DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
	struct kfd_process_device *pdd;
	struct dma_fence *fence = NULL;
	struct kfd_process *p;
	uint32_t gpuidx;
	int r = 0;

	bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
		  MAX_GPU_INSTANCE);
	p = container_of(prange->svms, struct kfd_process, svms);

	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
		pr_debug("unmap from gpu idx 0x%x\n", gpuidx);
		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			return -EINVAL;
		}

		r = svm_range_unmap_from_gpu(pdd->dev->adev,
					     drm_priv_to_vm(pdd->drm_priv),
					     start, last, &fence);
		if (r)
			break;

		if (fence) {
			r = dma_fence_wait(fence, false);
			dma_fence_put(fence);
			fence = NULL;
			if (r)
				break;
		}
		amdgpu_amdkfd_flush_gpu_tlb_pasid(pdd->dev->adev,
						  p->pasid, TLB_FLUSH_HEAVYWEIGHT);
	}

	return r;
}

static int
svm_range_map_to_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
		     struct svm_range *prange, unsigned long offset,
		     unsigned long npages, bool readonly, dma_addr_t *dma_addr,
		     struct amdgpu_device *bo_adev, struct dma_fence **fence)
{
	bool table_freed = false;
	uint64_t pte_flags;
	unsigned long last_start;
	int last_domain;
	int r = 0;
	int64_t i, j;

	last_start = prange->start + offset;

	pr_debug("svms 0x%p [0x%lx 0x%lx] readonly %d\n", prange->svms,
		 last_start, last_start + npages - 1, readonly);

	for (i = offset; i < offset + npages; i++) {
		last_domain = dma_addr[i] & SVM_RANGE_VRAM_DOMAIN;
		dma_addr[i] &= ~SVM_RANGE_VRAM_DOMAIN;

		/* Collect all pages in the same address range and memory domain
		 * that can be mapped with a single call to update mapping.
		 */
		if (i < offset + npages - 1 &&
		    last_domain == (dma_addr[i + 1] & SVM_RANGE_VRAM_DOMAIN))
			continue;

		pr_debug("Mapping range [0x%lx 0x%llx] on domain: %s\n",
			 last_start, prange->start + i, last_domain ? "GPU" : "CPU");

		pte_flags = svm_range_get_pte_flags(adev, prange, last_domain);
		if (readonly)
			pte_flags &= ~AMDGPU_PTE_WRITEABLE;

		pr_debug("svms 0x%p map [0x%lx 0x%llx] vram %d PTE 0x%llx\n",
			 prange->svms, last_start, prange->start + i,
			 (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 1 : 0,
			 pte_flags);

		r = amdgpu_vm_bo_update_mapping(adev, bo_adev, vm, false, false,
						NULL, last_start,
						prange->start + i, pte_flags,
						last_start - prange->start,
						NULL, dma_addr,
						&vm->last_update,
						&table_freed);

		for (j = last_start - prange->start; j <= i; j++)
			dma_addr[j] |= last_domain;

		if (r) {
			pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start);
			goto out;
		}
		last_start = prange->start + i + 1;
	}

	r = amdgpu_vm_update_pdes(adev, vm, false);
	if (r) {
		pr_debug("failed %d to update directories 0x%lx\n", r,
			 prange->start);
		goto out;
	}

	if (fence)
		*fence = dma_fence_get(vm->last_update);

	if (table_freed) {
		struct kfd_process *p;

		p = container_of(prange->svms, struct kfd_process, svms);
		amdgpu_amdkfd_flush_gpu_tlb_pasid(adev, p->pasid, TLB_FLUSH_LEGACY);
	}
out:
	return r;
}

static int
svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset,
		      unsigned long npages, bool readonly,
		      unsigned long *bitmap, bool wait)
{
	struct kfd_process_device *pdd;
	struct amdgpu_device *bo_adev;
	struct kfd_process *p;
	struct dma_fence *fence = NULL;
	uint32_t gpuidx;
	int r = 0;

	if (prange->svm_bo && prange->ttm_res)
		bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
	else
		bo_adev = NULL;

	p = container_of(prange->svms, struct kfd_process, svms);
	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
		pr_debug("mapping to gpu idx 0x%x\n", gpuidx);
		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			return -EINVAL;
		}

		pdd = kfd_bind_process_to_device(pdd->dev, p);
		if (IS_ERR(pdd))
			return -EINVAL;

		if (bo_adev && pdd->dev->adev != bo_adev &&
		    !amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) {
			pr_debug("cannot map to device idx %d\n", gpuidx);
			continue;
		}

		r = svm_range_map_to_gpu(pdd->dev->adev, drm_priv_to_vm(pdd->drm_priv),
					 prange, offset, npages, readonly,
					 prange->dma_addr[gpuidx],
					 bo_adev, wait ? &fence : NULL);
		if (r)
			break;

		if (fence) {
			r = dma_fence_wait(fence, false);
			dma_fence_put(fence);
			fence = NULL;
			if (r) {
				pr_debug("failed %d to dma fence wait\n", r);
				break;
			}
		}
	}

	return r;
}

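/* Per-call context for svm_range_validate_and_map: tracks which GPUs the
 * range is mapped on and the page-table BO reservations taken for them.
 */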
struct svm_validate_context {
	struct kfd_process *process;
	struct svm_range *prange;
	bool intr;
	unsigned long bitmap[MAX_GPU_INSTANCE];
	struct ttm_validate_buffer tv[MAX_GPU_INSTANCE];
	struct list_head validate_list;
	struct ww_acquire_ctx ticket;
};

static int svm_range_reserve_bos(struct svm_validate_context *ctx)
{
	struct kfd_process_device *pdd;
	struct amdgpu_vm *vm;
	uint32_t gpuidx;
	int r;

	INIT_LIST_HEAD(&ctx->validate_list);
	for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
		pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			return -EINVAL;
		}
		vm = drm_priv_to_vm(pdd->drm_priv);

		ctx->tv[gpuidx].bo = &vm->root.bo->tbo;
		ctx->tv[gpuidx].num_shared = 4;
		list_add(&ctx->tv[gpuidx].head, &ctx->validate_list);
	}

	r = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->validate_list,
				   ctx->intr, NULL);
	if (r) {
		pr_debug("failed %d to reserve bo\n", r);
		return r;
	}

	for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
		pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			r = -EINVAL;
			goto unreserve_out;
		}

		r = amdgpu_vm_validate_pt_bos(pdd->dev->adev,
					      drm_priv_to_vm(pdd->drm_priv),
					      svm_range_bo_validate, NULL);
		if (r) {
			pr_debug("failed %d validate pt bos\n", r);
			goto unreserve_out;
		}
	}

	return 0;

unreserve_out:
	ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list);
	return r;
}

static void svm_range_unreserve_bos(struct svm_validate_context *ctx)
{
	ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list);
}

static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx)
{
	struct kfd_process_device *pdd;

	pdd = kfd_process_device_from_gpuidx(p, gpuidx);

	return SVM_ADEV_PGMAP_OWNER(pdd->dev->adev);
}

/*
 * Validation+GPU mapping with concurrent invalidation (MMU notifiers)
 *
 * To prevent concurrent destruction or change of range attributes, the
 * svm_read_lock must be held. The caller must not hold the svm_write_lock
 * because that would block concurrent evictions and lead to deadlocks. To
 * serialize concurrent migrations or validations of the same range, the
 * prange->migrate_mutex must be held.
 *
 * For VRAM ranges, the SVM BO must be allocated and valid (protected by its
 * eviction fence).
 *
 * The following sequence ensures race-free validation and GPU mapping:
 *
 * 1. Reserve page table (and SVM BO if range is in VRAM)
 * 2. hmm_range_fault to get page addresses (if system memory)
 * 3. DMA-map pages (if system memory)
 * 4-a. Take notifier lock
 * 4-b. Check that pages still valid (mmu_interval_read_retry)
 * 4-c. Check that the range was not split or otherwise invalidated
 * 4-d. Update GPU page table
 * 4-e. Release notifier lock
 * 5. Release page table (and SVM BO) reservation
 */
static int svm_range_validate_and_map(struct mm_struct *mm,
				      struct svm_range *prange,
				      int32_t gpuidx, bool intr, bool wait)
{
	struct svm_validate_context ctx;
	unsigned long start, end, addr;
	struct kfd_process *p;
	void *owner;
	int32_t idx;
	int r = 0;

	ctx.process = container_of(prange->svms, struct kfd_process, svms);
	ctx.prange = prange;
	ctx.intr = intr;

	if (gpuidx < MAX_GPU_INSTANCE) {
		bitmap_zero(ctx.bitmap, MAX_GPU_INSTANCE);
		bitmap_set(ctx.bitmap, gpuidx, 1);
	} else if (ctx.process->xnack_enabled) {
		bitmap_copy(ctx.bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);

		/* If prefetch range to GPU, or GPU retry fault migrate range to
		 * GPU, which has ACCESS attribute to the range, create mapping
		 * on that GPU.
		 */
		if (prange->actual_loc) {
			gpuidx = kfd_process_gpuidx_from_gpuid(ctx.process,
							       prange->actual_loc);
			if (gpuidx < 0) {
				WARN_ONCE(1, "failed get device by id 0x%x\n",
					  prange->actual_loc);
				return -EINVAL;
			}
			if (test_bit(gpuidx, prange->bitmap_access))
				bitmap_set(ctx.bitmap, gpuidx, 1);
		}
	} else {
		bitmap_or(ctx.bitmap, prange->bitmap_access,
			  prange->bitmap_aip, MAX_GPU_INSTANCE);
	}

	if (bitmap_empty(ctx.bitmap, MAX_GPU_INSTANCE))
		return 0;

	if (prange->actual_loc && !prange->ttm_res) {
		/* This should never happen. actual_loc gets set by
		 * svm_migrate_ram_to_vram after allocating a BO.
		 */
		WARN_ONCE(1, "VRAM BO missing during validation\n");
		return -EINVAL;
	}

	svm_range_reserve_bos(&ctx);

	p = container_of(prange->svms, struct kfd_process, svms);
	owner = kfd_svm_page_owner(p, find_first_bit(ctx.bitmap,
						     MAX_GPU_INSTANCE));
	for_each_set_bit(idx, ctx.bitmap, MAX_GPU_INSTANCE) {
		if (kfd_svm_page_owner(p, idx) != owner) {
			owner = NULL;
			break;
		}
	}

	start = prange->start << PAGE_SHIFT;
	end = (prange->last + 1) << PAGE_SHIFT;
	for (addr = start; addr < end && !r; ) {
		struct hmm_range *hmm_range;
		struct vm_area_struct *vma;
		unsigned long next;
		unsigned long offset;
		unsigned long npages;
		bool readonly;

		vma = find_vma(mm, addr);
		if (!vma || addr < vma->vm_start) {
			r = -EFAULT;
			goto unreserve_out;
		}
		readonly = !(vma->vm_flags & VM_WRITE);

		next = min(vma->vm_end, end);
		npages = (next - addr) >> PAGE_SHIFT;
		WRITE_ONCE(p->svms.faulting_task, current);
		r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL,
					       addr, npages, &hmm_range,
					       readonly, true, owner);
		WRITE_ONCE(p->svms.faulting_task, NULL);
		if (r) {
			pr_debug("failed %d to get svm range pages\n", r);
			goto unreserve_out;
		}

		offset = (addr - start) >> PAGE_SHIFT;
		r = svm_range_dma_map(prange, ctx.bitmap, offset, npages,
				      hmm_range->hmm_pfns);
		if (r) {
			pr_debug("failed %d to dma map range\n", r);
			goto unreserve_out;
		}

		svm_range_lock(prange);
		if (amdgpu_hmm_range_get_pages_done(hmm_range)) {
			pr_debug("hmm update the range, need validate again\n");
			r = -EAGAIN;
			goto unlock_out;
		}
		if (!list_empty(&prange->child_list)) {
			pr_debug("range split by unmap in parallel, validate again\n");
			r = -EAGAIN;
			goto unlock_out;
		}

		r = svm_range_map_to_gpus(prange, offset, npages, readonly,
					  ctx.bitmap, wait);

unlock_out:
		svm_range_unlock(prange);

		addr = next;
	}

	if (addr == end)
		prange->validated_once = true;

unreserve_out:
	svm_range_unreserve_bos(&ctx);

	if (!r)
		prange->validate_timestamp = ktime_to_us(ktime_get());

	return r;
}

/**
 * svm_range_list_lock_and_flush_work - flush pending deferred work
 *
 * @svms: the svm range list
 * @mm: the mm structure
 *
 * Context: Returns with mmap write lock held, pending deferred work flushed
 *
 */
void
svm_range_list_lock_and_flush_work(struct svm_range_list *svms,
				   struct mm_struct *mm)
{
retry_flush_work:
	flush_work(&svms->deferred_list_work);
	mmap_write_lock(mm);

	if (list_empty(&svms->deferred_range_list))
		return;
	mmap_write_unlock(mm);
	pr_debug("retry flush\n");
	goto retry_flush_work;
}

static void svm_range_restore_work(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct svm_range_list *svms;
	struct svm_range *prange;
	struct kfd_process *p;
	struct mm_struct *mm;
	int evicted_ranges;
	int invalid;
	int r;

	svms = container_of(dwork, struct svm_range_list, restore_work);
	evicted_ranges = atomic_read(&svms->evicted_ranges);
	if (!evicted_ranges)
		return;

	pr_debug("restore svm ranges\n");

	p = container_of(svms, struct kfd_process, svms);

	/* Keep mm reference when svm_range_validate_and_map ranges */
	mm = get_task_mm(p->lead_thread);
	if (!mm) {
		pr_debug("svms 0x%p process mm gone\n", svms);
		return;
	}

	svm_range_list_lock_and_flush_work(svms, mm);
	mutex_lock(&svms->lock);

	evicted_ranges = atomic_read(&svms->evicted_ranges);

	list_for_each_entry(prange, &svms->list, list) {
		invalid = atomic_read(&prange->invalid);
		if (!invalid)
			continue;

		pr_debug("restoring svms 0x%p prange 0x%p [0x%lx %lx] inv %d\n",
			 prange->svms, prange, prange->start, prange->last,
			 invalid);

		/*
		 * If the range is migrating, wait for the migration to finish.
		 */
		mutex_lock(&prange->migrate_mutex);

		r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
					       false, true);
		if (r)
			pr_debug("failed %d to map 0x%lx to gpus\n", r,
				 prange->start);

		mutex_unlock(&prange->migrate_mutex);
		if (r)
			goto out_reschedule;

		if (atomic_cmpxchg(&prange->invalid, invalid, 0) != invalid)
			goto out_reschedule;
	}

	if (atomic_cmpxchg(&svms->evicted_ranges, evicted_ranges, 0) !=
	    evicted_ranges)
		goto out_reschedule;

	evicted_ranges = 0;

	r = kgd2kfd_resume_mm(mm);
	if (r) {
		/* No recovery from this failure. Probably the CP is
		 * hanging. No point trying again.
		 */
		pr_debug("failed %d to resume KFD\n", r);
	}

	pr_debug("restore svm ranges successfully\n");

out_reschedule:
	mutex_unlock(&svms->lock);
	mmap_write_unlock(mm);
	mmput(mm);

	/* If validation failed, reschedule another attempt */
	if (evicted_ranges) {
		pr_debug("reschedule to restore svm range\n");
		schedule_delayed_work(&svms->restore_work,
			msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
	}
}

/**
 * svm_range_evict - evict svm range
 * @prange: svm range structure
 * @mm: current process mm_struct
 * @start: first page of the invalidated range, in pages
 * @last: last page of the invalidated range, in pages
 *
 * Stop all queues of the process to ensure GPU doesn't access the memory, then
 * return to let CPU evict the buffer and proceed CPU pagetable update.
 *
 * No lock is needed to sync CPU pagetable invalidation with GPU execution.
 * If invalidation happens while restore work is running, restore work will
 * restart to ensure it gets the latest CPU pages mapped to GPU, then start
 * the queues.
 */
static int
svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
		unsigned long start, unsigned long last)
{
	struct svm_range_list *svms = prange->svms;
	struct svm_range *pchild;
	struct kfd_process *p;
	int r = 0;

	p = container_of(svms, struct kfd_process, svms);

	pr_debug("invalidate svms 0x%p prange [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
		 svms, prange->start, prange->last, start, last);

	if (!p->xnack_enabled) {
		int evicted_ranges;

		list_for_each_entry(pchild, &prange->child_list, child_list) {
			mutex_lock_nested(&pchild->lock, 1);
			if (pchild->start <= last && pchild->last >= start) {
				pr_debug("increment pchild invalid [0x%lx 0x%lx]\n",
					 pchild->start, pchild->last);
				atomic_inc(&pchild->invalid);
			}
			mutex_unlock(&pchild->lock);
		}

		if (prange->start <= last && prange->last >= start)
			atomic_inc(&prange->invalid);

		evicted_ranges = atomic_inc_return(&svms->evicted_ranges);
		if (evicted_ranges != 1)
			return r;

		pr_debug("evicting svms 0x%p range [0x%lx 0x%lx]\n",
			 prange->svms, prange->start, prange->last);

		/* First eviction, stop the queues */
		r = kgd2kfd_quiesce_mm(mm);
		if (r)
			pr_debug("failed to quiesce KFD\n");

		pr_debug("schedule to restore svm %p ranges\n", svms);
		schedule_delayed_work(&svms->restore_work,
			msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
	} else {
		unsigned long s, l;

		pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n",
			 prange->svms, start, last);
		list_for_each_entry(pchild, &prange->child_list, child_list) {
			mutex_lock_nested(&pchild->lock, 1);
			s = max(start, pchild->start);
			l = min(last, pchild->last);
			if (l >= s)
				svm_range_unmap_from_gpus(pchild, s, l);
			mutex_unlock(&pchild->lock);
		}
		s = max(start, prange->start);
		l = min(last, prange->last);
		if (l >= s)
			svm_range_unmap_from_gpus(prange, s, l);
	}

	return r;
}

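/* Duplicate a range (attributes, access bitmaps and any VRAM BO reference)
 * so that svm_range_add can update the copy transactionally while the
 * original stays unchanged until the whole update succeeds.
 */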
1698 */ 1699 pr_debug("failed %d to resume KFD\n", r); 1700 } 1701 1702 pr_debug("restore svm ranges successfully\n"); 1703 1704 out_reschedule: 1705 mutex_unlock(&svms->lock); 1706 mmap_write_unlock(mm); 1707 mmput(mm); 1708 1709 /* If validation failed, reschedule another attempt */ 1710 if (evicted_ranges) { 1711 pr_debug("reschedule to restore svm range\n"); 1712 schedule_delayed_work(&svms->restore_work, 1713 msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS)); 1714 } 1715 } 1716 1717 /** 1718 * svm_range_evict - evict svm range 1719 * @prange: svm range structure 1720 * @mm: current process mm_struct 1721 * @start: starting process queue number 1722 * @last: last process queue number 1723 * 1724 * Stop all queues of the process to ensure GPU doesn't access the memory, then 1725 * return to let CPU evict the buffer and proceed CPU pagetable update. 1726 * 1727 * Don't need use lock to sync cpu pagetable invalidation with GPU execution. 1728 * If invalidation happens while restore work is running, restore work will 1729 * restart to ensure to get the latest CPU pages mapping to GPU, then start 1730 * the queues. 1731 */ 1732 static int 1733 svm_range_evict(struct svm_range *prange, struct mm_struct *mm, 1734 unsigned long start, unsigned long last) 1735 { 1736 struct svm_range_list *svms = prange->svms; 1737 struct svm_range *pchild; 1738 struct kfd_process *p; 1739 int r = 0; 1740 1741 p = container_of(svms, struct kfd_process, svms); 1742 1743 pr_debug("invalidate svms 0x%p prange [0x%lx 0x%lx] [0x%lx 0x%lx]\n", 1744 svms, prange->start, prange->last, start, last); 1745 1746 if (!p->xnack_enabled) { 1747 int evicted_ranges; 1748 1749 list_for_each_entry(pchild, &prange->child_list, child_list) { 1750 mutex_lock_nested(&pchild->lock, 1); 1751 if (pchild->start <= last && pchild->last >= start) { 1752 pr_debug("increment pchild invalid [0x%lx 0x%lx]\n", 1753 pchild->start, pchild->last); 1754 atomic_inc(&pchild->invalid); 1755 } 1756 mutex_unlock(&pchild->lock); 1757 } 1758 1759 if (prange->start <= last && prange->last >= start) 1760 atomic_inc(&prange->invalid); 1761 1762 evicted_ranges = atomic_inc_return(&svms->evicted_ranges); 1763 if (evicted_ranges != 1) 1764 return r; 1765 1766 pr_debug("evicting svms 0x%p range [0x%lx 0x%lx]\n", 1767 prange->svms, prange->start, prange->last); 1768 1769 /* First eviction, stop the queues */ 1770 r = kgd2kfd_quiesce_mm(mm); 1771 if (r) 1772 pr_debug("failed to quiesce KFD\n"); 1773 1774 pr_debug("schedule to restore svm %p ranges\n", svms); 1775 schedule_delayed_work(&svms->restore_work, 1776 msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS)); 1777 } else { 1778 unsigned long s, l; 1779 1780 pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n", 1781 prange->svms, start, last); 1782 list_for_each_entry(pchild, &prange->child_list, child_list) { 1783 mutex_lock_nested(&pchild->lock, 1); 1784 s = max(start, pchild->start); 1785 l = min(last, pchild->last); 1786 if (l >= s) 1787 svm_range_unmap_from_gpus(pchild, s, l); 1788 mutex_unlock(&pchild->lock); 1789 } 1790 s = max(start, prange->start); 1791 l = min(last, prange->last); 1792 if (l >= s) 1793 svm_range_unmap_from_gpus(prange, s, l); 1794 } 1795 1796 return r; 1797 } 1798 1799 static struct svm_range *svm_range_clone(struct svm_range *old) 1800 { 1801 struct svm_range *new; 1802 1803 new = svm_range_new(old->svms, old->start, old->last); 1804 if (!new) 1805 return NULL; 1806 1807 if (old->svm_bo) { 1808 new->ttm_res = old->ttm_res; 1809 new->offset = old->offset; 1810 
new->svm_bo = svm_range_bo_ref(old->svm_bo); 1811 spin_lock(&new->svm_bo->list_lock); 1812 list_add(&new->svm_bo_list, &new->svm_bo->range_list); 1813 spin_unlock(&new->svm_bo->list_lock); 1814 } 1815 new->flags = old->flags; 1816 new->preferred_loc = old->preferred_loc; 1817 new->prefetch_loc = old->prefetch_loc; 1818 new->actual_loc = old->actual_loc; 1819 new->granularity = old->granularity; 1820 bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); 1821 bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE); 1822 1823 return new; 1824 } 1825 1826 /** 1827 * svm_range_add - add svm range and handle overlap 1828 * @p: the range add to this process svms 1829 * @start: page size aligned 1830 * @size: page size aligned 1831 * @nattr: number of attributes 1832 * @attrs: array of attributes 1833 * @update_list: output, the ranges need validate and update GPU mapping 1834 * @insert_list: output, the ranges need insert to svms 1835 * @remove_list: output, the ranges are replaced and need remove from svms 1836 * 1837 * Check if the virtual address range has overlap with any existing ranges, 1838 * split partly overlapping ranges and add new ranges in the gaps. All changes 1839 * should be applied to the range_list and interval tree transactionally. If 1840 * any range split or allocation fails, the entire update fails. Therefore any 1841 * existing overlapping svm_ranges are cloned and the original svm_ranges left 1842 * unchanged. 1843 * 1844 * If the transaction succeeds, the caller can update and insert clones and 1845 * new ranges, then free the originals. 1846 * 1847 * Otherwise the caller can free the clones and new ranges, while the old 1848 * svm_ranges remain unchanged. 1849 * 1850 * Context: Process context, caller must hold svms->lock 1851 * 1852 * Return: 1853 * 0 - OK, otherwise error code 1854 */ 1855 static int 1856 svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size, 1857 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs, 1858 struct list_head *update_list, struct list_head *insert_list, 1859 struct list_head *remove_list) 1860 { 1861 unsigned long last = start + size - 1UL; 1862 struct svm_range_list *svms = &p->svms; 1863 struct interval_tree_node *node; 1864 struct svm_range *prange; 1865 struct svm_range *tmp; 1866 int r = 0; 1867 1868 pr_debug("svms 0x%p [0x%llx 0x%lx]\n", &p->svms, start, last); 1869 1870 INIT_LIST_HEAD(update_list); 1871 INIT_LIST_HEAD(insert_list); 1872 INIT_LIST_HEAD(remove_list); 1873 1874 node = interval_tree_iter_first(&svms->objects, start, last); 1875 while (node) { 1876 struct interval_tree_node *next; 1877 unsigned long next_start; 1878 1879 pr_debug("found overlap node [0x%lx 0x%lx]\n", node->start, 1880 node->last); 1881 1882 prange = container_of(node, struct svm_range, it_node); 1883 next = interval_tree_iter_next(node, start, last); 1884 next_start = min(node->last, last) + 1; 1885 1886 if (svm_range_is_same_attrs(p, prange, nattr, attrs)) { 1887 /* nothing to do */ 1888 } else if (node->start < start || node->last > last) { 1889 /* node intersects the update range and its attributes 1890 * will change. 
Clone and split it, apply updates only 1891 * to the overlapping part 1892 */ 1893 struct svm_range *old = prange; 1894 1895 prange = svm_range_clone(old); 1896 if (!prange) { 1897 r = -ENOMEM; 1898 goto out; 1899 } 1900 1901 list_add(&old->update_list, remove_list); 1902 list_add(&prange->list, insert_list); 1903 list_add(&prange->update_list, update_list); 1904 1905 if (node->start < start) { 1906 pr_debug("change old range start\n"); 1907 r = svm_range_split_head(prange, start, 1908 insert_list); 1909 if (r) 1910 goto out; 1911 } 1912 if (node->last > last) { 1913 pr_debug("change old range last\n"); 1914 r = svm_range_split_tail(prange, last, 1915 insert_list); 1916 if (r) 1917 goto out; 1918 } 1919 } else { 1920 /* The node is contained within start..last, 1921 * just update it 1922 */ 1923 list_add(&prange->update_list, update_list); 1924 } 1925 1926 /* insert a new node if needed */ 1927 if (node->start > start) { 1928 prange = svm_range_new(svms, start, node->start - 1); 1929 if (!prange) { 1930 r = -ENOMEM; 1931 goto out; 1932 } 1933 1934 list_add(&prange->list, insert_list); 1935 list_add(&prange->update_list, update_list); 1936 } 1937 1938 node = next; 1939 start = next_start; 1940 } 1941 1942 /* add a final range at the end if needed */ 1943 if (start <= last) { 1944 prange = svm_range_new(svms, start, last); 1945 if (!prange) { 1946 r = -ENOMEM; 1947 goto out; 1948 } 1949 list_add(&prange->list, insert_list); 1950 list_add(&prange->update_list, update_list); 1951 } 1952 1953 out: 1954 if (r) 1955 list_for_each_entry_safe(prange, tmp, insert_list, list) 1956 svm_range_free(prange); 1957 1958 return r; 1959 } 1960 1961 static void 1962 svm_range_update_notifier_and_interval_tree(struct mm_struct *mm, 1963 struct svm_range *prange) 1964 { 1965 unsigned long start; 1966 unsigned long last; 1967 1968 start = prange->notifier.interval_tree.start >> PAGE_SHIFT; 1969 last = prange->notifier.interval_tree.last >> PAGE_SHIFT; 1970 1971 if (prange->start == start && prange->last == last) 1972 return; 1973 1974 pr_debug("up notifier 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", 1975 prange->svms, prange, start, last, prange->start, 1976 prange->last); 1977 1978 if (start != 0 && last != 0) { 1979 interval_tree_remove(&prange->it_node, &prange->svms->objects); 1980 svm_range_remove_notifier(prange); 1981 } 1982 prange->it_node.start = prange->start; 1983 prange->it_node.last = prange->last; 1984 1985 interval_tree_insert(&prange->it_node, &prange->svms->objects); 1986 svm_range_add_notifier_locked(mm, prange); 1987 } 1988 1989 static void 1990 svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange, 1991 struct mm_struct *mm) 1992 { 1993 switch (prange->work_item.op) { 1994 case SVM_OP_NULL: 1995 pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n", 1996 svms, prange, prange->start, prange->last); 1997 break; 1998 case SVM_OP_UNMAP_RANGE: 1999 pr_debug("remove 0x%p prange 0x%p [0x%lx 0x%lx]\n", 2000 svms, prange, prange->start, prange->last); 2001 svm_range_unlink(prange); 2002 svm_range_remove_notifier(prange); 2003 svm_range_free(prange); 2004 break; 2005 case SVM_OP_UPDATE_RANGE_NOTIFIER: 2006 pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n", 2007 svms, prange, prange->start, prange->last); 2008 svm_range_update_notifier_and_interval_tree(mm, prange); 2009 break; 2010 case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP: 2011 pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", 2012 svms, prange, prange->start, prange->last); 2013 
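		/* Refresh the interval tree node and MMU notifier so they
		 * match the range's new start/last.
		 */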
svm_range_update_notifier_and_interval_tree(mm, prange); 2014 /* TODO: implement deferred validation and mapping */ 2015 break; 2016 case SVM_OP_ADD_RANGE: 2017 pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange, 2018 prange->start, prange->last); 2019 svm_range_add_to_svms(prange); 2020 svm_range_add_notifier_locked(mm, prange); 2021 break; 2022 case SVM_OP_ADD_RANGE_AND_MAP: 2023 pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, 2024 prange, prange->start, prange->last); 2025 svm_range_add_to_svms(prange); 2026 svm_range_add_notifier_locked(mm, prange); 2027 /* TODO: implement deferred validation and mapping */ 2028 break; 2029 default: 2030 WARN_ONCE(1, "Unknown prange 0x%p work op %d\n", prange, 2031 prange->work_item.op); 2032 } 2033 } 2034 2035 static void svm_range_drain_retry_fault(struct svm_range_list *svms) 2036 { 2037 struct kfd_process_device *pdd; 2038 struct kfd_process *p; 2039 int drain; 2040 uint32_t i; 2041 2042 p = container_of(svms, struct kfd_process, svms); 2043 2044 restart: 2045 drain = atomic_read(&svms->drain_pagefaults); 2046 if (!drain) 2047 return; 2048 2049 for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) { 2050 pdd = p->pdds[i]; 2051 if (!pdd) 2052 continue; 2053 2054 pr_debug("drain retry fault gpu %d svms %p\n", i, svms); 2055 2056 amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev, 2057 &pdd->dev->adev->irq.ih1); 2058 pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms); 2059 } 2060 if (atomic_cmpxchg(&svms->drain_pagefaults, drain, 0) != drain) 2061 goto restart; 2062 } 2063 2064 static void svm_range_deferred_list_work(struct work_struct *work) 2065 { 2066 struct svm_range_list *svms; 2067 struct svm_range *prange; 2068 struct mm_struct *mm; 2069 2070 svms = container_of(work, struct svm_range_list, deferred_list_work); 2071 pr_debug("enter svms 0x%p\n", svms); 2072 2073 spin_lock(&svms->deferred_list_lock); 2074 while (!list_empty(&svms->deferred_range_list)) { 2075 prange = list_first_entry(&svms->deferred_range_list, 2076 struct svm_range, deferred_list); 2077 spin_unlock(&svms->deferred_list_lock); 2078 2079 pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange, 2080 prange->start, prange->last, prange->work_item.op); 2081 2082 mm = prange->work_item.mm; 2083 retry: 2084 mmap_write_lock(mm); 2085 2086 /* Checking for the need to drain retry faults must be inside 2087 * mmap write lock to serialize with munmap notifiers. 2088 */ 2089 if (unlikely(atomic_read(&svms->drain_pagefaults))) { 2090 mmap_write_unlock(mm); 2091 svm_range_drain_retry_fault(svms); 2092 goto retry; 2093 } 2094 2095 /* Remove from deferred_list must be inside mmap write lock, for 2096 * two race cases: 2097 * 1. unmap_from_cpu may change work_item.op and add the range 2098 * to deferred_list again, cause use after free bug. 2099 * 2. svm_range_list_lock_and_flush_work may hold mmap write 2100 * lock and continue because deferred_list is empty, but 2101 * deferred_list work is actually waiting for mmap lock. 
2102 */ 2103 spin_lock(&svms->deferred_list_lock); 2104 list_del_init(&prange->deferred_list); 2105 spin_unlock(&svms->deferred_list_lock); 2106 2107 mutex_lock(&svms->lock); 2108 mutex_lock(&prange->migrate_mutex); 2109 while (!list_empty(&prange->child_list)) { 2110 struct svm_range *pchild; 2111 2112 pchild = list_first_entry(&prange->child_list, 2113 struct svm_range, child_list); 2114 pr_debug("child prange 0x%p op %d\n", pchild, 2115 pchild->work_item.op); 2116 list_del_init(&pchild->child_list); 2117 svm_range_handle_list_op(svms, pchild, mm); 2118 } 2119 mutex_unlock(&prange->migrate_mutex); 2120 2121 svm_range_handle_list_op(svms, prange, mm); 2122 mutex_unlock(&svms->lock); 2123 mmap_write_unlock(mm); 2124 2125 /* Pairs with mmget in svm_range_add_list_work */ 2126 mmput(mm); 2127 2128 spin_lock(&svms->deferred_list_lock); 2129 } 2130 spin_unlock(&svms->deferred_list_lock); 2131 pr_debug("exit svms 0x%p\n", svms); 2132 } 2133 2134 void 2135 svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange, 2136 struct mm_struct *mm, enum svm_work_list_ops op) 2137 { 2138 spin_lock(&svms->deferred_list_lock); 2139 /* if prange is on the deferred list */ 2140 if (!list_empty(&prange->deferred_list)) { 2141 pr_debug("update exist prange 0x%p work op %d\n", prange, op); 2142 WARN_ONCE(prange->work_item.mm != mm, "unmatch mm\n"); 2143 if (op != SVM_OP_NULL && 2144 prange->work_item.op != SVM_OP_UNMAP_RANGE) 2145 prange->work_item.op = op; 2146 } else { 2147 prange->work_item.op = op; 2148 2149 /* Pairs with mmput in deferred_list_work */ 2150 mmget(mm); 2151 prange->work_item.mm = mm; 2152 list_add_tail(&prange->deferred_list, 2153 &prange->svms->deferred_range_list); 2154 pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n", 2155 prange, prange->start, prange->last, op); 2156 } 2157 spin_unlock(&svms->deferred_list_lock); 2158 } 2159 2160 void schedule_deferred_list_work(struct svm_range_list *svms) 2161 { 2162 spin_lock(&svms->deferred_list_lock); 2163 if (!list_empty(&svms->deferred_range_list)) 2164 schedule_work(&svms->deferred_list_work); 2165 spin_unlock(&svms->deferred_list_lock); 2166 } 2167 2168 static void 2169 svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent, 2170 struct svm_range *prange, unsigned long start, 2171 unsigned long last) 2172 { 2173 struct svm_range *head; 2174 struct svm_range *tail; 2175 2176 if (prange->work_item.op == SVM_OP_UNMAP_RANGE) { 2177 pr_debug("prange 0x%p [0x%lx 0x%lx] is already freed\n", prange, 2178 prange->start, prange->last); 2179 return; 2180 } 2181 if (start > prange->last || last < prange->start) 2182 return; 2183 2184 head = tail = prange; 2185 if (start > prange->start) 2186 svm_range_split(prange, prange->start, start - 1, &tail); 2187 if (last < tail->last) 2188 svm_range_split(tail, last + 1, tail->last, &head); 2189 2190 if (head != prange && tail != prange) { 2191 svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); 2192 svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE); 2193 } else if (tail != prange) { 2194 svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE); 2195 } else if (head != prange) { 2196 svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); 2197 } else if (parent != prange) { 2198 prange->work_item.op = SVM_OP_UNMAP_RANGE; 2199 } 2200 } 2201 2202 static void 2203 svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange, 2204 unsigned long start, unsigned long last) 2205 { 2206 struct svm_range_list *svms; 2207 struct svm_range *pchild; 
	struct kfd_process *p;
	unsigned long s, l;
	bool unmap_parent;

	p = kfd_lookup_process_by_mm(mm);
	if (!p)
		return;
	svms = &p->svms;

	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms,
		 prange, prange->start, prange->last, start, last);

	/* Make sure pending page faults are drained in the deferred worker
	 * before the range is freed to avoid straggler interrupts on
	 * unmapped memory causing "phantom faults".
	 */
	atomic_inc(&svms->drain_pagefaults);

	unmap_parent = start <= prange->start && last >= prange->last;

	list_for_each_entry(pchild, &prange->child_list, child_list) {
		mutex_lock_nested(&pchild->lock, 1);
		s = max(start, pchild->start);
		l = min(last, pchild->last);
		if (l >= s)
			svm_range_unmap_from_gpus(pchild, s, l);
		svm_range_unmap_split(mm, prange, pchild, start, last);
		mutex_unlock(&pchild->lock);
	}
	s = max(start, prange->start);
	l = min(last, prange->last);
	if (l >= s)
		svm_range_unmap_from_gpus(prange, s, l);
	svm_range_unmap_split(mm, prange, prange, start, last);

	if (unmap_parent)
		svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE);
	else
		svm_range_add_list_work(svms, prange, mm,
					SVM_OP_UPDATE_RANGE_NOTIFIER);
	schedule_deferred_list_work(svms);

	kfd_unref_process(p);
}

/**
 * svm_range_cpu_invalidate_pagetables - interval notifier callback
 * @mni: mmu_interval_notifier struct
 * @range: mmu_notifier_range struct
 * @cur_seq: value to pass to mmu_interval_set_seq()
 *
 * If the event is MMU_NOTIFY_UNMAP, this is a CPU unmap of the range;
 * otherwise it comes from migration or from a CPU page invalidation callback.
 *
 * For an unmap event, unmap the range from the GPUs and remove the prange
 * from svms in the deferred work thread, splitting the prange first if only
 * part of it is unmapped.
 *
 * For an invalidation event, if GPU retry faults are not enabled, evict the
 * queues, then schedule svm_range_restore_work to update the GPU mapping and
 * resume the queues. If GPU retry faults are enabled, unmap the svm range
 * from the GPU; the retry fault handler will update the GPU mapping to
 * recover.
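 *
 * MMU_NOTIFY_RELEASE events are ignored here; the ranges are cleaned up by
 * svm_range_list_fini when the process is destroyed.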
2269 * 2270 * Context: mmap lock, notifier_invalidate_start lock are held 2271 * for invalidate event, prange lock is held if this is from migration 2272 */ 2273 static bool 2274 svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, 2275 const struct mmu_notifier_range *range, 2276 unsigned long cur_seq) 2277 { 2278 struct svm_range *prange; 2279 unsigned long start; 2280 unsigned long last; 2281 2282 if (range->event == MMU_NOTIFY_RELEASE) 2283 return true; 2284 2285 start = mni->interval_tree.start; 2286 last = mni->interval_tree.last; 2287 start = max(start, range->start) >> PAGE_SHIFT; 2288 last = min(last, range->end - 1) >> PAGE_SHIFT; 2289 pr_debug("[0x%lx 0x%lx] range[0x%lx 0x%lx] notifier[0x%lx 0x%lx] %d\n", 2290 start, last, range->start >> PAGE_SHIFT, 2291 (range->end - 1) >> PAGE_SHIFT, 2292 mni->interval_tree.start >> PAGE_SHIFT, 2293 mni->interval_tree.last >> PAGE_SHIFT, range->event); 2294 2295 prange = container_of(mni, struct svm_range, notifier); 2296 2297 svm_range_lock(prange); 2298 mmu_interval_set_seq(mni, cur_seq); 2299 2300 switch (range->event) { 2301 case MMU_NOTIFY_UNMAP: 2302 svm_range_unmap_from_cpu(mni->mm, prange, start, last); 2303 break; 2304 default: 2305 svm_range_evict(prange, mni->mm, start, last); 2306 break; 2307 } 2308 2309 svm_range_unlock(prange); 2310 2311 return true; 2312 } 2313 2314 /** 2315 * svm_range_from_addr - find svm range from fault address 2316 * @svms: svm range list header 2317 * @addr: address to search range interval tree, in pages 2318 * @parent: parent range if range is on child list 2319 * 2320 * Context: The caller must hold svms->lock 2321 * 2322 * Return: the svm_range found or NULL 2323 */ 2324 struct svm_range * 2325 svm_range_from_addr(struct svm_range_list *svms, unsigned long addr, 2326 struct svm_range **parent) 2327 { 2328 struct interval_tree_node *node; 2329 struct svm_range *prange; 2330 struct svm_range *pchild; 2331 2332 node = interval_tree_iter_first(&svms->objects, addr, addr); 2333 if (!node) 2334 return NULL; 2335 2336 prange = container_of(node, struct svm_range, it_node); 2337 pr_debug("address 0x%lx prange [0x%lx 0x%lx] node [0x%lx 0x%lx]\n", 2338 addr, prange->start, prange->last, node->start, node->last); 2339 2340 if (addr >= prange->start && addr <= prange->last) { 2341 if (parent) 2342 *parent = prange; 2343 return prange; 2344 } 2345 list_for_each_entry(pchild, &prange->child_list, child_list) 2346 if (addr >= pchild->start && addr <= pchild->last) { 2347 pr_debug("found address 0x%lx pchild [0x%lx 0x%lx]\n", 2348 addr, pchild->start, pchild->last); 2349 if (parent) 2350 *parent = prange; 2351 return pchild; 2352 } 2353 2354 return NULL; 2355 } 2356 2357 /* svm_range_best_restore_location - decide the best fault restore location 2358 * @prange: svm range structure 2359 * @adev: the GPU on which vm fault happened 2360 * 2361 * This is only called when xnack is on, to decide the best location to restore 2362 * the range mapping after GPU vm fault. Caller uses the best location to do 2363 * migration if actual loc is not best location, then update GPU page table 2364 * mapping to the best location. 2365 * 2366 * If the preferred loc is accessible by faulting GPU, use preferred loc. 
2367 * If vm fault gpu idx is on range ACCESSIBLE bitmap, best_loc is vm fault gpu 2368 * If vm fault gpu idx is on range ACCESSIBLE_IN_PLACE bitmap, then 2369 * if range actual loc is cpu, best_loc is cpu 2370 * if vm fault gpu is on xgmi same hive of range actual loc gpu, best_loc is 2371 * range actual loc. 2372 * Otherwise, GPU no access, best_loc is -1. 2373 * 2374 * Return: 2375 * -1 means vm fault GPU no access 2376 * 0 for CPU or GPU id 2377 */ 2378 static int32_t 2379 svm_range_best_restore_location(struct svm_range *prange, 2380 struct amdgpu_device *adev, 2381 int32_t *gpuidx) 2382 { 2383 struct amdgpu_device *bo_adev, *preferred_adev; 2384 struct kfd_process *p; 2385 uint32_t gpuid; 2386 int r; 2387 2388 p = container_of(prange->svms, struct kfd_process, svms); 2389 2390 r = kfd_process_gpuid_from_adev(p, adev, &gpuid, gpuidx); 2391 if (r < 0) { 2392 pr_debug("failed to get gpuid from kgd\n"); 2393 return -1; 2394 } 2395 2396 if (prange->preferred_loc == gpuid || 2397 prange->preferred_loc == KFD_IOCTL_SVM_LOCATION_SYSMEM) { 2398 return prange->preferred_loc; 2399 } else if (prange->preferred_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED) { 2400 preferred_adev = svm_range_get_adev_by_id(prange, 2401 prange->preferred_loc); 2402 if (amdgpu_xgmi_same_hive(adev, preferred_adev)) 2403 return prange->preferred_loc; 2404 /* fall through */ 2405 } 2406 2407 if (test_bit(*gpuidx, prange->bitmap_access)) 2408 return gpuid; 2409 2410 if (test_bit(*gpuidx, prange->bitmap_aip)) { 2411 if (!prange->actual_loc) 2412 return 0; 2413 2414 bo_adev = svm_range_get_adev_by_id(prange, prange->actual_loc); 2415 if (amdgpu_xgmi_same_hive(adev, bo_adev)) 2416 return prange->actual_loc; 2417 else 2418 return 0; 2419 } 2420 2421 return -1; 2422 } 2423 2424 static int 2425 svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr, 2426 unsigned long *start, unsigned long *last, 2427 bool *is_heap_stack) 2428 { 2429 struct vm_area_struct *vma; 2430 struct interval_tree_node *node; 2431 unsigned long start_limit, end_limit; 2432 2433 vma = find_vma(p->mm, addr << PAGE_SHIFT); 2434 if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) { 2435 pr_debug("VMA does not exist in address [0x%llx]\n", addr); 2436 return -EFAULT; 2437 } 2438 2439 *is_heap_stack = (vma->vm_start <= vma->vm_mm->brk && 2440 vma->vm_end >= vma->vm_mm->start_brk) || 2441 (vma->vm_start <= vma->vm_mm->start_stack && 2442 vma->vm_end >= vma->vm_mm->start_stack); 2443 2444 start_limit = max(vma->vm_start >> PAGE_SHIFT, 2445 (unsigned long)ALIGN_DOWN(addr, 2UL << 8)); 2446 end_limit = min(vma->vm_end >> PAGE_SHIFT, 2447 (unsigned long)ALIGN(addr + 1, 2UL << 8)); 2448 /* First range that starts after the fault address */ 2449 node = interval_tree_iter_first(&p->svms.objects, addr + 1, ULONG_MAX); 2450 if (node) { 2451 end_limit = min(end_limit, node->start); 2452 /* Last range that ends before the fault address */ 2453 node = container_of(rb_prev(&node->rb), 2454 struct interval_tree_node, rb); 2455 } else { 2456 /* Last range must end before addr because 2457 * there was no range after addr 2458 */ 2459 node = container_of(rb_last(&p->svms.objects.rb_root), 2460 struct interval_tree_node, rb); 2461 } 2462 if (node) { 2463 if (node->last >= addr) { 2464 WARN(1, "Overlap with prev node and page fault addr\n"); 2465 return -EFAULT; 2466 } 2467 start_limit = max(start_limit, node->last + 1); 2468 } 2469 2470 *start = start_limit; 2471 *last = end_limit - 1; 2472 2473 pr_debug("vma [0x%lx 0x%lx] range [0x%lx 0x%lx] is_heap_stack %d\n", 2474 
vma->vm_start >> PAGE_SHIFT, vma->vm_end >> PAGE_SHIFT, 2475 *start, *last, *is_heap_stack); 2476 2477 return 0; 2478 } 2479 2480 static int 2481 svm_range_check_vm_userptr(struct kfd_process *p, uint64_t start, uint64_t last, 2482 uint64_t *bo_s, uint64_t *bo_l) 2483 { 2484 struct amdgpu_bo_va_mapping *mapping; 2485 struct interval_tree_node *node; 2486 struct amdgpu_bo *bo = NULL; 2487 unsigned long userptr; 2488 uint32_t i; 2489 int r; 2490 2491 for (i = 0; i < p->n_pdds; i++) { 2492 struct amdgpu_vm *vm; 2493 2494 if (!p->pdds[i]->drm_priv) 2495 continue; 2496 2497 vm = drm_priv_to_vm(p->pdds[i]->drm_priv); 2498 r = amdgpu_bo_reserve(vm->root.bo, false); 2499 if (r) 2500 return r; 2501 2502 /* Check userptr by searching entire vm->va interval tree */ 2503 node = interval_tree_iter_first(&vm->va, 0, ~0ULL); 2504 while (node) { 2505 mapping = container_of((struct rb_node *)node, 2506 struct amdgpu_bo_va_mapping, rb); 2507 bo = mapping->bo_va->base.bo; 2508 2509 if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, 2510 start << PAGE_SHIFT, 2511 last << PAGE_SHIFT, 2512 &userptr)) { 2513 node = interval_tree_iter_next(node, 0, ~0ULL); 2514 continue; 2515 } 2516 2517 pr_debug("[0x%llx 0x%llx] already userptr mapped\n", 2518 start, last); 2519 if (bo_s && bo_l) { 2520 *bo_s = userptr >> PAGE_SHIFT; 2521 *bo_l = *bo_s + bo->tbo.ttm->num_pages - 1; 2522 } 2523 amdgpu_bo_unreserve(vm->root.bo); 2524 return -EADDRINUSE; 2525 } 2526 amdgpu_bo_unreserve(vm->root.bo); 2527 } 2528 return 0; 2529 } 2530 2531 static struct 2532 svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev, 2533 struct kfd_process *p, 2534 struct mm_struct *mm, 2535 int64_t addr) 2536 { 2537 struct svm_range *prange = NULL; 2538 unsigned long start, last; 2539 uint32_t gpuid, gpuidx; 2540 bool is_heap_stack; 2541 uint64_t bo_s = 0; 2542 uint64_t bo_l = 0; 2543 int r; 2544 2545 if (svm_range_get_range_boundaries(p, addr, &start, &last, 2546 &is_heap_stack)) 2547 return NULL; 2548 2549 r = svm_range_check_vm(p, start, last, &bo_s, &bo_l); 2550 if (r != -EADDRINUSE) 2551 r = svm_range_check_vm_userptr(p, start, last, &bo_s, &bo_l); 2552 2553 if (r == -EADDRINUSE) { 2554 if (addr >= bo_s && addr <= bo_l) 2555 return NULL; 2556 2557 /* Create one page svm range if 2MB range overlapping */ 2558 start = addr; 2559 last = addr; 2560 } 2561 2562 prange = svm_range_new(&p->svms, start, last); 2563 if (!prange) { 2564 pr_debug("Failed to create prange in address [0x%llx]\n", addr); 2565 return NULL; 2566 } 2567 if (kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpuidx)) { 2568 pr_debug("failed to get gpuid from kgd\n"); 2569 svm_range_free(prange); 2570 return NULL; 2571 } 2572 2573 if (is_heap_stack) 2574 prange->preferred_loc = KFD_IOCTL_SVM_LOCATION_SYSMEM; 2575 2576 svm_range_add_to_svms(prange); 2577 svm_range_add_notifier_locked(mm, prange); 2578 2579 return prange; 2580 } 2581 2582 /* svm_range_skip_recover - decide if prange can be recovered 2583 * @prange: svm range structure 2584 * 2585 * GPU vm retry fault handle skip recover the range for cases: 2586 * 1. prange is on deferred list to be removed after unmap, it is stale fault, 2587 * deferred list work will drain the stale fault before free the prange. 2588 * 2. prange is on deferred list to add interval notifier after split, or 2589 * 3. prange is child range, it is split from parent prange, recover later 2590 * after interval notifier is added. 
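 *
 * When recovery is skipped, the caller removes the stale fault from the GMC
 * fault filter (amdgpu_gmc_filter_faults_remove) so a later retry fault on
 * the same address is not dropped as a duplicate.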
 *
 * Return: true to skip recovery, false to recover
 */
static bool svm_range_skip_recover(struct svm_range *prange)
{
	struct svm_range_list *svms = prange->svms;

	spin_lock(&svms->deferred_list_lock);
	if (list_empty(&prange->deferred_list) &&
	    list_empty(&prange->child_list)) {
		spin_unlock(&svms->deferred_list_lock);
		return false;
	}
	spin_unlock(&svms->deferred_list_lock);

	if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
		pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] unmapped\n",
			 svms, prange, prange->start, prange->last);
		return true;
	}
	if (prange->work_item.op == SVM_OP_ADD_RANGE_AND_MAP ||
	    prange->work_item.op == SVM_OP_ADD_RANGE) {
		pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not added yet\n",
			 svms, prange, prange->start, prange->last);
		return true;
	}
	return false;
}

static void
svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p,
		      int32_t gpuidx)
{
	struct kfd_process_device *pdd;

	/* fault is on a different page of the same range,
	 * or fault is skipped to be recovered later,
	 * or fault is on an invalid virtual address
	 */
	if (gpuidx == MAX_GPU_INSTANCE) {
		uint32_t gpuid;
		int r;

		r = kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpuidx);
		if (r < 0)
			return;
	}

	/* fault is recovered,
	 * or fault cannot be recovered because the GPU has no access to the
	 * range
	 */
	pdd = kfd_process_device_from_gpuidx(p, gpuidx);
	if (pdd)
		WRITE_ONCE(pdd->faults, pdd->faults + 1);
}

static bool
svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)
{
	unsigned long requested = VM_READ;

	if (write_fault)
		requested |= VM_WRITE;

	pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested,
		 vma->vm_flags);
	return (vma->vm_flags & requested) == requested;
}

int
svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
			uint64_t addr, bool write_fault)
{
	struct mm_struct *mm = NULL;
	struct svm_range_list *svms;
	struct svm_range *prange;
	struct kfd_process *p;
	uint64_t timestamp;
	int32_t best_loc;
	int32_t gpuidx = MAX_GPU_INSTANCE;
	bool write_locked = false;
	struct vm_area_struct *vma;
	int r = 0;

	if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
		pr_debug("device does not support SVM\n");
		return -EFAULT;
	}

	p = kfd_lookup_process_by_pasid(pasid);
	if (!p) {
		pr_debug("kfd process not found for pasid 0x%x\n", pasid);
		return 0;
	}
	if (!p->xnack_enabled) {
		pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
		r = -EFAULT;
		goto out;
	}
	svms = &p->svms;

	pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);

	if (atomic_read(&svms->drain_pagefaults)) {
		pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
		r = 0;
		goto out;
	}

	/* p->lead_thread is available because kfd_process_wq_release flushes
	 * the work before releasing the task ref.
2702 */ 2703 mm = get_task_mm(p->lead_thread); 2704 if (!mm) { 2705 pr_debug("svms 0x%p failed to get mm\n", svms); 2706 r = 0; 2707 goto out; 2708 } 2709 2710 mmap_read_lock(mm); 2711 retry_write_locked: 2712 mutex_lock(&svms->lock); 2713 prange = svm_range_from_addr(svms, addr, NULL); 2714 if (!prange) { 2715 pr_debug("failed to find prange svms 0x%p address [0x%llx]\n", 2716 svms, addr); 2717 if (!write_locked) { 2718 /* Need the write lock to create new range with MMU notifier. 2719 * Also flush pending deferred work to make sure the interval 2720 * tree is up to date before we add a new range 2721 */ 2722 mutex_unlock(&svms->lock); 2723 mmap_read_unlock(mm); 2724 mmap_write_lock(mm); 2725 write_locked = true; 2726 goto retry_write_locked; 2727 } 2728 prange = svm_range_create_unregistered_range(adev, p, mm, addr); 2729 if (!prange) { 2730 pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n", 2731 svms, addr); 2732 mmap_write_downgrade(mm); 2733 r = -EFAULT; 2734 goto out_unlock_svms; 2735 } 2736 } 2737 if (write_locked) 2738 mmap_write_downgrade(mm); 2739 2740 mutex_lock(&prange->migrate_mutex); 2741 2742 if (svm_range_skip_recover(prange)) { 2743 amdgpu_gmc_filter_faults_remove(adev, addr, pasid); 2744 r = 0; 2745 goto out_unlock_range; 2746 } 2747 2748 timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp; 2749 /* skip duplicate vm fault on different pages of same range */ 2750 if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) { 2751 pr_debug("svms 0x%p [0x%lx %lx] already restored\n", 2752 svms, prange->start, prange->last); 2753 r = 0; 2754 goto out_unlock_range; 2755 } 2756 2757 /* __do_munmap removed VMA, return success as we are handling stale 2758 * retry fault. 2759 */ 2760 vma = find_vma(mm, addr << PAGE_SHIFT); 2761 if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) { 2762 pr_debug("address 0x%llx VMA is removed\n", addr); 2763 r = 0; 2764 goto out_unlock_range; 2765 } 2766 2767 if (!svm_fault_allowed(vma, write_fault)) { 2768 pr_debug("fault addr 0x%llx no %s permission\n", addr, 2769 write_fault ? 
"write" : "read"); 2770 r = -EPERM; 2771 goto out_unlock_range; 2772 } 2773 2774 best_loc = svm_range_best_restore_location(prange, adev, &gpuidx); 2775 if (best_loc == -1) { 2776 pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n", 2777 svms, prange->start, prange->last); 2778 r = -EACCES; 2779 goto out_unlock_range; 2780 } 2781 2782 pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n", 2783 svms, prange->start, prange->last, best_loc, 2784 prange->actual_loc); 2785 2786 if (prange->actual_loc != best_loc) { 2787 if (best_loc) { 2788 r = svm_migrate_to_vram(prange, best_loc, mm); 2789 if (r) { 2790 pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n", 2791 r, addr); 2792 /* Fallback to system memory if migration to 2793 * VRAM failed 2794 */ 2795 if (prange->actual_loc) 2796 r = svm_migrate_vram_to_ram(prange, mm); 2797 else 2798 r = 0; 2799 } 2800 } else { 2801 r = svm_migrate_vram_to_ram(prange, mm); 2802 } 2803 if (r) { 2804 pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n", 2805 r, svms, prange->start, prange->last); 2806 goto out_unlock_range; 2807 } 2808 } 2809 2810 r = svm_range_validate_and_map(mm, prange, gpuidx, false, false); 2811 if (r) 2812 pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n", 2813 r, svms, prange->start, prange->last); 2814 2815 out_unlock_range: 2816 mutex_unlock(&prange->migrate_mutex); 2817 out_unlock_svms: 2818 mutex_unlock(&svms->lock); 2819 mmap_read_unlock(mm); 2820 2821 svm_range_count_fault(adev, p, gpuidx); 2822 2823 mmput(mm); 2824 out: 2825 kfd_unref_process(p); 2826 2827 if (r == -EAGAIN) { 2828 pr_debug("recover vm fault later\n"); 2829 amdgpu_gmc_filter_faults_remove(adev, addr, pasid); 2830 r = 0; 2831 } 2832 return r; 2833 } 2834 2835 void svm_range_list_fini(struct kfd_process *p) 2836 { 2837 struct svm_range *prange; 2838 struct svm_range *next; 2839 2840 pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms); 2841 2842 cancel_delayed_work_sync(&p->svms.restore_work); 2843 2844 /* Ensure list work is finished before process is destroyed */ 2845 flush_work(&p->svms.deferred_list_work); 2846 2847 /* 2848 * Ensure no retry fault comes in afterwards, as page fault handler will 2849 * not find kfd process and take mm lock to recover fault. 
2850 */ 2851 atomic_inc(&p->svms.drain_pagefaults); 2852 svm_range_drain_retry_fault(&p->svms); 2853 2854 list_for_each_entry_safe(prange, next, &p->svms.list, list) { 2855 svm_range_unlink(prange); 2856 svm_range_remove_notifier(prange); 2857 svm_range_free(prange); 2858 } 2859 2860 mutex_destroy(&p->svms.lock); 2861 2862 pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms); 2863 } 2864 2865 int svm_range_list_init(struct kfd_process *p) 2866 { 2867 struct svm_range_list *svms = &p->svms; 2868 int i; 2869 2870 svms->objects = RB_ROOT_CACHED; 2871 mutex_init(&svms->lock); 2872 INIT_LIST_HEAD(&svms->list); 2873 atomic_set(&svms->evicted_ranges, 0); 2874 atomic_set(&svms->drain_pagefaults, 0); 2875 INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work); 2876 INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work); 2877 INIT_LIST_HEAD(&svms->deferred_range_list); 2878 spin_lock_init(&svms->deferred_list_lock); 2879 2880 for (i = 0; i < p->n_pdds; i++) 2881 if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev)) 2882 bitmap_set(svms->bitmap_supported, i, 1); 2883 2884 return 0; 2885 } 2886 2887 /** 2888 * svm_range_check_vm - check if virtual address range mapped already 2889 * @p: current kfd_process 2890 * @start: range start address, in pages 2891 * @last: range last address, in pages 2892 * @bo_s: mapping start address in pages if address range already mapped 2893 * @bo_l: mapping last address in pages if address range already mapped 2894 * 2895 * The purpose is to avoid virtual address ranges already allocated by 2896 * kfd_ioctl_alloc_memory_of_gpu ioctl. 2897 * It looks for each pdd in the kfd_process. 2898 * 2899 * Context: Process context 2900 * 2901 * Return 0 - OK, if the range is not mapped. 2902 * Otherwise error code: 2903 * -EADDRINUSE - if address is mapped already by kfd_ioctl_alloc_memory_of_gpu 2904 * -ERESTARTSYS - A wait for the buffer to become unreserved was interrupted by 2905 * a signal. Release all buffer reservations and return to user-space. 
 */
static int
svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
		   uint64_t *bo_s, uint64_t *bo_l)
{
	struct amdgpu_bo_va_mapping *mapping;
	struct interval_tree_node *node;
	uint32_t i;
	int r;

	for (i = 0; i < p->n_pdds; i++) {
		struct amdgpu_vm *vm;

		if (!p->pdds[i]->drm_priv)
			continue;

		vm = drm_priv_to_vm(p->pdds[i]->drm_priv);
		r = amdgpu_bo_reserve(vm->root.bo, false);
		if (r)
			return r;

		node = interval_tree_iter_first(&vm->va, start, last);
		if (node) {
			pr_debug("range [0x%llx 0x%llx] already TTM mapped\n",
				 start, last);
			mapping = container_of((struct rb_node *)node,
					       struct amdgpu_bo_va_mapping, rb);
			if (bo_s && bo_l) {
				*bo_s = mapping->start;
				*bo_l = mapping->last;
			}
			amdgpu_bo_unreserve(vm->root.bo);
			return -EADDRINUSE;
		}
		amdgpu_bo_unreserve(vm->root.bo);
	}

	return 0;
}

/**
 * svm_range_is_valid - check if virtual address range is valid
 * @p: current kfd_process
 * @start: range start address, in pages
 * @size: range size, in pages
 *
 * A virtual address range is valid if it is entirely covered by VMAs and
 * none of those VMAs is a device VMA.
 *
 * Context: Process context
 *
 * Return:
 * 0 - OK, otherwise error code
 */
static int
svm_range_is_valid(struct kfd_process *p, uint64_t start, uint64_t size)
{
	const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
	struct vm_area_struct *vma;
	unsigned long end;
	unsigned long start_unchg = start;

	start <<= PAGE_SHIFT;
	end = start + (size << PAGE_SHIFT);
	do {
		vma = find_vma(p->mm, start);
		if (!vma || start < vma->vm_start ||
		    (vma->vm_flags & device_vma))
			return -EFAULT;
		start = min(end, vma->vm_end);
	} while (start < end);

	return svm_range_check_vm(p, start_unchg, (end - 1) >> PAGE_SHIFT, NULL,
				  NULL);
}

/**
 * svm_range_best_prefetch_location - decide the best prefetch location
 * @prange: svm range structure
 *
 * For xnack off:
 * If the range maps to a single GPU, the best prefetch location is
 * prefetch_loc, which can be the CPU or a GPU.
 *
 * If the range is ACCESS or ACCESS_IN_PLACE by multiple GPUs, the best
 * prefetch location is the prefetch_loc GPU only if all of those GPUs are
 * connected to it over XGMI in the same hive; otherwise the best prefetch
 * location is always the CPU, because a GPU cannot have a coherent mapping
 * of another GPU's VRAM even with a large-BAR PCIe connection.
 *
 * For xnack on:
 * If the range is not ACCESS_IN_PLACE by multiple GPUs, the best prefetch
 * location is prefetch_loc; access from another GPU will generate a vm fault
 * and trigger migration.
 *
 * If the range is ACCESS_IN_PLACE by multiple GPUs, the best prefetch
 * location is the prefetch_loc GPU only if all of those GPUs are connected
 * to it over XGMI in the same hive; otherwise the best prefetch location is
 * always the CPU.
 *
 * Context: Process context
 *
 * Return:
 * 0 if the best prefetch location is the CPU, otherwise the GPU id
 */
static uint32_t
svm_range_best_prefetch_location(struct svm_range *prange)
{
	DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
	uint32_t best_loc = prange->prefetch_loc;
	struct kfd_process_device *pdd;
	struct amdgpu_device *bo_adev;
	struct kfd_process *p;
	uint32_t gpuidx;

	p = container_of(prange->svms, struct kfd_process, svms);

	if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED)
		goto out;

	bo_adev = svm_range_get_adev_by_id(prange, best_loc);
	if (!bo_adev) {
		WARN_ONCE(1, "failed to get device by id 0x%x\n", best_loc);
		best_loc = 0;
		goto out;
	}

	if (p->xnack_enabled)
		bitmap_copy(bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
	else
		bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
			  MAX_GPU_INSTANCE);

	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
		if (!pdd) {
			pr_debug("failed to get device by idx 0x%x\n", gpuidx);
			continue;
		}

		if (pdd->dev->adev == bo_adev)
			continue;

		if (!amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) {
			best_loc = 0;
			break;
		}
	}

out:
	pr_debug("xnack %d svms 0x%p [0x%lx 0x%lx] best loc 0x%x\n",
		 p->xnack_enabled, &p->svms, prange->start, prange->last,
		 best_loc);

	return best_loc;
}

/* FIXME: This is a workaround for a page locking bug when some pages are
 * invalid during migration to VRAM
 */
void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm,
			void *owner)
{
	struct hmm_range *hmm_range;
	int r;

	if (prange->validated_once)
		return;

	r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL,
				       prange->start << PAGE_SHIFT,
				       prange->npages, &hmm_range,
				       false, true, owner);
	if (!r) {
		amdgpu_hmm_range_get_pages_done(hmm_range);
		prange->validated_once = true;
	}
}

/* svm_range_trigger_migration - start page migration if prefetch loc changed
 * @mm: current process mm_struct
 * @prange: svm range structure
 * @migrated: output, true if migration is triggered
 *
 * If the range's prefetch_loc is a GPU and its actual loc is the CPU (0),
 * migrate the range from RAM to VRAM. If prefetch_loc is the CPU (0) and the
 * actual loc is a GPU, migrate the range from VRAM to RAM.
 *
 * If GPU vm fault retry is not enabled, migration interacts with the MMU
 * notifier and the restore work:
 * 1. migrate_vma_setup invalidates pages; the MMU notifier callback
 *    svm_range_evict stops all queues and schedules the restore work
 * 2. svm_range_restore_work waits for the migration to finish because
 *    a. svm_range_validate_vram takes prange->migrate_mutex
 *    b. svm_range_validate_ram's HMM get pages waits for the CPU fault
 *       handler to return
 * 3. the restore work updates the GPU mappings and resumes all queues.
3099 * 3100 * Context: Process context 3101 * 3102 * Return: 3103 * 0 - OK, otherwise - error code of migration 3104 */ 3105 static int 3106 svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange, 3107 bool *migrated) 3108 { 3109 uint32_t best_loc; 3110 int r = 0; 3111 3112 *migrated = false; 3113 best_loc = svm_range_best_prefetch_location(prange); 3114 3115 if (best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED || 3116 best_loc == prange->actual_loc) 3117 return 0; 3118 3119 if (!best_loc) { 3120 r = svm_migrate_vram_to_ram(prange, mm); 3121 *migrated = !r; 3122 return r; 3123 } 3124 3125 r = svm_migrate_to_vram(prange, best_loc, mm); 3126 *migrated = !r; 3127 3128 return r; 3129 } 3130 3131 int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence) 3132 { 3133 if (!fence) 3134 return -EINVAL; 3135 3136 if (dma_fence_is_signaled(&fence->base)) 3137 return 0; 3138 3139 if (fence->svm_bo) { 3140 WRITE_ONCE(fence->svm_bo->evicting, 1); 3141 schedule_work(&fence->svm_bo->eviction_work); 3142 } 3143 3144 return 0; 3145 } 3146 3147 static void svm_range_evict_svm_bo_worker(struct work_struct *work) 3148 { 3149 struct svm_range_bo *svm_bo; 3150 struct kfd_process *p; 3151 struct mm_struct *mm; 3152 3153 svm_bo = container_of(work, struct svm_range_bo, eviction_work); 3154 if (!svm_bo_ref_unless_zero(svm_bo)) 3155 return; /* svm_bo was freed while eviction was pending */ 3156 3157 /* svm_range_bo_release destroys this worker thread. So during 3158 * the lifetime of this thread, kfd_process and mm will be valid. 3159 */ 3160 p = container_of(svm_bo->svms, struct kfd_process, svms); 3161 mm = p->mm; 3162 if (!mm) 3163 return; 3164 3165 mmap_read_lock(mm); 3166 spin_lock(&svm_bo->list_lock); 3167 while (!list_empty(&svm_bo->range_list)) { 3168 struct svm_range *prange = 3169 list_first_entry(&svm_bo->range_list, 3170 struct svm_range, svm_bo_list); 3171 int retries = 3; 3172 3173 list_del_init(&prange->svm_bo_list); 3174 spin_unlock(&svm_bo->list_lock); 3175 3176 pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, 3177 prange->start, prange->last); 3178 3179 mutex_lock(&prange->migrate_mutex); 3180 do { 3181 svm_migrate_vram_to_ram(prange, 3182 svm_bo->eviction_fence->mm); 3183 } while (prange->actual_loc && --retries); 3184 WARN(prange->actual_loc, "Migration failed during eviction"); 3185 3186 mutex_lock(&prange->lock); 3187 prange->svm_bo = NULL; 3188 mutex_unlock(&prange->lock); 3189 3190 mutex_unlock(&prange->migrate_mutex); 3191 3192 spin_lock(&svm_bo->list_lock); 3193 } 3194 spin_unlock(&svm_bo->list_lock); 3195 mmap_read_unlock(mm); 3196 3197 dma_fence_signal(&svm_bo->eviction_fence->base); 3198 /* This is the last reference to svm_bo, after svm_range_vram_node_free 3199 * has been called in svm_migrate_vram_to_ram 3200 */ 3201 WARN_ONCE(kref_read(&svm_bo->kref) != 1, "This was not the last reference\n"); 3202 svm_range_bo_unref(svm_bo); 3203 } 3204 3205 static int 3206 svm_range_set_attr(struct kfd_process *p, uint64_t start, uint64_t size, 3207 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 3208 { 3209 struct mm_struct *mm = current->mm; 3210 struct list_head update_list; 3211 struct list_head insert_list; 3212 struct list_head remove_list; 3213 struct svm_range_list *svms; 3214 struct svm_range *prange; 3215 struct svm_range *next; 3216 int r = 0; 3217 3218 pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n", 3219 p->pasid, &p->svms, start, start + size - 1, size); 3220 3221 r = svm_range_check_attr(p, nattr, attrs); 3222 if (r) 3223 return r; 
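	/* svm_range_list_lock_and_flush_work takes the mmap write lock and
	 * flushes pending deferred-list work so that the svms interval tree
	 * is stable while ranges are added and split below.
	 */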
3224 3225 svms = &p->svms; 3226 3227 svm_range_list_lock_and_flush_work(svms, mm); 3228 3229 r = svm_range_is_valid(p, start, size); 3230 if (r) { 3231 pr_debug("invalid range r=%d\n", r); 3232 mmap_write_unlock(mm); 3233 goto out; 3234 } 3235 3236 mutex_lock(&svms->lock); 3237 3238 /* Add new range and split existing ranges as needed */ 3239 r = svm_range_add(p, start, size, nattr, attrs, &update_list, 3240 &insert_list, &remove_list); 3241 if (r) { 3242 mutex_unlock(&svms->lock); 3243 mmap_write_unlock(mm); 3244 goto out; 3245 } 3246 /* Apply changes as a transaction */ 3247 list_for_each_entry_safe(prange, next, &insert_list, list) { 3248 svm_range_add_to_svms(prange); 3249 svm_range_add_notifier_locked(mm, prange); 3250 } 3251 list_for_each_entry(prange, &update_list, update_list) { 3252 svm_range_apply_attrs(p, prange, nattr, attrs); 3253 /* TODO: unmap ranges from GPU that lost access */ 3254 } 3255 list_for_each_entry_safe(prange, next, &remove_list, update_list) { 3256 pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n", 3257 prange->svms, prange, prange->start, 3258 prange->last); 3259 svm_range_unlink(prange); 3260 svm_range_remove_notifier(prange); 3261 svm_range_free(prange); 3262 } 3263 3264 mmap_write_downgrade(mm); 3265 /* Trigger migrations and revalidate and map to GPUs as needed. If 3266 * this fails we may be left with partially completed actions. There 3267 * is no clean way of rolling back to the previous state in such a 3268 * case because the rollback wouldn't be guaranteed to work either. 3269 */ 3270 list_for_each_entry(prange, &update_list, update_list) { 3271 bool migrated; 3272 3273 mutex_lock(&prange->migrate_mutex); 3274 3275 r = svm_range_trigger_migration(mm, prange, &migrated); 3276 if (r) 3277 goto out_unlock_range; 3278 3279 if (migrated && !p->xnack_enabled) { 3280 pr_debug("restore_work will update mappings of GPUs\n"); 3281 mutex_unlock(&prange->migrate_mutex); 3282 continue; 3283 } 3284 3285 r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE, 3286 true, true); 3287 if (r) 3288 pr_debug("failed %d to map svm range\n", r); 3289 3290 out_unlock_range: 3291 mutex_unlock(&prange->migrate_mutex); 3292 if (r) 3293 break; 3294 } 3295 3296 svm_range_debug_dump(svms); 3297 3298 mutex_unlock(&svms->lock); 3299 mmap_read_unlock(mm); 3300 out: 3301 pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid, 3302 &p->svms, start, start + size - 1, r); 3303 3304 return r; 3305 } 3306 3307 static int 3308 svm_range_get_attr(struct kfd_process *p, uint64_t start, uint64_t size, 3309 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 3310 { 3311 DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE); 3312 DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE); 3313 bool get_preferred_loc = false; 3314 bool get_prefetch_loc = false; 3315 bool get_granularity = false; 3316 bool get_accessible = false; 3317 bool get_flags = false; 3318 uint64_t last = start + size - 1UL; 3319 struct mm_struct *mm = current->mm; 3320 uint8_t granularity = 0xff; 3321 struct interval_tree_node *node; 3322 struct svm_range_list *svms; 3323 struct svm_range *prange; 3324 uint32_t prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3325 uint32_t location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3326 uint32_t flags_and = 0xffffffff; 3327 uint32_t flags_or = 0; 3328 int gpuidx; 3329 uint32_t i; 3330 int r = 0; 3331 3332 pr_debug("svms 0x%p [0x%llx 0x%llx] nattr 0x%x\n", &p->svms, start, 3333 start + size - 1, nattr); 3334 3335 /* Flush pending deferred work to avoid racing with deferred 
actions from 3336 * previous memory map changes (e.g. munmap). Concurrent memory map changes 3337 * can still race with get_attr because we don't hold the mmap lock. But that 3338 * would be a race condition in the application anyway, and undefined 3339 * behaviour is acceptable in that case. 3340 */ 3341 flush_work(&p->svms.deferred_list_work); 3342 3343 mmap_read_lock(mm); 3344 r = svm_range_is_valid(p, start, size); 3345 mmap_read_unlock(mm); 3346 if (r) { 3347 pr_debug("invalid range r=%d\n", r); 3348 return r; 3349 } 3350 3351 for (i = 0; i < nattr; i++) { 3352 switch (attrs[i].type) { 3353 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 3354 get_preferred_loc = true; 3355 break; 3356 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 3357 get_prefetch_loc = true; 3358 break; 3359 case KFD_IOCTL_SVM_ATTR_ACCESS: 3360 get_accessible = true; 3361 break; 3362 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 3363 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 3364 get_flags = true; 3365 break; 3366 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 3367 get_granularity = true; 3368 break; 3369 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 3370 case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 3371 fallthrough; 3372 default: 3373 pr_debug("get invalid attr type 0x%x\n", attrs[i].type); 3374 return -EINVAL; 3375 } 3376 } 3377 3378 svms = &p->svms; 3379 3380 mutex_lock(&svms->lock); 3381 3382 node = interval_tree_iter_first(&svms->objects, start, last); 3383 if (!node) { 3384 pr_debug("range attrs not found return default values\n"); 3385 svm_range_set_default_attributes(&location, &prefetch_loc, 3386 &granularity, &flags_and); 3387 flags_or = flags_and; 3388 if (p->xnack_enabled) 3389 bitmap_copy(bitmap_access, svms->bitmap_supported, 3390 MAX_GPU_INSTANCE); 3391 else 3392 bitmap_zero(bitmap_access, MAX_GPU_INSTANCE); 3393 bitmap_zero(bitmap_aip, MAX_GPU_INSTANCE); 3394 goto fill_values; 3395 } 3396 bitmap_copy(bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE); 3397 bitmap_copy(bitmap_aip, svms->bitmap_supported, MAX_GPU_INSTANCE); 3398 3399 while (node) { 3400 struct interval_tree_node *next; 3401 3402 prange = container_of(node, struct svm_range, it_node); 3403 next = interval_tree_iter_next(node, start, last); 3404 3405 if (get_preferred_loc) { 3406 if (prange->preferred_loc == 3407 KFD_IOCTL_SVM_LOCATION_UNDEFINED || 3408 (location != KFD_IOCTL_SVM_LOCATION_UNDEFINED && 3409 location != prange->preferred_loc)) { 3410 location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3411 get_preferred_loc = false; 3412 } else { 3413 location = prange->preferred_loc; 3414 } 3415 } 3416 if (get_prefetch_loc) { 3417 if (prange->prefetch_loc == 3418 KFD_IOCTL_SVM_LOCATION_UNDEFINED || 3419 (prefetch_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED && 3420 prefetch_loc != prange->prefetch_loc)) { 3421 prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3422 get_prefetch_loc = false; 3423 } else { 3424 prefetch_loc = prange->prefetch_loc; 3425 } 3426 } 3427 if (get_accessible) { 3428 bitmap_and(bitmap_access, bitmap_access, 3429 prange->bitmap_access, MAX_GPU_INSTANCE); 3430 bitmap_and(bitmap_aip, bitmap_aip, 3431 prange->bitmap_aip, MAX_GPU_INSTANCE); 3432 } 3433 if (get_flags) { 3434 flags_and &= prange->flags; 3435 flags_or |= prange->flags; 3436 } 3437 3438 if (get_granularity && prange->granularity < granularity) 3439 granularity = prange->granularity; 3440 3441 node = next; 3442 } 3443 fill_values: 3444 mutex_unlock(&svms->lock); 3445 3446 for (i = 0; i < nattr; i++) { 3447 switch (attrs[i].type) { 3448 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 3449 attrs[i].value = location; 3450 break; 3451 
		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
			attrs[i].value = prefetch_loc;
			break;
		case KFD_IOCTL_SVM_ATTR_ACCESS:
			gpuidx = kfd_process_gpuidx_from_gpuid(p,
							       attrs[i].value);
			if (gpuidx < 0) {
				pr_debug("invalid gpuid %x\n", attrs[i].value);
				return -EINVAL;
			}
			if (test_bit(gpuidx, bitmap_access))
				attrs[i].type = KFD_IOCTL_SVM_ATTR_ACCESS;
			else if (test_bit(gpuidx, bitmap_aip))
				attrs[i].type =
					KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE;
			else
				attrs[i].type = KFD_IOCTL_SVM_ATTR_NO_ACCESS;
			break;
		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
			attrs[i].value = flags_and;
			break;
		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
			attrs[i].value = ~flags_or;
			break;
		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
			attrs[i].value = (uint32_t)granularity;
			break;
		}
	}

	return 0;
}

int
svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,
	  uint64_t size, uint32_t nattrs, struct kfd_ioctl_svm_attribute *attrs)
{
	int r;

	start >>= PAGE_SHIFT;
	size >>= PAGE_SHIFT;

	switch (op) {
	case KFD_IOCTL_SVM_OP_SET_ATTR:
		r = svm_range_set_attr(p, start, size, nattrs, attrs);
		break;
	case KFD_IOCTL_SVM_OP_GET_ATTR:
		r = svm_range_get_attr(p, start, size, nattrs, attrs);
		break;
	default:
		r = -EINVAL;
		break;
	}

	return r;
}