1 // SPDX-License-Identifier: GPL-2.0 OR MIT 2 /* 3 * Copyright 2020-2021 Advanced Micro Devices, Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 * OTHER DEALINGS IN THE SOFTWARE. 22 */ 23 24 #include <linux/types.h> 25 #include <linux/sched/task.h> 26 #include "amdgpu_sync.h" 27 #include "amdgpu_object.h" 28 #include "amdgpu_vm.h" 29 #include "amdgpu_mn.h" 30 #include "amdgpu.h" 31 #include "amdgpu_xgmi.h" 32 #include "kfd_priv.h" 33 #include "kfd_svm.h" 34 #include "kfd_migrate.h" 35 36 #ifdef dev_fmt 37 #undef dev_fmt 38 #endif 39 #define dev_fmt(fmt) "kfd_svm: %s: " fmt, __func__ 40 41 #define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1 42 43 /* Long enough to ensure no retry fault comes after svm range is restored and 44 * page table is updated. 45 */ 46 #define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING 2000 47 48 static void svm_range_evict_svm_bo_worker(struct work_struct *work); 49 static bool 50 svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, 51 const struct mmu_notifier_range *range, 52 unsigned long cur_seq); 53 static int 54 svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last, 55 uint64_t *bo_s, uint64_t *bo_l); 56 static const struct mmu_interval_notifier_ops svm_range_mn_ops = { 57 .invalidate = svm_range_cpu_invalidate_pagetables, 58 }; 59 60 /** 61 * svm_range_unlink - unlink svm_range from lists and interval tree 62 * @prange: svm range structure to be removed 63 * 64 * Remove the svm_range from the svms and svm_bo lists and the svms 65 * interval tree. 
66 * 67 * Context: The caller must hold svms->lock 68 */ 69 static void svm_range_unlink(struct svm_range *prange) 70 { 71 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, 72 prange, prange->start, prange->last); 73 74 if (prange->svm_bo) { 75 spin_lock(&prange->svm_bo->list_lock); 76 list_del(&prange->svm_bo_list); 77 spin_unlock(&prange->svm_bo->list_lock); 78 } 79 80 list_del(&prange->list); 81 if (prange->it_node.start != 0 && prange->it_node.last != 0) 82 interval_tree_remove(&prange->it_node, &prange->svms->objects); 83 } 84 85 static void 86 svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange) 87 { 88 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, 89 prange, prange->start, prange->last); 90 91 mmu_interval_notifier_insert_locked(&prange->notifier, mm, 92 prange->start << PAGE_SHIFT, 93 prange->npages << PAGE_SHIFT, 94 &svm_range_mn_ops); 95 } 96 97 /** 98 * svm_range_add_to_svms - add svm range to svms 99 * @prange: svm range structure to be added 100 * 101 * Add the svm range to svms interval tree and link list 102 * 103 * Context: The caller must hold svms->lock 104 */ 105 static void svm_range_add_to_svms(struct svm_range *prange) 106 { 107 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, 108 prange, prange->start, prange->last); 109 110 list_move_tail(&prange->list, &prange->svms->list); 111 prange->it_node.start = prange->start; 112 prange->it_node.last = prange->last; 113 interval_tree_insert(&prange->it_node, &prange->svms->objects); 114 } 115 116 static void svm_range_remove_notifier(struct svm_range *prange) 117 { 118 pr_debug("remove notifier svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", 119 prange->svms, prange, 120 prange->notifier.interval_tree.start >> PAGE_SHIFT, 121 prange->notifier.interval_tree.last >> PAGE_SHIFT); 122 123 if (prange->notifier.interval_tree.start != 0 && 124 prange->notifier.interval_tree.last != 0) 125 mmu_interval_notifier_remove(&prange->notifier); 126 } 127 128 static bool 129 svm_is_valid_dma_mapping_addr(struct device *dev, dma_addr_t dma_addr) 130 { 131 return dma_addr && !dma_mapping_error(dev, dma_addr) && 132 !(dma_addr & SVM_RANGE_VRAM_DOMAIN); 133 } 134 135 static int 136 svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange, 137 unsigned long offset, unsigned long npages, 138 unsigned long *hmm_pfns, uint32_t gpuidx) 139 { 140 enum dma_data_direction dir = DMA_BIDIRECTIONAL; 141 dma_addr_t *addr = prange->dma_addr[gpuidx]; 142 struct device *dev = adev->dev; 143 struct page *page; 144 int i, r; 145 146 if (!addr) { 147 addr = kvmalloc_array(prange->npages, sizeof(*addr), 148 GFP_KERNEL | __GFP_ZERO); 149 if (!addr) 150 return -ENOMEM; 151 prange->dma_addr[gpuidx] = addr; 152 } 153 154 addr += offset; 155 for (i = 0; i < npages; i++) { 156 if (svm_is_valid_dma_mapping_addr(dev, addr[i])) 157 dma_unmap_page(dev, addr[i], PAGE_SIZE, dir); 158 159 page = hmm_pfn_to_page(hmm_pfns[i]); 160 if (is_zone_device_page(page)) { 161 struct amdgpu_device *bo_adev = 162 amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); 163 164 addr[i] = (hmm_pfns[i] << PAGE_SHIFT) + 165 bo_adev->vm_manager.vram_base_offset - 166 bo_adev->kfd.dev->pgmap.range.start; 167 addr[i] |= SVM_RANGE_VRAM_DOMAIN; 168 pr_debug_ratelimited("vram address: 0x%llx\n", addr[i]); 169 continue; 170 } 171 addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir); 172 r = dma_mapping_error(dev, addr[i]); 173 if (r) { 174 dev_err(dev, "failed %d dma_map_page\n", r); 175 return r; 176 } 177 
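		/* addr[i] now holds the streaming-DMA address created just
		 * above for this system-memory page; VRAM (zone-device) pages
		 * were handled earlier in the loop by encoding their VRAM
		 * address with SVM_RANGE_VRAM_DOMAIN.
		 */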
pr_debug_ratelimited("dma mapping 0x%llx for page addr 0x%lx\n", 178 addr[i] >> PAGE_SHIFT, page_to_pfn(page)); 179 } 180 return 0; 181 } 182 183 static int 184 svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap, 185 unsigned long offset, unsigned long npages, 186 unsigned long *hmm_pfns) 187 { 188 struct kfd_process *p; 189 uint32_t gpuidx; 190 int r; 191 192 p = container_of(prange->svms, struct kfd_process, svms); 193 194 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { 195 struct kfd_process_device *pdd; 196 197 pr_debug("mapping to gpu idx 0x%x\n", gpuidx); 198 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 199 if (!pdd) { 200 pr_debug("failed to find device idx %d\n", gpuidx); 201 return -EINVAL; 202 } 203 204 r = svm_range_dma_map_dev(pdd->dev->adev, prange, offset, npages, 205 hmm_pfns, gpuidx); 206 if (r) 207 break; 208 } 209 210 return r; 211 } 212 213 void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr, 214 unsigned long offset, unsigned long npages) 215 { 216 enum dma_data_direction dir = DMA_BIDIRECTIONAL; 217 int i; 218 219 if (!dma_addr) 220 return; 221 222 for (i = offset; i < offset + npages; i++) { 223 if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i])) 224 continue; 225 pr_debug_ratelimited("unmap 0x%llx\n", dma_addr[i] >> PAGE_SHIFT); 226 dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir); 227 dma_addr[i] = 0; 228 } 229 } 230 231 void svm_range_free_dma_mappings(struct svm_range *prange) 232 { 233 struct kfd_process_device *pdd; 234 dma_addr_t *dma_addr; 235 struct device *dev; 236 struct kfd_process *p; 237 uint32_t gpuidx; 238 239 p = container_of(prange->svms, struct kfd_process, svms); 240 241 for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) { 242 dma_addr = prange->dma_addr[gpuidx]; 243 if (!dma_addr) 244 continue; 245 246 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 247 if (!pdd) { 248 pr_debug("failed to find device idx %d\n", gpuidx); 249 continue; 250 } 251 dev = &pdd->dev->pdev->dev; 252 svm_range_dma_unmap(dev, dma_addr, 0, prange->npages); 253 kvfree(dma_addr); 254 prange->dma_addr[gpuidx] = NULL; 255 } 256 } 257 258 static void svm_range_free(struct svm_range *prange) 259 { 260 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange, 261 prange->start, prange->last); 262 263 svm_range_vram_node_free(prange); 264 svm_range_free_dma_mappings(prange); 265 mutex_destroy(&prange->lock); 266 mutex_destroy(&prange->migrate_mutex); 267 kfree(prange); 268 } 269 270 static void 271 svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc, 272 uint8_t *granularity, uint32_t *flags) 273 { 274 *location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 275 *prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 276 *granularity = 9; 277 *flags = 278 KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT; 279 } 280 281 static struct 282 svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, 283 uint64_t last) 284 { 285 uint64_t size = last - start + 1; 286 struct svm_range *prange; 287 struct kfd_process *p; 288 289 prange = kzalloc(sizeof(*prange), GFP_KERNEL); 290 if (!prange) 291 return NULL; 292 prange->npages = size; 293 prange->svms = svms; 294 prange->start = start; 295 prange->last = last; 296 INIT_LIST_HEAD(&prange->list); 297 INIT_LIST_HEAD(&prange->update_list); 298 INIT_LIST_HEAD(&prange->svm_bo_list); 299 INIT_LIST_HEAD(&prange->deferred_list); 300 INIT_LIST_HEAD(&prange->child_list); 301 atomic_set(&prange->invalid, 0); 302 prange->validate_timestamp = 0; 303 
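	/* migrate_mutex serializes migration and validation of this range
	 * (see svm_range_validate_and_map); lock guards the svm_bo and
	 * ttm_res pointers (see svm_range_validate_svm_bo and
	 * svm_range_bo_release).
	 */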
mutex_init(&prange->migrate_mutex); 304 mutex_init(&prange->lock); 305 306 p = container_of(svms, struct kfd_process, svms); 307 if (p->xnack_enabled) 308 bitmap_copy(prange->bitmap_access, svms->bitmap_supported, 309 MAX_GPU_INSTANCE); 310 311 svm_range_set_default_attributes(&prange->preferred_loc, 312 &prange->prefetch_loc, 313 &prange->granularity, &prange->flags); 314 315 pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last); 316 317 return prange; 318 } 319 320 static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo) 321 { 322 if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref)) 323 return false; 324 325 return true; 326 } 327 328 static void svm_range_bo_release(struct kref *kref) 329 { 330 struct svm_range_bo *svm_bo; 331 332 svm_bo = container_of(kref, struct svm_range_bo, kref); 333 pr_debug("svm_bo 0x%p\n", svm_bo); 334 335 spin_lock(&svm_bo->list_lock); 336 while (!list_empty(&svm_bo->range_list)) { 337 struct svm_range *prange = 338 list_first_entry(&svm_bo->range_list, 339 struct svm_range, svm_bo_list); 340 /* list_del_init tells a concurrent svm_range_vram_node_new when 341 * it's safe to reuse the svm_bo pointer and svm_bo_list head. 342 */ 343 list_del_init(&prange->svm_bo_list); 344 spin_unlock(&svm_bo->list_lock); 345 346 pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, 347 prange->start, prange->last); 348 mutex_lock(&prange->lock); 349 prange->svm_bo = NULL; 350 mutex_unlock(&prange->lock); 351 352 spin_lock(&svm_bo->list_lock); 353 } 354 spin_unlock(&svm_bo->list_lock); 355 if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) { 356 /* We're not in the eviction worker. 357 * Signal the fence and synchronize with any 358 * pending eviction work. 359 */ 360 dma_fence_signal(&svm_bo->eviction_fence->base); 361 cancel_work_sync(&svm_bo->eviction_work); 362 } 363 dma_fence_put(&svm_bo->eviction_fence->base); 364 amdgpu_bo_unref(&svm_bo->bo); 365 kfree(svm_bo); 366 } 367 368 static void svm_range_bo_wq_release(struct work_struct *work) 369 { 370 struct svm_range_bo *svm_bo; 371 372 svm_bo = container_of(work, struct svm_range_bo, release_work); 373 svm_range_bo_release(&svm_bo->kref); 374 } 375 376 static void svm_range_bo_release_async(struct kref *kref) 377 { 378 struct svm_range_bo *svm_bo; 379 380 svm_bo = container_of(kref, struct svm_range_bo, kref); 381 pr_debug("svm_bo 0x%p\n", svm_bo); 382 INIT_WORK(&svm_bo->release_work, svm_range_bo_wq_release); 383 schedule_work(&svm_bo->release_work); 384 } 385 386 void svm_range_bo_unref_async(struct svm_range_bo *svm_bo) 387 { 388 kref_put(&svm_bo->kref, svm_range_bo_release_async); 389 } 390 391 static void svm_range_bo_unref(struct svm_range_bo *svm_bo) 392 { 393 if (svm_bo) 394 kref_put(&svm_bo->kref, svm_range_bo_release); 395 } 396 397 static bool 398 svm_range_validate_svm_bo(struct amdgpu_device *adev, struct svm_range *prange) 399 { 400 struct amdgpu_device *bo_adev; 401 402 mutex_lock(&prange->lock); 403 if (!prange->svm_bo) { 404 mutex_unlock(&prange->lock); 405 return false; 406 } 407 if (prange->ttm_res) { 408 /* We still have a reference, all is well */ 409 mutex_unlock(&prange->lock); 410 return true; 411 } 412 if (svm_bo_ref_unless_zero(prange->svm_bo)) { 413 /* 414 * Migrate from GPU to GPU, remove range from source bo_adev 415 * svm_bo range list, and return false to allocate svm_bo from 416 * destination adev. 
417 */ 418 bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); 419 if (bo_adev != adev) { 420 mutex_unlock(&prange->lock); 421 422 spin_lock(&prange->svm_bo->list_lock); 423 list_del_init(&prange->svm_bo_list); 424 spin_unlock(&prange->svm_bo->list_lock); 425 426 svm_range_bo_unref(prange->svm_bo); 427 return false; 428 } 429 if (READ_ONCE(prange->svm_bo->evicting)) { 430 struct dma_fence *f; 431 struct svm_range_bo *svm_bo; 432 /* The BO is getting evicted, 433 * we need to get a new one 434 */ 435 mutex_unlock(&prange->lock); 436 svm_bo = prange->svm_bo; 437 f = dma_fence_get(&svm_bo->eviction_fence->base); 438 svm_range_bo_unref(prange->svm_bo); 439 /* wait for the fence to avoid long spin-loop 440 * at list_empty_careful 441 */ 442 dma_fence_wait(f, false); 443 dma_fence_put(f); 444 } else { 445 /* The BO was still around and we got 446 * a new reference to it 447 */ 448 mutex_unlock(&prange->lock); 449 pr_debug("reuse old bo svms 0x%p [0x%lx 0x%lx]\n", 450 prange->svms, prange->start, prange->last); 451 452 prange->ttm_res = prange->svm_bo->bo->tbo.resource; 453 return true; 454 } 455 456 } else { 457 mutex_unlock(&prange->lock); 458 } 459 460 /* We need a new svm_bo. Spin-loop to wait for concurrent 461 * svm_range_bo_release to finish removing this range from 462 * its range list. After this, it is safe to reuse the 463 * svm_bo pointer and svm_bo_list head. 464 */ 465 while (!list_empty_careful(&prange->svm_bo_list)) 466 ; 467 468 return false; 469 } 470 471 static struct svm_range_bo *svm_range_bo_new(void) 472 { 473 struct svm_range_bo *svm_bo; 474 475 svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL); 476 if (!svm_bo) 477 return NULL; 478 479 kref_init(&svm_bo->kref); 480 INIT_LIST_HEAD(&svm_bo->range_list); 481 spin_lock_init(&svm_bo->list_lock); 482 483 return svm_bo; 484 } 485 486 int 487 svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange, 488 bool clear) 489 { 490 struct amdgpu_bo_param bp; 491 struct svm_range_bo *svm_bo; 492 struct amdgpu_bo_user *ubo; 493 struct amdgpu_bo *bo; 494 struct kfd_process *p; 495 struct mm_struct *mm; 496 int r; 497 498 p = container_of(prange->svms, struct kfd_process, svms); 499 pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms, 500 prange->start, prange->last); 501 502 if (svm_range_validate_svm_bo(adev, prange)) 503 return 0; 504 505 svm_bo = svm_range_bo_new(); 506 if (!svm_bo) { 507 pr_debug("failed to alloc svm bo\n"); 508 return -ENOMEM; 509 } 510 mm = get_task_mm(p->lead_thread); 511 if (!mm) { 512 pr_debug("failed to get mm\n"); 513 kfree(svm_bo); 514 return -ESRCH; 515 } 516 svm_bo->svms = prange->svms; 517 svm_bo->eviction_fence = 518 amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), 519 mm, 520 svm_bo); 521 mmput(mm); 522 INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker); 523 svm_bo->evicting = 0; 524 memset(&bp, 0, sizeof(bp)); 525 bp.size = prange->npages * PAGE_SIZE; 526 bp.byte_align = PAGE_SIZE; 527 bp.domain = AMDGPU_GEM_DOMAIN_VRAM; 528 bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; 529 bp.flags |= clear ? 
AMDGPU_GEM_CREATE_VRAM_CLEARED : 0; 530 bp.flags |= AMDGPU_AMDKFD_CREATE_SVM_BO; 531 bp.type = ttm_bo_type_device; 532 bp.resv = NULL; 533 534 r = amdgpu_bo_create_user(adev, &bp, &ubo); 535 if (r) { 536 pr_debug("failed %d to create bo\n", r); 537 goto create_bo_failed; 538 } 539 bo = &ubo->bo; 540 r = amdgpu_bo_reserve(bo, true); 541 if (r) { 542 pr_debug("failed %d to reserve bo\n", r); 543 goto reserve_bo_failed; 544 } 545 546 r = dma_resv_reserve_shared(bo->tbo.base.resv, 1); 547 if (r) { 548 pr_debug("failed %d to reserve bo\n", r); 549 amdgpu_bo_unreserve(bo); 550 goto reserve_bo_failed; 551 } 552 amdgpu_bo_fence(bo, &svm_bo->eviction_fence->base, true); 553 554 amdgpu_bo_unreserve(bo); 555 556 svm_bo->bo = bo; 557 prange->svm_bo = svm_bo; 558 prange->ttm_res = bo->tbo.resource; 559 prange->offset = 0; 560 561 spin_lock(&svm_bo->list_lock); 562 list_add(&prange->svm_bo_list, &svm_bo->range_list); 563 spin_unlock(&svm_bo->list_lock); 564 565 return 0; 566 567 reserve_bo_failed: 568 amdgpu_bo_unref(&bo); 569 create_bo_failed: 570 dma_fence_put(&svm_bo->eviction_fence->base); 571 kfree(svm_bo); 572 prange->ttm_res = NULL; 573 574 return r; 575 } 576 577 void svm_range_vram_node_free(struct svm_range *prange) 578 { 579 svm_range_bo_unref(prange->svm_bo); 580 prange->ttm_res = NULL; 581 } 582 583 struct amdgpu_device * 584 svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id) 585 { 586 struct kfd_process_device *pdd; 587 struct kfd_process *p; 588 int32_t gpu_idx; 589 590 p = container_of(prange->svms, struct kfd_process, svms); 591 592 gpu_idx = kfd_process_gpuidx_from_gpuid(p, gpu_id); 593 if (gpu_idx < 0) { 594 pr_debug("failed to get device by id 0x%x\n", gpu_id); 595 return NULL; 596 } 597 pdd = kfd_process_device_from_gpuidx(p, gpu_idx); 598 if (!pdd) { 599 pr_debug("failed to get device by idx 0x%x\n", gpu_idx); 600 return NULL; 601 } 602 603 return pdd->dev->adev; 604 } 605 606 struct kfd_process_device * 607 svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev) 608 { 609 struct kfd_process *p; 610 int32_t gpu_idx, gpuid; 611 int r; 612 613 p = container_of(prange->svms, struct kfd_process, svms); 614 615 r = kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpu_idx); 616 if (r) { 617 pr_debug("failed to get device id by adev %p\n", adev); 618 return NULL; 619 } 620 621 return kfd_process_device_from_gpuidx(p, gpu_idx); 622 } 623 624 static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo) 625 { 626 struct ttm_operation_ctx ctx = { false, false }; 627 628 amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM); 629 630 return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); 631 } 632 633 static int 634 svm_range_check_attr(struct kfd_process *p, 635 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 636 { 637 uint32_t i; 638 639 for (i = 0; i < nattr; i++) { 640 uint32_t val = attrs[i].value; 641 int gpuidx = MAX_GPU_INSTANCE; 642 643 switch (attrs[i].type) { 644 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 645 if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM && 646 val != KFD_IOCTL_SVM_LOCATION_UNDEFINED) 647 gpuidx = kfd_process_gpuidx_from_gpuid(p, val); 648 break; 649 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 650 if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM) 651 gpuidx = kfd_process_gpuidx_from_gpuid(p, val); 652 break; 653 case KFD_IOCTL_SVM_ATTR_ACCESS: 654 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 655 case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 656 gpuidx = kfd_process_gpuidx_from_gpuid(p, val); 657 break; 658 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 659 
break; 660 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 661 break; 662 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 663 break; 664 default: 665 pr_debug("unknown attr type 0x%x\n", attrs[i].type); 666 return -EINVAL; 667 } 668 669 if (gpuidx < 0) { 670 pr_debug("no GPU 0x%x found\n", val); 671 return -EINVAL; 672 } else if (gpuidx < MAX_GPU_INSTANCE && 673 !test_bit(gpuidx, p->svms.bitmap_supported)) { 674 pr_debug("GPU 0x%x not supported\n", val); 675 return -EINVAL; 676 } 677 } 678 679 return 0; 680 } 681 682 static void 683 svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange, 684 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 685 { 686 uint32_t i; 687 int gpuidx; 688 689 for (i = 0; i < nattr; i++) { 690 switch (attrs[i].type) { 691 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 692 prange->preferred_loc = attrs[i].value; 693 break; 694 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 695 prange->prefetch_loc = attrs[i].value; 696 break; 697 case KFD_IOCTL_SVM_ATTR_ACCESS: 698 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 699 case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 700 gpuidx = kfd_process_gpuidx_from_gpuid(p, 701 attrs[i].value); 702 if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) { 703 bitmap_clear(prange->bitmap_access, gpuidx, 1); 704 bitmap_clear(prange->bitmap_aip, gpuidx, 1); 705 } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) { 706 bitmap_set(prange->bitmap_access, gpuidx, 1); 707 bitmap_clear(prange->bitmap_aip, gpuidx, 1); 708 } else { 709 bitmap_clear(prange->bitmap_access, gpuidx, 1); 710 bitmap_set(prange->bitmap_aip, gpuidx, 1); 711 } 712 break; 713 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 714 prange->flags |= attrs[i].value; 715 break; 716 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 717 prange->flags &= ~attrs[i].value; 718 break; 719 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 720 prange->granularity = attrs[i].value; 721 break; 722 default: 723 WARN_ONCE(1, "svm_range_check_attrs wasn't called?"); 724 } 725 } 726 } 727 728 static bool 729 svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange, 730 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 731 { 732 uint32_t i; 733 int gpuidx; 734 735 for (i = 0; i < nattr; i++) { 736 switch (attrs[i].type) { 737 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 738 if (prange->preferred_loc != attrs[i].value) 739 return false; 740 break; 741 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 742 /* Prefetch should always trigger a migration even 743 * if the value of the attribute didn't change. 
744 */ 745 return false; 746 case KFD_IOCTL_SVM_ATTR_ACCESS: 747 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 748 case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 749 gpuidx = kfd_process_gpuidx_from_gpuid(p, 750 attrs[i].value); 751 if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) { 752 if (test_bit(gpuidx, prange->bitmap_access) || 753 test_bit(gpuidx, prange->bitmap_aip)) 754 return false; 755 } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) { 756 if (!test_bit(gpuidx, prange->bitmap_access)) 757 return false; 758 } else { 759 if (!test_bit(gpuidx, prange->bitmap_aip)) 760 return false; 761 } 762 break; 763 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 764 if ((prange->flags & attrs[i].value) != attrs[i].value) 765 return false; 766 break; 767 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 768 if ((prange->flags & attrs[i].value) != 0) 769 return false; 770 break; 771 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 772 if (prange->granularity != attrs[i].value) 773 return false; 774 break; 775 default: 776 WARN_ONCE(1, "svm_range_check_attrs wasn't called?"); 777 } 778 } 779 780 return true; 781 } 782 783 /** 784 * svm_range_debug_dump - print all range information from svms 785 * @svms: svm range list header 786 * 787 * debug output svm range start, end, prefetch location from svms 788 * interval tree and link list 789 * 790 * Context: The caller must hold svms->lock 791 */ 792 static void svm_range_debug_dump(struct svm_range_list *svms) 793 { 794 struct interval_tree_node *node; 795 struct svm_range *prange; 796 797 pr_debug("dump svms 0x%p list\n", svms); 798 pr_debug("range\tstart\tpage\tend\t\tlocation\n"); 799 800 list_for_each_entry(prange, &svms->list, list) { 801 pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n", 802 prange, prange->start, prange->npages, 803 prange->start + prange->npages - 1, 804 prange->actual_loc); 805 } 806 807 pr_debug("dump svms 0x%p interval tree\n", svms); 808 pr_debug("range\tstart\tpage\tend\t\tlocation\n"); 809 node = interval_tree_iter_first(&svms->objects, 0, ~0ULL); 810 while (node) { 811 prange = container_of(node, struct svm_range, it_node); 812 pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n", 813 prange, prange->start, prange->npages, 814 prange->start + prange->npages - 1, 815 prange->actual_loc); 816 node = interval_tree_iter_next(node, 0, ~0ULL); 817 } 818 } 819 820 static int 821 svm_range_split_array(void *ppnew, void *ppold, size_t size, 822 uint64_t old_start, uint64_t old_n, 823 uint64_t new_start, uint64_t new_n) 824 { 825 unsigned char *new, *old, *pold; 826 uint64_t d; 827 828 if (!ppold) 829 return 0; 830 pold = *(unsigned char **)ppold; 831 if (!pold) 832 return 0; 833 834 new = kvmalloc_array(new_n, size, GFP_KERNEL); 835 if (!new) 836 return -ENOMEM; 837 838 d = (new_start - old_start) * size; 839 memcpy(new, pold + d, new_n * size); 840 841 old = kvmalloc_array(old_n, size, GFP_KERNEL); 842 if (!old) { 843 kvfree(new); 844 return -ENOMEM; 845 } 846 847 d = (new_start == old_start) ? 
new_n * size : 0; 848 memcpy(old, pold + d, old_n * size); 849 850 kvfree(pold); 851 *(void **)ppold = old; 852 *(void **)ppnew = new; 853 854 return 0; 855 } 856 857 static int 858 svm_range_split_pages(struct svm_range *new, struct svm_range *old, 859 uint64_t start, uint64_t last) 860 { 861 uint64_t npages = last - start + 1; 862 int i, r; 863 864 for (i = 0; i < MAX_GPU_INSTANCE; i++) { 865 r = svm_range_split_array(&new->dma_addr[i], &old->dma_addr[i], 866 sizeof(*old->dma_addr[i]), old->start, 867 npages, new->start, new->npages); 868 if (r) 869 return r; 870 } 871 872 return 0; 873 } 874 875 static int 876 svm_range_split_nodes(struct svm_range *new, struct svm_range *old, 877 uint64_t start, uint64_t last) 878 { 879 uint64_t npages = last - start + 1; 880 881 pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n", 882 new->svms, new, new->start, start, last); 883 884 if (new->start == old->start) { 885 new->offset = old->offset; 886 old->offset += new->npages; 887 } else { 888 new->offset = old->offset + npages; 889 } 890 891 new->svm_bo = svm_range_bo_ref(old->svm_bo); 892 new->ttm_res = old->ttm_res; 893 894 spin_lock(&new->svm_bo->list_lock); 895 list_add(&new->svm_bo_list, &new->svm_bo->range_list); 896 spin_unlock(&new->svm_bo->list_lock); 897 898 return 0; 899 } 900 901 /** 902 * svm_range_split_adjust - split range and adjust 903 * 904 * @new: new range 905 * @old: the old range 906 * @start: the old range adjust to start address in pages 907 * @last: the old range adjust to last address in pages 908 * 909 * Copy system memory dma_addr or vram ttm_res in old range to new 910 * range from new_start up to size new->npages, the remaining old range is from 911 * start to last 912 * 913 * Return: 914 * 0 - OK, -ENOMEM - out of memory 915 */ 916 static int 917 svm_range_split_adjust(struct svm_range *new, struct svm_range *old, 918 uint64_t start, uint64_t last) 919 { 920 int r; 921 922 pr_debug("svms 0x%p new 0x%lx old [0x%lx 0x%lx] => [0x%llx 0x%llx]\n", 923 new->svms, new->start, old->start, old->last, start, last); 924 925 if (new->start < old->start || 926 new->last > old->last) { 927 WARN_ONCE(1, "invalid new range start or last\n"); 928 return -EINVAL; 929 } 930 931 r = svm_range_split_pages(new, old, start, last); 932 if (r) 933 return r; 934 935 if (old->actual_loc && old->ttm_res) { 936 r = svm_range_split_nodes(new, old, start, last); 937 if (r) 938 return r; 939 } 940 941 old->npages = last - start + 1; 942 old->start = start; 943 old->last = last; 944 new->flags = old->flags; 945 new->preferred_loc = old->preferred_loc; 946 new->prefetch_loc = old->prefetch_loc; 947 new->actual_loc = old->actual_loc; 948 new->granularity = old->granularity; 949 bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); 950 bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE); 951 952 return 0; 953 } 954 955 /** 956 * svm_range_split - split a range in 2 ranges 957 * 958 * @prange: the svm range to split 959 * @start: the remaining range start address in pages 960 * @last: the remaining range last address in pages 961 * @new: the result new range generated 962 * 963 * Two cases only: 964 * case 1: if start == prange->start 965 * prange ==> prange[start, last] 966 * new range [last + 1, prange->last] 967 * 968 * case 2: if last == prange->last 969 * prange ==> prange[start, last] 970 * new range [prange->start, start - 1] 971 * 972 * Return: 973 * 0 - OK, -ENOMEM - out of memory, -EINVAL - invalid start, last 974 */ 975 static int 976 
svm_range_split(struct svm_range *prange, uint64_t start, uint64_t last,
		struct svm_range **new)
{
	uint64_t old_start = prange->start;
	uint64_t old_last = prange->last;
	struct svm_range_list *svms;
	int r = 0;

	pr_debug("svms 0x%p [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", prange->svms,
		 old_start, old_last, start, last);

	if (old_start != start && old_last != last)
		return -EINVAL;
	if (start < old_start || last > old_last)
		return -EINVAL;

	svms = prange->svms;
	if (old_start == start)
		*new = svm_range_new(svms, last + 1, old_last);
	else
		*new = svm_range_new(svms, old_start, start - 1);
	if (!*new)
		return -ENOMEM;

	r = svm_range_split_adjust(*new, prange, start, last);
	if (r) {
		pr_debug("failed %d split [0x%llx 0x%llx] to [0x%llx 0x%llx]\n",
			 r, old_start, old_last, start, last);
		svm_range_free(*new);
		*new = NULL;
	}

	return r;
}

static int
svm_range_split_tail(struct svm_range *prange,
		     uint64_t new_last, struct list_head *insert_list)
{
	struct svm_range *tail;
	int r = svm_range_split(prange, prange->start, new_last, &tail);

	if (!r)
		list_add(&tail->list, insert_list);
	return r;
}

static int
svm_range_split_head(struct svm_range *prange,
		     uint64_t new_start, struct list_head *insert_list)
{
	struct svm_range *head;
	int r = svm_range_split(prange, new_start, prange->last, &head);

	if (!r)
		list_add(&head->list, insert_list);
	return r;
}

static void
svm_range_add_child(struct svm_range *prange, struct mm_struct *mm,
		    struct svm_range *pchild, enum svm_work_list_ops op)
{
	pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n",
		 pchild, pchild->start, pchild->last, prange, op);

	pchild->work_item.mm = mm;
	pchild->work_item.op = op;
	list_add_tail(&pchild->child_list, &prange->child_list);
}

/**
 * svm_range_split_by_granularity - collect ranges within granularity boundary
 *
 * @p: the process with svms list
 * @mm: mm structure
 * @addr: the vm fault address in pages, to split the prange
 * @parent: parent range if prange is from child list
 * @prange: prange to split
 *
 * Trims @prange to be a single aligned block of prange->granularity if
 * possible. The head and tail are added to the child_list in @parent.
 *
 * Context: caller must hold mmap_read_lock and prange->lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */
int
svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm,
			       unsigned long addr, struct svm_range *parent,
			       struct svm_range *prange)
{
	struct svm_range *head, *tail;
	unsigned long start, last, size;
	int r;

	/* Align the split range start and size to the granularity size, then
	 * a single PTE will be used for the whole range. This reduces the
	 * number of PTEs updated and the L1 TLB space used for translation.
1076 */ 1077 size = 1UL << prange->granularity; 1078 start = ALIGN_DOWN(addr, size); 1079 last = ALIGN(addr + 1, size) - 1; 1080 1081 pr_debug("svms 0x%p split [0x%lx 0x%lx] to [0x%lx 0x%lx] size 0x%lx\n", 1082 prange->svms, prange->start, prange->last, start, last, size); 1083 1084 if (start > prange->start) { 1085 r = svm_range_split(prange, start, prange->last, &head); 1086 if (r) 1087 return r; 1088 svm_range_add_child(parent, mm, head, SVM_OP_ADD_RANGE); 1089 } 1090 1091 if (last < prange->last) { 1092 r = svm_range_split(prange, prange->start, last, &tail); 1093 if (r) 1094 return r; 1095 svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE); 1096 } 1097 1098 /* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */ 1099 if (p->xnack_enabled && prange->work_item.op == SVM_OP_ADD_RANGE) { 1100 prange->work_item.op = SVM_OP_ADD_RANGE_AND_MAP; 1101 pr_debug("change prange 0x%p [0x%lx 0x%lx] op %d\n", 1102 prange, prange->start, prange->last, 1103 SVM_OP_ADD_RANGE_AND_MAP); 1104 } 1105 return 0; 1106 } 1107 1108 static uint64_t 1109 svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange, 1110 int domain) 1111 { 1112 struct amdgpu_device *bo_adev; 1113 uint32_t flags = prange->flags; 1114 uint32_t mapping_flags = 0; 1115 uint64_t pte_flags; 1116 bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN); 1117 bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT; 1118 1119 if (domain == SVM_RANGE_VRAM_DOMAIN) 1120 bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); 1121 1122 switch (KFD_GC_VERSION(adev->kfd.dev)) { 1123 case IP_VERSION(9, 4, 1): 1124 if (domain == SVM_RANGE_VRAM_DOMAIN) { 1125 if (bo_adev == adev) { 1126 mapping_flags |= coherent ? 1127 AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; 1128 } else { 1129 mapping_flags |= coherent ? 1130 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; 1131 if (amdgpu_xgmi_same_hive(adev, bo_adev)) 1132 snoop = true; 1133 } 1134 } else { 1135 mapping_flags |= coherent ? 1136 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; 1137 } 1138 break; 1139 case IP_VERSION(9, 4, 2): 1140 if (domain == SVM_RANGE_VRAM_DOMAIN) { 1141 if (bo_adev == adev) { 1142 mapping_flags |= coherent ? 1143 AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; 1144 if (adev->gmc.xgmi.connected_to_cpu) 1145 snoop = true; 1146 } else { 1147 mapping_flags |= coherent ? 1148 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; 1149 if (amdgpu_xgmi_same_hive(adev, bo_adev)) 1150 snoop = true; 1151 } 1152 } else { 1153 mapping_flags |= coherent ? 1154 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; 1155 } 1156 break; 1157 default: 1158 mapping_flags |= coherent ? 1159 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; 1160 } 1161 1162 mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE; 1163 1164 if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO) 1165 mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE; 1166 if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC) 1167 mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE; 1168 1169 pte_flags = AMDGPU_PTE_VALID; 1170 pte_flags |= (domain == SVM_RANGE_VRAM_DOMAIN) ? 0 : AMDGPU_PTE_SYSTEM; 1171 pte_flags |= snoop ? 
AMDGPU_PTE_SNOOPED : 0; 1172 1173 pte_flags |= amdgpu_gem_va_map_flags(adev, mapping_flags); 1174 return pte_flags; 1175 } 1176 1177 static int 1178 svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, 1179 uint64_t start, uint64_t last, 1180 struct dma_fence **fence) 1181 { 1182 uint64_t init_pte_value = 0; 1183 1184 pr_debug("[0x%llx 0x%llx]\n", start, last); 1185 1186 return amdgpu_vm_bo_update_mapping(adev, adev, vm, false, true, NULL, 1187 start, last, init_pte_value, 0, 1188 NULL, NULL, fence, NULL); 1189 } 1190 1191 static int 1192 svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start, 1193 unsigned long last) 1194 { 1195 DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE); 1196 struct kfd_process_device *pdd; 1197 struct dma_fence *fence = NULL; 1198 struct kfd_process *p; 1199 uint32_t gpuidx; 1200 int r = 0; 1201 1202 bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip, 1203 MAX_GPU_INSTANCE); 1204 p = container_of(prange->svms, struct kfd_process, svms); 1205 1206 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { 1207 pr_debug("unmap from gpu idx 0x%x\n", gpuidx); 1208 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 1209 if (!pdd) { 1210 pr_debug("failed to find device idx %d\n", gpuidx); 1211 return -EINVAL; 1212 } 1213 1214 r = svm_range_unmap_from_gpu(pdd->dev->adev, 1215 drm_priv_to_vm(pdd->drm_priv), 1216 start, last, &fence); 1217 if (r) 1218 break; 1219 1220 if (fence) { 1221 r = dma_fence_wait(fence, false); 1222 dma_fence_put(fence); 1223 fence = NULL; 1224 if (r) 1225 break; 1226 } 1227 amdgpu_amdkfd_flush_gpu_tlb_pasid(pdd->dev->adev, 1228 p->pasid, TLB_FLUSH_HEAVYWEIGHT); 1229 } 1230 1231 return r; 1232 } 1233 1234 static int 1235 svm_range_map_to_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, 1236 struct svm_range *prange, unsigned long offset, 1237 unsigned long npages, bool readonly, dma_addr_t *dma_addr, 1238 struct amdgpu_device *bo_adev, struct dma_fence **fence) 1239 { 1240 bool table_freed = false; 1241 uint64_t pte_flags; 1242 unsigned long last_start; 1243 int last_domain; 1244 int r = 0; 1245 int64_t i, j; 1246 1247 last_start = prange->start + offset; 1248 1249 pr_debug("svms 0x%p [0x%lx 0x%lx] readonly %d\n", prange->svms, 1250 last_start, last_start + npages - 1, readonly); 1251 1252 for (i = offset; i < offset + npages; i++) { 1253 last_domain = dma_addr[i] & SVM_RANGE_VRAM_DOMAIN; 1254 dma_addr[i] &= ~SVM_RANGE_VRAM_DOMAIN; 1255 1256 /* Collect all pages in the same address range and memory domain 1257 * that can be mapped with a single call to update mapping. 1258 */ 1259 if (i < offset + npages - 1 && 1260 last_domain == (dma_addr[i + 1] & SVM_RANGE_VRAM_DOMAIN)) 1261 continue; 1262 1263 pr_debug("Mapping range [0x%lx 0x%llx] on domain: %s\n", 1264 last_start, prange->start + i, last_domain ? "GPU" : "CPU"); 1265 1266 pte_flags = svm_range_get_pte_flags(adev, prange, last_domain); 1267 if (readonly) 1268 pte_flags &= ~AMDGPU_PTE_WRITEABLE; 1269 1270 pr_debug("svms 0x%p map [0x%lx 0x%llx] vram %d PTE 0x%llx\n", 1271 prange->svms, last_start, prange->start + i, 1272 (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 
1 : 0, 1273 pte_flags); 1274 1275 r = amdgpu_vm_bo_update_mapping(adev, bo_adev, vm, false, false, 1276 NULL, last_start, 1277 prange->start + i, pte_flags, 1278 last_start - prange->start, 1279 NULL, dma_addr, 1280 &vm->last_update, 1281 &table_freed); 1282 1283 for (j = last_start - prange->start; j <= i; j++) 1284 dma_addr[j] |= last_domain; 1285 1286 if (r) { 1287 pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start); 1288 goto out; 1289 } 1290 last_start = prange->start + i + 1; 1291 } 1292 1293 r = amdgpu_vm_update_pdes(adev, vm, false); 1294 if (r) { 1295 pr_debug("failed %d to update directories 0x%lx\n", r, 1296 prange->start); 1297 goto out; 1298 } 1299 1300 if (fence) 1301 *fence = dma_fence_get(vm->last_update); 1302 1303 if (table_freed) { 1304 struct kfd_process *p; 1305 1306 p = container_of(prange->svms, struct kfd_process, svms); 1307 amdgpu_amdkfd_flush_gpu_tlb_pasid(adev, p->pasid, TLB_FLUSH_LEGACY); 1308 } 1309 out: 1310 return r; 1311 } 1312 1313 static int 1314 svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset, 1315 unsigned long npages, bool readonly, 1316 unsigned long *bitmap, bool wait) 1317 { 1318 struct kfd_process_device *pdd; 1319 struct amdgpu_device *bo_adev; 1320 struct kfd_process *p; 1321 struct dma_fence *fence = NULL; 1322 uint32_t gpuidx; 1323 int r = 0; 1324 1325 if (prange->svm_bo && prange->ttm_res) 1326 bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); 1327 else 1328 bo_adev = NULL; 1329 1330 p = container_of(prange->svms, struct kfd_process, svms); 1331 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { 1332 pr_debug("mapping to gpu idx 0x%x\n", gpuidx); 1333 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 1334 if (!pdd) { 1335 pr_debug("failed to find device idx %d\n", gpuidx); 1336 return -EINVAL; 1337 } 1338 1339 pdd = kfd_bind_process_to_device(pdd->dev, p); 1340 if (IS_ERR(pdd)) 1341 return -EINVAL; 1342 1343 if (bo_adev && pdd->dev->adev != bo_adev && 1344 !amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) { 1345 pr_debug("cannot map to device idx %d\n", gpuidx); 1346 continue; 1347 } 1348 1349 r = svm_range_map_to_gpu(pdd->dev->adev, drm_priv_to_vm(pdd->drm_priv), 1350 prange, offset, npages, readonly, 1351 prange->dma_addr[gpuidx], 1352 bo_adev, wait ? 
					 &fence : NULL);
		if (r)
			break;

		if (fence) {
			r = dma_fence_wait(fence, false);
			dma_fence_put(fence);
			fence = NULL;
			if (r) {
				pr_debug("failed %d to dma fence wait\n", r);
				break;
			}
		}
	}

	return r;
}

struct svm_validate_context {
	struct kfd_process *process;
	struct svm_range *prange;
	bool intr;
	unsigned long bitmap[MAX_GPU_INSTANCE];
	struct ttm_validate_buffer tv[MAX_GPU_INSTANCE];
	struct list_head validate_list;
	struct ww_acquire_ctx ticket;
};

static int svm_range_reserve_bos(struct svm_validate_context *ctx)
{
	struct kfd_process_device *pdd;
	struct amdgpu_vm *vm;
	uint32_t gpuidx;
	int r;

	INIT_LIST_HEAD(&ctx->validate_list);
	for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
		pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			return -EINVAL;
		}
		vm = drm_priv_to_vm(pdd->drm_priv);

		ctx->tv[gpuidx].bo = &vm->root.bo->tbo;
		ctx->tv[gpuidx].num_shared = 4;
		list_add(&ctx->tv[gpuidx].head, &ctx->validate_list);
	}

	r = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->validate_list,
				   ctx->intr, NULL);
	if (r) {
		pr_debug("failed %d to reserve bo\n", r);
		return r;
	}

	for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
		pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			r = -EINVAL;
			goto unreserve_out;
		}

		r = amdgpu_vm_validate_pt_bos(pdd->dev->adev,
					      drm_priv_to_vm(pdd->drm_priv),
					      svm_range_bo_validate, NULL);
		if (r) {
			pr_debug("failed %d validate pt bos\n", r);
			goto unreserve_out;
		}
	}

	return 0;

unreserve_out:
	ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list);
	return r;
}

static void svm_range_unreserve_bos(struct svm_validate_context *ctx)
{
	ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list);
}

static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx)
{
	struct kfd_process_device *pdd;

	pdd = kfd_process_device_from_gpuidx(p, gpuidx);

	return SVM_ADEV_PGMAP_OWNER(pdd->dev->adev);
}

/*
 * Validation+GPU mapping with concurrent invalidation (MMU notifiers)
 *
 * To prevent concurrent destruction or change of range attributes, the
 * svm_read_lock must be held. The caller must not hold the svm_write_lock
 * because that would block concurrent evictions and lead to deadlocks. To
 * serialize concurrent migrations or validations of the same range, the
 * prange->migrate_mutex must be held.
 *
 * For VRAM ranges, the SVM BO must be allocated and valid (protected by its
 * eviction fence).
 *
 * The following sequence ensures race-free validation and GPU mapping:
 *
 * 1. Reserve page table (and SVM BO if range is in VRAM)
 * 2. hmm_range_fault to get page addresses (if system memory)
 * 3. DMA-map pages (if system memory)
 * 4-a. Take notifier lock
 * 4-b. Check that pages still valid (mmu_interval_read_retry)
 * 4-c. Check that the range was not split or otherwise invalidated
 * 4-d. Update GPU page table
 * 4-e. Release notifier lock
 * 5.
Release page table (and SVM BO) reservation 1469 */ 1470 static int svm_range_validate_and_map(struct mm_struct *mm, 1471 struct svm_range *prange, 1472 int32_t gpuidx, bool intr, bool wait) 1473 { 1474 struct svm_validate_context ctx; 1475 unsigned long start, end, addr; 1476 struct kfd_process *p; 1477 void *owner; 1478 int32_t idx; 1479 int r = 0; 1480 1481 ctx.process = container_of(prange->svms, struct kfd_process, svms); 1482 ctx.prange = prange; 1483 ctx.intr = intr; 1484 1485 if (gpuidx < MAX_GPU_INSTANCE) { 1486 bitmap_zero(ctx.bitmap, MAX_GPU_INSTANCE); 1487 bitmap_set(ctx.bitmap, gpuidx, 1); 1488 } else if (ctx.process->xnack_enabled) { 1489 bitmap_copy(ctx.bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE); 1490 1491 /* If prefetch range to GPU, or GPU retry fault migrate range to 1492 * GPU, which has ACCESS attribute to the range, create mapping 1493 * on that GPU. 1494 */ 1495 if (prange->actual_loc) { 1496 gpuidx = kfd_process_gpuidx_from_gpuid(ctx.process, 1497 prange->actual_loc); 1498 if (gpuidx < 0) { 1499 WARN_ONCE(1, "failed get device by id 0x%x\n", 1500 prange->actual_loc); 1501 return -EINVAL; 1502 } 1503 if (test_bit(gpuidx, prange->bitmap_access)) 1504 bitmap_set(ctx.bitmap, gpuidx, 1); 1505 } 1506 } else { 1507 bitmap_or(ctx.bitmap, prange->bitmap_access, 1508 prange->bitmap_aip, MAX_GPU_INSTANCE); 1509 } 1510 1511 if (bitmap_empty(ctx.bitmap, MAX_GPU_INSTANCE)) 1512 return 0; 1513 1514 if (prange->actual_loc && !prange->ttm_res) { 1515 /* This should never happen. actual_loc gets set by 1516 * svm_migrate_ram_to_vram after allocating a BO. 1517 */ 1518 WARN_ONCE(1, "VRAM BO missing during validation\n"); 1519 return -EINVAL; 1520 } 1521 1522 svm_range_reserve_bos(&ctx); 1523 1524 p = container_of(prange->svms, struct kfd_process, svms); 1525 owner = kfd_svm_page_owner(p, find_first_bit(ctx.bitmap, 1526 MAX_GPU_INSTANCE)); 1527 for_each_set_bit(idx, ctx.bitmap, MAX_GPU_INSTANCE) { 1528 if (kfd_svm_page_owner(p, idx) != owner) { 1529 owner = NULL; 1530 break; 1531 } 1532 } 1533 1534 start = prange->start << PAGE_SHIFT; 1535 end = (prange->last + 1) << PAGE_SHIFT; 1536 for (addr = start; addr < end && !r; ) { 1537 struct hmm_range *hmm_range; 1538 struct vm_area_struct *vma; 1539 unsigned long next; 1540 unsigned long offset; 1541 unsigned long npages; 1542 bool readonly; 1543 1544 vma = find_vma(mm, addr); 1545 if (!vma || addr < vma->vm_start) { 1546 r = -EFAULT; 1547 goto unreserve_out; 1548 } 1549 readonly = !(vma->vm_flags & VM_WRITE); 1550 1551 next = min(vma->vm_end, end); 1552 npages = (next - addr) >> PAGE_SHIFT; 1553 WRITE_ONCE(p->svms.faulting_task, current); 1554 r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL, 1555 addr, npages, &hmm_range, 1556 readonly, true, owner); 1557 WRITE_ONCE(p->svms.faulting_task, NULL); 1558 if (r) { 1559 pr_debug("failed %d to get svm range pages\n", r); 1560 goto unreserve_out; 1561 } 1562 1563 offset = (addr - start) >> PAGE_SHIFT; 1564 r = svm_range_dma_map(prange, ctx.bitmap, offset, npages, 1565 hmm_range->hmm_pfns); 1566 if (r) { 1567 pr_debug("failed %d to dma map range\n", r); 1568 goto unreserve_out; 1569 } 1570 1571 svm_range_lock(prange); 1572 if (amdgpu_hmm_range_get_pages_done(hmm_range)) { 1573 pr_debug("hmm update the range, need validate again\n"); 1574 r = -EAGAIN; 1575 goto unlock_out; 1576 } 1577 if (!list_empty(&prange->child_list)) { 1578 pr_debug("range split by unmap in parallel, validate again\n"); 1579 r = -EAGAIN; 1580 goto unlock_out; 1581 } 1582 1583 r = svm_range_map_to_gpus(prange, 
offset, npages, readonly, 1584 ctx.bitmap, wait); 1585 1586 unlock_out: 1587 svm_range_unlock(prange); 1588 1589 addr = next; 1590 } 1591 1592 if (addr == end) 1593 prange->validated_once = true; 1594 1595 unreserve_out: 1596 svm_range_unreserve_bos(&ctx); 1597 1598 if (!r) 1599 prange->validate_timestamp = ktime_to_us(ktime_get()); 1600 1601 return r; 1602 } 1603 1604 /** 1605 * svm_range_list_lock_and_flush_work - flush pending deferred work 1606 * 1607 * @svms: the svm range list 1608 * @mm: the mm structure 1609 * 1610 * Context: Returns with mmap write lock held, pending deferred work flushed 1611 * 1612 */ 1613 void 1614 svm_range_list_lock_and_flush_work(struct svm_range_list *svms, 1615 struct mm_struct *mm) 1616 { 1617 retry_flush_work: 1618 flush_work(&svms->deferred_list_work); 1619 mmap_write_lock(mm); 1620 1621 if (list_empty(&svms->deferred_range_list)) 1622 return; 1623 mmap_write_unlock(mm); 1624 pr_debug("retry flush\n"); 1625 goto retry_flush_work; 1626 } 1627 1628 static void svm_range_restore_work(struct work_struct *work) 1629 { 1630 struct delayed_work *dwork = to_delayed_work(work); 1631 struct svm_range_list *svms; 1632 struct svm_range *prange; 1633 struct kfd_process *p; 1634 struct mm_struct *mm; 1635 int evicted_ranges; 1636 int invalid; 1637 int r; 1638 1639 svms = container_of(dwork, struct svm_range_list, restore_work); 1640 evicted_ranges = atomic_read(&svms->evicted_ranges); 1641 if (!evicted_ranges) 1642 return; 1643 1644 pr_debug("restore svm ranges\n"); 1645 1646 /* kfd_process_notifier_release destroys this worker thread. So during 1647 * the lifetime of this thread, kfd_process and mm will be valid. 1648 */ 1649 p = container_of(svms, struct kfd_process, svms); 1650 mm = p->mm; 1651 if (!mm) 1652 return; 1653 1654 svm_range_list_lock_and_flush_work(svms, mm); 1655 mutex_lock(&svms->lock); 1656 1657 evicted_ranges = atomic_read(&svms->evicted_ranges); 1658 1659 list_for_each_entry(prange, &svms->list, list) { 1660 invalid = atomic_read(&prange->invalid); 1661 if (!invalid) 1662 continue; 1663 1664 pr_debug("restoring svms 0x%p prange 0x%p [0x%lx %lx] inv %d\n", 1665 prange->svms, prange, prange->start, prange->last, 1666 invalid); 1667 1668 /* 1669 * If range is migrating, wait for migration is done. 1670 */ 1671 mutex_lock(&prange->migrate_mutex); 1672 1673 r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE, 1674 false, true); 1675 if (r) 1676 pr_debug("failed %d to map 0x%lx to gpus\n", r, 1677 prange->start); 1678 1679 mutex_unlock(&prange->migrate_mutex); 1680 if (r) 1681 goto out_reschedule; 1682 1683 if (atomic_cmpxchg(&prange->invalid, invalid, 0) != invalid) 1684 goto out_reschedule; 1685 } 1686 1687 if (atomic_cmpxchg(&svms->evicted_ranges, evicted_ranges, 0) != 1688 evicted_ranges) 1689 goto out_reschedule; 1690 1691 evicted_ranges = 0; 1692 1693 r = kgd2kfd_resume_mm(mm); 1694 if (r) { 1695 /* No recovery from this failure. Probably the CP is 1696 * hanging. No point trying again. 
		 */
		pr_debug("failed %d to resume KFD\n", r);
	}

	pr_debug("restore svm ranges successfully\n");

out_reschedule:
	mutex_unlock(&svms->lock);
	mmap_write_unlock(mm);

	/* If validation failed, reschedule another attempt */
	if (evicted_ranges) {
		pr_debug("reschedule to restore svm range\n");
		schedule_delayed_work(&svms->restore_work,
			msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
	}
}

/**
 * svm_range_evict - evict svm range
 * @prange: svm range structure
 * @mm: current process mm_struct
 * @start: first page of the address range being invalidated
 * @last: last page of the address range being invalidated
 *
 * Stop all queues of the process to ensure the GPU doesn't access the memory,
 * then return to let the CPU evict the buffer and proceed with the CPU page
 * table update.
 *
 * No lock is needed to synchronize the CPU page table invalidation with GPU
 * execution. If an invalidation happens while the restore work is running,
 * the restore work restarts to ensure the latest CPU page mappings reach the
 * GPU before the queues are started again.
 */
static int
svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
		unsigned long start, unsigned long last)
{
	struct svm_range_list *svms = prange->svms;
	struct svm_range *pchild;
	struct kfd_process *p;
	int r = 0;

	p = container_of(svms, struct kfd_process, svms);

	pr_debug("invalidate svms 0x%p prange [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
		 svms, prange->start, prange->last, start, last);

	if (!p->xnack_enabled) {
		int evicted_ranges;

		list_for_each_entry(pchild, &prange->child_list, child_list) {
			mutex_lock_nested(&pchild->lock, 1);
			if (pchild->start <= last && pchild->last >= start) {
				pr_debug("increment pchild invalid [0x%lx 0x%lx]\n",
					 pchild->start, pchild->last);
				atomic_inc(&pchild->invalid);
			}
			mutex_unlock(&pchild->lock);
		}

		if (prange->start <= last && prange->last >= start)
			atomic_inc(&prange->invalid);

		evicted_ranges = atomic_inc_return(&svms->evicted_ranges);
		if (evicted_ranges != 1)
			return r;

		pr_debug("evicting svms 0x%p range [0x%lx 0x%lx]\n",
			 prange->svms, prange->start, prange->last);

		/* First eviction, stop the queues */
		r = kgd2kfd_quiesce_mm(mm);
		if (r)
			pr_debug("failed to quiesce KFD\n");

		pr_debug("schedule to restore svm %p ranges\n", svms);
		schedule_delayed_work(&svms->restore_work,
			msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
	} else {
		unsigned long s, l;

		pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n",
			 prange->svms, start, last);
		list_for_each_entry(pchild, &prange->child_list, child_list) {
			mutex_lock_nested(&pchild->lock, 1);
			s = max(start, pchild->start);
			l = min(last, pchild->last);
			if (l >= s)
				svm_range_unmap_from_gpus(pchild, s, l);
			mutex_unlock(&pchild->lock);
		}
		s = max(start, prange->start);
		l = min(last, prange->last);
		if (l >= s)
			svm_range_unmap_from_gpus(prange, s, l);
	}

	return r;
}

static struct svm_range *svm_range_clone(struct svm_range *old)
{
	struct svm_range *new;

	new = svm_range_new(old->svms, old->start, old->last);
	if (!new)
		return NULL;

	if (old->svm_bo) {
		new->ttm_res = old->ttm_res;
		new->offset = old->offset;
		new->svm_bo = svm_range_bo_ref(old->svm_bo);
		spin_lock(&new->svm_bo->list_lock);
		list_add(&new->svm_bo_list, &new->svm_bo->range_list);
		spin_unlock(&new->svm_bo->list_lock);
	}
	new->flags = old->flags;
	new->preferred_loc = old->preferred_loc;
	new->prefetch_loc = old->prefetch_loc;
	new->actual_loc = old->actual_loc;
	new->granularity = old->granularity;
	bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
	bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);

	return new;
}

/**
 * svm_range_add - add svm range and handle overlap
 * @p: the process whose svms list the range is added to
 * @start: start of the range, page aligned (in pages)
 * @size: size of the range, page aligned (in pages)
 * @nattr: number of attributes
 * @attrs: array of attributes
 * @update_list: output, the ranges need validate and update GPU mapping
 * @insert_list: output, the ranges need insert to svms
 * @remove_list: output, the ranges are replaced and need remove from svms
 *
 * Check if the virtual address range has overlap with any existing ranges,
 * split partly overlapping ranges and add new ranges in the gaps. All changes
 * should be applied to the range_list and interval tree transactionally. If
 * any range split or allocation fails, the entire update fails. Therefore any
 * existing overlapping svm_ranges are cloned and the original svm_ranges left
 * unchanged.
 *
 * If the transaction succeeds, the caller can update and insert clones and
 * new ranges, then free the originals.
 *
 * Otherwise the caller can free the clones and new ranges, while the old
 * svm_ranges remain unchanged.
 *
 * Context: Process context, caller must hold svms->lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */
static int
svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
	      uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
	      struct list_head *update_list, struct list_head *insert_list,
	      struct list_head *remove_list)
{
	unsigned long last = start + size - 1UL;
	struct svm_range_list *svms = &p->svms;
	struct interval_tree_node *node;
	struct svm_range *prange;
	struct svm_range *tmp;
	int r = 0;

	pr_debug("svms 0x%p [0x%llx 0x%lx]\n", &p->svms, start, last);

	INIT_LIST_HEAD(update_list);
	INIT_LIST_HEAD(insert_list);
	INIT_LIST_HEAD(remove_list);

	node = interval_tree_iter_first(&svms->objects, start, last);
	while (node) {
		struct interval_tree_node *next;
		unsigned long next_start;

		pr_debug("found overlap node [0x%lx 0x%lx]\n", node->start,
			 node->last);

		prange = container_of(node, struct svm_range, it_node);
		next = interval_tree_iter_next(node, start, last);
		next_start = min(node->last, last) + 1;

		if (svm_range_is_same_attrs(p, prange, nattr, attrs)) {
			/* nothing to do */
		} else if (node->start < start || node->last > last) {
			/* node intersects the update range and its attributes
			 * will change.
Clone and split it, apply updates only 1889 * to the overlapping part 1890 */ 1891 struct svm_range *old = prange; 1892 1893 prange = svm_range_clone(old); 1894 if (!prange) { 1895 r = -ENOMEM; 1896 goto out; 1897 } 1898 1899 list_add(&old->update_list, remove_list); 1900 list_add(&prange->list, insert_list); 1901 list_add(&prange->update_list, update_list); 1902 1903 if (node->start < start) { 1904 pr_debug("change old range start\n"); 1905 r = svm_range_split_head(prange, start, 1906 insert_list); 1907 if (r) 1908 goto out; 1909 } 1910 if (node->last > last) { 1911 pr_debug("change old range last\n"); 1912 r = svm_range_split_tail(prange, last, 1913 insert_list); 1914 if (r) 1915 goto out; 1916 } 1917 } else { 1918 /* The node is contained within start..last, 1919 * just update it 1920 */ 1921 list_add(&prange->update_list, update_list); 1922 } 1923 1924 /* insert a new node if needed */ 1925 if (node->start > start) { 1926 prange = svm_range_new(svms, start, node->start - 1); 1927 if (!prange) { 1928 r = -ENOMEM; 1929 goto out; 1930 } 1931 1932 list_add(&prange->list, insert_list); 1933 list_add(&prange->update_list, update_list); 1934 } 1935 1936 node = next; 1937 start = next_start; 1938 } 1939 1940 /* add a final range at the end if needed */ 1941 if (start <= last) { 1942 prange = svm_range_new(svms, start, last); 1943 if (!prange) { 1944 r = -ENOMEM; 1945 goto out; 1946 } 1947 list_add(&prange->list, insert_list); 1948 list_add(&prange->update_list, update_list); 1949 } 1950 1951 out: 1952 if (r) 1953 list_for_each_entry_safe(prange, tmp, insert_list, list) 1954 svm_range_free(prange); 1955 1956 return r; 1957 } 1958 1959 static void 1960 svm_range_update_notifier_and_interval_tree(struct mm_struct *mm, 1961 struct svm_range *prange) 1962 { 1963 unsigned long start; 1964 unsigned long last; 1965 1966 start = prange->notifier.interval_tree.start >> PAGE_SHIFT; 1967 last = prange->notifier.interval_tree.last >> PAGE_SHIFT; 1968 1969 if (prange->start == start && prange->last == last) 1970 return; 1971 1972 pr_debug("up notifier 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", 1973 prange->svms, prange, start, last, prange->start, 1974 prange->last); 1975 1976 if (start != 0 && last != 0) { 1977 interval_tree_remove(&prange->it_node, &prange->svms->objects); 1978 svm_range_remove_notifier(prange); 1979 } 1980 prange->it_node.start = prange->start; 1981 prange->it_node.last = prange->last; 1982 1983 interval_tree_insert(&prange->it_node, &prange->svms->objects); 1984 svm_range_add_notifier_locked(mm, prange); 1985 } 1986 1987 static void 1988 svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange) 1989 { 1990 struct mm_struct *mm = prange->work_item.mm; 1991 1992 switch (prange->work_item.op) { 1993 case SVM_OP_NULL: 1994 pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n", 1995 svms, prange, prange->start, prange->last); 1996 break; 1997 case SVM_OP_UNMAP_RANGE: 1998 pr_debug("remove 0x%p prange 0x%p [0x%lx 0x%lx]\n", 1999 svms, prange, prange->start, prange->last); 2000 svm_range_unlink(prange); 2001 svm_range_remove_notifier(prange); 2002 svm_range_free(prange); 2003 break; 2004 case SVM_OP_UPDATE_RANGE_NOTIFIER: 2005 pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n", 2006 svms, prange, prange->start, prange->last); 2007 svm_range_update_notifier_and_interval_tree(mm, prange); 2008 break; 2009 case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP: 2010 pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", 2011 svms, prange, prange->start, prange->last); 
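/* The range boundaries may have changed while this work item sat on the
 * deferred list (e.g. after a split); re-sync the interval tree and MMU
 * notifier with the new prange->start/last before the deferred validate
 * and map step, which is still a TODO below.
 */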
2012 svm_range_update_notifier_and_interval_tree(mm, prange); 2013 /* TODO: implement deferred validation and mapping */ 2014 break; 2015 case SVM_OP_ADD_RANGE: 2016 pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange, 2017 prange->start, prange->last); 2018 svm_range_add_to_svms(prange); 2019 svm_range_add_notifier_locked(mm, prange); 2020 break; 2021 case SVM_OP_ADD_RANGE_AND_MAP: 2022 pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, 2023 prange, prange->start, prange->last); 2024 svm_range_add_to_svms(prange); 2025 svm_range_add_notifier_locked(mm, prange); 2026 /* TODO: implement deferred validation and mapping */ 2027 break; 2028 default: 2029 WARN_ONCE(1, "Unknown prange 0x%p work op %d\n", prange, 2030 prange->work_item.op); 2031 } 2032 } 2033 2034 static void svm_range_drain_retry_fault(struct svm_range_list *svms) 2035 { 2036 struct kfd_process_device *pdd; 2037 struct kfd_process *p; 2038 int drain; 2039 uint32_t i; 2040 2041 p = container_of(svms, struct kfd_process, svms); 2042 2043 restart: 2044 drain = atomic_read(&svms->drain_pagefaults); 2045 if (!drain) 2046 return; 2047 2048 for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) { 2049 pdd = p->pdds[i]; 2050 if (!pdd) 2051 continue; 2052 2053 pr_debug("drain retry fault gpu %d svms %p\n", i, svms); 2054 2055 amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev, 2056 &pdd->dev->adev->irq.ih1); 2057 pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms); 2058 } 2059 if (atomic_cmpxchg(&svms->drain_pagefaults, drain, 0) != drain) 2060 goto restart; 2061 } 2062 2063 static void svm_range_deferred_list_work(struct work_struct *work) 2064 { 2065 struct svm_range_list *svms; 2066 struct svm_range *prange; 2067 struct mm_struct *mm; 2068 struct kfd_process *p; 2069 2070 svms = container_of(work, struct svm_range_list, deferred_list_work); 2071 pr_debug("enter svms 0x%p\n", svms); 2072 2073 p = container_of(svms, struct kfd_process, svms); 2074 /* Avoid mm is gone when inserting mmu notifier */ 2075 mm = get_task_mm(p->lead_thread); 2076 if (!mm) { 2077 pr_debug("svms 0x%p process mm gone\n", svms); 2078 return; 2079 } 2080 retry: 2081 mmap_write_lock(mm); 2082 2083 /* Checking for the need to drain retry faults must be inside 2084 * mmap write lock to serialize with munmap notifiers. 
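 *
 * The unmap path (svm_range_unmap_from_cpu() below) increments
 * svms->drain_pagefaults while the munmap caller is holding the mmap
 * write lock, so taking the write lock here guarantees we observe that
 * increment and drain the retry faults before freeing the prange.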
2085 */ 2086 if (unlikely(atomic_read(&svms->drain_pagefaults))) { 2087 mmap_write_unlock(mm); 2088 svm_range_drain_retry_fault(svms); 2089 goto retry; 2090 } 2091 2092 spin_lock(&svms->deferred_list_lock); 2093 while (!list_empty(&svms->deferred_range_list)) { 2094 prange = list_first_entry(&svms->deferred_range_list, 2095 struct svm_range, deferred_list); 2096 list_del_init(&prange->deferred_list); 2097 spin_unlock(&svms->deferred_list_lock); 2098 2099 pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange, 2100 prange->start, prange->last, prange->work_item.op); 2101 2102 mutex_lock(&svms->lock); 2103 mutex_lock(&prange->migrate_mutex); 2104 while (!list_empty(&prange->child_list)) { 2105 struct svm_range *pchild; 2106 2107 pchild = list_first_entry(&prange->child_list, 2108 struct svm_range, child_list); 2109 pr_debug("child prange 0x%p op %d\n", pchild, 2110 pchild->work_item.op); 2111 list_del_init(&pchild->child_list); 2112 svm_range_handle_list_op(svms, pchild); 2113 } 2114 mutex_unlock(&prange->migrate_mutex); 2115 2116 svm_range_handle_list_op(svms, prange); 2117 mutex_unlock(&svms->lock); 2118 2119 spin_lock(&svms->deferred_list_lock); 2120 } 2121 spin_unlock(&svms->deferred_list_lock); 2122 2123 mmap_write_unlock(mm); 2124 mmput(mm); 2125 pr_debug("exit svms 0x%p\n", svms); 2126 } 2127 2128 void 2129 svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange, 2130 struct mm_struct *mm, enum svm_work_list_ops op) 2131 { 2132 spin_lock(&svms->deferred_list_lock); 2133 /* if prange is on the deferred list */ 2134 if (!list_empty(&prange->deferred_list)) { 2135 pr_debug("update exist prange 0x%p work op %d\n", prange, op); 2136 WARN_ONCE(prange->work_item.mm != mm, "unmatch mm\n"); 2137 if (op != SVM_OP_NULL && 2138 prange->work_item.op != SVM_OP_UNMAP_RANGE) 2139 prange->work_item.op = op; 2140 } else { 2141 prange->work_item.op = op; 2142 prange->work_item.mm = mm; 2143 list_add_tail(&prange->deferred_list, 2144 &prange->svms->deferred_range_list); 2145 pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n", 2146 prange, prange->start, prange->last, op); 2147 } 2148 spin_unlock(&svms->deferred_list_lock); 2149 } 2150 2151 void schedule_deferred_list_work(struct svm_range_list *svms) 2152 { 2153 spin_lock(&svms->deferred_list_lock); 2154 if (!list_empty(&svms->deferred_range_list)) 2155 schedule_work(&svms->deferred_list_work); 2156 spin_unlock(&svms->deferred_list_lock); 2157 } 2158 2159 static void 2160 svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent, 2161 struct svm_range *prange, unsigned long start, 2162 unsigned long last) 2163 { 2164 struct svm_range *head; 2165 struct svm_range *tail; 2166 2167 if (prange->work_item.op == SVM_OP_UNMAP_RANGE) { 2168 pr_debug("prange 0x%p [0x%lx 0x%lx] is already freed\n", prange, 2169 prange->start, prange->last); 2170 return; 2171 } 2172 if (start > prange->last || last < prange->start) 2173 return; 2174 2175 head = tail = prange; 2176 if (start > prange->start) 2177 svm_range_split(prange, prange->start, start - 1, &tail); 2178 if (last < tail->last) 2179 svm_range_split(tail, last + 1, tail->last, &head); 2180 2181 if (head != prange && tail != prange) { 2182 svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); 2183 svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE); 2184 } else if (tail != prange) { 2185 svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE); 2186 } else if (head != prange) { 2187 svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); 2188 } else if (parent 
!= prange) { 2189 prange->work_item.op = SVM_OP_UNMAP_RANGE; 2190 } 2191 } 2192 2193 static void 2194 svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange, 2195 unsigned long start, unsigned long last) 2196 { 2197 struct svm_range_list *svms; 2198 struct svm_range *pchild; 2199 struct kfd_process *p; 2200 unsigned long s, l; 2201 bool unmap_parent; 2202 2203 p = kfd_lookup_process_by_mm(mm); 2204 if (!p) 2205 return; 2206 svms = &p->svms; 2207 2208 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms, 2209 prange, prange->start, prange->last, start, last); 2210 2211 /* Make sure pending page faults are drained in the deferred worker 2212 * before the range is freed to avoid straggler interrupts on 2213 * unmapped memory causing "phantom faults". 2214 */ 2215 atomic_inc(&svms->drain_pagefaults); 2216 2217 unmap_parent = start <= prange->start && last >= prange->last; 2218 2219 list_for_each_entry(pchild, &prange->child_list, child_list) { 2220 mutex_lock_nested(&pchild->lock, 1); 2221 s = max(start, pchild->start); 2222 l = min(last, pchild->last); 2223 if (l >= s) 2224 svm_range_unmap_from_gpus(pchild, s, l); 2225 svm_range_unmap_split(mm, prange, pchild, start, last); 2226 mutex_unlock(&pchild->lock); 2227 } 2228 s = max(start, prange->start); 2229 l = min(last, prange->last); 2230 if (l >= s) 2231 svm_range_unmap_from_gpus(prange, s, l); 2232 svm_range_unmap_split(mm, prange, prange, start, last); 2233 2234 if (unmap_parent) 2235 svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE); 2236 else 2237 svm_range_add_list_work(svms, prange, mm, 2238 SVM_OP_UPDATE_RANGE_NOTIFIER); 2239 schedule_deferred_list_work(svms); 2240 2241 kfd_unref_process(p); 2242 } 2243 2244 /** 2245 * svm_range_cpu_invalidate_pagetables - interval notifier callback 2246 * @mni: mmu_interval_notifier struct 2247 * @range: mmu_notifier_range struct 2248 * @cur_seq: value to pass to mmu_interval_set_seq() 2249 * 2250 * If event is MMU_NOTIFY_UNMAP, this is from CPU unmap range, otherwise, it 2251 * is from migration, or CPU page invalidation callback. 2252 * 2253 * For unmap event, unmap range from GPUs, remove prange from svms in a delayed 2254 * work thread, and split prange if only part of prange is unmapped. 2255 * 2256 * For invalidation event, if GPU retry fault is not enabled, evict the queues, 2257 * then schedule svm_range_restore_work to update GPU mapping and resume queues. 2258 * If GPU retry fault is enabled, unmap the svm range from GPU, retry fault will 2259 * update GPU mapping to recover. 
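 *
 * A rough sketch of how a reader pairs with the sequence number set by this
 * callback (standard mmu_interval_notifier usage; the actual pairing in this
 * driver lives in the validate/map path and is not shown here):
 *
 *   seq = mmu_interval_read_begin(&prange->notifier);
 *   // ... collect pages and build the GPU mapping ...
 *   if (mmu_interval_read_retry(&prange->notifier, seq))
 *       goto retry;    // raced with an invalidation, start over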
2260 * 2261 * Context: mmap lock, notifier_invalidate_start lock are held 2262 * for invalidate event, prange lock is held if this is from migration 2263 */ 2264 static bool 2265 svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, 2266 const struct mmu_notifier_range *range, 2267 unsigned long cur_seq) 2268 { 2269 struct svm_range *prange; 2270 unsigned long start; 2271 unsigned long last; 2272 2273 if (range->event == MMU_NOTIFY_RELEASE) 2274 return true; 2275 2276 start = mni->interval_tree.start; 2277 last = mni->interval_tree.last; 2278 start = max(start, range->start) >> PAGE_SHIFT; 2279 last = min(last, range->end - 1) >> PAGE_SHIFT; 2280 pr_debug("[0x%lx 0x%lx] range[0x%lx 0x%lx] notifier[0x%lx 0x%lx] %d\n", 2281 start, last, range->start >> PAGE_SHIFT, 2282 (range->end - 1) >> PAGE_SHIFT, 2283 mni->interval_tree.start >> PAGE_SHIFT, 2284 mni->interval_tree.last >> PAGE_SHIFT, range->event); 2285 2286 prange = container_of(mni, struct svm_range, notifier); 2287 2288 svm_range_lock(prange); 2289 mmu_interval_set_seq(mni, cur_seq); 2290 2291 switch (range->event) { 2292 case MMU_NOTIFY_UNMAP: 2293 svm_range_unmap_from_cpu(mni->mm, prange, start, last); 2294 break; 2295 default: 2296 svm_range_evict(prange, mni->mm, start, last); 2297 break; 2298 } 2299 2300 svm_range_unlock(prange); 2301 2302 return true; 2303 } 2304 2305 /** 2306 * svm_range_from_addr - find svm range from fault address 2307 * @svms: svm range list header 2308 * @addr: address to search range interval tree, in pages 2309 * @parent: parent range if range is on child list 2310 * 2311 * Context: The caller must hold svms->lock 2312 * 2313 * Return: the svm_range found or NULL 2314 */ 2315 struct svm_range * 2316 svm_range_from_addr(struct svm_range_list *svms, unsigned long addr, 2317 struct svm_range **parent) 2318 { 2319 struct interval_tree_node *node; 2320 struct svm_range *prange; 2321 struct svm_range *pchild; 2322 2323 node = interval_tree_iter_first(&svms->objects, addr, addr); 2324 if (!node) 2325 return NULL; 2326 2327 prange = container_of(node, struct svm_range, it_node); 2328 pr_debug("address 0x%lx prange [0x%lx 0x%lx] node [0x%lx 0x%lx]\n", 2329 addr, prange->start, prange->last, node->start, node->last); 2330 2331 if (addr >= prange->start && addr <= prange->last) { 2332 if (parent) 2333 *parent = prange; 2334 return prange; 2335 } 2336 list_for_each_entry(pchild, &prange->child_list, child_list) 2337 if (addr >= pchild->start && addr <= pchild->last) { 2338 pr_debug("found address 0x%lx pchild [0x%lx 0x%lx]\n", 2339 addr, pchild->start, pchild->last); 2340 if (parent) 2341 *parent = prange; 2342 return pchild; 2343 } 2344 2345 return NULL; 2346 } 2347 2348 /* svm_range_best_restore_location - decide the best fault restore location 2349 * @prange: svm range structure 2350 * @adev: the GPU on which vm fault happened 2351 * 2352 * This is only called when xnack is on, to decide the best location to restore 2353 * the range mapping after GPU vm fault. Caller uses the best location to do 2354 * migration if actual loc is not best location, then update GPU page table 2355 * mapping to the best location. 2356 * 2357 * If the preferred loc is accessible by faulting GPU, use preferred loc. 
2358 * If vm fault gpu idx is on range ACCESSIBLE bitmap, best_loc is vm fault gpu 2359 * If vm fault gpu idx is on range ACCESSIBLE_IN_PLACE bitmap, then 2360 * if range actual loc is cpu, best_loc is cpu 2361 * if vm fault gpu is on xgmi same hive of range actual loc gpu, best_loc is 2362 * range actual loc. 2363 * Otherwise, GPU no access, best_loc is -1. 2364 * 2365 * Return: 2366 * -1 means vm fault GPU no access 2367 * 0 for CPU or GPU id 2368 */ 2369 static int32_t 2370 svm_range_best_restore_location(struct svm_range *prange, 2371 struct amdgpu_device *adev, 2372 int32_t *gpuidx) 2373 { 2374 struct amdgpu_device *bo_adev, *preferred_adev; 2375 struct kfd_process *p; 2376 uint32_t gpuid; 2377 int r; 2378 2379 p = container_of(prange->svms, struct kfd_process, svms); 2380 2381 r = kfd_process_gpuid_from_adev(p, adev, &gpuid, gpuidx); 2382 if (r < 0) { 2383 pr_debug("failed to get gpuid from kgd\n"); 2384 return -1; 2385 } 2386 2387 if (prange->preferred_loc == gpuid || 2388 prange->preferred_loc == KFD_IOCTL_SVM_LOCATION_SYSMEM) { 2389 return prange->preferred_loc; 2390 } else if (prange->preferred_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED) { 2391 preferred_adev = svm_range_get_adev_by_id(prange, 2392 prange->preferred_loc); 2393 if (amdgpu_xgmi_same_hive(adev, preferred_adev)) 2394 return prange->preferred_loc; 2395 /* fall through */ 2396 } 2397 2398 if (test_bit(*gpuidx, prange->bitmap_access)) 2399 return gpuid; 2400 2401 if (test_bit(*gpuidx, prange->bitmap_aip)) { 2402 if (!prange->actual_loc) 2403 return 0; 2404 2405 bo_adev = svm_range_get_adev_by_id(prange, prange->actual_loc); 2406 if (amdgpu_xgmi_same_hive(adev, bo_adev)) 2407 return prange->actual_loc; 2408 else 2409 return 0; 2410 } 2411 2412 return -1; 2413 } 2414 2415 static int 2416 svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr, 2417 unsigned long *start, unsigned long *last, 2418 bool *is_heap_stack) 2419 { 2420 struct vm_area_struct *vma; 2421 struct interval_tree_node *node; 2422 unsigned long start_limit, end_limit; 2423 2424 vma = find_vma(p->mm, addr << PAGE_SHIFT); 2425 if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) { 2426 pr_debug("VMA does not exist in address [0x%llx]\n", addr); 2427 return -EFAULT; 2428 } 2429 2430 *is_heap_stack = (vma->vm_start <= vma->vm_mm->brk && 2431 vma->vm_end >= vma->vm_mm->start_brk) || 2432 (vma->vm_start <= vma->vm_mm->start_stack && 2433 vma->vm_end >= vma->vm_mm->start_stack); 2434 2435 start_limit = max(vma->vm_start >> PAGE_SHIFT, 2436 (unsigned long)ALIGN_DOWN(addr, 2UL << 8)); 2437 end_limit = min(vma->vm_end >> PAGE_SHIFT, 2438 (unsigned long)ALIGN(addr + 1, 2UL << 8)); 2439 /* First range that starts after the fault address */ 2440 node = interval_tree_iter_first(&p->svms.objects, addr + 1, ULONG_MAX); 2441 if (node) { 2442 end_limit = min(end_limit, node->start); 2443 /* Last range that ends before the fault address */ 2444 node = container_of(rb_prev(&node->rb), 2445 struct interval_tree_node, rb); 2446 } else { 2447 /* Last range must end before addr because 2448 * there was no range after addr 2449 */ 2450 node = container_of(rb_last(&p->svms.objects.rb_root), 2451 struct interval_tree_node, rb); 2452 } 2453 if (node) { 2454 if (node->last >= addr) { 2455 WARN(1, "Overlap with prev node and page fault addr\n"); 2456 return -EFAULT; 2457 } 2458 start_limit = max(start_limit, node->last + 1); 2459 } 2460 2461 *start = start_limit; 2462 *last = end_limit - 1; 2463 2464 pr_debug("vma [0x%lx 0x%lx] range [0x%lx 0x%lx] is_heap_stack %d\n", 2465 
vma->vm_start >> PAGE_SHIFT, vma->vm_end >> PAGE_SHIFT, 2466 *start, *last, *is_heap_stack); 2467 2468 return 0; 2469 } 2470 2471 static int 2472 svm_range_check_vm_userptr(struct kfd_process *p, uint64_t start, uint64_t last, 2473 uint64_t *bo_s, uint64_t *bo_l) 2474 { 2475 struct amdgpu_bo_va_mapping *mapping; 2476 struct interval_tree_node *node; 2477 struct amdgpu_bo *bo = NULL; 2478 unsigned long userptr; 2479 uint32_t i; 2480 int r; 2481 2482 for (i = 0; i < p->n_pdds; i++) { 2483 struct amdgpu_vm *vm; 2484 2485 if (!p->pdds[i]->drm_priv) 2486 continue; 2487 2488 vm = drm_priv_to_vm(p->pdds[i]->drm_priv); 2489 r = amdgpu_bo_reserve(vm->root.bo, false); 2490 if (r) 2491 return r; 2492 2493 /* Check userptr by searching entire vm->va interval tree */ 2494 node = interval_tree_iter_first(&vm->va, 0, ~0ULL); 2495 while (node) { 2496 mapping = container_of((struct rb_node *)node, 2497 struct amdgpu_bo_va_mapping, rb); 2498 bo = mapping->bo_va->base.bo; 2499 2500 if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, 2501 start << PAGE_SHIFT, 2502 last << PAGE_SHIFT, 2503 &userptr)) { 2504 node = interval_tree_iter_next(node, 0, ~0ULL); 2505 continue; 2506 } 2507 2508 pr_debug("[0x%llx 0x%llx] already userptr mapped\n", 2509 start, last); 2510 if (bo_s && bo_l) { 2511 *bo_s = userptr >> PAGE_SHIFT; 2512 *bo_l = *bo_s + bo->tbo.ttm->num_pages - 1; 2513 } 2514 amdgpu_bo_unreserve(vm->root.bo); 2515 return -EADDRINUSE; 2516 } 2517 amdgpu_bo_unreserve(vm->root.bo); 2518 } 2519 return 0; 2520 } 2521 2522 static struct 2523 svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev, 2524 struct kfd_process *p, 2525 struct mm_struct *mm, 2526 int64_t addr) 2527 { 2528 struct svm_range *prange = NULL; 2529 unsigned long start, last; 2530 uint32_t gpuid, gpuidx; 2531 bool is_heap_stack; 2532 uint64_t bo_s = 0; 2533 uint64_t bo_l = 0; 2534 int r; 2535 2536 if (svm_range_get_range_boundaries(p, addr, &start, &last, 2537 &is_heap_stack)) 2538 return NULL; 2539 2540 r = svm_range_check_vm(p, start, last, &bo_s, &bo_l); 2541 if (r != -EADDRINUSE) 2542 r = svm_range_check_vm_userptr(p, start, last, &bo_s, &bo_l); 2543 2544 if (r == -EADDRINUSE) { 2545 if (addr >= bo_s && addr <= bo_l) 2546 return NULL; 2547 2548 /* Create one page svm range if 2MB range overlapping */ 2549 start = addr; 2550 last = addr; 2551 } 2552 2553 prange = svm_range_new(&p->svms, start, last); 2554 if (!prange) { 2555 pr_debug("Failed to create prange in address [0x%llx]\n", addr); 2556 return NULL; 2557 } 2558 if (kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpuidx)) { 2559 pr_debug("failed to get gpuid from kgd\n"); 2560 svm_range_free(prange); 2561 return NULL; 2562 } 2563 2564 if (is_heap_stack) 2565 prange->preferred_loc = KFD_IOCTL_SVM_LOCATION_SYSMEM; 2566 2567 svm_range_add_to_svms(prange); 2568 svm_range_add_notifier_locked(mm, prange); 2569 2570 return prange; 2571 } 2572 2573 /* svm_range_skip_recover - decide if prange can be recovered 2574 * @prange: svm range structure 2575 * 2576 * GPU vm retry fault handle skip recover the range for cases: 2577 * 1. prange is on deferred list to be removed after unmap, it is stale fault, 2578 * deferred list work will drain the stale fault before free the prange. 2579 * 2. prange is on deferred list to add interval notifier after split, or 2580 * 3. prange is child range, it is split from parent prange, recover later 2581 * after interval notifier is added. 
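 *
 * The code below recognises these cases from prange->work_item.op
 * (SVM_OP_UNMAP_RANGE for case 1, SVM_OP_ADD_RANGE or
 * SVM_OP_ADD_RANGE_AND_MAP for cases 2 and 3) after a quick exit when the
 * range is neither on the deferred list nor linked into a child list.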
2582 * 2583 * Return: true to skip recover, false to recover 2584 */ 2585 static bool svm_range_skip_recover(struct svm_range *prange) 2586 { 2587 struct svm_range_list *svms = prange->svms; 2588 2589 spin_lock(&svms->deferred_list_lock); 2590 if (list_empty(&prange->deferred_list) && 2591 list_empty(&prange->child_list)) { 2592 spin_unlock(&svms->deferred_list_lock); 2593 return false; 2594 } 2595 spin_unlock(&svms->deferred_list_lock); 2596 2597 if (prange->work_item.op == SVM_OP_UNMAP_RANGE) { 2598 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] unmapped\n", 2599 svms, prange, prange->start, prange->last); 2600 return true; 2601 } 2602 if (prange->work_item.op == SVM_OP_ADD_RANGE_AND_MAP || 2603 prange->work_item.op == SVM_OP_ADD_RANGE) { 2604 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not added yet\n", 2605 svms, prange, prange->start, prange->last); 2606 return true; 2607 } 2608 return false; 2609 } 2610 2611 static void 2612 svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p, 2613 int32_t gpuidx) 2614 { 2615 struct kfd_process_device *pdd; 2616 2617 /* fault is on different page of same range 2618 * or fault is skipped to recover later 2619 * or fault is on invalid virtual address 2620 */ 2621 if (gpuidx == MAX_GPU_INSTANCE) { 2622 uint32_t gpuid; 2623 int r; 2624 2625 r = kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpuidx); 2626 if (r < 0) 2627 return; 2628 } 2629 2630 /* fault is recovered 2631 * or fault cannot recover because GPU no access on the range 2632 */ 2633 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 2634 if (pdd) 2635 WRITE_ONCE(pdd->faults, pdd->faults + 1); 2636 } 2637 2638 static bool 2639 svm_fault_allowed(struct vm_area_struct *vma, bool write_fault) 2640 { 2641 unsigned long requested = VM_READ; 2642 2643 if (write_fault) 2644 requested |= VM_WRITE; 2645 2646 pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested, 2647 vma->vm_flags); 2648 return (vma->vm_flags & requested) == requested; 2649 } 2650 2651 int 2652 svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, 2653 uint64_t addr, bool write_fault) 2654 { 2655 struct mm_struct *mm = NULL; 2656 struct svm_range_list *svms; 2657 struct svm_range *prange; 2658 struct kfd_process *p; 2659 uint64_t timestamp; 2660 int32_t best_loc; 2661 int32_t gpuidx = MAX_GPU_INSTANCE; 2662 bool write_locked = false; 2663 struct vm_area_struct *vma; 2664 int r = 0; 2665 2666 if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) { 2667 pr_debug("device does not support SVM\n"); 2668 return -EFAULT; 2669 } 2670 2671 p = kfd_lookup_process_by_pasid(pasid); 2672 if (!p) { 2673 pr_debug("kfd process not founded pasid 0x%x\n", pasid); 2674 return 0; 2675 } 2676 if (!p->xnack_enabled) { 2677 pr_debug("XNACK not enabled for pasid 0x%x\n", pasid); 2678 r = -EFAULT; 2679 goto out; 2680 } 2681 svms = &p->svms; 2682 2683 pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr); 2684 2685 if (atomic_read(&svms->drain_pagefaults)) { 2686 pr_debug("draining retry fault, drop fault 0x%llx\n", addr); 2687 r = 0; 2688 goto out; 2689 } 2690 2691 /* p->lead_thread is available as kfd_process_wq_release flush the work 2692 * before releasing task ref. 
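 * get_task_mm() pins the mm, so every exit path after this point must
 * reach the mmput() near the end of this function.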
2693 */ 2694 mm = get_task_mm(p->lead_thread); 2695 if (!mm) { 2696 pr_debug("svms 0x%p failed to get mm\n", svms); 2697 r = 0; 2698 goto out; 2699 } 2700 2701 mmap_read_lock(mm); 2702 retry_write_locked: 2703 mutex_lock(&svms->lock); 2704 prange = svm_range_from_addr(svms, addr, NULL); 2705 if (!prange) { 2706 pr_debug("failed to find prange svms 0x%p address [0x%llx]\n", 2707 svms, addr); 2708 if (!write_locked) { 2709 /* Need the write lock to create new range with MMU notifier. 2710 * Also flush pending deferred work to make sure the interval 2711 * tree is up to date before we add a new range 2712 */ 2713 mutex_unlock(&svms->lock); 2714 mmap_read_unlock(mm); 2715 mmap_write_lock(mm); 2716 write_locked = true; 2717 goto retry_write_locked; 2718 } 2719 prange = svm_range_create_unregistered_range(adev, p, mm, addr); 2720 if (!prange) { 2721 pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n", 2722 svms, addr); 2723 mmap_write_downgrade(mm); 2724 r = -EFAULT; 2725 goto out_unlock_svms; 2726 } 2727 } 2728 if (write_locked) 2729 mmap_write_downgrade(mm); 2730 2731 mutex_lock(&prange->migrate_mutex); 2732 2733 if (svm_range_skip_recover(prange)) { 2734 amdgpu_gmc_filter_faults_remove(adev, addr, pasid); 2735 r = 0; 2736 goto out_unlock_range; 2737 } 2738 2739 timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp; 2740 /* skip duplicate vm fault on different pages of same range */ 2741 if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) { 2742 pr_debug("svms 0x%p [0x%lx %lx] already restored\n", 2743 svms, prange->start, prange->last); 2744 r = 0; 2745 goto out_unlock_range; 2746 } 2747 2748 /* __do_munmap removed VMA, return success as we are handling stale 2749 * retry fault. 2750 */ 2751 vma = find_vma(mm, addr << PAGE_SHIFT); 2752 if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) { 2753 pr_debug("address 0x%llx VMA is removed\n", addr); 2754 r = 0; 2755 goto out_unlock_range; 2756 } 2757 2758 if (!svm_fault_allowed(vma, write_fault)) { 2759 pr_debug("fault addr 0x%llx no %s permission\n", addr, 2760 write_fault ? 
"write" : "read"); 2761 r = -EPERM; 2762 goto out_unlock_range; 2763 } 2764 2765 best_loc = svm_range_best_restore_location(prange, adev, &gpuidx); 2766 if (best_loc == -1) { 2767 pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n", 2768 svms, prange->start, prange->last); 2769 r = -EACCES; 2770 goto out_unlock_range; 2771 } 2772 2773 pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n", 2774 svms, prange->start, prange->last, best_loc, 2775 prange->actual_loc); 2776 2777 if (prange->actual_loc != best_loc) { 2778 if (best_loc) { 2779 r = svm_migrate_to_vram(prange, best_loc, mm); 2780 if (r) { 2781 pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n", 2782 r, addr); 2783 /* Fallback to system memory if migration to 2784 * VRAM failed 2785 */ 2786 if (prange->actual_loc) 2787 r = svm_migrate_vram_to_ram(prange, mm); 2788 else 2789 r = 0; 2790 } 2791 } else { 2792 r = svm_migrate_vram_to_ram(prange, mm); 2793 } 2794 if (r) { 2795 pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n", 2796 r, svms, prange->start, prange->last); 2797 goto out_unlock_range; 2798 } 2799 } 2800 2801 r = svm_range_validate_and_map(mm, prange, gpuidx, false, false); 2802 if (r) 2803 pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n", 2804 r, svms, prange->start, prange->last); 2805 2806 out_unlock_range: 2807 mutex_unlock(&prange->migrate_mutex); 2808 out_unlock_svms: 2809 mutex_unlock(&svms->lock); 2810 mmap_read_unlock(mm); 2811 2812 svm_range_count_fault(adev, p, gpuidx); 2813 2814 mmput(mm); 2815 out: 2816 kfd_unref_process(p); 2817 2818 if (r == -EAGAIN) { 2819 pr_debug("recover vm fault later\n"); 2820 amdgpu_gmc_filter_faults_remove(adev, addr, pasid); 2821 r = 0; 2822 } 2823 return r; 2824 } 2825 2826 void svm_range_list_fini(struct kfd_process *p) 2827 { 2828 struct svm_range *prange; 2829 struct svm_range *next; 2830 2831 pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms); 2832 2833 /* Ensure list work is finished before process is destroyed */ 2834 flush_work(&p->svms.deferred_list_work); 2835 2836 /* 2837 * Ensure no retry fault comes in afterwards, as page fault handler will 2838 * not find kfd process and take mm lock to recover fault. 
2839 */ 2840 atomic_inc(&p->svms.drain_pagefaults); 2841 svm_range_drain_retry_fault(&p->svms); 2842 2843 2844 list_for_each_entry_safe(prange, next, &p->svms.list, list) { 2845 svm_range_unlink(prange); 2846 svm_range_remove_notifier(prange); 2847 svm_range_free(prange); 2848 } 2849 2850 mutex_destroy(&p->svms.lock); 2851 2852 pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms); 2853 } 2854 2855 int svm_range_list_init(struct kfd_process *p) 2856 { 2857 struct svm_range_list *svms = &p->svms; 2858 int i; 2859 2860 svms->objects = RB_ROOT_CACHED; 2861 mutex_init(&svms->lock); 2862 INIT_LIST_HEAD(&svms->list); 2863 atomic_set(&svms->evicted_ranges, 0); 2864 atomic_set(&svms->drain_pagefaults, 0); 2865 INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work); 2866 INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work); 2867 INIT_LIST_HEAD(&svms->deferred_range_list); 2868 spin_lock_init(&svms->deferred_list_lock); 2869 2870 for (i = 0; i < p->n_pdds; i++) 2871 if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev)) 2872 bitmap_set(svms->bitmap_supported, i, 1); 2873 2874 return 0; 2875 } 2876 2877 /** 2878 * svm_range_check_vm - check if virtual address range mapped already 2879 * @p: current kfd_process 2880 * @start: range start address, in pages 2881 * @last: range last address, in pages 2882 * @bo_s: mapping start address in pages if address range already mapped 2883 * @bo_l: mapping last address in pages if address range already mapped 2884 * 2885 * The purpose is to avoid virtual address ranges already allocated by 2886 * kfd_ioctl_alloc_memory_of_gpu ioctl. 2887 * It looks for each pdd in the kfd_process. 2888 * 2889 * Context: Process context 2890 * 2891 * Return 0 - OK, if the range is not mapped. 2892 * Otherwise error code: 2893 * -EADDRINUSE - if address is mapped already by kfd_ioctl_alloc_memory_of_gpu 2894 * -ERESTARTSYS - A wait for the buffer to become unreserved was interrupted by 2895 * a signal. Release all buffer reservations and return to user-space. 
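 *
 * A typical caller (sketch, loosely following
 * svm_range_create_unregistered_range() above) uses -EADDRINUSE to shrink
 * or reject a new SVM range that would overlap an existing GPU mapping:
 *
 *   r = svm_range_check_vm(p, start, last, &bo_s, &bo_l);
 *   if (r == -EADDRINUSE) {
 *       // overlap: shrink to a single page or reject the request
 *   } else if (r) {
 *       // reservation failed or was interrupted, propagate the error
 *   }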
2896 */ 2897 static int 2898 svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last, 2899 uint64_t *bo_s, uint64_t *bo_l) 2900 { 2901 struct amdgpu_bo_va_mapping *mapping; 2902 struct interval_tree_node *node; 2903 uint32_t i; 2904 int r; 2905 2906 for (i = 0; i < p->n_pdds; i++) { 2907 struct amdgpu_vm *vm; 2908 2909 if (!p->pdds[i]->drm_priv) 2910 continue; 2911 2912 vm = drm_priv_to_vm(p->pdds[i]->drm_priv); 2913 r = amdgpu_bo_reserve(vm->root.bo, false); 2914 if (r) 2915 return r; 2916 2917 node = interval_tree_iter_first(&vm->va, start, last); 2918 if (node) { 2919 pr_debug("range [0x%llx 0x%llx] already TTM mapped\n", 2920 start, last); 2921 mapping = container_of((struct rb_node *)node, 2922 struct amdgpu_bo_va_mapping, rb); 2923 if (bo_s && bo_l) { 2924 *bo_s = mapping->start; 2925 *bo_l = mapping->last; 2926 } 2927 amdgpu_bo_unreserve(vm->root.bo); 2928 return -EADDRINUSE; 2929 } 2930 amdgpu_bo_unreserve(vm->root.bo); 2931 } 2932 2933 return 0; 2934 } 2935 2936 /** 2937 * svm_range_is_valid - check if virtual address range is valid 2938 * @p: current kfd_process 2939 * @start: range start address, in pages 2940 * @size: range size, in pages 2941 * 2942 * Valid virtual address range means it belongs to one or more VMAs 2943 * 2944 * Context: Process context 2945 * 2946 * Return: 2947 * 0 - OK, otherwise error code 2948 */ 2949 static int 2950 svm_range_is_valid(struct kfd_process *p, uint64_t start, uint64_t size) 2951 { 2952 const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; 2953 struct vm_area_struct *vma; 2954 unsigned long end; 2955 unsigned long start_unchg = start; 2956 2957 start <<= PAGE_SHIFT; 2958 end = start + (size << PAGE_SHIFT); 2959 do { 2960 vma = find_vma(p->mm, start); 2961 if (!vma || start < vma->vm_start || 2962 (vma->vm_flags & device_vma)) 2963 return -EFAULT; 2964 start = min(end, vma->vm_end); 2965 } while (start < end); 2966 2967 return svm_range_check_vm(p, start_unchg, (end - 1) >> PAGE_SHIFT, NULL, 2968 NULL); 2969 } 2970 2971 /** 2972 * svm_range_best_prefetch_location - decide the best prefetch location 2973 * @prange: svm range structure 2974 * 2975 * For xnack off: 2976 * If range map to single GPU, the best prefetch location is prefetch_loc, which 2977 * can be CPU or GPU. 2978 * 2979 * If range is ACCESS or ACCESS_IN_PLACE by mGPUs, only if mGPU connection on 2980 * XGMI same hive, the best prefetch location is prefetch_loc GPU, othervise 2981 * the best prefetch location is always CPU, because GPU can not have coherent 2982 * mapping VRAM of other GPUs even with large-BAR PCIe connection. 2983 * 2984 * For xnack on: 2985 * If range is not ACCESS_IN_PLACE by mGPUs, the best prefetch location is 2986 * prefetch_loc, other GPU access will generate vm fault and trigger migration. 2987 * 2988 * If range is ACCESS_IN_PLACE by mGPUs, only if mGPU connection on XGMI same 2989 * hive, the best prefetch location is prefetch_loc GPU, otherwise the best 2990 * prefetch location is always CPU. 
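 *
 * Condensed (P = the prefetch_loc GPU, "other GPU" = any other GPU that has
 * the range in its access or access-in-place bitmap):
 *   xnack off: P, unless some other GPU is not on P's XGMI hive -> CPU (0)
 *   xnack on:  P, unless some other ACCESS_IN_PLACE GPU is not on P's XGMI
 *              hive -> CPU (0)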
2991 * 2992 * Context: Process context 2993 * 2994 * Return: 2995 * 0 for CPU or GPU id 2996 */ 2997 static uint32_t 2998 svm_range_best_prefetch_location(struct svm_range *prange) 2999 { 3000 DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE); 3001 uint32_t best_loc = prange->prefetch_loc; 3002 struct kfd_process_device *pdd; 3003 struct amdgpu_device *bo_adev; 3004 struct kfd_process *p; 3005 uint32_t gpuidx; 3006 3007 p = container_of(prange->svms, struct kfd_process, svms); 3008 3009 if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED) 3010 goto out; 3011 3012 bo_adev = svm_range_get_adev_by_id(prange, best_loc); 3013 if (!bo_adev) { 3014 WARN_ONCE(1, "failed to get device by id 0x%x\n", best_loc); 3015 best_loc = 0; 3016 goto out; 3017 } 3018 3019 if (p->xnack_enabled) 3020 bitmap_copy(bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE); 3021 else 3022 bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip, 3023 MAX_GPU_INSTANCE); 3024 3025 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { 3026 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 3027 if (!pdd) { 3028 pr_debug("failed to get device by idx 0x%x\n", gpuidx); 3029 continue; 3030 } 3031 3032 if (pdd->dev->adev == bo_adev) 3033 continue; 3034 3035 if (!amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) { 3036 best_loc = 0; 3037 break; 3038 } 3039 } 3040 3041 out: 3042 pr_debug("xnack %d svms 0x%p [0x%lx 0x%lx] best loc 0x%x\n", 3043 p->xnack_enabled, &p->svms, prange->start, prange->last, 3044 best_loc); 3045 3046 return best_loc; 3047 } 3048 3049 /* FIXME: This is a workaround for page locking bug when some pages are 3050 * invalid during migration to VRAM 3051 */ 3052 void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm, 3053 void *owner) 3054 { 3055 struct hmm_range *hmm_range; 3056 int r; 3057 3058 if (prange->validated_once) 3059 return; 3060 3061 r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL, 3062 prange->start << PAGE_SHIFT, 3063 prange->npages, &hmm_range, 3064 false, true, owner); 3065 if (!r) { 3066 amdgpu_hmm_range_get_pages_done(hmm_range); 3067 prange->validated_once = true; 3068 } 3069 } 3070 3071 /* svm_range_trigger_migration - start page migration if prefetch loc changed 3072 * @mm: current process mm_struct 3073 * @prange: svm range structure 3074 * @migrated: output, true if migration is triggered 3075 * 3076 * If range perfetch_loc is GPU, actual loc is cpu 0, then migrate the range 3077 * from ram to vram. 3078 * If range prefetch_loc is cpu 0, actual loc is GPU, then migrate the range 3079 * from vram to ram. 3080 * 3081 * If GPU vm fault retry is not enabled, migration interact with MMU notifier 3082 * and restore work: 3083 * 1. migrate_vma_setup invalidate pages, MMU notifier callback svm_range_evict 3084 * stops all queues, schedule restore work 3085 * 2. svm_range_restore_work wait for migration is done by 3086 * a. svm_range_validate_vram takes prange->migrate_mutex 3087 * b. svm_range_validate_ram HMM get pages wait for CPU fault handle returns 3088 * 3. restore work update mappings of GPU, resume all queues. 
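 *
 * Caller pattern, roughly as used by svm_range_set_attr() below:
 *
 *   mutex_lock(&prange->migrate_mutex);
 *   r = svm_range_trigger_migration(mm, prange, &migrated);
 *   if (!r && (!migrated || p->xnack_enabled))
 *       r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
 *                                      true, true);
 *   mutex_unlock(&prange->migrate_mutex);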
3089 * 3090 * Context: Process context 3091 * 3092 * Return: 3093 * 0 - OK, otherwise - error code of migration 3094 */ 3095 static int 3096 svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange, 3097 bool *migrated) 3098 { 3099 uint32_t best_loc; 3100 int r = 0; 3101 3102 *migrated = false; 3103 best_loc = svm_range_best_prefetch_location(prange); 3104 3105 if (best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED || 3106 best_loc == prange->actual_loc) 3107 return 0; 3108 3109 if (!best_loc) { 3110 r = svm_migrate_vram_to_ram(prange, mm); 3111 *migrated = !r; 3112 return r; 3113 } 3114 3115 r = svm_migrate_to_vram(prange, best_loc, mm); 3116 *migrated = !r; 3117 3118 return r; 3119 } 3120 3121 int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence) 3122 { 3123 if (!fence) 3124 return -EINVAL; 3125 3126 if (dma_fence_is_signaled(&fence->base)) 3127 return 0; 3128 3129 if (fence->svm_bo) { 3130 WRITE_ONCE(fence->svm_bo->evicting, 1); 3131 schedule_work(&fence->svm_bo->eviction_work); 3132 } 3133 3134 return 0; 3135 } 3136 3137 static void svm_range_evict_svm_bo_worker(struct work_struct *work) 3138 { 3139 struct svm_range_bo *svm_bo; 3140 struct kfd_process *p; 3141 struct mm_struct *mm; 3142 3143 svm_bo = container_of(work, struct svm_range_bo, eviction_work); 3144 if (!svm_bo_ref_unless_zero(svm_bo)) 3145 return; /* svm_bo was freed while eviction was pending */ 3146 3147 /* svm_range_bo_release destroys this worker thread. So during 3148 * the lifetime of this thread, kfd_process and mm will be valid. 3149 */ 3150 p = container_of(svm_bo->svms, struct kfd_process, svms); 3151 mm = p->mm; 3152 if (!mm) 3153 return; 3154 3155 mmap_read_lock(mm); 3156 spin_lock(&svm_bo->list_lock); 3157 while (!list_empty(&svm_bo->range_list)) { 3158 struct svm_range *prange = 3159 list_first_entry(&svm_bo->range_list, 3160 struct svm_range, svm_bo_list); 3161 int retries = 3; 3162 3163 list_del_init(&prange->svm_bo_list); 3164 spin_unlock(&svm_bo->list_lock); 3165 3166 pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, 3167 prange->start, prange->last); 3168 3169 mutex_lock(&prange->migrate_mutex); 3170 do { 3171 svm_migrate_vram_to_ram(prange, 3172 svm_bo->eviction_fence->mm); 3173 } while (prange->actual_loc && --retries); 3174 WARN(prange->actual_loc, "Migration failed during eviction"); 3175 3176 mutex_lock(&prange->lock); 3177 prange->svm_bo = NULL; 3178 mutex_unlock(&prange->lock); 3179 3180 mutex_unlock(&prange->migrate_mutex); 3181 3182 spin_lock(&svm_bo->list_lock); 3183 } 3184 spin_unlock(&svm_bo->list_lock); 3185 mmap_read_unlock(mm); 3186 3187 dma_fence_signal(&svm_bo->eviction_fence->base); 3188 /* This is the last reference to svm_bo, after svm_range_vram_node_free 3189 * has been called in svm_migrate_vram_to_ram 3190 */ 3191 WARN_ONCE(kref_read(&svm_bo->kref) != 1, "This was not the last reference\n"); 3192 svm_range_bo_unref(svm_bo); 3193 } 3194 3195 static int 3196 svm_range_set_attr(struct kfd_process *p, uint64_t start, uint64_t size, 3197 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 3198 { 3199 struct mm_struct *mm = current->mm; 3200 struct list_head update_list; 3201 struct list_head insert_list; 3202 struct list_head remove_list; 3203 struct svm_range_list *svms; 3204 struct svm_range *prange; 3205 struct svm_range *next; 3206 int r = 0; 3207 3208 pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n", 3209 p->pasid, &p->svms, start, start + size - 1, size); 3210 3211 r = svm_range_check_attr(p, nattr, attrs); 3212 if (r) 3213 return r; 
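/* Lock ordering for the update below, matching the unlocks at the end of
 * this function: flush deferred work and take the mmap write lock, then
 * svms->lock for the list/interval-tree transaction, then downgrade mmap
 * to read and take each prange->migrate_mutex for the migrate and
 * validate phase.
 */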
3214 3215 svms = &p->svms; 3216 3217 svm_range_list_lock_and_flush_work(svms, mm); 3218 3219 r = svm_range_is_valid(p, start, size); 3220 if (r) { 3221 pr_debug("invalid range r=%d\n", r); 3222 mmap_write_unlock(mm); 3223 goto out; 3224 } 3225 3226 mutex_lock(&svms->lock); 3227 3228 /* Add new range and split existing ranges as needed */ 3229 r = svm_range_add(p, start, size, nattr, attrs, &update_list, 3230 &insert_list, &remove_list); 3231 if (r) { 3232 mutex_unlock(&svms->lock); 3233 mmap_write_unlock(mm); 3234 goto out; 3235 } 3236 /* Apply changes as a transaction */ 3237 list_for_each_entry_safe(prange, next, &insert_list, list) { 3238 svm_range_add_to_svms(prange); 3239 svm_range_add_notifier_locked(mm, prange); 3240 } 3241 list_for_each_entry(prange, &update_list, update_list) { 3242 svm_range_apply_attrs(p, prange, nattr, attrs); 3243 /* TODO: unmap ranges from GPU that lost access */ 3244 } 3245 list_for_each_entry_safe(prange, next, &remove_list, update_list) { 3246 pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n", 3247 prange->svms, prange, prange->start, 3248 prange->last); 3249 svm_range_unlink(prange); 3250 svm_range_remove_notifier(prange); 3251 svm_range_free(prange); 3252 } 3253 3254 mmap_write_downgrade(mm); 3255 /* Trigger migrations and revalidate and map to GPUs as needed. If 3256 * this fails we may be left with partially completed actions. There 3257 * is no clean way of rolling back to the previous state in such a 3258 * case because the rollback wouldn't be guaranteed to work either. 3259 */ 3260 list_for_each_entry(prange, &update_list, update_list) { 3261 bool migrated; 3262 3263 mutex_lock(&prange->migrate_mutex); 3264 3265 r = svm_range_trigger_migration(mm, prange, &migrated); 3266 if (r) 3267 goto out_unlock_range; 3268 3269 if (migrated && !p->xnack_enabled) { 3270 pr_debug("restore_work will update mappings of GPUs\n"); 3271 mutex_unlock(&prange->migrate_mutex); 3272 continue; 3273 } 3274 3275 r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE, 3276 true, true); 3277 if (r) 3278 pr_debug("failed %d to map svm range\n", r); 3279 3280 out_unlock_range: 3281 mutex_unlock(&prange->migrate_mutex); 3282 if (r) 3283 break; 3284 } 3285 3286 svm_range_debug_dump(svms); 3287 3288 mutex_unlock(&svms->lock); 3289 mmap_read_unlock(mm); 3290 out: 3291 pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid, 3292 &p->svms, start, start + size - 1, r); 3293 3294 return r; 3295 } 3296 3297 static int 3298 svm_range_get_attr(struct kfd_process *p, uint64_t start, uint64_t size, 3299 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 3300 { 3301 DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE); 3302 DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE); 3303 bool get_preferred_loc = false; 3304 bool get_prefetch_loc = false; 3305 bool get_granularity = false; 3306 bool get_accessible = false; 3307 bool get_flags = false; 3308 uint64_t last = start + size - 1UL; 3309 struct mm_struct *mm = current->mm; 3310 uint8_t granularity = 0xff; 3311 struct interval_tree_node *node; 3312 struct svm_range_list *svms; 3313 struct svm_range *prange; 3314 uint32_t prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3315 uint32_t location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3316 uint32_t flags_and = 0xffffffff; 3317 uint32_t flags_or = 0; 3318 int gpuidx; 3319 uint32_t i; 3320 int r = 0; 3321 3322 pr_debug("svms 0x%p [0x%llx 0x%llx] nattr 0x%x\n", &p->svms, start, 3323 start + size - 1, nattr); 3324 3325 /* Flush pending deferred work to avoid racing with deferred 
actions from 3326 * previous memory map changes (e.g. munmap). Concurrent memory map changes 3327 * can still race with get_attr because we don't hold the mmap lock. But that 3328 * would be a race condition in the application anyway, and undefined 3329 * behaviour is acceptable in that case. 3330 */ 3331 flush_work(&p->svms.deferred_list_work); 3332 3333 mmap_read_lock(mm); 3334 r = svm_range_is_valid(p, start, size); 3335 mmap_read_unlock(mm); 3336 if (r) { 3337 pr_debug("invalid range r=%d\n", r); 3338 return r; 3339 } 3340 3341 for (i = 0; i < nattr; i++) { 3342 switch (attrs[i].type) { 3343 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 3344 get_preferred_loc = true; 3345 break; 3346 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 3347 get_prefetch_loc = true; 3348 break; 3349 case KFD_IOCTL_SVM_ATTR_ACCESS: 3350 get_accessible = true; 3351 break; 3352 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 3353 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 3354 get_flags = true; 3355 break; 3356 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 3357 get_granularity = true; 3358 break; 3359 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 3360 case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 3361 fallthrough; 3362 default: 3363 pr_debug("get invalid attr type 0x%x\n", attrs[i].type); 3364 return -EINVAL; 3365 } 3366 } 3367 3368 svms = &p->svms; 3369 3370 mutex_lock(&svms->lock); 3371 3372 node = interval_tree_iter_first(&svms->objects, start, last); 3373 if (!node) { 3374 pr_debug("range attrs not found return default values\n"); 3375 svm_range_set_default_attributes(&location, &prefetch_loc, 3376 &granularity, &flags_and); 3377 flags_or = flags_and; 3378 if (p->xnack_enabled) 3379 bitmap_copy(bitmap_access, svms->bitmap_supported, 3380 MAX_GPU_INSTANCE); 3381 else 3382 bitmap_zero(bitmap_access, MAX_GPU_INSTANCE); 3383 bitmap_zero(bitmap_aip, MAX_GPU_INSTANCE); 3384 goto fill_values; 3385 } 3386 bitmap_copy(bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE); 3387 bitmap_copy(bitmap_aip, svms->bitmap_supported, MAX_GPU_INSTANCE); 3388 3389 while (node) { 3390 struct interval_tree_node *next; 3391 3392 prange = container_of(node, struct svm_range, it_node); 3393 next = interval_tree_iter_next(node, start, last); 3394 3395 if (get_preferred_loc) { 3396 if (prange->preferred_loc == 3397 KFD_IOCTL_SVM_LOCATION_UNDEFINED || 3398 (location != KFD_IOCTL_SVM_LOCATION_UNDEFINED && 3399 location != prange->preferred_loc)) { 3400 location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3401 get_preferred_loc = false; 3402 } else { 3403 location = prange->preferred_loc; 3404 } 3405 } 3406 if (get_prefetch_loc) { 3407 if (prange->prefetch_loc == 3408 KFD_IOCTL_SVM_LOCATION_UNDEFINED || 3409 (prefetch_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED && 3410 prefetch_loc != prange->prefetch_loc)) { 3411 prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3412 get_prefetch_loc = false; 3413 } else { 3414 prefetch_loc = prange->prefetch_loc; 3415 } 3416 } 3417 if (get_accessible) { 3418 bitmap_and(bitmap_access, bitmap_access, 3419 prange->bitmap_access, MAX_GPU_INSTANCE); 3420 bitmap_and(bitmap_aip, bitmap_aip, 3421 prange->bitmap_aip, MAX_GPU_INSTANCE); 3422 } 3423 if (get_flags) { 3424 flags_and &= prange->flags; 3425 flags_or |= prange->flags; 3426 } 3427 3428 if (get_granularity && prange->granularity < granularity) 3429 granularity = prange->granularity; 3430 3431 node = next; 3432 } 3433 fill_values: 3434 mutex_unlock(&svms->lock); 3435 3436 for (i = 0; i < nattr; i++) { 3437 switch (attrs[i].type) { 3438 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 3439 attrs[i].value = location; 3440 break; 3441 
case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 3442 attrs[i].value = prefetch_loc; 3443 break; 3444 case KFD_IOCTL_SVM_ATTR_ACCESS: 3445 gpuidx = kfd_process_gpuidx_from_gpuid(p, 3446 attrs[i].value); 3447 if (gpuidx < 0) { 3448 pr_debug("invalid gpuid %x\n", attrs[i].value); 3449 return -EINVAL; 3450 } 3451 if (test_bit(gpuidx, bitmap_access)) 3452 attrs[i].type = KFD_IOCTL_SVM_ATTR_ACCESS; 3453 else if (test_bit(gpuidx, bitmap_aip)) 3454 attrs[i].type = 3455 KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE; 3456 else 3457 attrs[i].type = KFD_IOCTL_SVM_ATTR_NO_ACCESS; 3458 break; 3459 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 3460 attrs[i].value = flags_and; 3461 break; 3462 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 3463 attrs[i].value = ~flags_or; 3464 break; 3465 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 3466 attrs[i].value = (uint32_t)granularity; 3467 break; 3468 } 3469 } 3470 3471 return 0; 3472 } 3473 3474 int 3475 svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start, 3476 uint64_t size, uint32_t nattrs, struct kfd_ioctl_svm_attribute *attrs) 3477 { 3478 int r; 3479 3480 start >>= PAGE_SHIFT; 3481 size >>= PAGE_SHIFT; 3482 3483 switch (op) { 3484 case KFD_IOCTL_SVM_OP_SET_ATTR: 3485 r = svm_range_set_attr(p, start, size, nattrs, attrs); 3486 break; 3487 case KFD_IOCTL_SVM_OP_GET_ATTR: 3488 r = svm_range_get_attr(p, start, size, nattrs, attrs); 3489 break; 3490 default: 3491 r = -EINVAL; 3492 break; 3493 } 3494 3495 return r; 3496 } 3497
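
/*
 * Usage sketch for svm_ioctl(): it is called from the KFD ioctl layer
 * (kfd_ioctl_svm() in kfd_chardev.c). The snippet below is illustrative
 * only; the variable names are assumptions, not the real ioctl handler.
 * Note that start and size are byte values here and are converted to pages
 * (>> PAGE_SHIFT) before being handed to the set/get attribute helpers:
 *
 *   uint64_t start, size;                   // byte address and size from
 *                                           // the ioctl arguments
 *   uint32_t op, nattr;                     // KFD_IOCTL_SVM_OP_* and the
 *                                           // number of attributes
 *   struct kfd_ioctl_svm_attribute *attrs;  // copied in from user space
 *
 *   r = svm_ioctl(p, op, start, size, nattr, attrs);
 *   // For KFD_IOCTL_SVM_OP_GET_ATTR the attrs array now holds the merged
 *   // per-range values and is copied back to user space by the caller.
 */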