// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2020-2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include <linux/types.h>
#include <linux/sched/task.h>
#include "amdgpu_sync.h"
#include "amdgpu_object.h"
#include "amdgpu_vm.h"
#include "amdgpu_mn.h"
#include "amdgpu.h"
#include "amdgpu_xgmi.h"
#include "kfd_priv.h"
#include "kfd_svm.h"
#include "kfd_migrate.h"

#ifdef dev_fmt
#undef dev_fmt
#endif
#define dev_fmt(fmt) "kfd_svm: %s: " fmt, __func__

#define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1

/* Long enough to ensure no retry fault comes after svm range is restored and
 * page table is updated.
 */
#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING	2000

static void svm_range_evict_svm_bo_worker(struct work_struct *work);
static bool
svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
				    const struct mmu_notifier_range *range,
				    unsigned long cur_seq);
static int
svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
		   uint64_t *bo_s, uint64_t *bo_l);
static const struct mmu_interval_notifier_ops svm_range_mn_ops = {
	.invalidate = svm_range_cpu_invalidate_pagetables,
};

/**
 * svm_range_unlink - unlink svm_range from lists and interval tree
 * @prange: svm range structure to be removed
 *
 * Remove the svm_range from the svms and svm_bo lists and the svms
 * interval tree.
 *
 * Context: The caller must hold svms->lock
 */
static void svm_range_unlink(struct svm_range *prange)
{
	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
		 prange, prange->start, prange->last);

	if (prange->svm_bo) {
		spin_lock(&prange->svm_bo->list_lock);
		list_del(&prange->svm_bo_list);
		spin_unlock(&prange->svm_bo->list_lock);
	}

	list_del(&prange->list);
	if (prange->it_node.start != 0 && prange->it_node.last != 0)
		interval_tree_remove(&prange->it_node, &prange->svms->objects);
}

static void
svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange)
{
	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
		 prange, prange->start, prange->last);

	mmu_interval_notifier_insert_locked(&prange->notifier, mm,
					    prange->start << PAGE_SHIFT,
					    prange->npages << PAGE_SHIFT,
					    &svm_range_mn_ops);
}

/**
 * svm_range_add_to_svms - add svm range to svms
 * @prange: svm range structure to be added
 *
 * Add the svm range to svms interval tree and link list
 *
 * Context: The caller must hold svms->lock
 */
static void svm_range_add_to_svms(struct svm_range *prange)
{
	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
		 prange, prange->start, prange->last);

	list_add_tail(&prange->list, &prange->svms->list);
	prange->it_node.start = prange->start;
	prange->it_node.last = prange->last;
	interval_tree_insert(&prange->it_node, &prange->svms->objects);
}

static void svm_range_remove_notifier(struct svm_range *prange)
{
	pr_debug("remove notifier svms 0x%p prange 0x%p [0x%lx 0x%lx]\n",
		 prange->svms, prange,
		 prange->notifier.interval_tree.start >> PAGE_SHIFT,
		 prange->notifier.interval_tree.last >> PAGE_SHIFT);

	if (prange->notifier.interval_tree.start != 0 &&
	    prange->notifier.interval_tree.last != 0)
		mmu_interval_notifier_remove(&prange->notifier);
}

static bool
svm_is_valid_dma_mapping_addr(struct device *dev, dma_addr_t dma_addr)
{
	return dma_addr && !dma_mapping_error(dev, dma_addr) &&
	       !(dma_addr & SVM_RANGE_VRAM_DOMAIN);
}

static int
svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange,
		      unsigned long offset, unsigned long npages,
		      unsigned long *hmm_pfns, uint32_t gpuidx)
{
	enum dma_data_direction dir = DMA_BIDIRECTIONAL;
	dma_addr_t *addr = prange->dma_addr[gpuidx];
	struct device *dev = adev->dev;
	struct page *page;
	int i, r;

	if (!addr) {
		addr = kvmalloc_array(prange->npages, sizeof(*addr),
				      GFP_KERNEL | __GFP_ZERO);
		if (!addr)
			return -ENOMEM;
		prange->dma_addr[gpuidx] = addr;
	}

	addr += offset;
	for (i = 0; i < npages; i++) {
		if (svm_is_valid_dma_mapping_addr(dev, addr[i]))
			dma_unmap_page(dev, addr[i], PAGE_SIZE, dir);

		page = hmm_pfn_to_page(hmm_pfns[i]);
		if (is_zone_device_page(page)) {
			struct amdgpu_device *bo_adev =
				amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);

			addr[i] = (hmm_pfns[i] << PAGE_SHIFT) +
				  bo_adev->vm_manager.vram_base_offset -
				  bo_adev->kfd.dev->pgmap.range.start;
			addr[i] |= SVM_RANGE_VRAM_DOMAIN;
			pr_debug_ratelimited("vram address: 0x%llx\n", addr[i]);
			continue;
		}
		addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
		r = dma_mapping_error(dev, addr[i]);
		if (r) {
			dev_err(dev, "failed %d dma_map_page\n", r);
			return r;
		}
		pr_debug_ratelimited("dma mapping 0x%llx for page addr 0x%lx\n",
				     addr[i] >> PAGE_SHIFT, page_to_pfn(page));
	}
	return 0;
}

static int
svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap,
		  unsigned long offset, unsigned long npages,
		  unsigned long *hmm_pfns)
{
	struct kfd_process *p;
	uint32_t gpuidx;
	int r;

	p = container_of(prange->svms, struct kfd_process, svms);

	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
		struct kfd_process_device *pdd;

		pr_debug("mapping to gpu idx 0x%x\n", gpuidx);
		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			return -EINVAL;
		}

		r = svm_range_dma_map_dev(pdd->dev->adev, prange, offset, npages,
					  hmm_pfns, gpuidx);
		if (r)
			break;
	}

	return r;
}

void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
			 unsigned long offset, unsigned long npages)
{
	enum dma_data_direction dir = DMA_BIDIRECTIONAL;
	int i;

	if (!dma_addr)
		return;

	for (i = offset; i < offset + npages; i++) {
		if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i]))
			continue;
		pr_debug_ratelimited("unmap 0x%llx\n", dma_addr[i] >> PAGE_SHIFT);
		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
		dma_addr[i] = 0;
	}
}

void svm_range_free_dma_mappings(struct svm_range *prange)
{
	struct kfd_process_device *pdd;
	dma_addr_t *dma_addr;
	struct device *dev;
	struct kfd_process *p;
	uint32_t gpuidx;

	p = container_of(prange->svms, struct kfd_process, svms);

	for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) {
		dma_addr = prange->dma_addr[gpuidx];
		if (!dma_addr)
			continue;

		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			continue;
		}
		dev = &pdd->dev->pdev->dev;
		svm_range_dma_unmap(dev, dma_addr, 0, prange->npages);
		kvfree(dma_addr);
		prange->dma_addr[gpuidx] = NULL;
	}
}

static void svm_range_free(struct svm_range *prange)
{
	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange,
		 prange->start, prange->last);

	svm_range_vram_node_free(prange);
	svm_range_free_dma_mappings(prange);
	mutex_destroy(&prange->lock);
	mutex_destroy(&prange->migrate_mutex);
	kfree(prange);
}

static void
svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc,
				 uint8_t *granularity, uint32_t *flags)
{
	*location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
	*prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
	*granularity = 9;
	*flags =
		KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT;
}

static struct
svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
			 uint64_t last)
{
	uint64_t size = last - start + 1;
	struct svm_range *prange;
	struct kfd_process *p;

	prange = kzalloc(sizeof(*prange), GFP_KERNEL);
	if (!prange)
		return NULL;
	prange->npages = size;
	prange->svms = svms;
	prange->start = start;
	prange->last = last;
	INIT_LIST_HEAD(&prange->list);
	INIT_LIST_HEAD(&prange->update_list);
	INIT_LIST_HEAD(&prange->remove_list);
	INIT_LIST_HEAD(&prange->insert_list);
	INIT_LIST_HEAD(&prange->svm_bo_list);
	INIT_LIST_HEAD(&prange->deferred_list);
	INIT_LIST_HEAD(&prange->child_list);
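	/* invalid is incremented by CPU page table invalidations (see
	 * svm_range_evict); a non-zero count tells the restore worker that
	 * this range must be revalidated and remapped to the GPUs.
	 */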
	atomic_set(&prange->invalid, 0);
	prange->validate_timestamp = 0;
	mutex_init(&prange->migrate_mutex);
	mutex_init(&prange->lock);

	p = container_of(svms, struct kfd_process, svms);
	if (p->xnack_enabled)
		bitmap_copy(prange->bitmap_access, svms->bitmap_supported,
			    MAX_GPU_INSTANCE);

	svm_range_set_default_attributes(&prange->preferred_loc,
					 &prange->prefetch_loc,
					 &prange->granularity, &prange->flags);

	pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last);

	return prange;
}

static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo)
{
	if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref))
		return false;

	return true;
}

static void svm_range_bo_release(struct kref *kref)
{
	struct svm_range_bo *svm_bo;

	svm_bo = container_of(kref, struct svm_range_bo, kref);
	spin_lock(&svm_bo->list_lock);
	while (!list_empty(&svm_bo->range_list)) {
		struct svm_range *prange =
				list_first_entry(&svm_bo->range_list,
						 struct svm_range, svm_bo_list);
		/* list_del_init tells a concurrent svm_range_vram_node_new when
		 * it's safe to reuse the svm_bo pointer and svm_bo_list head.
		 */
		list_del_init(&prange->svm_bo_list);
		spin_unlock(&svm_bo->list_lock);

		pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
			 prange->start, prange->last);
		mutex_lock(&prange->lock);
		prange->svm_bo = NULL;
		mutex_unlock(&prange->lock);

		spin_lock(&svm_bo->list_lock);
	}
	spin_unlock(&svm_bo->list_lock);
	if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) {
		/* We're not in the eviction worker.
		 * Signal the fence and synchronize with any
		 * pending eviction work.
		 */
		dma_fence_signal(&svm_bo->eviction_fence->base);
		cancel_work_sync(&svm_bo->eviction_work);
	}
	dma_fence_put(&svm_bo->eviction_fence->base);
	amdgpu_bo_unref(&svm_bo->bo);
	kfree(svm_bo);
}

void svm_range_bo_unref(struct svm_range_bo *svm_bo)
{
	if (!svm_bo)
		return;

	kref_put(&svm_bo->kref, svm_range_bo_release);
}

static bool
svm_range_validate_svm_bo(struct amdgpu_device *adev, struct svm_range *prange)
{
	struct amdgpu_device *bo_adev;

	mutex_lock(&prange->lock);
	if (!prange->svm_bo) {
		mutex_unlock(&prange->lock);
		return false;
	}
	if (prange->ttm_res) {
		/* We still have a reference, all is well */
		mutex_unlock(&prange->lock);
		return true;
	}
	if (svm_bo_ref_unless_zero(prange->svm_bo)) {
		/*
		 * Migrate from GPU to GPU, remove range from source bo_adev
		 * svm_bo range list, and return false to allocate svm_bo from
		 * destination adev.
		 */
		bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
		if (bo_adev != adev) {
			mutex_unlock(&prange->lock);

			spin_lock(&prange->svm_bo->list_lock);
			list_del_init(&prange->svm_bo_list);
			spin_unlock(&prange->svm_bo->list_lock);

			svm_range_bo_unref(prange->svm_bo);
			return false;
		}
		if (READ_ONCE(prange->svm_bo->evicting)) {
			struct dma_fence *f;
			struct svm_range_bo *svm_bo;
			/* The BO is getting evicted,
			 * we need to get a new one
			 */
			mutex_unlock(&prange->lock);
			svm_bo = prange->svm_bo;
			f = dma_fence_get(&svm_bo->eviction_fence->base);
			svm_range_bo_unref(prange->svm_bo);
			/* wait for the fence to avoid long spin-loop
			 * at list_empty_careful
			 */
			dma_fence_wait(f, false);
			dma_fence_put(f);
		} else {
			/* The BO was still around and we got
			 * a new reference to it
			 */
			mutex_unlock(&prange->lock);
			pr_debug("reuse old bo svms 0x%p [0x%lx 0x%lx]\n",
				 prange->svms, prange->start, prange->last);

			prange->ttm_res = prange->svm_bo->bo->tbo.resource;
			return true;
		}

	} else {
		mutex_unlock(&prange->lock);
	}

	/* We need a new svm_bo. Spin-loop to wait for concurrent
	 * svm_range_bo_release to finish removing this range from
	 * its range list. After this, it is safe to reuse the
	 * svm_bo pointer and svm_bo_list head.
	 */
	while (!list_empty_careful(&prange->svm_bo_list))
		;

	return false;
}

static struct svm_range_bo *svm_range_bo_new(void)
{
	struct svm_range_bo *svm_bo;

	svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL);
	if (!svm_bo)
		return NULL;

	kref_init(&svm_bo->kref);
	INIT_LIST_HEAD(&svm_bo->range_list);
	spin_lock_init(&svm_bo->list_lock);

	return svm_bo;
}

int
svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange,
			bool clear)
{
	struct amdgpu_bo_param bp;
	struct svm_range_bo *svm_bo;
	struct amdgpu_bo_user *ubo;
	struct amdgpu_bo *bo;
	struct kfd_process *p;
	struct mm_struct *mm;
	int r;

	p = container_of(prange->svms, struct kfd_process, svms);
	pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms,
		 prange->start, prange->last);

	if (svm_range_validate_svm_bo(adev, prange))
		return 0;

	svm_bo = svm_range_bo_new();
	if (!svm_bo) {
		pr_debug("failed to alloc svm bo\n");
		return -ENOMEM;
	}
	mm = get_task_mm(p->lead_thread);
	if (!mm) {
		pr_debug("failed to get mm\n");
		kfree(svm_bo);
		return -ESRCH;
	}
	svm_bo->svms = prange->svms;
	svm_bo->eviction_fence =
		amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
					   mm,
					   svm_bo);
	mmput(mm);
	INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker);
	svm_bo->evicting = 0;
	memset(&bp, 0, sizeof(bp));
	bp.size = prange->npages * PAGE_SIZE;
	bp.byte_align = PAGE_SIZE;
	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
	bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
	bp.flags |= clear ? AMDGPU_GEM_CREATE_VRAM_CLEARED : 0;
	bp.flags |= AMDGPU_AMDKFD_CREATE_SVM_BO;
	bp.type = ttm_bo_type_device;
	bp.resv = NULL;

	r = amdgpu_bo_create_user(adev, &bp, &ubo);
	if (r) {
		pr_debug("failed %d to create bo\n", r);
		goto create_bo_failed;
	}
	bo = &ubo->bo;
	r = amdgpu_bo_reserve(bo, true);
	if (r) {
		pr_debug("failed %d to reserve bo\n", r);
		goto reserve_bo_failed;
	}

	r = dma_resv_reserve_shared(bo->tbo.base.resv, 1);
	if (r) {
		pr_debug("failed %d to reserve bo\n", r);
		amdgpu_bo_unreserve(bo);
		goto reserve_bo_failed;
	}
	amdgpu_bo_fence(bo, &svm_bo->eviction_fence->base, true);

	amdgpu_bo_unreserve(bo);

	svm_bo->bo = bo;
	prange->svm_bo = svm_bo;
	prange->ttm_res = bo->tbo.resource;
	prange->offset = 0;

	spin_lock(&svm_bo->list_lock);
	list_add(&prange->svm_bo_list, &svm_bo->range_list);
	spin_unlock(&svm_bo->list_lock);

	return 0;

reserve_bo_failed:
	amdgpu_bo_unref(&bo);
create_bo_failed:
	dma_fence_put(&svm_bo->eviction_fence->base);
	kfree(svm_bo);
	prange->ttm_res = NULL;

	return r;
}

void svm_range_vram_node_free(struct svm_range *prange)
{
	svm_range_bo_unref(prange->svm_bo);
	prange->ttm_res = NULL;
}

struct amdgpu_device *
svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id)
{
	struct kfd_process_device *pdd;
	struct kfd_process *p;
	int32_t gpu_idx;

	p = container_of(prange->svms, struct kfd_process, svms);

	gpu_idx = kfd_process_gpuidx_from_gpuid(p, gpu_id);
	if (gpu_idx < 0) {
		pr_debug("failed to get device by id 0x%x\n", gpu_id);
		return NULL;
	}
	pdd = kfd_process_device_from_gpuidx(p, gpu_idx);
	if (!pdd) {
		pr_debug("failed to get device by idx 0x%x\n", gpu_idx);
		return NULL;
	}

	return pdd->dev->adev;
}

struct kfd_process_device *
svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev)
{
	struct kfd_process *p;
	int32_t gpu_idx, gpuid;
	int r;

	p = container_of(prange->svms, struct kfd_process, svms);

	r = kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpu_idx);
	if (r) {
		pr_debug("failed to get device id by adev %p\n", adev);
		return NULL;
	}

	return kfd_process_device_from_gpuidx(p, gpu_idx);
}

static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo)
{
	struct ttm_operation_ctx ctx = { false, false };

	amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM);

	return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
}

static int
svm_range_check_attr(struct kfd_process *p,
		     uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
{
	uint32_t i;

	for (i = 0; i < nattr; i++) {
		uint32_t val = attrs[i].value;
		int gpuidx = MAX_GPU_INSTANCE;

		switch (attrs[i].type) {
		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
			if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM &&
			    val != KFD_IOCTL_SVM_LOCATION_UNDEFINED)
				gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
			break;
		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
			if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM)
				gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
			break;
		case KFD_IOCTL_SVM_ATTR_ACCESS:
		case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
		case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
			gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
			break;
		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
			break;
		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
			break;
		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
			break;
		default:
			pr_debug("unknown attr type 0x%x\n", attrs[i].type);
			return -EINVAL;
		}

		if (gpuidx < 0) {
			pr_debug("no GPU 0x%x found\n", val);
			return -EINVAL;
		} else if (gpuidx < MAX_GPU_INSTANCE &&
			   !test_bit(gpuidx, p->svms.bitmap_supported)) {
			pr_debug("GPU 0x%x not supported\n", val);
			return -EINVAL;
		}
	}

	return 0;
}

static void
svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange,
		      uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
{
	uint32_t i;
	int gpuidx;

	for (i = 0; i < nattr; i++) {
		switch (attrs[i].type) {
		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
			prange->preferred_loc = attrs[i].value;
			break;
		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
			prange->prefetch_loc = attrs[i].value;
			break;
		case KFD_IOCTL_SVM_ATTR_ACCESS:
		case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
		case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
			gpuidx = kfd_process_gpuidx_from_gpuid(p,
							       attrs[i].value);
			if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) {
				bitmap_clear(prange->bitmap_access, gpuidx, 1);
				bitmap_clear(prange->bitmap_aip, gpuidx, 1);
			} else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) {
				bitmap_set(prange->bitmap_access, gpuidx, 1);
				bitmap_clear(prange->bitmap_aip, gpuidx, 1);
			} else {
				bitmap_clear(prange->bitmap_access, gpuidx, 1);
				bitmap_set(prange->bitmap_aip, gpuidx, 1);
			}
			break;
		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
			prange->flags |= attrs[i].value;
			break;
		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
			prange->flags &= ~attrs[i].value;
			break;
		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
			prange->granularity = attrs[i].value;
			break;
		default:
			WARN_ONCE(1, "svm_range_check_attrs wasn't called?");
		}
	}
}

static bool
svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange,
			uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
{
	uint32_t i;
	int gpuidx;

	for (i = 0; i < nattr; i++) {
		switch (attrs[i].type) {
		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
			if (prange->preferred_loc != attrs[i].value)
				return false;
			break;
		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
			/* Prefetch should always trigger a migration even
			 * if the value of the attribute didn't change.
			 */
			return false;
		case KFD_IOCTL_SVM_ATTR_ACCESS:
		case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
		case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
			gpuidx = kfd_process_gpuidx_from_gpuid(p,
							       attrs[i].value);
			if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) {
				if (test_bit(gpuidx, prange->bitmap_access) ||
				    test_bit(gpuidx, prange->bitmap_aip))
					return false;
			} else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) {
				if (!test_bit(gpuidx, prange->bitmap_access))
					return false;
			} else {
				if (!test_bit(gpuidx, prange->bitmap_aip))
					return false;
			}
			break;
		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
			if ((prange->flags & attrs[i].value) != attrs[i].value)
				return false;
			break;
		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
			if ((prange->flags & attrs[i].value) != 0)
				return false;
			break;
		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
			if (prange->granularity != attrs[i].value)
				return false;
			break;
		default:
			WARN_ONCE(1, "svm_range_check_attrs wasn't called?");
		}
	}

	return true;
}

/**
 * svm_range_debug_dump - print all range information from svms
 * @svms: svm range list header
 *
 * debug output svm range start, end, prefetch location from svms
 * interval tree and link list
 *
 * Context: The caller must hold svms->lock
 */
static void svm_range_debug_dump(struct svm_range_list *svms)
{
	struct interval_tree_node *node;
	struct svm_range *prange;

	pr_debug("dump svms 0x%p list\n", svms);
	pr_debug("range\tstart\tpage\tend\t\tlocation\n");

	list_for_each_entry(prange, &svms->list, list) {
		pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
			 prange, prange->start, prange->npages,
			 prange->start + prange->npages - 1,
			 prange->actual_loc);
	}

	pr_debug("dump svms 0x%p interval tree\n", svms);
	pr_debug("range\tstart\tpage\tend\t\tlocation\n");
	node = interval_tree_iter_first(&svms->objects, 0, ~0ULL);
	while (node) {
		prange = container_of(node, struct svm_range, it_node);
		pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
			 prange, prange->start, prange->npages,
			 prange->start + prange->npages - 1,
			 prange->actual_loc);
		node = interval_tree_iter_next(node, 0, ~0ULL);
	}
}

static int
svm_range_split_array(void *ppnew, void *ppold, size_t size,
		      uint64_t old_start, uint64_t old_n,
		      uint64_t new_start, uint64_t new_n)
{
	unsigned char *new, *old, *pold;
	uint64_t d;

	if (!ppold)
		return 0;
	pold = *(unsigned char **)ppold;
	if (!pold)
		return 0;

	new = kvmalloc_array(new_n, size, GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	d = (new_start - old_start) * size;
	memcpy(new, pold + d, new_n * size);

	old = kvmalloc_array(old_n, size, GFP_KERNEL);
	if (!old) {
		kvfree(new);
		return -ENOMEM;
	}

	d = (new_start == old_start) ? new_n * size : 0;
	memcpy(old, pold + d, old_n * size);

	kvfree(pold);
	*(void **)ppold = old;
	*(void **)ppnew = new;

	return 0;
}

static int
svm_range_split_pages(struct svm_range *new, struct svm_range *old,
		      uint64_t start, uint64_t last)
{
	uint64_t npages = last - start + 1;
	int i, r;

	for (i = 0; i < MAX_GPU_INSTANCE; i++) {
		r = svm_range_split_array(&new->dma_addr[i], &old->dma_addr[i],
					  sizeof(*old->dma_addr[i]), old->start,
					  npages, new->start, new->npages);
		if (r)
			return r;
	}

	return 0;
}

static int
svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
		      uint64_t start, uint64_t last)
{
	uint64_t npages = last - start + 1;

	pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n",
		 new->svms, new, new->start, start, last);

	if (new->start == old->start) {
		new->offset = old->offset;
		old->offset += new->npages;
	} else {
		new->offset = old->offset + npages;
	}

	new->svm_bo = svm_range_bo_ref(old->svm_bo);
	new->ttm_res = old->ttm_res;

	spin_lock(&new->svm_bo->list_lock);
	list_add(&new->svm_bo_list, &new->svm_bo->range_list);
	spin_unlock(&new->svm_bo->list_lock);

	return 0;
}

/**
 * svm_range_split_adjust - split range and adjust
 *
 * @new: new range
 * @old: the old range
 * @start: the old range adjust to start address in pages
 * @last: the old range adjust to last address in pages
 *
 * Copy system memory dma_addr or vram ttm_res in old range to new
 * range from new_start up to size new->npages, the remaining old range is from
 * start to last
 *
 * Return:
 * 0 - OK, -ENOMEM - out of memory
 */
static int
svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
		       uint64_t start, uint64_t last)
{
	int r;

	pr_debug("svms 0x%p new 0x%lx old [0x%lx 0x%lx] => [0x%llx 0x%llx]\n",
		 new->svms, new->start, old->start, old->last, start, last);

	if (new->start < old->start ||
	    new->last > old->last) {
		WARN_ONCE(1, "invalid new range start or last\n");
		return -EINVAL;
	}

	r = svm_range_split_pages(new, old, start, last);
	if (r)
		return r;

	if (old->actual_loc && old->ttm_res) {
		r = svm_range_split_nodes(new, old, start, last);
		if (r)
			return r;
	}

	old->npages = last - start + 1;
	old->start = start;
	old->last = last;
	new->flags = old->flags;
	new->preferred_loc = old->preferred_loc;
	new->prefetch_loc = old->prefetch_loc;
	new->actual_loc = old->actual_loc;
	new->granularity = old->granularity;
	bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
	bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);

	return 0;
}

/**
 * svm_range_split - split a range in 2 ranges
 *
 * @prange: the svm range to split
 * @start: the remaining range start address in pages
 * @last: the remaining range last address in pages
 * @new: the result new range generated
 *
 * Two cases only:
 * case 1: if start == prange->start
 *         prange ==> prange[start, last]
 *         new range [last + 1, prange->last]
 *
 * case 2: if last == prange->last
 *         prange ==> prange[start, last]
 *         new range [prange->start, start - 1]
 *
 * Return:
 * 0 - OK, -ENOMEM - out of memory, -EINVAL - invalid start, last
 */
static int
svm_range_split(struct svm_range *prange, uint64_t start, uint64_t last,
		struct svm_range **new)
{
	uint64_t old_start = prange->start;
	uint64_t old_last = prange->last;
	struct svm_range_list *svms;
	int r = 0;

	pr_debug("svms 0x%p [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", prange->svms,
		 old_start, old_last, start, last);

	if (old_start != start && old_last != last)
		return -EINVAL;
	if (start < old_start || last > old_last)
		return -EINVAL;

	svms = prange->svms;
	if (old_start == start)
		*new = svm_range_new(svms, last + 1, old_last);
	else
		*new = svm_range_new(svms, old_start, start - 1);
	if (!*new)
		return -ENOMEM;

	r = svm_range_split_adjust(*new, prange, start, last);
	if (r) {
		pr_debug("failed %d split [0x%llx 0x%llx] to [0x%llx 0x%llx]\n",
			 r, old_start, old_last, start, last);
		svm_range_free(*new);
		*new = NULL;
	}

	return r;
}

static int
svm_range_split_tail(struct svm_range *prange,
		     uint64_t new_last, struct list_head *insert_list)
{
	struct svm_range *tail;
	int r = svm_range_split(prange, prange->start, new_last, &tail);

	if (!r)
		list_add(&tail->insert_list, insert_list);
	return r;
}

static int
svm_range_split_head(struct svm_range *prange,
		     uint64_t new_start, struct list_head *insert_list)
{
	struct svm_range *head;
	int r = svm_range_split(prange, new_start, prange->last, &head);

	if (!r)
		list_add(&head->insert_list, insert_list);
	return r;
}

static void
svm_range_add_child(struct svm_range *prange, struct mm_struct *mm,
		    struct svm_range *pchild, enum svm_work_list_ops op)
{
	pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n",
		 pchild, pchild->start, pchild->last, prange, op);

	pchild->work_item.mm = mm;
	pchild->work_item.op = op;
	list_add_tail(&pchild->child_list, &prange->child_list);
}

/**
 * svm_range_split_by_granularity - collect ranges within granularity boundary
 *
 * @p: the process with svms list
 * @mm: mm structure
 * @addr: the vm fault address in pages, to split the prange
 * @parent: parent range if prange is from child list
 * @prange: prange to split
 *
 * Trims @prange to be a single aligned block of prange->granularity if
 * possible. The head and tail are added to the child_list in @parent.
 *
 * Context: caller must hold mmap_read_lock and prange->lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */
int
svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm,
			       unsigned long addr, struct svm_range *parent,
			       struct svm_range *prange)
{
	struct svm_range *head, *tail;
	unsigned long start, last, size;
	int r;

	/* Align the split range start and size to the granularity size, then a
	 * single PTE will be used for the whole range. This reduces the number
	 * of PTEs updated and the L1 TLB space used for translation.
	 */
	size = 1UL << prange->granularity;
	start = ALIGN_DOWN(addr, size);
	last = ALIGN(addr + 1, size) - 1;

	pr_debug("svms 0x%p split [0x%lx 0x%lx] to [0x%lx 0x%lx] size 0x%lx\n",
		 prange->svms, prange->start, prange->last, start, last, size);

	if (start > prange->start) {
		r = svm_range_split(prange, start, prange->last, &head);
		if (r)
			return r;
		svm_range_add_child(parent, mm, head, SVM_OP_ADD_RANGE);
	}

	if (last < prange->last) {
		r = svm_range_split(prange, prange->start, last, &tail);
		if (r)
			return r;
		svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
	}

	/* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */
	if (p->xnack_enabled && prange->work_item.op == SVM_OP_ADD_RANGE) {
		prange->work_item.op = SVM_OP_ADD_RANGE_AND_MAP;
		pr_debug("change prange 0x%p [0x%lx 0x%lx] op %d\n",
			 prange, prange->start, prange->last,
			 SVM_OP_ADD_RANGE_AND_MAP);
	}
	return 0;
}

static uint64_t
svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange,
			int domain)
{
	struct amdgpu_device *bo_adev;
	uint32_t flags = prange->flags;
	uint32_t mapping_flags = 0;
	uint64_t pte_flags;
	bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN);
	bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT;

	if (domain == SVM_RANGE_VRAM_DOMAIN)
		bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);

	switch (KFD_GC_VERSION(adev->kfd.dev)) {
	case IP_VERSION(9, 4, 1):
		if (domain == SVM_RANGE_VRAM_DOMAIN) {
			if (bo_adev == adev) {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
			} else {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
				if (amdgpu_xgmi_same_hive(adev, bo_adev))
					snoop = true;
			}
		} else {
			mapping_flags |= coherent ?
				AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
		}
		break;
	case IP_VERSION(9, 4, 2):
		if (domain == SVM_RANGE_VRAM_DOMAIN) {
			if (bo_adev == adev) {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
				if (adev->gmc.xgmi.connected_to_cpu)
					snoop = true;
			} else {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
				if (amdgpu_xgmi_same_hive(adev, bo_adev))
					snoop = true;
			}
		} else {
			mapping_flags |= coherent ?
				AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
		}
		break;
	default:
		mapping_flags |= coherent ?
			AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
	}

	mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE;

	if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO)
		mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE;
	if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC)
		mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;

	pte_flags = AMDGPU_PTE_VALID;
	pte_flags |= (domain == SVM_RANGE_VRAM_DOMAIN) ? 0 : AMDGPU_PTE_SYSTEM;
	pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0;

	pte_flags |= amdgpu_gem_va_map_flags(adev, mapping_flags);
	return pte_flags;
}

static int
svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
			 uint64_t start, uint64_t last,
			 struct dma_fence **fence)
{
	uint64_t init_pte_value = 0;

	pr_debug("[0x%llx 0x%llx]\n", start, last);

	return amdgpu_vm_bo_update_mapping(adev, adev, vm, false, true, NULL,
					   start, last, init_pte_value, 0,
					   NULL, NULL, fence, NULL);
}

static int
svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
			  unsigned long last)
{
	DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
	struct kfd_process_device *pdd;
	struct dma_fence *fence = NULL;
	struct kfd_process *p;
	uint32_t gpuidx;
	int r = 0;

	bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
		  MAX_GPU_INSTANCE);
	p = container_of(prange->svms, struct kfd_process, svms);

	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
		pr_debug("unmap from gpu idx 0x%x\n", gpuidx);
		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			return -EINVAL;
		}

		r = svm_range_unmap_from_gpu(pdd->dev->adev,
					     drm_priv_to_vm(pdd->drm_priv),
					     start, last, &fence);
		if (r)
			break;

		if (fence) {
			r = dma_fence_wait(fence, false);
			dma_fence_put(fence);
			fence = NULL;
			if (r)
				break;
		}
		amdgpu_amdkfd_flush_gpu_tlb_pasid(pdd->dev->adev,
						  p->pasid, TLB_FLUSH_HEAVYWEIGHT);
	}

	return r;
}

static int
svm_range_map_to_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
		     struct svm_range *prange, unsigned long offset,
		     unsigned long npages, bool readonly, dma_addr_t *dma_addr,
		     struct amdgpu_device *bo_adev, struct dma_fence **fence)
{
	struct amdgpu_bo_va bo_va;
	bool table_freed = false;
	uint64_t pte_flags;
	unsigned long last_start;
	int last_domain;
	int r = 0;
	int64_t i, j;

	last_start = prange->start + offset;

	pr_debug("svms 0x%p [0x%lx 0x%lx] readonly %d\n", prange->svms,
		 last_start, last_start + npages - 1, readonly);

	if (prange->svm_bo && prange->ttm_res)
		bo_va.is_xgmi = amdgpu_xgmi_same_hive(adev, bo_adev);

	for (i = offset; i < offset + npages; i++) {
		last_domain = dma_addr[i] & SVM_RANGE_VRAM_DOMAIN;
		dma_addr[i] &= ~SVM_RANGE_VRAM_DOMAIN;

		/* Collect all pages in the same address range and memory domain
		 * that can be mapped with a single call to update mapping.
		 */
		if (i < offset + npages - 1 &&
		    last_domain == (dma_addr[i + 1] & SVM_RANGE_VRAM_DOMAIN))
			continue;

		pr_debug("Mapping range [0x%lx 0x%llx] on domain: %s\n",
			 last_start, prange->start + i, last_domain ? "GPU" : "CPU");

		pte_flags = svm_range_get_pte_flags(adev, prange, last_domain);
		if (readonly)
			pte_flags &= ~AMDGPU_PTE_WRITEABLE;

		pr_debug("svms 0x%p map [0x%lx 0x%llx] vram %d PTE 0x%llx\n",
			 prange->svms, last_start, prange->start + i,
			 (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 1 : 0,
			 pte_flags);

		r = amdgpu_vm_bo_update_mapping(adev, bo_adev, vm, false, false,
						NULL, last_start,
						prange->start + i, pte_flags,
						last_start - prange->start,
						NULL, dma_addr,
						&vm->last_update,
						&table_freed);

		for (j = last_start - prange->start; j <= i; j++)
			dma_addr[j] |= last_domain;

		if (r) {
			pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start);
			goto out;
		}
		last_start = prange->start + i + 1;
	}

	r = amdgpu_vm_update_pdes(adev, vm, false);
	if (r) {
		pr_debug("failed %d to update directories 0x%lx\n", r,
			 prange->start);
		goto out;
	}

	if (fence)
		*fence = dma_fence_get(vm->last_update);

	if (table_freed) {
		struct kfd_process *p;

		p = container_of(prange->svms, struct kfd_process, svms);
		amdgpu_amdkfd_flush_gpu_tlb_pasid(adev, p->pasid, TLB_FLUSH_LEGACY);
	}
out:
	return r;
}

static int
svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset,
		      unsigned long npages, bool readonly,
		      unsigned long *bitmap, bool wait)
{
	struct kfd_process_device *pdd;
	struct amdgpu_device *bo_adev;
	struct kfd_process *p;
	struct dma_fence *fence = NULL;
	uint32_t gpuidx;
	int r = 0;

	if (prange->svm_bo && prange->ttm_res)
		bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
	else
		bo_adev = NULL;

	p = container_of(prange->svms, struct kfd_process, svms);
	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
		pr_debug("mapping to gpu idx 0x%x\n", gpuidx);
		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			return -EINVAL;
		}

		pdd = kfd_bind_process_to_device(pdd->dev, p);
		if (IS_ERR(pdd))
			return -EINVAL;

		if (bo_adev && pdd->dev->adev != bo_adev &&
		    !amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) {
			pr_debug("cannot map to device idx %d\n", gpuidx);
			continue;
		}

		r = svm_range_map_to_gpu(pdd->dev->adev, drm_priv_to_vm(pdd->drm_priv),
					 prange, offset, npages, readonly,
					 prange->dma_addr[gpuidx],
					 bo_adev, wait ? &fence : NULL);
		if (r)
			break;

		if (fence) {
			r = dma_fence_wait(fence, false);
			dma_fence_put(fence);
			fence = NULL;
			if (r) {
				pr_debug("failed %d to dma fence wait\n", r);
				break;
			}
		}
	}

	return r;
}

struct svm_validate_context {
	struct kfd_process *process;
	struct svm_range *prange;
	bool intr;
	unsigned long bitmap[MAX_GPU_INSTANCE];
	struct ttm_validate_buffer tv[MAX_GPU_INSTANCE];
	struct list_head validate_list;
	struct ww_acquire_ctx ticket;
};

static int svm_range_reserve_bos(struct svm_validate_context *ctx)
{
	struct kfd_process_device *pdd;
	struct amdgpu_vm *vm;
	uint32_t gpuidx;
	int r;

	INIT_LIST_HEAD(&ctx->validate_list);
	for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
		pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			return -EINVAL;
		}
		vm = drm_priv_to_vm(pdd->drm_priv);

		ctx->tv[gpuidx].bo = &vm->root.bo->tbo;
		ctx->tv[gpuidx].num_shared = 4;
		list_add(&ctx->tv[gpuidx].head, &ctx->validate_list);
	}

	r = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->validate_list,
				   ctx->intr, NULL);
	if (r) {
		pr_debug("failed %d to reserve bo\n", r);
		return r;
	}

	for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
		pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			r = -EINVAL;
			goto unreserve_out;
		}

		r = amdgpu_vm_validate_pt_bos(pdd->dev->adev,
					      drm_priv_to_vm(pdd->drm_priv),
					      svm_range_bo_validate, NULL);
		if (r) {
			pr_debug("failed %d validate pt bos\n", r);
			goto unreserve_out;
		}
	}

	return 0;

unreserve_out:
	ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list);
	return r;
}

static void svm_range_unreserve_bos(struct svm_validate_context *ctx)
{
	ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list);
}

static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx)
{
	struct kfd_process_device *pdd;

	pdd = kfd_process_device_from_gpuidx(p, gpuidx);

	return SVM_ADEV_PGMAP_OWNER(pdd->dev->adev);
}

/*
 * Validation+GPU mapping with concurrent invalidation (MMU notifiers)
 *
 * To prevent concurrent destruction or change of range attributes, the
 * svm_read_lock must be held. The caller must not hold the svm_write_lock
 * because that would block concurrent evictions and lead to deadlocks. To
 * serialize concurrent migrations or validations of the same range, the
 * prange->migrate_mutex must be held.
 *
 * For VRAM ranges, the SVM BO must be allocated and valid (protected by its
 * eviction fence).
 *
 * The following sequence ensures race-free validation and GPU mapping:
 *
 * 1. Reserve page table (and SVM BO if range is in VRAM)
 * 2. hmm_range_fault to get page addresses (if system memory)
 * 3. DMA-map pages (if system memory)
 * 4-a. Take notifier lock
 * 4-b. Check that pages still valid (mmu_interval_read_retry)
 * 4-c. Check that the range was not split or otherwise invalidated
 * 4-d. Update GPU page table
 * 4-e. Release notifier lock
 * 5. Release page table (and SVM BO) reservation
 */
static int svm_range_validate_and_map(struct mm_struct *mm,
				      struct svm_range *prange,
				      int32_t gpuidx, bool intr, bool wait)
{
	struct svm_validate_context ctx;
	unsigned long start, end, addr;
	struct kfd_process *p;
	void *owner;
	int32_t idx;
	int r = 0;

	ctx.process = container_of(prange->svms, struct kfd_process, svms);
	ctx.prange = prange;
	ctx.intr = intr;

	if (gpuidx < MAX_GPU_INSTANCE) {
		bitmap_zero(ctx.bitmap, MAX_GPU_INSTANCE);
		bitmap_set(ctx.bitmap, gpuidx, 1);
	} else if (ctx.process->xnack_enabled) {
		bitmap_copy(ctx.bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);

		/* If prefetch range to GPU, or GPU retry fault migrate range to
		 * GPU, which has ACCESS attribute to the range, create mapping
		 * on that GPU.
		 */
		if (prange->actual_loc) {
			gpuidx = kfd_process_gpuidx_from_gpuid(ctx.process,
							prange->actual_loc);
			if (gpuidx < 0) {
				WARN_ONCE(1, "failed get device by id 0x%x\n",
					  prange->actual_loc);
				return -EINVAL;
			}
			if (test_bit(gpuidx, prange->bitmap_access))
				bitmap_set(ctx.bitmap, gpuidx, 1);
		}
	} else {
		bitmap_or(ctx.bitmap, prange->bitmap_access,
			  prange->bitmap_aip, MAX_GPU_INSTANCE);
	}

	if (bitmap_empty(ctx.bitmap, MAX_GPU_INSTANCE))
		return 0;

	if (prange->actual_loc && !prange->ttm_res) {
		/* This should never happen. actual_loc gets set by
		 * svm_migrate_ram_to_vram after allocating a BO.
		 */
		WARN_ONCE(1, "VRAM BO missing during validation\n");
		return -EINVAL;
	}

	svm_range_reserve_bos(&ctx);

	p = container_of(prange->svms, struct kfd_process, svms);
	owner = kfd_svm_page_owner(p, find_first_bit(ctx.bitmap,
						     MAX_GPU_INSTANCE));
	for_each_set_bit(idx, ctx.bitmap, MAX_GPU_INSTANCE) {
		if (kfd_svm_page_owner(p, idx) != owner) {
			owner = NULL;
			break;
		}
	}

	start = prange->start << PAGE_SHIFT;
	end = (prange->last + 1) << PAGE_SHIFT;
	for (addr = start; addr < end && !r; ) {
		struct hmm_range *hmm_range;
		struct vm_area_struct *vma;
		unsigned long next;
		unsigned long offset;
		unsigned long npages;
		bool readonly;

		vma = find_vma(mm, addr);
		if (!vma || addr < vma->vm_start) {
			r = -EFAULT;
			goto unreserve_out;
		}
		readonly = !(vma->vm_flags & VM_WRITE);

		next = min(vma->vm_end, end);
		npages = (next - addr) >> PAGE_SHIFT;
		WRITE_ONCE(p->svms.faulting_task, current);
		r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL,
					       addr, npages, &hmm_range,
					       readonly, true, owner);
		WRITE_ONCE(p->svms.faulting_task, NULL);
		if (r) {
			pr_debug("failed %d to get svm range pages\n", r);
			goto unreserve_out;
		}

		offset = (addr - start) >> PAGE_SHIFT;
		r = svm_range_dma_map(prange, ctx.bitmap, offset, npages,
				      hmm_range->hmm_pfns);
		if (r) {
			pr_debug("failed %d to dma map range\n", r);
			goto unreserve_out;
		}

		svm_range_lock(prange);
		if (amdgpu_hmm_range_get_pages_done(hmm_range)) {
			pr_debug("hmm update the range, need validate again\n");
			r = -EAGAIN;
			goto unlock_out;
		}
		if (!list_empty(&prange->child_list)) {
			pr_debug("range split by unmap in parallel, validate again\n");
			r = -EAGAIN;
			goto unlock_out;
		}

		r = svm_range_map_to_gpus(prange, offset, npages, readonly,
					  ctx.bitmap, wait);

unlock_out:
		svm_range_unlock(prange);

		addr = next;
	}

	if (addr == end)
		prange->validated_once = true;

unreserve_out:
	svm_range_unreserve_bos(&ctx);

	if (!r)
		prange->validate_timestamp = ktime_to_us(ktime_get());

	return r;
}

/**
 * svm_range_list_lock_and_flush_work - flush pending deferred work
 *
 * @svms: the svm range list
 * @mm: the mm structure
 *
 * Context: Returns with mmap write lock held, pending deferred work flushed
 *
 */
void
svm_range_list_lock_and_flush_work(struct svm_range_list *svms,
				   struct mm_struct *mm)
{
retry_flush_work:
	flush_work(&svms->deferred_list_work);
	mmap_write_lock(mm);

	if (list_empty(&svms->deferred_range_list))
		return;
	mmap_write_unlock(mm);
	pr_debug("retry flush\n");
	goto retry_flush_work;
}

static void svm_range_restore_work(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct svm_range_list *svms;
	struct svm_range *prange;
	struct kfd_process *p;
	struct mm_struct *mm;
	int evicted_ranges;
	int invalid;
	int r;

	svms = container_of(dwork, struct svm_range_list, restore_work);
	evicted_ranges = atomic_read(&svms->evicted_ranges);
	if (!evicted_ranges)
		return;

	pr_debug("restore svm ranges\n");

	/* kfd_process_notifier_release destroys this worker thread. So during
	 * the lifetime of this thread, kfd_process and mm will be valid.
	 */
	p = container_of(svms, struct kfd_process, svms);
	mm = p->mm;
	if (!mm)
		return;

	svm_range_list_lock_and_flush_work(svms, mm);
	mutex_lock(&svms->lock);

	evicted_ranges = atomic_read(&svms->evicted_ranges);

	list_for_each_entry(prange, &svms->list, list) {
		invalid = atomic_read(&prange->invalid);
		if (!invalid)
			continue;

		pr_debug("restoring svms 0x%p prange 0x%p [0x%lx %lx] inv %d\n",
			 prange->svms, prange, prange->start, prange->last,
			 invalid);

		/*
		 * If the range is migrating, wait for the migration to finish.
		 */
		mutex_lock(&prange->migrate_mutex);

		r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
					       false, true);
		if (r)
			pr_debug("failed %d to map 0x%lx to gpus\n", r,
				 prange->start);

		mutex_unlock(&prange->migrate_mutex);
		if (r)
			goto out_reschedule;

		if (atomic_cmpxchg(&prange->invalid, invalid, 0) != invalid)
			goto out_reschedule;
	}

	if (atomic_cmpxchg(&svms->evicted_ranges, evicted_ranges, 0) !=
	    evicted_ranges)
		goto out_reschedule;

	evicted_ranges = 0;

	r = kgd2kfd_resume_mm(mm);
	if (r) {
		/* No recovery from this failure. Probably the CP is
		 * hanging. No point trying again.
		 */
		pr_debug("failed %d to resume KFD\n", r);
	}

	pr_debug("restore svm ranges successfully\n");

out_reschedule:
	mutex_unlock(&svms->lock);
	mmap_write_unlock(mm);

	/* If validation failed, reschedule another attempt */
	if (evicted_ranges) {
		pr_debug("reschedule to restore svm range\n");
		schedule_delayed_work(&svms->restore_work,
			msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
	}
}

/**
 * svm_range_evict - evict svm range
 * @prange: svm range structure
 * @mm: current process mm_struct
 * @start: first page address of the invalidated range
 * @last: last page address of the invalidated range
 *
 * Stop all queues of the process to ensure GPU doesn't access the memory, then
 * return to let the CPU evict the buffer and proceed with the CPU page table
 * update.
 *
 * No lock is needed to synchronize the CPU page table invalidation with GPU
 * execution. If an invalidation happens while the restore work is running, the
 * restore work restarts to ensure it maps the latest CPU page addresses to the
 * GPU before starting the queues.
 */
static int
svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
		unsigned long start, unsigned long last)
{
	struct svm_range_list *svms = prange->svms;
	struct svm_range *pchild;
	struct kfd_process *p;
	int r = 0;

	p = container_of(svms, struct kfd_process, svms);

	pr_debug("invalidate svms 0x%p prange [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
		 svms, prange->start, prange->last, start, last);

	if (!p->xnack_enabled) {
		int evicted_ranges;

		list_for_each_entry(pchild, &prange->child_list, child_list) {
			mutex_lock_nested(&pchild->lock, 1);
			if (pchild->start <= last && pchild->last >= start) {
				pr_debug("increment pchild invalid [0x%lx 0x%lx]\n",
					 pchild->start, pchild->last);
				atomic_inc(&pchild->invalid);
			}
			mutex_unlock(&pchild->lock);
		}

		if (prange->start <= last && prange->last >= start)
			atomic_inc(&prange->invalid);

		evicted_ranges = atomic_inc_return(&svms->evicted_ranges);
		if (evicted_ranges != 1)
			return r;

		pr_debug("evicting svms 0x%p range [0x%lx 0x%lx]\n",
			 prange->svms, prange->start, prange->last);

		/* First eviction, stop the queues */
		r = kgd2kfd_quiesce_mm(mm);
		if (r)
			pr_debug("failed to quiesce KFD\n");

		pr_debug("schedule to restore svm %p ranges\n", svms);
		schedule_delayed_work(&svms->restore_work,
			msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
	} else {
		unsigned long s, l;

		pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n",
			 prange->svms, start, last);
		list_for_each_entry(pchild, &prange->child_list, child_list) {
			mutex_lock_nested(&pchild->lock, 1);
			s = max(start, pchild->start);
			l = min(last, pchild->last);
			if (l >= s)
				svm_range_unmap_from_gpus(pchild, s, l);
			mutex_unlock(&pchild->lock);
		}
		s = max(start, prange->start);
		l = min(last, prange->last);
		if (l >= s)
			svm_range_unmap_from_gpus(prange, s, l);
	}

	return r;
}

static struct svm_range *svm_range_clone(struct svm_range *old)
{
	struct svm_range *new;

	new = svm_range_new(old->svms, old->start, old->last);
	if (!new)
		return NULL;

	if (old->svm_bo) {
		new->ttm_res = old->ttm_res;
		new->offset = old->offset;
		new->svm_bo = svm_range_bo_ref(old->svm_bo);
		spin_lock(&new->svm_bo->list_lock);
		list_add(&new->svm_bo_list, &new->svm_bo->range_list);
		spin_unlock(&new->svm_bo->list_lock);
	}
	new->flags = old->flags;
	new->preferred_loc = old->preferred_loc;
	new->prefetch_loc = old->prefetch_loc;
	new->actual_loc = old->actual_loc;
	new->granularity = old->granularity;
	bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
	bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);

	return new;
}

/**
 * svm_range_add - add svm range and handle overlap
 * @p: the process to add this range to
 * @start: page aligned start address of the range, in pages
 * @size: page aligned size of the range, in pages
 * @nattr: number of attributes
 * @attrs: array of attributes
 * @update_list: output, the ranges that need to be validated and have their
 *               GPU mapping updated
 * @insert_list: output, the ranges that need to be inserted into svms
 * @remove_list: output, the ranges that are replaced and need to be removed
 *               from svms
 *
 * Check if the virtual address range has overlap with any existing ranges,
 * split partly overlapping ranges and add new ranges in the gaps. All changes
 * should be applied to the range_list and interval tree transactionally. If
 * any range split or allocation fails, the entire update fails. Therefore any
 * existing overlapping svm_ranges are cloned and the original svm_ranges left
 * unchanged.
 *
 * If the transaction succeeds, the caller can update and insert clones and
 * new ranges, then free the originals.
 *
 * Otherwise the caller can free the clones and new ranges, while the old
 * svm_ranges remain unchanged.
 *
 * Context: Process context, caller must hold svms->lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */
static int
svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
	      uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
	      struct list_head *update_list, struct list_head *insert_list,
	      struct list_head *remove_list)
{
	unsigned long last = start + size - 1UL;
	struct svm_range_list *svms = &p->svms;
	struct interval_tree_node *node;
	struct svm_range *prange;
	struct svm_range *tmp;
	int r = 0;

	pr_debug("svms 0x%p [0x%llx 0x%lx]\n", &p->svms, start, last);

	INIT_LIST_HEAD(update_list);
	INIT_LIST_HEAD(insert_list);
	INIT_LIST_HEAD(remove_list);

	node = interval_tree_iter_first(&svms->objects, start, last);
	while (node) {
		struct interval_tree_node *next;
		unsigned long next_start;

		pr_debug("found overlap node [0x%lx 0x%lx]\n", node->start,
			 node->last);

		prange = container_of(node, struct svm_range, it_node);
		next = interval_tree_iter_next(node, start, last);
		next_start = min(node->last, last) + 1;

		if (svm_range_is_same_attrs(p, prange, nattr, attrs)) {
			/* nothing to do */
		} else if (node->start < start || node->last > last) {
			/* node intersects the update range and its attributes
			 * will change.
			 * Clone and split it, apply updates only
			 * to the overlapping part
			 */
			struct svm_range *old = prange;

			prange = svm_range_clone(old);
			if (!prange) {
				r = -ENOMEM;
				goto out;
			}

			list_add(&old->remove_list, remove_list);
			list_add(&prange->insert_list, insert_list);
			list_add(&prange->update_list, update_list);

			if (node->start < start) {
				pr_debug("change old range start\n");
				r = svm_range_split_head(prange, start,
							 insert_list);
				if (r)
					goto out;
			}
			if (node->last > last) {
				pr_debug("change old range last\n");
				r = svm_range_split_tail(prange, last,
							 insert_list);
				if (r)
					goto out;
			}
		} else {
			/* The node is contained within start..last,
			 * just update it
			 */
			list_add(&prange->update_list, update_list);
		}

		/* insert a new node if needed */
		if (node->start > start) {
			prange = svm_range_new(svms, start, node->start - 1);
			if (!prange) {
				r = -ENOMEM;
				goto out;
			}

			list_add(&prange->insert_list, insert_list);
			list_add(&prange->update_list, update_list);
		}

		node = next;
		start = next_start;
	}

	/* add a final range at the end if needed */
	if (start <= last) {
		prange = svm_range_new(svms, start, last);
		if (!prange) {
			r = -ENOMEM;
			goto out;
		}
		list_add(&prange->insert_list, insert_list);
		list_add(&prange->update_list, update_list);
	}

out:
	if (r)
		list_for_each_entry_safe(prange, tmp, insert_list, insert_list)
			svm_range_free(prange);

	return r;
}

static void
svm_range_update_notifier_and_interval_tree(struct mm_struct *mm,
					    struct svm_range *prange)
{
	unsigned long start;
	unsigned long last;

	start = prange->notifier.interval_tree.start >> PAGE_SHIFT;
	last = prange->notifier.interval_tree.last >> PAGE_SHIFT;

	if (prange->start == start && prange->last == last)
		return;

	pr_debug("up notifier 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
		 prange->svms, prange, start, last, prange->start,
		 prange->last);

	if (start != 0 && last != 0) {
		interval_tree_remove(&prange->it_node, &prange->svms->objects);
		svm_range_remove_notifier(prange);
	}
	prange->it_node.start = prange->start;
	prange->it_node.last = prange->last;

	interval_tree_insert(&prange->it_node, &prange->svms->objects);
	svm_range_add_notifier_locked(mm, prange);
}

static void
svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange)
{
	struct mm_struct *mm = prange->work_item.mm;

	switch (prange->work_item.op) {
	case SVM_OP_NULL:
		pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n",
			 svms, prange, prange->start, prange->last);
		break;
	case SVM_OP_UNMAP_RANGE:
		pr_debug("remove 0x%p prange 0x%p [0x%lx 0x%lx]\n",
			 svms, prange, prange->start, prange->last);
		svm_range_unlink(prange);
		svm_range_remove_notifier(prange);
		svm_range_free(prange);
		break;
	case SVM_OP_UPDATE_RANGE_NOTIFIER:
		pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n",
			 svms, prange, prange->start, prange->last);
		svm_range_update_notifier_and_interval_tree(mm, prange);
		break;
	case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP:
		pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n",
prange->start, prange->last); 1995 svm_range_update_notifier_and_interval_tree(mm, prange); 1996 /* TODO: implement deferred validation and mapping */ 1997 break; 1998 case SVM_OP_ADD_RANGE: 1999 pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange, 2000 prange->start, prange->last); 2001 svm_range_add_to_svms(prange); 2002 svm_range_add_notifier_locked(mm, prange); 2003 break; 2004 case SVM_OP_ADD_RANGE_AND_MAP: 2005 pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, 2006 prange, prange->start, prange->last); 2007 svm_range_add_to_svms(prange); 2008 svm_range_add_notifier_locked(mm, prange); 2009 /* TODO: implement deferred validation and mapping */ 2010 break; 2011 default: 2012 WARN_ONCE(1, "Unknown prange 0x%p work op %d\n", prange, 2013 prange->work_item.op); 2014 } 2015 } 2016 2017 static void svm_range_drain_retry_fault(struct svm_range_list *svms) 2018 { 2019 struct kfd_process_device *pdd; 2020 struct kfd_process *p; 2021 int drain; 2022 uint32_t i; 2023 2024 p = container_of(svms, struct kfd_process, svms); 2025 2026 restart: 2027 drain = atomic_read(&svms->drain_pagefaults); 2028 if (!drain) 2029 return; 2030 2031 for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) { 2032 pdd = p->pdds[i]; 2033 if (!pdd) 2034 continue; 2035 2036 pr_debug("drain retry fault gpu %d svms %p\n", i, svms); 2037 2038 amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev, 2039 &pdd->dev->adev->irq.ih1); 2040 pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms); 2041 } 2042 if (atomic_cmpxchg(&svms->drain_pagefaults, drain, 0) != drain) 2043 goto restart; 2044 } 2045 2046 static void svm_range_deferred_list_work(struct work_struct *work) 2047 { 2048 struct svm_range_list *svms; 2049 struct svm_range *prange; 2050 struct mm_struct *mm; 2051 struct kfd_process *p; 2052 2053 svms = container_of(work, struct svm_range_list, deferred_list_work); 2054 pr_debug("enter svms 0x%p\n", svms); 2055 2056 p = container_of(svms, struct kfd_process, svms); 2057 /* Avoid mm is gone when inserting mmu notifier */ 2058 mm = get_task_mm(p->lead_thread); 2059 if (!mm) { 2060 pr_debug("svms 0x%p process mm gone\n", svms); 2061 return; 2062 } 2063 retry: 2064 mmap_write_lock(mm); 2065 2066 /* Checking for the need to drain retry faults must be inside 2067 * mmap write lock to serialize with munmap notifiers. 
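	 *
	 * Illustrative detail (based on the code in this file): the munmap
	 * notifier path, svm_range_unmap_from_cpu(), increments
	 * drain_pagefaults while the mmap lock is held, before queueing
	 * SVM_OP_UNMAP_RANGE work. Re-checking the counter here under the
	 * mmap write lock is intended to ensure the interrupt ring is
	 * drained before that work item frees the prange.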
2068 */ 2069 if (unlikely(atomic_read(&svms->drain_pagefaults))) { 2070 mmap_write_unlock(mm); 2071 svm_range_drain_retry_fault(svms); 2072 goto retry; 2073 } 2074 2075 spin_lock(&svms->deferred_list_lock); 2076 while (!list_empty(&svms->deferred_range_list)) { 2077 prange = list_first_entry(&svms->deferred_range_list, 2078 struct svm_range, deferred_list); 2079 list_del_init(&prange->deferred_list); 2080 spin_unlock(&svms->deferred_list_lock); 2081 2082 pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange, 2083 prange->start, prange->last, prange->work_item.op); 2084 2085 mutex_lock(&svms->lock); 2086 mutex_lock(&prange->migrate_mutex); 2087 while (!list_empty(&prange->child_list)) { 2088 struct svm_range *pchild; 2089 2090 pchild = list_first_entry(&prange->child_list, 2091 struct svm_range, child_list); 2092 pr_debug("child prange 0x%p op %d\n", pchild, 2093 pchild->work_item.op); 2094 list_del_init(&pchild->child_list); 2095 svm_range_handle_list_op(svms, pchild); 2096 } 2097 mutex_unlock(&prange->migrate_mutex); 2098 2099 svm_range_handle_list_op(svms, prange); 2100 mutex_unlock(&svms->lock); 2101 2102 spin_lock(&svms->deferred_list_lock); 2103 } 2104 spin_unlock(&svms->deferred_list_lock); 2105 2106 mmap_write_unlock(mm); 2107 mmput(mm); 2108 pr_debug("exit svms 0x%p\n", svms); 2109 } 2110 2111 void 2112 svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange, 2113 struct mm_struct *mm, enum svm_work_list_ops op) 2114 { 2115 spin_lock(&svms->deferred_list_lock); 2116 /* if prange is on the deferred list */ 2117 if (!list_empty(&prange->deferred_list)) { 2118 pr_debug("update exist prange 0x%p work op %d\n", prange, op); 2119 WARN_ONCE(prange->work_item.mm != mm, "unmatch mm\n"); 2120 if (op != SVM_OP_NULL && 2121 prange->work_item.op != SVM_OP_UNMAP_RANGE) 2122 prange->work_item.op = op; 2123 } else { 2124 prange->work_item.op = op; 2125 prange->work_item.mm = mm; 2126 list_add_tail(&prange->deferred_list, 2127 &prange->svms->deferred_range_list); 2128 pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n", 2129 prange, prange->start, prange->last, op); 2130 } 2131 spin_unlock(&svms->deferred_list_lock); 2132 } 2133 2134 void schedule_deferred_list_work(struct svm_range_list *svms) 2135 { 2136 spin_lock(&svms->deferred_list_lock); 2137 if (!list_empty(&svms->deferred_range_list)) 2138 schedule_work(&svms->deferred_list_work); 2139 spin_unlock(&svms->deferred_list_lock); 2140 } 2141 2142 static void 2143 svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent, 2144 struct svm_range *prange, unsigned long start, 2145 unsigned long last) 2146 { 2147 struct svm_range *head; 2148 struct svm_range *tail; 2149 2150 if (prange->work_item.op == SVM_OP_UNMAP_RANGE) { 2151 pr_debug("prange 0x%p [0x%lx 0x%lx] is already freed\n", prange, 2152 prange->start, prange->last); 2153 return; 2154 } 2155 if (start > prange->last || last < prange->start) 2156 return; 2157 2158 head = tail = prange; 2159 if (start > prange->start) 2160 svm_range_split(prange, prange->start, start - 1, &tail); 2161 if (last < tail->last) 2162 svm_range_split(tail, last + 1, tail->last, &head); 2163 2164 if (head != prange && tail != prange) { 2165 svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); 2166 svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE); 2167 } else if (tail != prange) { 2168 svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE); 2169 } else if (head != prange) { 2170 svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); 2171 } else if (parent 
!= prange) { 2172 prange->work_item.op = SVM_OP_UNMAP_RANGE; 2173 } 2174 } 2175 2176 static void 2177 svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange, 2178 unsigned long start, unsigned long last) 2179 { 2180 struct svm_range_list *svms; 2181 struct svm_range *pchild; 2182 struct kfd_process *p; 2183 unsigned long s, l; 2184 bool unmap_parent; 2185 2186 p = kfd_lookup_process_by_mm(mm); 2187 if (!p) 2188 return; 2189 svms = &p->svms; 2190 2191 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms, 2192 prange, prange->start, prange->last, start, last); 2193 2194 /* Make sure pending page faults are drained in the deferred worker 2195 * before the range is freed to avoid straggler interrupts on 2196 * unmapped memory causing "phantom faults". 2197 */ 2198 atomic_inc(&svms->drain_pagefaults); 2199 2200 unmap_parent = start <= prange->start && last >= prange->last; 2201 2202 list_for_each_entry(pchild, &prange->child_list, child_list) { 2203 mutex_lock_nested(&pchild->lock, 1); 2204 s = max(start, pchild->start); 2205 l = min(last, pchild->last); 2206 if (l >= s) 2207 svm_range_unmap_from_gpus(pchild, s, l); 2208 svm_range_unmap_split(mm, prange, pchild, start, last); 2209 mutex_unlock(&pchild->lock); 2210 } 2211 s = max(start, prange->start); 2212 l = min(last, prange->last); 2213 if (l >= s) 2214 svm_range_unmap_from_gpus(prange, s, l); 2215 svm_range_unmap_split(mm, prange, prange, start, last); 2216 2217 if (unmap_parent) 2218 svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE); 2219 else 2220 svm_range_add_list_work(svms, prange, mm, 2221 SVM_OP_UPDATE_RANGE_NOTIFIER); 2222 schedule_deferred_list_work(svms); 2223 2224 kfd_unref_process(p); 2225 } 2226 2227 /** 2228 * svm_range_cpu_invalidate_pagetables - interval notifier callback 2229 * @mni: mmu_interval_notifier struct 2230 * @range: mmu_notifier_range struct 2231 * @cur_seq: value to pass to mmu_interval_set_seq() 2232 * 2233 * If event is MMU_NOTIFY_UNMAP, this is from CPU unmap range, otherwise, it 2234 * is from migration, or CPU page invalidation callback. 2235 * 2236 * For unmap event, unmap range from GPUs, remove prange from svms in a delayed 2237 * work thread, and split prange if only part of prange is unmapped. 2238 * 2239 * For invalidation event, if GPU retry fault is not enabled, evict the queues, 2240 * then schedule svm_range_restore_work to update GPU mapping and resume queues. 2241 * If GPU retry fault is enabled, unmap the svm range from GPU, retry fault will 2242 * update GPU mapping to recover. 2243 * 2244 * Context: mmap lock, notifier_invalidate_start lock are held 2245 * for invalidate event, prange lock is held if this is from migration 2246 */ 2247 static bool 2248 svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, 2249 const struct mmu_notifier_range *range, 2250 unsigned long cur_seq) 2251 { 2252 struct svm_range *prange; 2253 unsigned long start; 2254 unsigned long last; 2255 2256 if (range->event == MMU_NOTIFY_RELEASE) 2257 return true; 2258 2259 start = mni->interval_tree.start; 2260 last = mni->interval_tree.last; 2261 start = (start > range->start ? start : range->start) >> PAGE_SHIFT; 2262 last = (last < (range->end - 1) ? 
last : range->end - 1) >> PAGE_SHIFT; 2263 pr_debug("[0x%lx 0x%lx] range[0x%lx 0x%lx] notifier[0x%lx 0x%lx] %d\n", 2264 start, last, range->start >> PAGE_SHIFT, 2265 (range->end - 1) >> PAGE_SHIFT, 2266 mni->interval_tree.start >> PAGE_SHIFT, 2267 mni->interval_tree.last >> PAGE_SHIFT, range->event); 2268 2269 prange = container_of(mni, struct svm_range, notifier); 2270 2271 svm_range_lock(prange); 2272 mmu_interval_set_seq(mni, cur_seq); 2273 2274 switch (range->event) { 2275 case MMU_NOTIFY_UNMAP: 2276 svm_range_unmap_from_cpu(mni->mm, prange, start, last); 2277 break; 2278 default: 2279 svm_range_evict(prange, mni->mm, start, last); 2280 break; 2281 } 2282 2283 svm_range_unlock(prange); 2284 2285 return true; 2286 } 2287 2288 /** 2289 * svm_range_from_addr - find svm range from fault address 2290 * @svms: svm range list header 2291 * @addr: address to search range interval tree, in pages 2292 * @parent: parent range if range is on child list 2293 * 2294 * Context: The caller must hold svms->lock 2295 * 2296 * Return: the svm_range found or NULL 2297 */ 2298 struct svm_range * 2299 svm_range_from_addr(struct svm_range_list *svms, unsigned long addr, 2300 struct svm_range **parent) 2301 { 2302 struct interval_tree_node *node; 2303 struct svm_range *prange; 2304 struct svm_range *pchild; 2305 2306 node = interval_tree_iter_first(&svms->objects, addr, addr); 2307 if (!node) 2308 return NULL; 2309 2310 prange = container_of(node, struct svm_range, it_node); 2311 pr_debug("address 0x%lx prange [0x%lx 0x%lx] node [0x%lx 0x%lx]\n", 2312 addr, prange->start, prange->last, node->start, node->last); 2313 2314 if (addr >= prange->start && addr <= prange->last) { 2315 if (parent) 2316 *parent = prange; 2317 return prange; 2318 } 2319 list_for_each_entry(pchild, &prange->child_list, child_list) 2320 if (addr >= pchild->start && addr <= pchild->last) { 2321 pr_debug("found address 0x%lx pchild [0x%lx 0x%lx]\n", 2322 addr, pchild->start, pchild->last); 2323 if (parent) 2324 *parent = prange; 2325 return pchild; 2326 } 2327 2328 return NULL; 2329 } 2330 2331 /* svm_range_best_restore_location - decide the best fault restore location 2332 * @prange: svm range structure 2333 * @adev: the GPU on which vm fault happened 2334 * 2335 * This is only called when xnack is on, to decide the best location to restore 2336 * the range mapping after GPU vm fault. Caller uses the best location to do 2337 * migration if actual loc is not best location, then update GPU page table 2338 * mapping to the best location. 2339 * 2340 * If the preferred loc is accessible by faulting GPU, use preferred loc. 2341 * If vm fault gpu idx is on range ACCESSIBLE bitmap, best_loc is vm fault gpu 2342 * If vm fault gpu idx is on range ACCESSIBLE_IN_PLACE bitmap, then 2343 * if range actual loc is cpu, best_loc is cpu 2344 * if vm fault gpu is on xgmi same hive of range actual loc gpu, best_loc is 2345 * range actual loc. 2346 * Otherwise, GPU no access, best_loc is -1. 
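 *
 * Worked example (illustrative only): if the fault comes from GPU A and the
 * range's preferred_loc is GPU B in the same XGMI hive as A, B is returned.
 * If A is set only in bitmap_aip and actual_loc is VRAM of a GPU outside A's
 * hive, 0 (CPU) is returned so the range is migrated to system memory.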
2347 * 2348 * Return: 2349 * -1 means vm fault GPU no access 2350 * 0 for CPU or GPU id 2351 */ 2352 static int32_t 2353 svm_range_best_restore_location(struct svm_range *prange, 2354 struct amdgpu_device *adev, 2355 int32_t *gpuidx) 2356 { 2357 struct amdgpu_device *bo_adev, *preferred_adev; 2358 struct kfd_process *p; 2359 uint32_t gpuid; 2360 int r; 2361 2362 p = container_of(prange->svms, struct kfd_process, svms); 2363 2364 r = kfd_process_gpuid_from_adev(p, adev, &gpuid, gpuidx); 2365 if (r < 0) { 2366 pr_debug("failed to get gpuid from kgd\n"); 2367 return -1; 2368 } 2369 2370 if (prange->preferred_loc == gpuid || 2371 prange->preferred_loc == KFD_IOCTL_SVM_LOCATION_SYSMEM) { 2372 return prange->preferred_loc; 2373 } else if (prange->preferred_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED) { 2374 preferred_adev = svm_range_get_adev_by_id(prange, 2375 prange->preferred_loc); 2376 if (amdgpu_xgmi_same_hive(adev, preferred_adev)) 2377 return prange->preferred_loc; 2378 /* fall through */ 2379 } 2380 2381 if (test_bit(*gpuidx, prange->bitmap_access)) 2382 return gpuid; 2383 2384 if (test_bit(*gpuidx, prange->bitmap_aip)) { 2385 if (!prange->actual_loc) 2386 return 0; 2387 2388 bo_adev = svm_range_get_adev_by_id(prange, prange->actual_loc); 2389 if (amdgpu_xgmi_same_hive(adev, bo_adev)) 2390 return prange->actual_loc; 2391 else 2392 return 0; 2393 } 2394 2395 return -1; 2396 } 2397 2398 static int 2399 svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr, 2400 unsigned long *start, unsigned long *last, 2401 bool *is_heap_stack) 2402 { 2403 struct vm_area_struct *vma; 2404 struct interval_tree_node *node; 2405 unsigned long start_limit, end_limit; 2406 2407 vma = find_vma(p->mm, addr << PAGE_SHIFT); 2408 if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) { 2409 pr_debug("VMA does not exist in address [0x%llx]\n", addr); 2410 return -EFAULT; 2411 } 2412 2413 *is_heap_stack = (vma->vm_start <= vma->vm_mm->brk && 2414 vma->vm_end >= vma->vm_mm->start_brk) || 2415 (vma->vm_start <= vma->vm_mm->start_stack && 2416 vma->vm_end >= vma->vm_mm->start_stack); 2417 2418 start_limit = max(vma->vm_start >> PAGE_SHIFT, 2419 (unsigned long)ALIGN_DOWN(addr, 2UL << 8)); 2420 end_limit = min(vma->vm_end >> PAGE_SHIFT, 2421 (unsigned long)ALIGN(addr + 1, 2UL << 8)); 2422 /* First range that starts after the fault address */ 2423 node = interval_tree_iter_first(&p->svms.objects, addr + 1, ULONG_MAX); 2424 if (node) { 2425 end_limit = min(end_limit, node->start); 2426 /* Last range that ends before the fault address */ 2427 node = container_of(rb_prev(&node->rb), 2428 struct interval_tree_node, rb); 2429 } else { 2430 /* Last range must end before addr because 2431 * there was no range after addr 2432 */ 2433 node = container_of(rb_last(&p->svms.objects.rb_root), 2434 struct interval_tree_node, rb); 2435 } 2436 if (node) { 2437 if (node->last >= addr) { 2438 WARN(1, "Overlap with prev node and page fault addr\n"); 2439 return -EFAULT; 2440 } 2441 start_limit = max(start_limit, node->last + 1); 2442 } 2443 2444 *start = start_limit; 2445 *last = end_limit - 1; 2446 2447 pr_debug("vma [0x%lx 0x%lx] range [0x%lx 0x%lx] is_heap_stack %d\n", 2448 vma->vm_start >> PAGE_SHIFT, vma->vm_end >> PAGE_SHIFT, 2449 *start, *last, *is_heap_stack); 2450 2451 return 0; 2452 } 2453 2454 static int 2455 svm_range_check_vm_userptr(struct kfd_process *p, uint64_t start, uint64_t last, 2456 uint64_t *bo_s, uint64_t *bo_l) 2457 { 2458 struct amdgpu_bo_va_mapping *mapping; 2459 struct interval_tree_node *node; 2460 struct 
amdgpu_bo *bo = NULL; 2461 unsigned long userptr; 2462 uint32_t i; 2463 int r; 2464 2465 for (i = 0; i < p->n_pdds; i++) { 2466 struct amdgpu_vm *vm; 2467 2468 if (!p->pdds[i]->drm_priv) 2469 continue; 2470 2471 vm = drm_priv_to_vm(p->pdds[i]->drm_priv); 2472 r = amdgpu_bo_reserve(vm->root.bo, false); 2473 if (r) 2474 return r; 2475 2476 /* Check userptr by searching entire vm->va interval tree */ 2477 node = interval_tree_iter_first(&vm->va, 0, ~0ULL); 2478 while (node) { 2479 mapping = container_of((struct rb_node *)node, 2480 struct amdgpu_bo_va_mapping, rb); 2481 bo = mapping->bo_va->base.bo; 2482 2483 if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, 2484 start << PAGE_SHIFT, 2485 last << PAGE_SHIFT, 2486 &userptr)) { 2487 node = interval_tree_iter_next(node, 0, ~0ULL); 2488 continue; 2489 } 2490 2491 pr_debug("[0x%llx 0x%llx] already userptr mapped\n", 2492 start, last); 2493 if (bo_s && bo_l) { 2494 *bo_s = userptr >> PAGE_SHIFT; 2495 *bo_l = *bo_s + bo->tbo.ttm->num_pages - 1; 2496 } 2497 amdgpu_bo_unreserve(vm->root.bo); 2498 return -EADDRINUSE; 2499 } 2500 amdgpu_bo_unreserve(vm->root.bo); 2501 } 2502 return 0; 2503 } 2504 2505 static struct 2506 svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev, 2507 struct kfd_process *p, 2508 struct mm_struct *mm, 2509 int64_t addr) 2510 { 2511 struct svm_range *prange = NULL; 2512 unsigned long start, last; 2513 uint32_t gpuid, gpuidx; 2514 bool is_heap_stack; 2515 uint64_t bo_s = 0; 2516 uint64_t bo_l = 0; 2517 int r; 2518 2519 if (svm_range_get_range_boundaries(p, addr, &start, &last, 2520 &is_heap_stack)) 2521 return NULL; 2522 2523 r = svm_range_check_vm(p, start, last, &bo_s, &bo_l); 2524 if (r != -EADDRINUSE) 2525 r = svm_range_check_vm_userptr(p, start, last, &bo_s, &bo_l); 2526 2527 if (r == -EADDRINUSE) { 2528 if (addr >= bo_s && addr <= bo_l) 2529 return NULL; 2530 2531 /* Create one page svm range if 2MB range overlapping */ 2532 start = addr; 2533 last = addr; 2534 } 2535 2536 prange = svm_range_new(&p->svms, start, last); 2537 if (!prange) { 2538 pr_debug("Failed to create prange in address [0x%llx]\n", addr); 2539 return NULL; 2540 } 2541 if (kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpuidx)) { 2542 pr_debug("failed to get gpuid from kgd\n"); 2543 svm_range_free(prange); 2544 return NULL; 2545 } 2546 2547 if (is_heap_stack) 2548 prange->preferred_loc = KFD_IOCTL_SVM_LOCATION_SYSMEM; 2549 2550 svm_range_add_to_svms(prange); 2551 svm_range_add_notifier_locked(mm, prange); 2552 2553 return prange; 2554 } 2555 2556 /* svm_range_skip_recover - decide if prange can be recovered 2557 * @prange: svm range structure 2558 * 2559 * GPU vm retry fault handle skip recover the range for cases: 2560 * 1. prange is on deferred list to be removed after unmap, it is stale fault, 2561 * deferred list work will drain the stale fault before free the prange. 2562 * 2. prange is on deferred list to add interval notifier after split, or 2563 * 3. prange is child range, it is split from parent prange, recover later 2564 * after interval notifier is added. 
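 *
 * For example (illustrative): a stale retry fault that arrives after munmap
 * finds the prange still queued with SVM_OP_UNMAP_RANGE and is dropped here;
 * the deferred list work later drains the interrupt ring and frees the range.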
2565 * 2566 * Return: true to skip recover, false to recover 2567 */ 2568 static bool svm_range_skip_recover(struct svm_range *prange) 2569 { 2570 struct svm_range_list *svms = prange->svms; 2571 2572 spin_lock(&svms->deferred_list_lock); 2573 if (list_empty(&prange->deferred_list) && 2574 list_empty(&prange->child_list)) { 2575 spin_unlock(&svms->deferred_list_lock); 2576 return false; 2577 } 2578 spin_unlock(&svms->deferred_list_lock); 2579 2580 if (prange->work_item.op == SVM_OP_UNMAP_RANGE) { 2581 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] unmapped\n", 2582 svms, prange, prange->start, prange->last); 2583 return true; 2584 } 2585 if (prange->work_item.op == SVM_OP_ADD_RANGE_AND_MAP || 2586 prange->work_item.op == SVM_OP_ADD_RANGE) { 2587 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not added yet\n", 2588 svms, prange, prange->start, prange->last); 2589 return true; 2590 } 2591 return false; 2592 } 2593 2594 static void 2595 svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p, 2596 int32_t gpuidx) 2597 { 2598 struct kfd_process_device *pdd; 2599 2600 /* fault is on different page of same range 2601 * or fault is skipped to recover later 2602 * or fault is on invalid virtual address 2603 */ 2604 if (gpuidx == MAX_GPU_INSTANCE) { 2605 uint32_t gpuid; 2606 int r; 2607 2608 r = kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpuidx); 2609 if (r < 0) 2610 return; 2611 } 2612 2613 /* fault is recovered 2614 * or fault cannot recover because GPU no access on the range 2615 */ 2616 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 2617 if (pdd) 2618 WRITE_ONCE(pdd->faults, pdd->faults + 1); 2619 } 2620 2621 static bool 2622 svm_fault_allowed(struct vm_area_struct *vma, bool write_fault) 2623 { 2624 unsigned long requested = VM_READ; 2625 2626 if (write_fault) 2627 requested |= VM_WRITE; 2628 2629 pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested, 2630 vma->vm_flags); 2631 return (vma->vm_flags & requested) == requested; 2632 } 2633 2634 int 2635 svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, 2636 uint64_t addr, bool write_fault) 2637 { 2638 struct mm_struct *mm = NULL; 2639 struct svm_range_list *svms; 2640 struct svm_range *prange; 2641 struct kfd_process *p; 2642 uint64_t timestamp; 2643 int32_t best_loc; 2644 int32_t gpuidx = MAX_GPU_INSTANCE; 2645 bool write_locked = false; 2646 struct vm_area_struct *vma; 2647 int r = 0; 2648 2649 if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) { 2650 pr_debug("device does not support SVM\n"); 2651 return -EFAULT; 2652 } 2653 2654 p = kfd_lookup_process_by_pasid(pasid); 2655 if (!p) { 2656 pr_debug("kfd process not founded pasid 0x%x\n", pasid); 2657 return 0; 2658 } 2659 if (!p->xnack_enabled) { 2660 pr_debug("XNACK not enabled for pasid 0x%x\n", pasid); 2661 r = -EFAULT; 2662 goto out; 2663 } 2664 svms = &p->svms; 2665 2666 pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr); 2667 2668 if (atomic_read(&svms->drain_pagefaults)) { 2669 pr_debug("draining retry fault, drop fault 0x%llx\n", addr); 2670 r = 0; 2671 goto out; 2672 } 2673 2674 /* p->lead_thread is available as kfd_process_wq_release flush the work 2675 * before releasing task ref. 
2676 */ 2677 mm = get_task_mm(p->lead_thread); 2678 if (!mm) { 2679 pr_debug("svms 0x%p failed to get mm\n", svms); 2680 r = 0; 2681 goto out; 2682 } 2683 2684 mmap_read_lock(mm); 2685 retry_write_locked: 2686 mutex_lock(&svms->lock); 2687 prange = svm_range_from_addr(svms, addr, NULL); 2688 if (!prange) { 2689 pr_debug("failed to find prange svms 0x%p address [0x%llx]\n", 2690 svms, addr); 2691 if (!write_locked) { 2692 /* Need the write lock to create new range with MMU notifier. 2693 * Also flush pending deferred work to make sure the interval 2694 * tree is up to date before we add a new range 2695 */ 2696 mutex_unlock(&svms->lock); 2697 mmap_read_unlock(mm); 2698 mmap_write_lock(mm); 2699 write_locked = true; 2700 goto retry_write_locked; 2701 } 2702 prange = svm_range_create_unregistered_range(adev, p, mm, addr); 2703 if (!prange) { 2704 pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n", 2705 svms, addr); 2706 mmap_write_downgrade(mm); 2707 r = -EFAULT; 2708 goto out_unlock_svms; 2709 } 2710 } 2711 if (write_locked) 2712 mmap_write_downgrade(mm); 2713 2714 mutex_lock(&prange->migrate_mutex); 2715 2716 if (svm_range_skip_recover(prange)) { 2717 amdgpu_gmc_filter_faults_remove(adev, addr, pasid); 2718 r = 0; 2719 goto out_unlock_range; 2720 } 2721 2722 timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp; 2723 /* skip duplicate vm fault on different pages of same range */ 2724 if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) { 2725 pr_debug("svms 0x%p [0x%lx %lx] already restored\n", 2726 svms, prange->start, prange->last); 2727 r = 0; 2728 goto out_unlock_range; 2729 } 2730 2731 /* __do_munmap removed VMA, return success as we are handling stale 2732 * retry fault. 2733 */ 2734 vma = find_vma(mm, addr << PAGE_SHIFT); 2735 if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) { 2736 pr_debug("address 0x%llx VMA is removed\n", addr); 2737 r = 0; 2738 goto out_unlock_range; 2739 } 2740 2741 if (!svm_fault_allowed(vma, write_fault)) { 2742 pr_debug("fault addr 0x%llx no %s permission\n", addr, 2743 write_fault ? 
"write" : "read"); 2744 r = -EPERM; 2745 goto out_unlock_range; 2746 } 2747 2748 best_loc = svm_range_best_restore_location(prange, adev, &gpuidx); 2749 if (best_loc == -1) { 2750 pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n", 2751 svms, prange->start, prange->last); 2752 r = -EACCES; 2753 goto out_unlock_range; 2754 } 2755 2756 pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n", 2757 svms, prange->start, prange->last, best_loc, 2758 prange->actual_loc); 2759 2760 if (prange->actual_loc != best_loc) { 2761 if (best_loc) { 2762 r = svm_migrate_to_vram(prange, best_loc, mm); 2763 if (r) { 2764 pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n", 2765 r, addr); 2766 /* Fallback to system memory if migration to 2767 * VRAM failed 2768 */ 2769 if (prange->actual_loc) 2770 r = svm_migrate_vram_to_ram(prange, mm); 2771 else 2772 r = 0; 2773 } 2774 } else { 2775 r = svm_migrate_vram_to_ram(prange, mm); 2776 } 2777 if (r) { 2778 pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n", 2779 r, svms, prange->start, prange->last); 2780 goto out_unlock_range; 2781 } 2782 } 2783 2784 r = svm_range_validate_and_map(mm, prange, gpuidx, false, false); 2785 if (r) 2786 pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n", 2787 r, svms, prange->start, prange->last); 2788 2789 out_unlock_range: 2790 mutex_unlock(&prange->migrate_mutex); 2791 out_unlock_svms: 2792 mutex_unlock(&svms->lock); 2793 mmap_read_unlock(mm); 2794 2795 svm_range_count_fault(adev, p, gpuidx); 2796 2797 mmput(mm); 2798 out: 2799 kfd_unref_process(p); 2800 2801 if (r == -EAGAIN) { 2802 pr_debug("recover vm fault later\n"); 2803 amdgpu_gmc_filter_faults_remove(adev, addr, pasid); 2804 r = 0; 2805 } 2806 return r; 2807 } 2808 2809 void svm_range_list_fini(struct kfd_process *p) 2810 { 2811 struct svm_range *prange; 2812 struct svm_range *next; 2813 2814 pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms); 2815 2816 /* Ensure list work is finished before process is destroyed */ 2817 flush_work(&p->svms.deferred_list_work); 2818 2819 /* 2820 * Ensure no retry fault comes in afterwards, as page fault handler will 2821 * not find kfd process and take mm lock to recover fault. 
2822 */ 2823 atomic_inc(&p->svms.drain_pagefaults); 2824 svm_range_drain_retry_fault(&p->svms); 2825 2826 2827 list_for_each_entry_safe(prange, next, &p->svms.list, list) { 2828 svm_range_unlink(prange); 2829 svm_range_remove_notifier(prange); 2830 svm_range_free(prange); 2831 } 2832 2833 mutex_destroy(&p->svms.lock); 2834 2835 pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms); 2836 } 2837 2838 int svm_range_list_init(struct kfd_process *p) 2839 { 2840 struct svm_range_list *svms = &p->svms; 2841 int i; 2842 2843 svms->objects = RB_ROOT_CACHED; 2844 mutex_init(&svms->lock); 2845 INIT_LIST_HEAD(&svms->list); 2846 atomic_set(&svms->evicted_ranges, 0); 2847 atomic_set(&svms->drain_pagefaults, 0); 2848 INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work); 2849 INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work); 2850 INIT_LIST_HEAD(&svms->deferred_range_list); 2851 spin_lock_init(&svms->deferred_list_lock); 2852 2853 for (i = 0; i < p->n_pdds; i++) 2854 if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev)) 2855 bitmap_set(svms->bitmap_supported, i, 1); 2856 2857 return 0; 2858 } 2859 2860 /** 2861 * svm_range_check_vm - check if virtual address range mapped already 2862 * @p: current kfd_process 2863 * @start: range start address, in pages 2864 * @last: range last address, in pages 2865 * @bo_s: mapping start address in pages if address range already mapped 2866 * @bo_l: mapping last address in pages if address range already mapped 2867 * 2868 * The purpose is to avoid virtual address ranges already allocated by 2869 * kfd_ioctl_alloc_memory_of_gpu ioctl. 2870 * It looks for each pdd in the kfd_process. 2871 * 2872 * Context: Process context 2873 * 2874 * Return 0 - OK, if the range is not mapped. 2875 * Otherwise error code: 2876 * -EADDRINUSE - if address is mapped already by kfd_ioctl_alloc_memory_of_gpu 2877 * -ERESTARTSYS - A wait for the buffer to become unreserved was interrupted by 2878 * a signal. Release all buffer reservations and return to user-space. 
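 *
 * Typical use in this file (illustrative): svm_range_is_valid() and
 * svm_range_create_unregistered_range() call this with page-based start/last;
 * when bo_s/bo_l are provided and -EADDRINUSE is returned, they describe the
 * conflicting mapping so the caller can avoid or shrink the new range.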
 */
static int
svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
		   uint64_t *bo_s, uint64_t *bo_l)
{
	struct amdgpu_bo_va_mapping *mapping;
	struct interval_tree_node *node;
	uint32_t i;
	int r;

	for (i = 0; i < p->n_pdds; i++) {
		struct amdgpu_vm *vm;

		if (!p->pdds[i]->drm_priv)
			continue;

		vm = drm_priv_to_vm(p->pdds[i]->drm_priv);
		r = amdgpu_bo_reserve(vm->root.bo, false);
		if (r)
			return r;

		node = interval_tree_iter_first(&vm->va, start, last);
		if (node) {
			pr_debug("range [0x%llx 0x%llx] already TTM mapped\n",
				 start, last);
			mapping = container_of((struct rb_node *)node,
					       struct amdgpu_bo_va_mapping, rb);
			if (bo_s && bo_l) {
				*bo_s = mapping->start;
				*bo_l = mapping->last;
			}
			amdgpu_bo_unreserve(vm->root.bo);
			return -EADDRINUSE;
		}
		amdgpu_bo_unreserve(vm->root.bo);
	}

	return 0;
}

/**
 * svm_range_is_valid - check if virtual address range is valid
 * @p: current kfd_process
 * @start: range start address, in pages
 * @size: range size, in pages
 *
 * A virtual address range is valid if it is entirely covered by VMAs that are
 * not device mappings (VM_IO, VM_PFNMAP, VM_MIXEDMAP).
 *
 * Context: Process context
 *
 * Return:
 * 0 - OK, otherwise error code
 */
static int
svm_range_is_valid(struct kfd_process *p, uint64_t start, uint64_t size)
{
	const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
	struct vm_area_struct *vma;
	unsigned long end;
	unsigned long start_unchg = start;

	start <<= PAGE_SHIFT;
	end = start + (size << PAGE_SHIFT);
	do {
		vma = find_vma(p->mm, start);
		if (!vma || start < vma->vm_start ||
		    (vma->vm_flags & device_vma))
			return -EFAULT;
		start = min(end, vma->vm_end);
	} while (start < end);

	return svm_range_check_vm(p, start_unchg, (end - 1) >> PAGE_SHIFT, NULL,
				  NULL);
}

/**
 * svm_range_best_prefetch_location - decide the best prefetch location
 * @prange: svm range structure
 *
 * For xnack off:
 * If the range maps to a single GPU, the best prefetch location is
 * prefetch_loc, which can be CPU or GPU.
 *
 * If the range is ACCESS or ACCESS_IN_PLACE by multiple GPUs, the best
 * prefetch location is the prefetch_loc GPU only if every such GPU is in the
 * same XGMI hive as the prefetch_loc GPU; otherwise the best prefetch
 * location is always CPU, because a GPU cannot have a coherent mapping of
 * another GPU's VRAM, even with a large-BAR PCIe connection.
 *
 * For xnack on:
 * If the range is not ACCESS_IN_PLACE by multiple GPUs, the best prefetch
 * location is prefetch_loc; access from another GPU will generate a vm fault
 * and trigger migration.
 *
 * If the range is ACCESS_IN_PLACE by multiple GPUs, the best prefetch
 * location is the prefetch_loc GPU only if every such GPU is in the same
 * XGMI hive as the prefetch_loc GPU; otherwise the best prefetch location is
 * always CPU.
 *
 * Context: Process context
 *
 * Return:
 * 0 for CPU or GPU id
 */
static uint32_t
svm_range_best_prefetch_location(struct svm_range *prange)
{
	DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
	uint32_t best_loc = prange->prefetch_loc;
	struct kfd_process_device *pdd;
	struct amdgpu_device *bo_adev;
	struct kfd_process *p;
	uint32_t gpuidx;

	p = container_of(prange->svms, struct kfd_process, svms);

	if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED)
		goto out;

	bo_adev = svm_range_get_adev_by_id(prange, best_loc);
	if (!bo_adev) {
		WARN_ONCE(1, "failed to get device by id 0x%x\n", best_loc);
		best_loc = 0;
		goto out;
	}

	if (p->xnack_enabled)
		bitmap_copy(bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
	else
		bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
			  MAX_GPU_INSTANCE);

	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
		if (!pdd) {
			pr_debug("failed to get device by idx 0x%x\n", gpuidx);
			continue;
		}

		if (pdd->dev->adev == bo_adev)
			continue;

		if (!amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) {
			best_loc = 0;
			break;
		}
	}

out:
	pr_debug("xnack %d svms 0x%p [0x%lx 0x%lx] best loc 0x%x\n",
		 p->xnack_enabled, &p->svms, prange->start, prange->last,
		 best_loc);

	return best_loc;
}

/* FIXME: This is a workaround for page locking bug when some pages are
 * invalid during migration to VRAM
 */
void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm,
			void *owner)
{
	struct hmm_range *hmm_range;
	int r;

	if (prange->validated_once)
		return;

	r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL,
				       prange->start << PAGE_SHIFT,
				       prange->npages, &hmm_range,
				       false, true, owner);
	if (!r) {
		amdgpu_hmm_range_get_pages_done(hmm_range);
		prange->validated_once = true;
	}
}

/* svm_range_trigger_migration - start page migration if prefetch loc changed
 * @mm: current process mm_struct
 * @prange: svm range structure
 * @migrated: output, true if migration is triggered
 *
 * If the range's prefetch_loc is a GPU and the actual loc is CPU (0), migrate
 * the range from RAM to VRAM.
 * If the range's prefetch_loc is CPU (0) and the actual loc is a GPU, migrate
 * the range from VRAM to RAM.
 *
 * If GPU vm fault retry is not enabled, migration interacts with the MMU
 * notifier and the restore work as follows:
 * 1. migrate_vma_setup invalidates pages, the MMU notifier callback
 *    svm_range_evict stops all queues and schedules the restore work
 * 2. svm_range_restore_work waits for the migration to finish because
 *    a. svm_range_validate_vram takes prange->migrate_mutex
 *    b. svm_range_validate_ram waits in HMM get pages until the CPU fault
 *       handler returns
 * 3. the restore work updates the GPU mappings and resumes all queues.
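 *
 * Example (illustrative): setting prefetch_loc to a GPU while actual_loc is
 * 0 (system memory) triggers svm_migrate_to_vram(); clearing prefetch_loc
 * back to 0 later triggers svm_migrate_vram_to_ram().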
3072 * 3073 * Context: Process context 3074 * 3075 * Return: 3076 * 0 - OK, otherwise - error code of migration 3077 */ 3078 static int 3079 svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange, 3080 bool *migrated) 3081 { 3082 uint32_t best_loc; 3083 int r = 0; 3084 3085 *migrated = false; 3086 best_loc = svm_range_best_prefetch_location(prange); 3087 3088 if (best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED || 3089 best_loc == prange->actual_loc) 3090 return 0; 3091 3092 if (!best_loc) { 3093 r = svm_migrate_vram_to_ram(prange, mm); 3094 *migrated = !r; 3095 return r; 3096 } 3097 3098 r = svm_migrate_to_vram(prange, best_loc, mm); 3099 *migrated = !r; 3100 3101 return r; 3102 } 3103 3104 int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence) 3105 { 3106 if (!fence) 3107 return -EINVAL; 3108 3109 if (dma_fence_is_signaled(&fence->base)) 3110 return 0; 3111 3112 if (fence->svm_bo) { 3113 WRITE_ONCE(fence->svm_bo->evicting, 1); 3114 schedule_work(&fence->svm_bo->eviction_work); 3115 } 3116 3117 return 0; 3118 } 3119 3120 static void svm_range_evict_svm_bo_worker(struct work_struct *work) 3121 { 3122 struct svm_range_bo *svm_bo; 3123 struct kfd_process *p; 3124 struct mm_struct *mm; 3125 3126 svm_bo = container_of(work, struct svm_range_bo, eviction_work); 3127 if (!svm_bo_ref_unless_zero(svm_bo)) 3128 return; /* svm_bo was freed while eviction was pending */ 3129 3130 /* svm_range_bo_release destroys this worker thread. So during 3131 * the lifetime of this thread, kfd_process and mm will be valid. 3132 */ 3133 p = container_of(svm_bo->svms, struct kfd_process, svms); 3134 mm = p->mm; 3135 if (!mm) 3136 return; 3137 3138 mmap_read_lock(mm); 3139 spin_lock(&svm_bo->list_lock); 3140 while (!list_empty(&svm_bo->range_list)) { 3141 struct svm_range *prange = 3142 list_first_entry(&svm_bo->range_list, 3143 struct svm_range, svm_bo_list); 3144 int retries = 3; 3145 3146 list_del_init(&prange->svm_bo_list); 3147 spin_unlock(&svm_bo->list_lock); 3148 3149 pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, 3150 prange->start, prange->last); 3151 3152 mutex_lock(&prange->migrate_mutex); 3153 do { 3154 svm_migrate_vram_to_ram(prange, 3155 svm_bo->eviction_fence->mm); 3156 } while (prange->actual_loc && --retries); 3157 WARN(prange->actual_loc, "Migration failed during eviction"); 3158 3159 mutex_lock(&prange->lock); 3160 prange->svm_bo = NULL; 3161 mutex_unlock(&prange->lock); 3162 3163 mutex_unlock(&prange->migrate_mutex); 3164 3165 spin_lock(&svm_bo->list_lock); 3166 } 3167 spin_unlock(&svm_bo->list_lock); 3168 mmap_read_unlock(mm); 3169 3170 dma_fence_signal(&svm_bo->eviction_fence->base); 3171 /* This is the last reference to svm_bo, after svm_range_vram_node_free 3172 * has been called in svm_migrate_vram_to_ram 3173 */ 3174 WARN_ONCE(kref_read(&svm_bo->kref) != 1, "This was not the last reference\n"); 3175 svm_range_bo_unref(svm_bo); 3176 } 3177 3178 static int 3179 svm_range_set_attr(struct kfd_process *p, uint64_t start, uint64_t size, 3180 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 3181 { 3182 struct mm_struct *mm = current->mm; 3183 struct list_head update_list; 3184 struct list_head insert_list; 3185 struct list_head remove_list; 3186 struct svm_range_list *svms; 3187 struct svm_range *prange; 3188 struct svm_range *next; 3189 int r = 0; 3190 3191 pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n", 3192 p->pasid, &p->svms, start, start + size - 1, size); 3193 3194 r = svm_range_check_attr(p, nattr, attrs); 3195 if (r) 3196 return r; 
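	/*
	 * Illustrative walk-through of the update below (not a spec): assume
	 * svms already holds [0x100 0x1ff] and this ioctl sets attributes on
	 * [0x180 0x27f]. svm_range_add() clones [0x100 0x1ff], splits the
	 * clone so that [0x100 0x17f] keeps the old attributes and
	 * [0x180 0x1ff] gets updated, creates a new range [0x200 0x27f], and
	 * puts the original range on remove_list. Only if all of that
	 * succeeds are the clones inserted and the original freed.
	 */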
3197 3198 svms = &p->svms; 3199 3200 svm_range_list_lock_and_flush_work(svms, mm); 3201 3202 r = svm_range_is_valid(p, start, size); 3203 if (r) { 3204 pr_debug("invalid range r=%d\n", r); 3205 mmap_write_unlock(mm); 3206 goto out; 3207 } 3208 3209 mutex_lock(&svms->lock); 3210 3211 /* Add new range and split existing ranges as needed */ 3212 r = svm_range_add(p, start, size, nattr, attrs, &update_list, 3213 &insert_list, &remove_list); 3214 if (r) { 3215 mutex_unlock(&svms->lock); 3216 mmap_write_unlock(mm); 3217 goto out; 3218 } 3219 /* Apply changes as a transaction */ 3220 list_for_each_entry_safe(prange, next, &insert_list, insert_list) { 3221 svm_range_add_to_svms(prange); 3222 svm_range_add_notifier_locked(mm, prange); 3223 } 3224 list_for_each_entry(prange, &update_list, update_list) { 3225 svm_range_apply_attrs(p, prange, nattr, attrs); 3226 /* TODO: unmap ranges from GPU that lost access */ 3227 } 3228 list_for_each_entry_safe(prange, next, &remove_list, 3229 remove_list) { 3230 pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n", 3231 prange->svms, prange, prange->start, 3232 prange->last); 3233 svm_range_unlink(prange); 3234 svm_range_remove_notifier(prange); 3235 svm_range_free(prange); 3236 } 3237 3238 mmap_write_downgrade(mm); 3239 /* Trigger migrations and revalidate and map to GPUs as needed. If 3240 * this fails we may be left with partially completed actions. There 3241 * is no clean way of rolling back to the previous state in such a 3242 * case because the rollback wouldn't be guaranteed to work either. 3243 */ 3244 list_for_each_entry(prange, &update_list, update_list) { 3245 bool migrated; 3246 3247 mutex_lock(&prange->migrate_mutex); 3248 3249 r = svm_range_trigger_migration(mm, prange, &migrated); 3250 if (r) 3251 goto out_unlock_range; 3252 3253 if (migrated && !p->xnack_enabled) { 3254 pr_debug("restore_work will update mappings of GPUs\n"); 3255 mutex_unlock(&prange->migrate_mutex); 3256 continue; 3257 } 3258 3259 r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE, 3260 true, true); 3261 if (r) 3262 pr_debug("failed %d to map svm range\n", r); 3263 3264 out_unlock_range: 3265 mutex_unlock(&prange->migrate_mutex); 3266 if (r) 3267 break; 3268 } 3269 3270 svm_range_debug_dump(svms); 3271 3272 mutex_unlock(&svms->lock); 3273 mmap_read_unlock(mm); 3274 out: 3275 pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid, 3276 &p->svms, start, start + size - 1, r); 3277 3278 return r; 3279 } 3280 3281 static int 3282 svm_range_get_attr(struct kfd_process *p, uint64_t start, uint64_t size, 3283 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 3284 { 3285 DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE); 3286 DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE); 3287 bool get_preferred_loc = false; 3288 bool get_prefetch_loc = false; 3289 bool get_granularity = false; 3290 bool get_accessible = false; 3291 bool get_flags = false; 3292 uint64_t last = start + size - 1UL; 3293 struct mm_struct *mm = current->mm; 3294 uint8_t granularity = 0xff; 3295 struct interval_tree_node *node; 3296 struct svm_range_list *svms; 3297 struct svm_range *prange; 3298 uint32_t prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3299 uint32_t location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3300 uint32_t flags_and = 0xffffffff; 3301 uint32_t flags_or = 0; 3302 int gpuidx; 3303 uint32_t i; 3304 int r = 0; 3305 3306 pr_debug("svms 0x%p [0x%llx 0x%llx] nattr 0x%x\n", &p->svms, start, 3307 start + size - 1, nattr); 3308 3309 /* Flush pending deferred work to avoid racing with 
deferred actions from 3310 * previous memory map changes (e.g. munmap). Concurrent memory map changes 3311 * can still race with get_attr because we don't hold the mmap lock. But that 3312 * would be a race condition in the application anyway, and undefined 3313 * behaviour is acceptable in that case. 3314 */ 3315 flush_work(&p->svms.deferred_list_work); 3316 3317 mmap_read_lock(mm); 3318 r = svm_range_is_valid(p, start, size); 3319 mmap_read_unlock(mm); 3320 if (r) { 3321 pr_debug("invalid range r=%d\n", r); 3322 return r; 3323 } 3324 3325 for (i = 0; i < nattr; i++) { 3326 switch (attrs[i].type) { 3327 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 3328 get_preferred_loc = true; 3329 break; 3330 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 3331 get_prefetch_loc = true; 3332 break; 3333 case KFD_IOCTL_SVM_ATTR_ACCESS: 3334 get_accessible = true; 3335 break; 3336 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 3337 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 3338 get_flags = true; 3339 break; 3340 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 3341 get_granularity = true; 3342 break; 3343 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 3344 case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 3345 fallthrough; 3346 default: 3347 pr_debug("get invalid attr type 0x%x\n", attrs[i].type); 3348 return -EINVAL; 3349 } 3350 } 3351 3352 svms = &p->svms; 3353 3354 mutex_lock(&svms->lock); 3355 3356 node = interval_tree_iter_first(&svms->objects, start, last); 3357 if (!node) { 3358 pr_debug("range attrs not found return default values\n"); 3359 svm_range_set_default_attributes(&location, &prefetch_loc, 3360 &granularity, &flags_and); 3361 flags_or = flags_and; 3362 if (p->xnack_enabled) 3363 bitmap_copy(bitmap_access, svms->bitmap_supported, 3364 MAX_GPU_INSTANCE); 3365 else 3366 bitmap_zero(bitmap_access, MAX_GPU_INSTANCE); 3367 bitmap_zero(bitmap_aip, MAX_GPU_INSTANCE); 3368 goto fill_values; 3369 } 3370 bitmap_copy(bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE); 3371 bitmap_copy(bitmap_aip, svms->bitmap_supported, MAX_GPU_INSTANCE); 3372 3373 while (node) { 3374 struct interval_tree_node *next; 3375 3376 prange = container_of(node, struct svm_range, it_node); 3377 next = interval_tree_iter_next(node, start, last); 3378 3379 if (get_preferred_loc) { 3380 if (prange->preferred_loc == 3381 KFD_IOCTL_SVM_LOCATION_UNDEFINED || 3382 (location != KFD_IOCTL_SVM_LOCATION_UNDEFINED && 3383 location != prange->preferred_loc)) { 3384 location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3385 get_preferred_loc = false; 3386 } else { 3387 location = prange->preferred_loc; 3388 } 3389 } 3390 if (get_prefetch_loc) { 3391 if (prange->prefetch_loc == 3392 KFD_IOCTL_SVM_LOCATION_UNDEFINED || 3393 (prefetch_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED && 3394 prefetch_loc != prange->prefetch_loc)) { 3395 prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3396 get_prefetch_loc = false; 3397 } else { 3398 prefetch_loc = prange->prefetch_loc; 3399 } 3400 } 3401 if (get_accessible) { 3402 bitmap_and(bitmap_access, bitmap_access, 3403 prange->bitmap_access, MAX_GPU_INSTANCE); 3404 bitmap_and(bitmap_aip, bitmap_aip, 3405 prange->bitmap_aip, MAX_GPU_INSTANCE); 3406 } 3407 if (get_flags) { 3408 flags_and &= prange->flags; 3409 flags_or |= prange->flags; 3410 } 3411 3412 if (get_granularity && prange->granularity < granularity) 3413 granularity = prange->granularity; 3414 3415 node = next; 3416 } 3417 fill_values: 3418 mutex_unlock(&svms->lock); 3419 3420 for (i = 0; i < nattr; i++) { 3421 switch (attrs[i].type) { 3422 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 3423 attrs[i].value = location; 3424 break; 
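		/*
		 * The values reported here were aggregated over all
		 * overlapping ranges in the loop above: locations collapse to
		 * UNDEFINED on mismatch, SET/CLR flags come from the AND/OR
		 * reductions, and granularity is the smallest seen.
		 */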
		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
			attrs[i].value = prefetch_loc;
			break;
		case KFD_IOCTL_SVM_ATTR_ACCESS:
			gpuidx = kfd_process_gpuidx_from_gpuid(p,
							       attrs[i].value);
			if (gpuidx < 0) {
				pr_debug("invalid gpuid %x\n", attrs[i].value);
				return -EINVAL;
			}
			if (test_bit(gpuidx, bitmap_access))
				attrs[i].type = KFD_IOCTL_SVM_ATTR_ACCESS;
			else if (test_bit(gpuidx, bitmap_aip))
				attrs[i].type =
					KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE;
			else
				attrs[i].type = KFD_IOCTL_SVM_ATTR_NO_ACCESS;
			break;
		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
			attrs[i].value = flags_and;
			break;
		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
			attrs[i].value = ~flags_or;
			break;
		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
			attrs[i].value = (uint32_t)granularity;
			break;
		}
	}

	return 0;
}

int
svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,
	  uint64_t size, uint32_t nattrs, struct kfd_ioctl_svm_attribute *attrs)
{
	int r;

	start >>= PAGE_SHIFT;
	size >>= PAGE_SHIFT;

	switch (op) {
	case KFD_IOCTL_SVM_OP_SET_ATTR:
		r = svm_range_set_attr(p, start, size, nattrs, attrs);
		break;
	case KFD_IOCTL_SVM_OP_GET_ATTR:
		r = svm_range_get_attr(p, start, size, nattrs, attrs);
		break;
	default:
		r = -EINVAL;
		break;
	}

	return r;
}
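/*
 * Hedged usage sketch, not part of the driver: how user space typically
 * reaches svm_ioctl() above. The wrapper names (struct kfd_ioctl_svm_args,
 * AMDKFD_IOC_SVM, /dev/kfd) come from the uapi header and are assumptions
 * here, not defined in this file.
 *
 *	struct kfd_ioctl_svm_attribute attr = {
 *		.type = KFD_IOCTL_SVM_ATTR_PREFETCH_LOC,
 *		.value = gpu_id,		// 0 selects system memory
 *	};
 *	struct kfd_ioctl_svm_args *args;
 *
 *	args = calloc(1, sizeof(*args) + sizeof(attr));
 *	args->start_addr = (uintptr_t)buf;	// page aligned, in bytes
 *	args->size = buf_size;			// page aligned, in bytes
 *	args->op = KFD_IOCTL_SVM_OP_SET_ATTR;
 *	args->nattr = 1;
 *	memcpy(args->attrs, &attr, sizeof(attr));
 *	ioctl(kfd_fd, AMDKFD_IOC_SVM, args);	// kfd_fd: open("/dev/kfd")
 *
 * svm_ioctl() shifts start_addr and size from bytes to pages before calling
 * svm_range_set_attr()/svm_range_get_attr().
 */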