1 // SPDX-License-Identifier: GPL-2.0 OR MIT 2 /* 3 * Copyright 2020-2021 Advanced Micro Devices, Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 * OTHER DEALINGS IN THE SOFTWARE. 22 */ 23 24 #include <linux/types.h> 25 #include <linux/sched/task.h> 26 #include "amdgpu_sync.h" 27 #include "amdgpu_object.h" 28 #include "amdgpu_vm.h" 29 #include "amdgpu_mn.h" 30 #include "amdgpu.h" 31 #include "amdgpu_xgmi.h" 32 #include "kfd_priv.h" 33 #include "kfd_svm.h" 34 #include "kfd_migrate.h" 35 36 #define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1 37 38 /* Long enough to ensure no retry fault comes after svm range is restored and 39 * page table is updated. 40 */ 41 #define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING 2000 42 43 static void svm_range_evict_svm_bo_worker(struct work_struct *work); 44 static bool 45 svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, 46 const struct mmu_notifier_range *range, 47 unsigned long cur_seq); 48 49 static const struct mmu_interval_notifier_ops svm_range_mn_ops = { 50 .invalidate = svm_range_cpu_invalidate_pagetables, 51 }; 52 53 /** 54 * svm_range_unlink - unlink svm_range from lists and interval tree 55 * @prange: svm range structure to be removed 56 * 57 * Remove the svm_range from the svms and svm_bo lists and the svms 58 * interval tree. 59 * 60 * Context: The caller must hold svms->lock 61 */ 62 static void svm_range_unlink(struct svm_range *prange) 63 { 64 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, 65 prange, prange->start, prange->last); 66 67 if (prange->svm_bo) { 68 spin_lock(&prange->svm_bo->list_lock); 69 list_del(&prange->svm_bo_list); 70 spin_unlock(&prange->svm_bo->list_lock); 71 } 72 73 list_del(&prange->list); 74 if (prange->it_node.start != 0 && prange->it_node.last != 0) 75 interval_tree_remove(&prange->it_node, &prange->svms->objects); 76 } 77 78 static void 79 svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange) 80 { 81 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, 82 prange, prange->start, prange->last); 83 84 mmu_interval_notifier_insert_locked(&prange->notifier, mm, 85 prange->start << PAGE_SHIFT, 86 prange->npages << PAGE_SHIFT, 87 &svm_range_mn_ops); 88 } 89 90 /** 91 * svm_range_add_to_svms - add svm range to svms 92 * @prange: svm range structure to be added 93 * 94 * Add the svm range to svms interval tree and link list 95 * 96 * Context: The caller must hold svms->lock 97 */ 98 static void svm_range_add_to_svms(struct svm_range *prange) 99 { 100 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, 101 prange, prange->start, prange->last); 102 103 list_add_tail(&prange->list, &prange->svms->list); 104 prange->it_node.start = prange->start; 105 prange->it_node.last = prange->last; 106 interval_tree_insert(&prange->it_node, &prange->svms->objects); 107 } 108 109 static void svm_range_remove_notifier(struct svm_range *prange) 110 { 111 pr_debug("remove notifier svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", 112 prange->svms, prange, 113 prange->notifier.interval_tree.start >> PAGE_SHIFT, 114 prange->notifier.interval_tree.last >> PAGE_SHIFT); 115 116 if (prange->notifier.interval_tree.start != 0 && 117 prange->notifier.interval_tree.last != 0) 118 mmu_interval_notifier_remove(&prange->notifier); 119 } 120 121 static int 122 svm_range_dma_map_dev(struct device *dev, dma_addr_t **dma_addr, 123 unsigned long *hmm_pfns, uint64_t npages) 124 { 125 enum dma_data_direction dir = DMA_BIDIRECTIONAL; 126 dma_addr_t *addr = *dma_addr; 127 struct page *page; 128 int i, r; 129 130 if (!addr) { 131 addr = kvmalloc_array(npages, sizeof(*addr), 132 GFP_KERNEL | __GFP_ZERO); 133 if (!addr) 134 return -ENOMEM; 135 *dma_addr = addr; 136 } 137 138 for (i = 0; i < npages; i++) { 139 if (WARN_ONCE(addr[i] && !dma_mapping_error(dev, addr[i]), 140 "leaking dma mapping\n")) 141 dma_unmap_page(dev, addr[i], PAGE_SIZE, dir); 142 143 page = hmm_pfn_to_page(hmm_pfns[i]); 144 addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir); 145 r = dma_mapping_error(dev, addr[i]); 146 if (r) { 147 pr_debug("failed %d dma_map_page\n", r); 148 return r; 149 } 150 pr_debug("dma mapping 0x%llx for page addr 0x%lx\n", 151 addr[i] >> PAGE_SHIFT, page_to_pfn(page)); 152 } 153 return 0; 154 } 155 156 static int 157 svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap, 158 unsigned long *hmm_pfns) 159 { 160 struct kfd_process *p; 161 uint32_t gpuidx; 162 int r; 163 164 p = container_of(prange->svms, struct kfd_process, svms); 165 166 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { 167 struct kfd_process_device *pdd; 168 struct amdgpu_device *adev; 169 170 pr_debug("mapping to gpu idx 0x%x\n", gpuidx); 171 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 172 if (!pdd) { 173 pr_debug("failed to find device idx %d\n", gpuidx); 174 return -EINVAL; 175 } 176 adev = (struct amdgpu_device *)pdd->dev->kgd; 177 178 r = svm_range_dma_map_dev(adev->dev, &prange->dma_addr[gpuidx], 179 hmm_pfns, prange->npages); 180 if (r) 181 break; 182 } 183 184 return r; 185 } 186 187 void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr, 188 unsigned long offset, unsigned long npages) 189 { 190 enum dma_data_direction dir = DMA_BIDIRECTIONAL; 191 int i; 192 193 if (!dma_addr) 194 return; 195 196 for (i = offset; i < offset + npages; i++) { 197 if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i])) 198 continue; 199 pr_debug("dma unmapping 0x%llx\n", dma_addr[i] >> PAGE_SHIFT); 200 dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir); 201 dma_addr[i] = 0; 202 } 203 } 204 205 void svm_range_free_dma_mappings(struct svm_range *prange) 206 { 207 struct kfd_process_device *pdd; 208 dma_addr_t *dma_addr; 209 struct device *dev; 210 struct kfd_process *p; 211 uint32_t gpuidx; 212 213 p = container_of(prange->svms, struct kfd_process, svms); 214 215 for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) { 216 dma_addr = prange->dma_addr[gpuidx]; 217 if (!dma_addr) 218 continue; 219 220 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 221 if (!pdd) { 222 pr_debug("failed to find device idx %d\n", gpuidx); 223 continue; 224 } 225 dev = &pdd->dev->pdev->dev; 226 svm_range_dma_unmap(dev, dma_addr, 0, prange->npages); 227 kvfree(dma_addr); 228 prange->dma_addr[gpuidx] = NULL; 229 } 230 } 231 232 static void svm_range_free(struct svm_range *prange) 233 { 234 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange, 235 prange->start, prange->last); 236 237 svm_range_vram_node_free(prange); 238 svm_range_free_dma_mappings(prange); 239 mutex_destroy(&prange->lock); 240 mutex_destroy(&prange->migrate_mutex); 241 kfree(prange); 242 } 243 244 static void 245 svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc, 246 uint8_t *granularity, uint32_t *flags) 247 { 248 *location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 249 *prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 250 *granularity = 9; 251 *flags = 252 KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT; 253 } 254 255 static struct 256 svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, 257 uint64_t last) 258 { 259 uint64_t size = last - start + 1; 260 struct svm_range *prange; 261 struct kfd_process *p; 262 263 prange = kzalloc(sizeof(*prange), GFP_KERNEL); 264 if (!prange) 265 return NULL; 266 prange->npages = size; 267 prange->svms = svms; 268 prange->start = start; 269 prange->last = last; 270 INIT_LIST_HEAD(&prange->list); 271 INIT_LIST_HEAD(&prange->update_list); 272 INIT_LIST_HEAD(&prange->remove_list); 273 INIT_LIST_HEAD(&prange->insert_list); 274 INIT_LIST_HEAD(&prange->svm_bo_list); 275 INIT_LIST_HEAD(&prange->deferred_list); 276 INIT_LIST_HEAD(&prange->child_list); 277 atomic_set(&prange->invalid, 0); 278 prange->validate_timestamp = 0; 279 mutex_init(&prange->migrate_mutex); 280 mutex_init(&prange->lock); 281 282 p = container_of(svms, struct kfd_process, svms); 283 if (p->xnack_enabled) 284 bitmap_copy(prange->bitmap_access, svms->bitmap_supported, 285 MAX_GPU_INSTANCE); 286 287 svm_range_set_default_attributes(&prange->preferred_loc, 288 &prange->prefetch_loc, 289 &prange->granularity, &prange->flags); 290 291 pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last); 292 293 return prange; 294 } 295 296 static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo) 297 { 298 if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref)) 299 return false; 300 301 return true; 302 } 303 304 static struct svm_range_bo *svm_range_bo_ref(struct svm_range_bo *svm_bo) 305 { 306 if (svm_bo) 307 kref_get(&svm_bo->kref); 308 309 return svm_bo; 310 } 311 312 static void svm_range_bo_release(struct kref *kref) 313 { 314 struct svm_range_bo *svm_bo; 315 316 svm_bo = container_of(kref, struct svm_range_bo, kref); 317 spin_lock(&svm_bo->list_lock); 318 while (!list_empty(&svm_bo->range_list)) { 319 struct svm_range *prange = 320 list_first_entry(&svm_bo->range_list, 321 struct svm_range, svm_bo_list); 322 /* list_del_init tells a concurrent svm_range_vram_node_new when 323 * it's safe to reuse the svm_bo pointer and svm_bo_list head. 324 */ 325 list_del_init(&prange->svm_bo_list); 326 spin_unlock(&svm_bo->list_lock); 327 328 pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, 329 prange->start, prange->last); 330 mutex_lock(&prange->lock); 331 prange->svm_bo = NULL; 332 mutex_unlock(&prange->lock); 333 334 spin_lock(&svm_bo->list_lock); 335 } 336 spin_unlock(&svm_bo->list_lock); 337 if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) { 338 /* We're not in the eviction worker. 339 * Signal the fence and synchronize with any 340 * pending eviction work. 341 */ 342 dma_fence_signal(&svm_bo->eviction_fence->base); 343 cancel_work_sync(&svm_bo->eviction_work); 344 } 345 dma_fence_put(&svm_bo->eviction_fence->base); 346 amdgpu_bo_unref(&svm_bo->bo); 347 kfree(svm_bo); 348 } 349 350 static void svm_range_bo_unref(struct svm_range_bo *svm_bo) 351 { 352 if (!svm_bo) 353 return; 354 355 kref_put(&svm_bo->kref, svm_range_bo_release); 356 } 357 358 static bool 359 svm_range_validate_svm_bo(struct amdgpu_device *adev, struct svm_range *prange) 360 { 361 struct amdgpu_device *bo_adev; 362 363 mutex_lock(&prange->lock); 364 if (!prange->svm_bo) { 365 mutex_unlock(&prange->lock); 366 return false; 367 } 368 if (prange->ttm_res) { 369 /* We still have a reference, all is well */ 370 mutex_unlock(&prange->lock); 371 return true; 372 } 373 if (svm_bo_ref_unless_zero(prange->svm_bo)) { 374 /* 375 * Migrate from GPU to GPU, remove range from source bo_adev 376 * svm_bo range list, and return false to allocate svm_bo from 377 * destination adev. 378 */ 379 bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); 380 if (bo_adev != adev) { 381 mutex_unlock(&prange->lock); 382 383 spin_lock(&prange->svm_bo->list_lock); 384 list_del_init(&prange->svm_bo_list); 385 spin_unlock(&prange->svm_bo->list_lock); 386 387 svm_range_bo_unref(prange->svm_bo); 388 return false; 389 } 390 if (READ_ONCE(prange->svm_bo->evicting)) { 391 struct dma_fence *f; 392 struct svm_range_bo *svm_bo; 393 /* The BO is getting evicted, 394 * we need to get a new one 395 */ 396 mutex_unlock(&prange->lock); 397 svm_bo = prange->svm_bo; 398 f = dma_fence_get(&svm_bo->eviction_fence->base); 399 svm_range_bo_unref(prange->svm_bo); 400 /* wait for the fence to avoid long spin-loop 401 * at list_empty_careful 402 */ 403 dma_fence_wait(f, false); 404 dma_fence_put(f); 405 } else { 406 /* The BO was still around and we got 407 * a new reference to it 408 */ 409 mutex_unlock(&prange->lock); 410 pr_debug("reuse old bo svms 0x%p [0x%lx 0x%lx]\n", 411 prange->svms, prange->start, prange->last); 412 413 prange->ttm_res = prange->svm_bo->bo->tbo.resource; 414 return true; 415 } 416 417 } else { 418 mutex_unlock(&prange->lock); 419 } 420 421 /* We need a new svm_bo. Spin-loop to wait for concurrent 422 * svm_range_bo_release to finish removing this range from 423 * its range list. After this, it is safe to reuse the 424 * svm_bo pointer and svm_bo_list head. 425 */ 426 while (!list_empty_careful(&prange->svm_bo_list)) 427 ; 428 429 return false; 430 } 431 432 static struct svm_range_bo *svm_range_bo_new(void) 433 { 434 struct svm_range_bo *svm_bo; 435 436 svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL); 437 if (!svm_bo) 438 return NULL; 439 440 kref_init(&svm_bo->kref); 441 INIT_LIST_HEAD(&svm_bo->range_list); 442 spin_lock_init(&svm_bo->list_lock); 443 444 return svm_bo; 445 } 446 447 int 448 svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange, 449 bool clear) 450 { 451 struct amdgpu_bo_param bp; 452 struct svm_range_bo *svm_bo; 453 struct amdgpu_bo_user *ubo; 454 struct amdgpu_bo *bo; 455 struct kfd_process *p; 456 struct mm_struct *mm; 457 int r; 458 459 p = container_of(prange->svms, struct kfd_process, svms); 460 pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms, 461 prange->start, prange->last); 462 463 if (svm_range_validate_svm_bo(adev, prange)) 464 return 0; 465 466 svm_bo = svm_range_bo_new(); 467 if (!svm_bo) { 468 pr_debug("failed to alloc svm bo\n"); 469 return -ENOMEM; 470 } 471 mm = get_task_mm(p->lead_thread); 472 if (!mm) { 473 pr_debug("failed to get mm\n"); 474 kfree(svm_bo); 475 return -ESRCH; 476 } 477 svm_bo->svms = prange->svms; 478 svm_bo->eviction_fence = 479 amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), 480 mm, 481 svm_bo); 482 mmput(mm); 483 INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker); 484 svm_bo->evicting = 0; 485 memset(&bp, 0, sizeof(bp)); 486 bp.size = prange->npages * PAGE_SIZE; 487 bp.byte_align = PAGE_SIZE; 488 bp.domain = AMDGPU_GEM_DOMAIN_VRAM; 489 bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; 490 bp.flags |= clear ? AMDGPU_GEM_CREATE_VRAM_CLEARED : 0; 491 bp.flags |= AMDGPU_AMDKFD_CREATE_SVM_BO; 492 bp.type = ttm_bo_type_device; 493 bp.resv = NULL; 494 495 r = amdgpu_bo_create_user(adev, &bp, &ubo); 496 if (r) { 497 pr_debug("failed %d to create bo\n", r); 498 goto create_bo_failed; 499 } 500 bo = &ubo->bo; 501 r = amdgpu_bo_reserve(bo, true); 502 if (r) { 503 pr_debug("failed %d to reserve bo\n", r); 504 goto reserve_bo_failed; 505 } 506 507 r = dma_resv_reserve_shared(bo->tbo.base.resv, 1); 508 if (r) { 509 pr_debug("failed %d to reserve bo\n", r); 510 amdgpu_bo_unreserve(bo); 511 goto reserve_bo_failed; 512 } 513 amdgpu_bo_fence(bo, &svm_bo->eviction_fence->base, true); 514 515 amdgpu_bo_unreserve(bo); 516 517 svm_bo->bo = bo; 518 prange->svm_bo = svm_bo; 519 prange->ttm_res = bo->tbo.resource; 520 prange->offset = 0; 521 522 spin_lock(&svm_bo->list_lock); 523 list_add(&prange->svm_bo_list, &svm_bo->range_list); 524 spin_unlock(&svm_bo->list_lock); 525 526 return 0; 527 528 reserve_bo_failed: 529 amdgpu_bo_unref(&bo); 530 create_bo_failed: 531 dma_fence_put(&svm_bo->eviction_fence->base); 532 kfree(svm_bo); 533 prange->ttm_res = NULL; 534 535 return r; 536 } 537 538 void svm_range_vram_node_free(struct svm_range *prange) 539 { 540 svm_range_bo_unref(prange->svm_bo); 541 prange->ttm_res = NULL; 542 } 543 544 struct amdgpu_device * 545 svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id) 546 { 547 struct kfd_process_device *pdd; 548 struct kfd_process *p; 549 int32_t gpu_idx; 550 551 p = container_of(prange->svms, struct kfd_process, svms); 552 553 gpu_idx = kfd_process_gpuidx_from_gpuid(p, gpu_id); 554 if (gpu_idx < 0) { 555 pr_debug("failed to get device by id 0x%x\n", gpu_id); 556 return NULL; 557 } 558 pdd = kfd_process_device_from_gpuidx(p, gpu_idx); 559 if (!pdd) { 560 pr_debug("failed to get device by idx 0x%x\n", gpu_idx); 561 return NULL; 562 } 563 564 return (struct amdgpu_device *)pdd->dev->kgd; 565 } 566 567 static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo) 568 { 569 struct ttm_operation_ctx ctx = { false, false }; 570 571 amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM); 572 573 return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); 574 } 575 576 static int 577 svm_range_check_attr(struct kfd_process *p, 578 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 579 { 580 uint32_t i; 581 582 for (i = 0; i < nattr; i++) { 583 uint32_t val = attrs[i].value; 584 int gpuidx = MAX_GPU_INSTANCE; 585 586 switch (attrs[i].type) { 587 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 588 if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM && 589 val != KFD_IOCTL_SVM_LOCATION_UNDEFINED) 590 gpuidx = kfd_process_gpuidx_from_gpuid(p, val); 591 break; 592 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 593 if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM) 594 gpuidx = kfd_process_gpuidx_from_gpuid(p, val); 595 break; 596 case KFD_IOCTL_SVM_ATTR_ACCESS: 597 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 598 case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 599 gpuidx = kfd_process_gpuidx_from_gpuid(p, val); 600 break; 601 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 602 break; 603 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 604 break; 605 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 606 break; 607 default: 608 pr_debug("unknown attr type 0x%x\n", attrs[i].type); 609 return -EINVAL; 610 } 611 612 if (gpuidx < 0) { 613 pr_debug("no GPU 0x%x found\n", val); 614 return -EINVAL; 615 } else if (gpuidx < MAX_GPU_INSTANCE && 616 !test_bit(gpuidx, p->svms.bitmap_supported)) { 617 pr_debug("GPU 0x%x not supported\n", val); 618 return -EINVAL; 619 } 620 } 621 622 return 0; 623 } 624 625 static void 626 svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange, 627 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 628 { 629 uint32_t i; 630 int gpuidx; 631 632 for (i = 0; i < nattr; i++) { 633 switch (attrs[i].type) { 634 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 635 prange->preferred_loc = attrs[i].value; 636 break; 637 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 638 prange->prefetch_loc = attrs[i].value; 639 break; 640 case KFD_IOCTL_SVM_ATTR_ACCESS: 641 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 642 case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 643 gpuidx = kfd_process_gpuidx_from_gpuid(p, 644 attrs[i].value); 645 if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) { 646 bitmap_clear(prange->bitmap_access, gpuidx, 1); 647 bitmap_clear(prange->bitmap_aip, gpuidx, 1); 648 } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) { 649 bitmap_set(prange->bitmap_access, gpuidx, 1); 650 bitmap_clear(prange->bitmap_aip, gpuidx, 1); 651 } else { 652 bitmap_clear(prange->bitmap_access, gpuidx, 1); 653 bitmap_set(prange->bitmap_aip, gpuidx, 1); 654 } 655 break; 656 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 657 prange->flags |= attrs[i].value; 658 break; 659 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 660 prange->flags &= ~attrs[i].value; 661 break; 662 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 663 prange->granularity = attrs[i].value; 664 break; 665 default: 666 WARN_ONCE(1, "svm_range_check_attrs wasn't called?"); 667 } 668 } 669 } 670 671 /** 672 * svm_range_debug_dump - print all range information from svms 673 * @svms: svm range list header 674 * 675 * debug output svm range start, end, prefetch location from svms 676 * interval tree and link list 677 * 678 * Context: The caller must hold svms->lock 679 */ 680 static void svm_range_debug_dump(struct svm_range_list *svms) 681 { 682 struct interval_tree_node *node; 683 struct svm_range *prange; 684 685 pr_debug("dump svms 0x%p list\n", svms); 686 pr_debug("range\tstart\tpage\tend\t\tlocation\n"); 687 688 list_for_each_entry(prange, &svms->list, list) { 689 pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n", 690 prange, prange->start, prange->npages, 691 prange->start + prange->npages - 1, 692 prange->actual_loc); 693 } 694 695 pr_debug("dump svms 0x%p interval tree\n", svms); 696 pr_debug("range\tstart\tpage\tend\t\tlocation\n"); 697 node = interval_tree_iter_first(&svms->objects, 0, ~0ULL); 698 while (node) { 699 prange = container_of(node, struct svm_range, it_node); 700 pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n", 701 prange, prange->start, prange->npages, 702 prange->start + prange->npages - 1, 703 prange->actual_loc); 704 node = interval_tree_iter_next(node, 0, ~0ULL); 705 } 706 } 707 708 static bool 709 svm_range_is_same_attrs(struct svm_range *old, struct svm_range *new) 710 { 711 return (old->prefetch_loc == new->prefetch_loc && 712 old->flags == new->flags && 713 old->granularity == new->granularity); 714 } 715 716 static int 717 svm_range_split_array(void *ppnew, void *ppold, size_t size, 718 uint64_t old_start, uint64_t old_n, 719 uint64_t new_start, uint64_t new_n) 720 { 721 unsigned char *new, *old, *pold; 722 uint64_t d; 723 724 if (!ppold) 725 return 0; 726 pold = *(unsigned char **)ppold; 727 if (!pold) 728 return 0; 729 730 new = kvmalloc_array(new_n, size, GFP_KERNEL); 731 if (!new) 732 return -ENOMEM; 733 734 d = (new_start - old_start) * size; 735 memcpy(new, pold + d, new_n * size); 736 737 old = kvmalloc_array(old_n, size, GFP_KERNEL); 738 if (!old) { 739 kvfree(new); 740 return -ENOMEM; 741 } 742 743 d = (new_start == old_start) ? new_n * size : 0; 744 memcpy(old, pold + d, old_n * size); 745 746 kvfree(pold); 747 *(void **)ppold = old; 748 *(void **)ppnew = new; 749 750 return 0; 751 } 752 753 static int 754 svm_range_split_pages(struct svm_range *new, struct svm_range *old, 755 uint64_t start, uint64_t last) 756 { 757 uint64_t npages = last - start + 1; 758 int i, r; 759 760 for (i = 0; i < MAX_GPU_INSTANCE; i++) { 761 r = svm_range_split_array(&new->dma_addr[i], &old->dma_addr[i], 762 sizeof(*old->dma_addr[i]), old->start, 763 npages, new->start, new->npages); 764 if (r) 765 return r; 766 } 767 768 return 0; 769 } 770 771 static int 772 svm_range_split_nodes(struct svm_range *new, struct svm_range *old, 773 uint64_t start, uint64_t last) 774 { 775 uint64_t npages = last - start + 1; 776 777 pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n", 778 new->svms, new, new->start, start, last); 779 780 if (new->start == old->start) { 781 new->offset = old->offset; 782 old->offset += new->npages; 783 } else { 784 new->offset = old->offset + npages; 785 } 786 787 new->svm_bo = svm_range_bo_ref(old->svm_bo); 788 new->ttm_res = old->ttm_res; 789 790 spin_lock(&new->svm_bo->list_lock); 791 list_add(&new->svm_bo_list, &new->svm_bo->range_list); 792 spin_unlock(&new->svm_bo->list_lock); 793 794 return 0; 795 } 796 797 /** 798 * svm_range_split_adjust - split range and adjust 799 * 800 * @new: new range 801 * @old: the old range 802 * @start: the old range adjust to start address in pages 803 * @last: the old range adjust to last address in pages 804 * 805 * Copy system memory dma_addr or vram ttm_res in old range to new 806 * range from new_start up to size new->npages, the remaining old range is from 807 * start to last 808 * 809 * Return: 810 * 0 - OK, -ENOMEM - out of memory 811 */ 812 static int 813 svm_range_split_adjust(struct svm_range *new, struct svm_range *old, 814 uint64_t start, uint64_t last) 815 { 816 int r; 817 818 pr_debug("svms 0x%p new 0x%lx old [0x%lx 0x%lx] => [0x%llx 0x%llx]\n", 819 new->svms, new->start, old->start, old->last, start, last); 820 821 if (new->start < old->start || 822 new->last > old->last) { 823 WARN_ONCE(1, "invalid new range start or last\n"); 824 return -EINVAL; 825 } 826 827 r = svm_range_split_pages(new, old, start, last); 828 if (r) 829 return r; 830 831 if (old->actual_loc && old->ttm_res) { 832 r = svm_range_split_nodes(new, old, start, last); 833 if (r) 834 return r; 835 } 836 837 old->npages = last - start + 1; 838 old->start = start; 839 old->last = last; 840 new->flags = old->flags; 841 new->preferred_loc = old->preferred_loc; 842 new->prefetch_loc = old->prefetch_loc; 843 new->actual_loc = old->actual_loc; 844 new->granularity = old->granularity; 845 bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); 846 bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE); 847 848 return 0; 849 } 850 851 /** 852 * svm_range_split - split a range in 2 ranges 853 * 854 * @prange: the svm range to split 855 * @start: the remaining range start address in pages 856 * @last: the remaining range last address in pages 857 * @new: the result new range generated 858 * 859 * Two cases only: 860 * case 1: if start == prange->start 861 * prange ==> prange[start, last] 862 * new range [last + 1, prange->last] 863 * 864 * case 2: if last == prange->last 865 * prange ==> prange[start, last] 866 * new range [prange->start, start - 1] 867 * 868 * Return: 869 * 0 - OK, -ENOMEM - out of memory, -EINVAL - invalid start, last 870 */ 871 static int 872 svm_range_split(struct svm_range *prange, uint64_t start, uint64_t last, 873 struct svm_range **new) 874 { 875 uint64_t old_start = prange->start; 876 uint64_t old_last = prange->last; 877 struct svm_range_list *svms; 878 int r = 0; 879 880 pr_debug("svms 0x%p [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", prange->svms, 881 old_start, old_last, start, last); 882 883 if (old_start != start && old_last != last) 884 return -EINVAL; 885 if (start < old_start || last > old_last) 886 return -EINVAL; 887 888 svms = prange->svms; 889 if (old_start == start) 890 *new = svm_range_new(svms, last + 1, old_last); 891 else 892 *new = svm_range_new(svms, old_start, start - 1); 893 if (!*new) 894 return -ENOMEM; 895 896 r = svm_range_split_adjust(*new, prange, start, last); 897 if (r) { 898 pr_debug("failed %d split [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", 899 r, old_start, old_last, start, last); 900 svm_range_free(*new); 901 *new = NULL; 902 } 903 904 return r; 905 } 906 907 static int 908 svm_range_split_tail(struct svm_range *prange, struct svm_range *new, 909 uint64_t new_last, struct list_head *insert_list) 910 { 911 struct svm_range *tail; 912 int r = svm_range_split(prange, prange->start, new_last, &tail); 913 914 if (!r) 915 list_add(&tail->insert_list, insert_list); 916 return r; 917 } 918 919 static int 920 svm_range_split_head(struct svm_range *prange, struct svm_range *new, 921 uint64_t new_start, struct list_head *insert_list) 922 { 923 struct svm_range *head; 924 int r = svm_range_split(prange, new_start, prange->last, &head); 925 926 if (!r) 927 list_add(&head->insert_list, insert_list); 928 return r; 929 } 930 931 static void 932 svm_range_add_child(struct svm_range *prange, struct mm_struct *mm, 933 struct svm_range *pchild, enum svm_work_list_ops op) 934 { 935 pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n", 936 pchild, pchild->start, pchild->last, prange, op); 937 938 pchild->work_item.mm = mm; 939 pchild->work_item.op = op; 940 list_add_tail(&pchild->child_list, &prange->child_list); 941 } 942 943 /** 944 * svm_range_split_by_granularity - collect ranges within granularity boundary 945 * 946 * @p: the process with svms list 947 * @mm: mm structure 948 * @addr: the vm fault address in pages, to split the prange 949 * @parent: parent range if prange is from child list 950 * @prange: prange to split 951 * 952 * Trims @prange to be a single aligned block of prange->granularity if 953 * possible. The head and tail are added to the child_list in @parent. 954 * 955 * Context: caller must hold mmap_read_lock and prange->lock 956 * 957 * Return: 958 * 0 - OK, otherwise error code 959 */ 960 int 961 svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm, 962 unsigned long addr, struct svm_range *parent, 963 struct svm_range *prange) 964 { 965 struct svm_range *head, *tail; 966 unsigned long start, last, size; 967 int r; 968 969 /* Align splited range start and size to granularity size, then a single 970 * PTE will be used for whole range, this reduces the number of PTE 971 * updated and the L1 TLB space used for translation. 972 */ 973 size = 1UL << prange->granularity; 974 start = ALIGN_DOWN(addr, size); 975 last = ALIGN(addr + 1, size) - 1; 976 977 pr_debug("svms 0x%p split [0x%lx 0x%lx] to [0x%lx 0x%lx] size 0x%lx\n", 978 prange->svms, prange->start, prange->last, start, last, size); 979 980 if (start > prange->start) { 981 r = svm_range_split(prange, start, prange->last, &head); 982 if (r) 983 return r; 984 svm_range_add_child(parent, mm, head, SVM_OP_ADD_RANGE); 985 } 986 987 if (last < prange->last) { 988 r = svm_range_split(prange, prange->start, last, &tail); 989 if (r) 990 return r; 991 svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE); 992 } 993 994 /* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */ 995 if (p->xnack_enabled && prange->work_item.op == SVM_OP_ADD_RANGE) { 996 prange->work_item.op = SVM_OP_ADD_RANGE_AND_MAP; 997 pr_debug("change prange 0x%p [0x%lx 0x%lx] op %d\n", 998 prange, prange->start, prange->last, 999 SVM_OP_ADD_RANGE_AND_MAP); 1000 } 1001 return 0; 1002 } 1003 1004 static uint64_t 1005 svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange) 1006 { 1007 struct amdgpu_device *bo_adev; 1008 uint32_t flags = prange->flags; 1009 uint32_t mapping_flags = 0; 1010 uint64_t pte_flags; 1011 bool snoop = !prange->ttm_res; 1012 bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT; 1013 1014 if (prange->svm_bo && prange->ttm_res) 1015 bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); 1016 1017 switch (adev->asic_type) { 1018 case CHIP_ARCTURUS: 1019 if (prange->svm_bo && prange->ttm_res) { 1020 if (bo_adev == adev) { 1021 mapping_flags |= coherent ? 1022 AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; 1023 } else { 1024 mapping_flags |= coherent ? 1025 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; 1026 if (amdgpu_xgmi_same_hive(adev, bo_adev)) 1027 snoop = true; 1028 } 1029 } else { 1030 mapping_flags |= coherent ? 1031 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; 1032 } 1033 break; 1034 case CHIP_ALDEBARAN: 1035 if (prange->svm_bo && prange->ttm_res) { 1036 if (bo_adev == adev) { 1037 mapping_flags |= coherent ? 1038 AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; 1039 if (adev->gmc.xgmi.connected_to_cpu) 1040 snoop = true; 1041 } else { 1042 mapping_flags |= coherent ? 1043 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; 1044 if (amdgpu_xgmi_same_hive(adev, bo_adev)) 1045 snoop = true; 1046 } 1047 } else { 1048 mapping_flags |= coherent ? 1049 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; 1050 } 1051 break; 1052 default: 1053 mapping_flags |= coherent ? 1054 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; 1055 } 1056 1057 mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE; 1058 1059 if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO) 1060 mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE; 1061 if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC) 1062 mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE; 1063 1064 pte_flags = AMDGPU_PTE_VALID; 1065 pte_flags |= prange->ttm_res ? 0 : AMDGPU_PTE_SYSTEM; 1066 pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0; 1067 1068 pte_flags |= amdgpu_gem_va_map_flags(adev, mapping_flags); 1069 1070 pr_debug("svms 0x%p [0x%lx 0x%lx] vram %d PTE 0x%llx mapping 0x%x\n", 1071 prange->svms, prange->start, prange->last, 1072 prange->ttm_res ? 1:0, pte_flags, mapping_flags); 1073 1074 return pte_flags; 1075 } 1076 1077 static int 1078 svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, 1079 uint64_t start, uint64_t last, 1080 struct dma_fence **fence) 1081 { 1082 uint64_t init_pte_value = 0; 1083 1084 pr_debug("[0x%llx 0x%llx]\n", start, last); 1085 1086 return amdgpu_vm_bo_update_mapping(adev, adev, vm, false, true, NULL, 1087 start, last, init_pte_value, 0, 1088 NULL, NULL, fence, NULL); 1089 } 1090 1091 static int 1092 svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start, 1093 unsigned long last) 1094 { 1095 DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE); 1096 struct kfd_process_device *pdd; 1097 struct dma_fence *fence = NULL; 1098 struct amdgpu_device *adev; 1099 struct kfd_process *p; 1100 uint32_t gpuidx; 1101 int r = 0; 1102 1103 bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip, 1104 MAX_GPU_INSTANCE); 1105 p = container_of(prange->svms, struct kfd_process, svms); 1106 1107 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { 1108 pr_debug("unmap from gpu idx 0x%x\n", gpuidx); 1109 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 1110 if (!pdd) { 1111 pr_debug("failed to find device idx %d\n", gpuidx); 1112 return -EINVAL; 1113 } 1114 adev = (struct amdgpu_device *)pdd->dev->kgd; 1115 1116 r = svm_range_unmap_from_gpu(adev, drm_priv_to_vm(pdd->drm_priv), 1117 start, last, &fence); 1118 if (r) 1119 break; 1120 1121 if (fence) { 1122 r = dma_fence_wait(fence, false); 1123 dma_fence_put(fence); 1124 fence = NULL; 1125 if (r) 1126 break; 1127 } 1128 amdgpu_amdkfd_flush_gpu_tlb_pasid((struct kgd_dev *)adev, 1129 p->pasid, TLB_FLUSH_HEAVYWEIGHT); 1130 } 1131 1132 return r; 1133 } 1134 1135 static int 1136 svm_range_map_to_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, 1137 struct svm_range *prange, dma_addr_t *dma_addr, 1138 struct amdgpu_device *bo_adev, struct dma_fence **fence) 1139 { 1140 struct amdgpu_bo_va bo_va; 1141 bool table_freed = false; 1142 uint64_t pte_flags; 1143 int r = 0; 1144 1145 pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, prange->start, 1146 prange->last); 1147 1148 if (prange->svm_bo && prange->ttm_res) { 1149 bo_va.is_xgmi = amdgpu_xgmi_same_hive(adev, bo_adev); 1150 prange->mapping.bo_va = &bo_va; 1151 } 1152 1153 prange->mapping.start = prange->start; 1154 prange->mapping.last = prange->last; 1155 prange->mapping.offset = prange->ttm_res ? prange->offset : 0; 1156 pte_flags = svm_range_get_pte_flags(adev, prange); 1157 1158 r = amdgpu_vm_bo_update_mapping(adev, bo_adev, vm, false, false, NULL, 1159 prange->mapping.start, 1160 prange->mapping.last, pte_flags, 1161 prange->mapping.offset, 1162 prange->ttm_res, 1163 dma_addr, &vm->last_update, 1164 &table_freed); 1165 if (r) { 1166 pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start); 1167 goto out; 1168 } 1169 1170 r = amdgpu_vm_update_pdes(adev, vm, false); 1171 if (r) { 1172 pr_debug("failed %d to update directories 0x%lx\n", r, 1173 prange->start); 1174 goto out; 1175 } 1176 1177 if (fence) 1178 *fence = dma_fence_get(vm->last_update); 1179 1180 if (table_freed) { 1181 struct kfd_process *p; 1182 1183 p = container_of(prange->svms, struct kfd_process, svms); 1184 amdgpu_amdkfd_flush_gpu_tlb_pasid((struct kgd_dev *)adev, 1185 p->pasid, TLB_FLUSH_LEGACY); 1186 } 1187 out: 1188 prange->mapping.bo_va = NULL; 1189 return r; 1190 } 1191 1192 static int svm_range_map_to_gpus(struct svm_range *prange, 1193 unsigned long *bitmap, bool wait) 1194 { 1195 struct kfd_process_device *pdd; 1196 struct amdgpu_device *bo_adev; 1197 struct amdgpu_device *adev; 1198 struct kfd_process *p; 1199 struct dma_fence *fence = NULL; 1200 uint32_t gpuidx; 1201 int r = 0; 1202 1203 if (prange->svm_bo && prange->ttm_res) 1204 bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); 1205 else 1206 bo_adev = NULL; 1207 1208 p = container_of(prange->svms, struct kfd_process, svms); 1209 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { 1210 pr_debug("mapping to gpu idx 0x%x\n", gpuidx); 1211 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 1212 if (!pdd) { 1213 pr_debug("failed to find device idx %d\n", gpuidx); 1214 return -EINVAL; 1215 } 1216 adev = (struct amdgpu_device *)pdd->dev->kgd; 1217 1218 pdd = kfd_bind_process_to_device(pdd->dev, p); 1219 if (IS_ERR(pdd)) 1220 return -EINVAL; 1221 1222 if (bo_adev && adev != bo_adev && 1223 !amdgpu_xgmi_same_hive(adev, bo_adev)) { 1224 pr_debug("cannot map to device idx %d\n", gpuidx); 1225 continue; 1226 } 1227 1228 r = svm_range_map_to_gpu(adev, drm_priv_to_vm(pdd->drm_priv), 1229 prange, prange->dma_addr[gpuidx], 1230 bo_adev, wait ? &fence : NULL); 1231 if (r) 1232 break; 1233 1234 if (fence) { 1235 r = dma_fence_wait(fence, false); 1236 dma_fence_put(fence); 1237 fence = NULL; 1238 if (r) { 1239 pr_debug("failed %d to dma fence wait\n", r); 1240 break; 1241 } 1242 } 1243 } 1244 1245 return r; 1246 } 1247 1248 struct svm_validate_context { 1249 struct kfd_process *process; 1250 struct svm_range *prange; 1251 bool intr; 1252 unsigned long bitmap[MAX_GPU_INSTANCE]; 1253 struct ttm_validate_buffer tv[MAX_GPU_INSTANCE+1]; 1254 struct list_head validate_list; 1255 struct ww_acquire_ctx ticket; 1256 }; 1257 1258 static int svm_range_reserve_bos(struct svm_validate_context *ctx) 1259 { 1260 struct kfd_process_device *pdd; 1261 struct amdgpu_device *adev; 1262 struct amdgpu_vm *vm; 1263 uint32_t gpuidx; 1264 int r; 1265 1266 INIT_LIST_HEAD(&ctx->validate_list); 1267 for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) { 1268 pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx); 1269 if (!pdd) { 1270 pr_debug("failed to find device idx %d\n", gpuidx); 1271 return -EINVAL; 1272 } 1273 adev = (struct amdgpu_device *)pdd->dev->kgd; 1274 vm = drm_priv_to_vm(pdd->drm_priv); 1275 1276 ctx->tv[gpuidx].bo = &vm->root.bo->tbo; 1277 ctx->tv[gpuidx].num_shared = 4; 1278 list_add(&ctx->tv[gpuidx].head, &ctx->validate_list); 1279 } 1280 if (ctx->prange->svm_bo && ctx->prange->ttm_res) { 1281 ctx->tv[MAX_GPU_INSTANCE].bo = &ctx->prange->svm_bo->bo->tbo; 1282 ctx->tv[MAX_GPU_INSTANCE].num_shared = 1; 1283 list_add(&ctx->tv[MAX_GPU_INSTANCE].head, &ctx->validate_list); 1284 } 1285 1286 r = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->validate_list, 1287 ctx->intr, NULL); 1288 if (r) { 1289 pr_debug("failed %d to reserve bo\n", r); 1290 return r; 1291 } 1292 1293 for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) { 1294 pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx); 1295 if (!pdd) { 1296 pr_debug("failed to find device idx %d\n", gpuidx); 1297 r = -EINVAL; 1298 goto unreserve_out; 1299 } 1300 adev = (struct amdgpu_device *)pdd->dev->kgd; 1301 1302 r = amdgpu_vm_validate_pt_bos(adev, drm_priv_to_vm(pdd->drm_priv), 1303 svm_range_bo_validate, NULL); 1304 if (r) { 1305 pr_debug("failed %d validate pt bos\n", r); 1306 goto unreserve_out; 1307 } 1308 } 1309 1310 return 0; 1311 1312 unreserve_out: 1313 ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list); 1314 return r; 1315 } 1316 1317 static void svm_range_unreserve_bos(struct svm_validate_context *ctx) 1318 { 1319 ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list); 1320 } 1321 1322 /* 1323 * Validation+GPU mapping with concurrent invalidation (MMU notifiers) 1324 * 1325 * To prevent concurrent destruction or change of range attributes, the 1326 * svm_read_lock must be held. The caller must not hold the svm_write_lock 1327 * because that would block concurrent evictions and lead to deadlocks. To 1328 * serialize concurrent migrations or validations of the same range, the 1329 * prange->migrate_mutex must be held. 1330 * 1331 * For VRAM ranges, the SVM BO must be allocated and valid (protected by its 1332 * eviction fence. 1333 * 1334 * The following sequence ensures race-free validation and GPU mapping: 1335 * 1336 * 1. Reserve page table (and SVM BO if range is in VRAM) 1337 * 2. hmm_range_fault to get page addresses (if system memory) 1338 * 3. DMA-map pages (if system memory) 1339 * 4-a. Take notifier lock 1340 * 4-b. Check that pages still valid (mmu_interval_read_retry) 1341 * 4-c. Check that the range was not split or otherwise invalidated 1342 * 4-d. Update GPU page table 1343 * 4.e. Release notifier lock 1344 * 5. Release page table (and SVM BO) reservation 1345 */ 1346 static int svm_range_validate_and_map(struct mm_struct *mm, 1347 struct svm_range *prange, 1348 int32_t gpuidx, bool intr, bool wait) 1349 { 1350 struct svm_validate_context ctx; 1351 struct hmm_range *hmm_range; 1352 int r = 0; 1353 1354 ctx.process = container_of(prange->svms, struct kfd_process, svms); 1355 ctx.prange = prange; 1356 ctx.intr = intr; 1357 1358 if (gpuidx < MAX_GPU_INSTANCE) { 1359 bitmap_zero(ctx.bitmap, MAX_GPU_INSTANCE); 1360 bitmap_set(ctx.bitmap, gpuidx, 1); 1361 } else if (ctx.process->xnack_enabled) { 1362 bitmap_copy(ctx.bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE); 1363 1364 /* If prefetch range to GPU, or GPU retry fault migrate range to 1365 * GPU, which has ACCESS attribute to the range, create mapping 1366 * on that GPU. 1367 */ 1368 if (prange->actual_loc) { 1369 gpuidx = kfd_process_gpuidx_from_gpuid(ctx.process, 1370 prange->actual_loc); 1371 if (gpuidx < 0) { 1372 WARN_ONCE(1, "failed get device by id 0x%x\n", 1373 prange->actual_loc); 1374 return -EINVAL; 1375 } 1376 if (test_bit(gpuidx, prange->bitmap_access)) 1377 bitmap_set(ctx.bitmap, gpuidx, 1); 1378 } 1379 } else { 1380 bitmap_or(ctx.bitmap, prange->bitmap_access, 1381 prange->bitmap_aip, MAX_GPU_INSTANCE); 1382 } 1383 1384 if (bitmap_empty(ctx.bitmap, MAX_GPU_INSTANCE)) 1385 return 0; 1386 1387 if (prange->actual_loc && !prange->ttm_res) { 1388 /* This should never happen. actual_loc gets set by 1389 * svm_migrate_ram_to_vram after allocating a BO. 1390 */ 1391 WARN(1, "VRAM BO missing during validation\n"); 1392 return -EINVAL; 1393 } 1394 1395 svm_range_reserve_bos(&ctx); 1396 1397 if (!prange->actual_loc) { 1398 r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL, 1399 prange->start << PAGE_SHIFT, 1400 prange->npages, &hmm_range, 1401 false, true); 1402 if (r) { 1403 pr_debug("failed %d to get svm range pages\n", r); 1404 goto unreserve_out; 1405 } 1406 1407 r = svm_range_dma_map(prange, ctx.bitmap, 1408 hmm_range->hmm_pfns); 1409 if (r) { 1410 pr_debug("failed %d to dma map range\n", r); 1411 goto unreserve_out; 1412 } 1413 1414 prange->validated_once = true; 1415 } 1416 1417 svm_range_lock(prange); 1418 if (!prange->actual_loc) { 1419 if (amdgpu_hmm_range_get_pages_done(hmm_range)) { 1420 pr_debug("hmm update the range, need validate again\n"); 1421 r = -EAGAIN; 1422 goto unlock_out; 1423 } 1424 } 1425 if (!list_empty(&prange->child_list)) { 1426 pr_debug("range split by unmap in parallel, validate again\n"); 1427 r = -EAGAIN; 1428 goto unlock_out; 1429 } 1430 1431 r = svm_range_map_to_gpus(prange, ctx.bitmap, wait); 1432 1433 unlock_out: 1434 svm_range_unlock(prange); 1435 unreserve_out: 1436 svm_range_unreserve_bos(&ctx); 1437 1438 if (!r) 1439 prange->validate_timestamp = ktime_to_us(ktime_get()); 1440 1441 return r; 1442 } 1443 1444 /** 1445 * svm_range_list_lock_and_flush_work - flush pending deferred work 1446 * 1447 * @svms: the svm range list 1448 * @mm: the mm structure 1449 * 1450 * Context: Returns with mmap write lock held, pending deferred work flushed 1451 * 1452 */ 1453 static void 1454 svm_range_list_lock_and_flush_work(struct svm_range_list *svms, 1455 struct mm_struct *mm) 1456 { 1457 retry_flush_work: 1458 flush_work(&svms->deferred_list_work); 1459 mmap_write_lock(mm); 1460 1461 if (list_empty(&svms->deferred_range_list)) 1462 return; 1463 mmap_write_unlock(mm); 1464 pr_debug("retry flush\n"); 1465 goto retry_flush_work; 1466 } 1467 1468 static void svm_range_restore_work(struct work_struct *work) 1469 { 1470 struct delayed_work *dwork = to_delayed_work(work); 1471 struct amdkfd_process_info *process_info; 1472 struct svm_range_list *svms; 1473 struct svm_range *prange; 1474 struct kfd_process *p; 1475 struct mm_struct *mm; 1476 int evicted_ranges; 1477 int invalid; 1478 int r; 1479 1480 svms = container_of(dwork, struct svm_range_list, restore_work); 1481 evicted_ranges = atomic_read(&svms->evicted_ranges); 1482 if (!evicted_ranges) 1483 return; 1484 1485 pr_debug("restore svm ranges\n"); 1486 1487 /* kfd_process_notifier_release destroys this worker thread. So during 1488 * the lifetime of this thread, kfd_process and mm will be valid. 1489 */ 1490 p = container_of(svms, struct kfd_process, svms); 1491 process_info = p->kgd_process_info; 1492 mm = p->mm; 1493 if (!mm) 1494 return; 1495 1496 mutex_lock(&process_info->lock); 1497 svm_range_list_lock_and_flush_work(svms, mm); 1498 mutex_lock(&svms->lock); 1499 1500 evicted_ranges = atomic_read(&svms->evicted_ranges); 1501 1502 list_for_each_entry(prange, &svms->list, list) { 1503 invalid = atomic_read(&prange->invalid); 1504 if (!invalid) 1505 continue; 1506 1507 pr_debug("restoring svms 0x%p prange 0x%p [0x%lx %lx] inv %d\n", 1508 prange->svms, prange, prange->start, prange->last, 1509 invalid); 1510 1511 /* 1512 * If range is migrating, wait for migration is done. 1513 */ 1514 mutex_lock(&prange->migrate_mutex); 1515 1516 r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE, 1517 false, true); 1518 if (r) 1519 pr_debug("failed %d to map 0x%lx to gpus\n", r, 1520 prange->start); 1521 1522 mutex_unlock(&prange->migrate_mutex); 1523 if (r) 1524 goto out_reschedule; 1525 1526 if (atomic_cmpxchg(&prange->invalid, invalid, 0) != invalid) 1527 goto out_reschedule; 1528 } 1529 1530 if (atomic_cmpxchg(&svms->evicted_ranges, evicted_ranges, 0) != 1531 evicted_ranges) 1532 goto out_reschedule; 1533 1534 evicted_ranges = 0; 1535 1536 r = kgd2kfd_resume_mm(mm); 1537 if (r) { 1538 /* No recovery from this failure. Probably the CP is 1539 * hanging. No point trying again. 1540 */ 1541 pr_debug("failed %d to resume KFD\n", r); 1542 } 1543 1544 pr_debug("restore svm ranges successfully\n"); 1545 1546 out_reschedule: 1547 mutex_unlock(&svms->lock); 1548 mmap_write_unlock(mm); 1549 mutex_unlock(&process_info->lock); 1550 1551 /* If validation failed, reschedule another attempt */ 1552 if (evicted_ranges) { 1553 pr_debug("reschedule to restore svm range\n"); 1554 schedule_delayed_work(&svms->restore_work, 1555 msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS)); 1556 } 1557 } 1558 1559 /** 1560 * svm_range_evict - evict svm range 1561 * 1562 * Stop all queues of the process to ensure GPU doesn't access the memory, then 1563 * return to let CPU evict the buffer and proceed CPU pagetable update. 1564 * 1565 * Don't need use lock to sync cpu pagetable invalidation with GPU execution. 1566 * If invalidation happens while restore work is running, restore work will 1567 * restart to ensure to get the latest CPU pages mapping to GPU, then start 1568 * the queues. 1569 */ 1570 static int 1571 svm_range_evict(struct svm_range *prange, struct mm_struct *mm, 1572 unsigned long start, unsigned long last) 1573 { 1574 struct svm_range_list *svms = prange->svms; 1575 struct kfd_process *p; 1576 int r = 0; 1577 1578 p = container_of(svms, struct kfd_process, svms); 1579 1580 pr_debug("invalidate svms 0x%p prange [0x%lx 0x%lx] [0x%lx 0x%lx]\n", 1581 svms, prange->start, prange->last, start, last); 1582 1583 if (!p->xnack_enabled) { 1584 int evicted_ranges; 1585 1586 atomic_inc(&prange->invalid); 1587 evicted_ranges = atomic_inc_return(&svms->evicted_ranges); 1588 if (evicted_ranges != 1) 1589 return r; 1590 1591 pr_debug("evicting svms 0x%p range [0x%lx 0x%lx]\n", 1592 prange->svms, prange->start, prange->last); 1593 1594 /* First eviction, stop the queues */ 1595 r = kgd2kfd_quiesce_mm(mm); 1596 if (r) 1597 pr_debug("failed to quiesce KFD\n"); 1598 1599 pr_debug("schedule to restore svm %p ranges\n", svms); 1600 schedule_delayed_work(&svms->restore_work, 1601 msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS)); 1602 } else { 1603 struct svm_range *pchild; 1604 unsigned long s, l; 1605 1606 pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n", 1607 prange->svms, start, last); 1608 list_for_each_entry(pchild, &prange->child_list, child_list) { 1609 mutex_lock_nested(&pchild->lock, 1); 1610 s = max(start, pchild->start); 1611 l = min(last, pchild->last); 1612 if (l >= s) 1613 svm_range_unmap_from_gpus(pchild, s, l); 1614 mutex_unlock(&pchild->lock); 1615 } 1616 s = max(start, prange->start); 1617 l = min(last, prange->last); 1618 if (l >= s) 1619 svm_range_unmap_from_gpus(prange, s, l); 1620 } 1621 1622 return r; 1623 } 1624 1625 static struct svm_range *svm_range_clone(struct svm_range *old) 1626 { 1627 struct svm_range *new; 1628 1629 new = svm_range_new(old->svms, old->start, old->last); 1630 if (!new) 1631 return NULL; 1632 1633 if (old->svm_bo) { 1634 new->ttm_res = old->ttm_res; 1635 new->offset = old->offset; 1636 new->svm_bo = svm_range_bo_ref(old->svm_bo); 1637 spin_lock(&new->svm_bo->list_lock); 1638 list_add(&new->svm_bo_list, &new->svm_bo->range_list); 1639 spin_unlock(&new->svm_bo->list_lock); 1640 } 1641 new->flags = old->flags; 1642 new->preferred_loc = old->preferred_loc; 1643 new->prefetch_loc = old->prefetch_loc; 1644 new->actual_loc = old->actual_loc; 1645 new->granularity = old->granularity; 1646 bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); 1647 bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE); 1648 1649 return new; 1650 } 1651 1652 /** 1653 * svm_range_handle_overlap - split overlap ranges 1654 * @svms: svm range list header 1655 * @new: range added with this attributes 1656 * @start: range added start address, in pages 1657 * @last: range last address, in pages 1658 * @update_list: output, the ranges attributes are updated. For set_attr, this 1659 * will do validation and map to GPUs. For unmap, this will be 1660 * removed and unmap from GPUs 1661 * @insert_list: output, the ranges will be inserted into svms, attributes are 1662 * not changes. For set_attr, this will add into svms. 1663 * @remove_list:output, the ranges will be removed from svms 1664 * @left: the remaining range after overlap, For set_attr, this will be added 1665 * as new range. 1666 * 1667 * Total have 5 overlap cases. 1668 * 1669 * This function handles overlap of an address interval with existing 1670 * struct svm_ranges for applying new attributes. This may require 1671 * splitting existing struct svm_ranges. All changes should be applied to 1672 * the range_list and interval tree transactionally. If any split operation 1673 * fails, the entire update fails. Therefore the existing overlapping 1674 * svm_ranges are cloned and the original svm_ranges left unchanged. If the 1675 * transaction succeeds, the modified clones are added and the originals 1676 * freed. Otherwise the clones are removed and the old svm_ranges remain. 1677 * 1678 * Context: The caller must hold svms->lock 1679 */ 1680 static int 1681 svm_range_handle_overlap(struct svm_range_list *svms, struct svm_range *new, 1682 unsigned long start, unsigned long last, 1683 struct list_head *update_list, 1684 struct list_head *insert_list, 1685 struct list_head *remove_list, 1686 unsigned long *left) 1687 { 1688 struct interval_tree_node *node; 1689 struct svm_range *prange; 1690 struct svm_range *tmp; 1691 int r = 0; 1692 1693 INIT_LIST_HEAD(update_list); 1694 INIT_LIST_HEAD(insert_list); 1695 INIT_LIST_HEAD(remove_list); 1696 1697 node = interval_tree_iter_first(&svms->objects, start, last); 1698 while (node) { 1699 struct interval_tree_node *next; 1700 struct svm_range *old; 1701 unsigned long next_start; 1702 1703 pr_debug("found overlap node [0x%lx 0x%lx]\n", node->start, 1704 node->last); 1705 1706 old = container_of(node, struct svm_range, it_node); 1707 next = interval_tree_iter_next(node, start, last); 1708 next_start = min(node->last, last) + 1; 1709 1710 if (node->start < start || node->last > last) { 1711 /* node intersects the updated range, clone+split it */ 1712 prange = svm_range_clone(old); 1713 if (!prange) { 1714 r = -ENOMEM; 1715 goto out; 1716 } 1717 1718 list_add(&old->remove_list, remove_list); 1719 list_add(&prange->insert_list, insert_list); 1720 1721 if (node->start < start) { 1722 pr_debug("change old range start\n"); 1723 r = svm_range_split_head(prange, new, start, 1724 insert_list); 1725 if (r) 1726 goto out; 1727 } 1728 if (node->last > last) { 1729 pr_debug("change old range last\n"); 1730 r = svm_range_split_tail(prange, new, last, 1731 insert_list); 1732 if (r) 1733 goto out; 1734 } 1735 } else { 1736 /* The node is contained within start..last, 1737 * just update it 1738 */ 1739 prange = old; 1740 } 1741 1742 if (!svm_range_is_same_attrs(prange, new)) 1743 list_add(&prange->update_list, update_list); 1744 1745 /* insert a new node if needed */ 1746 if (node->start > start) { 1747 prange = svm_range_new(prange->svms, start, 1748 node->start - 1); 1749 if (!prange) { 1750 r = -ENOMEM; 1751 goto out; 1752 } 1753 1754 list_add(&prange->insert_list, insert_list); 1755 list_add(&prange->update_list, update_list); 1756 } 1757 1758 node = next; 1759 start = next_start; 1760 } 1761 1762 if (left && start <= last) 1763 *left = last - start + 1; 1764 1765 out: 1766 if (r) 1767 list_for_each_entry_safe(prange, tmp, insert_list, insert_list) 1768 svm_range_free(prange); 1769 1770 return r; 1771 } 1772 1773 static void 1774 svm_range_update_notifier_and_interval_tree(struct mm_struct *mm, 1775 struct svm_range *prange) 1776 { 1777 unsigned long start; 1778 unsigned long last; 1779 1780 start = prange->notifier.interval_tree.start >> PAGE_SHIFT; 1781 last = prange->notifier.interval_tree.last >> PAGE_SHIFT; 1782 1783 if (prange->start == start && prange->last == last) 1784 return; 1785 1786 pr_debug("up notifier 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", 1787 prange->svms, prange, start, last, prange->start, 1788 prange->last); 1789 1790 if (start != 0 && last != 0) { 1791 interval_tree_remove(&prange->it_node, &prange->svms->objects); 1792 svm_range_remove_notifier(prange); 1793 } 1794 prange->it_node.start = prange->start; 1795 prange->it_node.last = prange->last; 1796 1797 interval_tree_insert(&prange->it_node, &prange->svms->objects); 1798 svm_range_add_notifier_locked(mm, prange); 1799 } 1800 1801 static void 1802 svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange) 1803 { 1804 struct mm_struct *mm = prange->work_item.mm; 1805 1806 switch (prange->work_item.op) { 1807 case SVM_OP_NULL: 1808 pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n", 1809 svms, prange, prange->start, prange->last); 1810 break; 1811 case SVM_OP_UNMAP_RANGE: 1812 pr_debug("remove 0x%p prange 0x%p [0x%lx 0x%lx]\n", 1813 svms, prange, prange->start, prange->last); 1814 svm_range_unlink(prange); 1815 svm_range_remove_notifier(prange); 1816 svm_range_free(prange); 1817 break; 1818 case SVM_OP_UPDATE_RANGE_NOTIFIER: 1819 pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n", 1820 svms, prange, prange->start, prange->last); 1821 svm_range_update_notifier_and_interval_tree(mm, prange); 1822 break; 1823 case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP: 1824 pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", 1825 svms, prange, prange->start, prange->last); 1826 svm_range_update_notifier_and_interval_tree(mm, prange); 1827 /* TODO: implement deferred validation and mapping */ 1828 break; 1829 case SVM_OP_ADD_RANGE: 1830 pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange, 1831 prange->start, prange->last); 1832 svm_range_add_to_svms(prange); 1833 svm_range_add_notifier_locked(mm, prange); 1834 break; 1835 case SVM_OP_ADD_RANGE_AND_MAP: 1836 pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, 1837 prange, prange->start, prange->last); 1838 svm_range_add_to_svms(prange); 1839 svm_range_add_notifier_locked(mm, prange); 1840 /* TODO: implement deferred validation and mapping */ 1841 break; 1842 default: 1843 WARN_ONCE(1, "Unknown prange 0x%p work op %d\n", prange, 1844 prange->work_item.op); 1845 } 1846 } 1847 1848 static void svm_range_drain_retry_fault(struct svm_range_list *svms) 1849 { 1850 struct kfd_process_device *pdd; 1851 struct amdgpu_device *adev; 1852 struct kfd_process *p; 1853 uint32_t i; 1854 1855 p = container_of(svms, struct kfd_process, svms); 1856 1857 for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) { 1858 pdd = p->pdds[i]; 1859 if (!pdd) 1860 continue; 1861 1862 pr_debug("drain retry fault gpu %d svms %p\n", i, svms); 1863 adev = (struct amdgpu_device *)pdd->dev->kgd; 1864 1865 amdgpu_ih_wait_on_checkpoint_process(adev, &adev->irq.ih1); 1866 pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms); 1867 } 1868 } 1869 1870 static void svm_range_deferred_list_work(struct work_struct *work) 1871 { 1872 struct svm_range_list *svms; 1873 struct svm_range *prange; 1874 struct mm_struct *mm; 1875 1876 svms = container_of(work, struct svm_range_list, deferred_list_work); 1877 pr_debug("enter svms 0x%p\n", svms); 1878 1879 spin_lock(&svms->deferred_list_lock); 1880 while (!list_empty(&svms->deferred_range_list)) { 1881 prange = list_first_entry(&svms->deferred_range_list, 1882 struct svm_range, deferred_list); 1883 spin_unlock(&svms->deferred_list_lock); 1884 pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange, 1885 prange->start, prange->last, prange->work_item.op); 1886 1887 /* Make sure no stale retry fault coming after range is freed */ 1888 if (prange->work_item.op == SVM_OP_UNMAP_RANGE) 1889 svm_range_drain_retry_fault(prange->svms); 1890 1891 mm = prange->work_item.mm; 1892 mmap_write_lock(mm); 1893 mutex_lock(&svms->lock); 1894 1895 /* Remove from deferred_list must be inside mmap write lock, 1896 * otherwise, svm_range_list_lock_and_flush_work may hold mmap 1897 * write lock, and continue because deferred_list is empty, then 1898 * deferred_list handle is blocked by mmap write lock. 1899 */ 1900 spin_lock(&svms->deferred_list_lock); 1901 list_del_init(&prange->deferred_list); 1902 spin_unlock(&svms->deferred_list_lock); 1903 1904 mutex_lock(&prange->migrate_mutex); 1905 while (!list_empty(&prange->child_list)) { 1906 struct svm_range *pchild; 1907 1908 pchild = list_first_entry(&prange->child_list, 1909 struct svm_range, child_list); 1910 pr_debug("child prange 0x%p op %d\n", pchild, 1911 pchild->work_item.op); 1912 list_del_init(&pchild->child_list); 1913 svm_range_handle_list_op(svms, pchild); 1914 } 1915 mutex_unlock(&prange->migrate_mutex); 1916 1917 svm_range_handle_list_op(svms, prange); 1918 mutex_unlock(&svms->lock); 1919 mmap_write_unlock(mm); 1920 1921 spin_lock(&svms->deferred_list_lock); 1922 } 1923 spin_unlock(&svms->deferred_list_lock); 1924 1925 pr_debug("exit svms 0x%p\n", svms); 1926 } 1927 1928 void 1929 svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange, 1930 struct mm_struct *mm, enum svm_work_list_ops op) 1931 { 1932 spin_lock(&svms->deferred_list_lock); 1933 /* if prange is on the deferred list */ 1934 if (!list_empty(&prange->deferred_list)) { 1935 pr_debug("update exist prange 0x%p work op %d\n", prange, op); 1936 WARN_ONCE(prange->work_item.mm != mm, "unmatch mm\n"); 1937 if (op != SVM_OP_NULL && 1938 prange->work_item.op != SVM_OP_UNMAP_RANGE) 1939 prange->work_item.op = op; 1940 } else { 1941 prange->work_item.op = op; 1942 prange->work_item.mm = mm; 1943 list_add_tail(&prange->deferred_list, 1944 &prange->svms->deferred_range_list); 1945 pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n", 1946 prange, prange->start, prange->last, op); 1947 } 1948 spin_unlock(&svms->deferred_list_lock); 1949 } 1950 1951 void schedule_deferred_list_work(struct svm_range_list *svms) 1952 { 1953 spin_lock(&svms->deferred_list_lock); 1954 if (!list_empty(&svms->deferred_range_list)) 1955 schedule_work(&svms->deferred_list_work); 1956 spin_unlock(&svms->deferred_list_lock); 1957 } 1958 1959 static void 1960 svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent, 1961 struct svm_range *prange, unsigned long start, 1962 unsigned long last) 1963 { 1964 struct svm_range *head; 1965 struct svm_range *tail; 1966 1967 if (prange->work_item.op == SVM_OP_UNMAP_RANGE) { 1968 pr_debug("prange 0x%p [0x%lx 0x%lx] is already freed\n", prange, 1969 prange->start, prange->last); 1970 return; 1971 } 1972 if (start > prange->last || last < prange->start) 1973 return; 1974 1975 head = tail = prange; 1976 if (start > prange->start) 1977 svm_range_split(prange, prange->start, start - 1, &tail); 1978 if (last < tail->last) 1979 svm_range_split(tail, last + 1, tail->last, &head); 1980 1981 if (head != prange && tail != prange) { 1982 svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); 1983 svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE); 1984 } else if (tail != prange) { 1985 svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE); 1986 } else if (head != prange) { 1987 svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); 1988 } else if (parent != prange) { 1989 prange->work_item.op = SVM_OP_UNMAP_RANGE; 1990 } 1991 } 1992 1993 static void 1994 svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange, 1995 unsigned long start, unsigned long last) 1996 { 1997 struct svm_range_list *svms; 1998 struct svm_range *pchild; 1999 struct kfd_process *p; 2000 unsigned long s, l; 2001 bool unmap_parent; 2002 2003 p = kfd_lookup_process_by_mm(mm); 2004 if (!p) 2005 return; 2006 svms = &p->svms; 2007 2008 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms, 2009 prange, prange->start, prange->last, start, last); 2010 2011 unmap_parent = start <= prange->start && last >= prange->last; 2012 2013 list_for_each_entry(pchild, &prange->child_list, child_list) { 2014 mutex_lock_nested(&pchild->lock, 1); 2015 s = max(start, pchild->start); 2016 l = min(last, pchild->last); 2017 if (l >= s) 2018 svm_range_unmap_from_gpus(pchild, s, l); 2019 svm_range_unmap_split(mm, prange, pchild, start, last); 2020 mutex_unlock(&pchild->lock); 2021 } 2022 s = max(start, prange->start); 2023 l = min(last, prange->last); 2024 if (l >= s) 2025 svm_range_unmap_from_gpus(prange, s, l); 2026 svm_range_unmap_split(mm, prange, prange, start, last); 2027 2028 if (unmap_parent) 2029 svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE); 2030 else 2031 svm_range_add_list_work(svms, prange, mm, 2032 SVM_OP_UPDATE_RANGE_NOTIFIER); 2033 schedule_deferred_list_work(svms); 2034 2035 kfd_unref_process(p); 2036 } 2037 2038 /** 2039 * svm_range_cpu_invalidate_pagetables - interval notifier callback 2040 * 2041 * If event is MMU_NOTIFY_UNMAP, this is from CPU unmap range, otherwise, it 2042 * is from migration, or CPU page invalidation callback. 2043 * 2044 * For unmap event, unmap range from GPUs, remove prange from svms in a delayed 2045 * work thread, and split prange if only part of prange is unmapped. 2046 * 2047 * For invalidation event, if GPU retry fault is not enabled, evict the queues, 2048 * then schedule svm_range_restore_work to update GPU mapping and resume queues. 2049 * If GPU retry fault is enabled, unmap the svm range from GPU, retry fault will 2050 * update GPU mapping to recover. 2051 * 2052 * Context: mmap lock, notifier_invalidate_start lock are held 2053 * for invalidate event, prange lock is held if this is from migration 2054 */ 2055 static bool 2056 svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, 2057 const struct mmu_notifier_range *range, 2058 unsigned long cur_seq) 2059 { 2060 struct svm_range *prange; 2061 unsigned long start; 2062 unsigned long last; 2063 2064 if (range->event == MMU_NOTIFY_RELEASE) 2065 return true; 2066 2067 start = mni->interval_tree.start; 2068 last = mni->interval_tree.last; 2069 start = (start > range->start ? start : range->start) >> PAGE_SHIFT; 2070 last = (last < (range->end - 1) ? last : range->end - 1) >> PAGE_SHIFT; 2071 pr_debug("[0x%lx 0x%lx] range[0x%lx 0x%lx] notifier[0x%lx 0x%lx] %d\n", 2072 start, last, range->start >> PAGE_SHIFT, 2073 (range->end - 1) >> PAGE_SHIFT, 2074 mni->interval_tree.start >> PAGE_SHIFT, 2075 mni->interval_tree.last >> PAGE_SHIFT, range->event); 2076 2077 prange = container_of(mni, struct svm_range, notifier); 2078 2079 svm_range_lock(prange); 2080 mmu_interval_set_seq(mni, cur_seq); 2081 2082 switch (range->event) { 2083 case MMU_NOTIFY_UNMAP: 2084 svm_range_unmap_from_cpu(mni->mm, prange, start, last); 2085 break; 2086 default: 2087 svm_range_evict(prange, mni->mm, start, last); 2088 break; 2089 } 2090 2091 svm_range_unlock(prange); 2092 2093 return true; 2094 } 2095 2096 /** 2097 * svm_range_from_addr - find svm range from fault address 2098 * @svms: svm range list header 2099 * @addr: address to search range interval tree, in pages 2100 * @parent: parent range if range is on child list 2101 * 2102 * Context: The caller must hold svms->lock 2103 * 2104 * Return: the svm_range found or NULL 2105 */ 2106 struct svm_range * 2107 svm_range_from_addr(struct svm_range_list *svms, unsigned long addr, 2108 struct svm_range **parent) 2109 { 2110 struct interval_tree_node *node; 2111 struct svm_range *prange; 2112 struct svm_range *pchild; 2113 2114 node = interval_tree_iter_first(&svms->objects, addr, addr); 2115 if (!node) 2116 return NULL; 2117 2118 prange = container_of(node, struct svm_range, it_node); 2119 pr_debug("address 0x%lx prange [0x%lx 0x%lx] node [0x%lx 0x%lx]\n", 2120 addr, prange->start, prange->last, node->start, node->last); 2121 2122 if (addr >= prange->start && addr <= prange->last) { 2123 if (parent) 2124 *parent = prange; 2125 return prange; 2126 } 2127 list_for_each_entry(pchild, &prange->child_list, child_list) 2128 if (addr >= pchild->start && addr <= pchild->last) { 2129 pr_debug("found address 0x%lx pchild [0x%lx 0x%lx]\n", 2130 addr, pchild->start, pchild->last); 2131 if (parent) 2132 *parent = prange; 2133 return pchild; 2134 } 2135 2136 return NULL; 2137 } 2138 2139 /* svm_range_best_restore_location - decide the best fault restore location 2140 * @prange: svm range structure 2141 * @adev: the GPU on which vm fault happened 2142 * 2143 * This is only called when xnack is on, to decide the best location to restore 2144 * the range mapping after GPU vm fault. Caller uses the best location to do 2145 * migration if actual loc is not best location, then update GPU page table 2146 * mapping to the best location. 2147 * 2148 * If vm fault gpu is range preferred loc, the best_loc is preferred loc. 2149 * If vm fault gpu idx is on range ACCESSIBLE bitmap, best_loc is vm fault gpu 2150 * If vm fault gpu idx is on range ACCESSIBLE_IN_PLACE bitmap, then 2151 * if range actual loc is cpu, best_loc is cpu 2152 * if vm fault gpu is on xgmi same hive of range actual loc gpu, best_loc is 2153 * range actual loc. 2154 * Otherwise, GPU no access, best_loc is -1. 2155 * 2156 * Return: 2157 * -1 means vm fault GPU no access 2158 * 0 for CPU or GPU id 2159 */ 2160 static int32_t 2161 svm_range_best_restore_location(struct svm_range *prange, 2162 struct amdgpu_device *adev, 2163 int32_t *gpuidx) 2164 { 2165 struct amdgpu_device *bo_adev; 2166 struct kfd_process *p; 2167 uint32_t gpuid; 2168 int r; 2169 2170 p = container_of(prange->svms, struct kfd_process, svms); 2171 2172 r = kfd_process_gpuid_from_kgd(p, adev, &gpuid, gpuidx); 2173 if (r < 0) { 2174 pr_debug("failed to get gpuid from kgd\n"); 2175 return -1; 2176 } 2177 2178 if (prange->preferred_loc == gpuid) 2179 return prange->preferred_loc; 2180 2181 if (test_bit(*gpuidx, prange->bitmap_access)) 2182 return gpuid; 2183 2184 if (test_bit(*gpuidx, prange->bitmap_aip)) { 2185 if (!prange->actual_loc) 2186 return 0; 2187 2188 bo_adev = svm_range_get_adev_by_id(prange, prange->actual_loc); 2189 if (amdgpu_xgmi_same_hive(adev, bo_adev)) 2190 return prange->actual_loc; 2191 else 2192 return 0; 2193 } 2194 2195 return -1; 2196 } 2197 static int 2198 svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr, 2199 unsigned long *start, unsigned long *last) 2200 { 2201 struct vm_area_struct *vma; 2202 struct interval_tree_node *node; 2203 unsigned long start_limit, end_limit; 2204 2205 vma = find_vma(p->mm, addr << PAGE_SHIFT); 2206 if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) { 2207 pr_debug("VMA does not exist in address [0x%llx]\n", addr); 2208 return -EFAULT; 2209 } 2210 start_limit = max(vma->vm_start >> PAGE_SHIFT, 2211 (unsigned long)ALIGN_DOWN(addr, 2UL << 8)); 2212 end_limit = min(vma->vm_end >> PAGE_SHIFT, 2213 (unsigned long)ALIGN(addr + 1, 2UL << 8)); 2214 /* First range that starts after the fault address */ 2215 node = interval_tree_iter_first(&p->svms.objects, addr + 1, ULONG_MAX); 2216 if (node) { 2217 end_limit = min(end_limit, node->start); 2218 /* Last range that ends before the fault address */ 2219 node = container_of(rb_prev(&node->rb), 2220 struct interval_tree_node, rb); 2221 } else { 2222 /* Last range must end before addr because 2223 * there was no range after addr 2224 */ 2225 node = container_of(rb_last(&p->svms.objects.rb_root), 2226 struct interval_tree_node, rb); 2227 } 2228 if (node) { 2229 if (node->last >= addr) { 2230 WARN(1, "Overlap with prev node and page fault addr\n"); 2231 return -EFAULT; 2232 } 2233 start_limit = max(start_limit, node->last + 1); 2234 } 2235 2236 *start = start_limit; 2237 *last = end_limit - 1; 2238 2239 pr_debug("vma start: 0x%lx start: 0x%lx vma end: 0x%lx last: 0x%lx\n", 2240 vma->vm_start >> PAGE_SHIFT, *start, 2241 vma->vm_end >> PAGE_SHIFT, *last); 2242 2243 return 0; 2244 2245 } 2246 static struct 2247 svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev, 2248 struct kfd_process *p, 2249 struct mm_struct *mm, 2250 int64_t addr) 2251 { 2252 struct svm_range *prange = NULL; 2253 unsigned long start, last; 2254 uint32_t gpuid, gpuidx; 2255 2256 if (svm_range_get_range_boundaries(p, addr, &start, &last)) 2257 return NULL; 2258 2259 prange = svm_range_new(&p->svms, start, last); 2260 if (!prange) { 2261 pr_debug("Failed to create prange in address [0x%llx]\n", addr); 2262 return NULL; 2263 } 2264 if (kfd_process_gpuid_from_kgd(p, adev, &gpuid, &gpuidx)) { 2265 pr_debug("failed to get gpuid from kgd\n"); 2266 svm_range_free(prange); 2267 return NULL; 2268 } 2269 2270 svm_range_add_to_svms(prange); 2271 svm_range_add_notifier_locked(mm, prange); 2272 2273 return prange; 2274 } 2275 2276 /* svm_range_skip_recover - decide if prange can be recovered 2277 * @prange: svm range structure 2278 * 2279 * GPU vm retry fault handle skip recover the range for cases: 2280 * 1. prange is on deferred list to be removed after unmap, it is stale fault, 2281 * deferred list work will drain the stale fault before free the prange. 2282 * 2. prange is on deferred list to add interval notifier after split, or 2283 * 3. prange is child range, it is split from parent prange, recover later 2284 * after interval notifier is added. 2285 * 2286 * Return: true to skip recover, false to recover 2287 */ 2288 static bool svm_range_skip_recover(struct svm_range *prange) 2289 { 2290 struct svm_range_list *svms = prange->svms; 2291 2292 spin_lock(&svms->deferred_list_lock); 2293 if (list_empty(&prange->deferred_list) && 2294 list_empty(&prange->child_list)) { 2295 spin_unlock(&svms->deferred_list_lock); 2296 return false; 2297 } 2298 spin_unlock(&svms->deferred_list_lock); 2299 2300 if (prange->work_item.op == SVM_OP_UNMAP_RANGE) { 2301 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] unmapped\n", 2302 svms, prange, prange->start, prange->last); 2303 return true; 2304 } 2305 if (prange->work_item.op == SVM_OP_ADD_RANGE_AND_MAP || 2306 prange->work_item.op == SVM_OP_ADD_RANGE) { 2307 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not added yet\n", 2308 svms, prange, prange->start, prange->last); 2309 return true; 2310 } 2311 return false; 2312 } 2313 2314 int 2315 svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, 2316 uint64_t addr) 2317 { 2318 struct mm_struct *mm = NULL; 2319 struct svm_range_list *svms; 2320 struct svm_range *prange; 2321 struct kfd_process *p; 2322 uint64_t timestamp; 2323 int32_t best_loc, gpuidx; 2324 bool write_locked = false; 2325 int r = 0; 2326 2327 if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) { 2328 pr_debug("device does not support SVM\n"); 2329 return -EFAULT; 2330 } 2331 2332 p = kfd_lookup_process_by_pasid(pasid); 2333 if (!p) { 2334 pr_debug("kfd process not founded pasid 0x%x\n", pasid); 2335 return -ESRCH; 2336 } 2337 if (!p->xnack_enabled) { 2338 pr_debug("XNACK not enabled for pasid 0x%x\n", pasid); 2339 return -EFAULT; 2340 } 2341 svms = &p->svms; 2342 2343 pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr); 2344 2345 mm = get_task_mm(p->lead_thread); 2346 if (!mm) { 2347 pr_debug("svms 0x%p failed to get mm\n", svms); 2348 r = -ESRCH; 2349 goto out; 2350 } 2351 2352 mmap_read_lock(mm); 2353 retry_write_locked: 2354 mutex_lock(&svms->lock); 2355 prange = svm_range_from_addr(svms, addr, NULL); 2356 if (!prange) { 2357 pr_debug("failed to find prange svms 0x%p address [0x%llx]\n", 2358 svms, addr); 2359 if (!write_locked) { 2360 /* Need the write lock to create new range with MMU notifier. 2361 * Also flush pending deferred work to make sure the interval 2362 * tree is up to date before we add a new range 2363 */ 2364 mutex_unlock(&svms->lock); 2365 mmap_read_unlock(mm); 2366 mmap_write_lock(mm); 2367 write_locked = true; 2368 goto retry_write_locked; 2369 } 2370 prange = svm_range_create_unregistered_range(adev, p, mm, addr); 2371 if (!prange) { 2372 pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n", 2373 svms, addr); 2374 mmap_write_downgrade(mm); 2375 r = -EFAULT; 2376 goto out_unlock_svms; 2377 } 2378 } 2379 if (write_locked) 2380 mmap_write_downgrade(mm); 2381 2382 mutex_lock(&prange->migrate_mutex); 2383 2384 if (svm_range_skip_recover(prange)) { 2385 amdgpu_gmc_filter_faults_remove(adev, addr, pasid); 2386 goto out_unlock_range; 2387 } 2388 2389 timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp; 2390 /* skip duplicate vm fault on different pages of same range */ 2391 if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) { 2392 pr_debug("svms 0x%p [0x%lx %lx] already restored\n", 2393 svms, prange->start, prange->last); 2394 goto out_unlock_range; 2395 } 2396 2397 best_loc = svm_range_best_restore_location(prange, adev, &gpuidx); 2398 if (best_loc == -1) { 2399 pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n", 2400 svms, prange->start, prange->last); 2401 r = -EACCES; 2402 goto out_unlock_range; 2403 } 2404 2405 pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n", 2406 svms, prange->start, prange->last, best_loc, 2407 prange->actual_loc); 2408 2409 if (prange->actual_loc != best_loc) { 2410 if (best_loc) { 2411 r = svm_migrate_to_vram(prange, best_loc, mm); 2412 if (r) { 2413 pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n", 2414 r, addr); 2415 /* Fallback to system memory if migration to 2416 * VRAM failed 2417 */ 2418 if (prange->actual_loc) 2419 r = svm_migrate_vram_to_ram(prange, mm); 2420 else 2421 r = 0; 2422 } 2423 } else { 2424 r = svm_migrate_vram_to_ram(prange, mm); 2425 } 2426 if (r) { 2427 pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n", 2428 r, svms, prange->start, prange->last); 2429 goto out_unlock_range; 2430 } 2431 } 2432 2433 r = svm_range_validate_and_map(mm, prange, gpuidx, false, false); 2434 if (r) 2435 pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n", 2436 r, svms, prange->start, prange->last); 2437 2438 out_unlock_range: 2439 mutex_unlock(&prange->migrate_mutex); 2440 out_unlock_svms: 2441 mutex_unlock(&svms->lock); 2442 mmap_read_unlock(mm); 2443 mmput(mm); 2444 out: 2445 kfd_unref_process(p); 2446 2447 if (r == -EAGAIN) { 2448 pr_debug("recover vm fault later\n"); 2449 amdgpu_gmc_filter_faults_remove(adev, addr, pasid); 2450 r = 0; 2451 } 2452 return r; 2453 } 2454 2455 void svm_range_list_fini(struct kfd_process *p) 2456 { 2457 struct svm_range *prange; 2458 struct svm_range *next; 2459 2460 pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms); 2461 2462 /* Ensure list work is finished before process is destroyed */ 2463 flush_work(&p->svms.deferred_list_work); 2464 2465 list_for_each_entry_safe(prange, next, &p->svms.list, list) { 2466 svm_range_unlink(prange); 2467 svm_range_remove_notifier(prange); 2468 svm_range_free(prange); 2469 } 2470 2471 mutex_destroy(&p->svms.lock); 2472 2473 pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms); 2474 } 2475 2476 int svm_range_list_init(struct kfd_process *p) 2477 { 2478 struct svm_range_list *svms = &p->svms; 2479 int i; 2480 2481 svms->objects = RB_ROOT_CACHED; 2482 mutex_init(&svms->lock); 2483 INIT_LIST_HEAD(&svms->list); 2484 atomic_set(&svms->evicted_ranges, 0); 2485 INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work); 2486 INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work); 2487 INIT_LIST_HEAD(&svms->deferred_range_list); 2488 spin_lock_init(&svms->deferred_list_lock); 2489 2490 for (i = 0; i < p->n_pdds; i++) 2491 if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev)) 2492 bitmap_set(svms->bitmap_supported, i, 1); 2493 2494 return 0; 2495 } 2496 2497 /** 2498 * svm_range_is_valid - check if virtual address range is valid 2499 * @mm: current process mm_struct 2500 * @start: range start address, in pages 2501 * @size: range size, in pages 2502 * 2503 * Valid virtual address range means it belongs to one or more VMAs 2504 * 2505 * Context: Process context 2506 * 2507 * Return: 2508 * true - valid svm range 2509 * false - invalid svm range 2510 */ 2511 static bool 2512 svm_range_is_valid(struct mm_struct *mm, uint64_t start, uint64_t size) 2513 { 2514 const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; 2515 struct vm_area_struct *vma; 2516 unsigned long end; 2517 2518 start <<= PAGE_SHIFT; 2519 end = start + (size << PAGE_SHIFT); 2520 2521 do { 2522 vma = find_vma(mm, start); 2523 if (!vma || start < vma->vm_start || 2524 (vma->vm_flags & device_vma)) 2525 return false; 2526 start = min(end, vma->vm_end); 2527 } while (start < end); 2528 2529 return true; 2530 } 2531 2532 /** 2533 * svm_range_add - add svm range and handle overlap 2534 * @p: the range add to this process svms 2535 * @start: page size aligned 2536 * @size: page size aligned 2537 * @nattr: number of attributes 2538 * @attrs: array of attributes 2539 * @update_list: output, the ranges need validate and update GPU mapping 2540 * @insert_list: output, the ranges need insert to svms 2541 * @remove_list: output, the ranges are replaced and need remove from svms 2542 * 2543 * Check if the virtual address range has overlap with the registered ranges, 2544 * split the overlapped range, copy and adjust pages address and vram nodes in 2545 * old and new ranges. 2546 * 2547 * Context: Process context, caller must hold svms->lock 2548 * 2549 * Return: 2550 * 0 - OK, otherwise error code 2551 */ 2552 static int 2553 svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size, 2554 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs, 2555 struct list_head *update_list, struct list_head *insert_list, 2556 struct list_head *remove_list) 2557 { 2558 uint64_t last = start + size - 1UL; 2559 struct svm_range_list *svms; 2560 struct svm_range new = {0}; 2561 struct svm_range *prange; 2562 unsigned long left = 0; 2563 int r = 0; 2564 2565 pr_debug("svms 0x%p [0x%llx 0x%llx]\n", &p->svms, start, last); 2566 2567 svm_range_apply_attrs(p, &new, nattr, attrs); 2568 2569 svms = &p->svms; 2570 2571 r = svm_range_handle_overlap(svms, &new, start, last, update_list, 2572 insert_list, remove_list, &left); 2573 if (r) 2574 return r; 2575 2576 if (left) { 2577 prange = svm_range_new(svms, last - left + 1, last); 2578 list_add(&prange->insert_list, insert_list); 2579 list_add(&prange->update_list, update_list); 2580 } 2581 2582 return 0; 2583 } 2584 2585 /* svm_range_best_prefetch_location - decide the best prefetch location 2586 * @prange: svm range structure 2587 * 2588 * For xnack off: 2589 * If range map to single GPU, the best acutal location is prefetch loc, which 2590 * can be CPU or GPU. 2591 * 2592 * If range map to multiple GPUs, only if mGPU connection on xgmi same hive, 2593 * the best actual location could be prefetch_loc GPU. If mGPU connection on 2594 * PCIe, the best actual location is always CPU, because GPU cannot access vram 2595 * of other GPUs, assuming PCIe small bar (large bar support is not upstream). 2596 * 2597 * For xnack on: 2598 * The best actual location is prefetch location. If mGPU connection on xgmi 2599 * same hive, range map to multiple GPUs. Otherwise, the range only map to 2600 * actual location GPU. Other GPU access vm fault will trigger migration. 2601 * 2602 * Context: Process context 2603 * 2604 * Return: 2605 * 0 for CPU or GPU id 2606 */ 2607 static uint32_t 2608 svm_range_best_prefetch_location(struct svm_range *prange) 2609 { 2610 DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE); 2611 uint32_t best_loc = prange->prefetch_loc; 2612 struct kfd_process_device *pdd; 2613 struct amdgpu_device *bo_adev; 2614 struct amdgpu_device *adev; 2615 struct kfd_process *p; 2616 uint32_t gpuidx; 2617 2618 p = container_of(prange->svms, struct kfd_process, svms); 2619 2620 /* xnack on */ 2621 if (p->xnack_enabled) 2622 goto out; 2623 2624 /* xnack off */ 2625 if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED) 2626 goto out; 2627 2628 bo_adev = svm_range_get_adev_by_id(prange, best_loc); 2629 if (!bo_adev) { 2630 WARN_ONCE(1, "failed to get device by id 0x%x\n", best_loc); 2631 best_loc = 0; 2632 goto out; 2633 } 2634 bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip, 2635 MAX_GPU_INSTANCE); 2636 2637 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { 2638 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 2639 if (!pdd) { 2640 pr_debug("failed to get device by idx 0x%x\n", gpuidx); 2641 continue; 2642 } 2643 adev = (struct amdgpu_device *)pdd->dev->kgd; 2644 2645 if (adev == bo_adev) 2646 continue; 2647 2648 if (!amdgpu_xgmi_same_hive(adev, bo_adev)) { 2649 best_loc = 0; 2650 break; 2651 } 2652 } 2653 2654 out: 2655 pr_debug("xnack %d svms 0x%p [0x%lx 0x%lx] best loc 0x%x\n", 2656 p->xnack_enabled, &p->svms, prange->start, prange->last, 2657 best_loc); 2658 2659 return best_loc; 2660 } 2661 2662 /* FIXME: This is a workaround for page locking bug when some pages are 2663 * invalid during migration to VRAM 2664 */ 2665 void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm) 2666 { 2667 struct hmm_range *hmm_range; 2668 int r; 2669 2670 if (prange->validated_once) 2671 return; 2672 2673 r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL, 2674 prange->start << PAGE_SHIFT, 2675 prange->npages, &hmm_range, 2676 false, true); 2677 if (!r) { 2678 amdgpu_hmm_range_get_pages_done(hmm_range); 2679 prange->validated_once = true; 2680 } 2681 } 2682 2683 /* svm_range_trigger_migration - start page migration if prefetch loc changed 2684 * @mm: current process mm_struct 2685 * @prange: svm range structure 2686 * @migrated: output, true if migration is triggered 2687 * 2688 * If range perfetch_loc is GPU, actual loc is cpu 0, then migrate the range 2689 * from ram to vram. 2690 * If range prefetch_loc is cpu 0, actual loc is GPU, then migrate the range 2691 * from vram to ram. 2692 * 2693 * If GPU vm fault retry is not enabled, migration interact with MMU notifier 2694 * and restore work: 2695 * 1. migrate_vma_setup invalidate pages, MMU notifier callback svm_range_evict 2696 * stops all queues, schedule restore work 2697 * 2. svm_range_restore_work wait for migration is done by 2698 * a. svm_range_validate_vram takes prange->migrate_mutex 2699 * b. svm_range_validate_ram HMM get pages wait for CPU fault handle returns 2700 * 3. restore work update mappings of GPU, resume all queues. 2701 * 2702 * Context: Process context 2703 * 2704 * Return: 2705 * 0 - OK, otherwise - error code of migration 2706 */ 2707 static int 2708 svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange, 2709 bool *migrated) 2710 { 2711 uint32_t best_loc; 2712 int r = 0; 2713 2714 *migrated = false; 2715 best_loc = svm_range_best_prefetch_location(prange); 2716 2717 if (best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED || 2718 best_loc == prange->actual_loc) 2719 return 0; 2720 2721 /* 2722 * Prefetch to GPU without host access flag, set actual_loc to gpu, then 2723 * validate on gpu and map to gpus will be handled afterwards. 2724 */ 2725 if (best_loc && !prange->actual_loc && 2726 !(prange->flags & KFD_IOCTL_SVM_FLAG_HOST_ACCESS)) { 2727 prange->actual_loc = best_loc; 2728 return 0; 2729 } 2730 2731 if (!best_loc) { 2732 r = svm_migrate_vram_to_ram(prange, mm); 2733 *migrated = !r; 2734 return r; 2735 } 2736 2737 r = svm_migrate_to_vram(prange, best_loc, mm); 2738 *migrated = !r; 2739 2740 return r; 2741 } 2742 2743 int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence) 2744 { 2745 if (!fence) 2746 return -EINVAL; 2747 2748 if (dma_fence_is_signaled(&fence->base)) 2749 return 0; 2750 2751 if (fence->svm_bo) { 2752 WRITE_ONCE(fence->svm_bo->evicting, 1); 2753 schedule_work(&fence->svm_bo->eviction_work); 2754 } 2755 2756 return 0; 2757 } 2758 2759 static void svm_range_evict_svm_bo_worker(struct work_struct *work) 2760 { 2761 struct svm_range_bo *svm_bo; 2762 struct kfd_process *p; 2763 struct mm_struct *mm; 2764 2765 svm_bo = container_of(work, struct svm_range_bo, eviction_work); 2766 if (!svm_bo_ref_unless_zero(svm_bo)) 2767 return; /* svm_bo was freed while eviction was pending */ 2768 2769 /* svm_range_bo_release destroys this worker thread. So during 2770 * the lifetime of this thread, kfd_process and mm will be valid. 2771 */ 2772 p = container_of(svm_bo->svms, struct kfd_process, svms); 2773 mm = p->mm; 2774 if (!mm) 2775 return; 2776 2777 mmap_read_lock(mm); 2778 spin_lock(&svm_bo->list_lock); 2779 while (!list_empty(&svm_bo->range_list)) { 2780 struct svm_range *prange = 2781 list_first_entry(&svm_bo->range_list, 2782 struct svm_range, svm_bo_list); 2783 list_del_init(&prange->svm_bo_list); 2784 spin_unlock(&svm_bo->list_lock); 2785 2786 pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, 2787 prange->start, prange->last); 2788 2789 mutex_lock(&prange->migrate_mutex); 2790 svm_migrate_vram_to_ram(prange, svm_bo->eviction_fence->mm); 2791 2792 mutex_lock(&prange->lock); 2793 prange->svm_bo = NULL; 2794 mutex_unlock(&prange->lock); 2795 2796 mutex_unlock(&prange->migrate_mutex); 2797 2798 spin_lock(&svm_bo->list_lock); 2799 } 2800 spin_unlock(&svm_bo->list_lock); 2801 mmap_read_unlock(mm); 2802 2803 dma_fence_signal(&svm_bo->eviction_fence->base); 2804 /* This is the last reference to svm_bo, after svm_range_vram_node_free 2805 * has been called in svm_migrate_vram_to_ram 2806 */ 2807 WARN_ONCE(kref_read(&svm_bo->kref) != 1, "This was not the last reference\n"); 2808 svm_range_bo_unref(svm_bo); 2809 } 2810 2811 static int 2812 svm_range_set_attr(struct kfd_process *p, uint64_t start, uint64_t size, 2813 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 2814 { 2815 struct amdkfd_process_info *process_info = p->kgd_process_info; 2816 struct mm_struct *mm = current->mm; 2817 struct list_head update_list; 2818 struct list_head insert_list; 2819 struct list_head remove_list; 2820 struct svm_range_list *svms; 2821 struct svm_range *prange; 2822 struct svm_range *next; 2823 int r = 0; 2824 2825 pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n", 2826 p->pasid, &p->svms, start, start + size - 1, size); 2827 2828 r = svm_range_check_attr(p, nattr, attrs); 2829 if (r) 2830 return r; 2831 2832 svms = &p->svms; 2833 2834 mutex_lock(&process_info->lock); 2835 2836 svm_range_list_lock_and_flush_work(svms, mm); 2837 2838 if (!svm_range_is_valid(mm, start, size)) { 2839 pr_debug("invalid range\n"); 2840 r = -EFAULT; 2841 mmap_write_unlock(mm); 2842 goto out; 2843 } 2844 2845 mutex_lock(&svms->lock); 2846 2847 /* Add new range and split existing ranges as needed */ 2848 r = svm_range_add(p, start, size, nattr, attrs, &update_list, 2849 &insert_list, &remove_list); 2850 if (r) { 2851 mutex_unlock(&svms->lock); 2852 mmap_write_unlock(mm); 2853 goto out; 2854 } 2855 /* Apply changes as a transaction */ 2856 list_for_each_entry_safe(prange, next, &insert_list, insert_list) { 2857 svm_range_add_to_svms(prange); 2858 svm_range_add_notifier_locked(mm, prange); 2859 } 2860 list_for_each_entry(prange, &update_list, update_list) { 2861 svm_range_apply_attrs(p, prange, nattr, attrs); 2862 /* TODO: unmap ranges from GPU that lost access */ 2863 } 2864 list_for_each_entry_safe(prange, next, &remove_list, 2865 remove_list) { 2866 pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n", 2867 prange->svms, prange, prange->start, 2868 prange->last); 2869 svm_range_unlink(prange); 2870 svm_range_remove_notifier(prange); 2871 svm_range_free(prange); 2872 } 2873 2874 mmap_write_downgrade(mm); 2875 /* Trigger migrations and revalidate and map to GPUs as needed. If 2876 * this fails we may be left with partially completed actions. There 2877 * is no clean way of rolling back to the previous state in such a 2878 * case because the rollback wouldn't be guaranteed to work either. 2879 */ 2880 list_for_each_entry(prange, &update_list, update_list) { 2881 bool migrated; 2882 2883 mutex_lock(&prange->migrate_mutex); 2884 2885 r = svm_range_trigger_migration(mm, prange, &migrated); 2886 if (r) 2887 goto out_unlock_range; 2888 2889 if (migrated && !p->xnack_enabled) { 2890 pr_debug("restore_work will update mappings of GPUs\n"); 2891 mutex_unlock(&prange->migrate_mutex); 2892 continue; 2893 } 2894 2895 r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE, 2896 true, true); 2897 if (r) 2898 pr_debug("failed %d to map svm range\n", r); 2899 2900 out_unlock_range: 2901 mutex_unlock(&prange->migrate_mutex); 2902 if (r) 2903 break; 2904 } 2905 2906 svm_range_debug_dump(svms); 2907 2908 mutex_unlock(&svms->lock); 2909 mmap_read_unlock(mm); 2910 out: 2911 mutex_unlock(&process_info->lock); 2912 2913 pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid, 2914 &p->svms, start, start + size - 1, r); 2915 2916 return r; 2917 } 2918 2919 static int 2920 svm_range_get_attr(struct kfd_process *p, uint64_t start, uint64_t size, 2921 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 2922 { 2923 DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE); 2924 DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE); 2925 bool get_preferred_loc = false; 2926 bool get_prefetch_loc = false; 2927 bool get_granularity = false; 2928 bool get_accessible = false; 2929 bool get_flags = false; 2930 uint64_t last = start + size - 1UL; 2931 struct mm_struct *mm = current->mm; 2932 uint8_t granularity = 0xff; 2933 struct interval_tree_node *node; 2934 struct svm_range_list *svms; 2935 struct svm_range *prange; 2936 uint32_t prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 2937 uint32_t location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 2938 uint32_t flags = 0xffffffff; 2939 int gpuidx; 2940 uint32_t i; 2941 2942 pr_debug("svms 0x%p [0x%llx 0x%llx] nattr 0x%x\n", &p->svms, start, 2943 start + size - 1, nattr); 2944 2945 mmap_read_lock(mm); 2946 if (!svm_range_is_valid(mm, start, size)) { 2947 pr_debug("invalid range\n"); 2948 mmap_read_unlock(mm); 2949 return -EINVAL; 2950 } 2951 mmap_read_unlock(mm); 2952 2953 for (i = 0; i < nattr; i++) { 2954 switch (attrs[i].type) { 2955 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 2956 get_preferred_loc = true; 2957 break; 2958 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 2959 get_prefetch_loc = true; 2960 break; 2961 case KFD_IOCTL_SVM_ATTR_ACCESS: 2962 get_accessible = true; 2963 break; 2964 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 2965 get_flags = true; 2966 break; 2967 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 2968 get_granularity = true; 2969 break; 2970 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 2971 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 2972 case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 2973 fallthrough; 2974 default: 2975 pr_debug("get invalid attr type 0x%x\n", attrs[i].type); 2976 return -EINVAL; 2977 } 2978 } 2979 2980 svms = &p->svms; 2981 2982 mutex_lock(&svms->lock); 2983 2984 node = interval_tree_iter_first(&svms->objects, start, last); 2985 if (!node) { 2986 pr_debug("range attrs not found return default values\n"); 2987 svm_range_set_default_attributes(&location, &prefetch_loc, 2988 &granularity, &flags); 2989 if (p->xnack_enabled) 2990 bitmap_copy(bitmap_access, svms->bitmap_supported, 2991 MAX_GPU_INSTANCE); 2992 else 2993 bitmap_zero(bitmap_access, MAX_GPU_INSTANCE); 2994 bitmap_zero(bitmap_aip, MAX_GPU_INSTANCE); 2995 goto fill_values; 2996 } 2997 bitmap_copy(bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE); 2998 bitmap_copy(bitmap_aip, svms->bitmap_supported, MAX_GPU_INSTANCE); 2999 3000 while (node) { 3001 struct interval_tree_node *next; 3002 3003 prange = container_of(node, struct svm_range, it_node); 3004 next = interval_tree_iter_next(node, start, last); 3005 3006 if (get_preferred_loc) { 3007 if (prange->preferred_loc == 3008 KFD_IOCTL_SVM_LOCATION_UNDEFINED || 3009 (location != KFD_IOCTL_SVM_LOCATION_UNDEFINED && 3010 location != prange->preferred_loc)) { 3011 location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3012 get_preferred_loc = false; 3013 } else { 3014 location = prange->preferred_loc; 3015 } 3016 } 3017 if (get_prefetch_loc) { 3018 if (prange->prefetch_loc == 3019 KFD_IOCTL_SVM_LOCATION_UNDEFINED || 3020 (prefetch_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED && 3021 prefetch_loc != prange->prefetch_loc)) { 3022 prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3023 get_prefetch_loc = false; 3024 } else { 3025 prefetch_loc = prange->prefetch_loc; 3026 } 3027 } 3028 if (get_accessible) { 3029 bitmap_and(bitmap_access, bitmap_access, 3030 prange->bitmap_access, MAX_GPU_INSTANCE); 3031 bitmap_and(bitmap_aip, bitmap_aip, 3032 prange->bitmap_aip, MAX_GPU_INSTANCE); 3033 } 3034 if (get_flags) 3035 flags &= prange->flags; 3036 3037 if (get_granularity && prange->granularity < granularity) 3038 granularity = prange->granularity; 3039 3040 node = next; 3041 } 3042 fill_values: 3043 mutex_unlock(&svms->lock); 3044 3045 for (i = 0; i < nattr; i++) { 3046 switch (attrs[i].type) { 3047 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 3048 attrs[i].value = location; 3049 break; 3050 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 3051 attrs[i].value = prefetch_loc; 3052 break; 3053 case KFD_IOCTL_SVM_ATTR_ACCESS: 3054 gpuidx = kfd_process_gpuidx_from_gpuid(p, 3055 attrs[i].value); 3056 if (gpuidx < 0) { 3057 pr_debug("invalid gpuid %x\n", attrs[i].value); 3058 return -EINVAL; 3059 } 3060 if (test_bit(gpuidx, bitmap_access)) 3061 attrs[i].type = KFD_IOCTL_SVM_ATTR_ACCESS; 3062 else if (test_bit(gpuidx, bitmap_aip)) 3063 attrs[i].type = 3064 KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE; 3065 else 3066 attrs[i].type = KFD_IOCTL_SVM_ATTR_NO_ACCESS; 3067 break; 3068 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 3069 attrs[i].value = flags; 3070 break; 3071 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 3072 attrs[i].value = (uint32_t)granularity; 3073 break; 3074 } 3075 } 3076 3077 return 0; 3078 } 3079 3080 int 3081 svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start, 3082 uint64_t size, uint32_t nattrs, struct kfd_ioctl_svm_attribute *attrs) 3083 { 3084 int r; 3085 3086 start >>= PAGE_SHIFT; 3087 size >>= PAGE_SHIFT; 3088 3089 switch (op) { 3090 case KFD_IOCTL_SVM_OP_SET_ATTR: 3091 r = svm_range_set_attr(p, start, size, nattrs, attrs); 3092 break; 3093 case KFD_IOCTL_SVM_OP_GET_ATTR: 3094 r = svm_range_get_attr(p, start, size, nattrs, attrs); 3095 break; 3096 default: 3097 r = EINVAL; 3098 break; 3099 } 3100 3101 return r; 3102 } 3103