// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2015 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/intel-iommu.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/intel-svm.h>
#include <linux/rculist.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/dmar.h>
#include <linux/interrupt.h>
#include <linux/mm_types.h>
#include <linux/ioasid.h>
#include <asm/page.h>
#include <asm/fpu/api.h>

#include "pasid.h"

static irqreturn_t prq_event_thread(int irq, void *d);
static void intel_svm_drain_prq(struct device *dev, u32 pasid);

#define PRQ_ORDER 0

int intel_svm_enable_prq(struct intel_iommu *iommu)
{
	struct page *pages;
	int irq, ret;

	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
	if (!pages) {
		pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
			iommu->name);
		return -ENOMEM;
	}
	iommu->prq = page_address(pages);

	irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
	if (irq <= 0) {
		pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
		       iommu->name);
		ret = -EINVAL;
	err:
		free_pages((unsigned long)iommu->prq, PRQ_ORDER);
		iommu->prq = NULL;
		return ret;
	}
	iommu->pr_irq = irq;

	snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);

	ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
				   iommu->prq_name, iommu);
	if (ret) {
		pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
		       iommu->name);
		dmar_free_hwirq(irq);
		iommu->pr_irq = 0;
		goto err;
	}
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);

	init_completion(&iommu->prq_complete);

	return 0;
}

int intel_svm_finish_prq(struct intel_iommu *iommu)
{
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);

	if (iommu->pr_irq) {
		free_irq(iommu->pr_irq, iommu);
		dmar_free_hwirq(iommu->pr_irq);
		iommu->pr_irq = 0;
	}

	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
	iommu->prq = NULL;

	return 0;
}

static inline bool intel_svm_capable(struct intel_iommu *iommu)
{
	return iommu->flags & VTD_FLAG_SVM_CAPABLE;
}

void intel_svm_check(struct intel_iommu *iommu)
{
	if (!pasid_supported(iommu))
		return;

	if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
	    !cap_fl1gp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible 1GB page capability\n",
		       iommu->name);
		return;
	}

	if (cpu_feature_enabled(X86_FEATURE_LA57) &&
	    !cap_5lp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible paging mode\n",
		       iommu->name);
		return;
	}

	iommu->flags |= VTD_FLAG_SVM_CAPABLE;
}
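
/*
 * Flush the PASID-granular IOTLB, and the device TLB when ATS is enabled,
 * for a range of @svm's address space on one bound device.
 * intel_flush_svm_range_dev() below splits the range into naturally
 * aligned power-of-two chunks, and intel_flush_svm_range() repeats the
 * flush for every device bound to the PASID.
 */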
static void __flush_svm_range_dev(struct intel_svm *svm,
				  struct intel_svm_dev *sdev,
				  unsigned long address,
				  unsigned long pages, int ih)
{
	struct device_domain_info *info = get_domain_info(sdev->dev);

	if (WARN_ON(!pages))
		return;

	qi_flush_piotlb(sdev->iommu, sdev->did, svm->pasid, address, pages, ih);
	if (info->ats_enabled)
		qi_flush_dev_iotlb_pasid(sdev->iommu, sdev->sid, info->pfsid,
					 svm->pasid, sdev->qdep, address,
					 order_base_2(pages));
}

static void intel_flush_svm_range_dev(struct intel_svm *svm,
				      struct intel_svm_dev *sdev,
				      unsigned long address,
				      unsigned long pages, int ih)
{
	unsigned long shift = ilog2(__roundup_pow_of_two(pages));
	unsigned long align = (1ULL << (VTD_PAGE_SHIFT + shift));
	unsigned long start = ALIGN_DOWN(address, align);
	unsigned long end = ALIGN(address + (pages << VTD_PAGE_SHIFT), align);

	while (start < end) {
		__flush_svm_range_dev(svm, sdev, start, align >> VTD_PAGE_SHIFT, ih);
		start += align;
	}
}

static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
				  unsigned long pages, int ih)
{
	struct intel_svm_dev *sdev;

	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
	rcu_read_unlock();
}

/* Pages have been freed at this point */
static void intel_invalidate_range(struct mmu_notifier *mn,
				   struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);

	intel_flush_svm_range(svm, start,
			      (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
}

static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
	struct intel_svm_dev *sdev;

	/* This might end up being called from exit_mmap(), *before* the page
	 * tables are cleared. And __mmu_notifier_release() will delete us from
	 * the list of notifiers so that our invalidate_range() callback doesn't
	 * get called when the page tables are cleared. So we need to protect
	 * against hardware accessing those page tables.
	 *
	 * We do it by clearing the entry in the PASID table and then flushing
	 * the IOTLB and the PASID table caches. This might upset hardware;
	 * perhaps we'll want to point the PASID to a dummy PGD (like the zero
	 * page) so that we end up taking a fault that the hardware really
	 * *has* to handle gracefully without affecting other processes.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_pasid_tear_down_entry(sdev->iommu, sdev->dev,
					    svm->pasid, true);
	rcu_read_unlock();
}

static const struct mmu_notifier_ops intel_mmuops = {
	.release = intel_mm_release,
	.invalidate_range = intel_invalidate_range,
};

static DEFINE_MUTEX(pasid_mutex);
static LIST_HEAD(global_svm_list);

#define for_each_svm_dev(sdev, svm, d)			\
	list_for_each_entry((sdev), &(svm)->devs, list)	\
		if ((d) != (sdev)->dev) {} else
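
/*
 * Look up the intel_svm tracked for @pasid and, within it, the
 * intel_svm_dev that belongs to @dev.  Returns 0 with *rsvm and *rsdev
 * set (either may be NULL if nothing matches), or a negative errno if
 * the PASID is invalid or the lookup itself fails.
 */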
static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
			     struct intel_svm **rsvm,
			     struct intel_svm_dev **rsdev)
{
	struct intel_svm_dev *d, *sdev = NULL;
	struct intel_svm *svm;

	/* The caller should hold the pasid_mutex lock */
	if (WARN_ON(!mutex_is_locked(&pasid_mutex)))
		return -EINVAL;

	if (pasid == INVALID_IOASID || pasid >= PASID_MAX)
		return -EINVAL;

	svm = ioasid_find(NULL, pasid, NULL);
	if (IS_ERR(svm))
		return PTR_ERR(svm);

	if (!svm)
		goto out;

	/*
	 * If we found svm for the PASID, there must be at least one device
	 * bound to it.
	 */
	if (WARN_ON(list_empty(&svm->devs)))
		return -EINVAL;

	rcu_read_lock();
	list_for_each_entry_rcu(d, &svm->devs, list) {
		if (d->dev == dev) {
			sdev = d;
			break;
		}
	}
	rcu_read_unlock();

out:
	*rsvm = svm;
	*rsdev = sdev;

	return 0;
}
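
/*
 * Bind a guest PASID to @dev: set up a nested translation for
 * @data->hpasid that walks the guest-provided first-level page table
 * (@data->gpgd) on top of the second-level mappings of @domain, and
 * track the binding with an intel_svm/intel_svm_dev pair.  The host
 * PASID itself is allocated before this call (e.g. by the VMM path),
 * not by this driver.
 */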
int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
			  struct iommu_gpasid_bind_data *data)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev = NULL;
	struct dmar_domain *dmar_domain;
	struct device_domain_info *info;
	struct intel_svm *svm = NULL;
	unsigned long iflags;
	int ret = 0;

	if (WARN_ON(!iommu) || !data)
		return -EINVAL;

	if (data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
		return -EINVAL;

	/* IOMMU core ensures argsz is more than the start of the union */
	if (data->argsz < offsetofend(struct iommu_gpasid_bind_data, vendor.vtd))
		return -EINVAL;

	/* Make sure no undefined flags are used in vendor data */
	if (data->vendor.vtd.flags & ~(IOMMU_SVA_VTD_GPASID_LAST - 1))
		return -EINVAL;

	if (!dev_is_pci(dev))
		return -ENOTSUPP;

	/* VT-d supports devices with full 20 bit PASIDs only */
	if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
		return -EINVAL;

	/*
	 * We only check host PASID range, we have no knowledge to check
	 * guest PASID range.
	 */
	if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
		return -EINVAL;

	info = get_domain_info(dev);
	if (!info)
		return -EINVAL;

	dmar_domain = to_dmar_domain(domain);

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, data->hpasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		/*
		 * Do not allow multiple bindings of the same device-PASID since
		 * there is only one set of SL page tables per PASID. We may
		 * revisit this once sharing a PGD across domains is supported.
		 */
		dev_warn_ratelimited(dev, "Already bound with PASID %u\n",
				     svm->pasid);
		ret = -EBUSY;
		goto out;
	}

	if (!svm) {
		/* We come here when the PASID has never been bound to a device. */
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			goto out;
		}
		/* REVISIT: upper layer/VFIO can track the host process that
		 * binds the PASID. ioasid_set = mm might be sufficient for vfio
		 * to check pasid VMM ownership. We can drop the following line
		 * once VFIO and IOASID set check is in place.
		 */
		svm->mm = get_task_mm(current);
		svm->pasid = data->hpasid;
		if (data->flags & IOMMU_SVA_GPASID_VAL) {
			svm->gpasid = data->gpasid;
			svm->flags |= SVM_FLAG_GUEST_PASID;
		}
		ioasid_set_data(data->hpasid, svm);
		INIT_LIST_HEAD_RCU(&svm->devs);
		mmput(svm->mm);
	}
	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	sdev->iommu = iommu;

	/* Only count users if device has aux domains */
	if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
		sdev->users = 1;

	/* Set up device context entry for PASID if not enabled already */
	ret = intel_iommu_enable_pasid(iommu, sdev->dev);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
		kfree(sdev);
		goto out;
	}

	/*
	 * PASID table is per device for better security. Therefore, for
	 * each bind of a new device even with an existing PASID, we need to
	 * call the nested mode setup function here.
	 */
	spin_lock_irqsave(&iommu->lock, iflags);
	ret = intel_pasid_setup_nested(iommu, dev,
				       (pgd_t *)(uintptr_t)data->gpgd,
				       data->hpasid, &data->vendor.vtd, dmar_domain,
				       data->addr_width);
	spin_unlock_irqrestore(&iommu->lock, iflags);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
				    data->hpasid, ret);
		/*
		 * PASID entry should be in cleared state if nested mode
		 * set up failed. So we only need to clear IOASID tracking
		 * data such that free call will succeed.
		 */
		kfree(sdev);
		goto out;
	}

	svm->flags |= SVM_FLAG_GUEST_MODE;

	init_rcu_head(&sdev->rcu);
	list_add_rcu(&sdev->list, &svm->devs);
out:
	if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
		ioasid_set_data(data->hpasid, NULL);
		kfree(svm);
	}

	mutex_unlock(&pasid_mutex);
	return ret;
}
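
/*
 * Undo a guest PASID bind for @dev: tear down the PASID entry, drain any
 * pending page requests and, when the last device is removed, drop the
 * intel_svm tracking data.  The IOASID itself is not freed here because
 * it was not allocated by this driver.
 */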
int intel_svm_unbind_gpasid(struct device *dev, u32 pasid)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev;
	struct intel_svm *svm;
	int ret;

	if (WARN_ON(!iommu))
		return -EINVAL;

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
			sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				/*
				 * We do not free the IOASID here because the
				 * IOMMU driver did not allocate it.
				 * Unlike native SVM, IOASID for guest use was
				 * allocated prior to the bind call.
				 * In any case, if the free call comes before
				 * the unbind, IOMMU driver will get notified
				 * and perform cleanup.
				 */
				ioasid_set_data(pasid, NULL);
				kfree(svm);
			}
		}
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}
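
/*
 * load_pasid() publishes @pasid in @mm and asks every CPU currently
 * running one of the mm's tasks to reload its PASID MSR via the
 * _load_pasid() IPI callback.
 */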
static void _load_pasid(void *unused)
{
	update_pasid();
}

static void load_pasid(struct mm_struct *mm, u32 pasid)
{
	mutex_lock(&mm->context.lock);

	/* Synchronize with READ_ONCE in update_pasid(). */
	smp_store_release(&mm->pasid, pasid);

	/* Update PASID MSR on all CPUs running the mm's tasks. */
	on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true);

	mutex_unlock(&mm->context.lock);
}
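
/*
 * Bind @mm (or init_mm for supervisor PASIDs) to @dev: allocate or reuse
 * an intel_svm for the mm, install a first-level PASID table entry for
 * the device, and link a new intel_svm_dev into the PASID's device list.
 *
 * Caller must hold pasid_mutex and, for user binds, a reference on @mm.
 */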
static int
intel_svm_bind_mm(struct device *dev, unsigned int flags,
		  struct mm_struct *mm, struct intel_svm_dev **sd)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm *svm = NULL, *t;
	struct device_domain_info *info;
	struct intel_svm_dev *sdev;
	unsigned long iflags;
	int pasid_max;
	int ret;

	if (!iommu || dmar_disabled)
		return -EINVAL;

	if (!intel_svm_capable(iommu))
		return -ENOTSUPP;

	if (dev_is_pci(dev)) {
		pasid_max = pci_max_pasids(to_pci_dev(dev));
		if (pasid_max < 0)
			return -EINVAL;
	} else
		pasid_max = 1 << 20;

	/* Binding a supervisor PASID should have mm == NULL */
	if (flags & SVM_FLAG_SUPERVISOR_MODE) {
		if (!ecap_srs(iommu->ecap) || mm) {
			pr_err("Supervisor PASID with user provided mm.\n");
			return -EINVAL;
		}
	}

	list_for_each_entry(t, &global_svm_list, list) {
		if (t->mm != mm)
			continue;

		svm = t;
		if (svm->pasid >= pasid_max) {
			dev_warn(dev,
				 "Limited PASID width. Cannot use existing PASID %d\n",
				 svm->pasid);
			ret = -ENOSPC;
			goto out;
		}

		/* Find the matching device in svm list */
		for_each_svm_dev(sdev, svm, dev) {
			sdev->users++;
			goto success;
		}

		break;
	}

	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;
	sdev->iommu = iommu;

	ret = intel_iommu_enable_pasid(iommu, dev);
	if (ret) {
		kfree(sdev);
		goto out;
	}

	info = get_domain_info(dev);
	sdev->did = FLPT_DEFAULT_DID;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}

	/* Finish the setup now we know we're keeping it */
	sdev->users = 1;
	init_rcu_head(&sdev->rcu);

	if (!svm) {
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			kfree(sdev);
			goto out;
		}

		if (pasid_max > intel_pasid_max_id)
			pasid_max = intel_pasid_max_id;

		/* Do not use PASID 0, reserved for RID to PASID */
		svm->pasid = ioasid_alloc(NULL, PASID_MIN,
					  pasid_max - 1, svm);
		if (svm->pasid == INVALID_IOASID) {
			kfree(svm);
			kfree(sdev);
			ret = -ENOSPC;
			goto out;
		}
		svm->notifier.ops = &intel_mmuops;
		svm->mm = mm;
		svm->flags = flags;
		INIT_LIST_HEAD_RCU(&svm->devs);
		INIT_LIST_HEAD(&svm->list);
		ret = -ENOMEM;
		if (mm) {
			ret = mmu_notifier_register(&svm->notifier, mm);
			if (ret) {
				ioasid_put(svm->pasid);
				kfree(svm);
				kfree(sdev);
				goto out;
			}
		}

		spin_lock_irqsave(&iommu->lock, iflags);
		ret = intel_pasid_setup_first_level(iommu, dev,
				mm ? mm->pgd : init_mm.pgd,
				svm->pasid, FLPT_DEFAULT_DID,
				(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
				(cpu_feature_enabled(X86_FEATURE_LA57) ?
				 PASID_FLAG_FL5LP : 0));
		spin_unlock_irqrestore(&iommu->lock, iflags);
		if (ret) {
			if (mm)
				mmu_notifier_unregister(&svm->notifier, mm);
			ioasid_put(svm->pasid);
			kfree(svm);
			kfree(sdev);
			goto out;
		}

		list_add_tail(&svm->list, &global_svm_list);
		if (mm) {
			/* The newly allocated pasid is loaded to the mm. */
			load_pasid(mm, svm->pasid);
		}
	} else {
		/*
		 * Binding a new device with existing PASID, need to setup
		 * the PASID entry.
		 */
		spin_lock_irqsave(&iommu->lock, iflags);
		ret = intel_pasid_setup_first_level(iommu, dev,
				mm ? mm->pgd : init_mm.pgd,
				svm->pasid, FLPT_DEFAULT_DID,
				(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
				(cpu_feature_enabled(X86_FEATURE_LA57) ?
				 PASID_FLAG_FL5LP : 0));
		spin_unlock_irqrestore(&iommu->lock, iflags);
		if (ret) {
			kfree(sdev);
			goto out;
		}
	}
	list_add_rcu(&sdev->list, &svm->devs);
success:
	sdev->pasid = svm->pasid;
	sdev->sva.dev = dev;
	if (sd)
		*sd = sdev;
	ret = 0;
out:
	return ret;
}
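
/*
 * Undo intel_svm_bind_mm() for one device: on the last reference, tear
 * down the PASID entry, drain pending page requests and release the
 * per-device and per-mm state.
 *
 * Caller must hold pasid_mutex.
 */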
static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
{
	struct intel_svm_dev *sdev;
	struct intel_iommu *iommu;
	struct intel_svm *svm;
	int ret = -EINVAL;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu)
		goto out;

	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			/* Flush the PASID cache and IOTLB for this device.
			 * Note that we do depend on the hardware *not* using
			 * the PASID any more. Just as we depend on other
			 * devices never using PASIDs that they have no right
			 * to use. We have a *shared* PASID table, because it's
			 * large and has to be physically contiguous. So it's
			 * hard to be as defensive as we might like. */
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				ioasid_put(svm->pasid);
				if (svm->mm) {
					mmu_notifier_unregister(&svm->notifier, svm->mm);
					/* Clear mm's pasid. */
					load_pasid(svm->mm, PASID_DISABLED);
				}
				list_del(&svm->list);
				/* We mandate that no page faults may be outstanding
				 * for the PASID when intel_svm_unbind_mm() is called.
				 * If that is not obeyed, subtle errors will happen.
				 * Let's make them less subtle... */
				memset(svm, 0x6b, sizeof(*svm));
				kfree(svm);
			}
		}
	}
out:
	return ret;
}

/* Page request queue descriptor */
struct page_req_dsc {
	union {
		struct {
			u64 type:8;
			u64 pasid_present:1;
			u64 priv_data_present:1;
			u64 rsvd:6;
			u64 rid:16;
			u64 pasid:20;
			u64 exe_req:1;
			u64 pm_req:1;
			u64 rsvd2:10;
		};
		u64 qw_0;
	};
	union {
		struct {
			u64 rd_req:1;
			u64 wr_req:1;
			u64 lpig:1;
			u64 prg_index:9;
			u64 addr:52;
		};
		u64 qw_1;
	};
	u64 priv_data[2];
};

#define PRQ_RING_MASK	((0x1000 << PRQ_ORDER) - 0x20)
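
/*
 * Check whether the access requested by a page request descriptor is
 * permitted by the VMA that covers the faulting address.
 */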
static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
{
	unsigned long requested = 0;

	if (req->exe_req)
		requested |= VM_EXEC;

	if (req->rd_req)
		requested |= VM_READ;

	if (req->wr_req)
		requested |= VM_WRITE;

	return (requested & ~vma->vm_flags) != 0;
}

static bool is_canonical_address(u64 addr)
{
	int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
	long saddr = (long) addr;

	return (((saddr << shift) >> shift) == saddr);
}

/**
 * intel_svm_drain_prq - Drain page requests and responses for a pasid
 * @dev: target device
 * @pasid: pasid for draining
 *
 * Drain all pending page requests and responses related to @pasid in both
 * software and hardware. This is supposed to be called after the device
 * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
 * and DevTLB have been invalidated.
 *
 * It waits until all pending page requests for @pasid in the page fault
 * queue are completed by the prq handling thread. Then follow the steps
 * described in VT-d spec CH7.10 to drain all page requests and page
 * responses pending in the hardware.
 */
static void intel_svm_drain_prq(struct device *dev, u32 pasid)
{
	struct device_domain_info *info;
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct qi_desc desc[3];
	struct pci_dev *pdev;
	int head, tail;
	u16 sid, did;
	int qdep;

	info = get_domain_info(dev);
	if (WARN_ON(!info || !dev_is_pci(dev)))
		return;

	if (!info->pri_enabled)
		return;

	iommu = info->iommu;
	domain = info->domain;
	pdev = to_pci_dev(dev);
	sid = PCI_DEVID(info->bus, info->devfn);
	did = domain->iommu_did[iommu->seq_id];
	qdep = pci_ats_queue_depth(pdev);

	/*
	 * Check and wait until all pending page requests in the queue are
	 * handled by the prq handling thread.
	 */
prq_retry:
	reinit_completion(&iommu->prq_complete);
	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct page_req_dsc *req;

		req = &iommu->prq[head / sizeof(*req)];
		if (!req->pasid_present || req->pasid != pasid) {
			head = (head + sizeof(*req)) & PRQ_RING_MASK;
			continue;
		}

		wait_for_completion(&iommu->prq_complete);
		goto prq_retry;
	}

	/*
	 * Perform steps described in VT-d spec CH7.10 to drain page
	 * requests and responses in hardware.
	 */
	memset(desc, 0, sizeof(desc));
	desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
			QI_IWD_FENCE |
			QI_IWD_TYPE;
	desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
			QI_EIOTLB_DID(did) |
			QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
			QI_EIOTLB_TYPE;
	desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
			QI_DEV_EIOTLB_SID(sid) |
			QI_DEV_EIOTLB_QDEP(qdep) |
			QI_DEIOTLB_TYPE |
			QI_DEV_IOTLB_PFSID(info->pfsid);
qi_retry:
	reinit_completion(&iommu->prq_complete);
	qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
		wait_for_completion(&iommu->prq_complete);
		goto qi_retry;
	}
}

static int prq_to_iommu_prot(struct page_req_dsc *req)
{
	int prot = 0;

	if (req->rd_req)
		prot |= IOMMU_FAULT_PERM_READ;
	if (req->wr_req)
		prot |= IOMMU_FAULT_PERM_WRITE;
	if (req->exe_req)
		prot |= IOMMU_FAULT_PERM_EXEC;
	if (req->pm_req)
		prot |= IOMMU_FAULT_PERM_PRIV;

	return prot;
}
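
/*
 * Forward a page request to the generic fault reporting path via
 * iommu_report_device_fault(), so that it can be handled outside the
 * IOMMU driver by whoever registered a device fault handler (used for
 * guest-mode PASIDs).
 */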
static int
intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
{
	struct iommu_fault_event event;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	/* Fill in event data for device specific processing */
	memset(&event, 0, sizeof(struct iommu_fault_event));
	event.fault.type = IOMMU_FAULT_PAGE_REQ;
	event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT;
	event.fault.prm.pasid = desc->pasid;
	event.fault.prm.grpid = desc->prg_index;
	event.fault.prm.perm = prq_to_iommu_prot(desc);

	if (desc->lpig)
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
	if (desc->pasid_present) {
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
	}
	if (desc->priv_data_present) {
		/*
		 * Set the last page in group bit if private data is present,
		 * since a page response is required just as it is for LPIG.
		 * iommu_report_device_fault() doesn't understand this vendor
		 * specific requirement thus we set last_page as a workaround.
		 */
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
		memcpy(event.fault.prm.private_data, desc->priv_data,
		       sizeof(desc->priv_data));
	}

	return iommu_report_device_fault(dev, &event);
}
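
/*
 * Threaded IRQ handler for the page request queue: walk the queue,
 * service recoverable faults against the bound mm (or forward them for
 * guest-mode PASIDs), post page group responses where the descriptor
 * requires one, and clear the overflow condition once the queue has
 * been drained.
 */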
static irqreturn_t prq_event_thread(int irq, void *d)
{
	struct intel_svm_dev *sdev = NULL;
	struct intel_iommu *iommu = d;
	struct intel_svm *svm = NULL;
	int head, tail, handled = 0;
	unsigned int flags = 0;

	/* Clear PPR bit before reading head/tail registers, to
	 * ensure that we get a new interrupt if needed. */
	writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);

	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct vm_area_struct *vma;
		struct page_req_dsc *req;
		struct qi_desc resp;
		int result;
		vm_fault_t ret;
		u64 address;

		handled = 1;
		req = &iommu->prq[head / sizeof(*req)];
		result = QI_RESP_INVALID;
		address = (u64)req->addr << VTD_PAGE_SHIFT;
		if (!req->pasid_present) {
			pr_err("%s: Page request without PASID: %08llx %08llx\n",
			       iommu->name, ((unsigned long long *)req)[0],
			       ((unsigned long long *)req)[1]);
			goto no_pasid;
		}
		/* We shall not receive page request for supervisor SVM */
		if (req->pm_req && (req->rd_req | req->wr_req)) {
			pr_err("Unexpected page request in Privilege Mode");
			/* No need to find the matching sdev as for bad_req */
			goto no_pasid;
		}
		/* DMA read with exec request is not supported. */
		if (req->exe_req && req->rd_req) {
			pr_err("Execution request not supported\n");
			goto no_pasid;
		}
		if (!svm || svm->pasid != req->pasid) {
			rcu_read_lock();
			svm = ioasid_find(NULL, req->pasid, NULL);
			/* It *can't* go away, because the driver is not permitted
			 * to unbind the mm while any page faults are outstanding.
			 * So we only need RCU to protect the internal idr code. */
			rcu_read_unlock();
			if (IS_ERR_OR_NULL(svm)) {
				pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
				       iommu->name, req->pasid, ((unsigned long long *)req)[0],
				       ((unsigned long long *)req)[1]);
				goto no_pasid;
			}
		}

		if (!sdev || sdev->sid != req->rid) {
			struct intel_svm_dev *t;

			sdev = NULL;
			rcu_read_lock();
			list_for_each_entry_rcu(t, &svm->devs, list) {
				if (t->sid == req->rid) {
					sdev = t;
					break;
				}
			}
			rcu_read_unlock();
		}

		/* Since we're using init_mm.pgd directly, we should never take
		 * any faults on kernel addresses. */
		if (!svm->mm)
			goto bad_req;

		/* If address is not canonical, return invalid response */
		if (!is_canonical_address(address))
			goto bad_req;

		/*
		 * If prq is to be handled outside iommu driver via receiver of
		 * the fault notifiers, we skip the page response here.
		 */
		if (svm->flags & SVM_FLAG_GUEST_MODE) {
			if (sdev && !intel_svm_prq_report(sdev->dev, req))
				goto prq_advance;
			else
				goto bad_req;
		}

		/* If the mm is already defunct, don't handle faults. */
		if (!mmget_not_zero(svm->mm))
			goto bad_req;

		mmap_read_lock(svm->mm);
		vma = find_extend_vma(svm->mm, address);
		if (!vma || address < vma->vm_start)
			goto invalid;

		if (access_error(vma, req))
			goto invalid;

		flags = FAULT_FLAG_USER | FAULT_FLAG_REMOTE;
		if (req->wr_req)
			flags |= FAULT_FLAG_WRITE;

		ret = handle_mm_fault(vma, address, flags, NULL);
		if (ret & VM_FAULT_ERROR)
			goto invalid;

		result = QI_RESP_SUCCESS;
invalid:
		mmap_read_unlock(svm->mm);
		mmput(svm->mm);
bad_req:
		/* We get here in the error case where the PASID lookup failed,
		 * and these can be NULL. Do not use them below this point! */
		sdev = NULL;
		svm = NULL;
no_pasid:
		if (req->lpig || req->priv_data_present) {
			/*
			 * Per VT-d spec. v3.0 ch7.7, system software must
			 * respond with page group response if private data
			 * is present (PDP) or last page in group (LPIG) bit
			 * is set. This is an additional VT-d feature beyond
			 * PCI ATS spec.
			 */
			resp.qw0 = QI_PGRP_PASID(req->pasid) |
				QI_PGRP_DID(req->rid) |
				QI_PGRP_PASID_P(req->pasid_present) |
				QI_PGRP_PDP(req->priv_data_present) |
				QI_PGRP_RESP_CODE(result) |
				QI_PGRP_RESP_TYPE;
			resp.qw1 = QI_PGRP_IDX(req->prg_index) |
				QI_PGRP_LPIG(req->lpig);
			resp.qw2 = 0;
			resp.qw3 = 0;

			if (req->priv_data_present)
				memcpy(&resp.qw2, req->priv_data,
				       sizeof(req->priv_data));
			qi_submit_sync(iommu, &resp, 1, 0);
		}
prq_advance:
		head = (head + sizeof(*req)) & PRQ_RING_MASK;
	}

	dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);

	/*
	 * Clear the page request overflow bit and wake up all threads that
	 * are waiting for the completion of this handling.
	 */
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
		pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n",
				    iommu->name);
		head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
		tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
		if (head == tail) {
			writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);
			pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared",
					    iommu->name);
		}
	}

	if (!completion_done(&iommu->prq_complete))
		complete(&iommu->prq_complete);

	return IRQ_RETVAL(handled);
}

#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
struct iommu_sva *
intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
{
	struct iommu_sva *sva = ERR_PTR(-EINVAL);
	struct intel_svm_dev *sdev = NULL;
	unsigned int flags = 0;
	int ret;

	/*
	 * TODO: Consolidate with generic iommu-sva bind after it is merged.
	 * It will require shared SVM data structures, i.e. combine io_mm
	 * and intel_svm etc.
	 */
	if (drvdata)
		flags = *(unsigned int *)drvdata;
	mutex_lock(&pasid_mutex);
	ret = intel_svm_bind_mm(dev, flags, mm, &sdev);
	if (ret)
		sva = ERR_PTR(ret);
	else if (sdev)
		sva = &sdev->sva;
	else
		WARN(!sdev, "SVM bind succeeded with no sdev!\n");

	mutex_unlock(&pasid_mutex);

	return sva;
}

void intel_svm_unbind(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	intel_svm_unbind_mm(sdev->dev, sdev->pasid);
	mutex_unlock(&pasid_mutex);
}

u32 intel_svm_get_pasid(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;
	u32 pasid;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	pasid = sdev->pasid;
	mutex_unlock(&pasid_mutex);

	return pasid;
}

int intel_svm_page_response(struct device *dev,
			    struct iommu_fault_event *evt,
			    struct iommu_page_response *msg)
{
	struct iommu_fault_page_request *prm;
	struct intel_svm_dev *sdev = NULL;
	struct intel_svm *svm = NULL;
	struct intel_iommu *iommu;
	bool private_present;
	bool pasid_present;
	bool last_page;
	u8 bus, devfn;
	int ret = 0;
	u16 sid;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!msg || !evt)
		return -EINVAL;

	mutex_lock(&pasid_mutex);

	prm = &evt->fault.prm;
	sid = PCI_DEVID(bus, devfn);
	pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
	private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
	last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;

	if (!pasid_present) {
		ret = -EINVAL;
		goto out;
	}

	if (prm->pasid == 0 || prm->pasid >= PASID_MAX) {
		ret = -EINVAL;
		goto out;
	}

	ret = pasid_to_svm_sdev(dev, prm->pasid, &svm, &sdev);
	if (ret || !sdev) {
		ret = -ENODEV;
		goto out;
	}

	/*
	 * For responses from userspace, need to make sure that the
	 * pasid has been bound to its mm.
	 */
	if (svm->flags & SVM_FLAG_GUEST_MODE) {
		struct mm_struct *mm;

		mm = get_task_mm(current);
		if (!mm) {
			ret = -EINVAL;
			goto out;
		}

		if (mm != svm->mm) {
			ret = -ENODEV;
			mmput(mm);
			goto out;
		}

		mmput(mm);
	}

	/*
	 * Per VT-d spec. v3.0 ch7.7, system software must respond
	 * with page group response if private data is present (PDP)
	 * or last page in group (LPIG) bit is set. This is an
	 * additional VT-d requirement beyond PCI ATS spec.
	 */
	if (last_page || private_present) {
		struct qi_desc desc;

		desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) |
				QI_PGRP_PASID_P(pasid_present) |
				QI_PGRP_PDP(private_present) |
				QI_PGRP_RESP_CODE(msg->code) |
				QI_PGRP_RESP_TYPE;
		desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page);
		desc.qw2 = 0;
		desc.qw3 = 0;
		if (private_present)
			memcpy(&desc.qw2, prm->private_data,
			       sizeof(prm->private_data));

		qi_submit_sync(iommu, &desc, 1, 0);
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}