// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2015 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/intel-iommu.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/intel-svm.h>
#include <linux/rculist.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/dmar.h>
#include <linux/interrupt.h>
#include <linux/mm_types.h>
#include <linux/ioasid.h>
#include <asm/page.h>
#include <asm/fpu/api.h>

#include "pasid.h"

static irqreturn_t prq_event_thread(int irq, void *d);
static void intel_svm_drain_prq(struct device *dev, u32 pasid);

#define PRQ_ORDER 0

int intel_svm_enable_prq(struct intel_iommu *iommu)
{
	struct page *pages;
	int irq, ret;

	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
	if (!pages) {
		pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
			iommu->name);
		return -ENOMEM;
	}
	iommu->prq = page_address(pages);

	irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
	if (irq <= 0) {
		pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
		       iommu->name);
		ret = -EINVAL;
	err:
		free_pages((unsigned long)iommu->prq, PRQ_ORDER);
		iommu->prq = NULL;
		return ret;
	}
	iommu->pr_irq = irq;

	snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);

	ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
				   iommu->prq_name, iommu);
	if (ret) {
		pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
		       iommu->name);
		dmar_free_hwirq(irq);
		iommu->pr_irq = 0;
		goto err;
	}
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);

	init_completion(&iommu->prq_complete);

	return 0;
}

int intel_svm_finish_prq(struct intel_iommu *iommu)
{
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);

	if (iommu->pr_irq) {
		free_irq(iommu->pr_irq, iommu);
		dmar_free_hwirq(iommu->pr_irq);
		iommu->pr_irq = 0;
	}

	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
	iommu->prq = NULL;

	return 0;
}

static inline bool intel_svm_capable(struct intel_iommu *iommu)
{
	return iommu->flags & VTD_FLAG_SVM_CAPABLE;
}

void intel_svm_check(struct intel_iommu *iommu)
{
	if (!pasid_supported(iommu))
		return;

	if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
	    !cap_fl1gp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible 1GB page capability\n",
		       iommu->name);
		return;
	}

	if (cpu_feature_enabled(X86_FEATURE_LA57) &&
	    !cap_5lp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible paging mode\n",
		       iommu->name);
		return;
	}

	iommu->flags |= VTD_FLAG_SVM_CAPABLE;
}

static void __flush_svm_range_dev(struct intel_svm *svm,
				  struct intel_svm_dev *sdev,
				  unsigned long address,
				  unsigned long pages, int ih)
{
	struct qi_desc desc;

	if (pages == -1) {
		desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
			QI_EIOTLB_DID(sdev->did) |
			QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
			QI_EIOTLB_TYPE;
		desc.qw1 = 0;
	} else {
		int mask = ilog2(__roundup_pow_of_two(pages));

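		/*
		 * Page-selective-within-PASID invalidation: the AM field
		 * encodes the size as 2^mask pages starting at the aligned
		 * address. For example, pages == 5 rounds up to 8, so
		 * mask == 3 and an aligned 8-page region is invalidated.
		 */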
		desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
			QI_EIOTLB_DID(sdev->did) |
			QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
			QI_EIOTLB_TYPE;
		desc.qw1 = QI_EIOTLB_ADDR(address) |
			QI_EIOTLB_IH(ih) |
			QI_EIOTLB_AM(mask);
	}
	desc.qw2 = 0;
	desc.qw3 = 0;
	qi_submit_sync(sdev->iommu, &desc, 1, 0);

	if (sdev->dev_iotlb) {
		desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) |
			QI_DEV_EIOTLB_SID(sdev->sid) |
			QI_DEV_EIOTLB_QDEP(sdev->qdep) |
			QI_DEIOTLB_TYPE;
		if (pages == -1) {
			desc.qw1 = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) |
				QI_DEV_EIOTLB_SIZE;
		} else if (pages > 1) {
			/* The least significant zero bit indicates the size. So,
			 * for example, an "address" value of 0x12345f000 will
			 * flush from 0x123440000 to 0x12347ffff (256KiB). */
			unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT);
			unsigned long mask = __rounddown_pow_of_two(address ^ last);

			desc.qw1 = QI_DEV_EIOTLB_ADDR((address & ~mask) |
					(mask - 1)) | QI_DEV_EIOTLB_SIZE;
		} else {
			desc.qw1 = QI_DEV_EIOTLB_ADDR(address);
		}
		desc.qw2 = 0;
		desc.qw3 = 0;
		qi_submit_sync(sdev->iommu, &desc, 1, 0);
	}
}

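/*
 * Split a flush request into naturally aligned, power-of-two sized chunks so
 * that each descriptor covers exactly 2^shift pages. For example, a request
 * for 2 pages at 0x5000 rounds up to an 8KiB alignment and is issued as two
 * 2-page invalidations, at 0x4000 and 0x6000.
 */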
static void intel_flush_svm_range_dev(struct intel_svm *svm,
				      struct intel_svm_dev *sdev,
				      unsigned long address,
				      unsigned long pages, int ih)
{
	unsigned long shift = ilog2(__roundup_pow_of_two(pages));
	unsigned long align = (1ULL << (VTD_PAGE_SHIFT + shift));
	unsigned long start = ALIGN_DOWN(address, align);
	unsigned long end = ALIGN(address + (pages << VTD_PAGE_SHIFT), align);

	while (start < end) {
		__flush_svm_range_dev(svm, sdev, start, align >> VTD_PAGE_SHIFT, ih);
		start += align;
	}
}

static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
				  unsigned long pages, int ih)
{
	struct intel_svm_dev *sdev;

	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
	rcu_read_unlock();
}

/* Pages have been freed at this point */
static void intel_invalidate_range(struct mmu_notifier *mn,
				   struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);

	intel_flush_svm_range(svm, start,
			      (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
}

static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
	struct intel_svm_dev *sdev;

	/* This might end up being called from exit_mmap(), *before* the page
	 * tables are cleared. And __mmu_notifier_release() will delete us from
	 * the list of notifiers so that our invalidate_range() callback doesn't
	 * get called when the page tables are cleared. So we need to protect
	 * against hardware accessing those page tables.
	 *
	 * We do it by clearing the entry in the PASID table and then flushing
	 * the IOTLB and the PASID table caches. This might upset hardware;
	 * perhaps we'll want to point the PASID to a dummy PGD (like the zero
	 * page) so that we end up taking a fault that the hardware really
	 * *has* to handle gracefully without affecting other processes.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_pasid_tear_down_entry(sdev->iommu, sdev->dev,
					    svm->pasid, true);
	rcu_read_unlock();
}

static const struct mmu_notifier_ops intel_mmuops = {
	.release = intel_mm_release,
	.invalidate_range = intel_invalidate_range,
};

static DEFINE_MUTEX(pasid_mutex);
static LIST_HEAD(global_svm_list);

#define for_each_svm_dev(sdev, svm, d)			\
	list_for_each_entry((sdev), &(svm)->devs, list)	\
		if ((d) != (sdev)->dev) {} else

static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
			     struct intel_svm **rsvm,
			     struct intel_svm_dev **rsdev)
{
	struct intel_svm_dev *d, *sdev = NULL;
	struct intel_svm *svm;

	/* The caller should hold the pasid_mutex lock */
	if (WARN_ON(!mutex_is_locked(&pasid_mutex)))
		return -EINVAL;

	if (pasid == INVALID_IOASID || pasid >= PASID_MAX)
		return -EINVAL;

	svm = ioasid_find(NULL, pasid, NULL);
	if (IS_ERR(svm))
		return PTR_ERR(svm);

	if (!svm)
		goto out;

	/*
	 * If we found an svm for the PASID, there must be at least one
	 * device bound to it.
	 */
	if (WARN_ON(list_empty(&svm->devs)))
		return -EINVAL;

	rcu_read_lock();
	list_for_each_entry_rcu(d, &svm->devs, list) {
		if (d->dev == dev) {
			sdev = d;
			break;
		}
	}
	rcu_read_unlock();

out:
	*rsvm = svm;
	*rsdev = sdev;

	return 0;
}

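/*
 * Bind a guest PASID to a device for nested (guest SVA) translation: the
 * first-level page table comes from the guest (data->gpgd) while the
 * second-level tables come from the host dmar_domain. The host PASID is
 * given in data->hpasid; the matching intel_svm is created on the first
 * bind and shared by subsequent device binds of the same PASID.
 */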
int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
			  struct iommu_gpasid_bind_data *data)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev = NULL;
	struct dmar_domain *dmar_domain;
	struct device_domain_info *info;
	struct intel_svm *svm = NULL;
	unsigned long iflags;
	int ret = 0;

	if (WARN_ON(!iommu) || !data)
		return -EINVAL;

	if (data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
		return -EINVAL;

	/* IOMMU core ensures argsz is more than the start of the union */
	if (data->argsz < offsetofend(struct iommu_gpasid_bind_data, vendor.vtd))
		return -EINVAL;

	/* Make sure no undefined flags are used in vendor data */
	if (data->vendor.vtd.flags & ~(IOMMU_SVA_VTD_GPASID_LAST - 1))
		return -EINVAL;

	if (!dev_is_pci(dev))
		return -ENOTSUPP;

	/* VT-d supports devices with full 20-bit PASIDs only */
	if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
		return -EINVAL;

	/*
	 * We only check the host PASID range; we have no way to validate
	 * the guest PASID range.
	 */
	if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
		return -EINVAL;

	info = get_domain_info(dev);
	if (!info)
		return -EINVAL;

	dmar_domain = to_dmar_domain(domain);

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, data->hpasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		/*
		 * Do not allow multiple bindings of the same device-PASID since
		 * there is only one set of second-level page tables per PASID.
		 * We may revisit this once sharing a PGD across domains is
		 * supported.
		 */
		dev_warn_ratelimited(dev, "Already bound with PASID %u\n",
				     svm->pasid);
		ret = -EBUSY;
		goto out;
	}

	if (!svm) {
		/* We come here when the PASID has never been bound to a device. */
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			goto out;
		}
		/* REVISIT: upper layer/VFIO can track the host process that
		 * binds the PASID. ioasid_set = mm might be sufficient for
		 * vfio to check pasid VMM ownership. We can drop the following
		 * line once VFIO and IOASID set check is in place.
		 */
		svm->mm = get_task_mm(current);
		svm->pasid = data->hpasid;
		if (data->flags & IOMMU_SVA_GPASID_VAL) {
			svm->gpasid = data->gpasid;
			svm->flags |= SVM_FLAG_GUEST_PASID;
		}
		ioasid_set_data(data->hpasid, svm);
		INIT_LIST_HEAD_RCU(&svm->devs);
		mmput(svm->mm);
	}
	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	sdev->iommu = iommu;

	/* Only count users if device has aux domains */
	if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
		sdev->users = 1;

	/* Set up device context entry for PASID if not enabled already */
	ret = intel_iommu_enable_pasid(iommu, sdev->dev);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
		kfree(sdev);
		goto out;
	}

	/*
	 * The PASID table is per device for better security. Therefore, for
	 * each bind of a new device even with an existing PASID, we need to
	 * call the nested mode setup function here.
	 */
	spin_lock_irqsave(&iommu->lock, iflags);
	ret = intel_pasid_setup_nested(iommu, dev,
				       (pgd_t *)(uintptr_t)data->gpgd,
				       data->hpasid, &data->vendor.vtd, dmar_domain,
				       data->addr_width);
	spin_unlock_irqrestore(&iommu->lock, iflags);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
				    data->hpasid, ret);
		/*
		 * The PASID entry should be in a cleared state if nested mode
		 * setup failed. So we only need to clear the IOASID tracking
		 * data so that the free call will succeed.
		 */
		kfree(sdev);
		goto out;
	}

	svm->flags |= SVM_FLAG_GUEST_MODE;

	init_rcu_head(&sdev->rcu);
	list_add_rcu(&sdev->list, &svm->devs);
out:
	if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
		ioasid_set_data(data->hpasid, NULL);
		kfree(svm);
	}

	mutex_unlock(&pasid_mutex);
	return ret;
}

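/*
 * Undo a guest PASID bind for one device: tear down the nested PASID table
 * entry, drain any page requests still in flight for the PASID, and drop
 * the per-device tracking. The intel_svm itself is freed only when the
 * last bound device goes away.
 */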
int intel_svm_unbind_gpasid(struct device *dev, u32 pasid)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev;
	struct intel_svm *svm;
	int ret;

	if (WARN_ON(!iommu))
		return -EINVAL;

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
			sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				/*
				 * We do not free the IOASID here because the
				 * IOMMU driver did not allocate it. Unlike
				 * native SVM, the IOASID for guest use was
				 * allocated prior to the bind call. In any
				 * case, if the free call comes before the
				 * unbind, the IOMMU driver will be notified
				 * and perform the cleanup.
				 */
				ioasid_set_data(pasid, NULL);
				kfree(svm);
			}
		}
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}

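/*
 * Propagate a newly assigned PASID to the mm: store it in mm->pasid and IPI
 * every CPU in mm_cpumask() so that update_pasid() refreshes the per-task
 * PASID state (the IA32_PASID MSR) on CPUs currently running the mm's tasks.
 */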
static void _load_pasid(void *unused)
{
	update_pasid();
}

static void load_pasid(struct mm_struct *mm, u32 pasid)
{
	mutex_lock(&mm->context.lock);

	/* Synchronize with READ_ONCE in update_pasid(). */
	smp_store_release(&mm->pasid, pasid);

	/* Update PASID MSR on all CPUs running the mm's tasks. */
	on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true);

	mutex_unlock(&mm->context.lock);
}

/* Caller must hold pasid_mutex, mm reference */
static int
intel_svm_bind_mm(struct device *dev, unsigned int flags,
		  struct svm_dev_ops *ops,
		  struct mm_struct *mm, struct intel_svm_dev **sd)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct device_domain_info *info;
	struct intel_svm_dev *sdev;
	struct intel_svm *svm = NULL;
	unsigned long iflags;
	int pasid_max;
	int ret;

	if (!iommu || dmar_disabled)
		return -EINVAL;

	if (!intel_svm_capable(iommu))
		return -ENOTSUPP;

	if (dev_is_pci(dev)) {
		pasid_max = pci_max_pasids(to_pci_dev(dev));
		if (pasid_max < 0)
			return -EINVAL;
	} else
		pasid_max = 1 << 20;

	/* Binding a supervisor PASID should have mm == NULL */
	if (flags & SVM_FLAG_SUPERVISOR_MODE) {
		if (!ecap_srs(iommu->ecap) || mm) {
			pr_err("Supervisor PASID with user provided mm.\n");
			return -EINVAL;
		}
	}

	if (!(flags & SVM_FLAG_PRIVATE_PASID)) {
		struct intel_svm *t;

		list_for_each_entry(t, &global_svm_list, list) {
			if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
				continue;

			svm = t;
			if (svm->pasid >= pasid_max) {
				dev_warn(dev,
					 "Limited PASID width. Cannot use existing PASID %d\n",
					 svm->pasid);
				ret = -ENOSPC;
				goto out;
			}

			/* Find the matching device in svm list */
			for_each_svm_dev(sdev, svm, dev) {
				if (sdev->ops != ops) {
					ret = -EBUSY;
					goto out;
				}
				sdev->users++;
				goto success;
			}

			break;
		}
	}

	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;
	sdev->iommu = iommu;

	ret = intel_iommu_enable_pasid(iommu, dev);
	if (ret) {
		kfree(sdev);
		goto out;
	}

	info = get_domain_info(dev);
	sdev->did = FLPT_DEFAULT_DID;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}

	/* Finish the setup now we know we're keeping it */
	sdev->users = 1;
	sdev->ops = ops;
	init_rcu_head(&sdev->rcu);

	if (!svm) {
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			kfree(sdev);
			goto out;
		}

		if (pasid_max > intel_pasid_max_id)
			pasid_max = intel_pasid_max_id;

		/* Do not use PASID 0, reserved for RID to PASID */
		svm->pasid = ioasid_alloc(NULL, PASID_MIN,
					  pasid_max - 1, svm);
		if (svm->pasid == INVALID_IOASID) {
			kfree(svm);
			kfree(sdev);
			ret = -ENOSPC;
			goto out;
		}
		svm->notifier.ops = &intel_mmuops;
		svm->mm = mm;
		svm->flags = flags;
		INIT_LIST_HEAD_RCU(&svm->devs);
		INIT_LIST_HEAD(&svm->list);
		ret = -ENOMEM;
		if (mm) {
			ret = mmu_notifier_register(&svm->notifier, mm);
			if (ret) {
				ioasid_put(svm->pasid);
				kfree(svm);
				kfree(sdev);
				goto out;
			}
		}

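		/*
		 * Program the PASID table entry for first-level translation.
		 * User binds walk the mm's PGD; supervisor binds (mm == NULL)
		 * walk init_mm's PGD with PASID_FLAG_SUPERVISOR_MODE set. The
		 * FL5LP flag is set when the CPU uses 5-level paging so the
		 * IOMMU walks the page tables in the same format.
		 */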
		spin_lock_irqsave(&iommu->lock, iflags);
		ret = intel_pasid_setup_first_level(iommu, dev,
				mm ? mm->pgd : init_mm.pgd,
				svm->pasid, FLPT_DEFAULT_DID,
				(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
				(cpu_feature_enabled(X86_FEATURE_LA57) ?
				 PASID_FLAG_FL5LP : 0));
		spin_unlock_irqrestore(&iommu->lock, iflags);
		if (ret) {
			if (mm)
				mmu_notifier_unregister(&svm->notifier, mm);
			ioasid_put(svm->pasid);
			kfree(svm);
			kfree(sdev);
			goto out;
		}

		list_add_tail(&svm->list, &global_svm_list);
		if (mm) {
			/* The newly allocated pasid is loaded to the mm. */
			load_pasid(mm, svm->pasid);
		}
	} else {
		/*
		 * Binding a new device with an existing PASID, we need to
		 * set up the PASID entry.
		 */
		spin_lock_irqsave(&iommu->lock, iflags);
		ret = intel_pasid_setup_first_level(iommu, dev,
				mm ? mm->pgd : init_mm.pgd,
				svm->pasid, FLPT_DEFAULT_DID,
				(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
				(cpu_feature_enabled(X86_FEATURE_LA57) ?
				 PASID_FLAG_FL5LP : 0));
		spin_unlock_irqrestore(&iommu->lock, iflags);
		if (ret) {
			kfree(sdev);
			goto out;
		}
	}
	list_add_rcu(&sdev->list, &svm->devs);
success:
	sdev->pasid = svm->pasid;
	sdev->sva.dev = dev;
	if (sd)
		*sd = sdev;
	ret = 0;
out:
	return ret;
}

/* Caller must hold pasid_mutex */
static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
{
	struct intel_svm_dev *sdev;
	struct intel_iommu *iommu;
	struct intel_svm *svm;
	int ret = -EINVAL;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu)
		goto out;

	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			/* Flush the PASID cache and IOTLB for this device.
			 * Note that we do depend on the hardware *not* using
			 * the PASID any more. Just as we depend on other
			 * devices never using PASIDs that they have no right
			 * to use. We have a *shared* PASID table, because it's
			 * large and has to be physically contiguous. So it's
			 * hard to be as defensive as we might like. */
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				ioasid_put(svm->pasid);
				if (svm->mm) {
					mmu_notifier_unregister(&svm->notifier, svm->mm);
					/* Clear mm's pasid. */
					load_pasid(svm->mm, PASID_DISABLED);
				}
				list_del(&svm->list);
				/* We mandate that no page faults may be outstanding
				 * for the PASID when intel_svm_unbind_mm() is called.
				 * If that is not obeyed, subtle errors will happen.
				 * Let's make them less subtle... */
				memset(svm, 0x6b, sizeof(*svm));
				kfree(svm);
			}
		}
	}
out:
	return ret;
}

/* Page request queue descriptor */
struct page_req_dsc {
	union {
		struct {
			u64 type:8;
			u64 pasid_present:1;
			u64 priv_data_present:1;
			u64 rsvd:6;
			u64 rid:16;
			u64 pasid:20;
			u64 exe_req:1;
			u64 pm_req:1;
			u64 rsvd2:10;
		};
		u64 qw_0;
	};
	union {
		struct {
			u64 rd_req:1;
			u64 wr_req:1;
			u64 lpig:1;
			u64 prg_index:9;
			u64 addr:52;
		};
		u64 qw_1;
	};
	u64 priv_data[2];
};

#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20)

static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
{
	unsigned long requested = 0;

	if (req->exe_req)
		requested |= VM_EXEC;

	if (req->rd_req)
		requested |= VM_READ;

	if (req->wr_req)
		requested |= VM_WRITE;

	return (requested & ~vma->vm_flags) != 0;
}

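/*
 * Reject non-canonical addresses via sign extension: with 4-level paging
 * (__VIRTUAL_MASK_SHIFT == 47) the shift is 16, so the check passes only
 * when bits 63:48 are all copies of bit 47, as the CPU requires.
 */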
static bool is_canonical_address(u64 addr)
{
	int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
	long saddr = (long) addr;

	return (((saddr << shift) >> shift) == saddr);
}

/**
 * intel_svm_drain_prq - Drain page requests and responses for a pasid
 * @dev: target device
 * @pasid: pasid for draining
 *
 * Drain all pending page requests and responses related to @pasid in both
 * software and hardware. This is supposed to be called after the device
 * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
 * and DevTLB have been invalidated.
 *
 * It waits until all pending page requests for @pasid in the page fault
 * queue are completed by the prq handling thread. Then follow the steps
 * described in VT-d spec CH7.10 to drain all page requests and page
 * responses pending in the hardware.
 */
static void intel_svm_drain_prq(struct device *dev, u32 pasid)
{
	struct device_domain_info *info;
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct qi_desc desc[3];
	struct pci_dev *pdev;
	int head, tail;
	u16 sid, did;
	int qdep;

	info = get_domain_info(dev);
	if (WARN_ON(!info || !dev_is_pci(dev)))
		return;

	if (!info->pri_enabled)
		return;

	iommu = info->iommu;
	domain = info->domain;
	pdev = to_pci_dev(dev);
	sid = PCI_DEVID(info->bus, info->devfn);
	did = domain->iommu_did[iommu->seq_id];
	qdep = pci_ats_queue_depth(pdev);

	/*
	 * Check and wait until all pending page requests in the queue are
	 * handled by the prq handling thread.
	 */
prq_retry:
	reinit_completion(&iommu->prq_complete);
	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct page_req_dsc *req;

		req = &iommu->prq[head / sizeof(*req)];
		if (!req->pasid_present || req->pasid != pasid) {
			head = (head + sizeof(*req)) & PRQ_RING_MASK;
			continue;
		}

		wait_for_completion(&iommu->prq_complete);
		goto prq_retry;
	}

	/*
	 * Perform steps described in VT-d spec CH7.10 to drain page
	 * requests and responses in hardware.
	 */
	memset(desc, 0, sizeof(desc));
	desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
			QI_IWD_FENCE |
			QI_IWD_TYPE;
	desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
			QI_EIOTLB_DID(did) |
			QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
			QI_EIOTLB_TYPE;
	desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
			QI_DEV_EIOTLB_SID(sid) |
			QI_DEV_EIOTLB_QDEP(qdep) |
			QI_DEIOTLB_TYPE |
			QI_DEV_IOTLB_PFSID(info->pfsid);
qi_retry:
	reinit_completion(&iommu->prq_complete);
	qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
		wait_for_completion(&iommu->prq_complete);
		goto qi_retry;
	}
}

static int prq_to_iommu_prot(struct page_req_dsc *req)
{
	int prot = 0;

	if (req->rd_req)
		prot |= IOMMU_FAULT_PERM_READ;
	if (req->wr_req)
		prot |= IOMMU_FAULT_PERM_WRITE;
	if (req->exe_req)
		prot |= IOMMU_FAULT_PERM_EXEC;
	if (req->pm_req)
		prot |= IOMMU_FAULT_PERM_PRIV;

	return prot;
}

static int
intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
{
	struct iommu_fault_event event;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	/* Fill in event data for device specific processing */
	memset(&event, 0, sizeof(struct iommu_fault_event));
	event.fault.type = IOMMU_FAULT_PAGE_REQ;
	event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT;
	event.fault.prm.pasid = desc->pasid;
	event.fault.prm.grpid = desc->prg_index;
	event.fault.prm.perm = prq_to_iommu_prot(desc);

	if (desc->lpig)
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
	if (desc->pasid_present) {
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
	}
	if (desc->priv_data_present) {
		/*
		 * Set the last-page-in-group bit if private data is present;
		 * a page response is required in that case, just as it is
		 * for LPIG. iommu_report_device_fault() doesn't understand
		 * this vendor-specific requirement, so we set last_page as a
		 * workaround.
		 */
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
		memcpy(event.fault.prm.private_data, desc->priv_data,
		       sizeof(desc->priv_data));
	}

	return iommu_report_device_fault(dev, &event);
}

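/*
 * Page request queue interrupt thread: walk the ring from head to tail, one
 * 32-byte descriptor at a time. Faults for natively bound PASIDs are
 * resolved with handle_mm_fault() against the bound mm; faults for
 * guest-mode PASIDs are forwarded through iommu_report_device_fault().
 * Descriptors with LPIG or private data get a page group response, and the
 * head register is advanced past everything that was processed.
 */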
static irqreturn_t prq_event_thread(int irq, void *d)
{
	struct intel_svm_dev *sdev = NULL;
	struct intel_iommu *iommu = d;
	struct intel_svm *svm = NULL;
	int head, tail, handled = 0;

	/* Clear PPR bit before reading head/tail registers, to
	 * ensure that we get a new interrupt if needed. */
	writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);

	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct vm_area_struct *vma;
		struct page_req_dsc *req;
		struct qi_desc resp;
		int result;
		vm_fault_t ret;
		u64 address;

		handled = 1;

		req = &iommu->prq[head / sizeof(*req)];

		result = QI_RESP_FAILURE;
		address = (u64)req->addr << VTD_PAGE_SHIFT;
		if (!req->pasid_present) {
			pr_err("%s: Page request without PASID: %08llx %08llx\n",
			       iommu->name, ((unsigned long long *)req)[0],
			       ((unsigned long long *)req)[1]);
			goto no_pasid;
		}

		if (!svm || svm->pasid != req->pasid) {
			rcu_read_lock();
			svm = ioasid_find(NULL, req->pasid, NULL);
			/* It *can't* go away, because the driver is not permitted
			 * to unbind the mm while any page faults are outstanding.
			 * So we only need RCU to protect the internal ioasid
			 * lookup. */
			rcu_read_unlock();
			if (IS_ERR_OR_NULL(svm)) {
				pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
				       iommu->name, req->pasid, ((unsigned long long *)req)[0],
				       ((unsigned long long *)req)[1]);
				goto no_pasid;
			}
		}

		if (!sdev || sdev->sid != req->rid) {
			struct intel_svm_dev *t;

			sdev = NULL;
			rcu_read_lock();
			list_for_each_entry_rcu(t, &svm->devs, list) {
				if (t->sid == req->rid) {
					sdev = t;
					break;
				}
			}
			rcu_read_unlock();
		}

		result = QI_RESP_INVALID;
		/* Since we're using init_mm.pgd directly, we should never take
		 * any faults on kernel addresses. */
		if (!svm->mm)
			goto bad_req;

		/* If address is not canonical, return invalid response */
		if (!is_canonical_address(address))
			goto bad_req;

		/*
		 * If the prq is to be handled outside the IOMMU driver, by the
		 * receiver of the fault notifiers, we skip the page response here.
		 */
		if (svm->flags & SVM_FLAG_GUEST_MODE) {
			if (sdev && !intel_svm_prq_report(sdev->dev, req))
				goto prq_advance;
			else
				goto bad_req;
		}

		/* If the mm is already defunct, don't handle faults. */
		if (!mmget_not_zero(svm->mm))
			goto bad_req;

		mmap_read_lock(svm->mm);
		vma = find_extend_vma(svm->mm, address);
		if (!vma || address < vma->vm_start)
			goto invalid;

		if (access_error(vma, req))
			goto invalid;

		ret = handle_mm_fault(vma, address,
				      req->wr_req ? FAULT_FLAG_WRITE : 0,
				      NULL);
		if (ret & VM_FAULT_ERROR)
			goto invalid;

		result = QI_RESP_SUCCESS;
invalid:
		mmap_read_unlock(svm->mm);
		mmput(svm->mm);
bad_req:
		WARN_ON(!sdev);
		if (sdev && sdev->ops && sdev->ops->fault_cb) {
			int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
				(req->exe_req << 1) | (req->pm_req);
			sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
					    req->priv_data, rwxp, result);
		}
		/* We get here in the error case where the PASID lookup failed,
		   and these can be NULL. Do not use them below this point! */
		sdev = NULL;
		svm = NULL;
no_pasid:
		if (req->lpig || req->priv_data_present) {
			/*
			 * Per VT-d spec. v3.0 ch7.7, system software must
			 * respond with page group response if private data
			 * is present (PDP) or last page in group (LPIG) bit
			 * is set. This is an additional VT-d feature beyond
			 * PCI ATS spec.
			 */
			resp.qw0 = QI_PGRP_PASID(req->pasid) |
				QI_PGRP_DID(req->rid) |
				QI_PGRP_PASID_P(req->pasid_present) |
				QI_PGRP_PDP(req->priv_data_present) |
				QI_PGRP_RESP_CODE(result) |
				QI_PGRP_RESP_TYPE;
			resp.qw1 = QI_PGRP_IDX(req->prg_index) |
				QI_PGRP_LPIG(req->lpig);
			resp.qw2 = 0;
			resp.qw3 = 0;

			if (req->priv_data_present)
				memcpy(&resp.qw2, req->priv_data,
				       sizeof(req->priv_data));
			qi_submit_sync(iommu, &resp, 1, 0);
		}
prq_advance:
		head = (head + sizeof(*req)) & PRQ_RING_MASK;
	}

	dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);

	/*
	 * Clear the page request overflow bit and wake up all threads that
	 * are waiting for the completion of this handling.
	 */
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO)
		writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);

	if (!completion_done(&iommu->prq_complete))
		complete(&iommu->prq_complete);

	return IRQ_RETVAL(handled);
}

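/*
 * Driver-facing SVA entry points: bind an mm to a device and return an
 * iommu_sva handle, with optional SVM_FLAG_* flags passed in through
 * drvdata. All of them serialize on pasid_mutex.
 */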
#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
struct iommu_sva *
intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
{
	struct iommu_sva *sva = ERR_PTR(-EINVAL);
	struct intel_svm_dev *sdev = NULL;
	unsigned int flags = 0;
	int ret;

	/*
	 * TODO: Consolidate with generic iommu-sva bind after it is merged.
	 * It will require shared SVM data structures, i.e. combine io_mm
	 * and intel_svm etc.
	 */
	if (drvdata)
		flags = *(unsigned int *)drvdata;
	mutex_lock(&pasid_mutex);
	ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev);
	if (ret)
		sva = ERR_PTR(ret);
	else if (sdev)
		sva = &sdev->sva;
	else
		WARN(!sdev, "SVM bind succeeded with no sdev!\n");

	mutex_unlock(&pasid_mutex);

	return sva;
}

void intel_svm_unbind(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	intel_svm_unbind_mm(sdev->dev, sdev->pasid);
	mutex_unlock(&pasid_mutex);
}

u32 intel_svm_get_pasid(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;
	u32 pasid;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	pasid = sdev->pasid;
	mutex_unlock(&pasid_mutex);

	return pasid;
}

int intel_svm_page_response(struct device *dev,
			    struct iommu_fault_event *evt,
			    struct iommu_page_response *msg)
{
	struct iommu_fault_page_request *prm;
	struct intel_svm_dev *sdev = NULL;
	struct intel_svm *svm = NULL;
	struct intel_iommu *iommu;
	bool private_present;
	bool pasid_present;
	bool last_page;
	u8 bus, devfn;
	int ret = 0;
	u16 sid;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!msg || !evt)
		return -EINVAL;

	mutex_lock(&pasid_mutex);

	prm = &evt->fault.prm;
	sid = PCI_DEVID(bus, devfn);
	pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
	private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
	last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;

	if (!pasid_present) {
		ret = -EINVAL;
		goto out;
	}

	if (prm->pasid == 0 || prm->pasid >= PASID_MAX) {
		ret = -EINVAL;
		goto out;
	}

	ret = pasid_to_svm_sdev(dev, prm->pasid, &svm, &sdev);
	if (ret || !sdev) {
		ret = -ENODEV;
		goto out;
	}

	/*
	 * For responses from userspace, we need to make sure that the
	 * pasid has been bound to its mm.
	 */
	if (svm->flags & SVM_FLAG_GUEST_MODE) {
		struct mm_struct *mm;

		mm = get_task_mm(current);
		if (!mm) {
			ret = -EINVAL;
			goto out;
		}

		if (mm != svm->mm) {
			ret = -ENODEV;
			mmput(mm);
			goto out;
		}

		mmput(mm);
	}

	/*
	 * Per VT-d spec. v3.0 ch7.7, system software must respond
	 * with page group response if private data is present (PDP)
	 * or last page in group (LPIG) bit is set. This is an
	 * additional VT-d requirement beyond PCI ATS spec.
	 */
	if (last_page || private_present) {
		struct qi_desc desc;

		desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) |
				QI_PGRP_PASID_P(pasid_present) |
				QI_PGRP_PDP(private_present) |
				QI_PGRP_RESP_CODE(msg->code) |
				QI_PGRP_RESP_TYPE;
		desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page);
		desc.qw2 = 0;
		desc.qw3 = 0;
		if (private_present)
			memcpy(&desc.qw2, prm->private_data,
			       sizeof(prm->private_data));

		qi_submit_sync(iommu, &desc, 1, 0);
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}