// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2015 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/intel-iommu.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/intel-svm.h>
#include <linux/rculist.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/dmar.h>
#include <linux/interrupt.h>
#include <linux/mm_types.h>
#include <linux/ioasid.h>
#include <asm/page.h>
#include <asm/fpu/api.h>

#include "pasid.h"

static irqreturn_t prq_event_thread(int irq, void *d);
static void intel_svm_drain_prq(struct device *dev, u32 pasid);

#define PRQ_ORDER 0

int intel_svm_enable_prq(struct intel_iommu *iommu)
{
	struct page *pages;
	int irq, ret;

	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
	if (!pages) {
		pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
			iommu->name);
		return -ENOMEM;
	}
	iommu->prq = page_address(pages);

	irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
	if (irq <= 0) {
		pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
		       iommu->name);
		ret = -EINVAL;
	err:
		free_pages((unsigned long)iommu->prq, PRQ_ORDER);
		iommu->prq = NULL;
		return ret;
	}
	iommu->pr_irq = irq;

	snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);

	ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
				   iommu->prq_name, iommu);
	if (ret) {
		pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
		       iommu->name);
		dmar_free_hwirq(irq);
		iommu->pr_irq = 0;
		goto err;
	}
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);

	init_completion(&iommu->prq_complete);

	return 0;
}

int intel_svm_finish_prq(struct intel_iommu *iommu)
{
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);

	if (iommu->pr_irq) {
		free_irq(iommu->pr_irq, iommu);
		dmar_free_hwirq(iommu->pr_irq);
		iommu->pr_irq = 0;
	}

	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
	iommu->prq = NULL;

	return 0;
}

static inline bool intel_svm_capable(struct intel_iommu *iommu)
{
	return iommu->flags & VTD_FLAG_SVM_CAPABLE;
}

void intel_svm_check(struct intel_iommu *iommu)
{
	if (!pasid_supported(iommu))
		return;

	if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
	    !cap_fl1gp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible 1GB page capability\n",
		       iommu->name);
		return;
	}

	if (cpu_feature_enabled(X86_FEATURE_LA57) &&
	    !cap_5lp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible paging mode\n",
		       iommu->name);
		return;
	}

	iommu->flags |= VTD_FLAG_SVM_CAPABLE;
}

static void intel_flush_svm_range_dev(struct intel_svm *svm, struct intel_svm_dev *sdev,
				      unsigned long address, unsigned long pages, int ih)
{
	struct qi_desc desc;

	if (pages == -1) {
		desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
				QI_EIOTLB_DID(sdev->did) |
				QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
				QI_EIOTLB_TYPE;
		desc.qw1 = 0;
	} else {
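		/*
		 * Page-selective-within-PASID invalidation takes a
		 * power-of-two address mask (AM), so round the page count
		 * up: e.g. pages == 9 gives mask == 4, invalidating the
		 * naturally aligned 16-page region containing @address.
		 */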
		int mask = ilog2(__roundup_pow_of_two(pages));

		desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
				QI_EIOTLB_DID(sdev->did) |
				QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
				QI_EIOTLB_TYPE;
		desc.qw1 = QI_EIOTLB_ADDR(address) |
				QI_EIOTLB_IH(ih) |
				QI_EIOTLB_AM(mask);
	}
	desc.qw2 = 0;
	desc.qw3 = 0;
	qi_submit_sync(svm->iommu, &desc, 1, 0);

	if (sdev->dev_iotlb) {
		desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) |
				QI_DEV_EIOTLB_SID(sdev->sid) |
				QI_DEV_EIOTLB_QDEP(sdev->qdep) |
				QI_DEIOTLB_TYPE;
		if (pages == -1) {
			desc.qw1 = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) |
					QI_DEV_EIOTLB_SIZE;
		} else if (pages > 1) {
			/* The least significant zero bit indicates the size. So,
			 * for example, an "address" value of 0x12345f000 will
			 * flush from 0x123440000 to 0x12347ffff (256KiB). */
			unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT);
			unsigned long mask = __rounddown_pow_of_two(address ^ last);

			desc.qw1 = QI_DEV_EIOTLB_ADDR((address & ~mask) |
					(mask - 1)) | QI_DEV_EIOTLB_SIZE;
		} else {
			desc.qw1 = QI_DEV_EIOTLB_ADDR(address);
		}
		desc.qw2 = 0;
		desc.qw3 = 0;
		qi_submit_sync(svm->iommu, &desc, 1, 0);
	}
}

static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
				  unsigned long pages, int ih)
{
	struct intel_svm_dev *sdev;

	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
	rcu_read_unlock();
}

/* Pages have been freed at this point */
static void intel_invalidate_range(struct mmu_notifier *mn,
				   struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);

	intel_flush_svm_range(svm, start,
			      (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
}

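/*
 * Note that qi_submit_sync() does not return until the invalidation
 * descriptors (and the trailing wait descriptor) have been processed, so
 * the IOTLB and device TLB hold no stale translations for the freed range
 * once the notifier callback above returns.
 */
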
static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
	struct intel_svm_dev *sdev;

	/* This might end up being called from exit_mmap(), *before* the page
	 * tables are cleared. And __mmu_notifier_release() will delete us from
	 * the list of notifiers so that our invalidate_range() callback doesn't
	 * get called when the page tables are cleared. So we need to protect
	 * against hardware accessing those page tables.
	 *
	 * We do it by clearing the entry in the PASID table and then flushing
	 * the IOTLB and the PASID table caches. This might upset hardware;
	 * perhaps we'll want to point the PASID to a dummy PGD (like the zero
	 * page) so that we end up taking a fault that the hardware really
	 * *has* to handle gracefully without affecting other processes.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_pasid_tear_down_entry(svm->iommu, sdev->dev,
					    svm->pasid, true);
	rcu_read_unlock();
}

static const struct mmu_notifier_ops intel_mmuops = {
	.release = intel_mm_release,
	.invalidate_range = intel_invalidate_range,
};

static DEFINE_MUTEX(pasid_mutex);
static LIST_HEAD(global_svm_list);

#define for_each_svm_dev(sdev, svm, d)			\
	list_for_each_entry((sdev), &(svm)->devs, list)	\
		if ((d) != (sdev)->dev) {} else

static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
			     struct intel_svm **rsvm,
			     struct intel_svm_dev **rsdev)
{
	struct intel_svm_dev *d, *sdev = NULL;
	struct intel_svm *svm;

	/* The caller should hold the pasid_mutex lock */
	if (WARN_ON(!mutex_is_locked(&pasid_mutex)))
		return -EINVAL;

	if (pasid == INVALID_IOASID || pasid >= PASID_MAX)
		return -EINVAL;

	svm = ioasid_find(NULL, pasid, NULL);
	if (IS_ERR(svm))
		return PTR_ERR(svm);

	if (!svm)
		goto out;

	/*
	 * If we found an svm for the PASID, there must be at least one
	 * device bound to it.
	 */
	if (WARN_ON(list_empty(&svm->devs)))
		return -EINVAL;

	rcu_read_lock();
	list_for_each_entry_rcu(d, &svm->devs, list) {
		if (d->dev == dev) {
			sdev = d;
			break;
		}
	}
	rcu_read_unlock();

out:
	*rsvm = svm;
	*rsdev = sdev;

	return 0;
}

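/*
 * Guest SVA ("gpasid") bind: the guest manages the first-level page table,
 * whose PGD arrives in struct iommu_gpasid_bind_data, while the second
 * level comes from the dmar_domain the device is attached to. The PASID
 * itself was allocated by the caller (e.g. a vIOMMU backend), not here.
 */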
int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
			  struct iommu_gpasid_bind_data *data)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev = NULL;
	struct dmar_domain *dmar_domain;
	struct device_domain_info *info;
	struct intel_svm *svm = NULL;
	int ret = 0;

	if (WARN_ON(!iommu) || !data)
		return -EINVAL;

	if (data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
		return -EINVAL;

	/* IOMMU core ensures argsz is more than the start of the union */
	if (data->argsz < offsetofend(struct iommu_gpasid_bind_data, vendor.vtd))
		return -EINVAL;

	/* Make sure no undefined flags are used in vendor data */
	if (data->vendor.vtd.flags & ~(IOMMU_SVA_VTD_GPASID_LAST - 1))
		return -EINVAL;

	if (!dev_is_pci(dev))
		return -ENOTSUPP;

	/* VT-d supports devices with full 20 bit PASIDs only */
	if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
		return -EINVAL;

	/*
	 * We only check the host PASID range; we have no way to check the
	 * guest PASID range.
	 */
	if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
		return -EINVAL;

	info = get_domain_info(dev);
	if (!info)
		return -EINVAL;

	dmar_domain = to_dmar_domain(domain);

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, data->hpasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		/*
		 * Do not allow multiple bindings of the same device-PASID since
		 * there is only one SL page table per PASID. We may revisit
		 * this once sharing PGDs across domains is supported.
		 */
		dev_warn_ratelimited(dev, "Already bound with PASID %u\n",
				     svm->pasid);
		ret = -EBUSY;
		goto out;
	}

	if (!svm) {
		/* We come here when the PASID has never been bound to a device. */
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			goto out;
		}
		/* REVISIT: the upper layer/VFIO can track the host process
		 * that binds the PASID. ioasid_set = mm might be sufficient
		 * for vfio to check PASID VMM ownership. We can drop the
		 * following line once the VFIO and IOASID set check is in
		 * place.
		 */
		svm->mm = get_task_mm(current);
		svm->pasid = data->hpasid;
		if (data->flags & IOMMU_SVA_GPASID_VAL) {
			svm->gpasid = data->gpasid;
			svm->flags |= SVM_FLAG_GUEST_PASID;
		}
		ioasid_set_data(data->hpasid, svm);
		INIT_LIST_HEAD_RCU(&svm->devs);
		mmput(svm->mm);
	}
	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);

	/* Only count users if device has aux domains */
	if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
		sdev->users = 1;

	/* Set up device context entry for PASID if not enabled already */
	ret = intel_iommu_enable_pasid(iommu, sdev->dev);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
		kfree(sdev);
		goto out;
	}

	/*
	 * The PASID table is per device for better security. Therefore, for
	 * each bind of a new device, even with an existing PASID, we need to
	 * call the nested mode setup function here.
	 */
	spin_lock(&iommu->lock);
	ret = intel_pasid_setup_nested(iommu, dev,
				       (pgd_t *)(uintptr_t)data->gpgd,
				       data->hpasid, &data->vendor.vtd, dmar_domain,
				       data->addr_width);
	spin_unlock(&iommu->lock);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
				    data->hpasid, ret);
		/*
		 * The PASID entry should be in a cleared state if nested mode
		 * setup failed. So we only need to clear the IOASID tracking
		 * data such that the free call will succeed.
		 */
		kfree(sdev);
		goto out;
	}

	svm->flags |= SVM_FLAG_GUEST_MODE;

	init_rcu_head(&sdev->rcu);
	list_add_rcu(&sdev->list, &svm->devs);
out:
	if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
		ioasid_set_data(data->hpasid, NULL);
		kfree(svm);
	}

	mutex_unlock(&pasid_mutex);
	return ret;
}

int intel_svm_unbind_gpasid(struct device *dev, u32 pasid)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev;
	struct intel_svm *svm;
	int ret;

	if (WARN_ON(!iommu))
		return -EINVAL;

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
			sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				/*
				 * We do not free the IOASID here because the
				 * IOMMU driver did not allocate it. Unlike
				 * native SVM, the IOASID for guest use was
				 * allocated prior to the bind call. In any
				 * case, if the free call comes before the
				 * unbind, the IOMMU driver will get notified
				 * and perform cleanup.
				 */
				ioasid_set_data(pasid, NULL);
				kfree(svm);
			}
		}
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}

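/*
 * mm->pasid feeds the IA32_PASID MSR that ENQCMD-capable devices rely on.
 * load_pasid() publishes the new value with smp_store_release() and then
 * runs _load_pasid() on every CPU in the mm's cpumask so that
 * update_pasid() can refresh the per-CPU MSR state there.
 */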
static void _load_pasid(void *unused)
{
	update_pasid();
}

static void load_pasid(struct mm_struct *mm, u32 pasid)
{
	mutex_lock(&mm->context.lock);

	/* Synchronize with READ_ONCE in update_pasid(). */
	smp_store_release(&mm->pasid, pasid);

	/* Update PASID MSR on all CPUs running the mm's tasks. */
	on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true);

	mutex_unlock(&mm->context.lock);
}

/* Caller must hold pasid_mutex, mm reference */
static int
intel_svm_bind_mm(struct device *dev, unsigned int flags,
		  struct svm_dev_ops *ops,
		  struct mm_struct *mm, struct intel_svm_dev **sd)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct device_domain_info *info;
	struct intel_svm_dev *sdev;
	struct intel_svm *svm = NULL;
	int pasid_max;
	int ret;

	if (!iommu || dmar_disabled)
		return -EINVAL;

	if (!intel_svm_capable(iommu))
		return -ENOTSUPP;

	if (dev_is_pci(dev)) {
		pasid_max = pci_max_pasids(to_pci_dev(dev));
		if (pasid_max < 0)
			return -EINVAL;
	} else
		pasid_max = 1 << 20;

	/* Binding a supervisor PASID should have mm = NULL */
	if (flags & SVM_FLAG_SUPERVISOR_MODE) {
		if (!ecap_srs(iommu->ecap) || mm) {
			pr_err("Supervisor PASID with user provided mm.\n");
			return -EINVAL;
		}
	}

	if (!(flags & SVM_FLAG_PRIVATE_PASID)) {
		struct intel_svm *t;

		list_for_each_entry(t, &global_svm_list, list) {
			if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
				continue;

			svm = t;
			if (svm->pasid >= pasid_max) {
				dev_warn(dev,
					 "Limited PASID width. Cannot use existing PASID %d\n",
					 svm->pasid);
				ret = -ENOSPC;
				goto out;
			}

			/* Find the matching device in svm list */
			for_each_svm_dev(sdev, svm, dev) {
				if (sdev->ops != ops) {
					ret = -EBUSY;
					goto out;
				}
				sdev->users++;
				goto success;
			}

			break;
		}
	}

	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;

	ret = intel_iommu_enable_pasid(iommu, dev);
	if (ret) {
		kfree(sdev);
		goto out;
	}

	info = get_domain_info(dev);
	sdev->did = FLPT_DEFAULT_DID;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}

	/* Finish the setup now we know we're keeping it */
	sdev->users = 1;
	sdev->ops = ops;
	init_rcu_head(&sdev->rcu);

	if (!svm) {
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			kfree(sdev);
			goto out;
		}
		svm->iommu = iommu;

		if (pasid_max > intel_pasid_max_id)
			pasid_max = intel_pasid_max_id;

		/* Do not use PASID 0, reserved for RID to PASID */
		svm->pasid = ioasid_alloc(NULL, PASID_MIN,
					  pasid_max - 1, svm);
		if (svm->pasid == INVALID_IOASID) {
			kfree(svm);
			kfree(sdev);
			ret = -ENOSPC;
			goto out;
		}
		svm->notifier.ops = &intel_mmuops;
		svm->mm = mm;
		svm->flags = flags;
		INIT_LIST_HEAD_RCU(&svm->devs);
		INIT_LIST_HEAD(&svm->list);
		ret = -ENOMEM;
		if (mm) {
			ret = mmu_notifier_register(&svm->notifier, mm);
			if (ret) {
				ioasid_put(svm->pasid);
				kfree(svm);
				kfree(sdev);
				goto out;
			}
		}

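		/*
		 * Program the first-level (process) page table into this
		 * device's PASID table entry: kernel PASIDs use init_mm.pgd
		 * with PASID_FLAG_SUPERVISOR_MODE, FLPT_DEFAULT_DID is the
		 * driver's reserved domain ID for first-level translation,
		 * and PASID_FLAG_FL5LP selects 5-level paging when the CPU
		 * runs with LA57 enabled.
		 */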
		spin_lock(&iommu->lock);
		ret = intel_pasid_setup_first_level(iommu, dev,
				mm ? mm->pgd : init_mm.pgd,
				svm->pasid, FLPT_DEFAULT_DID,
				(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
				(cpu_feature_enabled(X86_FEATURE_LA57) ?
				 PASID_FLAG_FL5LP : 0));
		spin_unlock(&iommu->lock);
		if (ret) {
			if (mm)
				mmu_notifier_unregister(&svm->notifier, mm);
			ioasid_put(svm->pasid);
			kfree(svm);
			kfree(sdev);
			goto out;
		}

		list_add_tail(&svm->list, &global_svm_list);
		if (mm) {
			/* The newly allocated pasid is loaded to the mm. */
			load_pasid(mm, svm->pasid);
		}
	} else {
		/*
		 * Binding a new device with an existing PASID, so we need to
		 * set up the PASID entry.
		 */
		spin_lock(&iommu->lock);
		ret = intel_pasid_setup_first_level(iommu, dev,
				mm ? mm->pgd : init_mm.pgd,
				svm->pasid, FLPT_DEFAULT_DID,
				(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
				(cpu_feature_enabled(X86_FEATURE_LA57) ?
				 PASID_FLAG_FL5LP : 0));
		spin_unlock(&iommu->lock);
		if (ret) {
			kfree(sdev);
			goto out;
		}
	}
	list_add_rcu(&sdev->list, &svm->devs);
success:
	sdev->pasid = svm->pasid;
	sdev->sva.dev = dev;
	if (sd)
		*sd = sdev;
	ret = 0;
out:
	return ret;
}

/* Caller must hold pasid_mutex */
static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
{
	struct intel_svm_dev *sdev;
	struct intel_iommu *iommu;
	struct intel_svm *svm;
	int ret = -EINVAL;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu)
		goto out;

	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			/* Flush the PASID cache and IOTLB for this device.
			 * Note that we do depend on the hardware *not* using
			 * the PASID any more. Just as we depend on other
			 * devices never using PASIDs that they have no right
			 * to use. We have a *shared* PASID table, because it's
			 * large and has to be physically contiguous. So it's
			 * hard to be as defensive as we might like. */
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				ioasid_put(svm->pasid);
				if (svm->mm) {
					mmu_notifier_unregister(&svm->notifier, svm->mm);
					/* Clear mm's pasid. */
					load_pasid(svm->mm, PASID_DISABLED);
				}
				list_del(&svm->list);
				/* We mandate that no page faults may be outstanding
				 * for the PASID when intel_svm_unbind_mm() is called.
				 * If that is not obeyed, subtle errors will happen.
				 * Let's make them less subtle... */
				memset(svm, 0x6b, sizeof(*svm));
				kfree(svm);
			}
		}
	}
out:
	return ret;
}

/* Page request queue descriptor */
struct page_req_dsc {
	union {
		struct {
			u64 type:8;
			u64 pasid_present:1;
			u64 priv_data_present:1;
			u64 rsvd:6;
			u64 rid:16;
			u64 pasid:20;
			u64 exe_req:1;
			u64 pm_req:1;
			u64 rsvd2:10;
		};
		u64 qw_0;
	};
	union {
		struct {
			u64 rd_req:1;
			u64 wr_req:1;
			u64 lpig:1;
			u64 prg_index:9;
			u64 addr:52;
		};
		u64 qw_1;
	};
	u64 priv_data[2];
};

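/*
 * Each page_req_dsc is 32 bytes, so a PRQ_ORDER 0 (4KiB) queue holds 128
 * descriptors. PRQ_RING_MASK masks the values read from the PQH/PQT
 * registers down to a 32-byte aligned offset within the ring.
 */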
#define PRQ_RING_MASK	((0x1000 << PRQ_ORDER) - 0x20)

static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
{
	unsigned long requested = 0;

	if (req->exe_req)
		requested |= VM_EXEC;

	if (req->rd_req)
		requested |= VM_READ;

	if (req->wr_req)
		requested |= VM_WRITE;

	return (requested & ~vma->vm_flags) != 0;
}

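/*
 * Page request addresses are sanity-checked before being passed to
 * handle_mm_fault(). With 4-level paging __VIRTUAL_MASK_SHIFT is 47, so the
 * sign-extension check below requires bits 63:48 to replicate bit 47 (a
 * canonical 48-bit address); with 5-level paging the shift adjusts
 * accordingly.
 */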
static bool is_canonical_address(u64 addr)
{
	int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
	long saddr = (long) addr;

	return (((saddr << shift) >> shift) == saddr);
}

/**
 * intel_svm_drain_prq - Drain page requests and responses for a pasid
 * @dev: target device
 * @pasid: pasid for draining
 *
 * Drain all pending page requests and responses related to @pasid in both
 * software and hardware. This is supposed to be called after the device
 * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
 * and DevTLB have been invalidated.
 *
 * It waits until all pending page requests for @pasid in the page fault
 * queue are completed by the prq handling thread. Then follow the steps
 * described in VT-d spec CH7.10 to drain all page requests and page
 * responses pending in the hardware.
 */
static void intel_svm_drain_prq(struct device *dev, u32 pasid)
{
	struct device_domain_info *info;
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct qi_desc desc[3];
	struct pci_dev *pdev;
	int head, tail;
	u16 sid, did;
	int qdep;

	info = get_domain_info(dev);
	if (WARN_ON(!info || !dev_is_pci(dev)))
		return;

	if (!info->pri_enabled)
		return;

	iommu = info->iommu;
	domain = info->domain;
	pdev = to_pci_dev(dev);
	sid = PCI_DEVID(info->bus, info->devfn);
	did = domain->iommu_did[iommu->seq_id];
	qdep = pci_ats_queue_depth(pdev);

	/*
	 * Check and wait until all pending page requests in the queue are
	 * handled by the prq handling thread.
	 */
prq_retry:
	reinit_completion(&iommu->prq_complete);
	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct page_req_dsc *req;

		req = &iommu->prq[head / sizeof(*req)];
		if (!req->pasid_present || req->pasid != pasid) {
			head = (head + sizeof(*req)) & PRQ_RING_MASK;
			continue;
		}

		wait_for_completion(&iommu->prq_complete);
		goto prq_retry;
	}

	/*
	 * Perform steps described in VT-d spec CH7.10 to drain page
	 * requests and responses in hardware.
	 */
	memset(desc, 0, sizeof(desc));
	desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
			QI_IWD_FENCE |
			QI_IWD_TYPE;
	desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
			QI_EIOTLB_DID(did) |
			QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
			QI_EIOTLB_TYPE;
	desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
			QI_DEV_EIOTLB_SID(sid) |
			QI_DEV_EIOTLB_QDEP(qdep) |
			QI_DEIOTLB_TYPE |
			QI_DEV_IOTLB_PFSID(info->pfsid);
qi_retry:
	reinit_completion(&iommu->prq_complete);
	qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
		wait_for_completion(&iommu->prq_complete);
		goto qi_retry;
	}
}

static int prq_to_iommu_prot(struct page_req_dsc *req)
{
	int prot = 0;

	if (req->rd_req)
		prot |= IOMMU_FAULT_PERM_READ;
	if (req->wr_req)
		prot |= IOMMU_FAULT_PERM_WRITE;
	if (req->exe_req)
		prot |= IOMMU_FAULT_PERM_EXEC;
	if (req->pm_req)
		prot |= IOMMU_FAULT_PERM_PRIV;

	return prot;
}

static int
intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
{
	struct iommu_fault_event event;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	/* Fill in event data for device specific processing */
	memset(&event, 0, sizeof(struct iommu_fault_event));
	event.fault.type = IOMMU_FAULT_PAGE_REQ;
	event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT;
	event.fault.prm.pasid = desc->pasid;
	event.fault.prm.grpid = desc->prg_index;
	event.fault.prm.perm = prq_to_iommu_prot(desc);

	if (desc->lpig)
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
	if (desc->pasid_present) {
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
	}
	if (desc->priv_data_present) {
		/*
		 * Set the last-page-in-group bit if private data is present:
		 * a page response is then required, just as it is for LPIG.
		 * iommu_report_device_fault() doesn't understand this vendor
		 * specific requirement thus we set last_page as a workaround.
		 */
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
		memcpy(event.fault.prm.private_data, desc->priv_data,
		       sizeof(desc->priv_data));
	}

	return iommu_report_device_fault(dev, &event);
}

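/*
 * IRQ thread for the page request queue: walk the descriptors between head
 * and tail, look up the intel_svm by PASID, then either forward the request
 * to the fault reporting path (guest mode) or resolve it locally with
 * handle_mm_fault() and post a page group response via the invalidation
 * queue.
 */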
static irqreturn_t prq_event_thread(int irq, void *d)
{
	struct intel_svm_dev *sdev = NULL;
	struct intel_iommu *iommu = d;
	struct intel_svm *svm = NULL;
	int head, tail, handled = 0;

	/* Clear PPR bit before reading head/tail registers, to
	 * ensure that we get a new interrupt if needed. */
	writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);

	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct vm_area_struct *vma;
		struct page_req_dsc *req;
		struct qi_desc resp;
		int result;
		vm_fault_t ret;
		u64 address;

		handled = 1;

		req = &iommu->prq[head / sizeof(*req)];

		result = QI_RESP_FAILURE;
		address = (u64)req->addr << VTD_PAGE_SHIFT;
		if (!req->pasid_present) {
			pr_err("%s: Page request without PASID: %08llx %08llx\n",
			       iommu->name, ((unsigned long long *)req)[0],
			       ((unsigned long long *)req)[1]);
			goto no_pasid;
		}

		if (!svm || svm->pasid != req->pasid) {
			rcu_read_lock();
			svm = ioasid_find(NULL, req->pasid, NULL);
			/* It *can't* go away, because the driver is not permitted
			 * to unbind the mm while any page faults are outstanding.
			 * So we only need RCU to protect the internal idr code. */
			rcu_read_unlock();
			if (IS_ERR_OR_NULL(svm)) {
				pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
				       iommu->name, req->pasid, ((unsigned long long *)req)[0],
				       ((unsigned long long *)req)[1]);
				goto no_pasid;
			}
		}

		if (!sdev || sdev->sid != req->rid) {
			struct intel_svm_dev *t;

			sdev = NULL;
			rcu_read_lock();
			list_for_each_entry_rcu(t, &svm->devs, list) {
				if (t->sid == req->rid) {
					sdev = t;
					break;
				}
			}
			rcu_read_unlock();
		}

		result = QI_RESP_INVALID;
		/* Since we're using init_mm.pgd directly, we should never take
		 * any faults on kernel addresses. */
		if (!svm->mm)
			goto bad_req;

		/* If address is not canonical, return invalid response */
		if (!is_canonical_address(address))
			goto bad_req;

		/*
		 * If prq is to be handled outside iommu driver via receiver of
		 * the fault notifiers, we skip the page response here.
		 */
		if (svm->flags & SVM_FLAG_GUEST_MODE) {
			if (sdev && !intel_svm_prq_report(sdev->dev, req))
				goto prq_advance;
			else
				goto bad_req;
		}

		/* If the mm is already defunct, don't handle faults. */
		if (!mmget_not_zero(svm->mm))
			goto bad_req;

		mmap_read_lock(svm->mm);
		vma = find_extend_vma(svm->mm, address);
		if (!vma || address < vma->vm_start)
			goto invalid;

		if (access_error(vma, req))
			goto invalid;

		ret = handle_mm_fault(vma, address,
				      req->wr_req ? FAULT_FLAG_WRITE : 0,
				      NULL);
		if (ret & VM_FAULT_ERROR)
			goto invalid;

		result = QI_RESP_SUCCESS;
invalid:
		mmap_read_unlock(svm->mm);
		mmput(svm->mm);
bad_req:
		WARN_ON(!sdev);
		if (sdev && sdev->ops && sdev->ops->fault_cb) {
			int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
				(req->exe_req << 1) | (req->pm_req);
			sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
					    req->priv_data, rwxp, result);
		}
		/* We get here in the error case where the PASID lookup failed,
		 * and these can be NULL. Do not use them below this point! */
		sdev = NULL;
		svm = NULL;
no_pasid:
		if (req->lpig || req->priv_data_present) {
			/*
			 * Per VT-d spec. v3.0 ch7.7, system software must
			 * respond with page group response if private data
			 * is present (PDP) or last page in group (LPIG) bit
			 * is set. This is an additional VT-d feature beyond
			 * PCI ATS spec.
			 */
			resp.qw0 = QI_PGRP_PASID(req->pasid) |
				QI_PGRP_DID(req->rid) |
				QI_PGRP_PASID_P(req->pasid_present) |
				QI_PGRP_PDP(req->priv_data_present) |
				QI_PGRP_RESP_CODE(result) |
				QI_PGRP_RESP_TYPE;
			resp.qw1 = QI_PGRP_IDX(req->prg_index) |
				QI_PGRP_LPIG(req->lpig);
			resp.qw2 = 0;
			resp.qw3 = 0;

			if (req->priv_data_present)
				memcpy(&resp.qw2, req->priv_data,
				       sizeof(req->priv_data));
			qi_submit_sync(iommu, &resp, 1, 0);
		}
prq_advance:
		head = (head + sizeof(*req)) & PRQ_RING_MASK;
	}

	dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);

	/*
	 * Clear the page request overflow bit and wake up all threads that
	 * are waiting for the completion of this handling.
	 */
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO)
		writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);

	if (!completion_done(&iommu->prq_complete))
		complete(&iommu->prq_complete);

	return IRQ_RETVAL(handled);
}

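/*
 * Native SVA entry points below are reached through the IOMMU core; a
 * device driver does roughly (hypothetical driver code):
 *
 *	struct iommu_sva *sva = iommu_sva_bind_device(dev, current->mm, NULL);
 *	u32 pasid = iommu_sva_get_pasid(sva);
 *	... program @pasid into the device and submit work ...
 *	iommu_sva_unbind_device(sva);
 *
 * which the core maps onto intel_svm_bind()/intel_svm_get_pasid()/
 * intel_svm_unbind() for VT-d.
 */
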
#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
struct iommu_sva *
intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
{
	struct iommu_sva *sva = ERR_PTR(-EINVAL);
	struct intel_svm_dev *sdev = NULL;
	unsigned int flags = 0;
	int ret;

	/*
	 * TODO: Consolidate with generic iommu-sva bind after it is merged.
	 * It will require shared SVM data structures, i.e. combine io_mm
	 * and intel_svm etc.
	 */
	if (drvdata)
		flags = *(unsigned int *)drvdata;
	mutex_lock(&pasid_mutex);
	ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev);
	if (ret)
		sva = ERR_PTR(ret);
	else if (sdev)
		sva = &sdev->sva;
	else
		WARN(!sdev, "SVM bind succeeded with no sdev!\n");

	mutex_unlock(&pasid_mutex);

	return sva;
}

void intel_svm_unbind(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	intel_svm_unbind_mm(sdev->dev, sdev->pasid);
	mutex_unlock(&pasid_mutex);
}

u32 intel_svm_get_pasid(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;
	u32 pasid;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	pasid = sdev->pasid;
	mutex_unlock(&pasid_mutex);

	return pasid;
}

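/*
 * Page group responses from user space (e.g. a vIOMMU backend) arrive here.
 * The PASID must currently be bound to @dev and, for guest-mode bindings,
 * the caller's mm must match the mm the PASID was bound with before the
 * response descriptor is injected into the invalidation queue.
 */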
int intel_svm_page_response(struct device *dev,
			    struct iommu_fault_event *evt,
			    struct iommu_page_response *msg)
{
	struct iommu_fault_page_request *prm;
	struct intel_svm_dev *sdev = NULL;
	struct intel_svm *svm = NULL;
	struct intel_iommu *iommu;
	bool private_present;
	bool pasid_present;
	bool last_page;
	u8 bus, devfn;
	int ret = 0;
	u16 sid;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!msg || !evt)
		return -EINVAL;

	mutex_lock(&pasid_mutex);

	prm = &evt->fault.prm;
	sid = PCI_DEVID(bus, devfn);
	pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
	private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
	last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;

	if (!pasid_present) {
		ret = -EINVAL;
		goto out;
	}

	if (prm->pasid == 0 || prm->pasid >= PASID_MAX) {
		ret = -EINVAL;
		goto out;
	}

	ret = pasid_to_svm_sdev(dev, prm->pasid, &svm, &sdev);
	if (ret || !sdev) {
		ret = -ENODEV;
		goto out;
	}

	/*
	 * For responses from userspace, we need to make sure that the
	 * pasid has been bound to its mm.
	 */
	if (svm->flags & SVM_FLAG_GUEST_MODE) {
		struct mm_struct *mm;

		mm = get_task_mm(current);
		if (!mm) {
			ret = -EINVAL;
			goto out;
		}

		if (mm != svm->mm) {
			ret = -ENODEV;
			mmput(mm);
			goto out;
		}

		mmput(mm);
	}

	/*
	 * Per VT-d spec. v3.0 ch7.7, system software must respond
	 * with page group response if private data is present (PDP)
	 * or last page in group (LPIG) bit is set. This is an
	 * additional VT-d requirement beyond PCI ATS spec.
	 */
	if (last_page || private_present) {
		struct qi_desc desc;

		desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) |
				QI_PGRP_PASID_P(pasid_present) |
				QI_PGRP_PDP(private_present) |
				QI_PGRP_RESP_CODE(msg->code) |
				QI_PGRP_RESP_TYPE;
		desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page);
		desc.qw2 = 0;
		desc.qw3 = 0;
		if (private_present)
			memcpy(&desc.qw2, prm->private_data,
			       sizeof(prm->private_data));

		qi_submit_sync(iommu, &desc, 1, 0);
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}