// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2015 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/intel-iommu.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/intel-svm.h>
#include <linux/rculist.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/dmar.h>
#include <linux/interrupt.h>
#include <linux/mm_types.h>
#include <linux/ioasid.h>
#include <asm/page.h>
#include <asm/fpu/api.h>

#include "pasid.h"

static irqreturn_t prq_event_thread(int irq, void *d);
static void intel_svm_drain_prq(struct device *dev, u32 pasid);

#define PRQ_ORDER 0

int intel_svm_enable_prq(struct intel_iommu *iommu)
{
	struct page *pages;
	int irq, ret;

	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
	if (!pages) {
		pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
			iommu->name);
		return -ENOMEM;
	}
	iommu->prq = page_address(pages);

	irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
	if (irq <= 0) {
		pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
		       iommu->name);
		ret = -EINVAL;
	err:
		free_pages((unsigned long)iommu->prq, PRQ_ORDER);
		iommu->prq = NULL;
		return ret;
	}
	iommu->pr_irq = irq;

	snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);

	ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
				   iommu->prq_name, iommu);
	if (ret) {
		pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
		       iommu->name);
		dmar_free_hwirq(irq);
		iommu->pr_irq = 0;
		goto err;
	}
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);

	init_completion(&iommu->prq_complete);

	return 0;
}

int intel_svm_finish_prq(struct intel_iommu *iommu)
{
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);

	if (iommu->pr_irq) {
		free_irq(iommu->pr_irq, iommu);
		dmar_free_hwirq(iommu->pr_irq);
		iommu->pr_irq = 0;
	}

	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
	iommu->prq = NULL;

	return 0;
}

static inline bool intel_svm_capable(struct intel_iommu *iommu)
{
	return iommu->flags & VTD_FLAG_SVM_CAPABLE;
}

void intel_svm_check(struct intel_iommu *iommu)
{
	if (!pasid_supported(iommu))
		return;

	if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
	    !cap_fl1gp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible 1GB page capability\n",
		       iommu->name);
		return;
	}

	if (cpu_feature_enabled(X86_FEATURE_LA57) &&
	    !cap_5lp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible paging mode\n",
		       iommu->name);
		return;
	}

	iommu->flags |= VTD_FLAG_SVM_CAPABLE;
}

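/*
 * Flush the IOTLB (and, when ATS is enabled, the device TLB) for one bound
 * device.  pages == -1 requests a non-global, PASID-wide flush; otherwise a
 * page-selective flush is issued, with the range rounded up to a power-of-two
 * number of pages and encoded in the address-mask (AM) field.  For example
 * (illustrative only): pages == 9 rounds up to 16, giving AM = 4, so a
 * 16-page (64KiB with 4KiB pages) aligned region is invalidated.
 */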
static void intel_flush_svm_range_dev(struct intel_svm *svm, struct intel_svm_dev *sdev,
				      unsigned long address, unsigned long pages, int ih)
{
	struct qi_desc desc;

	if (pages == -1) {
		desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
			QI_EIOTLB_DID(sdev->did) |
			QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
			QI_EIOTLB_TYPE;
		desc.qw1 = 0;
	} else {
		int mask = ilog2(__roundup_pow_of_two(pages));

		desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
			QI_EIOTLB_DID(sdev->did) |
			QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
			QI_EIOTLB_TYPE;
		desc.qw1 = QI_EIOTLB_ADDR(address) |
			QI_EIOTLB_IH(ih) |
			QI_EIOTLB_AM(mask);
	}
	desc.qw2 = 0;
	desc.qw3 = 0;
	qi_submit_sync(svm->iommu, &desc, 1, 0);

	if (sdev->dev_iotlb) {
		desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) |
			QI_DEV_EIOTLB_SID(sdev->sid) |
			QI_DEV_EIOTLB_QDEP(sdev->qdep) |
			QI_DEIOTLB_TYPE;
		if (pages == -1) {
			desc.qw1 = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) |
				QI_DEV_EIOTLB_SIZE;
		} else if (pages > 1) {
			/* The least significant zero bit indicates the size. So,
			 * for example, an "address" value of 0x12345f000 will
			 * flush from 0x123440000 to 0x12347ffff (256KiB). */
			unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT);
			unsigned long mask = __rounddown_pow_of_two(address ^ last);

			desc.qw1 = QI_DEV_EIOTLB_ADDR((address & ~mask) |
					(mask - 1)) | QI_DEV_EIOTLB_SIZE;
		} else {
			desc.qw1 = QI_DEV_EIOTLB_ADDR(address);
		}
		desc.qw2 = 0;
		desc.qw3 = 0;
		qi_submit_sync(svm->iommu, &desc, 1, 0);
	}
}

static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
				  unsigned long pages, int ih)
{
	struct intel_svm_dev *sdev;

	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
	rcu_read_unlock();
}

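/*
 * mmu_notifier callbacks: invalidate_range() mirrors CPU TLB invalidations
 * into the IOTLB/device TLB for every device bound to this mm, and release()
 * tears the PASID entries down when the mm goes away.  The byte range from
 * the notifier is converted to a VTD_PAGE_SHIFT-sized page count before the
 * flush is issued.
 */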
/* Pages have been freed at this point */
static void intel_invalidate_range(struct mmu_notifier *mn,
				   struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);

	intel_flush_svm_range(svm, start,
			      (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
}

static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
	struct intel_svm_dev *sdev;

	/* This might end up being called from exit_mmap(), *before* the page
	 * tables are cleared. And __mmu_notifier_release() will delete us from
	 * the list of notifiers so that our invalidate_range() callback doesn't
	 * get called when the page tables are cleared. So we need to protect
	 * against hardware accessing those page tables.
	 *
	 * We do it by clearing the entry in the PASID table and then flushing
	 * the IOTLB and the PASID table caches. This might upset hardware;
	 * perhaps we'll want to point the PASID to a dummy PGD (like the zero
	 * page) so that we end up taking a fault that the hardware really
	 * *has* to handle gracefully without affecting other processes.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_pasid_tear_down_entry(svm->iommu, sdev->dev,
					    svm->pasid, true);
	rcu_read_unlock();
}

static const struct mmu_notifier_ops intel_mmuops = {
	.release = intel_mm_release,
	.invalidate_range = intel_invalidate_range,
};

static DEFINE_MUTEX(pasid_mutex);
static LIST_HEAD(global_svm_list);

#define for_each_svm_dev(sdev, svm, d)			\
	list_for_each_entry((sdev), &(svm)->devs, list)	\
		if ((d) != (sdev)->dev) {} else

static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
			     struct intel_svm **rsvm,
			     struct intel_svm_dev **rsdev)
{
	struct intel_svm_dev *d, *sdev = NULL;
	struct intel_svm *svm;

	/* The caller should hold the pasid_mutex lock */
	if (WARN_ON(!mutex_is_locked(&pasid_mutex)))
		return -EINVAL;

	if (pasid == INVALID_IOASID || pasid >= PASID_MAX)
		return -EINVAL;

	svm = ioasid_find(NULL, pasid, NULL);
	if (IS_ERR(svm))
		return PTR_ERR(svm);

	if (!svm)
		goto out;

	/*
	 * If we found an svm for the PASID, there must be at least one bound
	 * device.
	 */
	if (WARN_ON(list_empty(&svm->devs)))
		return -EINVAL;

	rcu_read_lock();
	list_for_each_entry_rcu(d, &svm->devs, list) {
		if (d->dev == dev) {
			sdev = d;
			break;
		}
	}
	rcu_read_unlock();

out:
	*rsvm = svm;
	*rsdev = sdev;

	return 0;
}

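/*
 * intel_svm_bind_gpasid() binds a guest PASID to @dev for nested translation:
 * the guest-supplied first-level page table (data->gpgd) is installed on top
 * of the host's second-level mappings for @domain.  The flow below validates
 * the bind data, looks up or allocates the intel_svm tracking structure for
 * the host PASID, enables the PASID capability on the device, and finally
 * programs the PASID entry in nested mode.
 */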
int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
			  struct iommu_gpasid_bind_data *data)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev = NULL;
	struct dmar_domain *dmar_domain;
	struct device_domain_info *info;
	struct intel_svm *svm = NULL;
	unsigned long iflags;
	int ret = 0;

	if (WARN_ON(!iommu) || !data)
		return -EINVAL;

	if (data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
		return -EINVAL;

	/* IOMMU core ensures argsz is more than the start of the union */
	if (data->argsz < offsetofend(struct iommu_gpasid_bind_data, vendor.vtd))
		return -EINVAL;

	/* Make sure no undefined flags are used in vendor data */
	if (data->vendor.vtd.flags & ~(IOMMU_SVA_VTD_GPASID_LAST - 1))
		return -EINVAL;

	if (!dev_is_pci(dev))
		return -ENOTSUPP;

	/* VT-d supports devices with full 20-bit PASIDs only */
	if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
		return -EINVAL;

	/*
	 * We only check the host PASID range; we have no way to check the
	 * guest PASID range.
	 */
	if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
		return -EINVAL;

	info = get_domain_info(dev);
	if (!info)
		return -EINVAL;

	dmar_domain = to_dmar_domain(domain);

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, data->hpasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		/*
		 * Do not allow multiple bindings of the same device-PASID since
		 * there is only one set of second-level page tables per PASID.
		 * We may revisit this once sharing a PGD across domains is
		 * supported.
		 */
		dev_warn_ratelimited(dev, "Already bound with PASID %u\n",
				     svm->pasid);
		ret = -EBUSY;
		goto out;
	}

	if (!svm) {
		/* We come here when the PASID has never been bound to a device. */
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			goto out;
		}
		/* REVISIT: the upper layer/VFIO can track the host process that
		 * binds the PASID. ioasid_set = mm might be sufficient for vfio
		 * to check pasid VMM ownership. We can drop the following line
		 * once VFIO and IOASID set check is in place.
		 */
		svm->mm = get_task_mm(current);
		svm->pasid = data->hpasid;
		if (data->flags & IOMMU_SVA_GPASID_VAL) {
			svm->gpasid = data->gpasid;
			svm->flags |= SVM_FLAG_GUEST_PASID;
		}
		ioasid_set_data(data->hpasid, svm);
		INIT_LIST_HEAD_RCU(&svm->devs);
		mmput(svm->mm);
	}
	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);

	/* Only count users if device has aux domains */
	if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
		sdev->users = 1;

	/* Set up device context entry for PASID if not enabled already */
	ret = intel_iommu_enable_pasid(iommu, sdev->dev);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
		kfree(sdev);
		goto out;
	}

	/*
	 * The PASID table is per-device for better security. Therefore, for
	 * each bind of a new device even with an existing PASID, we need to
	 * call the nested mode setup function here.
	 */
	spin_lock_irqsave(&iommu->lock, iflags);
	ret = intel_pasid_setup_nested(iommu, dev,
				       (pgd_t *)(uintptr_t)data->gpgd,
				       data->hpasid, &data->vendor.vtd, dmar_domain,
				       data->addr_width);
	spin_unlock_irqrestore(&iommu->lock, iflags);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
				    data->hpasid, ret);
		/*
		 * The PASID entry should be in a cleared state if nested mode
		 * setup failed. So we only need to clear the IOASID tracking
		 * data such that the free call will succeed.
		 */
		kfree(sdev);
		goto out;
	}

	svm->flags |= SVM_FLAG_GUEST_MODE;

	init_rcu_head(&sdev->rcu);
	list_add_rcu(&sdev->list, &svm->devs);
out:
	if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
		ioasid_set_data(data->hpasid, NULL);
		kfree(svm);
	}

	mutex_unlock(&pasid_mutex);
	return ret;
}

int intel_svm_unbind_gpasid(struct device *dev, u32 pasid)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev;
	struct intel_svm *svm;
	int ret;

	if (WARN_ON(!iommu))
		return -EINVAL;

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
			sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				/*
				 * We do not free the IOASID here because the
				 * IOMMU driver did not allocate it. Unlike
				 * native SVM, the IOASID for guest use was
				 * allocated prior to the bind call. In any
				 * case, if the free call comes before the
				 * unbind, the IOMMU driver will get notified
				 * and perform cleanup.
				 */
				ioasid_set_data(pasid, NULL);
				kfree(svm);
			}
		}
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}

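/*
 * Propagate a newly assigned (or cleared) PASID into the mm so that the CPU
 * side stays in sync with the IOMMU: mm->pasid is published with a release
 * store, and update_pasid() is run on every CPU currently running one of the
 * mm's tasks so that the per-task PASID state (the PASID MSR on x86) is
 * refreshed.
 */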
static void _load_pasid(void *unused)
{
	update_pasid();
}

static void load_pasid(struct mm_struct *mm, u32 pasid)
{
	mutex_lock(&mm->context.lock);

	/* Synchronize with READ_ONCE in update_pasid(). */
	smp_store_release(&mm->pasid, pasid);

	/* Update PASID MSR on all CPUs running the mm's tasks. */
	on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true);

	mutex_unlock(&mm->context.lock);
}

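/*
 * Core of the native SVM bind path: find or create the intel_svm for @mm
 * (allocating a PASID and registering an mmu_notifier on first use), attach
 * an intel_svm_dev for @dev, and program a first-level PASID entry that
 * points at the mm's page tables (or init_mm for supervisor PASIDs).
 */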
/* Caller must hold pasid_mutex, mm reference */
static int
intel_svm_bind_mm(struct device *dev, unsigned int flags,
		  struct svm_dev_ops *ops,
		  struct mm_struct *mm, struct intel_svm_dev **sd)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct device_domain_info *info;
	struct intel_svm_dev *sdev;
	struct intel_svm *svm = NULL;
	unsigned long iflags;
	int pasid_max;
	int ret;

	if (!iommu || dmar_disabled)
		return -EINVAL;

	if (!intel_svm_capable(iommu))
		return -ENOTSUPP;

	if (dev_is_pci(dev)) {
		pasid_max = pci_max_pasids(to_pci_dev(dev));
		if (pasid_max < 0)
			return -EINVAL;
	} else
		pasid_max = 1 << 20;

	/* Binding a supervisor PASID should have mm = NULL */
	if (flags & SVM_FLAG_SUPERVISOR_MODE) {
		if (!ecap_srs(iommu->ecap) || mm) {
			pr_err("Supervisor PASID with user provided mm.\n");
			return -EINVAL;
		}
	}

	if (!(flags & SVM_FLAG_PRIVATE_PASID)) {
		struct intel_svm *t;

		list_for_each_entry(t, &global_svm_list, list) {
			if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
				continue;

			svm = t;
			if (svm->pasid >= pasid_max) {
				dev_warn(dev,
					 "Limited PASID width. Cannot use existing PASID %d\n",
					 svm->pasid);
				ret = -ENOSPC;
				goto out;
			}

			/* Find the matching device in svm list */
			for_each_svm_dev(sdev, svm, dev) {
				if (sdev->ops != ops) {
					ret = -EBUSY;
					goto out;
				}
				sdev->users++;
				goto success;
			}

			break;
		}
	}

	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;

	ret = intel_iommu_enable_pasid(iommu, dev);
	if (ret) {
		kfree(sdev);
		goto out;
	}

	info = get_domain_info(dev);
	sdev->did = FLPT_DEFAULT_DID;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}

	/* Finish the setup now we know we're keeping it */
	sdev->users = 1;
	sdev->ops = ops;
	init_rcu_head(&sdev->rcu);

	if (!svm) {
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			kfree(sdev);
			goto out;
		}
		svm->iommu = iommu;

		if (pasid_max > intel_pasid_max_id)
			pasid_max = intel_pasid_max_id;

		/* Do not use PASID 0, reserved for RID to PASID */
		svm->pasid = ioasid_alloc(NULL, PASID_MIN,
					  pasid_max - 1, svm);
		if (svm->pasid == INVALID_IOASID) {
			kfree(svm);
			kfree(sdev);
			ret = -ENOSPC;
			goto out;
		}
		svm->notifier.ops = &intel_mmuops;
		svm->mm = mm;
		svm->flags = flags;
		INIT_LIST_HEAD_RCU(&svm->devs);
		INIT_LIST_HEAD(&svm->list);
		ret = -ENOMEM;
		if (mm) {
			ret = mmu_notifier_register(&svm->notifier, mm);
			if (ret) {
				ioasid_put(svm->pasid);
				kfree(svm);
				kfree(sdev);
				goto out;
			}
		}

		spin_lock_irqsave(&iommu->lock, iflags);
		ret = intel_pasid_setup_first_level(iommu, dev,
				mm ? mm->pgd : init_mm.pgd,
				svm->pasid, FLPT_DEFAULT_DID,
				(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
				(cpu_feature_enabled(X86_FEATURE_LA57) ?
				 PASID_FLAG_FL5LP : 0));
		spin_unlock_irqrestore(&iommu->lock, iflags);
		if (ret) {
			if (mm)
				mmu_notifier_unregister(&svm->notifier, mm);
			ioasid_put(svm->pasid);
			kfree(svm);
			kfree(sdev);
			goto out;
		}

		list_add_tail(&svm->list, &global_svm_list);
		if (mm) {
			/* The newly allocated pasid is loaded into the mm. */
			load_pasid(mm, svm->pasid);
		}
	} else {
		/*
		 * Binding a new device to an existing PASID; we need to set
		 * up the PASID entry.
		 */
		spin_lock_irqsave(&iommu->lock, iflags);
		ret = intel_pasid_setup_first_level(iommu, dev,
				mm ? mm->pgd : init_mm.pgd,
				svm->pasid, FLPT_DEFAULT_DID,
				(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
				(cpu_feature_enabled(X86_FEATURE_LA57) ?
				 PASID_FLAG_FL5LP : 0));
		spin_unlock_irqrestore(&iommu->lock, iflags);
		if (ret) {
			kfree(sdev);
			goto out;
		}
	}
	list_add_rcu(&sdev->list, &svm->devs);
success:
	sdev->pasid = svm->pasid;
	sdev->sva.dev = dev;
	if (sd)
		*sd = sdev;
	ret = 0;
out:
	return ret;
}

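/*
 * Undo intel_svm_bind_mm() for one device: once the last reference to the
 * sdev is dropped, the PASID entry is torn down, outstanding page requests
 * are drained, and the intel_svm itself is freed when no devices remain.
 */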
/* Caller must hold pasid_mutex */
static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
{
	struct intel_svm_dev *sdev;
	struct intel_iommu *iommu;
	struct intel_svm *svm;
	int ret = -EINVAL;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu)
		goto out;

	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			/* Flush the PASID cache and IOTLB for this device.
			 * Note that we do depend on the hardware *not* using
			 * the PASID any more. Just as we depend on other
			 * devices never using PASIDs that they have no right
			 * to use. We have a *shared* PASID table, because it's
			 * large and has to be physically contiguous. So it's
			 * hard to be as defensive as we might like. */
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				ioasid_put(svm->pasid);
				if (svm->mm) {
					mmu_notifier_unregister(&svm->notifier, svm->mm);
					/* Clear mm's pasid. */
					load_pasid(svm->mm, PASID_DISABLED);
				}
				list_del(&svm->list);
				/* We mandate that no page faults may be outstanding
				 * for the PASID when intel_svm_unbind_mm() is called.
				 * If that is not obeyed, subtle errors will happen.
				 * Let's make them less subtle... */
				memset(svm, 0x6b, sizeof(*svm));
				kfree(svm);
			}
		}
	}
out:
	return ret;
}

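/*
 * Page requests are reported by the hardware in a ring of 32-byte descriptors
 * (struct page_req_dsc below).  With PRQ_ORDER 0 the ring is a single 4KiB
 * page, i.e. 128 entries; PRQ_RING_MASK keeps the head/tail offsets within
 * the ring while staying descriptor-aligned.
 */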
/* Page request queue descriptor */
struct page_req_dsc {
	union {
		struct {
			u64 type:8;
			u64 pasid_present:1;
			u64 priv_data_present:1;
			u64 rsvd:6;
			u64 rid:16;
			u64 pasid:20;
			u64 exe_req:1;
			u64 pm_req:1;
			u64 rsvd2:10;
		};
		u64 qw_0;
	};
	union {
		struct {
			u64 rd_req:1;
			u64 wr_req:1;
			u64 lpig:1;
			u64 prg_index:9;
			u64 addr:52;
		};
		u64 qw_1;
	};
	u64 priv_data[2];
};

#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20)

static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
{
	unsigned long requested = 0;

	if (req->exe_req)
		requested |= VM_EXEC;

	if (req->rd_req)
		requested |= VM_READ;

	if (req->wr_req)
		requested |= VM_WRITE;

	return (requested & ~vma->vm_flags) != 0;
}

static bool is_canonical_address(u64 addr)
{
	int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
	long saddr = (long) addr;

	return (((saddr << shift) >> shift) == saddr);
}

/**
 * intel_svm_drain_prq - Drain page requests and responses for a pasid
 * @dev: target device
 * @pasid: pasid for draining
 *
 * Drain all pending page requests and responses related to @pasid in both
 * software and hardware. This is supposed to be called after the device
 * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
 * and DevTLB have been invalidated.
 *
 * It waits until all pending page requests for @pasid in the page fault
 * queue are completed by the prq handling thread. Then it follows the steps
 * described in VT-d spec CH7.10 to drain all page requests and page
 * responses pending in the hardware.
 */
static void intel_svm_drain_prq(struct device *dev, u32 pasid)
{
	struct device_domain_info *info;
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct qi_desc desc[3];
	struct pci_dev *pdev;
	int head, tail;
	u16 sid, did;
	int qdep;

	info = get_domain_info(dev);
	if (WARN_ON(!info || !dev_is_pci(dev)))
		return;

	if (!info->pri_enabled)
		return;

	iommu = info->iommu;
	domain = info->domain;
	pdev = to_pci_dev(dev);
	sid = PCI_DEVID(info->bus, info->devfn);
	did = domain->iommu_did[iommu->seq_id];
	qdep = pci_ats_queue_depth(pdev);

	/*
	 * Check and wait until all pending page requests in the queue are
	 * handled by the prq handling thread.
	 */
prq_retry:
	reinit_completion(&iommu->prq_complete);
	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct page_req_dsc *req;

		req = &iommu->prq[head / sizeof(*req)];
		if (!req->pasid_present || req->pasid != pasid) {
			head = (head + sizeof(*req)) & PRQ_RING_MASK;
			continue;
		}

		wait_for_completion(&iommu->prq_complete);
		goto prq_retry;
	}

	/*
	 * Perform steps described in VT-d spec CH7.10 to drain page
	 * requests and responses in hardware.
	 */
	memset(desc, 0, sizeof(desc));
	desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
			QI_IWD_FENCE |
			QI_IWD_TYPE;
	desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
			QI_EIOTLB_DID(did) |
			QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
			QI_EIOTLB_TYPE;
	desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
			QI_DEV_EIOTLB_SID(sid) |
			QI_DEV_EIOTLB_QDEP(qdep) |
			QI_DEIOTLB_TYPE |
			QI_DEV_IOTLB_PFSID(info->pfsid);
qi_retry:
	reinit_completion(&iommu->prq_complete);
	qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
		wait_for_completion(&iommu->prq_complete);
		goto qi_retry;
	}
}

static int prq_to_iommu_prot(struct page_req_dsc *req)
{
	int prot = 0;

	if (req->rd_req)
		prot |= IOMMU_FAULT_PERM_READ;
	if (req->wr_req)
		prot |= IOMMU_FAULT_PERM_WRITE;
	if (req->exe_req)
		prot |= IOMMU_FAULT_PERM_EXEC;
	if (req->pm_req)
		prot |= IOMMU_FAULT_PERM_PRIV;

	return prot;
}

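/*
 * Forward a page request to the generic IOMMU fault reporting path as an
 * IOMMU_FAULT_PAGE_REQ event.  This is used for guest-mode PASIDs, where the
 * receiver of the fault notification (e.g. a VMM) is expected to resolve the
 * fault and reply through intel_svm_page_response().
 */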
static int
intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
{
	struct iommu_fault_event event;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	/* Fill in event data for device specific processing */
	memset(&event, 0, sizeof(struct iommu_fault_event));
	event.fault.type = IOMMU_FAULT_PAGE_REQ;
	event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT;
	event.fault.prm.pasid = desc->pasid;
	event.fault.prm.grpid = desc->prg_index;
	event.fault.prm.perm = prq_to_iommu_prot(desc);

	if (desc->lpig)
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
	if (desc->pasid_present) {
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
	}
	if (desc->priv_data_present) {
		/*
		 * Set the last-page-in-group bit if private data is present:
		 * a page response is then required, just as it is for LPIG.
		 * iommu_report_device_fault() doesn't understand this vendor
		 * specific requirement thus we set last_page as a workaround.
		 */
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
		memcpy(event.fault.prm.private_data, desc->priv_data,
		       sizeof(desc->priv_data));
	}

	return iommu_report_device_fault(dev, &event);
}

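/*
 * Threaded IRQ handler for the page request queue.  For each descriptor
 * between head and tail it looks up the intel_svm by PASID, services the
 * fault through handle_mm_fault() (or forwards it to the fault reporting
 * path for guest-mode PASIDs), and sends a page group response when the
 * descriptor requires one (LPIG or private data present), before advancing
 * the head pointer.
 */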
static irqreturn_t prq_event_thread(int irq, void *d)
{
	struct intel_svm_dev *sdev = NULL;
	struct intel_iommu *iommu = d;
	struct intel_svm *svm = NULL;
	int head, tail, handled = 0;

	/* Clear PPR bit before reading head/tail registers, to
	 * ensure that we get a new interrupt if needed. */
	writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);

	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct vm_area_struct *vma;
		struct page_req_dsc *req;
		struct qi_desc resp;
		int result;
		vm_fault_t ret;
		u64 address;

		handled = 1;

		req = &iommu->prq[head / sizeof(*req)];

		result = QI_RESP_FAILURE;
		address = (u64)req->addr << VTD_PAGE_SHIFT;
		if (!req->pasid_present) {
			pr_err("%s: Page request without PASID: %08llx %08llx\n",
			       iommu->name, ((unsigned long long *)req)[0],
			       ((unsigned long long *)req)[1]);
			goto no_pasid;
		}

		if (!svm || svm->pasid != req->pasid) {
			rcu_read_lock();
			svm = ioasid_find(NULL, req->pasid, NULL);
			/* It *can't* go away, because the driver is not permitted
			 * to unbind the mm while any page faults are outstanding.
			 * So we only need RCU to protect the internal idr code. */
			rcu_read_unlock();
			if (IS_ERR_OR_NULL(svm)) {
				pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
				       iommu->name, req->pasid, ((unsigned long long *)req)[0],
				       ((unsigned long long *)req)[1]);
				goto no_pasid;
			}
		}

		if (!sdev || sdev->sid != req->rid) {
			struct intel_svm_dev *t;

			sdev = NULL;
			rcu_read_lock();
			list_for_each_entry_rcu(t, &svm->devs, list) {
				if (t->sid == req->rid) {
					sdev = t;
					break;
				}
			}
			rcu_read_unlock();
		}

		result = QI_RESP_INVALID;
		/* Since we're using init_mm.pgd directly, we should never take
		 * any faults on kernel addresses. */
		if (!svm->mm)
			goto bad_req;

		/* If address is not canonical, return invalid response */
		if (!is_canonical_address(address))
			goto bad_req;

		/*
		 * If the page request is to be handled outside the IOMMU
		 * driver, by the receiver of the fault notification, skip
		 * the page response here.
		 */
		if (svm->flags & SVM_FLAG_GUEST_MODE) {
			if (sdev && !intel_svm_prq_report(sdev->dev, req))
				goto prq_advance;
			else
				goto bad_req;
		}

		/* If the mm is already defunct, don't handle faults. */
		if (!mmget_not_zero(svm->mm))
			goto bad_req;

		mmap_read_lock(svm->mm);
		vma = find_extend_vma(svm->mm, address);
		if (!vma || address < vma->vm_start)
			goto invalid;

		if (access_error(vma, req))
			goto invalid;

		ret = handle_mm_fault(vma, address,
				      req->wr_req ? FAULT_FLAG_WRITE : 0,
				      NULL);
		if (ret & VM_FAULT_ERROR)
			goto invalid;

		result = QI_RESP_SUCCESS;
invalid:
		mmap_read_unlock(svm->mm);
		mmput(svm->mm);
bad_req:
		WARN_ON(!sdev);
		if (sdev && sdev->ops && sdev->ops->fault_cb) {
			int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
				(req->exe_req << 1) | (req->pm_req);
			sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
					    req->priv_data, rwxp, result);
		}
		/* We get here in the error case where the PASID lookup failed,
		   and these can be NULL. Do not use them below this point! */
		sdev = NULL;
		svm = NULL;
no_pasid:
		if (req->lpig || req->priv_data_present) {
			/*
			 * Per VT-d spec. v3.0 ch7.7, system software must
			 * respond with page group response if private data
			 * is present (PDP) or last page in group (LPIG) bit
			 * is set. This is an additional VT-d feature beyond
			 * PCI ATS spec.
			 */
			resp.qw0 = QI_PGRP_PASID(req->pasid) |
				QI_PGRP_DID(req->rid) |
				QI_PGRP_PASID_P(req->pasid_present) |
				QI_PGRP_PDP(req->priv_data_present) |
				QI_PGRP_RESP_CODE(result) |
				QI_PGRP_RESP_TYPE;
			resp.qw1 = QI_PGRP_IDX(req->prg_index) |
				QI_PGRP_LPIG(req->lpig);
			/* Zero qw2/qw3 before (not after) copying any private
			 * data, so the data is not wiped out again. */
			resp.qw2 = 0;
			resp.qw3 = 0;

			if (req->priv_data_present)
				memcpy(&resp.qw2, req->priv_data,
				       sizeof(req->priv_data));
			qi_submit_sync(iommu, &resp, 1, 0);
		}
prq_advance:
		head = (head + sizeof(*req)) & PRQ_RING_MASK;
	}

	dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);

	/*
	 * Clear the page request overflow bit and wake up all threads that
	 * are waiting for the completion of this handling.
	 */
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO)
		writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);

	if (!completion_done(&iommu->prq_complete))
		complete(&iommu->prq_complete);

	return IRQ_RETVAL(handled);
}

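/*
 * The functions below back the IOMMU core's SVA API: intel_svm_bind() and
 * intel_svm_unbind() wrap intel_svm_bind_mm()/intel_svm_unbind_mm() under
 * pasid_mutex, and intel_svm_get_pasid() returns the PASID behind an
 * iommu_sva handle.
 */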
#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
struct iommu_sva *
intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
{
	struct iommu_sva *sva = ERR_PTR(-EINVAL);
	struct intel_svm_dev *sdev = NULL;
	unsigned int flags = 0;
	int ret;

	/*
	 * TODO: Consolidate with generic iommu-sva bind after it is merged.
	 * It will require shared SVM data structures, i.e. combine io_mm
	 * and intel_svm etc.
	 */
	if (drvdata)
		flags = *(unsigned int *)drvdata;
	mutex_lock(&pasid_mutex);
	ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev);
	if (ret)
		sva = ERR_PTR(ret);
	else if (sdev)
		sva = &sdev->sva;
	else
		WARN(!sdev, "SVM bind succeeded with no sdev!\n");

	mutex_unlock(&pasid_mutex);

	return sva;
}

void intel_svm_unbind(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	intel_svm_unbind_mm(sdev->dev, sdev->pasid);
	mutex_unlock(&pasid_mutex);
}

u32 intel_svm_get_pasid(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;
	u32 pasid;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	pasid = sdev->pasid;
	mutex_unlock(&pasid_mutex);

	return pasid;
}

int intel_svm_page_response(struct device *dev,
			    struct iommu_fault_event *evt,
			    struct iommu_page_response *msg)
{
	struct iommu_fault_page_request *prm;
	struct intel_svm_dev *sdev = NULL;
	struct intel_svm *svm = NULL;
	struct intel_iommu *iommu;
	bool private_present;
	bool pasid_present;
	bool last_page;
	u8 bus, devfn;
	int ret = 0;
	u16 sid;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!msg || !evt)
		return -EINVAL;

	mutex_lock(&pasid_mutex);

	prm = &evt->fault.prm;
	sid = PCI_DEVID(bus, devfn);
	pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
	private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
	last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;

	if (!pasid_present) {
		ret = -EINVAL;
		goto out;
	}

	if (prm->pasid == 0 || prm->pasid >= PASID_MAX) {
		ret = -EINVAL;
		goto out;
	}

	ret = pasid_to_svm_sdev(dev, prm->pasid, &svm, &sdev);
	if (ret || !sdev) {
		ret = -ENODEV;
		goto out;
	}

	/*
	 * For responses from userspace, we need to make sure that the
	 * pasid has been bound to its mm.
	 */
	if (svm->flags & SVM_FLAG_GUEST_MODE) {
		struct mm_struct *mm;

		mm = get_task_mm(current);
		if (!mm) {
			ret = -EINVAL;
			goto out;
		}

		if (mm != svm->mm) {
			ret = -ENODEV;
			mmput(mm);
			goto out;
		}

		mmput(mm);
	}

	/*
	 * Per VT-d spec. v3.0 ch7.7, system software must respond
	 * with page group response if private data is present (PDP)
	 * or last page in group (LPIG) bit is set. This is an
	 * additional VT-d requirement beyond PCI ATS spec.
	 */
	if (last_page || private_present) {
		struct qi_desc desc;

		desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) |
				QI_PGRP_PASID_P(pasid_present) |
				QI_PGRP_PDP(private_present) |
				QI_PGRP_RESP_CODE(msg->code) |
				QI_PGRP_RESP_TYPE;
		desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page);
		desc.qw2 = 0;
		desc.qw3 = 0;
		if (private_present)
			memcpy(&desc.qw2, prm->private_data,
			       sizeof(prm->private_data));

		qi_submit_sync(iommu, &desc, 1, 0);
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}