// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2015 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/intel-iommu.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/intel-svm.h>
#include <linux/rculist.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/dmar.h>
#include <linux/interrupt.h>
#include <linux/mm_types.h>
#include <linux/ioasid.h>
#include <asm/page.h>
#include <asm/fpu/api.h>

#include "pasid.h"

static irqreturn_t prq_event_thread(int irq, void *d);
static void intel_svm_drain_prq(struct device *dev, u32 pasid);

#define PRQ_ORDER 0

int intel_svm_enable_prq(struct intel_iommu *iommu)
{
	struct page *pages;
	int irq, ret;

	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
	if (!pages) {
		pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
			iommu->name);
		return -ENOMEM;
	}
	iommu->prq = page_address(pages);

	irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
	if (irq <= 0) {
		pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
		       iommu->name);
		ret = -EINVAL;
	err:
		free_pages((unsigned long)iommu->prq, PRQ_ORDER);
		iommu->prq = NULL;
		return ret;
	}
	iommu->pr_irq = irq;

	snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);

	ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
				   iommu->prq_name, iommu);
	if (ret) {
		pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
		       iommu->name);
		dmar_free_hwirq(irq);
		iommu->pr_irq = 0;
		goto err;
	}
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);

	init_completion(&iommu->prq_complete);

	return 0;
}

int intel_svm_finish_prq(struct intel_iommu *iommu)
{
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);

	if (iommu->pr_irq) {
		free_irq(iommu->pr_irq, iommu);
		dmar_free_hwirq(iommu->pr_irq);
		iommu->pr_irq = 0;
	}

	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
	iommu->prq = NULL;

	return 0;
}

static inline bool intel_svm_capable(struct intel_iommu *iommu)
{
	return iommu->flags & VTD_FLAG_SVM_CAPABLE;
}

void intel_svm_check(struct intel_iommu *iommu)
{
	if (!pasid_supported(iommu))
		return;

	if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
	    !cap_fl1gp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible 1GB page capability\n",
		       iommu->name);
		return;
	}

	if (cpu_feature_enabled(X86_FEATURE_LA57) &&
	    !cap_5lp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible paging mode\n",
		       iommu->name);
		return;
	}

	iommu->flags |= VTD_FLAG_SVM_CAPABLE;
}

static void intel_flush_svm_range_dev(struct intel_svm *svm, struct intel_svm_dev *sdev,
				      unsigned long address, unsigned long pages, int ih)
{
	struct qi_desc desc;

	if (pages == -1) {
		desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
			QI_EIOTLB_DID(sdev->did) |
			QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
			QI_EIOTLB_TYPE;
		desc.qw1 = 0;
	} else {
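		/*
		 * QI_EIOTLB_AM() takes an address-mask order: the hardware
		 * ignores that many low-order page-address bits, so one
		 * descriptor invalidates 2^mask pages.  Rounding the page
		 * count up to a power of two may flush a slightly larger
		 * range than requested, which is harmless.
		 */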
		int mask = ilog2(__roundup_pow_of_two(pages));

		desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
			QI_EIOTLB_DID(sdev->did) |
			QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
			QI_EIOTLB_TYPE;
		desc.qw1 = QI_EIOTLB_ADDR(address) |
			QI_EIOTLB_IH(ih) |
			QI_EIOTLB_AM(mask);
	}
	desc.qw2 = 0;
	desc.qw3 = 0;
	qi_submit_sync(sdev->iommu, &desc, 1, 0);

	if (sdev->dev_iotlb) {
		desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) |
			QI_DEV_EIOTLB_SID(sdev->sid) |
			QI_DEV_EIOTLB_QDEP(sdev->qdep) |
			QI_DEIOTLB_TYPE;
		if (pages == -1) {
			desc.qw1 = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) |
				QI_DEV_EIOTLB_SIZE;
		} else if (pages > 1) {
			/* The least significant zero bit indicates the size. So,
			 * for example, an "address" value of 0x12345f000 will
			 * flush from 0x123440000 to 0x12347ffff (256KiB). */
			unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT);
			unsigned long mask = __rounddown_pow_of_two(address ^ last);

			desc.qw1 = QI_DEV_EIOTLB_ADDR((address & ~mask) |
					(mask - 1)) | QI_DEV_EIOTLB_SIZE;
		} else {
			desc.qw1 = QI_DEV_EIOTLB_ADDR(address);
		}
		desc.qw2 = 0;
		desc.qw3 = 0;
		qi_submit_sync(sdev->iommu, &desc, 1, 0);
	}
}

static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
				  unsigned long pages, int ih)
{
	struct intel_svm_dev *sdev;

	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
	rcu_read_unlock();
}

/* Pages have been freed at this point */
static void intel_invalidate_range(struct mmu_notifier *mn,
				   struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);

	intel_flush_svm_range(svm, start,
			      (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
}

static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
	struct intel_svm_dev *sdev;

	/* This might end up being called from exit_mmap(), *before* the page
	 * tables are cleared. And __mmu_notifier_release() will delete us from
	 * the list of notifiers so that our invalidate_range() callback doesn't
	 * get called when the page tables are cleared. So we need to protect
	 * against hardware accessing those page tables.
	 *
	 * We do it by clearing the entry in the PASID table and then flushing
	 * the IOTLB and the PASID table caches. This might upset hardware;
	 * perhaps we'll want to point the PASID to a dummy PGD (like the zero
	 * page) so that we end up taking a fault that the hardware really
	 * *has* to handle gracefully without affecting other processes.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_pasid_tear_down_entry(sdev->iommu, sdev->dev,
					    svm->pasid, true);
	rcu_read_unlock();
}

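/*
 * MMU notifier ops tie the CPU page-table lifecycle to the IOMMU:
 * invalidate_range() keeps the IOTLB and device TLBs in sync with CPU
 * TLB invalidations, and release() revokes DMA access once the mm is
 * torn down.
 */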
static const struct mmu_notifier_ops intel_mmuops = {
	.release = intel_mm_release,
	.invalidate_range = intel_invalidate_range,
};

static DEFINE_MUTEX(pasid_mutex);
static LIST_HEAD(global_svm_list);

/* Run the loop body only for the sdev that belongs to device @d */
#define for_each_svm_dev(sdev, svm, d)			\
	list_for_each_entry((sdev), &(svm)->devs, list)	\
		if ((d) != (sdev)->dev) {} else

static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
			     struct intel_svm **rsvm,
			     struct intel_svm_dev **rsdev)
{
	struct intel_svm_dev *d, *sdev = NULL;
	struct intel_svm *svm;

	/* The caller should hold the pasid_mutex lock */
	if (WARN_ON(!mutex_is_locked(&pasid_mutex)))
		return -EINVAL;

	if (pasid == INVALID_IOASID || pasid >= PASID_MAX)
		return -EINVAL;

	svm = ioasid_find(NULL, pasid, NULL);
	if (IS_ERR(svm))
		return PTR_ERR(svm);

	if (!svm)
		goto out;

	/*
	 * If we found an svm for the PASID, there must be at least one
	 * device bound to it.
	 */
	if (WARN_ON(list_empty(&svm->devs)))
		return -EINVAL;

	rcu_read_lock();
	list_for_each_entry_rcu(d, &svm->devs, list) {
		if (d->dev == dev) {
			sdev = d;
			break;
		}
	}
	rcu_read_unlock();

out:
	*rsvm = svm;
	*rsdev = sdev;

	return 0;
}

int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
			  struct iommu_gpasid_bind_data *data)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev = NULL;
	struct dmar_domain *dmar_domain;
	struct device_domain_info *info;
	struct intel_svm *svm = NULL;
	unsigned long iflags;
	int ret = 0;

	if (WARN_ON(!iommu) || !data)
		return -EINVAL;

	if (data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
		return -EINVAL;

	/* IOMMU core ensures argsz is more than the start of the union */
	if (data->argsz < offsetofend(struct iommu_gpasid_bind_data, vendor.vtd))
		return -EINVAL;

	/* Make sure no undefined flags are used in vendor data */
	if (data->vendor.vtd.flags & ~(IOMMU_SVA_VTD_GPASID_LAST - 1))
		return -EINVAL;

	if (!dev_is_pci(dev))
		return -ENOTSUPP;

	/* VT-d supports devices with full 20-bit PASIDs only */
	if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
		return -EINVAL;

	/*
	 * We only check the host PASID range; we have no way to validate
	 * the guest PASID range.
	 */
	if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
		return -EINVAL;

	info = get_domain_info(dev);
	if (!info)
		return -EINVAL;

	dmar_domain = to_dmar_domain(domain);

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, data->hpasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		/*
		 * Do not allow multiple bindings of the same device-PASID
		 * since there is only one SL page table per PASID. We may
		 * revisit this once sharing a PGD across domains is
		 * supported.
		 */
		dev_warn_ratelimited(dev, "Already bound with PASID %u\n",
				     svm->pasid);
		ret = -EBUSY;
		goto out;
	}

	if (!svm) {
		/* We come here when the PASID has never been bound to a device. */
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			goto out;
		}
		/* REVISIT: the upper layer/VFIO can track the host process
		 * that binds the PASID. ioasid_set = mm might be sufficient
		 * for vfio to check PASID VMM ownership. We can drop the
		 * following line once the VFIO and IOASID set check is in
		 * place.
		 */
		svm->mm = get_task_mm(current);
		svm->pasid = data->hpasid;
		if (data->flags & IOMMU_SVA_GPASID_VAL) {
			svm->gpasid = data->gpasid;
			svm->flags |= SVM_FLAG_GUEST_PASID;
		}
		ioasid_set_data(data->hpasid, svm);
		INIT_LIST_HEAD_RCU(&svm->devs);
		mmput(svm->mm);
	}
	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	sdev->iommu = iommu;

	/* Only count users if device has aux domains */
	if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
		sdev->users = 1;

	/* Set up device context entry for PASID if not enabled already */
	ret = intel_iommu_enable_pasid(iommu, sdev->dev);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
		kfree(sdev);
		goto out;
	}

	/*
	 * The PASID table is per device for better security. Therefore, for
	 * each bind of a new device even with an existing PASID, we need to
	 * call the nested mode setup function here.
	 */
	spin_lock_irqsave(&iommu->lock, iflags);
	ret = intel_pasid_setup_nested(iommu, dev,
				       (pgd_t *)(uintptr_t)data->gpgd,
				       data->hpasid, &data->vendor.vtd, dmar_domain,
				       data->addr_width);
	spin_unlock_irqrestore(&iommu->lock, iflags);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
				    data->hpasid, ret);
		/*
		 * The PASID entry is in a cleared state if the nested mode
		 * setup failed, so we only need to clear the IOASID tracking
		 * data so that a later free call will succeed.
		 */
		kfree(sdev);
		goto out;
	}

	svm->flags |= SVM_FLAG_GUEST_MODE;

	init_rcu_head(&sdev->rcu);
	list_add_rcu(&sdev->list, &svm->devs);
out:
	if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
		ioasid_set_data(data->hpasid, NULL);
		kfree(svm);
	}

	mutex_unlock(&pasid_mutex);
	return ret;
}

int intel_svm_unbind_gpasid(struct device *dev, u32 pasid)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev;
	struct intel_svm *svm;
	int ret;

	if (WARN_ON(!iommu))
		return -EINVAL;

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
			sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				/*
				 * We do not free the IOASID here because the
				 * IOMMU driver did not allocate it. Unlike
				 * native SVM, an IOASID for guest use is
				 * allocated prior to the bind call. In any
				 * case, if the free call comes before the
				 * unbind, the IOMMU driver will get notified
				 * and perform the cleanup.
				 */
				ioasid_set_data(pasid, NULL);
				kfree(svm);
			}
		}
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}

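/*
 * Besides the IOMMU, the PASID is also consumed by the CPU through the
 * PASID MSR (e.g. for ENQCMD).  When mm->pasid changes, update_pasid()
 * must run on every CPU that is currently running a task of this mm so
 * the MSR stays in sync; load_pasid() does that via on_each_cpu_mask().
 */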
static void _load_pasid(void *unused)
{
	update_pasid();
}

static void load_pasid(struct mm_struct *mm, u32 pasid)
{
	mutex_lock(&mm->context.lock);

	/* Synchronize with READ_ONCE in update_pasid(). */
	smp_store_release(&mm->pasid, pasid);

	/* Update PASID MSR on all CPUs running the mm's tasks. */
	on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true);

	mutex_unlock(&mm->context.lock);
}

/* Caller must hold pasid_mutex, mm reference */
static int
intel_svm_bind_mm(struct device *dev, unsigned int flags,
		  struct svm_dev_ops *ops,
		  struct mm_struct *mm, struct intel_svm_dev **sd)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct device_domain_info *info;
	struct intel_svm_dev *sdev;
	struct intel_svm *svm = NULL;
	unsigned long iflags;
	int pasid_max;
	int ret;

	if (!iommu || dmar_disabled)
		return -EINVAL;

	if (!intel_svm_capable(iommu))
		return -ENOTSUPP;

	if (dev_is_pci(dev)) {
		pasid_max = pci_max_pasids(to_pci_dev(dev));
		if (pasid_max < 0)
			return -EINVAL;
	} else
		pasid_max = 1 << 20;

	/* Binding a supervisor PASID should have mm == NULL */
	if (flags & SVM_FLAG_SUPERVISOR_MODE) {
		if (!ecap_srs(iommu->ecap) || mm) {
			pr_err("Supervisor PASID with user provided mm.\n");
			return -EINVAL;
		}
	}

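	/*
	 * Unless SVM_FLAG_PRIVATE_PASID was requested, an mm shares a
	 * single PASID across all devices it is bound to, so look for an
	 * existing bind of this mm first.
	 */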
	if (!(flags & SVM_FLAG_PRIVATE_PASID)) {
		struct intel_svm *t;

		list_for_each_entry(t, &global_svm_list, list) {
			if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
				continue;

			svm = t;
			if (svm->pasid >= pasid_max) {
				dev_warn(dev,
					 "Limited PASID width. Cannot use existing PASID %d\n",
					 svm->pasid);
				ret = -ENOSPC;
				goto out;
			}

			/* Find the matching device in svm list */
			for_each_svm_dev(sdev, svm, dev) {
				if (sdev->ops != ops) {
					ret = -EBUSY;
					goto out;
				}
				sdev->users++;
				goto success;
			}

			break;
		}
	}

	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;
	sdev->iommu = iommu;

	ret = intel_iommu_enable_pasid(iommu, dev);
	if (ret) {
		kfree(sdev);
		goto out;
	}

	info = get_domain_info(dev);
	sdev->did = FLPT_DEFAULT_DID;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}

	/* Finish the setup now we know we're keeping it */
	sdev->users = 1;
	sdev->ops = ops;
	init_rcu_head(&sdev->rcu);

	if (!svm) {
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			kfree(sdev);
			goto out;
		}

		if (pasid_max > intel_pasid_max_id)
			pasid_max = intel_pasid_max_id;

		/* Do not use PASID 0, reserved for RID to PASID */
		svm->pasid = ioasid_alloc(NULL, PASID_MIN,
					  pasid_max - 1, svm);
		if (svm->pasid == INVALID_IOASID) {
			kfree(svm);
			kfree(sdev);
			ret = -ENOSPC;
			goto out;
		}
		svm->notifier.ops = &intel_mmuops;
		svm->mm = mm;
		svm->flags = flags;
		INIT_LIST_HEAD_RCU(&svm->devs);
		INIT_LIST_HEAD(&svm->list);
		ret = -ENOMEM;
		if (mm) {
			ret = mmu_notifier_register(&svm->notifier, mm);
			if (ret) {
				ioasid_put(svm->pasid);
				kfree(svm);
				kfree(sdev);
				goto out;
			}
		}

		spin_lock_irqsave(&iommu->lock, iflags);
		ret = intel_pasid_setup_first_level(iommu, dev,
				mm ? mm->pgd : init_mm.pgd,
				svm->pasid, FLPT_DEFAULT_DID,
				(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
				(cpu_feature_enabled(X86_FEATURE_LA57) ?
				 PASID_FLAG_FL5LP : 0));
		spin_unlock_irqrestore(&iommu->lock, iflags);
		if (ret) {
			if (mm)
				mmu_notifier_unregister(&svm->notifier, mm);
			ioasid_put(svm->pasid);
			kfree(svm);
			kfree(sdev);
			goto out;
		}

		list_add_tail(&svm->list, &global_svm_list);
		if (mm) {
			/* The newly allocated pasid is loaded to the mm. */
			load_pasid(mm, svm->pasid);
		}
	} else {
		/*
		 * Binding a new device to an existing PASID; we still need
		 * to set up the PASID entry for this device.
		 */
		spin_lock_irqsave(&iommu->lock, iflags);
		ret = intel_pasid_setup_first_level(iommu, dev,
				mm ? mm->pgd : init_mm.pgd,
				svm->pasid, FLPT_DEFAULT_DID,
				(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
				(cpu_feature_enabled(X86_FEATURE_LA57) ?
				 PASID_FLAG_FL5LP : 0));
		spin_unlock_irqrestore(&iommu->lock, iflags);
		if (ret) {
			kfree(sdev);
			goto out;
		}
	}
	list_add_rcu(&sdev->list, &svm->devs);
success:
	sdev->pasid = svm->pasid;
	sdev->sva.dev = dev;
	if (sd)
		*sd = sdev;
	ret = 0;
out:
	return ret;
}

/* Caller must hold pasid_mutex */
static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
{
	struct intel_svm_dev *sdev;
	struct intel_iommu *iommu;
	struct intel_svm *svm;
	int ret = -EINVAL;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu)
		goto out;

	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			/* Flush the PASID cache and IOTLB for this device.
			 * Note that we do depend on the hardware *not* using
			 * the PASID any more. Just as we depend on other
			 * devices never using PASIDs that they have no right
			 * to use. We have a *shared* PASID table, because it's
			 * large and has to be physically contiguous. So it's
			 * hard to be as defensive as we might like. */
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				ioasid_put(svm->pasid);
				if (svm->mm) {
					mmu_notifier_unregister(&svm->notifier, svm->mm);
					/* Clear mm's pasid. */
					load_pasid(svm->mm, PASID_DISABLED);
				}
				list_del(&svm->list);
				/* We mandate that no page faults may be outstanding
				 * for the PASID when intel_svm_unbind_mm() is called.
				 * If that is not obeyed, subtle errors will happen.
				 * Let's make them less subtle... */
				memset(svm, 0x6b, sizeof(*svm));
				kfree(svm);
			}
		}
	}
out:
	return ret;
}

/* Page request queue descriptor */
struct page_req_dsc {
	union {
		struct {
			u64 type:8;
			u64 pasid_present:1;
			u64 priv_data_present:1;
			u64 rsvd:6;
			u64 rid:16;
			u64 pasid:20;
			u64 exe_req:1;
			u64 pm_req:1;
			u64 rsvd2:10;
		};
		u64 qw_0;
	};
	union {
		struct {
			u64 rd_req:1;
			u64 wr_req:1;
			u64 lpig:1;
			u64 prg_index:9;
			u64 addr:52;
		};
		u64 qw_1;
	};
	u64 priv_data[2];
};

/*
 * The page request queue is (4KiB << PRQ_ORDER) bytes of 32-byte
 * descriptors; masking a head/tail offset with PRQ_RING_MASK wraps it
 * around the ring while keeping it aligned to a descriptor boundary.
 */
#define PRQ_RING_MASK	((0x1000 << PRQ_ORDER) - 0x20)

static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
{
	unsigned long requested = 0;

	if (req->exe_req)
		requested |= VM_EXEC;

	if (req->rd_req)
		requested |= VM_READ;

	if (req->wr_req)
		requested |= VM_WRITE;

	return (requested & ~vma->vm_flags) != 0;
}

static bool is_canonical_address(u64 addr)
{
	int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
	long saddr = (long) addr;

	return (((saddr << shift) >> shift) == saddr);
}

/**
 * intel_svm_drain_prq - Drain page requests and responses for a pasid
 * @dev: target device
 * @pasid: pasid for draining
 *
 * Drain all pending page requests and responses related to @pasid in both
 * software and hardware. This is supposed to be called after the device
 * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
 * and DevTLB have been invalidated.
 *
 * It waits until all pending page requests for @pasid in the page fault
 * queue are completed by the prq handling thread. It then follows the
 * steps described in VT-d spec CH7.10 to drain all page requests and
 * page responses pending in the hardware.
 */
static void intel_svm_drain_prq(struct device *dev, u32 pasid)
{
	struct device_domain_info *info;
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct qi_desc desc[3];
	struct pci_dev *pdev;
	int head, tail;
	u16 sid, did;
	int qdep;

	info = get_domain_info(dev);
	if (WARN_ON(!info || !dev_is_pci(dev)))
		return;

	if (!info->pri_enabled)
		return;

	iommu = info->iommu;
	domain = info->domain;
	pdev = to_pci_dev(dev);
	sid = PCI_DEVID(info->bus, info->devfn);
	did = domain->iommu_did[iommu->seq_id];
	qdep = pci_ats_queue_depth(pdev);

	/*
	 * Check and wait until all pending page requests in the queue are
	 * handled by the prq handling thread.
	 */
prq_retry:
	reinit_completion(&iommu->prq_complete);
	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct page_req_dsc *req;

		req = &iommu->prq[head / sizeof(*req)];
		if (!req->pasid_present || req->pasid != pasid) {
			head = (head + sizeof(*req)) & PRQ_RING_MASK;
			continue;
		}

		wait_for_completion(&iommu->prq_complete);
		goto prq_retry;
	}

	/*
	 * Perform steps described in VT-d spec CH7.10 to drain page
	 * requests and responses in hardware.
	 */
	memset(desc, 0, sizeof(desc));
	desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
			QI_IWD_FENCE |
			QI_IWD_TYPE;
	desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
			QI_EIOTLB_DID(did) |
			QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
			QI_EIOTLB_TYPE;
	desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
			QI_DEV_EIOTLB_SID(sid) |
			QI_DEV_EIOTLB_QDEP(qdep) |
			QI_DEIOTLB_TYPE |
			QI_DEV_IOTLB_PFSID(info->pfsid);
qi_retry:
	reinit_completion(&iommu->prq_complete);
	qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
		wait_for_completion(&iommu->prq_complete);
		goto qi_retry;
	}
}

static int prq_to_iommu_prot(struct page_req_dsc *req)
{
	int prot = 0;

	if (req->rd_req)
		prot |= IOMMU_FAULT_PERM_READ;
	if (req->wr_req)
		prot |= IOMMU_FAULT_PERM_WRITE;
	if (req->exe_req)
		prot |= IOMMU_FAULT_PERM_EXEC;
	if (req->pm_req)
		prot |= IOMMU_FAULT_PERM_PRIV;

	return prot;
}

static int
intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
{
	struct iommu_fault_event event;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	/* Fill in event data for device specific processing */
	memset(&event, 0, sizeof(struct iommu_fault_event));
	event.fault.type = IOMMU_FAULT_PAGE_REQ;
	event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT;
	event.fault.prm.pasid = desc->pasid;
	event.fault.prm.grpid = desc->prg_index;
	event.fault.prm.perm = prq_to_iommu_prot(desc);

	if (desc->lpig)
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
	if (desc->pasid_present) {
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
	}
	if (desc->priv_data_present) {
		/*
		 * Set the last-page-in-group bit if private data is present:
		 * a page response is then required, just as it is for LPIG.
		 * iommu_report_device_fault() doesn't understand this
		 * vendor-specific requirement, so we set last_page as a
		 * workaround.
		 */
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
		memcpy(event.fault.prm.private_data, desc->priv_data,
		       sizeof(desc->priv_data));
	}

	return iommu_report_device_fault(dev, &event);
}

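/*
 * IRQ thread for the page request queue.  It walks the descriptors
 * between head and tail, resolves each request against the mm bound to
 * the PASID (or forwards it to the fault consumer in guest mode), and
 * sends a page group response whenever the spec requires one.
 */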
static irqreturn_t prq_event_thread(int irq, void *d)
{
	struct intel_svm_dev *sdev = NULL;
	struct intel_iommu *iommu = d;
	struct intel_svm *svm = NULL;
	int head, tail, handled = 0;

	/* Clear PPR bit before reading head/tail registers, to
	 * ensure that we get a new interrupt if needed. */
	writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);

	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct vm_area_struct *vma;
		struct page_req_dsc *req;
		struct qi_desc resp;
		int result;
		vm_fault_t ret;
		u64 address;

		handled = 1;

		req = &iommu->prq[head / sizeof(*req)];

		result = QI_RESP_FAILURE;
		address = (u64)req->addr << VTD_PAGE_SHIFT;
		if (!req->pasid_present) {
			pr_err("%s: Page request without PASID: %08llx %08llx\n",
			       iommu->name, ((unsigned long long *)req)[0],
			       ((unsigned long long *)req)[1]);
			goto no_pasid;
		}

		if (!svm || svm->pasid != req->pasid) {
			rcu_read_lock();
			svm = ioasid_find(NULL, req->pasid, NULL);
			/* It *can't* go away, because the driver is not permitted
			 * to unbind the mm while any page faults are outstanding.
			 * So we only need RCU to protect the internal idr code. */
			rcu_read_unlock();
			if (IS_ERR_OR_NULL(svm)) {
				pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
				       iommu->name, req->pasid, ((unsigned long long *)req)[0],
				       ((unsigned long long *)req)[1]);
				goto no_pasid;
			}
		}

		if (!sdev || sdev->sid != req->rid) {
			struct intel_svm_dev *t;

			sdev = NULL;
			rcu_read_lock();
			list_for_each_entry_rcu(t, &svm->devs, list) {
				if (t->sid == req->rid) {
					sdev = t;
					break;
				}
			}
			rcu_read_unlock();
		}

		result = QI_RESP_INVALID;
		/* Since we're using init_mm.pgd directly, we should never take
		 * any faults on kernel addresses. */
		if (!svm->mm)
			goto bad_req;

		/* If the address is not canonical, return an invalid response */
		if (!is_canonical_address(address))
			goto bad_req;

		/*
		 * If the page request is to be handled outside the IOMMU
		 * driver by the receiver of the fault notification, skip
		 * the page response here.
		 */
		if (svm->flags & SVM_FLAG_GUEST_MODE) {
			if (sdev && !intel_svm_prq_report(sdev->dev, req))
				goto prq_advance;
			else
				goto bad_req;
		}

		/* If the mm is already defunct, don't handle faults. */
		if (!mmget_not_zero(svm->mm))
			goto bad_req;

		mmap_read_lock(svm->mm);
		vma = find_extend_vma(svm->mm, address);
		if (!vma || address < vma->vm_start)
			goto invalid;

		if (access_error(vma, req))
			goto invalid;

		ret = handle_mm_fault(vma, address,
				      req->wr_req ? FAULT_FLAG_WRITE : 0,
				      NULL);
		if (ret & VM_FAULT_ERROR)
			goto invalid;

		result = QI_RESP_SUCCESS;
invalid:
		mmap_read_unlock(svm->mm);
		mmput(svm->mm);
bad_req:
		WARN_ON(!sdev);
		if (sdev && sdev->ops && sdev->ops->fault_cb) {
			int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
				(req->exe_req << 1) | (req->pm_req);
			sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
					    req->priv_data, rwxp, result);
		}
		/* We get here in the error case where the PASID lookup failed,
		 * and these can be NULL. Do not use them below this point! */
		sdev = NULL;
		svm = NULL;
no_pasid:
		if (req->lpig || req->priv_data_present) {
			/*
			 * Per VT-d spec. v3.0 ch7.7, system software must
			 * respond with page group response if private data
			 * is present (PDP) or last page in group (LPIG) bit
			 * is set. This is an additional VT-d feature beyond
			 * PCI ATS spec.
			 */
			resp.qw0 = QI_PGRP_PASID(req->pasid) |
				QI_PGRP_DID(req->rid) |
				QI_PGRP_PASID_P(req->pasid_present) |
				QI_PGRP_PDP(req->priv_data_present) |
				QI_PGRP_RESP_CODE(result) |
				QI_PGRP_RESP_TYPE;
			resp.qw1 = QI_PGRP_IDX(req->prg_index) |
				QI_PGRP_LPIG(req->lpig);
			resp.qw2 = 0;
			resp.qw3 = 0;

			if (req->priv_data_present)
				memcpy(&resp.qw2, req->priv_data,
				       sizeof(req->priv_data));
			qi_submit_sync(iommu, &resp, 1, 0);
		}
prq_advance:
		head = (head + sizeof(*req)) & PRQ_RING_MASK;
	}

	dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);

	/*
	 * Clear the page request overflow bit and wake up all threads that
	 * are waiting for the completion of this handling.
	 */
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO)
		writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);

	if (!completion_done(&iommu->prq_complete))
		complete(&iommu->prq_complete);

	return IRQ_RETVAL(handled);
}

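/*
 * iommu_sva API entry points: intel_svm_bind() and intel_svm_unbind()
 * wrap intel_svm_bind_mm()/intel_svm_unbind_mm() under pasid_mutex, and
 * intel_svm_get_pasid() returns the PASID backing a bind handle.
 */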
#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
struct iommu_sva *
intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
{
	struct iommu_sva *sva = ERR_PTR(-EINVAL);
	struct intel_svm_dev *sdev = NULL;
	unsigned int flags = 0;
	int ret;

	/*
	 * TODO: Consolidate with generic iommu-sva bind after it is merged.
	 * It will require shared SVM data structures, i.e. combine io_mm
	 * and intel_svm etc.
	 */
	if (drvdata)
		flags = *(unsigned int *)drvdata;
	mutex_lock(&pasid_mutex);
	ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev);
	if (ret)
		sva = ERR_PTR(ret);
	else if (sdev)
		sva = &sdev->sva;
	else
		WARN(!sdev, "SVM bind succeeded with no sdev!\n");

	mutex_unlock(&pasid_mutex);

	return sva;
}

void intel_svm_unbind(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	intel_svm_unbind_mm(sdev->dev, sdev->pasid);
	mutex_unlock(&pasid_mutex);
}

u32 intel_svm_get_pasid(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;
	u32 pasid;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	pasid = sdev->pasid;
	mutex_unlock(&pasid_mutex);

	return pasid;
}

int intel_svm_page_response(struct device *dev,
			    struct iommu_fault_event *evt,
			    struct iommu_page_response *msg)
{
	struct iommu_fault_page_request *prm;
	struct intel_svm_dev *sdev = NULL;
	struct intel_svm *svm = NULL;
	struct intel_iommu *iommu;
	bool private_present;
	bool pasid_present;
	bool last_page;
	u8 bus, devfn;
	int ret = 0;
	u16 sid;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!msg || !evt)
		return -EINVAL;

	mutex_lock(&pasid_mutex);

	prm = &evt->fault.prm;
	sid = PCI_DEVID(bus, devfn);
	pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
	private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
	last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;

	if (!pasid_present) {
		ret = -EINVAL;
		goto out;
	}

	if (prm->pasid == 0 || prm->pasid >= PASID_MAX) {
		ret = -EINVAL;
		goto out;
	}

	ret = pasid_to_svm_sdev(dev, prm->pasid, &svm, &sdev);
	if (ret || !sdev) {
		ret = -ENODEV;
		goto out;
	}

	/*
	 * For responses from userspace, we need to make sure that the
	 * PASID has been bound to the mm of the responding process.
	 */
	if (svm->flags & SVM_FLAG_GUEST_MODE) {
		struct mm_struct *mm;

		mm = get_task_mm(current);
		if (!mm) {
			ret = -EINVAL;
			goto out;
		}

		if (mm != svm->mm) {
			ret = -ENODEV;
			mmput(mm);
			goto out;
		}

		mmput(mm);
	}

	/*
	 * Per VT-d spec. v3.0 ch7.7, system software must respond
	 * with page group response if private data is present (PDP)
	 * or last page in group (LPIG) bit is set. This is an
	 * additional VT-d requirement beyond PCI ATS spec.
	 */
	if (last_page || private_present) {
		struct qi_desc desc;

		desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) |
				QI_PGRP_PASID_P(pasid_present) |
				QI_PGRP_PDP(private_present) |
				QI_PGRP_RESP_CODE(msg->code) |
				QI_PGRP_RESP_TYPE;
		desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page);
		desc.qw2 = 0;
		desc.qw3 = 0;
		if (private_present)
			memcpy(&desc.qw2, prm->private_data,
			       sizeof(prm->private_data));

		qi_submit_sync(iommu, &desc, 1, 0);
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}