// SPDX-License-Identifier: GPL-2.0-only
/*
 * CPU-agnostic AMD IO page table allocator.
 *
 * Copyright (C) 2020 Advanced Micro Devices, Inc.
 * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
 */

#define pr_fmt(fmt)	"AMD-Vi: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
#include <linux/kernel.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/dma-mapping.h>

#include <asm/barrier.h>

#include "amd_iommu_types.h"
#include "amd_iommu.h"

static void v1_tlb_flush_all(void *cookie)
{
}

static void v1_tlb_flush_walk(unsigned long iova, size_t size,
			      size_t granule, void *cookie)
{
}

static void v1_tlb_add_page(struct iommu_iotlb_gather *gather,
			    unsigned long iova, size_t granule,
			    void *cookie)
{
}

static const struct iommu_flush_ops v1_flush_ops = {
	.tlb_flush_all	= v1_tlb_flush_all,
	.tlb_flush_walk	= v1_tlb_flush_walk,
	.tlb_add_page	= v1_tlb_add_page,
};

/*
 * Helper function to get the first pte of a large mapping
 */
static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
			 unsigned long *count)
{
	unsigned long pte_mask, pg_size, cnt;
	u64 *fpte;

	pg_size  = PTE_PAGE_SIZE(*pte);
	cnt      = PAGE_SIZE_PTE_COUNT(pg_size);
	pte_mask = ~((cnt << 3) - 1);
	fpte     = (u64 *)(((unsigned long)pte) & pte_mask);

	if (page_size)
		*page_size = pg_size;

	if (count)
		*count = cnt;

	return fpte;
}
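/*
 * Worked example (illustrative): a 32 KiB large mapping is stored as
 * PAGE_SIZE_PTE_COUNT(SZ_32K) == 8 replicated PTEs, so pte_mask ends up
 * as ~((8 << 3) - 1) == ~0x3f.  The eight 8-byte entries occupy 64
 * contiguous bytes, and masking the PTE address with pte_mask rounds it
 * down to the first entry of the series.
 */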
/****************************************************************************
 *
 * The functions below are used to create the page table mappings for
 * unity mapped regions.
 *
 ****************************************************************************/

static void free_page_list(struct page *freelist)
{
	while (freelist != NULL) {
		unsigned long p = (unsigned long)page_address(freelist);

		freelist = freelist->freelist;
		free_page(p);
	}
}

static struct page *free_pt_page(unsigned long pt, struct page *freelist)
{
	struct page *p = virt_to_page((void *)pt);

	p->freelist = freelist;

	return p;
}

#define DEFINE_FREE_PT_FN(LVL, FN)						\
static struct page *free_pt_##LVL (unsigned long __pt, struct page *freelist)	\
{										\
	unsigned long p;							\
	u64 *pt;								\
	int i;									\
										\
	pt = (u64 *)__pt;							\
										\
	for (i = 0; i < 512; ++i) {						\
		/* PTE present? */						\
		if (!IOMMU_PTE_PRESENT(pt[i]))					\
			continue;						\
										\
		/* Large PTE? */						\
		if (PM_PTE_LEVEL(pt[i]) == 0 ||					\
		    PM_PTE_LEVEL(pt[i]) == 7)					\
			continue;						\
										\
		p = (unsigned long)IOMMU_PTE_PAGE(pt[i]);			\
		freelist = FN(p, freelist);					\
	}									\
										\
	return free_pt_page((unsigned long)pt, freelist);			\
}

DEFINE_FREE_PT_FN(l2, free_pt_page)
DEFINE_FREE_PT_FN(l3, free_pt_l2)
DEFINE_FREE_PT_FN(l4, free_pt_l3)
DEFINE_FREE_PT_FN(l5, free_pt_l4)
DEFINE_FREE_PT_FN(l6, free_pt_l5)

static struct page *free_sub_pt(unsigned long root, int mode,
				struct page *freelist)
{
	switch (mode) {
	case PAGE_MODE_NONE:
	case PAGE_MODE_7_LEVEL:
		break;
	case PAGE_MODE_1_LEVEL:
		freelist = free_pt_page(root, freelist);
		break;
	case PAGE_MODE_2_LEVEL:
		freelist = free_pt_l2(root, freelist);
		break;
	case PAGE_MODE_3_LEVEL:
		freelist = free_pt_l3(root, freelist);
		break;
	case PAGE_MODE_4_LEVEL:
		freelist = free_pt_l4(root, freelist);
		break;
	case PAGE_MODE_5_LEVEL:
		freelist = free_pt_l5(root, freelist);
		break;
	case PAGE_MODE_6_LEVEL:
		freelist = free_pt_l6(root, freelist);
		break;
	default:
		BUG();
	}

	return freelist;
}

void amd_iommu_domain_set_pgtable(struct protection_domain *domain,
				  u64 *root, int mode)
{
	u64 pt_root;

	/* lowest 3 bits encode pgtable mode */
	pt_root  = mode & 7;
	pt_root |= (u64)root;

	amd_iommu_domain_set_pt_root(domain, pt_root);
}

/*
 * This function is used to add another level to an IO page table. Adding
 * another level increases the size of the address space by 9 bits to a size up
 * to 64 bits.
 */
static bool increase_address_space(struct protection_domain *domain,
				   unsigned long address,
				   gfp_t gfp)
{
	unsigned long flags;
	bool ret = true;
	u64 *pte;

	spin_lock_irqsave(&domain->lock, flags);

	if (address <= PM_LEVEL_SIZE(domain->iop.mode))
		goto out;

	ret = false;
	if (WARN_ON_ONCE(domain->iop.mode == PAGE_MODE_6_LEVEL))
		goto out;

	pte = (void *)get_zeroed_page(gfp);
	if (!pte)
		goto out;

	*pte = PM_LEVEL_PDE(domain->iop.mode, iommu_virt_to_phys(domain->iop.root));

	domain->iop.root  = pte;
	domain->iop.mode += 1;
	amd_iommu_update_and_flush_device_table(domain);
	amd_iommu_domain_flush_complete(domain);

	/*
	 * Device Table needs to be updated and flushed before the new root can
	 * be published.
	 */
	amd_iommu_domain_set_pgtable(domain, pte, domain->iop.mode);

	ret = true;

out:
	spin_unlock_irqrestore(&domain->lock, flags);

	return ret;
}
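/*
 * alloc_pte() walks the page table from the root down to the level that
 * backs @page_size, allocating intermediate page-table pages on the way.
 * New table pages are installed with cmpxchg64() so concurrent mappers
 * cannot clobber each other; @updated is set to true whenever a present
 * entry was replaced, telling the caller that a TLB flush is required
 * before the torn-down pages may be reused.
 */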
static u64 *alloc_pte(struct protection_domain *domain,
		      unsigned long address,
		      unsigned long page_size,
		      u64 **pte_page,
		      gfp_t gfp,
		      bool *updated)
{
	int level, end_lvl;
	u64 *pte, *page;

	BUG_ON(!is_power_of_2(page_size));

	while (address > PM_LEVEL_SIZE(domain->iop.mode)) {
		/*
		 * Return an error if there is no memory to update the
		 * page-table.
		 */
		if (!increase_address_space(domain, address, gfp))
			return NULL;
	}

	level   = domain->iop.mode - 1;
	pte     = &domain->iop.root[PM_LEVEL_INDEX(level, address)];
	address = PAGE_SIZE_ALIGN(address, page_size);
	end_lvl = PAGE_SIZE_LEVEL(page_size);

	while (level > end_lvl) {
		u64 __pte, __npte;
		int pte_level;

		__pte     = *pte;
		pte_level = PM_PTE_LEVEL(__pte);

		/*
		 * If we replace a series of large PTEs, we need
		 * to tear down all of them.
		 */
		if (IOMMU_PTE_PRESENT(__pte) &&
		    pte_level == PAGE_MODE_7_LEVEL) {
			unsigned long count, i;
			u64 *lpte;

			lpte = first_pte_l7(pte, NULL, &count);

			/*
			 * Unmap the replicated PTEs that still match the
			 * original large mapping
			 */
			for (i = 0; i < count; ++i)
				cmpxchg64(&lpte[i], __pte, 0ULL);

			*updated = true;
			continue;
		}

		if (!IOMMU_PTE_PRESENT(__pte) ||
		    pte_level == PAGE_MODE_NONE) {
			page = (u64 *)get_zeroed_page(gfp);

			if (!page)
				return NULL;

			__npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));

			/* pte could have been changed somewhere. */
			if (cmpxchg64(pte, __pte, __npte) != __pte)
				free_page((unsigned long)page);
			else if (IOMMU_PTE_PRESENT(__pte))
				*updated = true;

			continue;
		}

		/* No level skipping support yet */
		if (pte_level != level)
			return NULL;

		level -= 1;

		pte = IOMMU_PTE_PAGE(__pte);

		if (pte_page && level == end_lvl)
			*pte_page = pte;

		pte = &pte[PM_LEVEL_INDEX(level, address)];
	}

	return pte;
}

/*
 * This function checks if there is a PTE for a given dma address. If
 * there is one, it returns the pointer to it.
 */
static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
		      unsigned long address,
		      unsigned long *page_size)
{
	int level;
	u64 *pte;

	*page_size = 0;

	if (address > PM_LEVEL_SIZE(pgtable->mode))
		return NULL;

	level	   = pgtable->mode - 1;
	pte	   = &pgtable->root[PM_LEVEL_INDEX(level, address)];
	*page_size = PTE_LEVEL_PAGE_SIZE(level);

	while (level > 0) {

		/* Not Present */
		if (!IOMMU_PTE_PRESENT(*pte))
			return NULL;

		/* Large PTE */
		if (PM_PTE_LEVEL(*pte) == 7 ||
		    PM_PTE_LEVEL(*pte) == 0)
			break;

		/* No level skipping support yet */
		if (PM_PTE_LEVEL(*pte) != level)
			return NULL;

		level -= 1;

		/* Walk to the next level */
		pte	   = IOMMU_PTE_PAGE(*pte);
		pte	   = &pte[PM_LEVEL_INDEX(level, address)];
		*page_size = PTE_LEVEL_PAGE_SIZE(level);
	}

	/*
	 * If we have a series of large PTEs, make
	 * sure to return a pointer to the first one.
	 */
	if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
		pte = first_pte_l7(pte, page_size, NULL);

	return pte;
}

static struct page *free_clear_pte(u64 *pte, u64 pteval, struct page *freelist)
{
	unsigned long pt;
	int mode;

	/* pr_fmt already prepends "AMD-Vi: ", so don't repeat it here */
	while (cmpxchg64(pte, pteval, 0) != pteval) {
		pr_warn("IOMMU pte changed since we read it\n");
		pteval = *pte;
	}

	if (!IOMMU_PTE_PRESENT(pteval))
		return freelist;

	pt   = (unsigned long)IOMMU_PTE_PAGE(pteval);
	mode = IOMMU_PTE_MODE(pteval);

	return free_sub_pt(pt, mode, freelist);
}
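/*
 * Note on non-native page sizes (illustrative): mapping e.g. 32 KiB,
 * which has no page-table level of its own, writes
 * PAGE_SIZE_PTE_COUNT(SZ_32K) == 8 identical level-0 PTEs tagged
 * PM_LEVEL_ENC(7); the mapped size is encoded in the low bits of the
 * address field and recovered later via PTE_PAGE_SIZE().
 */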
/*
 * Generic mapping function. It maps a physical address into a DMA
 * address space, allocating the page-table pages if necessary.
 * In the future it can be extended to a generic mapping function
 * supporting all features of AMD IOMMU page tables like level skipping
 * and full 64 bit address spaces.
 */
static int iommu_v1_map_page(struct io_pgtable_ops *ops, unsigned long iova,
			     phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
{
	struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
	struct page *freelist = NULL;
	bool updated = false;
	u64 __pte, *pte;
	int ret, i, count;

	BUG_ON(!IS_ALIGNED(iova, size));
	BUG_ON(!IS_ALIGNED(paddr, size));

	ret = -EINVAL;
	if (!(prot & IOMMU_PROT_MASK))
		goto out;

	count = PAGE_SIZE_PTE_COUNT(size);
	pte   = alloc_pte(dom, iova, size, NULL, gfp, &updated);

	ret = -ENOMEM;
	if (!pte)
		goto out;

	for (i = 0; i < count; ++i)
		freelist = free_clear_pte(&pte[i], pte[i], freelist);

	if (freelist != NULL)
		updated = true;

	if (count > 1) {
		__pte = PAGE_SIZE_PTE(__sme_set(paddr), size);
		__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
	} else
		__pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;

	if (prot & IOMMU_PROT_IR)
		__pte |= IOMMU_PTE_IR;
	if (prot & IOMMU_PROT_IW)
		__pte |= IOMMU_PTE_IW;

	for (i = 0; i < count; ++i)
		pte[i] = __pte;

	ret = 0;

out:
	if (updated) {
		unsigned long flags;

		spin_lock_irqsave(&dom->lock, flags);
		/*
		 * Flush domain TLB(s) and wait for completion. Any Device-Table
		 * Updates and flushing already happened in
		 * increase_address_space().
		 */
		amd_iommu_domain_flush_tlb_pde(dom);
		amd_iommu_domain_flush_complete(dom);
		spin_unlock_irqrestore(&dom->lock, flags);
	}

	/* Everything flushed out, free pages now */
	free_page_list(freelist);

	return ret;
}

static unsigned long iommu_v1_unmap_page(struct io_pgtable_ops *ops,
					 unsigned long iova,
					 size_t size,
					 struct iommu_iotlb_gather *gather)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long unmapped, unmap_size;
	u64 *pte;

	BUG_ON(!is_power_of_2(size));

	unmapped = 0;

	while (unmapped < size) {
		pte = fetch_pte(pgtable, iova, &unmap_size);
		if (pte) {
			int i, count;

			count = PAGE_SIZE_PTE_COUNT(unmap_size);
			for (i = 0; i < count; i++)
				pte[i] = 0ULL;
		}

		iova = (iova & ~(unmap_size - 1)) + unmap_size;
		unmapped += unmap_size;
	}

	BUG_ON(unmapped && !is_power_of_2(unmapped));

	return unmapped;
}

static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long offset_mask, pte_pgsize;
	u64 *pte, __pte;

	if (pgtable->mode == PAGE_MODE_NONE)
		return iova;

	pte = fetch_pte(pgtable, iova, &pte_pgsize);

	if (!pte || !IOMMU_PTE_PRESENT(*pte))
		return 0;

	offset_mask = pte_pgsize - 1;
	__pte	    = __sme_clr(*pte & PM_ADDR_MASK);

	return (__pte & ~offset_mask) | (iova & offset_mask);
}
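/*
 * Example (illustrative): for a 2 MiB mapping at IOVA 0x400000,
 * iommu_v1_iova_to_phys(ops, 0x456789) fetches the level-1 PTE with
 * pte_pgsize == SZ_2M, so offset_mask == 0x1fffff and the result is the
 * 2 MiB-aligned physical base ORed with the 0x56789 page offset.
 */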
/*
 * ----------------------------------------------------
 */
static void v1_free_pgtable(struct io_pgtable *iop)
{
	struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop);
	struct protection_domain *dom;
	struct page *freelist = NULL;
	unsigned long root;

	if (pgtable->mode == PAGE_MODE_NONE)
		return;

	dom = container_of(pgtable, struct protection_domain, iop);

	/* Update data structure */
	amd_iommu_domain_clr_pt_root(dom);

	/* Make changes visible to IOMMUs */
	amd_iommu_domain_update(dom);

	/* Page-table is not visible to IOMMU anymore, so free it */
	BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
	       pgtable->mode > PAGE_MODE_6_LEVEL);

	root = (unsigned long)pgtable->root;
	freelist = free_sub_pt(root, pgtable->mode, freelist);

	free_page_list(freelist);
}

static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
{
	struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);

	cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES;
	cfg->ias           = IOMMU_IN_ADDR_BIT_SIZE;
	cfg->oas           = IOMMU_OUT_ADDR_BIT_SIZE;
	cfg->tlb           = &v1_flush_ops;

	pgtable->iop.ops.map          = iommu_v1_map_page;
	pgtable->iop.ops.unmap        = iommu_v1_unmap_page;
	pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;

	return &pgtable->iop;
}

struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
	.alloc	= v1_alloc_pgtable,
	.free	= v1_free_pgtable,
};
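/*
 * The init functions above are reached through this table, which is
 * registered under AMD_IOMMU_V1 in drivers/iommu/io-pgtable.c; users
 * obtain the ops via alloc_io_pgtable_ops(AMD_IOMMU_V1, &cfg, cookie).
 */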