// SPDX-License-Identifier: GPL-2.0-only
/*
 * CPU-agnostic AMD IO page table allocator.
 *
 * Copyright (C) 2020 Advanced Micro Devices, Inc.
 * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
 */

#define pr_fmt(fmt)	"AMD-Vi: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
#include <linux/kernel.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/dma-mapping.h>

#include <asm/barrier.h>

#include "amd_iommu_types.h"
#include "amd_iommu.h"

static void v1_tlb_flush_all(void *cookie)
{
}

static void v1_tlb_flush_walk(unsigned long iova, size_t size,
			      size_t granule, void *cookie)
{
}

static void v1_tlb_add_page(struct iommu_iotlb_gather *gather,
			    unsigned long iova, size_t granule,
			    void *cookie)
{
}

static const struct iommu_flush_ops v1_flush_ops = {
	.tlb_flush_all	= v1_tlb_flush_all,
	.tlb_flush_walk	= v1_tlb_flush_walk,
	.tlb_add_page	= v1_tlb_add_page,
};

/*
 * Helper function to get the first pte of a large mapping
 */
static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
			 unsigned long *count)
{
	unsigned long pte_mask, pg_size, cnt;
	u64 *fpte;

	pg_size  = PTE_PAGE_SIZE(*pte);
	cnt      = PAGE_SIZE_PTE_COUNT(pg_size);
	pte_mask = ~((cnt << 3) - 1);
	fpte     = (u64 *)(((unsigned long)pte) & pte_mask);

	if (page_size)
		*page_size = pg_size;

	if (count)
		*count = cnt;

	return fpte;
}
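
/*
 * Illustrative example, assuming the usual AMD IOMMU replicated-PTE encoding
 * for non-default page sizes: a 32 KiB mapping at the 4 KiB level is expected
 * to be backed by eight identical 8-byte PTEs whose next-level field is 7.
 * For such a mapping first_pte_l7() computes cnt = 8 and
 * pte_mask = ~(8 * 8 - 1), so a pointer to any entry of the group is rounded
 * down to its naturally aligned first entry.
 */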

/****************************************************************************
 *
 * The functions below are used to create the page table mappings for
 * unity mapped regions.
 *
 ****************************************************************************/

static void free_page_list(struct page *freelist)
{
	while (freelist != NULL) {
		unsigned long p = (unsigned long)page_address(freelist);

		freelist = freelist->freelist;
		free_page(p);
	}
}

static struct page *free_pt_page(unsigned long pt, struct page *freelist)
{
	struct page *p = virt_to_page((void *)pt);

	p->freelist = freelist;

	return p;
}

#define DEFINE_FREE_PT_FN(LVL, FN)						\
static struct page *free_pt_##LVL (unsigned long __pt, struct page *freelist)	\
{										\
	unsigned long p;							\
	u64 *pt;								\
	int i;									\
										\
	pt = (u64 *)__pt;							\
										\
	for (i = 0; i < 512; ++i) {						\
		/* PTE present? */						\
		if (!IOMMU_PTE_PRESENT(pt[i]))					\
			continue;						\
										\
		/* Large PTE? */						\
		if (PM_PTE_LEVEL(pt[i]) == 0 ||					\
		    PM_PTE_LEVEL(pt[i]) == 7)					\
			continue;						\
										\
		p = (unsigned long)IOMMU_PTE_PAGE(pt[i]);			\
		freelist = FN(p, freelist);					\
	}									\
										\
	return free_pt_page((unsigned long)pt, freelist);			\
}

DEFINE_FREE_PT_FN(l2, free_pt_page)
DEFINE_FREE_PT_FN(l3, free_pt_l2)
DEFINE_FREE_PT_FN(l4, free_pt_l3)
DEFINE_FREE_PT_FN(l5, free_pt_l4)
DEFINE_FREE_PT_FN(l6, free_pt_l5)

static struct page *free_sub_pt(unsigned long root, int mode,
				struct page *freelist)
{
	switch (mode) {
	case PAGE_MODE_NONE:
	case PAGE_MODE_7_LEVEL:
		break;
	case PAGE_MODE_1_LEVEL:
		freelist = free_pt_page(root, freelist);
		break;
	case PAGE_MODE_2_LEVEL:
		freelist = free_pt_l2(root, freelist);
		break;
	case PAGE_MODE_3_LEVEL:
		freelist = free_pt_l3(root, freelist);
		break;
	case PAGE_MODE_4_LEVEL:
		freelist = free_pt_l4(root, freelist);
		break;
	case PAGE_MODE_5_LEVEL:
		freelist = free_pt_l5(root, freelist);
		break;
	case PAGE_MODE_6_LEVEL:
		freelist = free_pt_l6(root, freelist);
		break;
	default:
		BUG();
	}

	return freelist;
}

void amd_iommu_domain_set_pgtable(struct protection_domain *domain,
				  u64 *root, int mode)
{
	u64 pt_root;

	/* lowest 3 bits encode pgtable mode */
	pt_root = mode & 7;
	pt_root |= (u64)root;

	amd_iommu_domain_set_pt_root(domain, pt_root);
}

/*
 * This function is used to add another level to an IO page table. Adding
 * another level increases the size of the address space by 9 bits to a size up
 * to 64 bits.
 */
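/*
 * For example, a protection domain typically starts out with a 3-level table,
 * which covers IOVAs below 1ULL << (12 + 3 * 9) = 2^39 (twelve page-offset
 * bits plus three 9-bit index levels).  One call to increase_address_space()
 * re-parents the old root under a freshly allocated PDE and raises that limit
 * to 2^48; PAGE_MODE_6_LEVEL is the hard ceiling enforced below.  The 3-level
 * starting point is the default chosen by the AMD IOMMU driver, not something
 * this file enforces.
 */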
static bool increase_address_space(struct protection_domain *domain,
				   unsigned long address,
				   gfp_t gfp)
{
	unsigned long flags;
	bool ret = true;
	u64 *pte;

	pte = (void *)get_zeroed_page(gfp);
	if (!pte)
		return false;

	spin_lock_irqsave(&domain->lock, flags);

	if (address <= PM_LEVEL_SIZE(domain->iop.mode))
		goto out;

	ret = false;
	if (WARN_ON_ONCE(domain->iop.mode == PAGE_MODE_6_LEVEL))
		goto out;

	*pte = PM_LEVEL_PDE(domain->iop.mode, iommu_virt_to_phys(domain->iop.root));

	domain->iop.root  = pte;
	domain->iop.mode += 1;
	amd_iommu_update_and_flush_device_table(domain);
	amd_iommu_domain_flush_complete(domain);

	/*
	 * Device Table needs to be updated and flushed before the new root can
	 * be published.
	 */
	amd_iommu_domain_set_pgtable(domain, pte, domain->iop.mode);

	pte = NULL;
	ret = true;

out:
	spin_unlock_irqrestore(&domain->lock, flags);
	free_page((unsigned long)pte);

	return ret;
}

static u64 *alloc_pte(struct protection_domain *domain,
		      unsigned long address,
		      unsigned long page_size,
		      u64 **pte_page,
		      gfp_t gfp,
		      bool *updated)
{
	int level, end_lvl;
	u64 *pte, *page;

	BUG_ON(!is_power_of_2(page_size));

	while (address > PM_LEVEL_SIZE(domain->iop.mode)) {
		/*
		 * Return an error if there is no memory to update the
		 * page-table.
		 */
		if (!increase_address_space(domain, address, gfp))
			return NULL;
	}

	level   = domain->iop.mode - 1;
	pte     = &domain->iop.root[PM_LEVEL_INDEX(level, address)];
	address = PAGE_SIZE_ALIGN(address, page_size);
	end_lvl = PAGE_SIZE_LEVEL(page_size);

	while (level > end_lvl) {
		u64 __pte, __npte;
		int pte_level;

		__pte     = *pte;
		pte_level = PM_PTE_LEVEL(__pte);

		/*
		 * If we replace a series of large PTEs, we need
		 * to tear down all of them.
		 */
		if (IOMMU_PTE_PRESENT(__pte) &&
		    pte_level == PAGE_MODE_7_LEVEL) {
			unsigned long count, i;
			u64 *lpte;

			lpte = first_pte_l7(pte, NULL, &count);

			/*
			 * Unmap the replicated PTEs that still match the
			 * original large mapping
			 */
			for (i = 0; i < count; ++i)
				cmpxchg64(&lpte[i], __pte, 0ULL);

			*updated = true;
			continue;
		}

		if (!IOMMU_PTE_PRESENT(__pte) ||
		    pte_level == PAGE_MODE_NONE) {
			page = (u64 *)get_zeroed_page(gfp);

			if (!page)
				return NULL;

			__npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));

			/* pte could have been changed somewhere. */
			if (cmpxchg64(pte, __pte, __npte) != __pte)
				free_page((unsigned long)page);
			else if (IOMMU_PTE_PRESENT(__pte))
				*updated = true;

			continue;
		}

		/* No level skipping support yet */
		if (pte_level != level)
			return NULL;

		level -= 1;

		pte = IOMMU_PTE_PAGE(__pte);

		if (pte_page && level == end_lvl)
			*pte_page = pte;

		pte = &pte[PM_LEVEL_INDEX(level, address)];
	}

	return pte;
}

/*
 * This function checks if there is a PTE for a given DMA address. If
 * there is one, it returns the pointer to it.
 */
static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
		      unsigned long address,
		      unsigned long *page_size)
{
	int level;
	u64 *pte;

	*page_size = 0;

	if (address > PM_LEVEL_SIZE(pgtable->mode))
		return NULL;

	level	   = pgtable->mode - 1;
	pte	   = &pgtable->root[PM_LEVEL_INDEX(level, address)];
	*page_size = PTE_LEVEL_PAGE_SIZE(level);

	while (level > 0) {

		/* Not Present */
		if (!IOMMU_PTE_PRESENT(*pte))
			return NULL;

		/* Large PTE */
		if (PM_PTE_LEVEL(*pte) == 7 ||
		    PM_PTE_LEVEL(*pte) == 0)
			break;

		/* No level skipping support yet */
		if (PM_PTE_LEVEL(*pte) != level)
			return NULL;

		level -= 1;

		/* Walk to the next level */
		pte	   = IOMMU_PTE_PAGE(*pte);
		pte	   = &pte[PM_LEVEL_INDEX(level, address)];
		*page_size = PTE_LEVEL_PAGE_SIZE(level);
	}

	/*
	 * If we have a series of large PTEs, make
	 * sure to return a pointer to the first one.
	 */
	if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
		pte = first_pte_l7(pte, page_size, NULL);

	return pte;
}

static struct page *free_clear_pte(u64 *pte, u64 pteval, struct page *freelist)
{
	unsigned long pt;
	int mode;

	while (cmpxchg64(pte, pteval, 0) != pteval) {
		/* pr_fmt() already prepends the "AMD-Vi: " prefix */
		pr_warn("IOMMU pte changed since we read it\n");
		pteval = *pte;
	}

	if (!IOMMU_PTE_PRESENT(pteval))
		return freelist;

	pt   = (unsigned long)IOMMU_PTE_PAGE(pteval);
	mode = IOMMU_PTE_MODE(pteval);

	return free_sub_pt(pt, mode, freelist);
}

/*
 * Generic mapping function. It maps a physical address into a DMA
 * address space. It allocates the page table pages if necessary.
 * In the future it can be extended to a generic mapping function
 * supporting all features of AMD IOMMU page tables like level skipping
 * and full 64 bit address spaces.
 */
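/*
 * Worked example of the large-PTE encoding used below (the exact counts
 * depend on the PAGE_SIZE_PTE_COUNT()/PAGE_SIZE_PTE() definitions in
 * amd_iommu_types.h): a 32 KiB mapping yields count = 8 replicated entries
 * at the 4 KiB level.  Each entry gets its next-level field set to 7 via
 * PM_LEVEL_ENC(7) and the page size encoded in its low address bits by
 * PAGE_SIZE_PTE(), and all eight entries are written with the same value.
 */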
static int iommu_v1_map_page(struct io_pgtable_ops *ops, unsigned long iova,
			     phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
{
	struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
	struct page *freelist = NULL;
	bool updated = false;
	u64 __pte, *pte;
	int ret, i, count;

	BUG_ON(!IS_ALIGNED(iova, size));
	BUG_ON(!IS_ALIGNED(paddr, size));

	ret = -EINVAL;
	if (!(prot & IOMMU_PROT_MASK))
		goto out;

	count = PAGE_SIZE_PTE_COUNT(size);
	pte   = alloc_pte(dom, iova, size, NULL, gfp, &updated);

	ret = -ENOMEM;
	if (!pte)
		goto out;

	for (i = 0; i < count; ++i)
		freelist = free_clear_pte(&pte[i], pte[i], freelist);

	if (freelist != NULL)
		updated = true;

	if (count > 1) {
		__pte = PAGE_SIZE_PTE(__sme_set(paddr), size);
		__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
	} else
		__pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;

	if (prot & IOMMU_PROT_IR)
		__pte |= IOMMU_PTE_IR;
	if (prot & IOMMU_PROT_IW)
		__pte |= IOMMU_PTE_IW;

	for (i = 0; i < count; ++i)
		pte[i] = __pte;

	ret = 0;

out:
	if (updated) {
		unsigned long flags;

		spin_lock_irqsave(&dom->lock, flags);
		/*
		 * Flush domain TLB(s) and wait for completion. Any Device-Table
		 * Updates and flushing already happened in
		 * increase_address_space().
		 */
		amd_iommu_domain_flush_tlb_pde(dom);
		amd_iommu_domain_flush_complete(dom);
		spin_unlock_irqrestore(&dom->lock, flags);
	}

	/* Everything flushed out, free pages now */
	free_page_list(freelist);

	return ret;
}
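
/*
 * Unmap a range by walking the table with fetch_pte() and clearing the PTEs
 * it finds.  For a large mapping, fetch_pte() returns the first replicated
 * entry together with the full mapping size, so unmapping e.g. a 32 KiB
 * large mapping clears all eight replicated PTEs in one loop iteration and
 * advances the IOVA by 32 KiB.  Note that the v1_flush_ops above are no-ops,
 * so TLB invalidation is expected to be handled by the calling driver.
 */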
static unsigned long iommu_v1_unmap_page(struct io_pgtable_ops *ops,
					 unsigned long iova,
					 size_t size,
					 struct iommu_iotlb_gather *gather)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long long unmapped;
	unsigned long unmap_size;
	u64 *pte;

	BUG_ON(!is_power_of_2(size));

	unmapped = 0;

	while (unmapped < size) {
		pte = fetch_pte(pgtable, iova, &unmap_size);
		if (pte) {
			int i, count;

			count = PAGE_SIZE_PTE_COUNT(unmap_size);
			for (i = 0; i < count; i++)
				pte[i] = 0ULL;
		}

		iova = (iova & ~(unmap_size - 1)) + unmap_size;
		unmapped += unmap_size;
	}

	BUG_ON(unmapped && !is_power_of_2(unmapped));

	return unmapped;
}

static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long offset_mask, pte_pgsize;
	u64 *pte, __pte;

	if (pgtable->mode == PAGE_MODE_NONE)
		return iova;

	pte = fetch_pte(pgtable, iova, &pte_pgsize);

	if (!pte || !IOMMU_PTE_PRESENT(*pte))
		return 0;

	offset_mask = pte_pgsize - 1;
	__pte	    = __sme_clr(*pte & PM_ADDR_MASK);

	return (__pte & ~offset_mask) | (iova & offset_mask);
}

/*
 * ----------------------------------------------------
 */
static void v1_free_pgtable(struct io_pgtable *iop)
{
	struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop);
	struct protection_domain *dom;
	struct page *freelist = NULL;
	unsigned long root;

	if (pgtable->mode == PAGE_MODE_NONE)
		return;

	dom = container_of(pgtable, struct protection_domain, iop);

	/* Update data structure */
	amd_iommu_domain_clr_pt_root(dom);

	/* Make changes visible to IOMMUs */
	amd_iommu_domain_update(dom);

	/* Page-table is not visible to IOMMU anymore, so free it */
	BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
	       pgtable->mode > PAGE_MODE_6_LEVEL);

	root = (unsigned long)pgtable->root;
	freelist = free_sub_pt(root, pgtable->mode, freelist);

	free_page_list(freelist);
}

static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
{
	struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);

	cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES;
	cfg->ias	   = IOMMU_IN_ADDR_BIT_SIZE;
	cfg->oas	   = IOMMU_OUT_ADDR_BIT_SIZE;
	cfg->tlb	   = &v1_flush_ops;

	pgtable->iop.ops.map	      = iommu_v1_map_page;
	pgtable->iop.ops.unmap	      = iommu_v1_unmap_page;
	pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;

	return &pgtable->iop;
}

struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
	.alloc	= v1_alloc_pgtable,
	.free	= v1_free_pgtable,
};
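
/*
 * Usage sketch (illustrative, assuming the generic io-pgtable API from
 * <linux/io-pgtable.h> and the AMD_IOMMU_V1 format registration): the AMD
 * IOMMU driver is expected to instantiate this table roughly as
 *
 *	struct io_pgtable_ops *pgtbl_ops;
 *
 *	pgtbl_ops = alloc_io_pgtable_ops(AMD_IOMMU_V1, &cfg, domain);
 *	pgtbl_ops->map(pgtbl_ops, iova, paddr, SZ_4K,
 *		       IOMMU_PROT_IR | IOMMU_PROT_IW, GFP_KERNEL);
 *	...
 *	free_io_pgtable_ops(pgtbl_ops);		// ends up in v1_free_pgtable()
 *
 * where "cfg" must be the io_pgtable_cfg embedded in a struct amd_io_pgtable
 * (see io_pgtable_cfg_to_data() in v1_alloc_pgtable() above) and "domain" is
 * the owning protection_domain passed as the cookie.
 */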