// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright 2010
 *  by Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
 *
 * This code provides an IOMMU for Xen PV guests with PCI passthrough.
 *
 * PV guests under Xen run in a non-contiguous memory architecture.
 *
 * When PCI pass-through is utilized, this necessitates an IOMMU for
 * translating bus (DMA) to virtual and vice-versa and also providing a
 * mechanism to have contiguous pages for device driver operations (say DMA
 * operations).
 *
 * Specifically, under Xen the Linux idea of pages is an illusion. It
 * assumes that pages start at zero and go up to the available memory. To
 * help with that, the Linux Xen MMU provides a lookup mechanism to
 * translate the page frame numbers (PFN) to machine frame numbers (MFN)
 * and vice-versa. The MFNs are the "real" frame numbers. Furthermore
 * memory is not contiguous. The Xen hypervisor stitches memory for guests
 * from different pools, which means there is no guarantee that PFN==MFN
 * and PFN+1==MFN+1. Lastly with Xen 4.0, pages (in debug mode) are
 * allocated in descending order (high to low), meaning the guest might
 * never get any MFNs under the 4GB mark.
 */

#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt

#include <linux/memblock.h>
#include <linux/dma-direct.h>
#include <linux/export.h>
#include <xen/swiotlb-xen.h>
#include <xen/page.h>
#include <xen/xen-ops.h>
#include <xen/hvc-console.h>

#include <asm/dma-mapping.h>
#include <asm/xen/page-coherent.h>

#include <trace/events/swiotlb.h>
/*
 * Used to do a quick range check in swiotlb_tbl_unmap_single and
 * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
 * API.
 */

static char *xen_io_tlb_start, *xen_io_tlb_end;
static unsigned long xen_io_tlb_nslabs;
/*
 * Quick lookup value of the bus address of the IOTLB.
 */

static u64 start_dma_addr;
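/*
 * Worked example (illustrative only, assuming a 4 KiB XEN_PAGE_SIZE): for
 * the pseudo-physical address 0x12345678, XEN_PFN_DOWN() yields frame
 * 0x12345 and the in-page offset is 0x678.  pfn_to_bfn() swaps the guest
 * frame for the machine (bus) frame and the offset is OR'ed back in, so
 * only the frame-number bits of the address change in the helpers below.
 */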
/*
 * Both of these functions should avoid XEN_PFN_PHYS because phys_addr_t
 * can be 32bit when dma_addr_t is 64bit leading to a loss in
 * information if the shift is done before casting to 64bit.
 */
static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
{
	unsigned long bfn = pfn_to_bfn(XEN_PFN_DOWN(paddr));
	dma_addr_t dma = (dma_addr_t)bfn << XEN_PAGE_SHIFT;

	dma |= paddr & ~XEN_PAGE_MASK;

	return dma;
}

static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
{
	unsigned long xen_pfn = bfn_to_pfn(XEN_PFN_DOWN(baddr));
	dma_addr_t dma = (dma_addr_t)xen_pfn << XEN_PAGE_SHIFT;
	phys_addr_t paddr = dma;

	paddr |= baddr & ~XEN_PAGE_MASK;

	return paddr;
}

static inline dma_addr_t xen_virt_to_bus(void *address)
{
	return xen_phys_to_bus(virt_to_phys(address));
}

static inline int range_straddles_page_boundary(phys_addr_t p, size_t size)
{
	unsigned long next_bfn, xen_pfn = XEN_PFN_DOWN(p);
	unsigned int i, nr_pages = XEN_PFN_UP(xen_offset_in_page(p) + size);

	next_bfn = pfn_to_bfn(xen_pfn);

	for (i = 1; i < nr_pages; i++)
		if (pfn_to_bfn(++xen_pfn) != ++next_bfn)
			return 1;

	return 0;
}

static int is_xen_swiotlb_buffer(dma_addr_t dma_addr)
{
	unsigned long bfn = XEN_PFN_DOWN(dma_addr);
	unsigned long xen_pfn = bfn_to_local_pfn(bfn);
	phys_addr_t paddr = XEN_PFN_PHYS(xen_pfn);

	/* If the address is outside our domain, it CAN
	 * have the same virtual address as another address
	 * in our domain. Therefore _only_ check address within our domain.
	 */
	if (pfn_valid(PFN_DOWN(paddr))) {
		return paddr >= virt_to_phys(xen_io_tlb_start) &&
		       paddr < virt_to_phys(xen_io_tlb_end);
	}
	return 0;
}

static int max_dma_bits = 32;

static int
xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
{
	int i, rc;
	int dma_bits;
	dma_addr_t dma_handle;
	phys_addr_t p = virt_to_phys(buf);

	dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;

	i = 0;
	do {
		int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE);

		do {
			rc = xen_create_contiguous_region(
				p + (i << IO_TLB_SHIFT),
				get_order(slabs << IO_TLB_SHIFT),
				dma_bits, &dma_handle);
		} while (rc && dma_bits++ < max_dma_bits);
		if (rc)
			return rc;

		i += slabs;
	} while (i < nslabs);
	return 0;
}
static unsigned long xen_set_nslabs(unsigned long nr_tbl)
{
	if (!nr_tbl) {
		xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT);
		xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE);
	} else
		xen_io_tlb_nslabs = nr_tbl;

	return xen_io_tlb_nslabs << IO_TLB_SHIFT;
}

enum xen_swiotlb_err {
	XEN_SWIOTLB_UNKNOWN = 0,
	XEN_SWIOTLB_ENOMEM,
	XEN_SWIOTLB_EFIXUP
};

static const char *xen_swiotlb_error(enum xen_swiotlb_err err)
{
	switch (err) {
	case XEN_SWIOTLB_ENOMEM:
		return "Cannot allocate Xen-SWIOTLB buffer\n";
	case XEN_SWIOTLB_EFIXUP:
		return "Failed to get contiguous memory for DMA from Xen!\n"\
		    "You either: don't have the permissions, do not have"\
		    " enough free memory under 4GB, or the hypervisor memory"\
		    " is too fragmented!";
	default:
		break;
	}
	return "";
}
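/*
 * Set up the Xen software IO TLB: allocate the bounce buffer (from
 * memblock when "early", from the page allocator otherwise), have
 * xen_swiotlb_fixup() exchange it for machine-contiguous memory under
 * 4GB, and hand it to the core swiotlb code.  On failure the requested
 * size is halved and the whole sequence is retried up to three times.
 */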
int __ref xen_swiotlb_init(int verbose, bool early)
{
	unsigned long bytes, order;
	int rc = -ENOMEM;
	enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN;
	unsigned int repeat = 3;

	xen_io_tlb_nslabs = swiotlb_nr_tbl();
retry:
	bytes = xen_set_nslabs(xen_io_tlb_nslabs);
	order = get_order(xen_io_tlb_nslabs << IO_TLB_SHIFT);

	/*
	 * IO TLB memory already allocated. Just use it.
	 */
	if (io_tlb_start != 0) {
		xen_io_tlb_start = phys_to_virt(io_tlb_start);
		goto end;
	}

	/*
	 * Get IO TLB memory from any location.
	 */
	if (early) {
		xen_io_tlb_start = memblock_alloc(PAGE_ALIGN(bytes),
						  PAGE_SIZE);
		if (!xen_io_tlb_start)
			panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
			      __func__, PAGE_ALIGN(bytes), PAGE_SIZE);
	} else {
#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
		while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
			xen_io_tlb_start = (void *)xen_get_swiotlb_free_pages(order);
			if (xen_io_tlb_start)
				break;
			order--;
		}
		if (order != get_order(bytes)) {
			pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n",
				(PAGE_SIZE << order) >> 20);
			xen_io_tlb_nslabs = SLABS_PER_PAGE << order;
			bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT;
		}
	}
	if (!xen_io_tlb_start) {
		m_ret = XEN_SWIOTLB_ENOMEM;
		goto error;
	}
	/*
	 * And replace that memory with pages under 4GB.
	 */
	rc = xen_swiotlb_fixup(xen_io_tlb_start,
			       bytes,
			       xen_io_tlb_nslabs);
	if (rc) {
		if (early)
			memblock_free(__pa(xen_io_tlb_start),
				      PAGE_ALIGN(bytes));
		else {
			free_pages((unsigned long)xen_io_tlb_start, order);
			xen_io_tlb_start = NULL;
		}
		m_ret = XEN_SWIOTLB_EFIXUP;
		goto error;
	}
	start_dma_addr = xen_virt_to_bus(xen_io_tlb_start);
	if (early) {
		if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs,
					  verbose))
			panic("Cannot allocate SWIOTLB buffer");
		rc = 0;
	} else
		rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs);

end:
	xen_io_tlb_end = xen_io_tlb_start + bytes;
	if (!rc)
		swiotlb_set_max_segment(PAGE_SIZE);

	return rc;
error:
	if (repeat--) {
		xen_io_tlb_nslabs = max(1024UL, /* Min is 2MB */
					(xen_io_tlb_nslabs >> 1));
		pr_info("Lowering to %luMB\n",
			(xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20);
		goto retry;
	}
	pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc);
	if (early)
		panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc);
	else
		free_pages((unsigned long)xen_io_tlb_start, order);
	return rc;
}
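/*
 * Coherent allocation: the returned buffer must be machine-contiguous and
 * reachable through the device's coherent DMA mask.  If the pages handed
 * back by xen_alloc_coherent_pages() do not already satisfy that, they are
 * exchanged via xen_create_contiguous_region() and the page is marked
 * XenRemapped so that xen_swiotlb_free_coherent() knows to undo the
 * exchange.
 */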
static void *
xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
			   dma_addr_t *dma_handle, gfp_t flags,
			   unsigned long attrs)
{
	void *ret;
	int order = get_order(size);
	u64 dma_mask = DMA_BIT_MASK(32);
	phys_addr_t phys;
	dma_addr_t dev_addr;

	/*
	 * Ignore region specifiers - the kernel's ideas of
	 * pseudo-phys memory layout has nothing to do with the
	 * machine physical layout.  We can't allocate highmem
	 * because we can't return a pointer to it.
	 */
	flags &= ~(__GFP_DMA | __GFP_HIGHMEM);

	/* Convert the size to actually allocated. */
	size = 1UL << (order + XEN_PAGE_SHIFT);

	/* On ARM this function returns an ioremap'ped virtual address for
	 * which virt_to_phys doesn't return the corresponding physical
	 * address. In fact on ARM virt_to_phys only works for kernel direct
	 * mapped RAM memory. Also see comment below.
	 */
	ret = xen_alloc_coherent_pages(hwdev, size, dma_handle, flags, attrs);

	if (!ret)
		return ret;

	if (hwdev && hwdev->coherent_dma_mask)
		dma_mask = hwdev->coherent_dma_mask;

	/* At this point dma_handle is the physical address, next we are
	 * going to set it to the machine address.
	 * Do not use virt_to_phys(ret) because on ARM it doesn't correspond
	 * to *dma_handle. */
	phys = *dma_handle;
	dev_addr = xen_phys_to_bus(phys);
	if (((dev_addr + size - 1 <= dma_mask)) &&
	    !range_straddles_page_boundary(phys, size))
		*dma_handle = dev_addr;
	else {
		if (xen_create_contiguous_region(phys, order,
						 fls64(dma_mask), dma_handle) != 0) {
			xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs);
			return NULL;
		}
		SetPageXenRemapped(virt_to_page(ret));
	}
	memset(ret, 0, size);
	return ret;
}

static void
xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
			  dma_addr_t dev_addr, unsigned long attrs)
{
	int order = get_order(size);
	phys_addr_t phys;
	u64 dma_mask = DMA_BIT_MASK(32);

	if (hwdev && hwdev->coherent_dma_mask)
		dma_mask = hwdev->coherent_dma_mask;

	/* do not use virt_to_phys because on ARM it doesn't return you the
	 * physical address */
	phys = xen_bus_to_phys(dev_addr);

	/* Convert the size to actually allocated. */
	size = 1UL << (order + XEN_PAGE_SHIFT);

	if (!WARN_ON((dev_addr + size - 1 > dma_mask) ||
		     range_straddles_page_boundary(phys, size)) &&
	    TestClearPageXenRemapped(virt_to_page(vaddr)))
		xen_destroy_contiguous_region(phys, order);

	xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs);
}
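/*
 * Driver-side view (a sketch, not part of this file): drivers reach the
 * streaming hooks below through the generic DMA API rather than calling
 * them directly, e.g.
 *
 *	dma_addr_t dma = dma_map_page(dev, page, offset, size, DMA_TO_DEVICE);
 *	if (dma_mapping_error(dev, dma))
 *		return -ENOMEM;
 *	...
 *	dma_unmap_page(dev, dma, size, DMA_TO_DEVICE);
 *
 * which dispatches to xen_swiotlb_map_page()/xen_swiotlb_unmap_page() once
 * xen_swiotlb_dma_ops is installed for the device.
 */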
/*
 * Map a single buffer of the indicated size for DMA in streaming mode.  The
 * physical address to use is returned.
 *
 * Once the device is given the dma address, the device owns this memory until
 * either xen_swiotlb_unmap_page or xen_swiotlb_dma_sync_single is performed.
 */
static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
				unsigned long offset, size_t size,
				enum dma_data_direction dir,
				unsigned long attrs)
{
	phys_addr_t map, phys = page_to_phys(page) + offset;
	dma_addr_t dev_addr = xen_phys_to_bus(phys);

	BUG_ON(dir == DMA_NONE);
	/*
	 * If the address happens to be in the device's DMA window,
	 * we can safely return the device addr and not worry about bounce
	 * buffering it.
	 */
	if (dma_capable(dev, dev_addr, size) &&
	    !range_straddles_page_boundary(phys, size) &&
	    !xen_arch_need_swiotlb(dev, phys, dev_addr) &&
	    swiotlb_force != SWIOTLB_FORCE)
		goto done;

	/*
	 * Oh well, have to allocate and map a bounce buffer.
	 */
	trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force);

	map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir,
				     attrs);
	if (map == (phys_addr_t)DMA_MAPPING_ERROR)
		return DMA_MAPPING_ERROR;

	dev_addr = xen_phys_to_bus(map);

	/*
	 * Ensure that the address returned is DMA'ble
	 */
	if (unlikely(!dma_capable(dev, dev_addr, size))) {
		swiotlb_tbl_unmap_single(dev, map, size, dir,
					 attrs | DMA_ATTR_SKIP_CPU_SYNC);
		return DMA_MAPPING_ERROR;
	}

	page = pfn_to_page(map >> PAGE_SHIFT);
	offset = map & ~PAGE_MASK;
done:
	/*
	 * we are not interested in the dma_addr returned by xen_dma_map_page,
	 * only in the potential cache flushes executed by the function.
	 */
	xen_dma_map_page(dev, page, dev_addr, offset, size, dir, attrs);
	return dev_addr;
}

/*
 * Unmap a single streaming mode DMA translation.  The dma_addr and size must
 * match what was provided for in a previous xen_swiotlb_map_page call.  All
 * other usages are undefined.
 *
 * After this call, reads by the cpu to the buffer are guaranteed to see
 * whatever the device wrote there.
 */
static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
			     size_t size, enum dma_data_direction dir,
			     unsigned long attrs)
{
	phys_addr_t paddr = xen_bus_to_phys(dev_addr);

	BUG_ON(dir == DMA_NONE);

	xen_dma_unmap_page(hwdev, dev_addr, size, dir, attrs);

	/* NOTE: We use dev_addr here, not paddr! */
	if (is_xen_swiotlb_buffer(dev_addr))
		swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs);
}

static void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
				   size_t size, enum dma_data_direction dir,
				   unsigned long attrs)
{
	xen_unmap_single(hwdev, dev_addr, size, dir, attrs);
}
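/*
 * The ordering in the two sync helpers below is deliberate: for the CPU
 * direction the architectural cache maintenance runs before the bounce
 * buffer is copied back to the original buffer, while for the device
 * direction the data is copied into the bounce buffer first and the cache
 * maintenance runs last, so the device always observes up-to-date data.
 */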
static void
xen_swiotlb_sync_single_for_cpu(struct device *dev, dma_addr_t dma_addr,
		size_t size, enum dma_data_direction dir)
{
	phys_addr_t paddr = xen_bus_to_phys(dma_addr);

	xen_dma_sync_single_for_cpu(dev, dma_addr, size, dir);

	if (is_xen_swiotlb_buffer(dma_addr))
		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
}

static void
xen_swiotlb_sync_single_for_device(struct device *dev, dma_addr_t dma_addr,
		size_t size, enum dma_data_direction dir)
{
	phys_addr_t paddr = xen_bus_to_phys(dma_addr);

	if (is_xen_swiotlb_buffer(dma_addr))
		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);

	xen_dma_sync_single_for_device(dev, dma_addr, size, dir);
}

/*
 * Unmap a set of streaming mode DMA translations.  Again, cpu read rules
 * concerning calls here are the same as for swiotlb_unmap_page() above.
 */
static void
xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
		enum dma_data_direction dir, unsigned long attrs)
{
	struct scatterlist *sg;
	int i;

	BUG_ON(dir == DMA_NONE);

	for_each_sg(sgl, sg, nelems, i)
		xen_unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, attrs);

}

static int
xen_swiotlb_map_sg(struct device *dev, struct scatterlist *sgl, int nelems,
		enum dma_data_direction dir, unsigned long attrs)
{
	struct scatterlist *sg;
	int i;

	BUG_ON(dir == DMA_NONE);

	for_each_sg(sgl, sg, nelems, i) {
		sg->dma_address = xen_swiotlb_map_page(dev, sg_page(sg),
				sg->offset, sg->length, dir, attrs);
		if (sg->dma_address == DMA_MAPPING_ERROR)
			goto out_unmap;
		sg_dma_len(sg) = sg->length;
	}

	return nelems;
out_unmap:
	xen_swiotlb_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
	sg_dma_len(sgl) = 0;
	return 0;
}

static void
xen_swiotlb_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
			    int nelems, enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sgl, sg, nelems, i) {
		xen_swiotlb_sync_single_for_cpu(dev, sg->dma_address,
				sg->length, dir);
	}
}

static void
xen_swiotlb_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
			       int nelems, enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sgl, sg, nelems, i) {
		xen_swiotlb_sync_single_for_device(dev, sg->dma_address,
				sg->length, dir);
	}
}

/*
 * Return whether the given device DMA address mask can be supported
 * properly.  For example, if your device can only drive the low 24-bits
 * during bus mastering, then you would pass 0x00ffffff as the mask to
 * this function.
 */
static int
xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)
{
	return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask;
}

/*
 * Create userspace mapping for the DMA-coherent memory.
 * This function should be called with the pages from the current domain only,
 * passing pages mapped from other domains would lead to memory corruption.
 */
static int
xen_swiotlb_dma_mmap(struct device *dev, struct vm_area_struct *vma,
		     void *cpu_addr, dma_addr_t dma_addr, size_t size,
		     unsigned long attrs)
{
#ifdef CONFIG_ARM
	if (xen_get_dma_ops(dev)->mmap)
		return xen_get_dma_ops(dev)->mmap(dev, vma, cpu_addr,
						  dma_addr, size, attrs);
#endif
	return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
}

/*
 * This function should be called with the pages from the current domain only,
 * passing pages mapped from other domains would lead to memory corruption.
 */
static int
xen_swiotlb_get_sgtable(struct device *dev, struct sg_table *sgt,
			void *cpu_addr, dma_addr_t handle, size_t size,
			unsigned long attrs)
{
#ifdef CONFIG_ARM
	if (xen_get_dma_ops(dev)->get_sgtable) {
#if 0
	/*
	 * This check verifies that the page belongs to the current domain and
	 * is not one mapped from another domain.
	 * This check is for debug only, and should not go to production build
	 */
		unsigned long bfn = PHYS_PFN(dma_to_phys(dev, handle));
		BUG_ON(!page_is_ram(bfn));
#endif
		return xen_get_dma_ops(dev)->get_sgtable(dev, sgt, cpu_addr,
							 handle, size, attrs);
	}
#endif
	return dma_common_get_sgtable(dev, sgt, cpu_addr, handle, size, attrs);
}

const struct dma_map_ops xen_swiotlb_dma_ops = {
	.alloc = xen_swiotlb_alloc_coherent,
	.free = xen_swiotlb_free_coherent,
	.sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
	.sync_single_for_device = xen_swiotlb_sync_single_for_device,
	.sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
	.sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
	.map_sg = xen_swiotlb_map_sg,
	.unmap_sg = xen_swiotlb_unmap_sg,
	.map_page = xen_swiotlb_map_page,
	.unmap_page = xen_swiotlb_unmap_page,
	.dma_supported = xen_swiotlb_dma_supported,
	.mmap = xen_swiotlb_dma_mmap,
	.get_sgtable = xen_swiotlb_get_sgtable,
};
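/*
 * Note: nothing in this file installs xen_swiotlb_dma_ops; the
 * per-architecture Xen setup code is expected to call xen_swiotlb_init()
 * and point the relevant dma_ops at this structure (a description of the
 * expected wiring, not something enforced here).
 */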