// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2010
 * by Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
 *
 * This code provides an IOMMU for Xen PV guests with PCI passthrough.
 *
 * PV guests under Xen are running in a non-contiguous memory architecture.
 *
 * When PCI pass-through is utilized, this necessitates an IOMMU for
 * translating bus (DMA) addresses to virtual addresses and vice versa, and
 * also a mechanism to provide contiguous pages for device driver operations
 * (say DMA operations).
 *
 * Specifically, under Xen the Linux idea of pages is an illusion. It
 * assumes that pages start at zero and go up to the available memory. To
 * help with that, the Linux Xen MMU provides a lookup mechanism to
 * translate the page frame numbers (PFN) to machine frame numbers (MFN)
 * and vice versa. The MFNs are the "real" frame numbers. Furthermore
 * memory is not contiguous. The Xen hypervisor stitches memory for guests
 * from different pools, which means there is no guarantee that PFN==MFN
 * and PFN+1==MFN+1. Lastly, with Xen 4.0, pages (in debug mode) are
 * allocated in descending order (high to low), meaning the guest might
 * never get any MFNs under the 4GB mark.
 */

#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt

#include <linux/memblock.h>
#include <linux/dma-direct.h>
#include <linux/dma-noncoherent.h>
#include <linux/export.h>
#include <xen/swiotlb-xen.h>
#include <xen/page.h>
#include <xen/xen-ops.h>
#include <xen/hvc-console.h>

#include <asm/dma-mapping.h>
#include <asm/xen/page-coherent.h>

#include <trace/events/swiotlb.h>
/*
 * Used to do a quick range check in swiotlb_tbl_unmap_single and
 * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
 * API.
 */

static char *xen_io_tlb_start, *xen_io_tlb_end;
static unsigned long xen_io_tlb_nslabs;
/*
 * Quick lookup value of the bus address of the IOTLB.
 */

static u64 start_dma_addr;

/*
 * Both of these functions should avoid XEN_PFN_PHYS because phys_addr_t
 * can be 32bit when dma_addr_t is 64bit leading to a loss in
 * information if the shift is done before casting to 64bit.
 */
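/*
 * Worked example (illustrative only, assuming the usual XEN_PAGE_SHIFT of
 * 12): a pseudo-physical address of 0x12345678 splits into Xen page frame
 * 0x12345 and in-page offset 0x678.  If pfn_to_bfn(0x12345) were to return
 * 0xabcde, xen_phys_to_bus() below would yield
 * (0xabcde << 12) | 0x678 == 0xabcde678, and xen_bus_to_phys() performs
 * the reverse lookup via bfn_to_pfn().
 */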
static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
{
	unsigned long bfn = pfn_to_bfn(XEN_PFN_DOWN(paddr));
	dma_addr_t dma = (dma_addr_t)bfn << XEN_PAGE_SHIFT;

	dma |= paddr & ~XEN_PAGE_MASK;

	return dma;
}

static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
{
	unsigned long xen_pfn = bfn_to_pfn(XEN_PFN_DOWN(baddr));
	dma_addr_t dma = (dma_addr_t)xen_pfn << XEN_PAGE_SHIFT;
	phys_addr_t paddr = dma;

	paddr |= baddr & ~XEN_PAGE_MASK;

	return paddr;
}

static inline dma_addr_t xen_virt_to_bus(void *address)
{
	return xen_phys_to_bus(virt_to_phys(address));
}

static inline int range_straddles_page_boundary(phys_addr_t p, size_t size)
{
	unsigned long next_bfn, xen_pfn = XEN_PFN_DOWN(p);
	unsigned int i, nr_pages = XEN_PFN_UP(xen_offset_in_page(p) + size);

	next_bfn = pfn_to_bfn(xen_pfn);

	for (i = 1; i < nr_pages; i++)
		if (pfn_to_bfn(++xen_pfn) != ++next_bfn)
			return 1;

	return 0;
}

static int is_xen_swiotlb_buffer(dma_addr_t dma_addr)
{
	unsigned long bfn = XEN_PFN_DOWN(dma_addr);
	unsigned long xen_pfn = bfn_to_local_pfn(bfn);
	phys_addr_t paddr = XEN_PFN_PHYS(xen_pfn);

	/* If the address is outside our domain, it CAN
	 * have the same virtual address as another address
	 * in our domain. Therefore _only_ check address within our domain.
	 */
	if (pfn_valid(PFN_DOWN(paddr))) {
		return paddr >= virt_to_phys(xen_io_tlb_start) &&
		       paddr < virt_to_phys(xen_io_tlb_end);
	}
	return 0;
}

static int max_dma_bits = 32;
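/*
 * The fixup below walks the freshly allocated buffer in IO_TLB_SEGSIZE-slab
 * chunks and asks Xen to exchange the backing frames for machine frames
 * that are physically contiguous and addressable within dma_bits, relaxing
 * the address restriction one bit at a time (up to max_dma_bits) if the
 * hypervisor cannot satisfy a chunk.  With the usual IO_TLB_SHIFT of 11 and
 * IO_TLB_SEGSIZE of 128, each chunk is 256kB.
 */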
static int
xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
{
	int i, rc;
	int dma_bits;
	dma_addr_t dma_handle;
	phys_addr_t p = virt_to_phys(buf);

	dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;

	i = 0;
	do {
		int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE);

		do {
			rc = xen_create_contiguous_region(
				p + (i << IO_TLB_SHIFT),
				get_order(slabs << IO_TLB_SHIFT),
				dma_bits, &dma_handle);
		} while (rc && dma_bits++ < max_dma_bits);
		if (rc)
			return rc;

		i += slabs;
	} while (i < nslabs);
	return 0;
}

static unsigned long xen_set_nslabs(unsigned long nr_tbl)
{
	if (!nr_tbl) {
		xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT);
		xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE);
	} else
		xen_io_tlb_nslabs = nr_tbl;

	return xen_io_tlb_nslabs << IO_TLB_SHIFT;
}

enum xen_swiotlb_err {
	XEN_SWIOTLB_UNKNOWN = 0,
	XEN_SWIOTLB_ENOMEM,
	XEN_SWIOTLB_EFIXUP
};

static const char *xen_swiotlb_error(enum xen_swiotlb_err err)
{
	switch (err) {
	case XEN_SWIOTLB_ENOMEM:
		return "Cannot allocate Xen-SWIOTLB buffer\n";
	case XEN_SWIOTLB_EFIXUP:
		return "Failed to get contiguous memory for DMA from Xen!\n"\
		       "You either: don't have the permissions, do not have"\
		       " enough free memory under 4GB, or the hypervisor memory"\
		       " is too fragmented!";
	default:
		break;
	}
	return "";
}
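/*
 * Sizing note (illustrative, assuming the usual IO_TLB_SHIFT of 11, i.e.
 * 2kB slabs): the default chosen by xen_set_nslabs() above is 64MB worth
 * of slabs, i.e. 32768.  Each pass through the retry path below halves the
 * slab count (never below 1024 slabs, i.e. 2MB), and at most three retries
 * are attempted before giving up.
 */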
int __ref xen_swiotlb_init(int verbose, bool early)
{
	unsigned long bytes, order;
	int rc = -ENOMEM;
	enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN;
	unsigned int repeat = 3;

	xen_io_tlb_nslabs = swiotlb_nr_tbl();
retry:
	bytes = xen_set_nslabs(xen_io_tlb_nslabs);
	order = get_order(xen_io_tlb_nslabs << IO_TLB_SHIFT);

	/*
	 * IO TLB memory already allocated. Just use it.
	 */
	if (io_tlb_start != 0) {
		xen_io_tlb_start = phys_to_virt(io_tlb_start);
		goto end;
	}

	/*
	 * Get IO TLB memory from any location.
	 */
	if (early) {
		xen_io_tlb_start = memblock_alloc(PAGE_ALIGN(bytes),
						  PAGE_SIZE);
		if (!xen_io_tlb_start)
			panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
			      __func__, PAGE_ALIGN(bytes), PAGE_SIZE);
	} else {
#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
		while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
			xen_io_tlb_start = (void *)xen_get_swiotlb_free_pages(order);
			if (xen_io_tlb_start)
				break;
			order--;
		}
		if (order != get_order(bytes)) {
			pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n",
				(PAGE_SIZE << order) >> 20);
			xen_io_tlb_nslabs = SLABS_PER_PAGE << order;
			bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT;
		}
	}
	if (!xen_io_tlb_start) {
		m_ret = XEN_SWIOTLB_ENOMEM;
		goto error;
	}
	/*
	 * And replace that memory with pages under 4GB.
	 */
	rc = xen_swiotlb_fixup(xen_io_tlb_start,
			       bytes,
			       xen_io_tlb_nslabs);
	if (rc) {
		if (early)
			memblock_free(__pa(xen_io_tlb_start),
				      PAGE_ALIGN(bytes));
		else {
			free_pages((unsigned long)xen_io_tlb_start, order);
			xen_io_tlb_start = NULL;
		}
		m_ret = XEN_SWIOTLB_EFIXUP;
		goto error;
	}
	start_dma_addr = xen_virt_to_bus(xen_io_tlb_start);
	if (early) {
		if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs,
					  verbose))
			panic("Cannot allocate SWIOTLB buffer");
		rc = 0;
	} else
		rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs);

end:
	xen_io_tlb_end = xen_io_tlb_start + bytes;
	if (!rc)
		swiotlb_set_max_segment(PAGE_SIZE);

	return rc;
error:
	if (repeat--) {
		xen_io_tlb_nslabs = max(1024UL, /* Min is 2MB */
					(xen_io_tlb_nslabs >> 1));
		pr_info("Lowering to %luMB\n",
			(xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20);
		goto retry;
	}
	pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc);
	if (early)
		panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc);
	else
		free_pages((unsigned long)xen_io_tlb_start, order);
	return rc;
}

static void *
xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
			   dma_addr_t *dma_handle, gfp_t flags,
			   unsigned long attrs)
{
	void *ret;
	int order = get_order(size);
	u64 dma_mask = DMA_BIT_MASK(32);
	phys_addr_t phys;
	dma_addr_t dev_addr;

	/*
	 * Ignore region specifiers - the kernel's ideas of
	 * pseudo-phys memory layout has nothing to do with the
	 * machine physical layout.  We can't allocate highmem
	 * because we can't return a pointer to it.
	 */
	flags &= ~(__GFP_DMA | __GFP_HIGHMEM);

	/* Convert the size to actually allocated. */
	size = 1UL << (order + XEN_PAGE_SHIFT);

	/* On ARM this function returns an ioremap'ped virtual address for
	 * which virt_to_phys doesn't return the corresponding physical
	 * address. In fact on ARM virt_to_phys only works for kernel direct
	 * mapped RAM memory. Also see comment below.
	 */
	ret = xen_alloc_coherent_pages(hwdev, size, dma_handle, flags, attrs);

	if (!ret)
		return ret;

	if (hwdev && hwdev->coherent_dma_mask)
		dma_mask = hwdev->coherent_dma_mask;

	/* At this point dma_handle is the physical address, next we are
	 * going to set it to the machine address.
	 * Do not use virt_to_phys(ret) because on ARM it doesn't correspond
	 * to *dma_handle. */
	phys = *dma_handle;
	dev_addr = xen_phys_to_bus(phys);
	if (((dev_addr + size - 1 <= dma_mask)) &&
	    !range_straddles_page_boundary(phys, size))
		*dma_handle = dev_addr;
	else {
		if (xen_create_contiguous_region(phys, order,
						 fls64(dma_mask), dma_handle) != 0) {
			xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs);
			return NULL;
		}
		SetPageXenRemapped(virt_to_page(ret));
	}
	memset(ret, 0, size);
	return ret;
}

static void
xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
			  dma_addr_t dev_addr, unsigned long attrs)
{
	int order = get_order(size);
	phys_addr_t phys;
	u64 dma_mask = DMA_BIT_MASK(32);

	if (hwdev && hwdev->coherent_dma_mask)
		dma_mask = hwdev->coherent_dma_mask;

	/* do not use virt_to_phys because on ARM it doesn't return you the
	 * physical address */
	phys = xen_bus_to_phys(dev_addr);

	/* Convert the size to actually allocated. */
	size = 1UL << (order + XEN_PAGE_SHIFT);

	if (!WARN_ON((dev_addr + size - 1 > dma_mask) ||
		     range_straddles_page_boundary(phys, size)) &&
	    TestClearPageXenRemapped(virt_to_page(vaddr)))
		xen_destroy_contiguous_region(phys, order);

	xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs);
}
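/*
 * Illustrative only: drivers never call the two helpers above directly,
 * they reach them through the generic DMA API.  "my_dev" and "len" below
 * are placeholders, not names used anywhere in this file:
 *
 *	dma_addr_t bus;
 *	void *cpu = dma_alloc_coherent(my_dev, len, &bus, GFP_KERNEL);
 *
 *	if (!cpu)
 *		return -ENOMEM;
 *	...
 *	dma_free_coherent(my_dev, len, cpu, bus);
 */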
/*
 * Map a single buffer of the indicated size for DMA in streaming mode. The
 * physical address to use is returned.
 *
 * Once the device is given the dma address, the device owns this memory until
 * either xen_swiotlb_unmap_page or xen_swiotlb_dma_sync_single is performed.
 */
static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
				       unsigned long offset, size_t size,
				       enum dma_data_direction dir,
				       unsigned long attrs)
{
	phys_addr_t map, phys = page_to_phys(page) + offset;
	dma_addr_t dev_addr = xen_phys_to_bus(phys);

	BUG_ON(dir == DMA_NONE);
	/*
	 * If the address happens to be in the device's DMA window,
	 * we can safely return the device addr and not worry about bounce
	 * buffering it.
	 */
	if (dma_capable(dev, dev_addr, size) &&
	    !range_straddles_page_boundary(phys, size) &&
	    !xen_arch_need_swiotlb(dev, phys, dev_addr) &&
	    swiotlb_force != SWIOTLB_FORCE)
		goto done;

	/*
	 * Oh well, have to allocate and map a bounce buffer.
	 */
	trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force);

	map = swiotlb_tbl_map_single(dev, start_dma_addr, phys,
				     size, size, dir, attrs);
	if (map == (phys_addr_t)DMA_MAPPING_ERROR)
		return DMA_MAPPING_ERROR;

	phys = map;
	dev_addr = xen_phys_to_bus(map);

	/*
	 * Ensure that the address returned is DMA'ble
	 */
	if (unlikely(!dma_capable(dev, dev_addr, size))) {
		swiotlb_tbl_unmap_single(dev, map, size, size, dir,
					 attrs | DMA_ATTR_SKIP_CPU_SYNC);
		return DMA_MAPPING_ERROR;
	}

done:
	if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
		xen_dma_sync_for_device(dev, dev_addr, phys, size, dir);
	return dev_addr;
}

/*
 * Unmap a single streaming mode DMA translation. The dma_addr and size must
 * match what was provided for in a previous xen_swiotlb_map_page call. All
 * other usages are undefined.
 *
 * After this call, reads by the cpu to the buffer are guaranteed to see
 * whatever the device wrote there.
 */
static void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
	phys_addr_t paddr = xen_bus_to_phys(dev_addr);

	BUG_ON(dir == DMA_NONE);

	if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
		xen_dma_sync_for_cpu(hwdev, dev_addr, paddr, size, dir);

	/* NOTE: We use dev_addr here, not paddr! */
	if (is_xen_swiotlb_buffer(dev_addr))
		swiotlb_tbl_unmap_single(hwdev, paddr, size, size, dir, attrs);
}

static void
xen_swiotlb_sync_single_for_cpu(struct device *dev, dma_addr_t dma_addr,
		size_t size, enum dma_data_direction dir)
{
	phys_addr_t paddr = xen_bus_to_phys(dma_addr);

	if (!dev_is_dma_coherent(dev))
		xen_dma_sync_for_cpu(dev, dma_addr, paddr, size, dir);

	if (is_xen_swiotlb_buffer(dma_addr))
		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
}

static void
xen_swiotlb_sync_single_for_device(struct device *dev, dma_addr_t dma_addr,
		size_t size, enum dma_data_direction dir)
{
	phys_addr_t paddr = xen_bus_to_phys(dma_addr);

	if (is_xen_swiotlb_buffer(dma_addr))
		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);

	if (!dev_is_dma_coherent(dev))
		xen_dma_sync_for_device(dev, dma_addr, paddr, size, dir);
}
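/*
 * Note the ordering in the two sync helpers above: when syncing for the
 * CPU the (non-coherent) architecture cache maintenance runs before the
 * bounce buffer is copied back to the original buffer, while when syncing
 * for the device the bounce buffer is refilled first and the cache
 * maintenance runs last, so the device observes the freshly written data.
 */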
/*
 * Unmap a set of streaming mode DMA translations. Again, cpu read rules
 * concerning calls here are the same as for swiotlb_unmap_page() above.
 */
static void
xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
		enum dma_data_direction dir, unsigned long attrs)
{
	struct scatterlist *sg;
	int i;

	BUG_ON(dir == DMA_NONE);

	for_each_sg(sgl, sg, nelems, i)
		xen_swiotlb_unmap_page(hwdev, sg->dma_address, sg_dma_len(sg),
				       dir, attrs);

}

static int
xen_swiotlb_map_sg(struct device *dev, struct scatterlist *sgl, int nelems,
		enum dma_data_direction dir, unsigned long attrs)
{
	struct scatterlist *sg;
	int i;

	BUG_ON(dir == DMA_NONE);

	for_each_sg(sgl, sg, nelems, i) {
		sg->dma_address = xen_swiotlb_map_page(dev, sg_page(sg),
				sg->offset, sg->length, dir, attrs);
		if (sg->dma_address == DMA_MAPPING_ERROR)
			goto out_unmap;
		sg_dma_len(sg) = sg->length;
	}

	return nelems;
out_unmap:
	xen_swiotlb_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
	sg_dma_len(sgl) = 0;
	return 0;
}

static void
xen_swiotlb_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
			    int nelems, enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sgl, sg, nelems, i) {
		xen_swiotlb_sync_single_for_cpu(dev, sg->dma_address,
						sg->length, dir);
	}
}

static void
xen_swiotlb_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
			       int nelems, enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sgl, sg, nelems, i) {
		xen_swiotlb_sync_single_for_device(dev, sg->dma_address,
						   sg->length, dir);
	}
}

/*
 * Return whether the given device DMA address mask can be supported
 * properly. For example, if your device can only drive the low 24-bits
 * during bus mastering, then you would pass 0x00ffffff as the mask to
 * this function.
 */
static int
xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)
{
	return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask;
}

const struct dma_map_ops xen_swiotlb_dma_ops = {
	.alloc = xen_swiotlb_alloc_coherent,
	.free = xen_swiotlb_free_coherent,
	.sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
	.sync_single_for_device = xen_swiotlb_sync_single_for_device,
	.sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
	.sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
	.map_sg = xen_swiotlb_map_sg,
	.unmap_sg = xen_swiotlb_unmap_sg,
	.map_page = xen_swiotlb_map_page,
	.unmap_page = xen_swiotlb_unmap_page,
	.dma_supported = xen_swiotlb_dma_supported,
	.mmap = dma_common_mmap,
	.get_sgtable = dma_common_get_sgtable,
};
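/*
 * Illustrative only: a PV guest driver never touches xen_swiotlb_dma_ops
 * directly; once these ops are installed for a device, an ordinary DMA API
 * call is all it takes to go through the map/unmap paths above.  "my_dev",
 * "my_page" and "len" are placeholders:
 *
 *	dma_addr_t handle = dma_map_page(my_dev, my_page, 0, len,
 *					 DMA_TO_DEVICE);
 *
 *	if (dma_mapping_error(my_dev, handle))
 *		return -ENOMEM;
 *	...
 *	dma_unmap_page(my_dev, handle, len, DMA_TO_DEVICE);
 */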