// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright 2010
 *  by Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
 *
 * This code provides an IOMMU for Xen PV guests with PCI passthrough.
 *
 * PV guests under Xen run on a non-contiguous memory architecture.
 *
 * When PCI pass-through is utilized, this necessitates an IOMMU for
 * translating bus (DMA) addresses to virtual addresses and vice versa,
 * as well as a mechanism to obtain contiguous pages for device driver
 * operations (say, DMA operations).
 *
 * Specifically, under Xen the Linux idea of pages is an illusion. It
 * assumes that pages start at zero and go up to the available memory. To
 * help with that, the Linux Xen MMU provides a lookup mechanism to
 * translate page frame numbers (PFNs) to machine frame numbers (MFNs)
 * and vice versa. The MFNs are the "real" frame numbers. Furthermore,
 * memory is not contiguous: the Xen hypervisor stitches memory for guests
 * from different pools, which means there is no guarantee that PFN==MFN
 * and PFN+1==MFN+1. Lastly, with Xen 4.0, pages (in debug mode) are
 * allocated in descending order (high to low), meaning the guest might
 * never get any MFNs under the 4GB mark.
 */

#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt

#include <linux/memblock.h>
#include <linux/dma-direct.h>
#include <linux/export.h>
#include <xen/swiotlb-xen.h>
#include <xen/page.h>
#include <xen/xen-ops.h>
#include <xen/hvc-console.h>

#include <asm/dma-mapping.h>
#include <asm/xen/page-coherent.h>

#include <trace/events/swiotlb.h>
/*
 * Used to do a quick range check in swiotlb_tbl_unmap_single and
 * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
 * API.
 */

static char *xen_io_tlb_start, *xen_io_tlb_end;
static unsigned long xen_io_tlb_nslabs;
/*
 * Quick lookup value of the bus address of the IOTLB.
 */

static u64 start_dma_addr;

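/*
 * Worked example of the address translation done by the helpers below
 * (illustrative only; the bfn value is made up and XEN_PAGE_SHIFT is
 * assumed to be 12):
 *
 *	paddr                = 0x12345678
 *	XEN_PFN_DOWN(paddr)  = 0x12345       (guest frame number)
 *	pfn_to_bfn(0x12345)  = 0xabcde       (machine/bus frame number)
 *	bus address          = (0xabcde << 12) | 0x678 = 0xabcde678
 *
 * Only the frame number is translated; the offset within the 4KB Xen
 * page is carried over unchanged.  xen_bus_to_phys() is the inverse,
 * using bfn_to_pfn().
 */
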
/*
 * Both of these functions should avoid XEN_PFN_PHYS because phys_addr_t
 * can be 32bit when dma_addr_t is 64bit leading to a loss in
 * information if the shift is done before casting to 64bit.
 */
static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
{
	unsigned long bfn = pfn_to_bfn(XEN_PFN_DOWN(paddr));
	dma_addr_t dma = (dma_addr_t)bfn << XEN_PAGE_SHIFT;

	dma |= paddr & ~XEN_PAGE_MASK;

	return dma;
}

static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
{
	unsigned long xen_pfn = bfn_to_pfn(XEN_PFN_DOWN(baddr));
	dma_addr_t dma = (dma_addr_t)xen_pfn << XEN_PAGE_SHIFT;
	phys_addr_t paddr = dma;

	paddr |= baddr & ~XEN_PAGE_MASK;

	return paddr;
}

static inline dma_addr_t xen_virt_to_bus(void *address)
{
	return xen_phys_to_bus(virt_to_phys(address));
}

static int check_pages_physically_contiguous(unsigned long xen_pfn,
					     unsigned int offset,
					     size_t length)
{
	unsigned long next_bfn;
	int i;
	int nr_pages;

	next_bfn = pfn_to_bfn(xen_pfn);
	nr_pages = (offset + length + XEN_PAGE_SIZE - 1) >> XEN_PAGE_SHIFT;

	for (i = 1; i < nr_pages; i++) {
		if (pfn_to_bfn(++xen_pfn) != ++next_bfn)
			return 0;
	}
	return 1;
}

static inline int range_straddles_page_boundary(phys_addr_t p, size_t size)
{
	unsigned long xen_pfn = XEN_PFN_DOWN(p);
	unsigned int offset = p & ~XEN_PAGE_MASK;

	if (offset + size <= XEN_PAGE_SIZE)
		return 0;
	if (check_pages_physically_contiguous(xen_pfn, offset, size))
		return 0;
	return 1;
}

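/*
 * Illustrative example (hypothetical frame numbers): a 256-byte buffer
 * starting at offset 0xf80 of a Xen page spills 0x80 bytes into the next
 * page, so two guest frames are involved.  If pfn_to_bfn(pfn + 1) is not
 * pfn_to_bfn(pfn) + 1, the buffer is not machine-contiguous and
 * range_straddles_page_boundary() returns 1; callers such as
 * xen_swiotlb_map_page() then fall back to bounce buffering.
 */
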
static int is_xen_swiotlb_buffer(dma_addr_t dma_addr)
{
	unsigned long bfn = XEN_PFN_DOWN(dma_addr);
	unsigned long xen_pfn = bfn_to_local_pfn(bfn);
	phys_addr_t paddr = XEN_PFN_PHYS(xen_pfn);

	/* If the address is outside our domain, it CAN
	 * have the same virtual address as another address
	 * in our domain. Therefore _only_ check addresses within our domain.
	 */
	if (pfn_valid(PFN_DOWN(paddr))) {
		return paddr >= virt_to_phys(xen_io_tlb_start) &&
		       paddr < virt_to_phys(xen_io_tlb_end);
	}
	return 0;
}

static int max_dma_bits = 32;

static int
xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
{
	int i, rc;
	int dma_bits;
	dma_addr_t dma_handle;
	phys_addr_t p = virt_to_phys(buf);

	dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;

	i = 0;
	do {
		int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE);

		do {
			rc = xen_create_contiguous_region(
				p + (i << IO_TLB_SHIFT),
				get_order(slabs << IO_TLB_SHIFT),
				dma_bits, &dma_handle);
		} while (rc && dma_bits++ < max_dma_bits);
		if (rc)
			return rc;

		i += slabs;
	} while (i < nslabs);
	return 0;
}

static unsigned long xen_set_nslabs(unsigned long nr_tbl)
{
	if (!nr_tbl) {
		xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT);
		xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE);
	} else
		xen_io_tlb_nslabs = nr_tbl;

	return xen_io_tlb_nslabs << IO_TLB_SHIFT;
}

enum xen_swiotlb_err {
	XEN_SWIOTLB_UNKNOWN = 0,
	XEN_SWIOTLB_ENOMEM,
	XEN_SWIOTLB_EFIXUP
};

static const char *xen_swiotlb_error(enum xen_swiotlb_err err)
{
	switch (err) {
	case XEN_SWIOTLB_ENOMEM:
		return "Cannot allocate Xen-SWIOTLB buffer\n";
	case XEN_SWIOTLB_EFIXUP:
		return "Failed to get contiguous memory for DMA from Xen!\n"
		       "You either: don't have the permissions, do not have"
		       " enough free memory under 4GB, or the hypervisor memory"
		       " is too fragmented!";
	default:
		break;
	}
	return "";
}

int __ref xen_swiotlb_init(int verbose, bool early)
{
	unsigned long bytes, order;
	int rc = -ENOMEM;
	enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN;
	unsigned int repeat = 3;

	xen_io_tlb_nslabs = swiotlb_nr_tbl();
retry:
	bytes = xen_set_nslabs(xen_io_tlb_nslabs);
	order = get_order(xen_io_tlb_nslabs << IO_TLB_SHIFT);
	/*
	 * Get IO TLB memory from any location.
	 */
	if (early) {
		xen_io_tlb_start = memblock_alloc(PAGE_ALIGN(bytes),
						  PAGE_SIZE);
		if (!xen_io_tlb_start)
			panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
			      __func__, PAGE_ALIGN(bytes), PAGE_SIZE);
	} else {
#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
		while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
			xen_io_tlb_start = (void *)xen_get_swiotlb_free_pages(order);
			if (xen_io_tlb_start)
				break;
			order--;
		}
		if (order != get_order(bytes)) {
			pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n",
				(PAGE_SIZE << order) >> 20);
			xen_io_tlb_nslabs = SLABS_PER_PAGE << order;
			bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT;
		}
	}
	if (!xen_io_tlb_start) {
		m_ret = XEN_SWIOTLB_ENOMEM;
		goto error;
	}
	xen_io_tlb_end = xen_io_tlb_start + bytes;
	/*
	 * And replace that memory with pages under 4GB.
	 */
	rc = xen_swiotlb_fixup(xen_io_tlb_start,
			       bytes,
			       xen_io_tlb_nslabs);
	if (rc) {
		if (early)
			memblock_free(__pa(xen_io_tlb_start),
				      PAGE_ALIGN(bytes));
		else {
			free_pages((unsigned long)xen_io_tlb_start, order);
			xen_io_tlb_start = NULL;
		}
		m_ret = XEN_SWIOTLB_EFIXUP;
		goto error;
	}
	start_dma_addr = xen_virt_to_bus(xen_io_tlb_start);
	if (early) {
		if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs,
					  verbose))
			panic("Cannot allocate SWIOTLB buffer");
		rc = 0;
	} else
		rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs);

	if (!rc)
		swiotlb_set_max_segment(PAGE_SIZE);

	return rc;
error:
	if (repeat--) {
		xen_io_tlb_nslabs = max(1024UL, /* Min is 2MB */
					(xen_io_tlb_nslabs >> 1));
		pr_info("Lowering to %luMB\n",
			(xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20);
		goto retry;
	}
	pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc);
	if (early)
		panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc);
	else
		free_pages((unsigned long)xen_io_tlb_start, order);
	return rc;
}

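/*
 * Usage sketch (illustrative; the real call sites live in per-arch Xen
 * setup code and may differ between kernel versions): a PV domain that
 * needs bounce buffers typically initializes this during early boot with
 *
 *	xen_swiotlb_init(1, true);	// verbose, early: buffer from memblock
 *
 * while a late caller would use
 *
 *	xen_swiotlb_init(1, false);	// buffer from the page allocator
 *
 * On failure the function retries with a buffer half the size, down to a
 * 2MB minimum, before giving up (and panicking if called early).
 */
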
static void *
xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
			   dma_addr_t *dma_handle, gfp_t flags,
			   unsigned long attrs)
{
	void *ret;
	int order = get_order(size);
	u64 dma_mask = DMA_BIT_MASK(32);
	phys_addr_t phys;
	dma_addr_t dev_addr;

	/*
	 * Ignore region specifiers - the kernel's idea of a
	 * pseudo-phys memory layout has nothing to do with the
	 * machine physical layout.  We can't allocate highmem
	 * because we can't return a pointer to it.
	 */
	flags &= ~(__GFP_DMA | __GFP_HIGHMEM);

	/* Convert the size to the size actually allocated. */
	size = 1UL << (order + XEN_PAGE_SHIFT);

	/* On ARM this function returns an ioremap'ped virtual address for
	 * which virt_to_phys doesn't return the corresponding physical
	 * address. In fact on ARM virt_to_phys only works for kernel direct
	 * mapped RAM memory. Also see comment below.
	 */
	ret = xen_alloc_coherent_pages(hwdev, size, dma_handle, flags, attrs);

	if (!ret)
		return ret;

	if (hwdev && hwdev->coherent_dma_mask)
		dma_mask = hwdev->coherent_dma_mask;

	/* At this point dma_handle is the physical address, next we are
	 * going to set it to the machine address.
	 * Do not use virt_to_phys(ret) because on ARM it doesn't correspond
	 * to *dma_handle. */
	phys = *dma_handle;
	dev_addr = xen_phys_to_bus(phys);
	if (((dev_addr + size - 1 <= dma_mask)) &&
	    !range_straddles_page_boundary(phys, size))
		*dma_handle = dev_addr;
	else {
		if (xen_create_contiguous_region(phys, order,
						 fls64(dma_mask), dma_handle) != 0) {
			xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs);
			return NULL;
		}
	}
	memset(ret, 0, size);
	return ret;
}

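/*
 * Hypothetical caller's view (sketch): drivers do not call the function
 * above directly, they go through the generic DMA API, e.g.
 *
 *	dma_addr_t handle;
 *	void *cpu = dma_alloc_coherent(dev, size, &handle, GFP_KERNEL);
 *
 * which dispatches to .alloc (xen_swiotlb_alloc_coherent) once
 * xen_swiotlb_dma_ops is installed for the device; "handle" then holds a
 * machine (bus) address the hardware can use, possibly after the region
 * was exchanged for machine-contiguous memory below the coherent mask.
 */
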
static void
xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
			  dma_addr_t dev_addr, unsigned long attrs)
{
	int order = get_order(size);
	phys_addr_t phys;
	u64 dma_mask = DMA_BIT_MASK(32);

	if (hwdev && hwdev->coherent_dma_mask)
		dma_mask = hwdev->coherent_dma_mask;

	/* Do not use virt_to_phys because on ARM it doesn't return the
	 * physical address. */
	phys = xen_bus_to_phys(dev_addr);

	/* Convert the size to the size actually allocated. */
	size = 1UL << (order + XEN_PAGE_SHIFT);

	if (((dev_addr + size - 1 <= dma_mask)) ||
	    range_straddles_page_boundary(phys, size))
		xen_destroy_contiguous_region(phys, order);

	xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs);
}

/*
 * Map a single buffer of the indicated size for DMA in streaming mode.  The
 * bus (DMA) address to use is returned.
 *
 * Once the device is given the dma address, the device owns this memory until
 * either xen_swiotlb_unmap_page() or one of the xen_swiotlb_sync_single_for_*
 * calls is performed.
 */
static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
				       unsigned long offset, size_t size,
				       enum dma_data_direction dir,
				       unsigned long attrs)
{
	phys_addr_t map, phys = page_to_phys(page) + offset;
	dma_addr_t dev_addr = xen_phys_to_bus(phys);

	BUG_ON(dir == DMA_NONE);
	/*
	 * If the address happens to be in the device's DMA window,
	 * we can safely return the device addr and not worry about bounce
	 * buffering it.
	 */
	if (dma_capable(dev, dev_addr, size) &&
	    !range_straddles_page_boundary(phys, size) &&
	    !xen_arch_need_swiotlb(dev, phys, dev_addr) &&
	    swiotlb_force != SWIOTLB_FORCE)
		goto done;

	/*
	 * Oh well, have to allocate and map a bounce buffer.
	 */
	trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force);

	map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir,
				     attrs);
	if (map == DMA_MAPPING_ERROR)
		return DMA_MAPPING_ERROR;

	dev_addr = xen_phys_to_bus(map);

	/*
	 * Ensure that the address returned is DMA'ble.
	 */
	if (unlikely(!dma_capable(dev, dev_addr, size))) {
		swiotlb_tbl_unmap_single(dev, map, size, dir,
					 attrs | DMA_ATTR_SKIP_CPU_SYNC);
		return DMA_MAPPING_ERROR;
	}

	page = pfn_to_page(map >> PAGE_SHIFT);
	offset = map & ~PAGE_MASK;
done:
	/*
	 * We are not interested in the dma_addr returned by xen_dma_map_page,
	 * only in the potential cache flushes executed by the function.
	 */
	xen_dma_map_page(dev, page, dev_addr, offset, size, dir, attrs);
	return dev_addr;
}

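/*
 * Hypothetical caller's view (sketch) of the streaming path above:
 *
 *	dma_addr_t bus = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
 *
 *	if (dma_mapping_error(dev, bus))
 *		return -ENOMEM;
 *	... program the device with "bus", wait for completion ...
 *	dma_unmap_single(dev, bus, len, DMA_TO_DEVICE);
 *
 * dma_map_single() resolves to .map_page (xen_swiotlb_map_page) when
 * xen_swiotlb_dma_ops is in use, so buffers that are not machine-contiguous
 * or not DMA-capable are transparently bounced through the swiotlb buffer
 * set up in xen_swiotlb_init().
 */
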
/*
 * Unmap a single streaming mode DMA translation.  The dma_addr and size must
 * match what was provided in a previous xen_swiotlb_map_page() call.  All
 * other usages are undefined.
 *
 * After this call, reads by the cpu to the buffer are guaranteed to see
 * whatever the device wrote there.
 */
static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
			     size_t size, enum dma_data_direction dir,
			     unsigned long attrs)
{
	phys_addr_t paddr = xen_bus_to_phys(dev_addr);

	BUG_ON(dir == DMA_NONE);

	xen_dma_unmap_page(hwdev, dev_addr, size, dir, attrs);

	/* NOTE: We use dev_addr here, not paddr! */
	if (is_xen_swiotlb_buffer(dev_addr))
		swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs);
}

static void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
				   size_t size, enum dma_data_direction dir,
				   unsigned long attrs)
{
	xen_unmap_single(hwdev, dev_addr, size, dir, attrs);
}

static void
xen_swiotlb_sync_single_for_cpu(struct device *dev, dma_addr_t dma_addr,
				size_t size, enum dma_data_direction dir)
{
	phys_addr_t paddr = xen_bus_to_phys(dma_addr);

	xen_dma_sync_single_for_cpu(dev, dma_addr, size, dir);

	if (is_xen_swiotlb_buffer(dma_addr))
		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
}

static void
xen_swiotlb_sync_single_for_device(struct device *dev, dma_addr_t dma_addr,
				   size_t size, enum dma_data_direction dir)
{
	phys_addr_t paddr = xen_bus_to_phys(dma_addr);

	if (is_xen_swiotlb_buffer(dma_addr))
		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);

	xen_dma_sync_single_for_device(dev, dma_addr, size, dir);
}

/*
 * Unmap a set of streaming mode DMA translations.  Again, cpu read rules
 * concerning calls here are the same as for xen_swiotlb_unmap_page() above.
 */
static void
xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
		     enum dma_data_direction dir, unsigned long attrs)
{
	struct scatterlist *sg;
	int i;

	BUG_ON(dir == DMA_NONE);

	for_each_sg(sgl, sg, nelems, i)
		xen_unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, attrs);
}

static int
xen_swiotlb_map_sg(struct device *dev, struct scatterlist *sgl, int nelems,
		   enum dma_data_direction dir, unsigned long attrs)
{
	struct scatterlist *sg;
	int i;

	BUG_ON(dir == DMA_NONE);

	for_each_sg(sgl, sg, nelems, i) {
		sg->dma_address = xen_swiotlb_map_page(dev, sg_page(sg),
				sg->offset, sg->length, dir, attrs);
		if (sg->dma_address == DMA_MAPPING_ERROR)
			goto out_unmap;
		sg_dma_len(sg) = sg->length;
	}

	return nelems;
out_unmap:
	xen_swiotlb_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
	sg_dma_len(sgl) = 0;
	return 0;
}

static void
xen_swiotlb_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
			    int nelems, enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sgl, sg, nelems, i) {
		xen_swiotlb_sync_single_for_cpu(dev, sg->dma_address,
						sg->length, dir);
	}
}

static void
xen_swiotlb_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
			       int nelems, enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sgl, sg, nelems, i) {
		xen_swiotlb_sync_single_for_device(dev, sg->dma_address,
						   sg->length, dir);
	}
}

/*
 * Return whether the given device DMA address mask can be supported
 * properly.  For example, if your device can only drive the low 24-bits
 * during bus mastering, then you would pass 0x00ffffff as the mask to
 * this function.
 */
static int
xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)
{
	return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask;
}

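/*
 * Example (illustrative): a device limited to 32-bit addressing calls
 *
 *	dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
 *
 * which ends up in .dma_supported above; the mask is accepted only if the
 * last byte of the bounce buffer (xen_io_tlb_end - 1) has a machine
 * address within that mask, since every bounced transfer must be
 * reachable by the device.
 */
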
/*
 * Create userspace mapping for the DMA-coherent memory.
 * This function should be called with the pages from the current domain only;
 * passing pages mapped from other domains would lead to memory corruption.
 */
static int
xen_swiotlb_dma_mmap(struct device *dev, struct vm_area_struct *vma,
		     void *cpu_addr, dma_addr_t dma_addr, size_t size,
		     unsigned long attrs)
{
#ifdef CONFIG_ARM
	if (xen_get_dma_ops(dev)->mmap)
		return xen_get_dma_ops(dev)->mmap(dev, vma, cpu_addr,
						  dma_addr, size, attrs);
#endif
	return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
}

/*
 * This function should be called with the pages from the current domain only;
 * passing pages mapped from other domains would lead to memory corruption.
 */
static int
xen_swiotlb_get_sgtable(struct device *dev, struct sg_table *sgt,
			void *cpu_addr, dma_addr_t handle, size_t size,
			unsigned long attrs)
{
#ifdef CONFIG_ARM
	if (xen_get_dma_ops(dev)->get_sgtable) {
#if 0
	/*
	 * This check verifies that the page belongs to the current domain and
	 * is not one mapped from another domain.
	 * This check is for debug only, and should not go into production
	 * builds.
	 */
		unsigned long bfn = PHYS_PFN(dma_to_phys(dev, handle));
		BUG_ON(!page_is_ram(bfn));
#endif
		return xen_get_dma_ops(dev)->get_sgtable(dev, sgt, cpu_addr,
							 handle, size, attrs);
	}
#endif
	return dma_common_get_sgtable(dev, sgt, cpu_addr, handle, size, attrs);
}

const struct dma_map_ops xen_swiotlb_dma_ops = {
	.alloc = xen_swiotlb_alloc_coherent,
	.free = xen_swiotlb_free_coherent,
	.sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
	.sync_single_for_device = xen_swiotlb_sync_single_for_device,
	.sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
	.sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
	.map_sg = xen_swiotlb_map_sg,
	.unmap_sg = xen_swiotlb_unmap_sg,
	.map_page = xen_swiotlb_map_page,
	.unmap_page = xen_swiotlb_unmap_page,
	.dma_supported = xen_swiotlb_dma_supported,
	.mmap = xen_swiotlb_dma_mmap,
	.get_sgtable = xen_swiotlb_get_sgtable,
};

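/*
 * Installation sketch (illustrative; the exact wiring lives in arch code
 * and varies between kernel versions): on x86 PV the Xen swiotlb setup
 * path does roughly
 *
 *	xen_swiotlb_init(1, true);
 *	dma_ops = &xen_swiotlb_dma_ops;
 *
 * so that DMA API calls (dma_map_*, dma_alloc_coherent(), dma_sync_*) for
 * devices without their own per-device dma_map_ops are routed to the
 * callbacks in the table above.  ARM/ARM64 guests install the same table
 * through their own Xen initialization code.
 */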