/*
 * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
 *
 * Rewrite, cleanup, new allocation schemes, virtual merging:
 * Copyright (C) 2004 Olof Johansson, IBM Corporation
 *               and  Ben. Herrenschmidt, IBM Corporation
 *
 * Dynamic DMA mapping support, bus-independent parts.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */


#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/dma-mapping.h>
#include <linux/bitops.h>
#include <linux/iommu-helper.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/iommu.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/kdump.h>

#define DBG(...)

#ifdef CONFIG_IOMMU_VMERGE
static int novmerge = 0;
#else
static int novmerge = 1;
#endif

static int protect4gb = 1;

static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);

static inline unsigned long iommu_num_pages(unsigned long vaddr,
					    unsigned long slen)
{
	unsigned long npages;

	npages = IOMMU_PAGE_ALIGN(vaddr + slen) - (vaddr & IOMMU_PAGE_MASK);
	npages >>= IOMMU_PAGE_SHIFT;

	return npages;
}

static int __init setup_protect4gb(char *str)
{
	if (strcmp(str, "on") == 0)
		protect4gb = 1;
	else if (strcmp(str, "off") == 0)
		protect4gb = 0;

	return 1;
}

static int __init setup_iommu(char *str)
{
	if (!strcmp(str, "novmerge"))
		novmerge = 1;
	else if (!strcmp(str, "vmerge"))
		novmerge = 0;
	return 1;
}

__setup("protect4gb=", setup_protect4gb);
__setup("iommu=", setup_iommu);
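
/*
 * Boot-time usage of the two __setup() handlers above (illustrative
 * examples only, based on what setup_protect4gb() and setup_iommu()
 * accept on the kernel command line):
 *
 *	protect4gb=on | protect4gb=off
 *	iommu=vmerge  | iommu=novmerge
 *
 * "iommu=novmerge" disables virtual merging of scatterlist entries,
 * overriding the CONFIG_IOMMU_VMERGE default.
 */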

static unsigned long iommu_range_alloc(struct device *dev,
				       struct iommu_table *tbl,
				       unsigned long npages,
				       unsigned long *handle,
				       unsigned long mask,
				       unsigned int align_order)
{
	unsigned long n, end, start;
	unsigned long limit;
	int largealloc = npages > 15;
	int pass = 0;
	unsigned long align_mask;
	unsigned long boundary_size;

	/* 2^align_order - 1 (avoids an undefined shift by 64 when
	 * align_order is 0) */
	align_mask = (1ull << align_order) - 1;

	/* This allocator was derived from x86_64's bit string search */

	/* Sanity check */
	if (unlikely(npages == 0)) {
		if (printk_ratelimit())
			WARN_ON(1);
		return DMA_ERROR_CODE;
	}

	if (handle && *handle)
		start = *handle;
	else
		start = largealloc ? tbl->it_largehint : tbl->it_hint;

	/* Use only the lower part of the table, up to it_halfpoint, for
	 * small allocs (15 pages or less) */
	limit = largealloc ? tbl->it_size : tbl->it_halfpoint;

	if (largealloc && start < tbl->it_halfpoint)
		start = tbl->it_halfpoint;

	/* The case below can happen if we have a small segment appended
	 * to a large, or when the previous alloc was at the very end of
	 * the available space. If so, go back to the initial start.
	 */
	if (start >= limit)
		start = largealloc ? tbl->it_largehint : tbl->it_hint;

 again:

	if (limit + tbl->it_offset > mask) {
		limit = mask - tbl->it_offset + 1;
		/* If we're constrained on address range, first try
		 * at the masked hint to avoid O(n) search complexity,
		 * but on second pass, start at 0.
		 */
		if ((start & mask) >= limit || pass > 0)
			start = 0;
		else
			start &= mask;
	}

	if (dev)
		boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
				      1 << IOMMU_PAGE_SHIFT);
	else
		boundary_size = ALIGN(1UL << 32, 1 << IOMMU_PAGE_SHIFT);
	/* 4GB boundary for iseries_hv_alloc and iseries_hv_map */

	n = iommu_area_alloc(tbl->it_map, limit, start, npages,
			     tbl->it_offset, boundary_size >> IOMMU_PAGE_SHIFT,
			     align_mask);
	if (n == -1) {
		if (likely(pass < 2)) {
			/* First failure, rescan our part of the table from
			 * its start.  Second failure, try the other part
			 * of the table.
			 */
			start = (largealloc ^ pass) ? tbl->it_halfpoint : 0;
			limit = pass ? tbl->it_size : limit;
			pass++;
			goto again;
		} else {
			/* Third failure, give up */
			return DMA_ERROR_CODE;
		}
	}

	end = n + npages;

	/* Bump the hint to a new block for small allocs. */
	if (largealloc) {
		/* Don't bump to new block to avoid fragmentation */
		tbl->it_largehint = end;
	} else {
		/* Overflow will be taken care of at the next allocation */
		tbl->it_hint = (end + tbl->it_blocksize - 1) &
				~(tbl->it_blocksize - 1);
	}

	/* Update handle for SG allocations */
	if (handle)
		*handle = end;

	return n;
}
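
/*
 * Worked example of the split used by iommu_range_alloc() (numbers are
 * illustrative only): with it_size = 0x10000 entries, iommu_init_table()
 * below sets it_halfpoint = 0xc000, so an allocation of 15 pages or less
 * searches entries [0, 0xc000) while a 16-page-or-larger allocation
 * starts at 0xc000 and may use [0xc000, 0x10000).  Failed passes widen
 * the search to the other region before giving up.
 */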

static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
			      void *page, unsigned int npages,
			      enum dma_data_direction direction,
			      unsigned long mask, unsigned int align_order,
			      struct dma_attrs *attrs)
{
	unsigned long entry, flags;
	dma_addr_t ret = DMA_ERROR_CODE;
	int build_fail;

	spin_lock_irqsave(&(tbl->it_lock), flags);

	entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order);

	if (unlikely(entry == DMA_ERROR_CODE)) {
		spin_unlock_irqrestore(&(tbl->it_lock), flags);
		return DMA_ERROR_CODE;
	}

	entry += tbl->it_offset;	/* Offset into real TCE table */
	ret = entry << IOMMU_PAGE_SHIFT;	/* Set the return dma address */

	/* Put the TCEs in the HW table */
	build_fail = ppc_md.tce_build(tbl, entry, npages,
				      (unsigned long)page & IOMMU_PAGE_MASK,
				      direction, attrs);

	/* ppc_md.tce_build() only returns non-zero for transient errors.
	 * Clean up the table bitmap in this case and return
	 * DMA_ERROR_CODE. For all other errors the functionality is
	 * not altered.
	 */
	if (unlikely(build_fail)) {
		__iommu_free(tbl, ret, npages);

		spin_unlock_irqrestore(&(tbl->it_lock), flags);
		return DMA_ERROR_CODE;
	}

	/* Flush/invalidate TLB caches if necessary */
	if (ppc_md.tce_flush)
		ppc_md.tce_flush(tbl);

	spin_unlock_irqrestore(&(tbl->it_lock), flags);

	/* Make sure updates are seen by hardware */
	mb();

	return ret;
}

static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
			 unsigned int npages)
{
	unsigned long entry, free_entry;

	entry = dma_addr >> IOMMU_PAGE_SHIFT;
	free_entry = entry - tbl->it_offset;

	if (((free_entry + npages) > tbl->it_size) ||
	    (entry < tbl->it_offset)) {
		if (printk_ratelimit()) {
			printk(KERN_INFO "iommu_free: invalid entry\n");
			printk(KERN_INFO "\tentry = 0x%lx\n", entry);
			printk(KERN_INFO "\tdma_addr = 0x%lx\n", (u64)dma_addr);
			printk(KERN_INFO "\tTable = 0x%lx\n", (u64)tbl);
			printk(KERN_INFO "\tbus# = 0x%lx\n", (u64)tbl->it_busno);
			printk(KERN_INFO "\tsize = 0x%lx\n", (u64)tbl->it_size);
			printk(KERN_INFO "\tstartOff = 0x%lx\n", (u64)tbl->it_offset);
			printk(KERN_INFO "\tindex = 0x%lx\n", (u64)tbl->it_index);
			WARN_ON(1);
		}
		return;
	}

	ppc_md.tce_free(tbl, entry, npages);
	iommu_area_free(tbl->it_map, free_entry, npages);
}

static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
		       unsigned int npages)
{
	unsigned long flags;

	spin_lock_irqsave(&(tbl->it_lock), flags);

	__iommu_free(tbl, dma_addr, npages);

	/* Make sure TLB cache is flushed if the HW needs it. We do
	 * not do an mb() here on purpose, it is not needed on any of
	 * the current platforms.
	 */
	if (ppc_md.tce_flush)
		ppc_md.tce_flush(tbl);

	spin_unlock_irqrestore(&(tbl->it_lock), flags);
}
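
/*
 * iommu_map_sg() below maps a scatterlist.  When virtual merging is
 * enabled (novmerge == 0), consecutive scatterlist entries whose IOMMU
 * allocations come out contiguous are coalesced into a single DMA
 * segment.  Illustrative example (addresses made up): two 4K entries
 * mapped back to back at bus addresses 0x8000 and 0x9000 are reported
 * to the driver as one 8K segment at 0x8000, provided the merged length
 * does not exceed dma_get_max_seg_size(dev).
 */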

int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
		 struct scatterlist *sglist, int nelems,
		 unsigned long mask, enum dma_data_direction direction,
		 struct dma_attrs *attrs)
{
	dma_addr_t dma_next = 0, dma_addr;
	unsigned long flags;
	struct scatterlist *s, *outs, *segstart;
	int outcount, incount, i, build_fail = 0;
	unsigned int align;
	unsigned long handle;
	unsigned int max_seg_size;

	BUG_ON(direction == DMA_NONE);

	if ((nelems == 0) || !tbl)
		return 0;

	outs = s = segstart = &sglist[0];
	outcount = 1;
	incount = nelems;
	handle = 0;

	/* Init first segment length for backout at failure */
	outs->dma_length = 0;

	DBG("sg mapping %d elements:\n", nelems);

	spin_lock_irqsave(&(tbl->it_lock), flags);

	max_seg_size = dma_get_max_seg_size(dev);
	for_each_sg(sglist, s, nelems, i) {
		unsigned long vaddr, npages, entry, slen;

		slen = s->length;
		/* Sanity check */
		if (slen == 0) {
			dma_next = 0;
			continue;
		}
		/* Allocate iommu entries for that segment */
		vaddr = (unsigned long) sg_virt(s);
		npages = iommu_num_pages(vaddr, slen);
		align = 0;
		if (IOMMU_PAGE_SHIFT < PAGE_SHIFT && slen >= PAGE_SIZE &&
		    (vaddr & ~PAGE_MASK) == 0)
			align = PAGE_SHIFT - IOMMU_PAGE_SHIFT;
		entry = iommu_range_alloc(dev, tbl, npages, &handle,
					  mask >> IOMMU_PAGE_SHIFT, align);

		DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen);

		/* Handle failure */
		if (unlikely(entry == DMA_ERROR_CODE)) {
			if (printk_ratelimit())
				printk(KERN_INFO "iommu_alloc failed, tbl %p vaddr %lx"
				       " npages %lx\n", tbl, vaddr, npages);
			goto failure;
		}

		/* Convert entry to a dma_addr_t */
		entry += tbl->it_offset;
		dma_addr = entry << IOMMU_PAGE_SHIFT;
		dma_addr |= (s->offset & ~IOMMU_PAGE_MASK);

		DBG(" - %lu pages, entry: %lx, dma_addr: %lx\n",
			    npages, entry, dma_addr);

		/* Insert into HW table */
		build_fail = ppc_md.tce_build(tbl, entry, npages,
					      vaddr & IOMMU_PAGE_MASK,
					      direction, attrs);
		if (unlikely(build_fail))
			goto failure;

		/* If we are in an open segment, try merging */
		if (segstart != s) {
			DBG(" - trying merge...\n");
			/* We cannot merge if:
			 * - allocated dma_addr isn't contiguous to previous allocation
			 */
			if (novmerge || (dma_addr != dma_next) ||
			    (outs->dma_length + s->length > max_seg_size)) {
				/* Can't merge: create a new segment */
				segstart = s;
				outcount++;
				outs = sg_next(outs);
				DBG(" can't merge, new segment.\n");
			} else {
				outs->dma_length += s->length;
				DBG(" merged, new len: %x\n", outs->dma_length);
			}
		}

		if (segstart == s) {
			/* This is a new segment, fill entries */
			DBG(" - filling new segment.\n");
			outs->dma_address = dma_addr;
			outs->dma_length = slen;
		}

		/* Calculate next page pointer for contiguous check */
		dma_next = dma_addr + slen;

		DBG(" - dma next is: %lx\n", dma_next);
	}

	/* Flush/invalidate TLB caches if necessary */
	if (ppc_md.tce_flush)
		ppc_md.tce_flush(tbl);

	spin_unlock_irqrestore(&(tbl->it_lock), flags);

	DBG("mapped %d elements:\n", outcount);

	/* For the sake of iommu_unmap_sg, we clear out the length in the
	 * next entry of the sglist if we didn't fill the list completely
	 */
	if (outcount < incount) {
		outs = sg_next(outs);
		outs->dma_address = DMA_ERROR_CODE;
		outs->dma_length = 0;
	}

	/* Make sure updates are seen by hardware */
	mb();

	return outcount;

 failure:
	for_each_sg(sglist, s, nelems, i) {
		if (s->dma_length != 0) {
			unsigned long vaddr, npages;

			vaddr = s->dma_address & IOMMU_PAGE_MASK;
			npages = iommu_num_pages(s->dma_address, s->dma_length);
			__iommu_free(tbl, vaddr, npages);
			s->dma_address = DMA_ERROR_CODE;
			s->dma_length = 0;
		}
		if (s == outs)
			break;
	}
	spin_unlock_irqrestore(&(tbl->it_lock), flags);
	return 0;
}


void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
		    int nelems, enum dma_data_direction direction,
		    struct dma_attrs *attrs)
{
	struct scatterlist *sg;
	unsigned long flags;

	BUG_ON(direction == DMA_NONE);

	if (!tbl)
		return;

	spin_lock_irqsave(&(tbl->it_lock), flags);

	sg = sglist;
	while (nelems--) {
		unsigned int npages;
		dma_addr_t dma_handle = sg->dma_address;

		if (sg->dma_length == 0)
			break;
		npages = iommu_num_pages(dma_handle, sg->dma_length);
		__iommu_free(tbl, dma_handle, npages);
		sg = sg_next(sg);
	}

	/* Flush/invalidate TLBs if necessary. As for iommu_free(), we
	 * do not do an mb() here, the affected platforms do not need it
	 * when freeing.
	 */
	if (ppc_md.tce_flush)
		ppc_md.tce_flush(tbl);

	spin_unlock_irqrestore(&(tbl->it_lock), flags);
}
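
/*
 * Illustrative table setup (a sketch only; the field values shown here
 * are examples, not taken from this file): a platform would typically
 * fill in the geometry of its DMA window before handing the table to
 * iommu_init_table():
 *
 *	tbl->it_busno     = busno;
 *	tbl->it_offset    = window_base >> IOMMU_PAGE_SHIFT;
 *	tbl->it_size      = window_size >> IOMMU_PAGE_SHIFT;
 *	tbl->it_blocksize = 16;
 *	iommu_init_table(tbl, nid);
 *
 * it_blocksize only influences how far it_hint is bumped between small
 * allocations; it_busno and it_index are reported by the diagnostics in
 * __iommu_free().
 */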

/*
 * Build an iommu_table structure.  This contains a bitmap which
 * is used to manage allocation of the tce space.
 */
struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
{
	unsigned long sz;
	static int welcomed = 0;
	struct page *page;

	/* Set aside 1/4 of the table for large allocations. */
	tbl->it_halfpoint = tbl->it_size * 3 / 4;

	/* number of bytes needed for the bitmap */
	sz = (tbl->it_size + 7) >> 3;

	page = alloc_pages_node(nid, GFP_ATOMIC, get_order(sz));
	if (!page)
		panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
	tbl->it_map = page_address(page);
	memset(tbl->it_map, 0, sz);

	tbl->it_hint = 0;
	tbl->it_largehint = tbl->it_halfpoint;
	spin_lock_init(&tbl->it_lock);

#ifdef CONFIG_CRASH_DUMP
	if (ppc_md.tce_get) {
		unsigned long index;
		unsigned long tceval;
		unsigned long tcecount = 0;

		/*
		 * Reserve the existing mappings left by the first kernel.
		 */
		for (index = 0; index < tbl->it_size; index++) {
			tceval = ppc_md.tce_get(tbl, index + tbl->it_offset);
			/*
			 * Freed TCE entry contains 0x7fffffffffffffff on JS20
			 */
			if (tceval && (tceval != 0x7fffffffffffffffUL)) {
				__set_bit(index, tbl->it_map);
				tcecount++;
			}
		}
		if ((tbl->it_size - tcecount) < KDUMP_MIN_TCE_ENTRIES) {
			printk(KERN_WARNING "TCE table is full; ");
			printk(KERN_WARNING "freeing %d entries for the kdump boot\n",
				KDUMP_MIN_TCE_ENTRIES);
			for (index = tbl->it_size - KDUMP_MIN_TCE_ENTRIES;
				index < tbl->it_size; index++)
				__clear_bit(index, tbl->it_map);
		}
	}
#else
	/* Clear the hardware table in case firmware left allocations in it */
	ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size);
#endif

	if (!welcomed) {
		printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n",
		       novmerge ? "disabled" : "enabled");
		welcomed = 1;
	}

	return tbl;
}

void iommu_free_table(struct iommu_table *tbl, const char *node_name)
{
	unsigned long bitmap_sz, i;
	unsigned int order;

	if (!tbl || !tbl->it_map) {
		printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
				node_name);
		return;
	}

	/* verify that table contains no entries */
	/* it_size is in entries, and we're examining 64 at a time */
	for (i = 0; i < (tbl->it_size/64); i++) {
		if (tbl->it_map[i] != 0) {
			printk(KERN_WARNING "%s: Unexpected TCEs for %s\n",
				__func__, node_name);
			break;
		}
	}

	/* calculate bitmap size in bytes */
	bitmap_sz = (tbl->it_size + 7) / 8;

	/* free bitmap */
	order = get_order(bitmap_sz);
	free_pages((unsigned long) tbl->it_map, order);

	/* free table */
	kfree(tbl);
}
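
/*
 * Illustrative call sequence for the single-buffer API below (a sketch
 * only; the surrounding code, "dev_dma_mask" and the error handling are
 * assumptions, not code from this file):
 *
 *	dma_addr_t dma;
 *
 *	dma = iommu_map_single(dev, tbl, buf, len, dev_dma_mask,
 *			       DMA_TO_DEVICE, NULL);
 *	if (dma == DMA_ERROR_CODE)
 *		return -EIO;
 *	...
 *	iommu_unmap_single(tbl, dma, len, DMA_TO_DEVICE, NULL);
 *
 * The mask argument is the device's DMA addressing limit; iommu_map_single()
 * shifts it by IOMMU_PAGE_SHIFT before passing it to the range allocator.
 */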

/* Creates TCEs for a user provided buffer. The user buffer must be
 * contiguous real kernel storage (not vmalloc). The address of the buffer
 * passed here is the kernel (virtual) address of the buffer. The buffer
 * need not be page aligned; the dma_addr_t returned will point to the same
 * byte within the page as vaddr.
 */
dma_addr_t iommu_map_single(struct device *dev, struct iommu_table *tbl,
			    void *vaddr, size_t size, unsigned long mask,
			    enum dma_data_direction direction,
			    struct dma_attrs *attrs)
{
	dma_addr_t dma_handle = DMA_ERROR_CODE;
	unsigned long uaddr;
	unsigned int npages, align;

	BUG_ON(direction == DMA_NONE);

	uaddr = (unsigned long)vaddr;
	npages = iommu_num_pages(uaddr, size);

	if (tbl) {
		align = 0;
		if (IOMMU_PAGE_SHIFT < PAGE_SHIFT && size >= PAGE_SIZE &&
		    ((unsigned long)vaddr & ~PAGE_MASK) == 0)
			align = PAGE_SHIFT - IOMMU_PAGE_SHIFT;

		dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction,
					 mask >> IOMMU_PAGE_SHIFT, align,
					 attrs);
		if (dma_handle == DMA_ERROR_CODE) {
			if (printk_ratelimit()) {
				printk(KERN_INFO "iommu_alloc failed, "
				       "tbl %p vaddr %p npages %d\n",
				       tbl, vaddr, npages);
			}
		} else
			dma_handle |= (uaddr & ~IOMMU_PAGE_MASK);
	}

	return dma_handle;
}

void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle,
			size_t size, enum dma_data_direction direction,
			struct dma_attrs *attrs)
{
	unsigned int npages;

	BUG_ON(direction == DMA_NONE);

	if (tbl) {
		npages = iommu_num_pages(dma_handle, size);
		iommu_free(tbl, dma_handle, npages);
	}
}

/* Allocates a contiguous real buffer and creates mappings over it.
 * Returns the virtual address of the buffer and sets dma_handle
 * to the dma address (mapping) of the first page.
 */
void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
			   size_t size, dma_addr_t *dma_handle,
			   unsigned long mask, gfp_t flag, int node)
{
	void *ret = NULL;
	dma_addr_t mapping;
	unsigned int order;
	unsigned int nio_pages, io_order;
	struct page *page;

	size = PAGE_ALIGN(size);
	order = get_order(size);

	/*
	 * Client asked for way too much space. This is checked later
	 * anyway. It is easier to debug here for the drivers than in
	 * the tce tables.
	 */
	if (order >= IOMAP_MAX_ORDER) {
		printk(KERN_INFO "iommu_alloc_coherent: size too large: 0x%lx\n",
		       size);
		return NULL;
	}

	if (!tbl)
		return NULL;

	/* Alloc enough pages (and possibly more) */
	page = alloc_pages_node(node, flag, order);
	if (!page)
		return NULL;
	ret = page_address(page);
	memset(ret, 0, size);

	/* Set up tces to cover the allocated range */
	nio_pages = size >> IOMMU_PAGE_SHIFT;
	io_order = get_iommu_order(size);
	mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
			      mask >> IOMMU_PAGE_SHIFT, io_order, NULL);
	if (mapping == DMA_ERROR_CODE) {
		free_pages((unsigned long)ret, order);
		return NULL;
	}
	*dma_handle = mapping;
	return ret;
}

void iommu_free_coherent(struct iommu_table *tbl, size_t size,
			 void *vaddr, dma_addr_t dma_handle)
{
	if (tbl) {
		unsigned int nio_pages;

		size = PAGE_ALIGN(size);
		nio_pages = size >> IOMMU_PAGE_SHIFT;
		iommu_free(tbl, dma_handle, nio_pages);
		free_pages((unsigned long)vaddr, get_order(size));
	}
}
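
/*
 * Illustrative use of the coherent helpers above (a sketch only; the
 * surrounding driver code, "dev_dma_mask" and the chosen node are
 * assumptions, not code from this file):
 *
 *	dma_addr_t dma;
 *	void *ring;
 *
 *	ring = iommu_alloc_coherent(dev, tbl, 4096, &dma, dev_dma_mask,
 *				    GFP_KERNEL, dev_to_node(dev));
 *	if (!ring)
 *		return -ENOMEM;
 *	...
 *	iommu_free_coherent(tbl, 4096, ring, dma);
 *
 * The buffer is zeroed and mapped DMA_BIDIRECTIONAL; the size is rounded
 * up to a full page and the mapping is naturally aligned to its own size
 * (io_order), which many devices expect for descriptor rings.
 */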