/*
 * Copyright (c) 2005 Topspin Communications. All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013 Cisco Systems. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/mm.h>
#include <linux/dma-mapping.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/hugetlb.h>
#include <linux/iommu.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/pci.h>
#include <rdma/ib_verbs.h>

#include "usnic_log.h"
#include "usnic_uiom.h"
#include "usnic_uiom_interval_tree.h"

static struct workqueue_struct *usnic_uiom_wq;

/* Number of page_list scatterlist entries per chunk, sized so a chunk fits in one page */
#define USNIC_UIOM_PAGE_CHUNK						\
	((PAGE_SIZE - offsetof(struct usnic_uiom_chunk, page_list)) /	\
	((void *) &((struct usnic_uiom_chunk *) 0)->page_list[1] -	\
	(void *) &((struct usnic_uiom_chunk *) 0)->page_list[0]))

static void usnic_uiom_reg_account(struct work_struct *work)
{
	struct usnic_uiom_reg *umem = container_of(work,
						struct usnic_uiom_reg, work);

	down_write(&umem->mm->mmap_sem);
	umem->mm->locked_vm -= umem->diff;
	up_write(&umem->mm->mmap_sem);
	mmput(umem->mm);
	kfree(umem);
}

static int usnic_uiom_dma_fault(struct iommu_domain *domain,
				struct device *dev,
				unsigned long iova, int flags,
				void *token)
{
	usnic_err("Device %s iommu fault domain 0x%pK va 0x%lx flags 0x%x\n",
		dev_name(dev),
		domain, iova, flags);
	return -ENOSYS;
}

static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty)
{
	struct usnic_uiom_chunk *chunk, *tmp;
	struct page *page;
	struct scatterlist *sg;
	int i;
	dma_addr_t pa;

	list_for_each_entry_safe(chunk, tmp, chunk_list, list) {
		for_each_sg(chunk->page_list, sg, chunk->nents, i) {
			page = sg_page(sg);
			pa = sg_phys(sg);
			if (!PageDirty(page) && dirty)
				set_page_dirty_lock(page);
			put_page(page);
			usnic_dbg("pa: %pa\n", &pa);
		}
		kfree(chunk);
	}
}

static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
				int dmasync, struct list_head *chunk_list)
{
	struct page **page_list;
	struct scatterlist *sg;
	struct usnic_uiom_chunk *chunk;
	unsigned long locked;
	unsigned long lock_limit;
	unsigned long cur_base;
	unsigned long npages;
	int ret;
	int off;
	int i;
	int flags;
	dma_addr_t pa;
	unsigned int gup_flags;

	/*
	 * If the combination of the addr and size requested for this memory
	 * region causes an integer overflow, return error.
	 */
	if (((addr + size) < addr) || PAGE_ALIGN(addr + size) < (addr + size))
		return -EINVAL;

	if (!size)
		return -EINVAL;

	if (!can_do_mlock())
		return -EPERM;

	INIT_LIST_HEAD(chunk_list);

	page_list = (struct page **) __get_free_page(GFP_KERNEL);
	if (!page_list)
		return -ENOMEM;

	npages = PAGE_ALIGN(size + (addr & ~PAGE_MASK)) >> PAGE_SHIFT;

	down_write(&current->mm->mmap_sem);

	locked = npages + current->mm->pinned_vm;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
		ret = -ENOMEM;
		goto out;
	}

	flags = IOMMU_READ | IOMMU_CACHE;
	flags |= (writable) ? IOMMU_WRITE : 0;
	gup_flags = FOLL_WRITE;
	gup_flags |= (writable) ? 0 : FOLL_FORCE;
	cur_base = addr & PAGE_MASK;
	ret = 0;

	while (npages) {
		ret = get_user_pages_longterm(cur_base,
					min_t(unsigned long, npages,
					PAGE_SIZE / sizeof(struct page *)),
					gup_flags, page_list, NULL);

		if (ret < 0)
			goto out;

		npages -= ret;
		off = 0;

		while (ret) {
			chunk = kmalloc(sizeof(*chunk) +
					sizeof(struct scatterlist) *
					min_t(int, ret, USNIC_UIOM_PAGE_CHUNK),
					GFP_KERNEL);
			if (!chunk) {
				ret = -ENOMEM;
				goto out;
			}

			chunk->nents = min_t(int, ret, USNIC_UIOM_PAGE_CHUNK);
			sg_init_table(chunk->page_list, chunk->nents);
			for_each_sg(chunk->page_list, sg, chunk->nents, i) {
				sg_set_page(sg, page_list[i + off],
						PAGE_SIZE, 0);
				pa = sg_phys(sg);
				usnic_dbg("va: 0x%lx pa: %pa\n",
						cur_base + i*PAGE_SIZE, &pa);
			}
			cur_base += chunk->nents * PAGE_SIZE;
			ret -= chunk->nents;
			off += chunk->nents;
			list_add_tail(&chunk->list, chunk_list);
		}

		ret = 0;
	}

out:
	if (ret < 0)
		usnic_uiom_put_pages(chunk_list, 0);
	else
		current->mm->pinned_vm = locked;

	up_write(&current->mm->mmap_sem);
	free_page((unsigned long) page_list);
	return ret;
}

static void usnic_uiom_unmap_sorted_intervals(struct list_head *intervals,
						struct usnic_uiom_pd *pd)
{
	struct usnic_uiom_interval_node *interval, *tmp;
	long unsigned va, size;

	list_for_each_entry_safe(interval, tmp, intervals, link) {
		va = interval->start << PAGE_SHIFT;
		size = ((interval->last - interval->start) + 1) << PAGE_SHIFT;
		while (size > 0) {
			/* Workaround for RH 970401 */
			usnic_dbg("va 0x%lx size 0x%lx", va, PAGE_SIZE);
			iommu_unmap(pd->domain, va, PAGE_SIZE);
			va += PAGE_SIZE;
			size -= PAGE_SIZE;
		}
	}
}

static void __usnic_uiom_reg_release(struct usnic_uiom_pd *pd,
					struct usnic_uiom_reg *uiomr,
					int dirty)
{
	int npages;
	unsigned long vpn_start, vpn_last;
	struct usnic_uiom_interval_node *interval, *tmp;
	int writable = 0;
	LIST_HEAD(rm_intervals);

	npages = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT;
	vpn_start = (uiomr->va & PAGE_MASK) >> PAGE_SHIFT;
	vpn_last = vpn_start + npages - 1;

	spin_lock(&pd->lock);
	usnic_uiom_remove_interval(&pd->root, vpn_start,
					vpn_last, &rm_intervals);
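	/* Unmap the removed intervals, then free them and release the pinned pages */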
	usnic_uiom_unmap_sorted_intervals(&rm_intervals, pd);

	list_for_each_entry_safe(interval, tmp, &rm_intervals, link) {
		if (interval->flags & IOMMU_WRITE)
			writable = 1;
		list_del(&interval->link);
		kfree(interval);
	}

	usnic_uiom_put_pages(&uiomr->chunk_list, dirty & writable);
	spin_unlock(&pd->lock);
}

static int usnic_uiom_map_sorted_intervals(struct list_head *intervals,
						struct usnic_uiom_reg *uiomr)
{
	int i, err;
	size_t size;
	struct usnic_uiom_chunk *chunk;
	struct usnic_uiom_interval_node *interval_node;
	dma_addr_t pa;
	dma_addr_t pa_start = 0;
	dma_addr_t pa_end = 0;
	long int va_start = -EINVAL;
	struct usnic_uiom_pd *pd = uiomr->pd;
	long int va = uiomr->va & PAGE_MASK;
	int flags = IOMMU_READ | IOMMU_CACHE;

	flags |= (uiomr->writable) ? IOMMU_WRITE : 0;
	chunk = list_first_entry(&uiomr->chunk_list, struct usnic_uiom_chunk,
					list);
	list_for_each_entry(interval_node, intervals, link) {
iter_chunk:
		for (i = 0; i < chunk->nents; i++, va += PAGE_SIZE) {
			pa = sg_phys(&chunk->page_list[i]);
			if ((va >> PAGE_SHIFT) < interval_node->start)
				continue;

			if ((va >> PAGE_SHIFT) == interval_node->start) {
				/* First page of the interval */
				va_start = va;
				pa_start = pa;
				pa_end = pa;
			}

			WARN_ON(va_start == -EINVAL);

			if ((pa_end + PAGE_SIZE != pa) &&
					(pa != pa_start)) {
				/* PAs are not contiguous */
				size = pa_end - pa_start + PAGE_SIZE;
				usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x",
					va_start, &pa_start, size, flags);
				err = iommu_map(pd->domain, va_start, pa_start,
						size, flags);
				if (err) {
					usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n",
						va_start, &pa_start, size, err);
					goto err_out;
				}
				va_start = va;
				pa_start = pa;
				pa_end = pa;
			}

			if ((va >> PAGE_SHIFT) == interval_node->last) {
				/* Last page of the interval */
				size = pa - pa_start + PAGE_SIZE;
				usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x\n",
					va_start, &pa_start, size, flags);
				err = iommu_map(pd->domain, va_start, pa_start,
						size, flags);
				if (err) {
					usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n",
						va_start, &pa_start, size, err);
					goto err_out;
				}
				break;
			}

			if (pa != pa_start)
				pa_end += PAGE_SIZE;
		}

		if (i == chunk->nents) {
			/*
			 * Hit last entry of the chunk,
			 * hence advance to next chunk
			 */
			chunk = list_first_entry(&chunk->list,
						struct usnic_uiom_chunk,
						list);
			goto iter_chunk;
		}
	}

	return 0;

err_out:
	usnic_uiom_unmap_sorted_intervals(intervals, pd);
	return err;
}

struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd,
						unsigned long addr, size_t size,
						int writable, int dmasync)
{
	struct usnic_uiom_reg *uiomr;
	unsigned long va_base, vpn_start, vpn_last;
	unsigned long npages;
	int offset, err;
	LIST_HEAD(sorted_diff_intervals);

	/*
	 * Intel IOMMU map throws an error if a translation entry is
	 * changed from read to write. This module may not unmap
	 * and then remap the entry after fixing the permission
	 * b/c this opens up a small window where hw DMA may page fault.
	 * Hence, make all entries writable.
	 */
	writable = 1;

	va_base = addr & PAGE_MASK;
	offset = addr & ~PAGE_MASK;
	npages = PAGE_ALIGN(size + offset) >> PAGE_SHIFT;
	vpn_start = (addr & PAGE_MASK) >> PAGE_SHIFT;
	vpn_last = vpn_start + npages - 1;

	uiomr = kmalloc(sizeof(*uiomr), GFP_KERNEL);
	if (!uiomr)
		return ERR_PTR(-ENOMEM);

	uiomr->va = va_base;
	uiomr->offset = offset;
	uiomr->length = size;
	uiomr->writable = writable;
	uiomr->pd = pd;

	err = usnic_uiom_get_pages(addr, size, writable, dmasync,
					&uiomr->chunk_list);
	if (err) {
		usnic_err("Failed get_pages vpn [0x%lx,0x%lx] err %d\n",
				vpn_start, vpn_last, err);
		goto out_free_uiomr;
	}

	spin_lock(&pd->lock);
	err = usnic_uiom_get_intervals_diff(vpn_start, vpn_last,
						(writable) ? IOMMU_WRITE : 0,
						IOMMU_WRITE,
						&pd->root,
						&sorted_diff_intervals);
	if (err) {
		usnic_err("Failed disjoint interval vpn [0x%lx,0x%lx] err %d\n",
				vpn_start, vpn_last, err);
		goto out_put_pages;
	}

	err = usnic_uiom_map_sorted_intervals(&sorted_diff_intervals, uiomr);
	if (err) {
		usnic_err("Failed map interval vpn [0x%lx,0x%lx] err %d\n",
				vpn_start, vpn_last, err);
		goto out_put_intervals;
	}

	err = usnic_uiom_insert_interval(&pd->root, vpn_start, vpn_last,
					(writable) ? IOMMU_WRITE : 0);
	if (err) {
		usnic_err("Failed insert interval vpn [0x%lx,0x%lx] err %d\n",
				vpn_start, vpn_last, err);
		goto out_unmap_intervals;
	}

	usnic_uiom_put_interval_set(&sorted_diff_intervals);
	spin_unlock(&pd->lock);

	return uiomr;

out_unmap_intervals:
	usnic_uiom_unmap_sorted_intervals(&sorted_diff_intervals, pd);
out_put_intervals:
	usnic_uiom_put_interval_set(&sorted_diff_intervals);
out_put_pages:
	usnic_uiom_put_pages(&uiomr->chunk_list, 0);
	spin_unlock(&pd->lock);
out_free_uiomr:
	kfree(uiomr);
	return ERR_PTR(err);
}

void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr,
				struct ib_ucontext *ucontext)
{
	struct task_struct *task;
	struct mm_struct *mm;
	unsigned long diff;

	__usnic_uiom_reg_release(uiomr->pd, uiomr, 1);

	task = get_pid_task(ucontext->tgid, PIDTYPE_PID);
	if (!task)
		goto out;
	mm = get_task_mm(task);
	put_task_struct(task);
	if (!mm)
		goto out;

	diff = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT;

	/*
	 * We may be called with the mm's mmap_sem already held. This
	 * can happen when a userspace munmap() is the call that drops
	 * the last reference to our file and calls our release
	 * method. If there are memory regions to destroy, we'll end
	 * up here and not be able to take the mmap_sem. In that case
	 * we defer the vm_locked accounting to the system workqueue.
	 */
	if (ucontext->closing) {
		if (!down_write_trylock(&mm->mmap_sem)) {
			INIT_WORK(&uiomr->work, usnic_uiom_reg_account);
			uiomr->mm = mm;
			uiomr->diff = diff;

			queue_work(usnic_uiom_wq, &uiomr->work);
			return;
		}
	} else
		down_write(&mm->mmap_sem);

	mm->pinned_vm -= diff;
	up_write(&mm->mmap_sem);
	mmput(mm);
out:
	kfree(uiomr);
}

struct usnic_uiom_pd *usnic_uiom_alloc_pd(void)
{
	struct usnic_uiom_pd *pd;
	void *domain;

	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
	if (!pd)
		return ERR_PTR(-ENOMEM);

	pd->domain = domain = iommu_domain_alloc(&pci_bus_type);
	if (!domain) {
		usnic_err("Failed to allocate IOMMU domain");
		kfree(pd);
		return ERR_PTR(-ENOMEM);
	}

	iommu_set_fault_handler(pd->domain, usnic_uiom_dma_fault, NULL);

	spin_lock_init(&pd->lock);
	INIT_LIST_HEAD(&pd->devs);

	return pd;
}

void usnic_uiom_dealloc_pd(struct usnic_uiom_pd *pd)
{
	iommu_domain_free(pd->domain);
	kfree(pd);
}

int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev)
{
	struct usnic_uiom_dev *uiom_dev;
	int err;

	uiom_dev = kzalloc(sizeof(*uiom_dev), GFP_ATOMIC);
	if (!uiom_dev)
		return -ENOMEM;
	uiom_dev->dev = dev;

	err = iommu_attach_device(pd->domain, dev);
	if (err)
		goto out_free_dev;

	if (!iommu_capable(dev->bus, IOMMU_CAP_CACHE_COHERENCY)) {
		usnic_err("IOMMU of %s does not support cache coherency\n",
				dev_name(dev));
		err = -EINVAL;
		goto out_detach_device;
	}

	spin_lock(&pd->lock);
	list_add_tail(&uiom_dev->link, &pd->devs);
	pd->dev_cnt++;
	spin_unlock(&pd->lock);

	return 0;

out_detach_device:
	iommu_detach_device(pd->domain, dev);
out_free_dev:
	kfree(uiom_dev);
	return err;
}

void usnic_uiom_detach_dev_from_pd(struct usnic_uiom_pd *pd, struct device *dev)
{
	struct usnic_uiom_dev *uiom_dev;
	int found = 0;

	spin_lock(&pd->lock);
	list_for_each_entry(uiom_dev, &pd->devs, link) {
		if (uiom_dev->dev == dev) {
			found = 1;
			break;
		}
	}

	if (!found) {
		usnic_err("Unable to free dev %s - not found\n",
				dev_name(dev));
		spin_unlock(&pd->lock);
		return;
	}

	list_del(&uiom_dev->link);
	pd->dev_cnt--;
	spin_unlock(&pd->lock);

	iommu_detach_device(pd->domain, dev);
}

struct device **usnic_uiom_get_dev_list(struct usnic_uiom_pd *pd)
{
	struct usnic_uiom_dev *uiom_dev;
	struct device **devs;
	int i = 0;

	spin_lock(&pd->lock);
	devs = kcalloc(pd->dev_cnt + 1, sizeof(*devs), GFP_ATOMIC);
	if (!devs) {
		devs = ERR_PTR(-ENOMEM);
		goto out;
	}

	list_for_each_entry(uiom_dev, &pd->devs, link) {
		devs[i++] = uiom_dev->dev;
	}
out:
	spin_unlock(&pd->lock);
	return devs;
}

void usnic_uiom_free_dev_list(struct device **devs)
{
	kfree(devs);
}

int usnic_uiom_init(char *drv_name)
{
	if (!iommu_present(&pci_bus_type)) {
		usnic_err("IOMMU required but not present or enabled. USNIC QPs will not function w/o enabling IOMMU\n");
		return -EPERM;
	}

	usnic_uiom_wq = create_workqueue(drv_name);
	if (!usnic_uiom_wq) {
		usnic_err("Unable to alloc wq for drv %s\n", drv_name);
		return -ENOMEM;
	}

	return 0;
}

void usnic_uiom_fini(void)
{
	flush_workqueue(usnic_uiom_wq);
	destroy_workqueue(usnic_uiom_wq);
}