/*
 * Copyright(c) 2015-2018 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <asm/page.h>
#include <linux/string.h>

#include "mmu_rb.h"
#include "user_exp_rcv.h"
#include "trace.h"

static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
                            struct exp_tid_set *set,
                            struct hfi1_filedata *fd);
static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages);
static int set_rcvarray_entry(struct hfi1_filedata *fd,
                              struct tid_user_buf *tbuf,
                              u32 rcventry, struct tid_group *grp,
                              u16 pageidx, unsigned int npages);
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
                                    struct tid_rb_node *tnode);
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
                              const struct mmu_notifier_range *range,
                              unsigned long cur_seq);
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
                            struct tid_group *grp,
                            unsigned int start, u16 count,
                            u32 *tidlist, unsigned int *tididx,
                            unsigned int *pmapped);
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
                              struct tid_group **grp);
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);

static const struct mmu_interval_notifier_ops tid_mn_ops = {
        .invalidate = tid_rb_invalidate,
};

/*
 * Initialize context and file private data needed for Expected
 * receive caching. This needs to be done after the context has
 * been configured with the eager/expected RcvEntry counts.
 */
int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
                           struct hfi1_ctxtdata *uctxt)
{
        int ret = 0;

        fd->entry_to_rb = kcalloc(uctxt->expected_count,
                                  sizeof(struct rb_node *),
                                  GFP_KERNEL);
        if (!fd->entry_to_rb)
                return -ENOMEM;

        if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
                fd->invalid_tid_idx = 0;
                fd->invalid_tids = kcalloc(uctxt->expected_count,
                                           sizeof(*fd->invalid_tids),
                                           GFP_KERNEL);
                if (!fd->invalid_tids) {
                        kfree(fd->entry_to_rb);
                        fd->entry_to_rb = NULL;
                        return -ENOMEM;
                }
                fd->use_mn = true;
        }

        /*
         * PSM does not have a good way to separate, count, and
         * effectively enforce a limit on RcvArray entries used by
         * subctxts (when context sharing is used) when TID caching
         * is enabled. To help with that, we calculate a per-process
         * RcvArray entry share and enforce that.
         * If TID caching is not in use, PSM deals with usage on its
         * own. In that case, we allow any subctxt to take all of the
         * entries.
         *
         * Make sure that we set the tid counts only after successful
         * init.
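         *
         * For example (illustrative arithmetic only): with 2048
         * expected entries shared by three subcontexts, the two
         * lowest-numbered subcontexts get a tid_limit of 683 and the
         * third gets 682 (2048 / 3 = 682, remainder 2).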
         */
        spin_lock(&fd->tid_lock);
        if (uctxt->subctxt_cnt && fd->use_mn) {
                u16 remainder;

                fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
                remainder = uctxt->expected_count % uctxt->subctxt_cnt;
                if (remainder && fd->subctxt < remainder)
                        fd->tid_limit++;
        } else {
                fd->tid_limit = uctxt->expected_count;
        }
        spin_unlock(&fd->tid_lock);

        return ret;
}

void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
{
        struct hfi1_ctxtdata *uctxt = fd->uctxt;

        mutex_lock(&uctxt->exp_mutex);
        if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
                unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
        if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
                unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
        mutex_unlock(&uctxt->exp_mutex);

        kfree(fd->invalid_tids);
        fd->invalid_tids = NULL;

        kfree(fd->entry_to_rb);
        fd->entry_to_rb = NULL;
}

/**
 * Release pinned receive buffer pages.
 *
 * @mapped - true if the pages have been DMA mapped, false otherwise.
 * @idx - index of the first page to unpin.
 * @npages - number of pages to unpin.
 *
 * If the pages have been DMA mapped (indicated by the mapped parameter), their
 * info will be passed via a struct tid_rb_node. If they haven't been mapped,
 * their info will be passed via a struct tid_user_buf.
 */
static void unpin_rcv_pages(struct hfi1_filedata *fd,
                            struct tid_user_buf *tidbuf,
                            struct tid_rb_node *node,
                            unsigned int idx,
                            unsigned int npages,
                            bool mapped)
{
        struct page **pages;
        struct hfi1_devdata *dd = fd->uctxt->dd;

        if (mapped) {
                pci_unmap_single(dd->pcidev, node->dma_addr,
                                 node->npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
                pages = &node->pages[idx];
        } else {
                pages = &tidbuf->pages[idx];
        }
        hfi1_release_user_pages(fd->mm, pages, npages, mapped);
        fd->tid_n_pinned -= npages;
}

/**
 * Pin receive buffer pages.
 */
static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
{
        int pinned;
        unsigned int npages;
        unsigned long vaddr = tidbuf->vaddr;
        struct page **pages = NULL;
        struct hfi1_devdata *dd = fd->uctxt->dd;

        /* Get the number of pages the user buffer spans */
        npages = num_user_pages(vaddr, tidbuf->length);
        if (!npages)
                return -EINVAL;

        if (npages > fd->uctxt->expected_count) {
                dd_dev_err(dd, "Expected buffer too big\n");
                return -EINVAL;
        }

        /* Allocate the array of struct page pointers needed for pinning */
        pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
        if (!pages)
                return -ENOMEM;

        /*
         * Pin all the pages of the user buffer. If we can't pin all the
         * pages, accept the amount pinned so far and program only that.
         * User space knows how to deal with partially programmed buffers.
         */
        if (!hfi1_can_pin_pages(dd, fd->mm, fd->tid_n_pinned, npages)) {
                kfree(pages);
                return -ENOMEM;
        }

        pinned = hfi1_acquire_user_pages(fd->mm, vaddr, npages, true, pages);
        if (pinned <= 0) {
                kfree(pages);
                return pinned;
        }
        tidbuf->pages = pages;
        tidbuf->npages = npages;
        fd->tid_n_pinned += pinned;
        return pinned;
}

/*
 * RcvArray entry allocation for Expected Receives is done by the
 * following algorithm:
 *
 * The context keeps 3 lists of groups of RcvArray entries:
 *   1. List of empty groups - tid_group_list
 *      This list is created during user context creation and
 *      contains elements which describe sets (of 8) of empty
 *      RcvArray entries.
 *   2. List of partially used groups - tid_used_list
 *      This list contains sets of RcvArray entries which are
 *      not completely used up. Another mapping request could
 *      use some or all of the remaining entries.
 *   3. List of full groups - tid_full_list
 *      This is the list where sets that are completely used
 *      up go.
 *
 * An attempt to optimize the usage of RcvArray entries is
 * made by finding all sets of physically contiguous pages in a
 * user's buffer.
 * These physically contiguous sets are further split into
 * sizes supported by the receive engine of the HFI. The
 * resulting sets of pages are stored in struct tid_pageset,
 * which describes the sets as:
 *    * .count - number of pages in this set
 *    * .idx - starting index into struct page ** array
 *             of this set
 *
 * From this point on, the algorithm deals with the page sets
 * described above. The number of pagesets is divided by the
 * RcvArray group size to produce the number of full groups
 * needed.
 *
 * Groups from the 3 lists are manipulated using the following
 * rules:
 *   1. For each set of 8 pagesets, a complete group from
 *      tid_group_list is taken, programmed, and moved to
 *      the tid_full_list list.
 *   2. For all remaining pagesets:
 *      2.1 If the tid_used_list is empty and the tid_group_list
 *          is empty, stop processing pagesets and return only
 *          what has been programmed up to this point.
 *      2.2 If the tid_used_list is empty and the tid_group_list
 *          is not empty, move a group from tid_group_list to
 *          tid_used_list.
 *      2.3 For each group in tid_used_list, program as much as
 *          can fit into the group. If the group becomes fully
 *          used, move it to tid_full_list.
 */
int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
                            struct hfi1_tid_info *tinfo)
{
        int ret = 0, need_group = 0, pinned;
        struct hfi1_ctxtdata *uctxt = fd->uctxt;
        struct hfi1_devdata *dd = uctxt->dd;
        unsigned int ngroups, pageidx = 0, pageset_count,
                tididx = 0, mapped, mapped_pages = 0;
        u32 *tidlist = NULL;
        struct tid_user_buf *tidbuf;

        if (!PAGE_ALIGNED(tinfo->vaddr))
                return -EINVAL;

        tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
        if (!tidbuf)
                return -ENOMEM;

        tidbuf->vaddr = tinfo->vaddr;
        tidbuf->length = tinfo->length;
        tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
                                GFP_KERNEL);
        if (!tidbuf->psets) {
                kfree(tidbuf);
                return -ENOMEM;
        }

        pinned = pin_rcv_pages(fd, tidbuf);
        if (pinned <= 0) {
                kfree(tidbuf->psets);
                kfree(tidbuf);
                return pinned;
        }

        /* Find sets of physically contiguous pages */
        tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);

        /*
         * We don't need to access this under a lock since tid_used is per
         * process and the same process cannot be in hfi1_user_exp_rcv_clear()
         * and hfi1_user_exp_rcv_setup() at the same time.
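         *
         * For example (illustrative numbers only): if tid_limit is 64
         * and 60 TIDs are already in use, only the first 4 pagesets of
         * this request are programmed; user space is expected to cope
         * with a partially programmed buffer.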
         */
        spin_lock(&fd->tid_lock);
        if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
                pageset_count = fd->tid_limit - fd->tid_used;
        else
                pageset_count = tidbuf->n_psets;
        spin_unlock(&fd->tid_lock);

        if (!pageset_count)
                goto bail;

        ngroups = pageset_count / dd->rcv_entries.group_size;
        tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
        if (!tidlist) {
                ret = -ENOMEM;
                goto nomem;
        }

        tididx = 0;

        /*
         * From this point on, we are going to be using shared (between master
         * and subcontexts) context resources. We need to take the lock.
         */
        mutex_lock(&uctxt->exp_mutex);
        /*
         * The first step is to program the RcvArray entries which are complete
         * groups.
         */
        while (ngroups && uctxt->tid_group_list.count) {
                struct tid_group *grp =
                        tid_group_pop(&uctxt->tid_group_list);

                ret = program_rcvarray(fd, tidbuf, grp,
                                       pageidx, dd->rcv_entries.group_size,
                                       tidlist, &tididx, &mapped);
                /*
                 * If there was a failure to program the RcvArray
                 * entries for the entire group, reset the grp fields
                 * and add the grp back to the free group list.
                 */
                if (ret <= 0) {
                        tid_group_add_tail(grp, &uctxt->tid_group_list);
                        hfi1_cdbg(TID,
                                  "Failed to program RcvArray group %d", ret);
                        goto unlock;
                }

                tid_group_add_tail(grp, &uctxt->tid_full_list);
                ngroups--;
                pageidx += ret;
                mapped_pages += mapped;
        }

        while (pageidx < pageset_count) {
                struct tid_group *grp, *ptr;
                /*
                 * If we don't have any partially used tid groups, check
                 * if we have empty groups. If so, take one from there and
                 * put it in the partially used list.
                 */
                if (!uctxt->tid_used_list.count || need_group) {
                        if (!uctxt->tid_group_list.count)
                                goto unlock;

                        grp = tid_group_pop(&uctxt->tid_group_list);
                        tid_group_add_tail(grp, &uctxt->tid_used_list);
                        need_group = 0;
                }
                /*
                 * There is an optimization opportunity here - instead of
                 * fitting as many page sets as we can, check for a group
                 * later on in the list that could fit all of them.
                 */
                list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
                                         list) {
                        unsigned use = min_t(unsigned, pageset_count - pageidx,
                                             grp->size - grp->used);

                        ret = program_rcvarray(fd, tidbuf, grp,
                                               pageidx, use, tidlist,
                                               &tididx, &mapped);
                        if (ret < 0) {
                                hfi1_cdbg(TID,
                                          "Failed to program RcvArray entries %d",
                                          ret);
                                goto unlock;
                        } else if (ret > 0) {
                                if (grp->used == grp->size)
                                        tid_group_move(grp,
                                                       &uctxt->tid_used_list,
                                                       &uctxt->tid_full_list);
                                pageidx += ret;
                                mapped_pages += mapped;
                                need_group = 0;
                                /* Check if we are done so we break out early */
                                if (pageidx >= pageset_count)
                                        break;
                        } else if (WARN_ON(ret == 0)) {
                                /*
                                 * If ret is 0, we did not program any entries
                                 * into this group, which can only happen if
                                 * we've screwed up the accounting somewhere.
                                 * Warn and try to continue.
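                                 * Setting need_group forces a fresh group
                                 * to be pulled from tid_group_list on the
                                 * next pass through the outer loop.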
                                 */
                                need_group = 1;
                        }
                }
        }
unlock:
        mutex_unlock(&uctxt->exp_mutex);
nomem:
        hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
                  mapped_pages, ret);
        if (tididx) {
                spin_lock(&fd->tid_lock);
                fd->tid_used += tididx;
                spin_unlock(&fd->tid_lock);
                tinfo->tidcnt = tididx;
                tinfo->length = mapped_pages * PAGE_SIZE;

                if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
                                 tidlist, sizeof(tidlist[0]) * tididx)) {
                        /*
                         * On failure to copy to the user level, we need to undo
                         * everything done so far so we don't leak resources.
                         */
                        tinfo->tidlist = (unsigned long)tidlist;
                        hfi1_user_exp_rcv_clear(fd, tinfo);
                        tinfo->tidlist = 0;
                        ret = -EFAULT;
                        goto bail;
                }
        }

        /*
         * If not everything was mapped (due to insufficient RcvArray entries,
         * for example), unpin all unmapped pages so we can pin them next time.
         */
        if (mapped_pages != pinned)
                unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages,
                                (pinned - mapped_pages), false);
bail:
        kfree(tidbuf->psets);
        kfree(tidlist);
        kfree(tidbuf->pages);
        kfree(tidbuf);
        return ret > 0 ? 0 : ret;
}

int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
                            struct hfi1_tid_info *tinfo)
{
        int ret = 0;
        struct hfi1_ctxtdata *uctxt = fd->uctxt;
        u32 *tidinfo;
        unsigned tididx;

        if (unlikely(tinfo->tidcnt > fd->tid_used))
                return -EINVAL;

        tidinfo = memdup_user(u64_to_user_ptr(tinfo->tidlist),
                              sizeof(tidinfo[0]) * tinfo->tidcnt);
        if (IS_ERR(tidinfo))
                return PTR_ERR(tidinfo);

        mutex_lock(&uctxt->exp_mutex);
        for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
                ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL);
                if (ret) {
                        hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
                                  ret);
                        break;
                }
        }
        spin_lock(&fd->tid_lock);
        fd->tid_used -= tididx;
        spin_unlock(&fd->tid_lock);
        tinfo->tidcnt = tididx;
        mutex_unlock(&uctxt->exp_mutex);

        kfree(tidinfo);
        return ret;
}

int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
                              struct hfi1_tid_info *tinfo)
{
        struct hfi1_ctxtdata *uctxt = fd->uctxt;
        unsigned long *ev = uctxt->dd->events +
                (uctxt_offset(uctxt) + fd->subctxt);
        u32 *array;
        int ret = 0;

        /*
         * copy_to_user() can sleep, which will leave the invalid_lock
         * locked and cause the MMU notifier to be blocked on the lock
         * for a long time.
         * Copy the data to a local buffer so we can release the lock.
         */
        array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
        if (!array)
                return -EFAULT;

        spin_lock(&fd->invalid_lock);
        if (fd->invalid_tid_idx) {
                memcpy(array, fd->invalid_tids, sizeof(*array) *
                       fd->invalid_tid_idx);
                memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
                       fd->invalid_tid_idx);
                tinfo->tidcnt = fd->invalid_tid_idx;
                fd->invalid_tid_idx = 0;
                /*
                 * Reset the user flag while still holding the lock.
                 * Otherwise, PSM can miss events.
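                 * (The MMU notifier sets this bit again, under the same
                 * invalid_lock, when it queues the next invalidation.)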
                 */
                clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
        } else {
                tinfo->tidcnt = 0;
        }
        spin_unlock(&fd->invalid_lock);

        if (tinfo->tidcnt) {
                if (copy_to_user((void __user *)tinfo->tidlist,
                                 array, sizeof(*array) * tinfo->tidcnt))
                        ret = -EFAULT;
        }
        kfree(array);

        return ret;
}

static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
{
        unsigned pagecount, pageidx, setcount = 0, i;
        unsigned long pfn, this_pfn;
        struct page **pages = tidbuf->pages;
        struct tid_pageset *list = tidbuf->psets;

        if (!npages)
                return 0;

        /*
         * Look for sets of physically contiguous pages in the user buffer.
         * This will allow us to optimize Expected RcvArray entry usage by
         * using the bigger supported sizes.
         */
        pfn = page_to_pfn(pages[0]);
        for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
                this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;

                /*
                 * If the pfns are not sequential, pages are not physically
                 * contiguous.
                 */
                if (this_pfn != ++pfn) {
                        /*
                         * At this point we have to loop over the set of
                         * physically contiguous pages and break them down into
                         * sizes supported by the HW.
                         * There are two main constraints:
                         *   1. The max buffer size is MAX_EXPECTED_BUFFER.
                         *      If the total set size is bigger than that,
                         *      program only a MAX_EXPECTED_BUFFER chunk.
                         *   2. The buffer size has to be a power of two. If
                         *      it is not, round down to the closest power of
                         *      2 and program that size.
                         */
                        while (pagecount) {
                                int maxpages = pagecount;
                                u32 bufsize = pagecount * PAGE_SIZE;

                                if (bufsize > MAX_EXPECTED_BUFFER)
                                        maxpages =
                                                MAX_EXPECTED_BUFFER >>
                                                PAGE_SHIFT;
                                else if (!is_power_of_2(bufsize))
                                        maxpages =
                                                rounddown_pow_of_two(bufsize) >>
                                                PAGE_SHIFT;

                                list[setcount].idx = pageidx;
                                list[setcount].count = maxpages;
                                pagecount -= maxpages;
                                pageidx += maxpages;
                                setcount++;
                        }
                        pageidx = i;
                        pagecount = 1;
                        pfn = this_pfn;
                } else {
                        pagecount++;
                }
        }
        return setcount;
}

/**
 * program_rcvarray() - program an RcvArray group with receive buffers
 * @fd: filedata pointer
 * @tbuf: pointer to struct tid_user_buf that has the user buffer starting
 *        virtual address, buffer length, page pointers, pagesets (array of
 *        struct tid_pageset holding information on physically contiguous
 *        chunks from the user buffer), and other fields.
 * @grp: RcvArray group
 * @start: starting index into sets array
 * @count: number of struct tid_pageset's to program
 * @tidlist: the array of u32 elements where the information about the
 *           programmed RcvArray entries is to be encoded.
 * @tididx: starting offset into tidlist
 * @pmapped: (output parameter) number of pages programmed into the RcvArray
 *           entries.
 *
 * This function will program up to 'count' number of RcvArray entries from the
 * group 'grp'. To make best use of write-combining writes, the function will
 * perform writes to the unused RcvArray entries which will be ignored by the
 * HW. Each RcvArray entry will be programmed with a physically contiguous
 * buffer chunk from the user's virtual buffer.
 *
 * Return:
 * -EINVAL if the requested count is larger than the size of the group,
 * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
 * number of RcvArray entries programmed.
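 *
 * Each programmed entry is encoded into @tidlist with rcventry2tidinfo()
 * ORed with the EXP_TID LEN field; this is the tidinfo format that user
 * space later hands back to hfi1_user_exp_rcv_clear().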
 */
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
                            struct tid_group *grp,
                            unsigned int start, u16 count,
                            u32 *tidlist, unsigned int *tididx,
                            unsigned int *pmapped)
{
        struct hfi1_ctxtdata *uctxt = fd->uctxt;
        struct hfi1_devdata *dd = uctxt->dd;
        u16 idx;
        u32 tidinfo = 0, rcventry, useidx = 0;
        int mapped = 0;

        /* Count should never be larger than the group size */
        if (count > grp->size)
                return -EINVAL;

        /* Find the first unused entry in the group */
        for (idx = 0; idx < grp->size; idx++) {
                if (!(grp->map & (1 << idx))) {
                        useidx = idx;
                        break;
                }
                rcv_array_wc_fill(dd, grp->base + idx);
        }

        idx = 0;
        while (idx < count) {
                u16 npages, pageidx, setidx = start + idx;
                int ret = 0;

                /*
                 * If this entry in the group is used, move to the next one.
                 * If we go past the end of the group, exit the loop.
                 */
                if (useidx >= grp->size) {
                        break;
                } else if (grp->map & (1 << useidx)) {
                        rcv_array_wc_fill(dd, grp->base + useidx);
                        useidx++;
                        continue;
                }

                rcventry = grp->base + useidx;
                npages = tbuf->psets[setidx].count;
                pageidx = tbuf->psets[setidx].idx;

                ret = set_rcvarray_entry(fd, tbuf,
                                         rcventry, grp, pageidx,
                                         npages);
                if (ret)
                        return ret;
                mapped += npages;

                tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
                        EXP_TID_SET(LEN, npages);
                tidlist[(*tididx)++] = tidinfo;
                grp->used++;
                grp->map |= 1 << useidx++;
                idx++;
        }

        /* Fill the rest of the group with "blank" writes */
        for (; useidx < grp->size; useidx++)
                rcv_array_wc_fill(dd, grp->base + useidx);
        *pmapped = mapped;
        return idx;
}

static int set_rcvarray_entry(struct hfi1_filedata *fd,
                              struct tid_user_buf *tbuf,
                              u32 rcventry, struct tid_group *grp,
                              u16 pageidx, unsigned int npages)
{
        int ret;
        struct hfi1_ctxtdata *uctxt = fd->uctxt;
        struct tid_rb_node *node;
        struct hfi1_devdata *dd = uctxt->dd;
        dma_addr_t phys;
        struct page **pages = tbuf->pages + pageidx;

        /*
         * Allocate the node first so we can handle a potential
         * failure before we've programmed anything.
         */
        node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
                       GFP_KERNEL);
        if (!node)
                return -ENOMEM;

        phys = pci_map_single(dd->pcidev,
                              __va(page_to_phys(pages[0])),
                              npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
        if (dma_mapping_error(&dd->pcidev->dev, phys)) {
                dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
                           phys);
                kfree(node);
                return -EFAULT;
        }

        node->fdata = fd;
        node->phys = page_to_phys(pages[0]);
        node->npages = npages;
        node->rcventry = rcventry;
        node->dma_addr = phys;
        node->grp = grp;
        node->freed = false;
        memcpy(node->pages, pages, sizeof(struct page *) * npages);

        if (fd->use_mn) {
                ret = mmu_interval_notifier_insert(
                        &node->notifier, fd->mm,
                        tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
                        &tid_mn_ops);
                if (ret)
                        goto out_unmap;
                /*
                 * FIXME: This is in the wrong order, the notifier should be
                 * established before the pages are pinned by pin_rcv_pages.
                 */
                mmu_interval_read_begin(&node->notifier);
        }
        fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;

        hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
        trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
                               node->notifier.interval_tree.start, node->phys,
                               phys);
        return 0;

out_unmap:
        hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
                  node->rcventry, node->notifier.interval_tree.start,
                  node->phys, ret);
        pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
                         PCI_DMA_FROMDEVICE);
        kfree(node);
        return -EFAULT;
}

static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
                              struct tid_group **grp)
{
        struct hfi1_ctxtdata *uctxt = fd->uctxt;
        struct hfi1_devdata *dd = uctxt->dd;
        struct tid_rb_node *node;
        u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
        u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;

        if (tididx >= uctxt->expected_count) {
                dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
                           tididx, uctxt->ctxt);
                return -EINVAL;
        }

        if (tidctrl == 0x3)
                return -EINVAL;

        rcventry = tididx + (tidctrl - 1);

        node = fd->entry_to_rb[rcventry];
        if (!node || node->rcventry != (uctxt->expected_base + rcventry))
                return -EBADF;

        if (grp)
                *grp = node->grp;

        if (fd->use_mn)
                mmu_interval_notifier_remove(&node->notifier);
        cacheless_tid_rb_remove(fd, node);

        return 0;
}

static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
        struct hfi1_ctxtdata *uctxt = fd->uctxt;
        struct hfi1_devdata *dd = uctxt->dd;

        trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
                                 node->npages,
                                 node->notifier.interval_tree.start, node->phys,
                                 node->dma_addr);

        /*
         * Make sure device has seen the write before we unpin the
         * pages.
         */
        hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);

        unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);

        node->grp->used--;
        node->grp->map &= ~(1 << (node->rcventry - node->grp->base));

        if (node->grp->used == node->grp->size - 1)
                tid_group_move(node->grp, &uctxt->tid_full_list,
                               &uctxt->tid_used_list);
        else if (!node->grp->used)
                tid_group_move(node->grp, &uctxt->tid_used_list,
                               &uctxt->tid_group_list);
        kfree(node);
}

/*
 * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
 * clearing nodes in the non-cached case.
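 * If an MMU interval notifier was registered for a node, it is removed
 * here before the node is torn down.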
 */
static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
                            struct exp_tid_set *set,
                            struct hfi1_filedata *fd)
{
        struct tid_group *grp, *ptr;
        int i;

        list_for_each_entry_safe(grp, ptr, &set->list, list) {
                list_del_init(&grp->list);

                for (i = 0; i < grp->size; i++) {
                        if (grp->map & (1 << i)) {
                                u16 rcventry = grp->base + i;
                                struct tid_rb_node *node;

                                node = fd->entry_to_rb[rcventry -
                                                       uctxt->expected_base];
                                if (!node || node->rcventry != rcventry)
                                        continue;

                                if (fd->use_mn)
                                        mmu_interval_notifier_remove(
                                                &node->notifier);
                                cacheless_tid_rb_remove(fd, node);
                        }
                }
        }
}

static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
                              const struct mmu_notifier_range *range,
                              unsigned long cur_seq)
{
        struct tid_rb_node *node =
                container_of(mni, struct tid_rb_node, notifier);
        struct hfi1_filedata *fdata = node->fdata;
        struct hfi1_ctxtdata *uctxt = fdata->uctxt;

        if (node->freed)
                return true;

        trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
                                 node->notifier.interval_tree.start,
                                 node->rcventry, node->npages, node->dma_addr);
        node->freed = true;

        spin_lock(&fdata->invalid_lock);
        if (fdata->invalid_tid_idx < uctxt->expected_count) {
                fdata->invalid_tids[fdata->invalid_tid_idx] =
                        rcventry2tidinfo(node->rcventry - uctxt->expected_base);
                fdata->invalid_tids[fdata->invalid_tid_idx] |=
                        EXP_TID_SET(LEN, node->npages);
                if (!fdata->invalid_tid_idx) {
                        unsigned long *ev;

                        /*
                         * hfi1_set_uevent_bits() sets a user event flag
                         * for all processes. Because calling into the
                         * driver to process TID cache invalidations is
                         * expensive and TID cache invalidations are
                         * handled on a per-process basis, we can
                         * optimize this to set the flag only for the
                         * process in question.
                         */
                        ev = uctxt->dd->events +
                                (uctxt_offset(uctxt) + fdata->subctxt);
                        set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
                }
                fdata->invalid_tid_idx++;
        }
        spin_unlock(&fdata->invalid_lock);
        return true;
}

static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
                                    struct tid_rb_node *tnode)
{
        u32 base = fdata->uctxt->expected_base;

        fdata->entry_to_rb[tnode->rcventry - base] = NULL;
        clear_tid_node(fdata, tnode);
}