// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
/*
 * Copyright(c) 2020 Cornelis Networks, Inc.
 * Copyright(c) 2015-2018 Intel Corporation.
 */
#include <asm/page.h>
#include <linux/string.h>

#include "mmu_rb.h"
#include "user_exp_rcv.h"
#include "trace.h"

static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd);
static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages);
static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages);
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode);
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq);
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
			    struct tid_group *grp,
			    unsigned int start, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped);
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
			      struct tid_group **grp);
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);

static const struct mmu_interval_notifier_ops tid_mn_ops = {
	.invalidate = tid_rb_invalidate,
};

/*
 * Initialize context and file private data needed for Expected
 * receive caching. This needs to be done after the context has
 * been configured with the eager/expected RcvEntry counts.
 */
int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
			   struct hfi1_ctxtdata *uctxt)
{
	int ret = 0;

	fd->entry_to_rb = kcalloc(uctxt->expected_count,
				  sizeof(struct rb_node *),
				  GFP_KERNEL);
	if (!fd->entry_to_rb)
		return -ENOMEM;

	if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
		fd->invalid_tid_idx = 0;
		fd->invalid_tids = kcalloc(uctxt->expected_count,
					   sizeof(*fd->invalid_tids),
					   GFP_KERNEL);
		if (!fd->invalid_tids) {
			kfree(fd->entry_to_rb);
			fd->entry_to_rb = NULL;
			return -ENOMEM;
		}
		fd->use_mn = true;
	}

	/*
	 * PSM does not have a good way to separate, count, and
	 * effectively enforce a limit on RcvArray entries used by
	 * subctxts (when context sharing is used) when TID caching
	 * is enabled. To help with that, we calculate a per-process
	 * RcvArray entry share and enforce that.
	 * If TID caching is not in use, PSM deals with usage on its
	 * own. In that case, we allow any subctxt to take all of the
	 * entries.
	 *
	 * Make sure that we set the tid counts only after successful
	 * init.
	 */
	spin_lock(&fd->tid_lock);
	if (uctxt->subctxt_cnt && fd->use_mn) {
		u16 remainder;

		fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
		remainder = uctxt->expected_count % uctxt->subctxt_cnt;
		if (remainder && fd->subctxt < remainder)
			fd->tid_limit++;
	} else {
		fd->tid_limit = uctxt->expected_count;
	}
	spin_unlock(&fd->tid_lock);

	return ret;
}
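
/*
 * Example of the share calculation above (hypothetical numbers): with
 * expected_count = 2048 and subctxt_cnt = 3, subctxts 0 and 1 each get
 * tid_limit = 683 (682 plus one entry from the remainder of 2) and
 * subctxt 2 gets 682, so the per-process shares sum back to 2048.
 */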

void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;

	mutex_lock(&uctxt->exp_mutex);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
		unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
		unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
	mutex_unlock(&uctxt->exp_mutex);

	kfree(fd->invalid_tids);
	fd->invalid_tids = NULL;

	kfree(fd->entry_to_rb);
	fd->entry_to_rb = NULL;
}

/*
 * Release pinned receive buffer pages.
 *
 * @mapped: true if the pages have been DMA mapped. false otherwise.
 * @idx: Index of the first page to unpin.
 * @npages: Number of pages to unpin.
 *
 * If the pages have been DMA mapped (indicated by the mapped parameter),
 * their info will be passed via a struct tid_rb_node. If they haven't been
 * mapped, their info will be passed via a struct tid_user_buf.
 */
static void unpin_rcv_pages(struct hfi1_filedata *fd,
			    struct tid_user_buf *tidbuf,
			    struct tid_rb_node *node,
			    unsigned int idx,
			    unsigned int npages,
			    bool mapped)
{
	struct page **pages;
	struct hfi1_devdata *dd = fd->uctxt->dd;
	struct mm_struct *mm;

	if (mapped) {
		dma_unmap_single(&dd->pcidev->dev, node->dma_addr,
				 node->npages * PAGE_SIZE, DMA_FROM_DEVICE);
		pages = &node->pages[idx];
		mm = mm_from_tid_node(node);
	} else {
		pages = &tidbuf->pages[idx];
		mm = current->mm;
	}
	hfi1_release_user_pages(mm, pages, npages, mapped);
	fd->tid_n_pinned -= npages;
}

/*
 * Pin receive buffer pages.
 */
static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
{
	int pinned;
	unsigned int npages;
	unsigned long vaddr = tidbuf->vaddr;
	struct page **pages = NULL;
	struct hfi1_devdata *dd = fd->uctxt->dd;

	/* Get the number of pages the user buffer spans */
	npages = num_user_pages(vaddr, tidbuf->length);
	if (!npages)
		return -EINVAL;

	if (npages > fd->uctxt->expected_count) {
		dd_dev_err(dd, "Expected buffer too big\n");
		return -EINVAL;
	}

	/* Allocate the array of struct page pointers needed for pinning */
	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/*
	 * Pin all the pages of the user buffer. If we can't pin all the
	 * pages, accept the amount pinned so far and program only that.
	 * User space knows how to deal with partially programmed buffers.
	 */
	if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) {
		kfree(pages);
		return -ENOMEM;
	}

	pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages);
	if (pinned <= 0) {
		kfree(pages);
		return pinned;
	}
	tidbuf->pages = pages;
	tidbuf->npages = npages;
	fd->tid_n_pinned += pinned;
	return pinned;
}
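
/*
 * Note on accounting (summary of the helpers above): every page pinned by
 * pin_rcv_pages() is counted in fd->tid_n_pinned and must eventually come
 * back through unpin_rcv_pages(), either from the setup error/partial-map
 * paths (mapped == false, pages described by the tid_user_buf) or when a
 * programmed entry is torn down (mapped == true, pages described by the
 * tid_rb_node).
 */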

/*
 * RcvArray entry allocation for Expected Receives is done by the
 * following algorithm:
 *
 * The context keeps 3 lists of groups of RcvArray entries:
 *   1. List of empty groups - tid_group_list
 *      This list is created during user context creation and
 *      contains elements which describe sets (of 8) of empty
 *      RcvArray entries.
 *   2. List of partially used groups - tid_used_list
 *      This list contains sets of RcvArray entries which are
 *      not completely used up. Another mapping request could
 *      use some or all of the remaining entries.
 *   3. List of full groups - tid_full_list
 *      This is the list where sets that are completely used
 *      up go.
 *
 * An attempt to optimize the usage of RcvArray entries is
 * made by finding all sets of physically contiguous pages in a
 * user's buffer.
 * These physically contiguous sets are further split into
 * sizes supported by the receive engine of the HFI. The
 * resulting sets of pages are stored in struct tid_pageset,
 * which describes the sets as:
 *    * .count - number of pages in this set
 *    * .idx - starting index into struct page ** array
 *             of this set
 *
 * From this point on, the algorithm deals with the page sets
 * described above. The number of pagesets is divided by the
 * RcvArray group size to produce the number of full groups
 * needed.
 *
 * Groups from the 3 lists are manipulated using the following
 * rules:
 *   1. For each set of 8 pagesets, a complete group from
 *      tid_group_list is taken, programmed, and moved to
 *      the tid_full_list list.
 *   2. For all remaining pagesets:
 *      2.1 If the tid_used_list is empty and the tid_group_list
 *          is empty, stop processing pagesets and return only
 *          what has been programmed up to this point.
 *      2.2 If the tid_used_list is empty and the tid_group_list
 *          is not empty, move a group from tid_group_list to
 *          tid_used_list.
 *      2.3 For each group in tid_used_list, program as much as
 *          can fit into the group. If the group becomes fully
 *          used, move it to tid_full_list.
 */
int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0, need_group = 0, pinned;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	unsigned int ngroups, pageidx = 0, pageset_count,
		tididx = 0, mapped, mapped_pages = 0;
	u32 *tidlist = NULL;
	struct tid_user_buf *tidbuf;

	if (!PAGE_ALIGNED(tinfo->vaddr))
		return -EINVAL;

	tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
	if (!tidbuf)
		return -ENOMEM;

	tidbuf->vaddr = tinfo->vaddr;
	tidbuf->length = tinfo->length;
	tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
				GFP_KERNEL);
	if (!tidbuf->psets) {
		kfree(tidbuf);
		return -ENOMEM;
	}

	pinned = pin_rcv_pages(fd, tidbuf);
	if (pinned <= 0) {
		kfree(tidbuf->psets);
		kfree(tidbuf);
		return pinned;
	}

	/* Find sets of physically contiguous pages */
	tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);
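
	/*
	 * A quick illustration of the clamping below (hypothetical numbers):
	 * if find_phys_blocks() produced n_psets = 25 while tid_limit = 100
	 * and tid_used = 90, only pageset_count = 10 pagesets are programmed;
	 * the pages backing the rest are unpinned before returning.
	 */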
	/*
	 * We don't need to access this under a lock since tid_used is per
	 * process and the same process cannot be in hfi1_user_exp_rcv_clear()
	 * and hfi1_user_exp_rcv_setup() at the same time.
	 */
	spin_lock(&fd->tid_lock);
	if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
		pageset_count = fd->tid_limit - fd->tid_used;
	else
		pageset_count = tidbuf->n_psets;
	spin_unlock(&fd->tid_lock);

	if (!pageset_count)
		goto bail;

	ngroups = pageset_count / dd->rcv_entries.group_size;
	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
	if (!tidlist) {
		ret = -ENOMEM;
		goto nomem;
	}

	tididx = 0;

	/*
	 * From this point on, we are going to be using shared (between master
	 * and subcontexts) context resources. We need to take the lock.
	 */
	mutex_lock(&uctxt->exp_mutex);
	/*
	 * The first step is to program the RcvArray entries which are complete
	 * groups.
	 */
	while (ngroups && uctxt->tid_group_list.count) {
		struct tid_group *grp =
			tid_group_pop(&uctxt->tid_group_list);

		ret = program_rcvarray(fd, tidbuf, grp,
				       pageidx, dd->rcv_entries.group_size,
				       tidlist, &tididx, &mapped);
		/*
		 * If there was a failure to program the RcvArray
		 * entries for the entire group, reset the grp fields
		 * and add the grp back to the free group list.
		 */
		if (ret <= 0) {
			tid_group_add_tail(grp, &uctxt->tid_group_list);
			hfi1_cdbg(TID,
				  "Failed to program RcvArray group %d", ret);
			goto unlock;
		}

		tid_group_add_tail(grp, &uctxt->tid_full_list);
		ngroups--;
		pageidx += ret;
		mapped_pages += mapped;
	}
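
	/*
	 * Whatever did not fit into complete groups is handled below. As an
	 * illustration (hypothetical numbers): with a group_size of 8 and
	 * pageset_count = 19, the loop above programs 2 full groups (16
	 * pagesets) and the loop below places the remaining 3 pagesets into
	 * partially used groups.
	 */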
	while (pageidx < pageset_count) {
		struct tid_group *grp, *ptr;
		/*
		 * If we don't have any partially used tid groups, check
		 * if we have empty groups. If so, take one from there and
		 * put in the partially used list.
		 */
		if (!uctxt->tid_used_list.count || need_group) {
			if (!uctxt->tid_group_list.count)
				goto unlock;

			grp = tid_group_pop(&uctxt->tid_group_list);
			tid_group_add_tail(grp, &uctxt->tid_used_list);
			need_group = 0;
		}
		/*
		 * There is an optimization opportunity here - instead of
		 * fitting as many page sets as we can, check for a group
		 * later on in the list that could fit all of them.
		 */
		list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
					 list) {
			unsigned use = min_t(unsigned, pageset_count - pageidx,
					     grp->size - grp->used);

			ret = program_rcvarray(fd, tidbuf, grp,
					       pageidx, use, tidlist,
					       &tididx, &mapped);
			if (ret < 0) {
				hfi1_cdbg(TID,
					  "Failed to program RcvArray entries %d",
					  ret);
				goto unlock;
			} else if (ret > 0) {
				if (grp->used == grp->size)
					tid_group_move(grp,
						       &uctxt->tid_used_list,
						       &uctxt->tid_full_list);
				pageidx += ret;
				mapped_pages += mapped;
				need_group = 0;
				/* Check if we are done so we break out early */
				if (pageidx >= pageset_count)
					break;
			} else if (WARN_ON(ret == 0)) {
				/*
				 * If ret is 0, we did not program any entries
				 * into this group, which can only happen if
				 * we've screwed up the accounting somewhere.
				 * Warn and try to continue.
				 */
				need_group = 1;
			}
		}
	}
unlock:
	mutex_unlock(&uctxt->exp_mutex);
nomem:
	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
		  mapped_pages, ret);
	if (tididx) {
		spin_lock(&fd->tid_lock);
		fd->tid_used += tididx;
		spin_unlock(&fd->tid_lock);
		tinfo->tidcnt = tididx;
		tinfo->length = mapped_pages * PAGE_SIZE;

		if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
				 tidlist, sizeof(tidlist[0]) * tididx)) {
			/*
			 * On failure to copy to the user level, we need to undo
			 * everything done so far so we don't leak resources.
			 */
			tinfo->tidlist = (unsigned long)&tidlist;
			hfi1_user_exp_rcv_clear(fd, tinfo);
			tinfo->tidlist = 0;
			ret = -EFAULT;
			goto bail;
		}
	}

	/*
	 * If not everything was mapped (due to insufficient RcvArray entries,
	 * for example), unpin all unmapped pages so we can pin them next time.
	 */
	if (mapped_pages != pinned)
		unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages,
				(pinned - mapped_pages), false);
bail:
	kfree(tidbuf->psets);
	kfree(tidlist);
	kfree(tidbuf->pages);
	kfree(tidbuf);
	return ret > 0 ? 0 : ret;
}

int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	u32 *tidinfo;
	unsigned tididx;

	if (unlikely(tinfo->tidcnt > fd->tid_used))
		return -EINVAL;

	tidinfo = memdup_user(u64_to_user_ptr(tinfo->tidlist),
			      sizeof(tidinfo[0]) * tinfo->tidcnt);
	if (IS_ERR(tidinfo))
		return PTR_ERR(tidinfo);

	mutex_lock(&uctxt->exp_mutex);
	for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
		ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL);
		if (ret) {
			hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
				  ret);
			break;
		}
	}
	spin_lock(&fd->tid_lock);
	fd->tid_used -= tididx;
	spin_unlock(&fd->tid_lock);
	tinfo->tidcnt = tididx;
	mutex_unlock(&uctxt->exp_mutex);

	kfree(tidinfo);
	return ret;
}
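
/*
 * When TID caching is in use (fd->use_mn), tid_rb_invalidate() records the
 * tidinfo of any entry whose backing pages are invalidated in
 * fd->invalid_tids and sets the per-process TID_MMU_NOTIFY event bit. The
 * function below is the path user space uses to collect that list and
 * clear the event.
 */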
int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
			      struct hfi1_tid_info *tinfo)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	unsigned long *ev = uctxt->dd->events +
		(uctxt_offset(uctxt) + fd->subctxt);
	u32 *array;
	int ret = 0;

	/*
	 * copy_to_user() can sleep, which will leave the invalid_lock
	 * locked and cause the MMU notifier to be blocked on the lock
	 * for a long time.
	 * Copy the data to a local buffer so we can release the lock.
	 */
	array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
	if (!array)
		return -EFAULT;

	spin_lock(&fd->invalid_lock);
	if (fd->invalid_tid_idx) {
		memcpy(array, fd->invalid_tids, sizeof(*array) *
		       fd->invalid_tid_idx);
		memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
		       fd->invalid_tid_idx);
		tinfo->tidcnt = fd->invalid_tid_idx;
		fd->invalid_tid_idx = 0;
		/*
		 * Reset the user flag while still holding the lock.
		 * Otherwise, PSM can miss events.
		 */
		clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
	} else {
		tinfo->tidcnt = 0;
	}
	spin_unlock(&fd->invalid_lock);

	if (tinfo->tidcnt) {
		if (copy_to_user((void __user *)tinfo->tidlist,
				 array, sizeof(*array) * tinfo->tidcnt))
			ret = -EFAULT;
	}
	kfree(array);

	return ret;
}

static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
{
	unsigned pagecount, pageidx, setcount = 0, i;
	unsigned long pfn, this_pfn;
	struct page **pages = tidbuf->pages;
	struct tid_pageset *list = tidbuf->psets;

	if (!npages)
		return 0;

	/*
	 * Look for sets of physically contiguous pages in the user buffer.
	 * This will allow us to optimize Expected RcvArray entry usage by
	 * using the bigger supported sizes.
	 */
	pfn = page_to_pfn(pages[0]);
	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
		this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;

		/*
		 * If the pfn's are not sequential, pages are not physically
		 * contiguous.
		 */
		if (this_pfn != ++pfn) {
			/*
			 * At this point we have to loop over the set of
			 * physically contiguous pages and break them down
			 * into sizes supported by the HW.
			 * There are two main constraints:
			 * 1. The max buffer size is MAX_EXPECTED_BUFFER.
			 *    If the total set size is bigger than that,
			 *    program only a MAX_EXPECTED_BUFFER chunk.
			 * 2. The buffer size has to be a power of two. If
			 *    it is not, round down to the closest power of
			 *    two and program that size.
			 */
			while (pagecount) {
				int maxpages = pagecount;
				u32 bufsize = pagecount * PAGE_SIZE;

				if (bufsize > MAX_EXPECTED_BUFFER)
					maxpages =
						MAX_EXPECTED_BUFFER >>
						PAGE_SHIFT;
				else if (!is_power_of_2(bufsize))
					maxpages =
						rounddown_pow_of_two(bufsize) >>
						PAGE_SHIFT;

				list[setcount].idx = pageidx;
				list[setcount].count = maxpages;
				pagecount -= maxpages;
				pageidx += maxpages;
				setcount++;
			}
			pageidx = i;
			pagecount = 1;
			pfn = this_pfn;
		} else {
			pagecount++;
		}
	}
	return setcount;
}
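
/*
 * To illustrate the splitting loop above with hypothetical numbers: a run
 * of 13 physically contiguous pages (assuming 13 * PAGE_SIZE is below
 * MAX_EXPECTED_BUFFER) is recorded as three pagesets of 8, 4, and 1 pages,
 * since each pageset size must be a power of two.
 */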
/**
 * program_rcvarray() - program an RcvArray group with receive buffers
 * @fd: filedata pointer
 * @tbuf: pointer to struct tid_user_buf that has the user buffer starting
 *        virtual address, buffer length, page pointers, pagesets (array of
 *        struct tid_pageset holding information on physically contiguous
 *        chunks from the user buffer), and other fields.
 * @grp: RcvArray group
 * @start: starting index into sets array
 * @count: number of struct tid_pageset's to program
 * @tidlist: the array of u32 elements where the information about the
 *           programmed RcvArray entries is to be encoded.
 * @tididx: starting offset into tidlist
 * @pmapped: (output parameter) number of pages programmed into the RcvArray
 *           entries.
 *
 * This function will program up to 'count' number of RcvArray entries from the
 * group 'grp'. To make best use of write-combining writes, the function will
 * perform writes to the unused RcvArray entries which will be ignored by the
 * HW. Each RcvArray entry will be programmed with a physically contiguous
 * buffer chunk from the user's virtual buffer.
 *
 * Return:
 * -EINVAL if the requested count is larger than the size of the group,
 * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
 * number of RcvArray entries programmed.
 */
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
			    struct tid_group *grp,
			    unsigned int start, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	u16 idx;
	u32 tidinfo = 0, rcventry, useidx = 0;
	int mapped = 0;

	/* Count should never be larger than the group size */
	if (count > grp->size)
		return -EINVAL;

	/* Find the first unused entry in the group */
	for (idx = 0; idx < grp->size; idx++) {
		if (!(grp->map & (1 << idx))) {
			useidx = idx;
			break;
		}
		rcv_array_wc_fill(dd, grp->base + idx);
	}

	idx = 0;
	while (idx < count) {
		u16 npages, pageidx, setidx = start + idx;
		int ret = 0;

		/*
		 * If this entry in the group is used, move to the next one.
		 * If we go past the end of the group, exit the loop.
		 */
		if (useidx >= grp->size) {
			break;
		} else if (grp->map & (1 << useidx)) {
			rcv_array_wc_fill(dd, grp->base + useidx);
			useidx++;
			continue;
		}

		rcventry = grp->base + useidx;
		npages = tbuf->psets[setidx].count;
		pageidx = tbuf->psets[setidx].idx;

		ret = set_rcvarray_entry(fd, tbuf,
					 rcventry, grp, pageidx,
					 npages);
		if (ret)
			return ret;
		mapped += npages;

		tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
			EXP_TID_SET(LEN, npages);
		tidlist[(*tididx)++] = tidinfo;
		grp->used++;
		grp->map |= 1 << useidx++;
		idx++;
	}

	/* Fill the rest of the group with "blank" writes */
	for (; useidx < grp->size; useidx++)
		rcv_array_wc_fill(dd, grp->base + useidx);
	*pmapped = mapped;
	return idx;
}
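
/*
 * A note on the tidinfo encoding produced above: each u32 handed back to
 * user space packs the RcvArray pair index (IDX), which entry of the pair
 * was used (CTRL) and the page count (LEN). unprogram_rcvarray() below
 * recovers the entry index relative to uctxt->expected_base with, in
 * effect:
 *
 *	rcventry = (EXP_TID_GET(tidinfo, IDX) << 1) +
 *		   (EXP_TID_GET(tidinfo, CTRL) - 1);
 *
 * so the same values flow through hfi1_user_exp_rcv_clear() and
 * tid_rb_invalidate() when entries are torn down or invalidated.
 */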
static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages)
{
	int ret;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct tid_rb_node *node;
	struct hfi1_devdata *dd = uctxt->dd;
	dma_addr_t phys;
	struct page **pages = tbuf->pages + pageidx;

	/*
	 * Allocate the node first so we can handle a potential
	 * failure before we've programmed anything.
	 */
	node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
		       GFP_KERNEL);
	if (!node)
		return -ENOMEM;

	phys = dma_map_single(&dd->pcidev->dev, __va(page_to_phys(pages[0])),
			      npages * PAGE_SIZE, DMA_FROM_DEVICE);
	if (dma_mapping_error(&dd->pcidev->dev, phys)) {
		dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
			   phys);
		kfree(node);
		return -EFAULT;
	}

	node->fdata = fd;
	node->phys = page_to_phys(pages[0]);
	node->npages = npages;
	node->rcventry = rcventry;
	node->dma_addr = phys;
	node->grp = grp;
	node->freed = false;
	memcpy(node->pages, pages, sizeof(struct page *) * npages);

	if (fd->use_mn) {
		ret = mmu_interval_notifier_insert(
			&node->notifier, current->mm,
			tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
			&tid_mn_ops);
		if (ret)
			goto out_unmap;
		/*
		 * FIXME: This is in the wrong order, the notifier should be
		 * established before the pages are pinned by pin_rcv_pages.
		 */
		mmu_interval_read_begin(&node->notifier);
	}
	fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;

	hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
	trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
			       node->notifier.interval_tree.start, node->phys,
			       phys);
	return 0;

out_unmap:
	hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
		  node->rcventry, node->notifier.interval_tree.start,
		  node->phys, ret);
	dma_unmap_single(&dd->pcidev->dev, phys, npages * PAGE_SIZE,
			 DMA_FROM_DEVICE);
	kfree(node);
	return -EFAULT;
}

static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
			      struct tid_group **grp)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	struct tid_rb_node *node;
	u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
	u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;

	if (tididx >= uctxt->expected_count) {
		dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
			   tididx, uctxt->ctxt);
		return -EINVAL;
	}

	if (tidctrl == 0x3)
		return -EINVAL;

	rcventry = tididx + (tidctrl - 1);

	node = fd->entry_to_rb[rcventry];
	if (!node || node->rcventry != (uctxt->expected_base + rcventry))
		return -EBADF;

	if (grp)
		*grp = node->grp;

	if (fd->use_mn)
		mmu_interval_notifier_remove(&node->notifier);
	cacheless_tid_rb_remove(fd, node);

	return 0;
}

static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;

	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
				 node->npages,
				 node->notifier.interval_tree.start, node->phys,
				 node->dma_addr);

	/*
	 * Make sure device has seen the write before we unpin the
	 * pages.
	 */
	hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);

	unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);

	node->grp->used--;
	node->grp->map &= ~(1 << (node->rcventry - node->grp->base));

	if (node->grp->used == node->grp->size - 1)
		tid_group_move(node->grp, &uctxt->tid_full_list,
			       &uctxt->tid_used_list);
	else if (!node->grp->used)
		tid_group_move(node->grp, &uctxt->tid_used_list,
			       &uctxt->tid_group_list);
	kfree(node);
}
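
/*
 * Group bookkeeping in clear_tid_node() above: freeing an entry from a
 * completely used group (used drops from size to size - 1) moves that group
 * from tid_full_list back to tid_used_list, and freeing the last used entry
 * of a group moves it from tid_used_list back to tid_group_list, keeping
 * the three lists described before hfi1_user_exp_rcv_setup() consistent.
 */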
/*
 * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
 * clearing nodes in the non-cached case.
 */
static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd)
{
	struct tid_group *grp, *ptr;
	int i;

	list_for_each_entry_safe(grp, ptr, &set->list, list) {
		list_del_init(&grp->list);

		for (i = 0; i < grp->size; i++) {
			if (grp->map & (1 << i)) {
				u16 rcventry = grp->base + i;
				struct tid_rb_node *node;

				node = fd->entry_to_rb[rcventry -
						       uctxt->expected_base];
				if (!node || node->rcventry != rcventry)
					continue;

				if (fd->use_mn)
					mmu_interval_notifier_remove(
						&node->notifier);
				cacheless_tid_rb_remove(fd, node);
			}
		}
	}
}

static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq)
{
	struct tid_rb_node *node =
		container_of(mni, struct tid_rb_node, notifier);
	struct hfi1_filedata *fdata = node->fdata;
	struct hfi1_ctxtdata *uctxt = fdata->uctxt;

	if (node->freed)
		return true;

	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
				 node->notifier.interval_tree.start,
				 node->rcventry, node->npages, node->dma_addr);
	node->freed = true;

	spin_lock(&fdata->invalid_lock);
	if (fdata->invalid_tid_idx < uctxt->expected_count) {
		fdata->invalid_tids[fdata->invalid_tid_idx] =
			rcventry2tidinfo(node->rcventry - uctxt->expected_base);
		fdata->invalid_tids[fdata->invalid_tid_idx] |=
			EXP_TID_SET(LEN, node->npages);
		if (!fdata->invalid_tid_idx) {
			unsigned long *ev;

			/*
			 * hfi1_set_uevent_bits() sets a user event flag
			 * for all processes. Because calling into the
			 * driver to process TID cache invalidations is
			 * expensive and TID cache invalidations are
			 * handled on a per-process basis, we can
			 * optimize this to set the flag only for the
			 * process in question.
			 */
			ev = uctxt->dd->events +
				(uctxt_offset(uctxt) + fdata->subctxt);
			set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
		}
		fdata->invalid_tid_idx++;
	}
	spin_unlock(&fdata->invalid_lock);
	return true;
}

static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode)
{
	u32 base = fdata->uctxt->expected_base;

	fdata->entry_to_rb[tnode->rcventry - base] = NULL;
	clear_tid_node(fdata, tnode);
}