xref: /openbmc/linux/drivers/infiniband/hw/hfi1/user_exp_rcv.c (revision 19dc81b4017baffd6e919fd71cfc8dcbd5442e15)
1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
2 /*
3  * Copyright(c) 2020 Cornelis Networks, Inc.
4  * Copyright(c) 2015-2018 Intel Corporation.
5  */
6 #include <asm/page.h>
7 #include <linux/string.h>
8 
9 #include "mmu_rb.h"
10 #include "user_exp_rcv.h"
11 #include "trace.h"
12 
13 static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
14 			    struct exp_tid_set *set,
15 			    struct hfi1_filedata *fd);
16 static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages);
17 static int set_rcvarray_entry(struct hfi1_filedata *fd,
18 			      struct tid_user_buf *tbuf,
19 			      u32 rcventry, struct tid_group *grp,
20 			      u16 pageidx, unsigned int npages);
21 static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
22 				    struct tid_rb_node *tnode);
23 static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
24 			      const struct mmu_notifier_range *range,
25 			      unsigned long cur_seq);
26 static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
27 			    struct tid_group *grp,
28 			    unsigned int start, u16 count,
29 			    u32 *tidlist, unsigned int *tididx,
30 			    unsigned int *pmapped);
31 static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
32 			      struct tid_group **grp);
33 static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
34 
35 static const struct mmu_interval_notifier_ops tid_mn_ops = {
36 	.invalidate = tid_rb_invalidate,
37 };
38 
39 /*
40  * Initialize context and file private data needed for Expected
41  * receive caching. This needs to be done after the context has
42  * been configured with the eager/expected RcvEntry counts.
43  */
44 int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
45 			   struct hfi1_ctxtdata *uctxt)
46 {
47 	int ret = 0;
48 
49 	fd->entry_to_rb = kcalloc(uctxt->expected_count,
50 				  sizeof(struct rb_node *),
51 				  GFP_KERNEL);
52 	if (!fd->entry_to_rb)
53 		return -ENOMEM;
54 
55 	if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
56 		fd->invalid_tid_idx = 0;
57 		fd->invalid_tids = kcalloc(uctxt->expected_count,
58 					   sizeof(*fd->invalid_tids),
59 					   GFP_KERNEL);
60 		if (!fd->invalid_tids) {
61 			kfree(fd->entry_to_rb);
62 			fd->entry_to_rb = NULL;
63 			return -ENOMEM;
64 		}
65 		fd->use_mn = true;
66 	}
67 
68 	/*
69 	 * PSM does not have a good way to separate, count, and
70 	 * effectively enforce a limit on RcvArray entries used by
71 	 * subctxts (when context sharing is used) when TID caching
72 	 * is enabled. To help with that, we calculate a per-process
73 	 * RcvArray entry share and enforce that.
74 	 * If TID caching is not in use, PSM deals with usage on its
75 	 * own. In that case, we allow any subctxt to take all of the
76 	 * entries.
77 	 *
78 	 * Make sure that we set the tid counts only after successful
79 	 * init.
80 	 */
81 	spin_lock(&fd->tid_lock);
82 	if (uctxt->subctxt_cnt && fd->use_mn) {
83 		u16 remainder;
84 
85 		fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
86 		remainder = uctxt->expected_count % uctxt->subctxt_cnt;
87 		if (remainder && fd->subctxt < remainder)
88 			fd->tid_limit++;
89 	} else {
90 		fd->tid_limit = uctxt->expected_count;
91 	}
92 	spin_unlock(&fd->tid_lock);
93 
94 	return ret;
95 }
96 
97 void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
98 {
99 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
100 
101 	mutex_lock(&uctxt->exp_mutex);
102 	if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
103 		unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
104 	if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
105 		unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
106 	mutex_unlock(&uctxt->exp_mutex);
107 
108 	kfree(fd->invalid_tids);
109 	fd->invalid_tids = NULL;
110 
111 	kfree(fd->entry_to_rb);
112 	fd->entry_to_rb = NULL;
113 }
114 
115 /*
116  * Release pinned receive buffer pages.
117  *
118  * @mapped: true if the pages have been DMA mapped. false otherwise.
119  * @idx: Index of the first page to unpin.
120  * @npages: No of pages to unpin.
121  *
122  * If the pages have been DMA mapped (indicated by mapped parameter), their
123  * info will be passed via a struct tid_rb_node. If they haven't been mapped,
124  * their info will be passed via a struct tid_user_buf.
125  */
126 static void unpin_rcv_pages(struct hfi1_filedata *fd,
127 			    struct tid_user_buf *tidbuf,
128 			    struct tid_rb_node *node,
129 			    unsigned int idx,
130 			    unsigned int npages,
131 			    bool mapped)
132 {
133 	struct page **pages;
134 	struct hfi1_devdata *dd = fd->uctxt->dd;
135 	struct mm_struct *mm;
136 
137 	if (mapped) {
138 		dma_unmap_single(&dd->pcidev->dev, node->dma_addr,
139 				 node->npages * PAGE_SIZE, DMA_FROM_DEVICE);
140 		pages = &node->pages[idx];
141 		mm = mm_from_tid_node(node);
142 	} else {
143 		pages = &tidbuf->pages[idx];
144 		mm = current->mm;
145 	}
146 	hfi1_release_user_pages(mm, pages, npages, mapped);
147 	fd->tid_n_pinned -= npages;
148 }
149 
150 /*
151  * Pin receive buffer pages.
152  */
153 static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
154 {
155 	int pinned;
156 	unsigned int npages;
157 	unsigned long vaddr = tidbuf->vaddr;
158 	struct page **pages = NULL;
159 	struct hfi1_devdata *dd = fd->uctxt->dd;
160 
161 	/* Get the number of pages the user buffer spans */
162 	npages = num_user_pages(vaddr, tidbuf->length);
163 	if (!npages)
164 		return -EINVAL;
165 
166 	if (npages > fd->uctxt->expected_count) {
167 		dd_dev_err(dd, "Expected buffer too big\n");
168 		return -EINVAL;
169 	}
170 
171 	/* Allocate the array of struct page pointers needed for pinning */
172 	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
173 	if (!pages)
174 		return -ENOMEM;
175 
176 	/*
177 	 * Pin all the pages of the user buffer. If we can't pin all the
178 	 * pages, accept the amount pinned so far and program only that.
179 	 * User space knows how to deal with partially programmed buffers.
180 	 */
181 	if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) {
182 		kfree(pages);
183 		return -ENOMEM;
184 	}
185 
186 	pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages);
187 	if (pinned <= 0) {
188 		kfree(pages);
189 		return pinned;
190 	}
191 	tidbuf->pages = pages;
192 	tidbuf->npages = npages;
193 	fd->tid_n_pinned += pinned;
194 	return pinned;
195 }
196 
197 /*
198  * RcvArray entry allocation for Expected Receives is done by the
199  * following algorithm:
200  *
201  * The context keeps 3 lists of groups of RcvArray entries:
202  *   1. List of empty groups - tid_group_list
203  *      This list is created during user context creation and
204  *      contains elements which describe sets (of 8) of empty
205  *      RcvArray entries.
206  *   2. List of partially used groups - tid_used_list
207  *      This list contains sets of RcvArray entries which are
208  *      not completely used up. Another mapping request could
209  *      use some of all of the remaining entries.
210  *   3. List of full groups - tid_full_list
211  *      This is the list where sets that are completely used
212  *      up go.
213  *
214  * An attempt to optimize the usage of RcvArray entries is
215  * made by finding all sets of physically contiguous pages in a
216  * user's buffer.
217  * These physically contiguous sets are further split into
218  * sizes supported by the receive engine of the HFI. The
219  * resulting sets of pages are stored in struct tid_pageset,
220  * which describes the sets as:
221  *    * .count - number of pages in this set
222  *    * .idx - starting index into struct page ** array
223  *                    of this set
224  *
225  * From this point on, the algorithm deals with the page sets
226  * described above. The number of pagesets is divided by the
227  * RcvArray group size to produce the number of full groups
228  * needed.
229  *
230  * Groups from the 3 lists are manipulated using the following
231  * rules:
232  *   1. For each set of 8 pagesets, a complete group from
233  *      tid_group_list is taken, programmed, and moved to
234  *      the tid_full_list list.
235  *   2. For all remaining pagesets:
236  *      2.1 If the tid_used_list is empty and the tid_group_list
237  *          is empty, stop processing pageset and return only
238  *          what has been programmed up to this point.
239  *      2.2 If the tid_used_list is empty and the tid_group_list
240  *          is not empty, move a group from tid_group_list to
241  *          tid_used_list.
242  *      2.3 For each group is tid_used_group, program as much as
243  *          can fit into the group. If the group becomes fully
244  *          used, move it to tid_full_list.
245  */
246 int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
247 			    struct hfi1_tid_info *tinfo)
248 {
249 	int ret = 0, need_group = 0, pinned;
250 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
251 	struct hfi1_devdata *dd = uctxt->dd;
252 	unsigned int ngroups, pageidx = 0, pageset_count,
253 		tididx = 0, mapped, mapped_pages = 0;
254 	u32 *tidlist = NULL;
255 	struct tid_user_buf *tidbuf;
256 
257 	if (!PAGE_ALIGNED(tinfo->vaddr))
258 		return -EINVAL;
259 
260 	tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
261 	if (!tidbuf)
262 		return -ENOMEM;
263 
264 	tidbuf->vaddr = tinfo->vaddr;
265 	tidbuf->length = tinfo->length;
266 	tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
267 				GFP_KERNEL);
268 	if (!tidbuf->psets) {
269 		kfree(tidbuf);
270 		return -ENOMEM;
271 	}
272 
273 	pinned = pin_rcv_pages(fd, tidbuf);
274 	if (pinned <= 0) {
275 		kfree(tidbuf->psets);
276 		kfree(tidbuf);
277 		return pinned;
278 	}
279 
280 	/* Find sets of physically contiguous pages */
281 	tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);
282 
283 	/*
284 	 * We don't need to access this under a lock since tid_used is per
285 	 * process and the same process cannot be in hfi1_user_exp_rcv_clear()
286 	 * and hfi1_user_exp_rcv_setup() at the same time.
287 	 */
288 	spin_lock(&fd->tid_lock);
289 	if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
290 		pageset_count = fd->tid_limit - fd->tid_used;
291 	else
292 		pageset_count = tidbuf->n_psets;
293 	spin_unlock(&fd->tid_lock);
294 
295 	if (!pageset_count)
296 		goto bail;
297 
298 	ngroups = pageset_count / dd->rcv_entries.group_size;
299 	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
300 	if (!tidlist) {
301 		ret = -ENOMEM;
302 		goto nomem;
303 	}
304 
305 	tididx = 0;
306 
307 	/*
308 	 * From this point on, we are going to be using shared (between master
309 	 * and subcontexts) context resources. We need to take the lock.
310 	 */
311 	mutex_lock(&uctxt->exp_mutex);
312 	/*
313 	 * The first step is to program the RcvArray entries which are complete
314 	 * groups.
315 	 */
316 	while (ngroups && uctxt->tid_group_list.count) {
317 		struct tid_group *grp =
318 			tid_group_pop(&uctxt->tid_group_list);
319 
320 		ret = program_rcvarray(fd, tidbuf, grp,
321 				       pageidx, dd->rcv_entries.group_size,
322 				       tidlist, &tididx, &mapped);
323 		/*
324 		 * If there was a failure to program the RcvArray
325 		 * entries for the entire group, reset the grp fields
326 		 * and add the grp back to the free group list.
327 		 */
328 		if (ret <= 0) {
329 			tid_group_add_tail(grp, &uctxt->tid_group_list);
330 			hfi1_cdbg(TID,
331 				  "Failed to program RcvArray group %d", ret);
332 			goto unlock;
333 		}
334 
335 		tid_group_add_tail(grp, &uctxt->tid_full_list);
336 		ngroups--;
337 		pageidx += ret;
338 		mapped_pages += mapped;
339 	}
340 
341 	while (pageidx < pageset_count) {
342 		struct tid_group *grp, *ptr;
343 		/*
344 		 * If we don't have any partially used tid groups, check
345 		 * if we have empty groups. If so, take one from there and
346 		 * put in the partially used list.
347 		 */
348 		if (!uctxt->tid_used_list.count || need_group) {
349 			if (!uctxt->tid_group_list.count)
350 				goto unlock;
351 
352 			grp = tid_group_pop(&uctxt->tid_group_list);
353 			tid_group_add_tail(grp, &uctxt->tid_used_list);
354 			need_group = 0;
355 		}
356 		/*
357 		 * There is an optimization opportunity here - instead of
358 		 * fitting as many page sets as we can, check for a group
359 		 * later on in the list that could fit all of them.
360 		 */
361 		list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
362 					 list) {
363 			unsigned use = min_t(unsigned, pageset_count - pageidx,
364 					     grp->size - grp->used);
365 
366 			ret = program_rcvarray(fd, tidbuf, grp,
367 					       pageidx, use, tidlist,
368 					       &tididx, &mapped);
369 			if (ret < 0) {
370 				hfi1_cdbg(TID,
371 					  "Failed to program RcvArray entries %d",
372 					  ret);
373 				goto unlock;
374 			} else if (ret > 0) {
375 				if (grp->used == grp->size)
376 					tid_group_move(grp,
377 						       &uctxt->tid_used_list,
378 						       &uctxt->tid_full_list);
379 				pageidx += ret;
380 				mapped_pages += mapped;
381 				need_group = 0;
382 				/* Check if we are done so we break out early */
383 				if (pageidx >= pageset_count)
384 					break;
385 			} else if (WARN_ON(ret == 0)) {
386 				/*
387 				 * If ret is 0, we did not program any entries
388 				 * into this group, which can only happen if
389 				 * we've screwed up the accounting somewhere.
390 				 * Warn and try to continue.
391 				 */
392 				need_group = 1;
393 			}
394 		}
395 	}
396 unlock:
397 	mutex_unlock(&uctxt->exp_mutex);
398 nomem:
399 	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
400 		  mapped_pages, ret);
401 	if (tididx) {
402 		spin_lock(&fd->tid_lock);
403 		fd->tid_used += tididx;
404 		spin_unlock(&fd->tid_lock);
405 		tinfo->tidcnt = tididx;
406 		tinfo->length = mapped_pages * PAGE_SIZE;
407 
408 		if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
409 				 tidlist, sizeof(tidlist[0]) * tididx)) {
410 			/*
411 			 * On failure to copy to the user level, we need to undo
412 			 * everything done so far so we don't leak resources.
413 			 */
414 			tinfo->tidlist = (unsigned long)&tidlist;
415 			hfi1_user_exp_rcv_clear(fd, tinfo);
416 			tinfo->tidlist = 0;
417 			ret = -EFAULT;
418 			goto bail;
419 		}
420 	}
421 
422 	/*
423 	 * If not everything was mapped (due to insufficient RcvArray entries,
424 	 * for example), unpin all unmapped pages so we can pin them nex time.
425 	 */
426 	if (mapped_pages != pinned)
427 		unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages,
428 				(pinned - mapped_pages), false);
429 bail:
430 	kfree(tidbuf->psets);
431 	kfree(tidlist);
432 	kfree(tidbuf->pages);
433 	kfree(tidbuf);
434 	return ret > 0 ? 0 : ret;
435 }
436 
437 int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
438 			    struct hfi1_tid_info *tinfo)
439 {
440 	int ret = 0;
441 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
442 	u32 *tidinfo;
443 	unsigned tididx;
444 
445 	if (unlikely(tinfo->tidcnt > fd->tid_used))
446 		return -EINVAL;
447 
448 	tidinfo = memdup_user(u64_to_user_ptr(tinfo->tidlist),
449 			      sizeof(tidinfo[0]) * tinfo->tidcnt);
450 	if (IS_ERR(tidinfo))
451 		return PTR_ERR(tidinfo);
452 
453 	mutex_lock(&uctxt->exp_mutex);
454 	for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
455 		ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL);
456 		if (ret) {
457 			hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
458 				  ret);
459 			break;
460 		}
461 	}
462 	spin_lock(&fd->tid_lock);
463 	fd->tid_used -= tididx;
464 	spin_unlock(&fd->tid_lock);
465 	tinfo->tidcnt = tididx;
466 	mutex_unlock(&uctxt->exp_mutex);
467 
468 	kfree(tidinfo);
469 	return ret;
470 }
471 
472 int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
473 			      struct hfi1_tid_info *tinfo)
474 {
475 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
476 	unsigned long *ev = uctxt->dd->events +
477 		(uctxt_offset(uctxt) + fd->subctxt);
478 	u32 *array;
479 	int ret = 0;
480 
481 	/*
482 	 * copy_to_user() can sleep, which will leave the invalid_lock
483 	 * locked and cause the MMU notifier to be blocked on the lock
484 	 * for a long time.
485 	 * Copy the data to a local buffer so we can release the lock.
486 	 */
487 	array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
488 	if (!array)
489 		return -EFAULT;
490 
491 	spin_lock(&fd->invalid_lock);
492 	if (fd->invalid_tid_idx) {
493 		memcpy(array, fd->invalid_tids, sizeof(*array) *
494 		       fd->invalid_tid_idx);
495 		memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
496 		       fd->invalid_tid_idx);
497 		tinfo->tidcnt = fd->invalid_tid_idx;
498 		fd->invalid_tid_idx = 0;
499 		/*
500 		 * Reset the user flag while still holding the lock.
501 		 * Otherwise, PSM can miss events.
502 		 */
503 		clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
504 	} else {
505 		tinfo->tidcnt = 0;
506 	}
507 	spin_unlock(&fd->invalid_lock);
508 
509 	if (tinfo->tidcnt) {
510 		if (copy_to_user((void __user *)tinfo->tidlist,
511 				 array, sizeof(*array) * tinfo->tidcnt))
512 			ret = -EFAULT;
513 	}
514 	kfree(array);
515 
516 	return ret;
517 }
518 
519 static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
520 {
521 	unsigned pagecount, pageidx, setcount = 0, i;
522 	unsigned long pfn, this_pfn;
523 	struct page **pages = tidbuf->pages;
524 	struct tid_pageset *list = tidbuf->psets;
525 
526 	if (!npages)
527 		return 0;
528 
529 	/*
530 	 * Look for sets of physically contiguous pages in the user buffer.
531 	 * This will allow us to optimize Expected RcvArray entry usage by
532 	 * using the bigger supported sizes.
533 	 */
534 	pfn = page_to_pfn(pages[0]);
535 	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
536 		this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;
537 
538 		/*
539 		 * If the pfn's are not sequential, pages are not physically
540 		 * contiguous.
541 		 */
542 		if (this_pfn != ++pfn) {
543 			/*
544 			 * At this point we have to loop over the set of
545 			 * physically contiguous pages and break them down it
546 			 * sizes supported by the HW.
547 			 * There are two main constraints:
548 			 *     1. The max buffer size is MAX_EXPECTED_BUFFER.
549 			 *        If the total set size is bigger than that
550 			 *        program only a MAX_EXPECTED_BUFFER chunk.
551 			 *     2. The buffer size has to be a power of two. If
552 			 *        it is not, round down to the closes power of
553 			 *        2 and program that size.
554 			 */
555 			while (pagecount) {
556 				int maxpages = pagecount;
557 				u32 bufsize = pagecount * PAGE_SIZE;
558 
559 				if (bufsize > MAX_EXPECTED_BUFFER)
560 					maxpages =
561 						MAX_EXPECTED_BUFFER >>
562 						PAGE_SHIFT;
563 				else if (!is_power_of_2(bufsize))
564 					maxpages =
565 						rounddown_pow_of_two(bufsize) >>
566 						PAGE_SHIFT;
567 
568 				list[setcount].idx = pageidx;
569 				list[setcount].count = maxpages;
570 				pagecount -= maxpages;
571 				pageidx += maxpages;
572 				setcount++;
573 			}
574 			pageidx = i;
575 			pagecount = 1;
576 			pfn = this_pfn;
577 		} else {
578 			pagecount++;
579 		}
580 	}
581 	return setcount;
582 }
583 
584 /**
585  * program_rcvarray() - program an RcvArray group with receive buffers
586  * @fd: filedata pointer
587  * @tbuf: pointer to struct tid_user_buf that has the user buffer starting
588  *	  virtual address, buffer length, page pointers, pagesets (array of
589  *	  struct tid_pageset holding information on physically contiguous
590  *	  chunks from the user buffer), and other fields.
591  * @grp: RcvArray group
592  * @start: starting index into sets array
593  * @count: number of struct tid_pageset's to program
594  * @tidlist: the array of u32 elements when the information about the
595  *           programmed RcvArray entries is to be encoded.
596  * @tididx: starting offset into tidlist
597  * @pmapped: (output parameter) number of pages programmed into the RcvArray
598  *           entries.
599  *
600  * This function will program up to 'count' number of RcvArray entries from the
601  * group 'grp'. To make best use of write-combining writes, the function will
602  * perform writes to the unused RcvArray entries which will be ignored by the
603  * HW. Each RcvArray entry will be programmed with a physically contiguous
604  * buffer chunk from the user's virtual buffer.
605  *
606  * Return:
607  * -EINVAL if the requested count is larger than the size of the group,
608  * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
609  * number of RcvArray entries programmed.
610  */
611 static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
612 			    struct tid_group *grp,
613 			    unsigned int start, u16 count,
614 			    u32 *tidlist, unsigned int *tididx,
615 			    unsigned int *pmapped)
616 {
617 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
618 	struct hfi1_devdata *dd = uctxt->dd;
619 	u16 idx;
620 	u32 tidinfo = 0, rcventry, useidx = 0;
621 	int mapped = 0;
622 
623 	/* Count should never be larger than the group size */
624 	if (count > grp->size)
625 		return -EINVAL;
626 
627 	/* Find the first unused entry in the group */
628 	for (idx = 0; idx < grp->size; idx++) {
629 		if (!(grp->map & (1 << idx))) {
630 			useidx = idx;
631 			break;
632 		}
633 		rcv_array_wc_fill(dd, grp->base + idx);
634 	}
635 
636 	idx = 0;
637 	while (idx < count) {
638 		u16 npages, pageidx, setidx = start + idx;
639 		int ret = 0;
640 
641 		/*
642 		 * If this entry in the group is used, move to the next one.
643 		 * If we go past the end of the group, exit the loop.
644 		 */
645 		if (useidx >= grp->size) {
646 			break;
647 		} else if (grp->map & (1 << useidx)) {
648 			rcv_array_wc_fill(dd, grp->base + useidx);
649 			useidx++;
650 			continue;
651 		}
652 
653 		rcventry = grp->base + useidx;
654 		npages = tbuf->psets[setidx].count;
655 		pageidx = tbuf->psets[setidx].idx;
656 
657 		ret = set_rcvarray_entry(fd, tbuf,
658 					 rcventry, grp, pageidx,
659 					 npages);
660 		if (ret)
661 			return ret;
662 		mapped += npages;
663 
664 		tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
665 			EXP_TID_SET(LEN, npages);
666 		tidlist[(*tididx)++] = tidinfo;
667 		grp->used++;
668 		grp->map |= 1 << useidx++;
669 		idx++;
670 	}
671 
672 	/* Fill the rest of the group with "blank" writes */
673 	for (; useidx < grp->size; useidx++)
674 		rcv_array_wc_fill(dd, grp->base + useidx);
675 	*pmapped = mapped;
676 	return idx;
677 }
678 
679 static int set_rcvarray_entry(struct hfi1_filedata *fd,
680 			      struct tid_user_buf *tbuf,
681 			      u32 rcventry, struct tid_group *grp,
682 			      u16 pageidx, unsigned int npages)
683 {
684 	int ret;
685 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
686 	struct tid_rb_node *node;
687 	struct hfi1_devdata *dd = uctxt->dd;
688 	dma_addr_t phys;
689 	struct page **pages = tbuf->pages + pageidx;
690 
691 	/*
692 	 * Allocate the node first so we can handle a potential
693 	 * failure before we've programmed anything.
694 	 */
695 	node = kzalloc(struct_size(node, pages, npages), GFP_KERNEL);
696 	if (!node)
697 		return -ENOMEM;
698 
699 	phys = dma_map_single(&dd->pcidev->dev, __va(page_to_phys(pages[0])),
700 			      npages * PAGE_SIZE, DMA_FROM_DEVICE);
701 	if (dma_mapping_error(&dd->pcidev->dev, phys)) {
702 		dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
703 			   phys);
704 		kfree(node);
705 		return -EFAULT;
706 	}
707 
708 	node->fdata = fd;
709 	node->phys = page_to_phys(pages[0]);
710 	node->npages = npages;
711 	node->rcventry = rcventry;
712 	node->dma_addr = phys;
713 	node->grp = grp;
714 	node->freed = false;
715 	memcpy(node->pages, pages, flex_array_size(node, pages, npages));
716 
717 	if (fd->use_mn) {
718 		ret = mmu_interval_notifier_insert(
719 			&node->notifier, current->mm,
720 			tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
721 			&tid_mn_ops);
722 		if (ret)
723 			goto out_unmap;
724 		/*
725 		 * FIXME: This is in the wrong order, the notifier should be
726 		 * established before the pages are pinned by pin_rcv_pages.
727 		 */
728 		mmu_interval_read_begin(&node->notifier);
729 	}
730 	fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;
731 
732 	hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
733 	trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
734 			       node->notifier.interval_tree.start, node->phys,
735 			       phys);
736 	return 0;
737 
738 out_unmap:
739 	hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
740 		  node->rcventry, node->notifier.interval_tree.start,
741 		  node->phys, ret);
742 	dma_unmap_single(&dd->pcidev->dev, phys, npages * PAGE_SIZE,
743 			 DMA_FROM_DEVICE);
744 	kfree(node);
745 	return -EFAULT;
746 }
747 
748 static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
749 			      struct tid_group **grp)
750 {
751 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
752 	struct hfi1_devdata *dd = uctxt->dd;
753 	struct tid_rb_node *node;
754 	u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
755 	u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
756 
757 	if (tididx >= uctxt->expected_count) {
758 		dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
759 			   tididx, uctxt->ctxt);
760 		return -EINVAL;
761 	}
762 
763 	if (tidctrl == 0x3)
764 		return -EINVAL;
765 
766 	rcventry = tididx + (tidctrl - 1);
767 
768 	node = fd->entry_to_rb[rcventry];
769 	if (!node || node->rcventry != (uctxt->expected_base + rcventry))
770 		return -EBADF;
771 
772 	if (grp)
773 		*grp = node->grp;
774 
775 	if (fd->use_mn)
776 		mmu_interval_notifier_remove(&node->notifier);
777 	cacheless_tid_rb_remove(fd, node);
778 
779 	return 0;
780 }
781 
782 static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
783 {
784 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
785 	struct hfi1_devdata *dd = uctxt->dd;
786 
787 	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
788 				 node->npages,
789 				 node->notifier.interval_tree.start, node->phys,
790 				 node->dma_addr);
791 
792 	/*
793 	 * Make sure device has seen the write before we unpin the
794 	 * pages.
795 	 */
796 	hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);
797 
798 	unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);
799 
800 	node->grp->used--;
801 	node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
802 
803 	if (node->grp->used == node->grp->size - 1)
804 		tid_group_move(node->grp, &uctxt->tid_full_list,
805 			       &uctxt->tid_used_list);
806 	else if (!node->grp->used)
807 		tid_group_move(node->grp, &uctxt->tid_used_list,
808 			       &uctxt->tid_group_list);
809 	kfree(node);
810 }
811 
812 /*
813  * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
814  * clearing nodes in the non-cached case.
815  */
816 static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
817 			    struct exp_tid_set *set,
818 			    struct hfi1_filedata *fd)
819 {
820 	struct tid_group *grp, *ptr;
821 	int i;
822 
823 	list_for_each_entry_safe(grp, ptr, &set->list, list) {
824 		list_del_init(&grp->list);
825 
826 		for (i = 0; i < grp->size; i++) {
827 			if (grp->map & (1 << i)) {
828 				u16 rcventry = grp->base + i;
829 				struct tid_rb_node *node;
830 
831 				node = fd->entry_to_rb[rcventry -
832 							  uctxt->expected_base];
833 				if (!node || node->rcventry != rcventry)
834 					continue;
835 
836 				if (fd->use_mn)
837 					mmu_interval_notifier_remove(
838 						&node->notifier);
839 				cacheless_tid_rb_remove(fd, node);
840 			}
841 		}
842 	}
843 }
844 
845 static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
846 			      const struct mmu_notifier_range *range,
847 			      unsigned long cur_seq)
848 {
849 	struct tid_rb_node *node =
850 		container_of(mni, struct tid_rb_node, notifier);
851 	struct hfi1_filedata *fdata = node->fdata;
852 	struct hfi1_ctxtdata *uctxt = fdata->uctxt;
853 
854 	if (node->freed)
855 		return true;
856 
857 	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
858 				 node->notifier.interval_tree.start,
859 				 node->rcventry, node->npages, node->dma_addr);
860 	node->freed = true;
861 
862 	spin_lock(&fdata->invalid_lock);
863 	if (fdata->invalid_tid_idx < uctxt->expected_count) {
864 		fdata->invalid_tids[fdata->invalid_tid_idx] =
865 			rcventry2tidinfo(node->rcventry - uctxt->expected_base);
866 		fdata->invalid_tids[fdata->invalid_tid_idx] |=
867 			EXP_TID_SET(LEN, node->npages);
868 		if (!fdata->invalid_tid_idx) {
869 			unsigned long *ev;
870 
871 			/*
872 			 * hfi1_set_uevent_bits() sets a user event flag
873 			 * for all processes. Because calling into the
874 			 * driver to process TID cache invalidations is
875 			 * expensive and TID cache invalidations are
876 			 * handled on a per-process basis, we can
877 			 * optimize this to set the flag only for the
878 			 * process in question.
879 			 */
880 			ev = uctxt->dd->events +
881 				(uctxt_offset(uctxt) + fdata->subctxt);
882 			set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
883 		}
884 		fdata->invalid_tid_idx++;
885 	}
886 	spin_unlock(&fdata->invalid_lock);
887 	return true;
888 }
889 
890 static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
891 				    struct tid_rb_node *tnode)
892 {
893 	u32 base = fdata->uctxt->expected_base;
894 
895 	fdata->entry_to_rb[tnode->rcventry - base] = NULL;
896 	clear_tid_node(fdata, tnode);
897 }
898