xref: /openbmc/linux/drivers/infiniband/hw/mlx5/odp.c (revision 4f6cce39)
1 /*
2  * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 
33 #include <rdma/ib_umem.h>
34 #include <rdma/ib_umem_odp.h>
35 
36 #include "mlx5_ib.h"
37 #include "cmd.h"
38 
39 #define MAX_PREFETCH_LEN (4*1024*1024U)
40 
41 /* Timeout in ms to wait for an active mmu notifier to complete when handling
42  * a pagefault. */
43 #define MMU_NOTIFIER_TIMEOUT 1000
44 
45 #define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
46 #define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
47 #define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
48 #define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
49 #define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))
50 
51 #define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT
52 
53 static u64 mlx5_imr_ksm_entries;
54 
55 static int check_parent(struct ib_umem_odp *odp,
56 			       struct mlx5_ib_mr *parent)
57 {
58 	struct mlx5_ib_mr *mr = odp->private;
59 
60 	return mr && mr->parent == parent;
61 }
62 
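/*
 * Return the next ODP umem in the context's interval tree that is a leaf
 * of the same implicit parent MR, or NULL if there is none. The tree is
 * walked under the context's umem_rwsem read lock.
 */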
63 static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp)
64 {
65 	struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent;
66 	struct ib_ucontext *ctx = odp->umem->context;
67 	struct rb_node *rb;
68 
69 	down_read(&ctx->umem_rwsem);
70 	while (1) {
71 		rb = rb_next(&odp->interval_tree.rb);
72 		if (!rb)
73 			goto not_found;
74 		odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
75 		if (check_parent(odp, parent))
76 			goto end;
77 	}
78 not_found:
79 	odp = NULL;
80 end:
81 	up_read(&ctx->umem_rwsem);
82 	return odp;
83 }
84 
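/*
 * Find the first ODP umem in the context's interval tree that overlaps
 * [start, start + length] and is a leaf of the given implicit parent MR.
 * Returns NULL if no such umem exists.
 */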
85 static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx,
86 				      u64 start, u64 length,
87 				      struct mlx5_ib_mr *parent)
88 {
89 	struct ib_umem_odp *odp;
90 	struct rb_node *rb;
91 
92 	down_read(&ctx->umem_rwsem);
93 	odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length);
94 	if (!odp)
95 		goto end;
96 
97 	while (1) {
98 		if (check_parent(odp, parent))
99 			goto end;
100 		rb = rb_next(&odp->interval_tree.rb);
101 		if (!rb)
102 			goto not_found;
103 		odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
104 		if (ib_umem_start(odp->umem) > start + length)
105 			goto not_found;
106 	}
107 not_found:
108 	odp = NULL;
109 end:
110 	up_read(&ctx->umem_rwsem);
111 	return odp;
112 }
113 
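/*
 * Populate the KLM entries of an implicit MR's KSM mkey. With
 * MLX5_IB_UPD_XLT_ZAP, all entries point at the null mkey; otherwise each
 * MLX5_IMR_MTT_SIZE slot points at the lkey of the child MTT MR that
 * currently covers it, or at the null mkey if no leaf is present there.
 */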
114 void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
115 			   size_t nentries, struct mlx5_ib_mr *mr, int flags)
116 {
117 	struct ib_pd *pd = mr->ibmr.pd;
118 	struct ib_ucontext *ctx = pd->uobject->context;
119 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
120 	struct ib_umem_odp *odp;
121 	unsigned long va;
122 	int i;
123 
124 	if (flags & MLX5_IB_UPD_XLT_ZAP) {
125 		for (i = 0; i < nentries; i++, pklm++) {
126 			pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
127 			pklm->key = cpu_to_be32(dev->null_mkey);
128 			pklm->va = 0;
129 		}
130 		return;
131 	}
132 
133 	odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE,
134 			     nentries * MLX5_IMR_MTT_SIZE, mr);
135 
136 	for (i = 0; i < nentries; i++, pklm++) {
137 		pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
138 		va = (offset + i) * MLX5_IMR_MTT_SIZE;
139 		if (odp && odp->umem->address == va) {
140 			struct mlx5_ib_mr *mtt = odp->private;
141 
142 			pklm->key = cpu_to_be32(mtt->ibmr.lkey);
143 			odp = odp_next(odp);
144 		} else {
145 			pklm->key = cpu_to_be32(dev->null_mkey);
146 		}
147 		mlx5_ib_dbg(dev, "[%d] va %lx key %x\n",
148 			    i, va, be32_to_cpu(pklm->key));
149 	}
150 }
151 
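/*
 * Deferred release of a leaf MTT MR. After waiting out concurrent page
 * fault handlers (SRCU), a leaf that is still marked dying has its umem
 * released, its KSM slot in the parent zapped (if the parent is live) and
 * its cached mkey freed. Either way the parent's num_leaf_free count is
 * dropped and any waiter in mlx5_ib_free_implicit_mr() is woken.
 */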
152 static void mr_leaf_free_action(struct work_struct *work)
153 {
154 	struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
155 	int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT;
156 	struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
157 
158 	mr->parent = NULL;
159 	synchronize_srcu(&mr->dev->mr_srcu);
160 
161 	if (!READ_ONCE(odp->dying)) {
162 		mr->parent = imr;
163 		if (atomic_dec_and_test(&imr->num_leaf_free))
164 			wake_up(&imr->q_leaf_free);
165 		return;
166 	}
167 
168 	ib_umem_release(odp->umem);
169 	if (imr->live)
170 		mlx5_ib_update_xlt(imr, idx, 1, 0,
171 				   MLX5_IB_UPD_XLT_INDIRECT |
172 				   MLX5_IB_UPD_XLT_ATOMIC);
173 	mlx5_mr_cache_free(mr->dev, mr);
174 
175 	if (atomic_dec_and_test(&imr->num_leaf_free))
176 		wake_up(&imr->q_leaf_free);
177 }
178 
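/*
 * MMU notifier callback for an ODP umem: zap the HW translation entries
 * covering [start, end) in aligned chunks, unmap the DMA pages, and, if
 * this empties an implicit leaf, mark it dying and schedule its deferred
 * free.
 */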
179 void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
180 			      unsigned long end)
181 {
182 	struct mlx5_ib_mr *mr;
183 	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
184 				    sizeof(struct mlx5_mtt)) - 1;
185 	u64 idx = 0, blk_start_idx = 0;
186 	int in_block = 0;
187 	u64 addr;
188 
189 	if (!umem || !umem->odp_data) {
190 		pr_err("invalidation called on NULL umem or non-ODP umem\n");
191 		return;
192 	}
193 
194 	mr = umem->odp_data->private;
195 
196 	if (!mr || !mr->ibmr.pd)
197 		return;
198 
199 	start = max_t(u64, ib_umem_start(umem), start);
200 	end = min_t(u64, ib_umem_end(umem), end);
201 
202 	/*
203 	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
204 	 * while we are doing the invalidation, no page fault will attempt to
205 	 * overwrite the same MTTs.  Concurrent invalidations might race us,
206 	 * but they will write 0s as well, so no difference in the end result.
207 	 */
208 
209 	for (addr = start; addr < end; addr += (u64)umem->page_size) {
210 		idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
211 		/*
212 		 * Strive to write the MTTs in chunks, but avoid overwriting
213 		 * non-existing MTTs. The huristic here can be improved to
214 		 * non-existing MTTs. The heuristic here can be improved to
215 		 * estimate the cost of another UMR vs. the cost of a bigger
216 		 * UMR.
217 		if (umem->odp_data->dma_list[idx] &
218 		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
219 			if (!in_block) {
220 				blk_start_idx = idx;
221 				in_block = 1;
222 			}
223 		} else {
224 			u64 umr_offset = idx & umr_block_mask;
225 
226 			if (in_block && umr_offset == 0) {
227 				mlx5_ib_update_xlt(mr, blk_start_idx,
228 						   idx - blk_start_idx,
229 						   PAGE_SHIFT,
230 						   MLX5_IB_UPD_XLT_ZAP |
231 						   MLX5_IB_UPD_XLT_ATOMIC);
232 				in_block = 0;
233 			}
234 		}
235 	}
236 	if (in_block)
237 		mlx5_ib_update_xlt(mr, blk_start_idx,
238 				   idx - blk_start_idx + 1,
239 				   PAGE_SHIFT,
240 				   MLX5_IB_UPD_XLT_ZAP |
241 				   MLX5_IB_UPD_XLT_ATOMIC);
242 	/*
243 	 * We are now sure that the device will not access the
244 	 * memory. We can safely unmap it, and mark it as dirty if
245 	 * needed.
246 	 */
247 
248 	ib_umem_odp_unmap_dma_pages(umem, start, end);
249 
250 	if (unlikely(!umem->npages && mr->parent &&
251 		     !umem->odp_data->dying)) {
252 		WRITE_ONCE(umem->odp_data->dying, 1);
253 		atomic_inc(&mr->parent->num_leaf_free);
254 		schedule_work(&umem->odp_data->work);
255 	}
256 }
257 
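/* Fill dev->odp_caps from the device's paging and per-transport ODP capabilities. */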
258 void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
259 {
260 	struct ib_odp_caps *caps = &dev->odp_caps;
261 
262 	memset(caps, 0, sizeof(*caps));
263 
264 	if (!MLX5_CAP_GEN(dev->mdev, pg))
265 		return;
266 
267 	caps->general_caps = IB_ODP_SUPPORT;
268 
269 	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
270 		dev->odp_max_size = U64_MAX;
271 	else
272 		dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);
273 
274 	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
275 		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
276 
277 	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
278 		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
279 
280 	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
281 		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;
282 
283 	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
284 		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;
285 
286 	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
287 		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;
288 
289 	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
290 		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
291 
292 	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
293 	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
294 	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
295 		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
296 
297 	return;
298 }
299 
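/*
 * Translate the lkey taken from a faulting WQE into its mlx5_ib_mr.
 * Returns NULL if the mkey does not exist, is not an MR, or is not yet
 * live.
 */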
300 static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
301 						   u32 key)
302 {
303 	u32 base_key = mlx5_base_mkey(key);
304 	struct mlx5_core_mkey *mmkey = __mlx5_mr_lookup(dev->mdev, base_key);
305 	struct mlx5_ib_mr *mr;
306 
307 	if (!mmkey || mmkey->key != key || mmkey->type != MLX5_MKEY_MR)
308 		return NULL;
309 
310 	mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
311 
312 	if (!mr->live)
313 		return NULL;
314 
315 	return container_of(mmkey, struct mlx5_ib_mr, mmkey);
316 }
317 
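/* Ask the device to resume the faulting WQ/QP, optionally completing it in error. */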
318 static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
319 				      struct mlx5_pagefault *pfault,
320 				      int error)
321 {
322 	int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
323 		     pfault->wqe.wq_num : pfault->token;
324 	int ret = mlx5_core_page_fault_resume(dev->mdev,
325 					      pfault->token,
326 					      wq_num,
327 					      pfault->type,
328 					      error);
329 	if (ret)
330 		mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n",
331 			    wq_num);
332 }
333 
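/*
 * Allocate an MR from the cache for implicit ODP: either the top-level
 * KSM-based indirect MR or a per-MLX5_IMR_MTT_SIZE MTT leaf. The
 * translation table is programmed zapped and enabled.
 */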
334 static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
335 					    struct ib_umem *umem,
336 					    bool ksm, int access_flags)
337 {
338 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
339 	struct mlx5_ib_mr *mr;
340 	int err;
341 
342 	mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY :
343 					    MLX5_IMR_MTT_CACHE_ENTRY);
344 
345 	if (IS_ERR(mr))
346 		return mr;
347 
348 	mr->ibmr.pd = pd;
349 
350 	mr->dev = dev;
351 	mr->access_flags = access_flags;
352 	mr->mmkey.iova = 0;
353 	mr->umem = umem;
354 
355 	if (ksm) {
356 		err = mlx5_ib_update_xlt(mr, 0,
357 					 mlx5_imr_ksm_entries,
358 					 MLX5_KSM_PAGE_SHIFT,
359 					 MLX5_IB_UPD_XLT_INDIRECT |
360 					 MLX5_IB_UPD_XLT_ZAP |
361 					 MLX5_IB_UPD_XLT_ENABLE);
362 
363 	} else {
364 		err = mlx5_ib_update_xlt(mr, 0,
365 					 MLX5_IMR_MTT_ENTRIES,
366 					 PAGE_SHIFT,
367 					 MLX5_IB_UPD_XLT_ZAP |
368 					 MLX5_IB_UPD_XLT_ENABLE |
369 					 MLX5_IB_UPD_XLT_ATOMIC);
370 	}
371 
372 	if (err)
373 		goto fail;
374 
375 	mr->ibmr.lkey = mr->mmkey.key;
376 	mr->ibmr.rkey = mr->mmkey.key;
377 
378 	mr->live = 1;
379 
380 	mlx5_ib_dbg(dev, "key %x dev %p mr %p\n",
381 		    mr->mmkey.key, dev->mdev, mr);
382 
383 	return mr;
384 
385 fail:
386 	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
387 	mlx5_mr_cache_free(dev, mr);
388 
389 	return ERR_PTR(err);
390 }
391 
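/*
 * Ensure leaf MTT MRs exist for every MLX5_IMR_MTT_SIZE chunk covering
 * [io_virt, io_virt + bcnt), allocating missing leaves and updating the
 * parent's KSM entries as needed. Returns the leaf umem that covers
 * io_virt, or an ERR_PTR on failure.
 */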
392 static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr,
393 						u64 io_virt, size_t bcnt)
394 {
395 	struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context;
396 	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device);
397 	struct ib_umem_odp *odp, *result = NULL;
398 	u64 addr = io_virt & MLX5_IMR_MTT_MASK;
399 	int nentries = 0, start_idx = 0, ret;
400 	struct mlx5_ib_mr *mtt;
401 	struct ib_umem *umem;
402 
403 	mutex_lock(&mr->umem->odp_data->umem_mutex);
404 	odp = odp_lookup(ctx, addr, 1, mr);
405 
406 	mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n",
407 		    io_virt, bcnt, addr, odp);
408 
409 next_mr:
410 	if (likely(odp)) {
411 		if (nentries)
412 			nentries++;
413 	} else {
414 		umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE);
415 		if (IS_ERR(umem)) {
416 			mutex_unlock(&mr->umem->odp_data->umem_mutex);
417 			return ERR_CAST(umem);
418 		}
419 
420 		mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags);
421 		if (IS_ERR(mtt)) {
422 			mutex_unlock(&mr->umem->odp_data->umem_mutex);
423 			ib_umem_release(umem);
424 			return ERR_CAST(mtt);
425 		}
426 
427 		odp = umem->odp_data;
428 		odp->private = mtt;
429 		mtt->umem = umem;
430 		mtt->mmkey.iova = addr;
431 		mtt->parent = mr;
432 		INIT_WORK(&odp->work, mr_leaf_free_action);
433 
434 		if (!nentries)
435 			start_idx = addr >> MLX5_IMR_MTT_SHIFT;
436 		nentries++;
437 	}
438 
439 	odp->dying = 0;
440 
441 	/* Return the first odp if the region is not covered by a single one */
442 	if (likely(!result))
443 		result = odp;
444 
445 	addr += MLX5_IMR_MTT_SIZE;
446 	if (unlikely(addr < io_virt + bcnt)) {
447 		odp = odp_next(odp);
448 		if (odp && odp->umem->address != addr)
449 			odp = NULL;
450 		goto next_mr;
451 	}
452 
453 	if (unlikely(nentries)) {
454 		ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0,
455 					 MLX5_IB_UPD_XLT_INDIRECT |
456 					 MLX5_IB_UPD_XLT_ATOMIC);
457 		if (ret) {
458 			mlx5_ib_err(dev, "Failed to update PAS\n");
459 			result = ERR_PTR(ret);
460 		}
461 	}
462 
463 	mutex_unlock(&mr->umem->odp_data->umem_mutex);
464 	return result;
465 }
466 
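/*
 * Create the parent MR for an implicit ODP registration: a zero-length
 * ODP umem paired with a KSM-based indirect mkey whose leaves are
 * populated lazily on page faults.
 */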
467 struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
468 					     int access_flags)
469 {
470 	struct ib_ucontext *ctx = pd->ibpd.uobject->context;
471 	struct mlx5_ib_mr *imr;
472 	struct ib_umem *umem;
473 
474 	umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0);
475 	if (IS_ERR(umem))
476 		return ERR_CAST(umem);
477 
478 	imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
479 	if (IS_ERR(imr)) {
480 		ib_umem_release(umem);
481 		return ERR_CAST(imr);
482 	}
483 
484 	imr->umem = umem;
485 	init_waitqueue_head(&imr->q_leaf_free);
486 	atomic_set(&imr->num_leaf_free, 0);
487 
488 	return imr;
489 }
490 
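/*
 * Per-umem callback used while destroying an implicit MR: unmap the
 * leaf's DMA pages and, unless it is already dying, mark it dying and
 * schedule its deferred free.
 */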
491 static int mr_leaf_free(struct ib_umem *umem, u64 start,
492 			u64 end, void *cookie)
493 {
494 	struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie;
495 
496 	if (mr->parent != imr)
497 		return 0;
498 
499 	ib_umem_odp_unmap_dma_pages(umem,
500 				    ib_umem_start(umem),
501 				    ib_umem_end(umem));
502 
503 	if (umem->odp_data->dying)
504 		return 0;
505 
506 	WRITE_ONCE(umem->odp_data->dying, 1);
507 	atomic_inc(&imr->num_leaf_free);
508 	schedule_work(&umem->odp_data->work);
509 
510 	return 0;
511 }
512 
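/*
 * Tear down all leaves of an implicit MR and wait until every scheduled
 * mr_leaf_free_action() has completed.
 */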
513 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
514 {
515 	struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context;
516 
517 	down_read(&ctx->umem_rwsem);
518 	rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX,
519 				      mr_leaf_free, imr);
520 	up_read(&ctx->umem_rwsem);
521 
522 	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
523 }
524 
525 /*
526  * Handle a single data segment in a page-fault WQE or RDMA region.
527  *
528  * Returns number of pages retrieved on success. The caller may continue to
529  * the next data segment.
530  * Can return the following error codes:
531  * -EAGAIN to designate a temporary error. The caller will abort handling the
532  *  page fault and resolve it.
533  * -EFAULT when there's an error mapping the requested pages. The caller will
534  *  abort the page fault handling.
535  */
536 static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
537 					 u32 key, u64 io_virt, size_t bcnt,
538 					 u32 *bytes_committed,
539 					 u32 *bytes_mapped)
540 {
541 	int srcu_key;
542 	unsigned int current_seq = 0;
543 	u64 start_idx;
544 	int npages = 0, ret = 0;
545 	struct mlx5_ib_mr *mr;
546 	u64 access_mask = ODP_READ_ALLOWED_BIT;
547 	struct ib_umem_odp *odp;
548 	int implicit = 0;
549 	size_t size;
550 
551 	srcu_key = srcu_read_lock(&dev->mr_srcu);
552 	mr = mlx5_ib_odp_find_mr_lkey(dev, key);
553 	/*
554 	 * If we didn't find the MR, it means the MR was closed while we were
555 	 * handling the ODP event. In this case we return -EFAULT so that the
556 	 * QP will be closed.
557 	 */
558 	if (!mr || !mr->ibmr.pd) {
559 		mlx5_ib_dbg(dev, "Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
560 			    key);
561 		ret = -EFAULT;
562 		goto srcu_unlock;
563 	}
564 	if (!mr->umem->odp_data) {
565 		mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
566 			    key);
567 		if (bytes_mapped)
568 			*bytes_mapped +=
569 				(bcnt - *bytes_committed);
570 		goto srcu_unlock;
571 	}
572 
573 	/*
574 	 * Avoid branches - this code will perform correctly
575 	 * in all iterations (in iteration 2 and above,
576 	 * bytes_committed == 0).
577 	 */
578 	io_virt += *bytes_committed;
579 	bcnt -= *bytes_committed;
580 
581 	if (!mr->umem->odp_data->page_list) {
582 		odp = implicit_mr_get_data(mr, io_virt, bcnt);
583 
584 		if (IS_ERR(odp)) {
585 			ret = PTR_ERR(odp);
586 			goto srcu_unlock;
587 		}
588 		mr = odp->private;
589 		implicit = 1;
590 
591 	} else {
592 		odp = mr->umem->odp_data;
593 	}
594 
595 next_mr:
596 	current_seq = READ_ONCE(odp->notifiers_seq);
597 	/*
598 	 * Ensure the sequence number is valid for some time before we call
599 	 * gup.
600 	 */
601 	smp_rmb();
602 
603 	size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt);
604 	start_idx = (io_virt - (mr->mmkey.iova & PAGE_MASK)) >> PAGE_SHIFT;
605 
606 	if (mr->umem->writable)
607 		access_mask |= ODP_WRITE_ALLOWED_BIT;
608 
609 	ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size,
610 					access_mask, current_seq);
611 
612 	if (ret < 0)
613 		goto srcu_unlock;
614 
615 	if (ret > 0) {
616 		int np = ret;
617 
618 		mutex_lock(&odp->umem_mutex);
619 		if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
620 			/*
621 			 * No need to check whether the MTTs really belong to
622 			 * this MR, since ib_umem_odp_map_dma_pages already
623 			 * checks this.
624 			 */
625 			ret = mlx5_ib_update_xlt(mr, start_idx, np,
626 						 PAGE_SHIFT,
627 						 MLX5_IB_UPD_XLT_ATOMIC);
628 		} else {
629 			ret = -EAGAIN;
630 		}
631 		mutex_unlock(&odp->umem_mutex);
632 		if (ret < 0) {
633 			if (ret != -EAGAIN)
634 				mlx5_ib_err(dev, "Failed to update mkey page tables\n");
635 			goto srcu_unlock;
636 		}
637 
638 		if (bytes_mapped) {
639 			u32 new_mappings = np * PAGE_SIZE -
640 				(io_virt - round_down(io_virt, PAGE_SIZE));
641 			*bytes_mapped += min_t(u32, new_mappings, size);
642 		}
643 
644 		npages += np;
645 	}
646 
647 	bcnt -= size;
648 	if (unlikely(bcnt)) {
649 		struct ib_umem_odp *next;
650 
651 		io_virt += size;
652 		next = odp_next(odp);
653 		if (unlikely(!next || next->umem->address != io_virt)) {
654 			mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
655 				    io_virt, next);
656 			ret = -EAGAIN;
657 			goto srcu_unlock_no_wait;
658 		}
659 		odp = next;
660 		mr = odp->private;
661 		goto next_mr;
662 	}
663 
664 srcu_unlock:
665 	if (ret == -EAGAIN) {
666 		if (implicit || !odp->dying) {
667 			unsigned long timeout =
668 				msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
669 
670 			if (!wait_for_completion_timeout(
671 					&odp->notifier_completion,
672 					timeout)) {
673 				mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n",
674 					     current_seq, odp->notifiers_seq);
675 			}
676 		} else {
677 			/* The MR is being killed, kill the QP as well. */
678 			ret = -EFAULT;
679 		}
680 	}
681 
682 srcu_unlock_no_wait:
683 	srcu_read_unlock(&dev->mr_srcu, srcu_key);
684 	*bytes_committed = 0;
685 	return ret ? ret : npages;
686 }
687 
688 /**
689  * Parse a series of data segments for page fault handling.
690  *
691  * @qp: the QP on which the fault occurred.
692  * @pfault: contains page fault information.
693  * @wqe: points at the first data segment in the WQE.
694  * @wqe_end: points after the end of the WQE.
695  * @bytes_mapped: receives the number of bytes that the function was able to
696  *                map. This allows the caller to decide intelligently whether
697  *                enough memory was mapped to resolve the page fault
698  *                successfully (e.g. enough for the next MTU, or the entire
699  *                WQE).
700  * @total_wqe_bytes: receives the total data size of this WQE in bytes (minus
701  *                   the committed bytes).
702  *
703  * Returns the number of pages loaded if positive, zero for an empty WQE, or a
704  * negative error code.
705  */
706 static int pagefault_data_segments(struct mlx5_ib_dev *dev,
707 				   struct mlx5_pagefault *pfault,
708 				   struct mlx5_ib_qp *qp, void *wqe,
709 				   void *wqe_end, u32 *bytes_mapped,
710 				   u32 *total_wqe_bytes, int receive_queue)
711 {
712 	int ret = 0, npages = 0;
713 	u64 io_virt;
714 	u32 key;
715 	u32 byte_count;
716 	size_t bcnt;
717 	int inline_segment;
718 
719 	/* Skip SRQ next-WQE segment. */
720 	if (receive_queue && qp->ibqp.srq)
721 		wqe += sizeof(struct mlx5_wqe_srq_next_seg);
722 
723 	if (bytes_mapped)
724 		*bytes_mapped = 0;
725 	if (total_wqe_bytes)
726 		*total_wqe_bytes = 0;
727 
728 	while (wqe < wqe_end) {
729 		struct mlx5_wqe_data_seg *dseg = wqe;
730 
731 		io_virt = be64_to_cpu(dseg->addr);
732 		key = be32_to_cpu(dseg->lkey);
733 		byte_count = be32_to_cpu(dseg->byte_count);
734 		inline_segment = !!(byte_count &  MLX5_INLINE_SEG);
735 		bcnt	       = byte_count & ~MLX5_INLINE_SEG;
736 
737 		if (inline_segment) {
738 			bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
739 			wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
740 				     16);
741 		} else {
742 			wqe += sizeof(*dseg);
743 		}
744 
745 		/* receive WQE end of sg list. */
746 		if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
747 		    io_virt == 0)
748 			break;
749 
750 		if (!inline_segment && total_wqe_bytes) {
751 			*total_wqe_bytes += bcnt - min_t(size_t, bcnt,
752 					pfault->bytes_committed);
753 		}
754 
755 		/* A zero length data segment designates a length of 2GB. */
756 		if (bcnt == 0)
757 			bcnt = 1U << 31;
758 
759 		if (inline_segment || bcnt <= pfault->bytes_committed) {
760 			pfault->bytes_committed -=
761 				min_t(size_t, bcnt,
762 				      pfault->bytes_committed);
763 			continue;
764 		}
765 
766 		ret = pagefault_single_data_segment(dev, key, io_virt, bcnt,
767 						    &pfault->bytes_committed,
768 						    bytes_mapped);
769 		if (ret < 0)
770 			break;
771 		npages += ret;
772 	}
773 
774 	return ret < 0 ? ret : npages;
775 }
776 
777 static const u32 mlx5_ib_odp_opcode_cap[] = {
778 	[MLX5_OPCODE_SEND]	       = IB_ODP_SUPPORT_SEND,
779 	[MLX5_OPCODE_SEND_IMM]	       = IB_ODP_SUPPORT_SEND,
780 	[MLX5_OPCODE_SEND_INVAL]       = IB_ODP_SUPPORT_SEND,
781 	[MLX5_OPCODE_RDMA_WRITE]       = IB_ODP_SUPPORT_WRITE,
782 	[MLX5_OPCODE_RDMA_WRITE_IMM]   = IB_ODP_SUPPORT_WRITE,
783 	[MLX5_OPCODE_RDMA_READ]	       = IB_ODP_SUPPORT_READ,
784 	[MLX5_OPCODE_ATOMIC_CS]	       = IB_ODP_SUPPORT_ATOMIC,
785 	[MLX5_OPCODE_ATOMIC_FA]	       = IB_ODP_SUPPORT_ATOMIC,
786 };
787 
788 /*
789  * Parse initiator WQE. Advances the wqe pointer to point at the
790  * scatter-gather list, and sets wqe_end to the end of the WQE.
791  */
792 static int mlx5_ib_mr_initiator_pfault_handler(
793 	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
794 	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
795 {
796 	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
797 	u16 wqe_index = pfault->wqe.wqe_index;
798 	u32 transport_caps;
799 	struct mlx5_base_av *av;
800 	unsigned ds, opcode;
801 #if defined(DEBUG)
802 	u32 ctrl_wqe_index, ctrl_qpn;
803 #endif
804 	u32 qpn = qp->trans_qp.base.mqp.qpn;
805 
806 	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
807 	if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
808 		mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
809 			    ds, wqe_length);
810 		return -EFAULT;
811 	}
812 
813 	if (ds == 0) {
814 		mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
815 			    wqe_index, qpn);
816 		return -EFAULT;
817 	}
818 
819 #if defined(DEBUG)
820 	ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
821 			MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
822 			MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
823 	if (wqe_index != ctrl_wqe_index) {
824 		mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
825 			    wqe_index, qpn,
826 			    ctrl_wqe_index);
827 		return -EFAULT;
828 	}
829 
830 	ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
831 		MLX5_WQE_CTRL_QPN_SHIFT;
832 	if (qpn != ctrl_qpn) {
833 		mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
834 			    wqe_index, qpn,
835 			    ctrl_qpn);
836 		return -EFAULT;
837 	}
838 #endif /* DEBUG */
839 
840 	*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
841 	*wqe += sizeof(*ctrl);
842 
843 	opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
844 		 MLX5_WQE_CTRL_OPCODE_MASK;
845 
846 	switch (qp->ibqp.qp_type) {
847 	case IB_QPT_RC:
848 		transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps;
849 		break;
850 	case IB_QPT_UD:
851 		transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps;
852 		break;
853 	default:
854 		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n",
855 			    qp->ibqp.qp_type);
856 		return -EFAULT;
857 	}
858 
859 	if (unlikely(opcode >= sizeof(mlx5_ib_odp_opcode_cap) /
860 	    sizeof(mlx5_ib_odp_opcode_cap[0]) ||
861 	    !(transport_caps & mlx5_ib_odp_opcode_cap[opcode]))) {
862 		mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode 0x%x\n",
863 			    opcode);
864 		return -EFAULT;
865 	}
866 
867 	if (qp->ibqp.qp_type != IB_QPT_RC) {
868 		av = *wqe;
869 		if (av->dqp_dct & be32_to_cpu(MLX5_WQE_AV_EXT))
870 			*wqe += sizeof(struct mlx5_av);
871 		else
872 			*wqe += sizeof(struct mlx5_base_av);
873 	}
874 
875 	switch (opcode) {
876 	case MLX5_OPCODE_RDMA_WRITE:
877 	case MLX5_OPCODE_RDMA_WRITE_IMM:
878 	case MLX5_OPCODE_RDMA_READ:
879 		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
880 		break;
881 	case MLX5_OPCODE_ATOMIC_CS:
882 	case MLX5_OPCODE_ATOMIC_FA:
883 		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
884 		*wqe += sizeof(struct mlx5_wqe_atomic_seg);
885 		break;
886 	}
887 
888 	return 0;
889 }
890 
891 /*
892  * Parse responder WQE. Advances the wqe pointer to point at the
893  * scatter-gather list, and sets wqe_end to the end of the WQE.
894  */
895 static int mlx5_ib_mr_responder_pfault_handler(
896 	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
897 	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
898 {
899 	struct mlx5_ib_wq *wq = &qp->rq;
900 	int wqe_size = 1 << wq->wqe_shift;
901 
902 	if (qp->ibqp.srq) {
903 		mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n");
904 		return -EFAULT;
905 	}
906 
907 	if (qp->wq_sig) {
908 		mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
909 		return -EFAULT;
910 	}
911 
912 	if (wqe_size > wqe_length) {
913 		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
914 		return -EFAULT;
915 	}
916 
917 	switch (qp->ibqp.qp_type) {
918 	case IB_QPT_RC:
919 		if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
920 		      IB_ODP_SUPPORT_RECV))
921 			goto invalid_transport_or_opcode;
922 		break;
923 	default:
924 invalid_transport_or_opcode:
925 		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
926 			    qp->ibqp.qp_type);
927 		return -EFAULT;
928 	}
929 
930 	*wqe_end = *wqe + wqe_size;
931 
932 	return 0;
933 }
934 
935 static struct mlx5_ib_qp *mlx5_ib_odp_find_qp(struct mlx5_ib_dev *dev,
936 					      u32 wq_num)
937 {
938 	struct mlx5_core_qp *mqp = __mlx5_qp_lookup(dev->mdev, wq_num);
939 
940 	if (!mqp) {
941 		mlx5_ib_err(dev, "QPN 0x%06x not found\n", wq_num);
942 		return NULL;
943 	}
944 
945 	return to_mibqp(mqp);
946 }
947 
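/*
 * Handle a WQE page fault: read the faulting WQE from user memory, parse
 * its data segments, fault in the referenced pages, and then resume the
 * QP, in error if parsing or mapping failed.
 */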
948 static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
949 					  struct mlx5_pagefault *pfault)
950 {
951 	int ret;
952 	void *wqe, *wqe_end;
953 	u32 bytes_mapped, total_wqe_bytes;
954 	char *buffer = NULL;
955 	int resume_with_error = 1;
956 	u16 wqe_index = pfault->wqe.wqe_index;
957 	int requestor = pfault->type & MLX5_PFAULT_REQUESTOR;
958 	struct mlx5_ib_qp *qp;
959 
960 	buffer = (char *)__get_free_page(GFP_KERNEL);
961 	if (!buffer) {
962 		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
963 		goto resolve_page_fault;
964 	}
965 
966 	qp = mlx5_ib_odp_find_qp(dev, pfault->wqe.wq_num);
967 	if (!qp)
968 		goto resolve_page_fault;
969 
970 	ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
971 				    PAGE_SIZE, &qp->trans_qp.base);
972 	if (ret < 0) {
973 		mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n",
974 			    ret, wqe_index, pfault->token);
975 		goto resolve_page_fault;
976 	}
977 
978 	wqe = buffer;
979 	if (requestor)
980 		ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp, &wqe,
981 							  &wqe_end, ret);
982 	else
983 		ret = mlx5_ib_mr_responder_pfault_handler(dev, pfault, qp, &wqe,
984 							  &wqe_end, ret);
985 	if (ret < 0)
986 		goto resolve_page_fault;
987 
988 	if (wqe >= wqe_end) {
989 		mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
990 		goto resolve_page_fault;
991 	}
992 
993 	ret = pagefault_data_segments(dev, pfault, qp, wqe, wqe_end,
994 				      &bytes_mapped, &total_wqe_bytes,
995 				      !requestor);
996 	if (ret == -EAGAIN) {
997 		resume_with_error = 0;
998 		goto resolve_page_fault;
999 	} else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
1000 		if (ret != -ENOENT)
1001 			mlx5_ib_err(dev, "PAGE FAULT error: %d. QP 0x%x. type: 0x%x\n",
1002 				    ret, pfault->wqe.wq_num, pfault->type);
1003 		goto resolve_page_fault;
1004 	}
1005 
1006 	resume_with_error = 0;
1007 resolve_page_fault:
1008 	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
1009 	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
1010 		    pfault->wqe.wq_num, resume_with_error,
1011 		    pfault->type);
1012 	free_page((unsigned long)buffer);
1013 }
1014 
1015 static int pages_in_range(u64 address, u32 length)
1016 {
1017 	return (ALIGN(address + length, PAGE_SIZE) -
1018 		(address & PAGE_MASK)) >> PAGE_SHIFT;
1019 }
1020 
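/*
 * Handle an RDMA responder page fault: map the pages needed for the
 * current packet, resume the QP, and then opportunistically prefetch the
 * remainder of the RDMA operation.
 */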
1021 static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
1022 					   struct mlx5_pagefault *pfault)
1023 {
1024 	u64 address;
1025 	u32 length;
1026 	u32 prefetch_len = pfault->bytes_committed;
1027 	int prefetch_activated = 0;
1028 	u32 rkey = pfault->rdma.r_key;
1029 	int ret;
1030 
1031 	/* The RDMA responder handler handles the page fault in two parts.
1032 	 * First it brings the necessary pages for the current packet
1033 	 * (and uses the pfault context), and then (after resuming the QP)
1034 	 * prefetches more pages. The second operation cannot use the pfault
1035 	 * context and therefore uses a separate bytes_committed counter
1036 	 * allocated on the stack. */
1037 	pfault->rdma.rdma_va += pfault->bytes_committed;
1038 	pfault->rdma.rdma_op_len -= min(pfault->bytes_committed,
1039 					 pfault->rdma.rdma_op_len);
1040 	pfault->bytes_committed = 0;
1041 
1042 	address = pfault->rdma.rdma_va;
1043 	length  = pfault->rdma.rdma_op_len;
1044 
1045 	/* For some operations, the hardware cannot tell the exact message
1046 	 * length, and in those cases it reports zero. Use prefetch
1047 	 * logic. */
1048 	if (length == 0) {
1049 		prefetch_activated = 1;
1050 		length = pfault->rdma.packet_size;
1051 		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
1052 	}
1053 
1054 	ret = pagefault_single_data_segment(dev, rkey, address, length,
1055 					    &pfault->bytes_committed, NULL);
1056 	if (ret == -EAGAIN) {
1057 		/* We're racing with an invalidation, don't prefetch */
1058 		prefetch_activated = 0;
1059 	} else if (ret < 0 || pages_in_range(address, length) > ret) {
1060 		mlx5_ib_page_fault_resume(dev, pfault, 1);
1061 		if (ret != -ENOENT)
1062 			mlx5_ib_warn(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n",
1063 				     ret, pfault->token, pfault->type);
1064 		return;
1065 	}
1066 
1067 	mlx5_ib_page_fault_resume(dev, pfault, 0);
1068 	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n",
1069 		    pfault->token, pfault->type,
1070 		    prefetch_activated);
1071 
1072 	/* At this point, there might be a new pagefault already arriving in
1073 	 * the eq, so the prefetch below uses its own bytes_committed counter
1074 	 * rather than the pfault context. We're still OK with the objects
1075 	 * being alive as the work-queue is being fenced. */
1076 
1077 	if (prefetch_activated) {
1078 		u32 bytes_committed = 0;
1079 
1080 		ret = pagefault_single_data_segment(dev, rkey, address,
1081 						    prefetch_len,
1082 						    &bytes_committed, NULL);
1083 		if (ret < 0 && ret != -EAGAIN) {
1084 			mlx5_ib_warn(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
1085 				     ret, pfault->token, address, prefetch_len);
1086 		}
1087 	}
1088 }
1089 
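/*
 * Page fault dispatch entry point registered with mlx5_core: routes WQE
 * and RDMA faults to their handlers, resuming the QP in error for unknown
 * fault subtypes.
 */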
1090 void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
1091 		    struct mlx5_pagefault *pfault)
1092 {
1093 	struct mlx5_ib_dev *dev = context;
1094 	u8 event_subtype = pfault->event_subtype;
1095 
1096 	switch (event_subtype) {
1097 	case MLX5_PFAULT_SUBTYPE_WQE:
1098 		mlx5_ib_mr_wqe_pfault_handler(dev, pfault);
1099 		break;
1100 	case MLX5_PFAULT_SUBTYPE_RDMA:
1101 		mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
1102 		break;
1103 	default:
1104 		mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
1105 			    event_subtype);
1106 		mlx5_ib_page_fault_resume(dev, pfault, 1);
1107 	}
1108 }
1109 
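/*
 * Configure the MR cache entries reserved for implicit ODP: the per-chunk
 * MTT leaves and the top-level KSM MR.
 */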
1110 void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
1111 {
1112 	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1113 		return;
1114 
1115 	switch (ent->order - 2) {
1116 	case MLX5_IMR_MTT_CACHE_ENTRY:
1117 		ent->page = PAGE_SHIFT;
1118 		ent->xlt = MLX5_IMR_MTT_ENTRIES *
1119 			   sizeof(struct mlx5_mtt) /
1120 			   MLX5_IB_UMR_OCTOWORD;
1121 		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
1122 		ent->limit = 0;
1123 		break;
1124 
1125 	case MLX5_IMR_KSM_CACHE_ENTRY:
1126 		ent->page = MLX5_KSM_PAGE_SHIFT;
1127 		ent->xlt = mlx5_imr_ksm_entries *
1128 			   sizeof(struct mlx5_klm) /
1129 			   MLX5_IB_UMR_OCTOWORD;
1130 		ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
1131 		ent->limit = 0;
1132 		break;
1133 	}
1134 }
1135 
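/*
 * Per-device ODP initialization: set up the MR SRCU and, when implicit
 * ODP is supported, query the null mkey used for unmapped KLM entries.
 */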
1136 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
1137 {
1138 	int ret;
1139 
1140 	ret = init_srcu_struct(&dev->mr_srcu);
1141 	if (ret)
1142 		return ret;
1143 
1144 	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
1145 		ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
1146 		if (ret) {
1147 			mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret);
1148 			return ret;
1149 		}
1150 	}
1151 
1152 	return 0;
1153 }
1154 
1155 void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *dev)
1156 {
1157 	cleanup_srcu_struct(&dev->mr_srcu);
1158 }
1159 
1160 int mlx5_ib_odp_init(void)
1161 {
1162 	mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
1163 				       MLX5_IMR_MTT_BITS);
1164 
1165 	return 0;
1166 }
1167 