xref: /openbmc/linux/drivers/infiniband/hw/mlx5/umr.c (revision c832da79)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. */
3 
4 #include <rdma/ib_umem_odp.h>
5 #include "mlx5_ib.h"
6 #include "umr.h"
7 #include "wr.h"
8 
/*
 * The emergency XLT buffer is a pointer instead of a static array because
 * dma_map_single() does not work on kernel module memory.
 */
void *xlt_emergency_page;
/*
 * Taken by mlx5r_umr_alloc_xlt() when it hands out the emergency page and
 * released by mlx5r_umr_free_xlt() when that page is returned.
 */
static DEFINE_MUTEX(xlt_emergency_page_mutex);
15 
16 static __be64 get_umr_enable_mr_mask(void)
17 {
18 	u64 result;
19 
20 	result = MLX5_MKEY_MASK_KEY |
21 		 MLX5_MKEY_MASK_FREE;
22 
23 	return cpu_to_be64(result);
24 }
25 
26 static __be64 get_umr_disable_mr_mask(void)
27 {
28 	u64 result;
29 
30 	result = MLX5_MKEY_MASK_FREE;
31 
32 	return cpu_to_be64(result);
33 }
34 
35 static __be64 get_umr_update_translation_mask(void)
36 {
37 	u64 result;
38 
39 	result = MLX5_MKEY_MASK_LEN |
40 		 MLX5_MKEY_MASK_PAGE_SIZE |
41 		 MLX5_MKEY_MASK_START_ADDR;
42 
43 	return cpu_to_be64(result);
44 }
45 
46 static __be64 get_umr_update_access_mask(struct mlx5_ib_dev *dev)
47 {
48 	u64 result;
49 
50 	result = MLX5_MKEY_MASK_LR |
51 		 MLX5_MKEY_MASK_LW |
52 		 MLX5_MKEY_MASK_RR |
53 		 MLX5_MKEY_MASK_RW;
54 
55 	if (MLX5_CAP_GEN(dev->mdev, atomic))
56 		result |= MLX5_MKEY_MASK_A;
57 
58 	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
59 		result |= MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE;
60 
61 	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
62 		result |= MLX5_MKEY_MASK_RELAXED_ORDERING_READ;
63 
64 	return cpu_to_be64(result);
65 }
66 
67 static __be64 get_umr_update_pd_mask(void)
68 {
69 	u64 result;
70 
71 	result = MLX5_MKEY_MASK_PD;
72 
73 	return cpu_to_be64(result);
74 }
75 
76 static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask)
77 {
78 	if (mask & MLX5_MKEY_MASK_PAGE_SIZE &&
79 	    MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled))
80 		return -EPERM;
81 
82 	if (mask & MLX5_MKEY_MASK_A &&
83 	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
84 		return -EPERM;
85 
86 	if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE &&
87 	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
88 		return -EPERM;
89 
90 	if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_READ &&
91 	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
92 		return -EPERM;
93 
94 	return 0;
95 }
96 
/*
 * Depth of the UMR QP send queue; also the initial count of the semaphore
 * that throttles mlx5r_umr_post_send_wait().
 */
enum { MAX_UMR_WR = 128 };
100 
101 static int mlx5r_umr_qp_rst2rts(struct mlx5_ib_dev *dev, struct ib_qp *qp)
102 {
103 	struct ib_qp_attr attr = {};
104 	int ret;
105 
106 	attr.qp_state = IB_QPS_INIT;
107 	attr.port_num = 1;
108 	ret = ib_modify_qp(qp, &attr,
109 			   IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT);
110 	if (ret) {
111 		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
112 		return ret;
113 	}
114 
115 	memset(&attr, 0, sizeof(attr));
116 	attr.qp_state = IB_QPS_RTR;
117 
118 	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
119 	if (ret) {
120 		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
121 		return ret;
122 	}
123 
124 	memset(&attr, 0, sizeof(attr));
125 	attr.qp_state = IB_QPS_RTS;
126 	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
127 	if (ret) {
128 		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
129 		return ret;
130 	}
131 
132 	return 0;
133 }
134 
135 int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
136 {
137 	struct ib_qp_init_attr init_attr = {};
138 	struct ib_pd *pd;
139 	struct ib_cq *cq;
140 	struct ib_qp *qp;
141 	int ret;
142 
143 	pd = ib_alloc_pd(&dev->ib_dev, 0);
144 	if (IS_ERR(pd)) {
145 		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
146 		return PTR_ERR(pd);
147 	}
148 
149 	cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
150 	if (IS_ERR(cq)) {
151 		mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
152 		ret = PTR_ERR(cq);
153 		goto destroy_pd;
154 	}
155 
156 	init_attr.send_cq = cq;
157 	init_attr.recv_cq = cq;
158 	init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
159 	init_attr.cap.max_send_wr = MAX_UMR_WR;
160 	init_attr.cap.max_send_sge = 1;
161 	init_attr.qp_type = MLX5_IB_QPT_REG_UMR;
162 	init_attr.port_num = 1;
163 	qp = ib_create_qp(pd, &init_attr);
164 	if (IS_ERR(qp)) {
165 		mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
166 		ret = PTR_ERR(qp);
167 		goto destroy_cq;
168 	}
169 
170 	ret = mlx5r_umr_qp_rst2rts(dev, qp);
171 	if (ret)
172 		goto destroy_qp;
173 
174 	dev->umrc.qp = qp;
175 	dev->umrc.cq = cq;
176 	dev->umrc.pd = pd;
177 
178 	sema_init(&dev->umrc.sem, MAX_UMR_WR);
179 	mutex_init(&dev->umrc.lock);
180 
181 	return 0;
182 
183 destroy_qp:
184 	ib_destroy_qp(qp);
185 destroy_cq:
186 	ib_free_cq(cq);
187 destroy_pd:
188 	ib_dealloc_pd(pd);
189 	return ret;
190 }
191 
192 void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
193 {
194 	ib_destroy_qp(dev->umrc.qp);
195 	ib_free_cq(dev->umrc.cq);
196 	ib_dealloc_pd(dev->umrc.pd);
197 }
198 
199 static int mlx5r_umr_recover(struct mlx5_ib_dev *dev)
200 {
201 	struct umr_common *umrc = &dev->umrc;
202 	struct ib_qp_attr attr;
203 	int err;
204 
205 	attr.qp_state = IB_QPS_RESET;
206 	err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
207 	if (err) {
208 		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
209 		goto err;
210 	}
211 
212 	err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
213 	if (err)
214 		goto err;
215 
216 	umrc->state = MLX5_UMR_STATE_ACTIVE;
217 	return 0;
218 
219 err:
220 	umrc->state = MLX5_UMR_STATE_ERR;
221 	return err;
222 }
223 
224 static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
225 			       struct mlx5r_umr_wqe *wqe, bool with_data)
226 {
227 	unsigned int wqe_size =
228 		with_data ? sizeof(struct mlx5r_umr_wqe) :
229 			    sizeof(struct mlx5r_umr_wqe) -
230 				    sizeof(struct mlx5_wqe_data_seg);
231 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
232 	struct mlx5_core_dev *mdev = dev->mdev;
233 	struct mlx5_ib_qp *qp = to_mqp(ibqp);
234 	struct mlx5_wqe_ctrl_seg *ctrl;
235 	union {
236 		struct ib_cqe *ib_cqe;
237 		u64 wr_id;
238 	} id;
239 	void *cur_edge, *seg;
240 	unsigned long flags;
241 	unsigned int idx;
242 	int size, err;
243 
244 	if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR))
245 		return -EIO;
246 
247 	spin_lock_irqsave(&qp->sq.lock, flags);
248 
249 	err = mlx5r_begin_wqe(qp, &seg, &ctrl, &idx, &size, &cur_edge, 0,
250 			      cpu_to_be32(mkey), false, false);
251 	if (WARN_ON(err))
252 		goto out;
253 
254 	qp->sq.wr_data[idx] = MLX5_IB_WR_UMR;
255 
256 	mlx5r_memcpy_send_wqe(&qp->sq, &cur_edge, &seg, &size, wqe, wqe_size);
257 
258 	id.ib_cqe = cqe;
259 	mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0,
260 			 MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR);
261 
262 	mlx5r_ring_db(qp, 1, ctrl);
263 
264 out:
265 	spin_unlock_irqrestore(&qp->sq.lock, flags);
266 
267 	return err;
268 }
269 
270 static void mlx5r_umr_done(struct ib_cq *cq, struct ib_wc *wc)
271 {
272 	struct mlx5_ib_umr_context *context =
273 		container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
274 
275 	context->status = wc->status;
276 	complete(&context->done);
277 }
278 
279 static inline void mlx5r_umr_init_context(struct mlx5r_umr_context *context)
280 {
281 	context->cqe.done = mlx5r_umr_done;
282 	init_completion(&context->done);
283 }
284 
285 static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
286 				   struct mlx5r_umr_wqe *wqe, bool with_data)
287 {
288 	struct umr_common *umrc = &dev->umrc;
289 	struct mlx5r_umr_context umr_context;
290 	int err;
291 
292 	err = umr_check_mkey_mask(dev, be64_to_cpu(wqe->ctrl_seg.mkey_mask));
293 	if (WARN_ON(err))
294 		return err;
295 
296 	mlx5r_umr_init_context(&umr_context);
297 
298 	down(&umrc->sem);
299 	while (true) {
300 		mutex_lock(&umrc->lock);
301 		if (umrc->state == MLX5_UMR_STATE_ERR) {
302 			mutex_unlock(&umrc->lock);
303 			err = -EFAULT;
304 			break;
305 		}
306 
307 		if (umrc->state == MLX5_UMR_STATE_RECOVER) {
308 			mutex_unlock(&umrc->lock);
309 			usleep_range(3000, 5000);
310 			continue;
311 		}
312 
313 		err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
314 					  with_data);
315 		mutex_unlock(&umrc->lock);
316 		if (err) {
317 			mlx5_ib_warn(dev, "UMR post send failed, err %d\n",
318 				     err);
319 			break;
320 		}
321 
322 		wait_for_completion(&umr_context.done);
323 
324 		if (umr_context.status == IB_WC_SUCCESS)
325 			break;
326 
327 		if (umr_context.status == IB_WC_WR_FLUSH_ERR)
328 			continue;
329 
330 		WARN_ON_ONCE(1);
331 		mlx5_ib_warn(dev,
332 			"reg umr failed (%u). Trying to recover and resubmit the flushed WQEs\n",
333 			umr_context.status);
334 		mutex_lock(&umrc->lock);
335 		err = mlx5r_umr_recover(dev);
336 		mutex_unlock(&umrc->lock);
337 		if (err)
338 			mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
339 				     err);
340 		err = -EFAULT;
341 		break;
342 	}
343 	up(&umrc->sem);
344 	return err;
345 }
346 
347 /**
348  * mlx5r_umr_revoke_mr - Fence all DMA on the MR
349  * @mr: The MR to fence
350  *
351  * Upon return the NIC will not be doing any DMA to the pages under the MR,
352  * and any DMA in progress will be completed. Failure of this function
353  * indicates the HW has failed catastrophically.
354  */
355 int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr)
356 {
357 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
358 	struct mlx5r_umr_wqe wqe = {};
359 
360 	if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
361 		return 0;
362 
363 	wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
364 	wqe.ctrl_seg.mkey_mask |= get_umr_disable_mr_mask();
365 	wqe.ctrl_seg.flags |= MLX5_UMR_INLINE;
366 
367 	MLX5_SET(mkc, &wqe.mkey_seg, free, 1);
368 	MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(dev->umrc.pd)->pdn);
369 	MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
370 	MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
371 		 mlx5_mkey_variant(mr->mmkey.key));
372 
373 	return mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
374 }
375 
376 static void mlx5r_umr_set_access_flags(struct mlx5_ib_dev *dev,
377 				       struct mlx5_mkey_seg *seg,
378 				       unsigned int access_flags)
379 {
380 	MLX5_SET(mkc, seg, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
381 	MLX5_SET(mkc, seg, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE));
382 	MLX5_SET(mkc, seg, rr, !!(access_flags & IB_ACCESS_REMOTE_READ));
383 	MLX5_SET(mkc, seg, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE));
384 	MLX5_SET(mkc, seg, lr, 1);
385 	MLX5_SET(mkc, seg, relaxed_ordering_write,
386 		 !!(access_flags & IB_ACCESS_RELAXED_ORDERING));
387 	MLX5_SET(mkc, seg, relaxed_ordering_read,
388 		 !!(access_flags & IB_ACCESS_RELAXED_ORDERING));
389 }
390 
391 int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
392 			      int access_flags)
393 {
394 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
395 	struct mlx5r_umr_wqe wqe = {};
396 	int err;
397 
398 	wqe.ctrl_seg.mkey_mask = get_umr_update_access_mask(dev);
399 	wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
400 	wqe.ctrl_seg.flags = MLX5_UMR_CHECK_FREE;
401 	wqe.ctrl_seg.flags |= MLX5_UMR_INLINE;
402 
403 	mlx5r_umr_set_access_flags(dev, &wqe.mkey_seg, access_flags);
404 	MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(pd)->pdn);
405 	MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
406 	MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
407 		 mlx5_mkey_variant(mr->mmkey.key));
408 
409 	err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
410 	if (err)
411 		return err;
412 
413 	mr->access_flags = access_flags;
414 	return 0;
415 }
416 
/* Largest XLT buffer mlx5r_umr_alloc_xlt() will try to allocate */
#define MLX5_MAX_UMR_CHUNK \
	((1 << (MLX5_MAX_UMR_SHIFT + 4)) - MLX5_UMR_MTT_ALIGNMENT)
/* Second-choice XLT buffer size (64 KiB) when the large one is unavailable */
#define MLX5_SPARE_UMR_CHUNK 0x10000
420 
421 /*
422  * Allocate a temporary buffer to hold the per-page information to transfer to
423  * HW. For efficiency this should be as large as it can be, but buffer
424  * allocation failure is not allowed, so try smaller sizes.
425  */
426 static void *mlx5r_umr_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
427 {
428 	const size_t xlt_chunk_align = MLX5_UMR_MTT_ALIGNMENT / ent_size;
429 	size_t size;
430 	void *res = NULL;
431 
432 	static_assert(PAGE_SIZE % MLX5_UMR_MTT_ALIGNMENT == 0);
433 
434 	/*
435 	 * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context just that the
436 	 * allocation can't trigger any kind of reclaim.
437 	 */
438 	might_sleep();
439 
440 	gfp_mask |= __GFP_ZERO | __GFP_NORETRY;
441 
442 	/*
443 	 * If the system already has a suitable high order page then just use
444 	 * that, but don't try hard to create one. This max is about 1M, so a
445 	 * free x86 huge page will satisfy it.
446 	 */
447 	size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align),
448 		     MLX5_MAX_UMR_CHUNK);
449 	*nents = size / ent_size;
450 	res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
451 				       get_order(size));
452 	if (res)
453 		return res;
454 
455 	if (size > MLX5_SPARE_UMR_CHUNK) {
456 		size = MLX5_SPARE_UMR_CHUNK;
457 		*nents = size / ent_size;
458 		res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
459 					       get_order(size));
460 		if (res)
461 			return res;
462 	}
463 
464 	*nents = PAGE_SIZE / ent_size;
465 	res = (void *)__get_free_page(gfp_mask);
466 	if (res)
467 		return res;
468 
469 	mutex_lock(&xlt_emergency_page_mutex);
470 	memset(xlt_emergency_page, 0, PAGE_SIZE);
471 	return xlt_emergency_page;
472 }
473 
474 static void mlx5r_umr_free_xlt(void *xlt, size_t length)
475 {
476 	if (xlt == xlt_emergency_page) {
477 		mutex_unlock(&xlt_emergency_page_mutex);
478 		return;
479 	}
480 
481 	free_pages((unsigned long)xlt, get_order(length));
482 }
483 
484 static void mlx5r_umr_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
485 				     struct ib_sge *sg)
486 {
487 	struct device *ddev = &dev->mdev->pdev->dev;
488 
489 	dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE);
490 	mlx5r_umr_free_xlt(xlt, sg->length);
491 }
492 
493 /*
494  * Create an XLT buffer ready for submission.
495  */
496 static void *mlx5r_umr_create_xlt(struct mlx5_ib_dev *dev, struct ib_sge *sg,
497 				  size_t nents, size_t ent_size,
498 				  unsigned int flags)
499 {
500 	struct device *ddev = &dev->mdev->pdev->dev;
501 	dma_addr_t dma;
502 	void *xlt;
503 
504 	xlt = mlx5r_umr_alloc_xlt(&nents, ent_size,
505 				 flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC :
506 								  GFP_KERNEL);
507 	sg->length = nents * ent_size;
508 	dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE);
509 	if (dma_mapping_error(ddev, dma)) {
510 		mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
511 		mlx5r_umr_free_xlt(xlt, sg->length);
512 		return NULL;
513 	}
514 	sg->addr = dma;
515 	sg->lkey = dev->umrc.pd->local_dma_lkey;
516 
517 	return xlt;
518 }
519 
520 static void
521 mlx5r_umr_set_update_xlt_ctrl_seg(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg,
522 				  unsigned int flags, struct ib_sge *sg)
523 {
524 	if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
525 		/* fail if free */
526 		ctrl_seg->flags = MLX5_UMR_CHECK_FREE;
527 	else
528 		/* fail if not free */
529 		ctrl_seg->flags = MLX5_UMR_CHECK_NOT_FREE;
530 	ctrl_seg->xlt_octowords =
531 		cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length));
532 }
533 
534 static void mlx5r_umr_set_update_xlt_mkey_seg(struct mlx5_ib_dev *dev,
535 					      struct mlx5_mkey_seg *mkey_seg,
536 					      struct mlx5_ib_mr *mr,
537 					      unsigned int page_shift)
538 {
539 	mlx5r_umr_set_access_flags(dev, mkey_seg, mr->access_flags);
540 	MLX5_SET(mkc, mkey_seg, pd, to_mpd(mr->ibmr.pd)->pdn);
541 	MLX5_SET64(mkc, mkey_seg, start_addr, mr->ibmr.iova);
542 	MLX5_SET64(mkc, mkey_seg, len, mr->ibmr.length);
543 	MLX5_SET(mkc, mkey_seg, log_page_size, page_shift);
544 	MLX5_SET(mkc, mkey_seg, qpn, 0xffffff);
545 	MLX5_SET(mkc, mkey_seg, mkey_7_0, mlx5_mkey_variant(mr->mmkey.key));
546 }
547 
548 static void
549 mlx5r_umr_set_update_xlt_data_seg(struct mlx5_wqe_data_seg *data_seg,
550 				  struct ib_sge *sg)
551 {
552 	data_seg->byte_count = cpu_to_be32(sg->length);
553 	data_seg->lkey = cpu_to_be32(sg->lkey);
554 	data_seg->addr = cpu_to_be64(sg->addr);
555 }
556 
557 static void mlx5r_umr_update_offset(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg,
558 				    u64 offset)
559 {
560 	u64 octo_offset = mlx5r_umr_get_xlt_octo(offset);
561 
562 	ctrl_seg->xlt_offset = cpu_to_be16(octo_offset & 0xffff);
563 	ctrl_seg->xlt_offset_47_16 = cpu_to_be32(octo_offset >> 16);
564 	ctrl_seg->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN;
565 }
566 
567 static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev,
568 				       struct mlx5r_umr_wqe *wqe,
569 				       struct mlx5_ib_mr *mr, struct ib_sge *sg,
570 				       unsigned int flags)
571 {
572 	bool update_pd_access, update_translation;
573 
574 	if (flags & MLX5_IB_UPD_XLT_ENABLE)
575 		wqe->ctrl_seg.mkey_mask |= get_umr_enable_mr_mask();
576 
577 	update_pd_access = flags & MLX5_IB_UPD_XLT_ENABLE ||
578 			   flags & MLX5_IB_UPD_XLT_PD ||
579 			   flags & MLX5_IB_UPD_XLT_ACCESS;
580 
581 	if (update_pd_access) {
582 		wqe->ctrl_seg.mkey_mask |= get_umr_update_access_mask(dev);
583 		wqe->ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
584 	}
585 
586 	update_translation =
587 		flags & MLX5_IB_UPD_XLT_ENABLE || flags & MLX5_IB_UPD_XLT_ADDR;
588 
589 	if (update_translation) {
590 		wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask();
591 		if (!mr->ibmr.length)
592 			MLX5_SET(mkc, &wqe->mkey_seg, length64, 1);
593 	}
594 
595 	wqe->ctrl_seg.xlt_octowords =
596 		cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length));
597 	wqe->data_seg.byte_count = cpu_to_be32(sg->length);
598 }
599 
600 /*
601  * Send the DMA list to the HW for a normal MR using UMR.
602  * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
603  * flag may be used.
604  */
605 int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
606 {
607 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
608 	struct device *ddev = &dev->mdev->pdev->dev;
609 	struct mlx5r_umr_wqe wqe = {};
610 	struct ib_block_iter biter;
611 	struct mlx5_mtt *cur_mtt;
612 	size_t orig_sg_length;
613 	struct mlx5_mtt *mtt;
614 	size_t final_size;
615 	struct ib_sge sg;
616 	u64 offset = 0;
617 	int err = 0;
618 
619 	if (WARN_ON(mr->umem->is_odp))
620 		return -EINVAL;
621 
622 	mtt = mlx5r_umr_create_xlt(
623 		dev, &sg, ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift),
624 		sizeof(*mtt), flags);
625 	if (!mtt)
626 		return -ENOMEM;
627 
628 	orig_sg_length = sg.length;
629 
630 	mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
631 	mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr,
632 					  mr->page_shift);
633 	mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);
634 
635 	cur_mtt = mtt;
636 	rdma_for_each_block(mr->umem->sgt_append.sgt.sgl, &biter,
637 			    mr->umem->sgt_append.sgt.nents,
638 			    BIT(mr->page_shift)) {
639 		if (cur_mtt == (void *)mtt + sg.length) {
640 			dma_sync_single_for_device(ddev, sg.addr, sg.length,
641 						   DMA_TO_DEVICE);
642 
643 			err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe,
644 						       true);
645 			if (err)
646 				goto err;
647 			dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
648 						DMA_TO_DEVICE);
649 			offset += sg.length;
650 			mlx5r_umr_update_offset(&wqe.ctrl_seg, offset);
651 
652 			cur_mtt = mtt;
653 		}
654 
655 		cur_mtt->ptag =
656 			cpu_to_be64(rdma_block_iter_dma_address(&biter) |
657 				    MLX5_IB_MTT_PRESENT);
658 
659 		if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
660 			cur_mtt->ptag = 0;
661 
662 		cur_mtt++;
663 	}
664 
665 	final_size = (void *)cur_mtt - (void *)mtt;
666 	sg.length = ALIGN(final_size, MLX5_UMR_MTT_ALIGNMENT);
667 	memset(cur_mtt, 0, sg.length - final_size);
668 	mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags);
669 
670 	dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
671 	err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true);
672 
673 err:
674 	sg.length = orig_sg_length;
675 	mlx5r_umr_unmap_free_xlt(dev, mtt, &sg);
676 	return err;
677 }
678 
679 static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
680 {
681 	return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
682 }
683 
684 int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
685 			 int page_shift, int flags)
686 {
687 	int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
688 			       ? sizeof(struct mlx5_klm)
689 			       : sizeof(struct mlx5_mtt);
690 	const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size;
691 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
692 	struct device *ddev = &dev->mdev->pdev->dev;
693 	const int page_mask = page_align - 1;
694 	struct mlx5r_umr_wqe wqe = {};
695 	size_t pages_mapped = 0;
696 	size_t pages_to_map = 0;
697 	size_t size_to_map = 0;
698 	size_t orig_sg_length;
699 	size_t pages_iter;
700 	struct ib_sge sg;
701 	int err = 0;
702 	void *xlt;
703 
704 	if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
705 	    !umr_can_use_indirect_mkey(dev))
706 		return -EPERM;
707 
708 	if (WARN_ON(!mr->umem->is_odp))
709 		return -EINVAL;
710 
711 	/* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
712 	 * so we need to align the offset and length accordingly
713 	 */
714 	if (idx & page_mask) {
715 		npages += idx & page_mask;
716 		idx &= ~page_mask;
717 	}
718 	pages_to_map = ALIGN(npages, page_align);
719 
720 	xlt = mlx5r_umr_create_xlt(dev, &sg, npages, desc_size, flags);
721 	if (!xlt)
722 		return -ENOMEM;
723 
724 	pages_iter = sg.length / desc_size;
725 	orig_sg_length = sg.length;
726 
727 	if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
728 		struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
729 		size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
730 
731 		pages_to_map = min_t(size_t, pages_to_map, max_pages);
732 	}
733 
734 	mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
735 	mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr, page_shift);
736 	mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);
737 
738 	for (pages_mapped = 0;
739 	     pages_mapped < pages_to_map && !err;
740 	     pages_mapped += pages_iter, idx += pages_iter) {
741 		npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
742 		size_to_map = npages * desc_size;
743 		dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
744 					DMA_TO_DEVICE);
745 		mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
746 		dma_sync_single_for_device(ddev, sg.addr, sg.length,
747 					   DMA_TO_DEVICE);
748 		sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);
749 
750 		if (pages_mapped + pages_iter >= pages_to_map)
751 			mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags);
752 		mlx5r_umr_update_offset(&wqe.ctrl_seg, idx * desc_size);
753 		err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true);
754 	}
755 	sg.length = orig_sg_length;
756 	mlx5r_umr_unmap_free_xlt(dev, xlt, &sg);
757 	return err;
758 }
759