xref: /openbmc/linux/drivers/infiniband/hw/mlx5/mr.c (revision 58919326e72f63c380dc3271dd1cc8bdf1bbe3e4)
1 /*
2  * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3  * Copyright (c) 2020, Intel Corporation. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 
35 #include <linux/kref.h>
36 #include <linux/random.h>
37 #include <linux/debugfs.h>
38 #include <linux/export.h>
39 #include <linux/delay.h>
40 #include <linux/dma-buf.h>
41 #include <linux/dma-resv.h>
42 #include <rdma/ib_umem.h>
43 #include <rdma/ib_umem_odp.h>
44 #include <rdma/ib_verbs.h>
45 #include "dm.h"
46 #include "mlx5_ib.h"
47 
48 /*
49  * We can't use an array for xlt_emergency_page because dma_map_single doesn't
50  * work on kernel modules memory
51  */
52 void *xlt_emergency_page; /* last-resort XLT buffer returned by mlx5_ib_alloc_xlt() when every page allocation fails */
53 static DEFINE_MUTEX(xlt_emergency_page_mutex); /* locked when mlx5_ib_alloc_xlt() hands out the emergency page, unlocked in mlx5_ib_free_xlt() */
54 
55 enum {
56 	MAX_PENDING_REG_MR = 8, /* add_keys() stops with -EAGAIN once ent->pending reaches this value */
57 };
58 
59 #define MLX5_UMR_ALIGN 2048 /* NOTE(review): no user is visible in this part of the file - confirm it is still needed */
60 
61 static void
62 create_mkey_callback(int status, struct mlx5_async_work *context);
63 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
64 				     u64 iova, int access_flags,
65 				     unsigned int page_size, bool populate);
66 
67 static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
68 					  struct ib_pd *pd)
69 {
70 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
71 
72 	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
73 	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
74 	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
75 	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
76 	MLX5_SET(mkc, mkc, lr, 1);
77 
78 	if ((acc & IB_ACCESS_RELAXED_ORDERING) &&
79 	    pcie_relaxed_ordering_enabled(dev->mdev->pdev)) {
80 		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
81 			MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);
82 		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
83 			MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
84 	}
85 
86 	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
87 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
88 	MLX5_SET64(mkc, mkc, start_addr, start_addr);
89 }
90 
91 static void assign_mkey_variant(struct mlx5_ib_dev *dev,
92 				struct mlx5_ib_mkey *mkey, u32 *in)
93 {
94 	u8 key = atomic_inc_return(&dev->mkey_var);
95 	void *mkc;
96 
97 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
98 	MLX5_SET(mkc, mkc, mkey_7_0, key);
99 	mkey->key = key;
100 }
101 
102 static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
103 			       struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
104 {
105 	int ret;
106 
107 	assign_mkey_variant(dev, mkey, in);
108 	ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
109 	if (!ret)
110 		init_waitqueue_head(&mkey->wait);
111 
112 	return ret;
113 }
114 
115 static int
116 mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
117 		       struct mlx5_ib_mkey *mkey,
118 		       struct mlx5_async_ctx *async_ctx,
119 		       u32 *in, int inlen, u32 *out, int outlen,
120 		       struct mlx5_async_work *context)
121 {
122 	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
123 	assign_mkey_variant(dev, mkey, in);
124 	return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen,
125 				create_mkey_callback, context);
126 }
127 
128 static int mr_cache_max_order(struct mlx5_ib_dev *dev);
129 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
130 
131 static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
132 {
133 	return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
134 }
135 
136 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
137 {
138 	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
139 
140 	return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
141 }
142 
143 static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
144 {
145 	if (status == -ENXIO) /* core driver is not available */
146 		return;
147 
148 	mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
149 	if (status != -EREMOTEIO) /* driver specific failure */
150 		return;
151 
152 	/* Failed in FW, print cmd out failure details */
153 	mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
154 }
155 
156 static void create_mkey_callback(int status, struct mlx5_async_work *context)
157 {
158 	struct mlx5_ib_mr *mr =
159 		container_of(context, struct mlx5_ib_mr, cb_work);
160 	struct mlx5_cache_ent *ent = mr->cache_ent;
161 	struct mlx5_ib_dev *dev = ent->dev;
162 	unsigned long flags;
163 
164 	if (status) {
165 		create_mkey_warn(dev, status, mr->out);
166 		kfree(mr);
167 		spin_lock_irqsave(&ent->lock, flags);
168 		ent->pending--;
169 		WRITE_ONCE(dev->fill_delay, 1);
170 		spin_unlock_irqrestore(&ent->lock, flags);
171 		mod_timer(&dev->delay_timer, jiffies + HZ);
172 		return;
173 	}
174 
175 	mr->mmkey.type = MLX5_MKEY_MR;
176 	mr->mmkey.key |= mlx5_idx_to_mkey(
177 		MLX5_GET(create_mkey_out, mr->out, mkey_index));
178 	init_waitqueue_head(&mr->mmkey.wait);
179 
180 	WRITE_ONCE(dev->cache.last_add, jiffies);
181 
182 	spin_lock_irqsave(&ent->lock, flags);
183 	list_add_tail(&mr->list, &ent->head);
184 	ent->available_mrs++;
185 	ent->total_mrs++;
186 	/* If we are doing fill_to_high_water then keep going. */
187 	queue_adjust_cache_locked(ent);
188 	ent->pending--;
189 	spin_unlock_irqrestore(&ent->lock, flags);
190 }
191 
192 static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
193 {
194 	int ret = 0;
195 
196 	switch (access_mode) {
197 	case MLX5_MKC_ACCESS_MODE_MTT:
198 		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
199 						   sizeof(struct mlx5_mtt));
200 		break;
201 	case MLX5_MKC_ACCESS_MODE_KSM:
202 		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
203 						   sizeof(struct mlx5_klm));
204 		break;
205 	default:
206 		WARN_ON(1);
207 	}
208 	return ret;
209 }
210 
211 static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
212 {
213 	struct mlx5_ib_mr *mr;
214 
215 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
216 	if (!mr)
217 		return NULL;
218 	mr->cache_ent = ent;
219 
220 	set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
221 	MLX5_SET(mkc, mkc, free, 1);
222 	MLX5_SET(mkc, mkc, umr_en, 1);
223 	MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
224 	MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7);
225 
226 	MLX5_SET(mkc, mkc, translations_octword_size,
227 		 get_mkc_octo_size(ent->access_mode, ent->ndescs));
228 	MLX5_SET(mkc, mkc, log_page_size, ent->page);
229 	return mr;
230 }
231 
232 /* Asynchronously schedule new MRs to be populated in the cache. */
233 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
234 {
235 	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
236 	struct mlx5_ib_mr *mr;
237 	void *mkc;
238 	u32 *in;
239 	int err = 0;
240 	int i;
241 
242 	in = kzalloc(inlen, GFP_KERNEL);
243 	if (!in)
244 		return -ENOMEM;
245 
246 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
247 	for (i = 0; i < num; i++) {
248 		mr = alloc_cache_mr(ent, mkc);
249 		if (!mr) {
250 			err = -ENOMEM;
251 			break;
252 		}
253 		spin_lock_irq(&ent->lock);
254 		if (ent->pending >= MAX_PENDING_REG_MR) {
255 			err = -EAGAIN;
256 			spin_unlock_irq(&ent->lock);
257 			kfree(mr);
258 			break;
259 		}
260 		ent->pending++;
261 		spin_unlock_irq(&ent->lock);
262 		err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
263 					     &ent->dev->async_ctx, in, inlen,
264 					     mr->out, sizeof(mr->out),
265 					     &mr->cb_work);
266 		if (err) {
267 			spin_lock_irq(&ent->lock);
268 			ent->pending--;
269 			spin_unlock_irq(&ent->lock);
270 			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
271 			kfree(mr);
272 			break;
273 		}
274 	}
275 
276 	kfree(in);
277 	return err;
278 }
279 
280 /* Synchronously create a MR in the cache */
281 static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent)
282 {
283 	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
284 	struct mlx5_ib_mr *mr;
285 	void *mkc;
286 	u32 *in;
287 	int err;
288 
289 	in = kzalloc(inlen, GFP_KERNEL);
290 	if (!in)
291 		return ERR_PTR(-ENOMEM);
292 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
293 
294 	mr = alloc_cache_mr(ent, mkc);
295 	if (!mr) {
296 		err = -ENOMEM;
297 		goto free_in;
298 	}
299 
300 	err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey.key, in, inlen);
301 	if (err)
302 		goto free_mr;
303 
304 	init_waitqueue_head(&mr->mmkey.wait);
305 	mr->mmkey.type = MLX5_MKEY_MR;
306 	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
307 	spin_lock_irq(&ent->lock);
308 	ent->total_mrs++;
309 	spin_unlock_irq(&ent->lock);
310 	kfree(in);
311 	return mr;
312 free_mr:
313 	kfree(mr);
314 free_in:
315 	kfree(in);
316 	return ERR_PTR(err);
317 }
318 
319 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
320 {
321 	struct mlx5_ib_mr *mr;
322 
323 	lockdep_assert_held(&ent->lock);
324 	if (list_empty(&ent->head))
325 		return;
326 	mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
327 	list_del(&mr->list);
328 	ent->available_mrs--;
329 	ent->total_mrs--;
330 	spin_unlock_irq(&ent->lock);
331 	mlx5_core_destroy_mkey(ent->dev->mdev, mr->mmkey.key);
332 	kfree(mr);
333 	spin_lock_irq(&ent->lock);
334 }
335 
336 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
337 				bool limit_fill)
338 {
339 	int err;
340 
341 	lockdep_assert_held(&ent->lock);
342 
343 	while (true) {
344 		if (limit_fill)
345 			target = ent->limit * 2;
346 		if (target == ent->available_mrs + ent->pending)
347 			return 0;
348 		if (target > ent->available_mrs + ent->pending) {
349 			u32 todo = target - (ent->available_mrs + ent->pending);
350 
351 			spin_unlock_irq(&ent->lock);
352 			err = add_keys(ent, todo);
353 			if (err == -EAGAIN)
354 				usleep_range(3000, 5000);
355 			spin_lock_irq(&ent->lock);
356 			if (err) {
357 				if (err != -EAGAIN)
358 					return err;
359 			} else
360 				return 0;
361 		} else {
362 			remove_cache_mr_locked(ent);
363 		}
364 	}
365 }
366 
367 static ssize_t size_write(struct file *filp, const char __user *buf,
368 			  size_t count, loff_t *pos)
369 {
370 	struct mlx5_cache_ent *ent = filp->private_data;
371 	u32 target;
372 	int err;
373 
374 	err = kstrtou32_from_user(buf, count, 0, &target);
375 	if (err)
376 		return err;
377 
378 	/*
379 	 * Target is the new value of total_mrs the user requests, however we
380 	 * cannot free MRs that are in use. Compute the target value for
381 	 * available_mrs.
382 	 */
383 	spin_lock_irq(&ent->lock);
384 	if (target < ent->total_mrs - ent->available_mrs) {
385 		err = -EINVAL;
386 		goto err_unlock;
387 	}
388 	target = target - (ent->total_mrs - ent->available_mrs);
389 	if (target < ent->limit || target > ent->limit*2) {
390 		err = -EINVAL;
391 		goto err_unlock;
392 	}
393 	err = resize_available_mrs(ent, target, false);
394 	if (err)
395 		goto err_unlock;
396 	spin_unlock_irq(&ent->lock);
397 
398 	return count;
399 
400 err_unlock:
401 	spin_unlock_irq(&ent->lock);
402 	return err;
403 }
404 
405 static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
406 			 loff_t *pos)
407 {
408 	struct mlx5_cache_ent *ent = filp->private_data;
409 	char lbuf[20];
410 	int err;
411 
412 	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs);
413 	if (err < 0)
414 		return err;
415 
416 	return simple_read_from_buffer(buf, count, pos, lbuf, err);
417 }
418 
419 static const struct file_operations size_fops = {
420 	.owner	= THIS_MODULE,
421 	.open	= simple_open,
422 	.write	= size_write,
423 	.read	= size_read,
424 };
425 
426 static ssize_t limit_write(struct file *filp, const char __user *buf,
427 			   size_t count, loff_t *pos)
428 {
429 	struct mlx5_cache_ent *ent = filp->private_data;
430 	u32 var;
431 	int err;
432 
433 	err = kstrtou32_from_user(buf, count, 0, &var);
434 	if (err)
435 		return err;
436 
437 	/*
438 	 * Upon set we immediately fill the cache to high water mark implied by
439 	 * the limit.
440 	 */
441 	spin_lock_irq(&ent->lock);
442 	ent->limit = var;
443 	err = resize_available_mrs(ent, 0, true);
444 	spin_unlock_irq(&ent->lock);
445 	if (err)
446 		return err;
447 	return count;
448 }
449 
450 static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
451 			  loff_t *pos)
452 {
453 	struct mlx5_cache_ent *ent = filp->private_data;
454 	char lbuf[20];
455 	int err;
456 
457 	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
458 	if (err < 0)
459 		return err;
460 
461 	return simple_read_from_buffer(buf, count, pos, lbuf, err);
462 }
463 
464 static const struct file_operations limit_fops = {
465 	.owner	= THIS_MODULE,
466 	.open	= simple_open,
467 	.write	= limit_write,
468 	.read	= limit_read,
469 };
470 
471 static bool someone_adding(struct mlx5_mr_cache *cache)
472 {
473 	unsigned int i;
474 
475 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
476 		struct mlx5_cache_ent *ent = &cache->ent[i];
477 		bool ret;
478 
479 		spin_lock_irq(&ent->lock);
480 		ret = ent->available_mrs < ent->limit;
481 		spin_unlock_irq(&ent->lock);
482 		if (ret)
483 			return true;
484 	}
485 	return false;
486 }
487 
488 /*
489  * Check if the bucket is outside the high/low water mark and schedule an async
490  * update. The cache refill has hysteresis, once the low water mark is hit it is
491  * refilled up to the high mark.
492  */
493 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
494 {
495 	lockdep_assert_held(&ent->lock);
496 
497 	if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
498 		return;
499 	if (ent->available_mrs < ent->limit) {
500 		ent->fill_to_high_water = true;
501 		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
502 	} else if (ent->fill_to_high_water &&
503 		   ent->available_mrs + ent->pending < 2 * ent->limit) {
504 		/*
505 		 * Once we start populating due to hitting a low water mark
506 		 * continue until we pass the high water mark.
507 		 */
508 		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
509 	} else if (ent->available_mrs == 2 * ent->limit) {
510 		ent->fill_to_high_water = false;
511 	} else if (ent->available_mrs > 2 * ent->limit) {
512 		/* Queue deletion of excess entries */
513 		ent->fill_to_high_water = false;
514 		if (ent->pending)
515 			queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
516 					   msecs_to_jiffies(1000));
517 		else
518 			mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
519 	}
520 }
521 
522 static void __cache_work_func(struct mlx5_cache_ent *ent)
523 {
524 	struct mlx5_ib_dev *dev = ent->dev;
525 	struct mlx5_mr_cache *cache = &dev->cache;
526 	int err;
527 
528 	spin_lock_irq(&ent->lock);
529 	if (ent->disabled)
530 		goto out;
531 
532 	if (ent->fill_to_high_water &&
533 	    ent->available_mrs + ent->pending < 2 * ent->limit &&
534 	    !READ_ONCE(dev->fill_delay)) {
535 		spin_unlock_irq(&ent->lock);
536 		err = add_keys(ent, 1);
537 		spin_lock_irq(&ent->lock);
538 		if (ent->disabled)
539 			goto out;
540 		if (err) {
541 			/*
542 			 * EAGAIN only happens if pending is positive, so we
543 			 * will be rescheduled from reg_mr_callback(). The only
544 			 * failure path here is ENOMEM.
545 			 */
546 			if (err != -EAGAIN) {
547 				mlx5_ib_warn(
548 					dev,
549 					"command failed order %d, err %d\n",
550 					ent->order, err);
551 				queue_delayed_work(cache->wq, &ent->dwork,
552 						   msecs_to_jiffies(1000));
553 			}
554 		}
555 	} else if (ent->available_mrs > 2 * ent->limit) {
556 		bool need_delay;
557 
558 		/*
559 		 * The remove_cache_mr() logic is performed as garbage
560 		 * collection task. Such task is intended to be run when no
561 		 * other active processes are running.
562 		 *
563 		 * The need_resched() will return TRUE if there are user tasks
564 		 * to be activated in near future.
565 		 *
566 		 * In such case, we don't execute remove_cache_mr() and postpone
567 		 * the garbage collection work to try to run in next cycle, in
568 		 * order to free CPU resources to other tasks.
569 		 */
570 		spin_unlock_irq(&ent->lock);
571 		need_delay = need_resched() || someone_adding(cache) ||
572 			     !time_after(jiffies,
573 					 READ_ONCE(cache->last_add) + 300 * HZ);
574 		spin_lock_irq(&ent->lock);
575 		if (ent->disabled)
576 			goto out;
577 		if (need_delay) {
578 			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
579 			goto out;
580 		}
581 		remove_cache_mr_locked(ent);
582 		queue_adjust_cache_locked(ent);
583 	}
584 out:
585 	spin_unlock_irq(&ent->lock);
586 }
587 
588 static void delayed_cache_work_func(struct work_struct *work)
589 {
590 	struct mlx5_cache_ent *ent;
591 
592 	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
593 	__cache_work_func(ent);
594 }
595 
596 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
597 				       struct mlx5_cache_ent *ent,
598 				       int access_flags)
599 {
600 	struct mlx5_ib_mr *mr;
601 
602 	/* Matches access in alloc_cache_mr() */
603 	if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags))
604 		return ERR_PTR(-EOPNOTSUPP);
605 
606 	spin_lock_irq(&ent->lock);
607 	if (list_empty(&ent->head)) {
608 		queue_adjust_cache_locked(ent);
609 		ent->miss++;
610 		spin_unlock_irq(&ent->lock);
611 		mr = create_cache_mr(ent);
612 		if (IS_ERR(mr))
613 			return mr;
614 	} else {
615 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
616 		list_del(&mr->list);
617 		ent->available_mrs--;
618 		queue_adjust_cache_locked(ent);
619 		spin_unlock_irq(&ent->lock);
620 
621 		mlx5_clear_mr(mr);
622 	}
623 	return mr;
624 }
625 
626 static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
627 {
628 	struct mlx5_cache_ent *ent = mr->cache_ent;
629 
630 	WRITE_ONCE(dev->cache.last_add, jiffies);
631 	spin_lock_irq(&ent->lock);
632 	list_add_tail(&mr->list, &ent->head);
633 	ent->available_mrs++;
634 	queue_adjust_cache_locked(ent);
635 	spin_unlock_irq(&ent->lock);
636 }
637 
638 static void clean_keys(struct mlx5_ib_dev *dev, int c)
639 {
640 	struct mlx5_mr_cache *cache = &dev->cache;
641 	struct mlx5_cache_ent *ent = &cache->ent[c];
642 	struct mlx5_ib_mr *tmp_mr;
643 	struct mlx5_ib_mr *mr;
644 	LIST_HEAD(del_list);
645 
646 	cancel_delayed_work(&ent->dwork);
647 	while (1) {
648 		spin_lock_irq(&ent->lock);
649 		if (list_empty(&ent->head)) {
650 			spin_unlock_irq(&ent->lock);
651 			break;
652 		}
653 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
654 		list_move(&mr->list, &del_list);
655 		ent->available_mrs--;
656 		ent->total_mrs--;
657 		spin_unlock_irq(&ent->lock);
658 		mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
659 	}
660 
661 	list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
662 		list_del(&mr->list);
663 		kfree(mr);
664 	}
665 }
666 
667 static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
668 {
669 	if (!mlx5_debugfs_root || dev->is_rep)
670 		return;
671 
672 	debugfs_remove_recursive(dev->cache.root);
673 	dev->cache.root = NULL;
674 }
675 
676 static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
677 {
678 	struct mlx5_mr_cache *cache = &dev->cache;
679 	struct mlx5_cache_ent *ent;
680 	struct dentry *dir;
681 	int i;
682 
683 	if (!mlx5_debugfs_root || dev->is_rep)
684 		return;
685 
686 	cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev));
687 
688 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
689 		ent = &cache->ent[i];
690 		sprintf(ent->name, "%d", ent->order);
691 		dir = debugfs_create_dir(ent->name, cache->root);
692 		debugfs_create_file("size", 0600, dir, ent, &size_fops);
693 		debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
694 		debugfs_create_u32("cur", 0400, dir, &ent->available_mrs);
695 		debugfs_create_u32("miss", 0600, dir, &ent->miss);
696 	}
697 }
698 
699 static void delay_time_func(struct timer_list *t)
700 {
701 	struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
702 
703 	WRITE_ONCE(dev->fill_delay, 0);
704 }
705 
706 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
707 {
708 	struct mlx5_mr_cache *cache = &dev->cache;
709 	struct mlx5_cache_ent *ent;
710 	int i;
711 
712 	mutex_init(&dev->slow_path_mutex);
713 	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
714 	if (!cache->wq) {
715 		mlx5_ib_warn(dev, "failed to create work queue\n");
716 		return -ENOMEM;
717 	}
718 
719 	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
720 	timer_setup(&dev->delay_timer, delay_time_func, 0);
721 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
722 		ent = &cache->ent[i];
723 		INIT_LIST_HEAD(&ent->head);
724 		spin_lock_init(&ent->lock);
725 		ent->order = i + 2;
726 		ent->dev = dev;
727 		ent->limit = 0;
728 
729 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
730 
731 		if (i > MR_CACHE_LAST_STD_ENTRY) {
732 			mlx5_odp_init_mr_cache_entry(ent);
733 			continue;
734 		}
735 
736 		if (ent->order > mr_cache_max_order(dev))
737 			continue;
738 
739 		ent->page = PAGE_SHIFT;
740 		ent->ndescs = 1 << ent->order;
741 		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
742 		if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
743 		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
744 		    mlx5_ib_can_load_pas_with_umr(dev, 0))
745 			ent->limit = dev->mdev->profile.mr_cache[i].limit;
746 		else
747 			ent->limit = 0;
748 		spin_lock_irq(&ent->lock);
749 		queue_adjust_cache_locked(ent);
750 		spin_unlock_irq(&ent->lock);
751 	}
752 
753 	mlx5_mr_cache_debugfs_init(dev);
754 
755 	return 0;
756 }
757 
758 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
759 {
760 	unsigned int i;
761 
762 	if (!dev->cache.wq)
763 		return 0;
764 
765 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
766 		struct mlx5_cache_ent *ent = &dev->cache.ent[i];
767 
768 		spin_lock_irq(&ent->lock);
769 		ent->disabled = true;
770 		spin_unlock_irq(&ent->lock);
771 		cancel_delayed_work_sync(&ent->dwork);
772 	}
773 
774 	mlx5_mr_cache_debugfs_cleanup(dev);
775 	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
776 
777 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
778 		clean_keys(dev, i);
779 
780 	destroy_workqueue(dev->cache.wq);
781 	del_timer_sync(&dev->delay_timer);
782 
783 	return 0;
784 }
785 
786 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
787 {
788 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
789 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
790 	struct mlx5_ib_mr *mr;
791 	void *mkc;
792 	u32 *in;
793 	int err;
794 
795 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
796 	if (!mr)
797 		return ERR_PTR(-ENOMEM);
798 
799 	in = kzalloc(inlen, GFP_KERNEL);
800 	if (!in) {
801 		err = -ENOMEM;
802 		goto err_free;
803 	}
804 
805 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
806 
807 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
808 	MLX5_SET(mkc, mkc, length64, 1);
809 	set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
810 				      pd);
811 
812 	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
813 	if (err)
814 		goto err_in;
815 
816 	kfree(in);
817 	mr->mmkey.type = MLX5_MKEY_MR;
818 	mr->ibmr.lkey = mr->mmkey.key;
819 	mr->ibmr.rkey = mr->mmkey.key;
820 	mr->umem = NULL;
821 
822 	return &mr->ibmr;
823 
824 err_in:
825 	kfree(in);
826 
827 err_free:
828 	kfree(mr);
829 
830 	return ERR_PTR(err);
831 }
832 
833 static int get_octo_len(u64 addr, u64 len, int page_shift)
834 {
835 	u64 page_size = 1ULL << page_shift;
836 	u64 offset;
837 	int npages;
838 
839 	offset = addr & (page_size - 1);
840 	npages = ALIGN(len + offset, page_size) >> page_shift;
841 	return (npages + 1) / 2;
842 }
843 
844 static int mr_cache_max_order(struct mlx5_ib_dev *dev)
845 {
846 	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
847 		return MR_CACHE_LAST_STD_ENTRY + 2;
848 	return MLX5_MAX_UMR_SHIFT;
849 }
850 
851 static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc)
852 {
853 	struct mlx5_ib_umr_context *context =
854 		container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
855 
856 	context->status = wc->status;
857 	complete(&context->done);
858 }
859 
860 static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
861 {
862 	context->cqe.done = mlx5_ib_umr_done;
863 	context->status = -1;
864 	init_completion(&context->done);
865 }
866 
867 static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev,
868 				  struct mlx5_umr_wr *umrwr)
869 {
870 	struct umr_common *umrc = &dev->umrc;
871 	const struct ib_send_wr *bad;
872 	int err;
873 	struct mlx5_ib_umr_context umr_context;
874 
875 	mlx5_ib_init_umr_context(&umr_context);
876 	umrwr->wr.wr_cqe = &umr_context.cqe;
877 
878 	down(&umrc->sem);
879 	err = ib_post_send(umrc->qp, &umrwr->wr, &bad);
880 	if (err) {
881 		mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err);
882 	} else {
883 		wait_for_completion(&umr_context.done);
884 		if (umr_context.status != IB_WC_SUCCESS) {
885 			mlx5_ib_warn(dev, "reg umr failed (%u)\n",
886 				     umr_context.status);
887 			err = -EFAULT;
888 		}
889 	}
890 	up(&umrc->sem);
891 	return err;
892 }
893 
894 static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
895 						      unsigned int order)
896 {
897 	struct mlx5_mr_cache *cache = &dev->cache;
898 
899 	if (order < cache->ent[0].order)
900 		return &cache->ent[0];
901 	order = order - cache->ent[0].order;
902 	if (order > MR_CACHE_LAST_STD_ENTRY)
903 		return NULL;
904 	return &cache->ent[order];
905 }
906 
907 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
908 			  u64 length, int access_flags, u64 iova)
909 {
910 	mr->ibmr.lkey = mr->mmkey.key;
911 	mr->ibmr.rkey = mr->mmkey.key;
912 	mr->ibmr.length = length;
913 	mr->ibmr.device = &dev->ib_dev;
914 	mr->ibmr.iova = iova;
915 	mr->access_flags = access_flags;
916 }
917 
918 static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
919 						  u64 iova)
920 {
921 	/*
922 	 * The alignment of iova has already been checked upon entering
923 	 * UVERBS_METHOD_REG_DMABUF_MR
924 	 */
925 	umem->iova = iova;
926 	return PAGE_SIZE;
927 }
928 
929 static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
930 					     struct ib_umem *umem, u64 iova,
931 					     int access_flags)
932 {
933 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
934 	struct mlx5_cache_ent *ent;
935 	struct mlx5_ib_mr *mr;
936 	unsigned int page_size;
937 
938 	if (umem->is_dmabuf)
939 		page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
940 	else
941 		page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
942 						     0, iova);
943 	if (WARN_ON(!page_size))
944 		return ERR_PTR(-EINVAL);
945 	ent = mr_cache_ent_from_order(
946 		dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
947 	/*
948 	 * Matches access in alloc_cache_mr(). If the MR can't come from the
949 	 * cache then synchronously create an uncached one.
950 	 */
951 	if (!ent || ent->limit == 0 ||
952 	    !mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) {
953 		mutex_lock(&dev->slow_path_mutex);
954 		mr = reg_create(pd, umem, iova, access_flags, page_size, false);
955 		mutex_unlock(&dev->slow_path_mutex);
956 		return mr;
957 	}
958 
959 	mr = mlx5_mr_cache_alloc(dev, ent, access_flags);
960 	if (IS_ERR(mr))
961 		return mr;
962 
963 	mr->ibmr.pd = pd;
964 	mr->umem = umem;
965 	mr->page_shift = order_base_2(page_size);
966 	set_mr_fields(dev, mr, umem->length, access_flags, iova);
967 
968 	return mr;
969 }
970 
/* Upper bound, in bytes, of the first XLT buffer size mlx5_ib_alloc_xlt() tries. */
971 #define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \
972 			    MLX5_UMR_MTT_ALIGNMENT)
973 #define MLX5_SPARE_UMR_CHUNK 0x10000 /* 64 KiB: second size mlx5_ib_alloc_xlt() tries when the first allocation fails */
974 
975 /*
976  * Allocate a temporary buffer to hold the per-page information to transfer to
977  * HW. For efficiency this should be as large as it can be, but buffer
978  * allocation failure is not allowed, so try smaller sizes.
979  */
980 static void *mlx5_ib_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
981 {
982 	const size_t xlt_chunk_align =
983 		MLX5_UMR_MTT_ALIGNMENT / ent_size;
984 	size_t size;
985 	void *res = NULL;
986 
987 	static_assert(PAGE_SIZE % MLX5_UMR_MTT_ALIGNMENT == 0);
988 
989 	/*
990 	 * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context just that the
991 	 * allocation can't trigger any kind of reclaim.
992 	 */
993 	might_sleep();
994 
995 	gfp_mask |= __GFP_ZERO | __GFP_NORETRY;
996 
997 	/*
998 	 * If the system already has a suitable high order page then just use
999 	 * that, but don't try hard to create one. This max is about 1M, so a
1000 	 * free x86 huge page will satisfy it.
1001 	 */
1002 	size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align),
1003 		     MLX5_MAX_UMR_CHUNK);
1004 	*nents = size / ent_size;
1005 	res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
1006 				       get_order(size));
1007 	if (res)
1008 		return res;
1009 
1010 	if (size > MLX5_SPARE_UMR_CHUNK) {
1011 		size = MLX5_SPARE_UMR_CHUNK;
1012 		*nents = size / ent_size;
1013 		res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
1014 					       get_order(size));
1015 		if (res)
1016 			return res;
1017 	}
1018 
1019 	*nents = PAGE_SIZE / ent_size;
1020 	res = (void *)__get_free_page(gfp_mask);
1021 	if (res)
1022 		return res;
1023 
1024 	mutex_lock(&xlt_emergency_page_mutex);
1025 	memset(xlt_emergency_page, 0, PAGE_SIZE);
1026 	return xlt_emergency_page;
1027 }
1028 
1029 static void mlx5_ib_free_xlt(void *xlt, size_t length)
1030 {
1031 	if (xlt == xlt_emergency_page) {
1032 		mutex_unlock(&xlt_emergency_page_mutex);
1033 		return;
1034 	}
1035 
1036 	free_pages((unsigned long)xlt, get_order(length));
1037 }
1038 
1039 /*
1040  * Create a MLX5_IB_SEND_UMR_UPDATE_XLT work request and XLT buffer ready for
1041  * submission.
1042  */
1043 static void *mlx5_ib_create_xlt_wr(struct mlx5_ib_mr *mr,
1044 				   struct mlx5_umr_wr *wr, struct ib_sge *sg,
1045 				   size_t nents, size_t ent_size,
1046 				   unsigned int flags)
1047 {
1048 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
1049 	struct device *ddev = &dev->mdev->pdev->dev;
1050 	dma_addr_t dma;
1051 	void *xlt;
1052 
1053 	xlt = mlx5_ib_alloc_xlt(&nents, ent_size,
1054 				flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC :
1055 								 GFP_KERNEL);
1056 	sg->length = nents * ent_size;
1057 	dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE);
1058 	if (dma_mapping_error(ddev, dma)) {
1059 		mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
1060 		mlx5_ib_free_xlt(xlt, sg->length);
1061 		return NULL;
1062 	}
1063 	sg->addr = dma;
1064 	sg->lkey = dev->umrc.pd->local_dma_lkey;
1065 
1066 	memset(wr, 0, sizeof(*wr));
1067 	wr->wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT;
1068 	if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
1069 		wr->wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE;
1070 	wr->wr.sg_list = sg;
1071 	wr->wr.num_sge = 1;
1072 	wr->wr.opcode = MLX5_IB_WR_UMR;
1073 	wr->pd = mr->ibmr.pd;
1074 	wr->mkey = mr->mmkey.key;
1075 	wr->length = mr->ibmr.length;
1076 	wr->virt_addr = mr->ibmr.iova;
1077 	wr->access_flags = mr->access_flags;
1078 	wr->page_shift = mr->page_shift;
1079 	wr->xlt_size = sg->length;
1080 	return xlt;
1081 }
1082 
1083 static void mlx5_ib_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
1084 				   struct ib_sge *sg)
1085 {
1086 	struct device *ddev = &dev->mdev->pdev->dev;
1087 
1088 	dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE);
1089 	mlx5_ib_free_xlt(xlt, sg->length);
1090 }
1091 
1092 static unsigned int xlt_wr_final_send_flags(unsigned int flags)
1093 {
1094 	unsigned int res = 0;
1095 
1096 	if (flags & MLX5_IB_UPD_XLT_ENABLE)
1097 		res |= MLX5_IB_SEND_UMR_ENABLE_MR |
1098 		       MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS |
1099 		       MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1100 	if (flags & MLX5_IB_UPD_XLT_PD || flags & MLX5_IB_UPD_XLT_ACCESS)
1101 		res |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1102 	if (flags & MLX5_IB_UPD_XLT_ADDR)
1103 		res |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1104 	return res;
1105 }
1106 
1107 int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
1108 		       int page_shift, int flags)
1109 {
1110 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
1111 	struct device *ddev = &dev->mdev->pdev->dev;
1112 	void *xlt;
1113 	struct mlx5_umr_wr wr;
1114 	struct ib_sge sg;
1115 	int err = 0;
1116 	int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
1117 			       ? sizeof(struct mlx5_klm)
1118 			       : sizeof(struct mlx5_mtt);
1119 	const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size;
1120 	const int page_mask = page_align - 1;
1121 	size_t pages_mapped = 0;
1122 	size_t pages_to_map = 0;
1123 	size_t pages_iter;
1124 	size_t size_to_map = 0;
1125 	size_t orig_sg_length;
1126 
1127 	if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
1128 	    !umr_can_use_indirect_mkey(dev))
1129 		return -EPERM;
1130 
1131 	if (WARN_ON(!mr->umem->is_odp))
1132 		return -EINVAL;
1133 
1134 	/* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
1135 	 * so we need to align the offset and length accordingly
1136 	 */
1137 	if (idx & page_mask) {
1138 		npages += idx & page_mask;
1139 		idx &= ~page_mask;
1140 	}
1141 	pages_to_map = ALIGN(npages, page_align);
1142 
1143 	xlt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, npages, desc_size, flags);
1144 	if (!xlt)
1145 		return -ENOMEM;
1146 	pages_iter = sg.length / desc_size;
1147 	orig_sg_length = sg.length;
1148 
1149 	if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
1150 		struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
1151 		size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
1152 
1153 		pages_to_map = min_t(size_t, pages_to_map, max_pages);
1154 	}
1155 
1156 	wr.page_shift = page_shift;
1157 
1158 	for (pages_mapped = 0;
1159 	     pages_mapped < pages_to_map && !err;
1160 	     pages_mapped += pages_iter, idx += pages_iter) {
1161 		npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
1162 		size_to_map = npages * desc_size;
1163 		dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
1164 					DMA_TO_DEVICE);
1165 		mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
1166 		dma_sync_single_for_device(ddev, sg.addr, sg.length,
1167 					   DMA_TO_DEVICE);
1168 
1169 		sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);
1170 
1171 		if (pages_mapped + pages_iter >= pages_to_map)
1172 			wr.wr.send_flags |= xlt_wr_final_send_flags(flags);
1173 
1174 		wr.offset = idx * desc_size;
1175 		wr.xlt_size = sg.length;
1176 
1177 		err = mlx5_ib_post_send_wait(dev, &wr);
1178 	}
1179 	sg.length = orig_sg_length;
1180 	mlx5_ib_unmap_free_xlt(dev, xlt, &sg);
1181 	return err;
1182 }
1183 
1184 /*
1185  * Send the DMA list to the HW for a normal MR using UMR.
1186  * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
1187  * flag may be used.
1188  */
1189 int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
1190 {
1191 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
1192 	struct device *ddev = &dev->mdev->pdev->dev;
1193 	struct ib_block_iter biter;
1194 	struct mlx5_mtt *cur_mtt;
1195 	struct mlx5_umr_wr wr;
1196 	size_t orig_sg_length;
1197 	struct mlx5_mtt *mtt;
1198 	size_t final_size;
1199 	struct ib_sge sg;
1200 	int err = 0;
1201 
1202 	if (WARN_ON(mr->umem->is_odp))
1203 		return -EINVAL;
1204 
1205 	mtt = mlx5_ib_create_xlt_wr(mr, &wr, &sg,
1206 				    ib_umem_num_dma_blocks(mr->umem,
1207 							   1 << mr->page_shift),
1208 				    sizeof(*mtt), flags);
1209 	if (!mtt)
1210 		return -ENOMEM;
1211 	orig_sg_length = sg.length;
1212 
1213 	cur_mtt = mtt;
1214 	rdma_for_each_block (mr->umem->sgt_append.sgt.sgl, &biter,
1215 			     mr->umem->sgt_append.sgt.nents,
1216 			     BIT(mr->page_shift)) {
1217 		if (cur_mtt == (void *)mtt + sg.length) {
1218 			dma_sync_single_for_device(ddev, sg.addr, sg.length,
1219 						   DMA_TO_DEVICE);
1220 			err = mlx5_ib_post_send_wait(dev, &wr);
1221 			if (err)
1222 				goto err;
1223 			dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
1224 						DMA_TO_DEVICE);
1225 			wr.offset += sg.length;
1226 			cur_mtt = mtt;
1227 		}
1228 
1229 		cur_mtt->ptag =
1230 			cpu_to_be64(rdma_block_iter_dma_address(&biter) |
1231 				    MLX5_IB_MTT_PRESENT);
1232 
1233 		if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
1234 			cur_mtt->ptag = 0;
1235 
1236 		cur_mtt++;
1237 	}
1238 
1239 	final_size = (void *)cur_mtt - (void *)mtt;
1240 	sg.length = ALIGN(final_size, MLX5_UMR_MTT_ALIGNMENT);
1241 	memset(cur_mtt, 0, sg.length - final_size);
1242 	wr.wr.send_flags |= xlt_wr_final_send_flags(flags);
1243 	wr.xlt_size = sg.length;
1244 
1245 	dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
1246 	err = mlx5_ib_post_send_wait(dev, &wr);
1247 
1248 err:
1249 	sg.length = orig_sg_length;
1250 	mlx5_ib_unmap_free_xlt(dev, mtt, &sg);
1251 	return err;
1252 }
1253 
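/*
 * Allocate a new MR and create its mkey with mlx5_ib_create_mkey(). When
 * populate is true the page list of the umem is placed in the create command
 * and the mkey uses the caller's PD; otherwise the mkey is created in the
 * free state on the UMR PD, without a page list.
 */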
1258 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
1259 				     u64 iova, int access_flags,
1260 				     unsigned int page_size, bool populate)
1261 {
1262 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1263 	struct mlx5_ib_mr *mr;
1264 	__be64 *pas;
1265 	void *mkc;
1266 	int inlen;
1267 	u32 *in;
1268 	int err;
1269 	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
1270 
1271 	if (!page_size)
1272 		return ERR_PTR(-EINVAL);
1273 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1274 	if (!mr)
1275 		return ERR_PTR(-ENOMEM);
1276 
1277 	mr->ibmr.pd = pd;
1278 	mr->access_flags = access_flags;
1279 	mr->page_shift = order_base_2(page_size);
1280 
1281 	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1282 	if (populate)
1283 		inlen += sizeof(*pas) *
1284 			 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
1285 	in = kvzalloc(inlen, GFP_KERNEL);
1286 	if (!in) {
1287 		err = -ENOMEM;
1288 		goto err_1;
1289 	}
1290 	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1291 	if (populate) {
1292 		if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
1293 			err = -EINVAL;
1294 			goto err_2;
1295 		}
1296 		mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
1297 				     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1298 	}
1299 
1300 	/* The pg_access bit allows setting the access flags
1301 	 * in the page list submitted with the command. */
1302 	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
1303 
1304 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1305 	set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
1306 				      populate ? pd : dev->umrc.pd);
1307 	MLX5_SET(mkc, mkc, free, !populate);
1308 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
1309 	MLX5_SET(mkc, mkc, umr_en, 1);
1310 
1311 	MLX5_SET64(mkc, mkc, len, umem->length);
1312 	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
1313 	MLX5_SET(mkc, mkc, translations_octword_size,
1314 		 get_octo_len(iova, umem->length, mr->page_shift));
1315 	MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
1316 	if (populate) {
1317 		MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
1318 			 get_octo_len(iova, umem->length, mr->page_shift));
1319 	}
1320 
1321 	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1322 	if (err) {
1323 		mlx5_ib_warn(dev, "create mkey failed\n");
1324 		goto err_2;
1325 	}
1326 	mr->mmkey.type = MLX5_MKEY_MR;
1327 	mr->umem = umem;
1328 	set_mr_fields(dev, mr, umem->length, access_flags, iova);
1329 	kvfree(in);
1330 
1331 	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
1332 
1333 	return mr;
1334 
1335 err_2:
1336 	kvfree(in);
1337 err_1:
1338 	kfree(mr);
1339 	return ERR_PTR(err);
1340 }
1341 
1342 static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
1343 				       u64 length, int acc, int mode)
1344 {
1345 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1346 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1347 	struct mlx5_ib_mr *mr;
1348 	void *mkc;
1349 	u32 *in;
1350 	int err;
1351 
1352 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1353 	if (!mr)
1354 		return ERR_PTR(-ENOMEM);
1355 
1356 	in = kzalloc(inlen, GFP_KERNEL);
1357 	if (!in) {
1358 		err = -ENOMEM;
1359 		goto err_free;
1360 	}
1361 
1362 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1363 
1364 	MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
1365 	MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
1366 	MLX5_SET64(mkc, mkc, len, length);
1367 	set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
1368 
1369 	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1370 	if (err)
1371 		goto err_in;
1372 
1373 	kfree(in);
1374 
1375 	set_mr_fields(dev, mr, length, acc, start_addr);
1376 
1377 	return &mr->ibmr;
1378 
1379 err_in:
1380 	kfree(in);
1381 
1382 err_free:
1383 	kfree(mr);
1384 
1385 	return ERR_PTR(err);
1386 }
1387 
1388 int mlx5_ib_advise_mr(struct ib_pd *pd,
1389 		      enum ib_uverbs_advise_mr_advice advice,
1390 		      u32 flags,
1391 		      struct ib_sge *sg_list,
1392 		      u32 num_sge,
1393 		      struct uverbs_attr_bundle *attrs)
1394 {
1395 	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
1396 	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1397 	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1398 		return -EOPNOTSUPP;
1399 
1400 	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
1401 					 sg_list, num_sge);
1402 }
1403 
1404 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
1405 				struct ib_dm_mr_attr *attr,
1406 				struct uverbs_attr_bundle *attrs)
1407 {
1408 	struct mlx5_ib_dm *mdm = to_mdm(dm);
1409 	struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
1410 	u64 start_addr = mdm->dev_addr + attr->offset;
1411 	int mode;
1412 
1413 	switch (mdm->type) {
1414 	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
1415 		if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
1416 			return ERR_PTR(-EINVAL);
1417 
1418 		mode = MLX5_MKC_ACCESS_MODE_MEMIC;
1419 		start_addr -= pci_resource_start(dev->pdev, 0);
1420 		break;
1421 	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
1422 	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
1423 		if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
1424 			return ERR_PTR(-EINVAL);
1425 
1426 		mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
1427 		break;
1428 	default:
1429 		return ERR_PTR(-EINVAL);
1430 	}
1431 
1432 	return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
1433 				 attr->access_flags, mode);
1434 }
1435 
1436 static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
1437 				    u64 iova, int access_flags)
1438 {
1439 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1440 	struct mlx5_ib_mr *mr = NULL;
1441 	bool xlt_with_umr;
1442 	int err;
1443 
1444 	xlt_with_umr = mlx5_ib_can_load_pas_with_umr(dev, umem->length);
1445 	if (xlt_with_umr) {
1446 		mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
1447 	} else {
1448 		unsigned int page_size = mlx5_umem_find_best_pgsz(
1449 			umem, mkc, log_page_size, 0, iova);
1450 
1451 		mutex_lock(&dev->slow_path_mutex);
1452 		mr = reg_create(pd, umem, iova, access_flags, page_size, true);
1453 		mutex_unlock(&dev->slow_path_mutex);
1454 	}
1455 	if (IS_ERR(mr)) {
1456 		ib_umem_release(umem);
1457 		return ERR_CAST(mr);
1458 	}
1459 
1460 	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1461 
1462 	atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1463 
1464 	if (xlt_with_umr) {
1465 		/*
1466 		 * If the MR was created with reg_create then it will be
1467 		 * configured properly but left disabled. It is safe to go ahead
1468 		 * and configure it again via UMR while enabling it.
1469 		 */
1470 		err = mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
1471 		if (err) {
1472 			mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1473 			return ERR_PTR(err);
1474 		}
1475 	}
1476 	return &mr->ibmr;
1477 }
1478 
1479 static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
1480 					u64 iova, int access_flags,
1481 					struct ib_udata *udata)
1482 {
1483 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1484 	struct ib_umem_odp *odp;
1485 	struct mlx5_ib_mr *mr;
1486 	int err;
1487 
1488 	if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1489 		return ERR_PTR(-EOPNOTSUPP);
1490 
1491 	err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
1492 	if (err)
1493 		return ERR_PTR(err);
1494 	if (!start && length == U64_MAX) {
1495 		if (iova != 0)
1496 			return ERR_PTR(-EINVAL);
1497 		if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1498 			return ERR_PTR(-EINVAL);
1499 
1500 		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
1501 		if (IS_ERR(mr))
1502 			return ERR_CAST(mr);
1503 		return &mr->ibmr;
1504 	}
1505 
1506 	/* ODP requires xlt update via umr to work. */
1507 	if (!mlx5_ib_can_load_pas_with_umr(dev, length))
1508 		return ERR_PTR(-EINVAL);
1509 
1510 	odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
1511 			      &mlx5_mn_ops);
1512 	if (IS_ERR(odp))
1513 		return ERR_CAST(odp);
1514 
1515 	mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags);
1516 	if (IS_ERR(mr)) {
1517 		ib_umem_release(&odp->umem);
1518 		return ERR_CAST(mr);
1519 	}
1520 	xa_init(&mr->implicit_children);
1521 
1522 	odp->private = mr;
1523 	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1524 	if (err)
1525 		goto err_dereg_mr;
1526 
1527 	err = mlx5_ib_init_odp_mr(mr);
1528 	if (err)
1529 		goto err_dereg_mr;
1530 	return &mr->ibmr;
1531 
1532 err_dereg_mr:
1533 	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1534 	return ERR_PTR(err);
1535 }
1536 
1537 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1538 				  u64 iova, int access_flags,
1539 				  struct ib_udata *udata)
1540 {
1541 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1542 	struct ib_umem *umem;
1543 
1544 	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1545 		return ERR_PTR(-EOPNOTSUPP);
1546 
1547 	mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1548 		    start, iova, length, access_flags);
1549 
1550 	if (access_flags & IB_ACCESS_ON_DEMAND)
1551 		return create_user_odp_mr(pd, start, length, iova, access_flags,
1552 					  udata);
1553 	umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
1554 	if (IS_ERR(umem))
1555 		return ERR_CAST(umem);
1556 	return create_real_mr(pd, umem, iova, access_flags);
1557 }
1558 
1559 static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
1560 {
1561 	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
1562 	struct mlx5_ib_mr *mr = umem_dmabuf->private;
1563 
1564 	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
1565 
1566 	if (!umem_dmabuf->sgt)
1567 		return;
1568 
1569 	mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
1570 	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
1571 }
1572 
1573 static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
1574 	.allow_peer2peer = 1,
1575 	.move_notify = mlx5_ib_dmabuf_invalidate_cb,
1576 };
1577 
1578 struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
1579 					 u64 length, u64 virt_addr,
1580 					 int fd, int access_flags,
1581 					 struct ib_udata *udata)
1582 {
1583 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1584 	struct mlx5_ib_mr *mr = NULL;
1585 	struct ib_umem_dmabuf *umem_dmabuf;
1586 	int err;
1587 
1588 	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
1589 	    !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1590 		return ERR_PTR(-EOPNOTSUPP);
1591 
1592 	mlx5_ib_dbg(dev,
1593 		    "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
1594 		    offset, virt_addr, length, fd, access_flags);
1595 
1596 	/* dmabuf requires xlt update via umr to work. */
1597 	if (!mlx5_ib_can_load_pas_with_umr(dev, length))
1598 		return ERR_PTR(-EINVAL);
1599 
1600 	umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd,
1601 					 access_flags,
1602 					 &mlx5_ib_dmabuf_attach_ops);
1603 	if (IS_ERR(umem_dmabuf)) {
1604 		mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
1605 			    PTR_ERR(umem_dmabuf));
1606 		return ERR_CAST(umem_dmabuf);
1607 	}
1608 
1609 	mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
1610 				access_flags);
1611 	if (IS_ERR(mr)) {
1612 		ib_umem_release(&umem_dmabuf->umem);
1613 		return ERR_CAST(mr);
1614 	}
1615 
1616 	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1617 
1618 	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
1619 	umem_dmabuf->private = mr;
1620 	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1621 	if (err)
1622 		goto err_dereg_mr;
1623 
1624 	err = mlx5_ib_init_dmabuf_mr(mr);
1625 	if (err)
1626 		goto err_dereg_mr;
1627 	return &mr->ibmr;
1628 
1629 err_dereg_mr:
1630 	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1631 	return ERR_PTR(err);
1632 }
1633 
1634 /**
1635  * revoke_mr - Fence all DMA on the MR
1636  * @mr: The MR to fence
1637  *
1638  * Upon return the NIC will not be doing any DMA to the pages under the MR,
1639  * and any DMA in progress will be completed. Failure of this function
1640  * indicates the HW has failed catastrophically.
1641  */
1642 static int revoke_mr(struct mlx5_ib_mr *mr)
1643 {
1644 	struct mlx5_umr_wr umrwr = {};
1645 
1646 	if (mr_to_mdev(mr)->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
1647 		return 0;
1648 
1649 	umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR |
1650 			      MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1651 	umrwr.wr.opcode = MLX5_IB_WR_UMR;
1652 	umrwr.pd = mr_to_mdev(mr)->umrc.pd;
1653 	umrwr.mkey = mr->mmkey.key;
1654 	umrwr.ignore_free_state = 1;
1655 
1656 	return mlx5_ib_post_send_wait(mr_to_mdev(mr), &umrwr);
1657 }
1658 
1659 /*
1660  * True if the change in access flags can be done via UMR, only some access
1661  * flags can be updated.
1662  */
1663 static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
1664 				     unsigned int current_access_flags,
1665 				     unsigned int target_access_flags)
1666 {
1667 	unsigned int diffs = current_access_flags ^ target_access_flags;
1668 
1669 	if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
1670 		      IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING))
1671 		return false;
1672 	return mlx5_ib_can_reconfig_with_umr(dev, current_access_flags,
1673 					     target_access_flags);
1674 }
1675 
1676 static int umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
1677 			       int access_flags)
1678 {
1679 	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1680 	struct mlx5_umr_wr umrwr = {
1681 		.wr = {
1682 			.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
1683 				      MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS,
1684 			.opcode = MLX5_IB_WR_UMR,
1685 		},
1686 		.mkey = mr->mmkey.key,
1687 		.pd = pd,
1688 		.access_flags = access_flags,
1689 	};
1690 	int err;
1691 
1692 	err = mlx5_ib_post_send_wait(dev, &umrwr);
1693 	if (err)
1694 		return err;
1695 
1696 	mr->access_flags = access_flags;
1697 	return 0;
1698 }
1699 
1700 static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
1701 				  struct ib_umem *new_umem,
1702 				  int new_access_flags, u64 iova,
1703 				  unsigned long *page_size)
1704 {
1705 	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1706 
1707 	/* We only track the allocated sizes of MRs from the cache */
1708 	if (!mr->cache_ent)
1709 		return false;
1710 	if (!mlx5_ib_can_load_pas_with_umr(dev, new_umem->length))
1711 		return false;
1712 
1713 	*page_size =
1714 		mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
1715 	if (WARN_ON(!*page_size))
1716 		return false;
1717 	return (1ULL << mr->cache_ent->order) >=
1718 	       ib_umem_num_dma_blocks(new_umem, *page_size);
1719 }
1720 
1721 static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
1722 			 int access_flags, int flags, struct ib_umem *new_umem,
1723 			 u64 iova, unsigned long page_size)
1724 {
1725 	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1726 	int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
1727 	struct ib_umem *old_umem = mr->umem;
1728 	int err;
1729 
1730 	/*
1731 	 * To keep everything simple the MR is revoked before we start to mess
1732 	 * with it. This ensure the change is atomic relative to any use of the
1733 	 * MR.
1734 	 */
1735 	err = revoke_mr(mr);
1736 	if (err)
1737 		return err;
1738 
1739 	if (flags & IB_MR_REREG_PD) {
1740 		mr->ibmr.pd = pd;
1741 		upd_flags |= MLX5_IB_UPD_XLT_PD;
1742 	}
1743 	if (flags & IB_MR_REREG_ACCESS) {
1744 		mr->access_flags = access_flags;
1745 		upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
1746 	}
1747 
1748 	mr->ibmr.length = new_umem->length;
1749 	mr->ibmr.iova = iova;
1750 	mr->ibmr.length = new_umem->length;
1751 	mr->page_shift = order_base_2(page_size);
1752 	mr->umem = new_umem;
1753 	err = mlx5_ib_update_mr_pas(mr, upd_flags);
1754 	if (err) {
1755 		/*
1756 		 * The MR is revoked at this point so there is no issue to free
1757 		 * new_umem.
1758 		 */
1759 		mr->umem = old_umem;
1760 		return err;
1761 	}
1762 
1763 	atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
1764 	ib_umem_release(old_umem);
1765 	atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
1766 	return 0;
1767 }
1768 
1769 struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1770 				    u64 length, u64 iova, int new_access_flags,
1771 				    struct ib_pd *new_pd,
1772 				    struct ib_udata *udata)
1773 {
1774 	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1775 	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1776 	int err;
1777 
1778 	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1779 		return ERR_PTR(-EOPNOTSUPP);
1780 
1781 	mlx5_ib_dbg(
1782 		dev,
1783 		"start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1784 		start, iova, length, new_access_flags);
1785 
1786 	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
1787 		return ERR_PTR(-EOPNOTSUPP);
1788 
1789 	if (!(flags & IB_MR_REREG_ACCESS))
1790 		new_access_flags = mr->access_flags;
1791 	if (!(flags & IB_MR_REREG_PD))
1792 		new_pd = ib_mr->pd;
1793 
1794 	if (!(flags & IB_MR_REREG_TRANS)) {
1795 		struct ib_umem *umem;
1796 
1797 		/* Fast path for PD/access change */
1798 		if (can_use_umr_rereg_access(dev, mr->access_flags,
1799 					     new_access_flags)) {
1800 			err = umr_rereg_pd_access(mr, new_pd, new_access_flags);
1801 			if (err)
1802 				return ERR_PTR(err);
1803 			return NULL;
1804 		}
1805 		/* DM or ODP MR's don't have a normal umem so we can't re-use it */
1806 		if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1807 			goto recreate;
1808 
1809 		/*
1810 		 * Only one active MR can refer to a umem at one time, revoke
1811 		 * the old MR before assigning the umem to the new one.
1812 		 */
1813 		err = revoke_mr(mr);
1814 		if (err)
1815 			return ERR_PTR(err);
1816 		umem = mr->umem;
1817 		mr->umem = NULL;
1818 		atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1819 
1820 		return create_real_mr(new_pd, umem, mr->ibmr.iova,
1821 				      new_access_flags);
1822 	}
1823 
1824 	/*
1825 	 * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does
1826 	 * but the logic around releasing the umem is different
1827 	 */
1828 	if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1829 		goto recreate;
1830 
1831 	if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
1832 	    can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
1833 		struct ib_umem *new_umem;
1834 		unsigned long page_size;
1835 
1836 		new_umem = ib_umem_get(&dev->ib_dev, start, length,
1837 				       new_access_flags);
1838 		if (IS_ERR(new_umem))
1839 			return ERR_CAST(new_umem);
1840 
1841 		/* Fast path for PAS change */
1842 		if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
1843 					  &page_size)) {
1844 			err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
1845 					    new_umem, iova, page_size);
1846 			if (err) {
1847 				ib_umem_release(new_umem);
1848 				return ERR_PTR(err);
1849 			}
1850 			return NULL;
1851 		}
1852 		return create_real_mr(new_pd, new_umem, iova, new_access_flags);
1853 	}
1854 
1855 	/*
1856 	 * Everything else has no state we can preserve, just create a new MR
1857 	 * from scratch
1858 	 */
1859 recreate:
1860 	return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
1861 				   new_access_flags, udata);
1862 }
1863 
1864 static int
1865 mlx5_alloc_priv_descs(struct ib_device *device,
1866 		      struct mlx5_ib_mr *mr,
1867 		      int ndescs,
1868 		      int desc_size)
1869 {
1870 	struct mlx5_ib_dev *dev = to_mdev(device);
1871 	struct device *ddev = &dev->mdev->pdev->dev;
1872 	int size = ndescs * desc_size;
1873 	int add_size;
1874 	int ret;
1875 
1876 	add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
1877 
1878 	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1879 	if (!mr->descs_alloc)
1880 		return -ENOMEM;
1881 
1882 	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1883 
1884 	mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
1885 	if (dma_mapping_error(ddev, mr->desc_map)) {
1886 		ret = -ENOMEM;
1887 		goto err;
1888 	}
1889 
1890 	return 0;
1891 err:
1892 	kfree(mr->descs_alloc);
1893 
1894 	return ret;
1895 }
1896 
1897 static void
1898 mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
1899 {
1900 	if (!mr->umem && mr->descs) {
1901 		struct ib_device *device = mr->ibmr.device;
1902 		int size = mr->max_descs * mr->desc_size;
1903 		struct mlx5_ib_dev *dev = to_mdev(device);
1904 
1905 		dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
1906 				 DMA_TO_DEVICE);
1907 		kfree(mr->descs_alloc);
1908 		mr->descs = NULL;
1909 	}
1910 }
1911 
1912 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
1913 {
1914 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
1915 	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
1916 	int rc;
1917 
1918 	/*
1919 	 * Any async use of the mr must hold the refcount, once the refcount
1920 	 * goes to zero no other thread, such as ODP page faults, prefetch, any
1921 	 * UMR activity, etc can touch the mkey. Thus it is safe to destroy it.
1922 	 */
1923 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
1924 	    refcount_read(&mr->mmkey.usecount) != 0 &&
1925 	    xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
1926 		mlx5r_deref_wait_odp_mkey(&mr->mmkey);
1927 
1928 	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
1929 		xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
1930 			   mr->sig, NULL, GFP_KERNEL);
1931 
1932 		if (mr->mtt_mr) {
1933 			rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
1934 			if (rc)
1935 				return rc;
1936 			mr->mtt_mr = NULL;
1937 		}
1938 		if (mr->klm_mr) {
1939 			rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
1940 			if (rc)
1941 				return rc;
1942 			mr->klm_mr = NULL;
1943 		}
1944 
1945 		if (mlx5_core_destroy_psv(dev->mdev,
1946 					  mr->sig->psv_memory.psv_idx))
1947 			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1948 				     mr->sig->psv_memory.psv_idx);
1949 		if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
1950 			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1951 				     mr->sig->psv_wire.psv_idx);
1952 		kfree(mr->sig);
1953 		mr->sig = NULL;
1954 	}
1955 
1956 	/* Stop DMA */
1957 	if (mr->cache_ent) {
1958 		if (revoke_mr(mr)) {
1959 			spin_lock_irq(&mr->cache_ent->lock);
1960 			mr->cache_ent->total_mrs--;
1961 			spin_unlock_irq(&mr->cache_ent->lock);
1962 			mr->cache_ent = NULL;
1963 		}
1964 	}
1965 	if (!mr->cache_ent) {
1966 		rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
1967 		if (rc)
1968 			return rc;
1969 	}
1970 
1971 	if (mr->umem) {
1972 		bool is_odp = is_odp_mr(mr);
1973 
1974 		if (!is_odp)
1975 			atomic_sub(ib_umem_num_pages(mr->umem),
1976 				   &dev->mdev->priv.reg_pages);
1977 		ib_umem_release(mr->umem);
1978 		if (is_odp)
1979 			mlx5_ib_free_odp_mr(mr);
1980 	}
1981 
1982 	if (mr->cache_ent) {
1983 		mlx5_mr_cache_free(dev, mr);
1984 	} else {
1985 		mlx5_free_priv_descs(mr);
1986 		kfree(mr);
1987 	}
1988 	return 0;
1989 }
1990 
1991 static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
1992 				   int access_mode, int page_shift)
1993 {
1994 	void *mkc;
1995 
1996 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1997 
1998 	/* This is only used from the kernel, so setting the PD is OK. */
1999 	set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
2000 	MLX5_SET(mkc, mkc, free, 1);
2001 	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2002 	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
2003 	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
2004 	MLX5_SET(mkc, mkc, umr_en, 1);
2005 	MLX5_SET(mkc, mkc, log_page_size, page_shift);
2006 }
2007 
2008 static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2009 				  int ndescs, int desc_size, int page_shift,
2010 				  int access_mode, u32 *in, int inlen)
2011 {
2012 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
2013 	int err;
2014 
2015 	mr->access_mode = access_mode;
2016 	mr->desc_size = desc_size;
2017 	mr->max_descs = ndescs;
2018 
2019 	err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
2020 	if (err)
2021 		return err;
2022 
2023 	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
2024 
2025 	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
2026 	if (err)
2027 		goto err_free_descs;
2028 
2029 	mr->mmkey.type = MLX5_MKEY_MR;
2030 	mr->ibmr.lkey = mr->mmkey.key;
2031 	mr->ibmr.rkey = mr->mmkey.key;
2032 
2033 	return 0;
2034 
2035 err_free_descs:
2036 	mlx5_free_priv_descs(mr);
2037 	return err;
2038 }
2039 
2040 static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
2041 				u32 max_num_sg, u32 max_num_meta_sg,
2042 				int desc_size, int access_mode)
2043 {
2044 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2045 	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
2046 	int page_shift = 0;
2047 	struct mlx5_ib_mr *mr;
2048 	u32 *in;
2049 	int err;
2050 
2051 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2052 	if (!mr)
2053 		return ERR_PTR(-ENOMEM);
2054 
2055 	mr->ibmr.pd = pd;
2056 	mr->ibmr.device = pd->device;
2057 
2058 	in = kzalloc(inlen, GFP_KERNEL);
2059 	if (!in) {
2060 		err = -ENOMEM;
2061 		goto err_free;
2062 	}
2063 
2064 	if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
2065 		page_shift = PAGE_SHIFT;
2066 
2067 	err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
2068 				     access_mode, in, inlen);
2069 	if (err)
2070 		goto err_free_in;
2071 
2072 	mr->umem = NULL;
2073 	kfree(in);
2074 
2075 	return mr;
2076 
2077 err_free_in:
2078 	kfree(in);
2079 err_free:
2080 	kfree(mr);
2081 	return ERR_PTR(err);
2082 }
2083 
2084 static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2085 				    int ndescs, u32 *in, int inlen)
2086 {
2087 	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
2088 				      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
2089 				      inlen);
2090 }
2091 
2092 static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2093 				    int ndescs, u32 *in, int inlen)
2094 {
2095 	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
2096 				      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2097 }
2098 
2099 static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2100 				      int max_num_sg, int max_num_meta_sg,
2101 				      u32 *in, int inlen)
2102 {
2103 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
2104 	u32 psv_index[2];
2105 	void *mkc;
2106 	int err;
2107 
2108 	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
2109 	if (!mr->sig)
2110 		return -ENOMEM;
2111 
2112 	/* create mem & wire PSVs */
2113 	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
2114 	if (err)
2115 		goto err_free_sig;
2116 
2117 	mr->sig->psv_memory.psv_idx = psv_index[0];
2118 	mr->sig->psv_wire.psv_idx = psv_index[1];
2119 
2120 	mr->sig->sig_status_checked = true;
2121 	mr->sig->sig_err_exists = false;
2122 	/* Next UMR, Arm SIGERR */
2123 	++mr->sig->sigerr_count;
2124 	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2125 					 sizeof(struct mlx5_klm),
2126 					 MLX5_MKC_ACCESS_MODE_KLMS);
2127 	if (IS_ERR(mr->klm_mr)) {
2128 		err = PTR_ERR(mr->klm_mr);
2129 		goto err_destroy_psv;
2130 	}
2131 	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2132 					 sizeof(struct mlx5_mtt),
2133 					 MLX5_MKC_ACCESS_MODE_MTT);
2134 	if (IS_ERR(mr->mtt_mr)) {
2135 		err = PTR_ERR(mr->mtt_mr);
2136 		goto err_free_klm_mr;
2137 	}
2138 
2139 	/* Set bsf descriptors for mkey */
2140 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2141 	MLX5_SET(mkc, mkc, bsf_en, 1);
2142 	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
2143 
2144 	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
2145 				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2146 	if (err)
2147 		goto err_free_mtt_mr;
2148 
2149 	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
2150 			      mr->sig, GFP_KERNEL));
2151 	if (err)
2152 		goto err_free_descs;
2153 	return 0;
2154 
2155 err_free_descs:
2156 	destroy_mkey(dev, mr);
2157 	mlx5_free_priv_descs(mr);
2158 err_free_mtt_mr:
2159 	mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
2160 	mr->mtt_mr = NULL;
2161 err_free_klm_mr:
2162 	mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
2163 	mr->klm_mr = NULL;
2164 err_destroy_psv:
2165 	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
2166 		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
2167 			     mr->sig->psv_memory.psv_idx);
2168 	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
2169 		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
2170 			     mr->sig->psv_wire.psv_idx);
2171 err_free_sig:
2172 	kfree(mr->sig);
2173 
2174 	return err;
2175 }
2176 
2177 static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
2178 					enum ib_mr_type mr_type, u32 max_num_sg,
2179 					u32 max_num_meta_sg)
2180 {
2181 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
2182 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2183 	int ndescs = ALIGN(max_num_sg, 4);
2184 	struct mlx5_ib_mr *mr;
2185 	u32 *in;
2186 	int err;
2187 
2188 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2189 	if (!mr)
2190 		return ERR_PTR(-ENOMEM);
2191 
2192 	in = kzalloc(inlen, GFP_KERNEL);
2193 	if (!in) {
2194 		err = -ENOMEM;
2195 		goto err_free;
2196 	}
2197 
2198 	mr->ibmr.device = pd->device;
2199 	mr->umem = NULL;
2200 
2201 	switch (mr_type) {
2202 	case IB_MR_TYPE_MEM_REG:
2203 		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
2204 		break;
2205 	case IB_MR_TYPE_SG_GAPS:
2206 		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
2207 		break;
2208 	case IB_MR_TYPE_INTEGRITY:
2209 		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
2210 						 max_num_meta_sg, in, inlen);
2211 		break;
2212 	default:
2213 		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
2214 		err = -EINVAL;
2215 	}
2216 
2217 	if (err)
2218 		goto err_free_in;
2219 
2220 	kfree(in);
2221 
2222 	return &mr->ibmr;
2223 
2224 err_free_in:
2225 	kfree(in);
2226 err_free:
2227 	kfree(mr);
2228 	return ERR_PTR(err);
2229 }
2230 
2231 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
2232 			       u32 max_num_sg)
2233 {
2234 	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
2235 }
2236 
2237 struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
2238 					 u32 max_num_sg, u32 max_num_meta_sg)
2239 {
2240 	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
2241 				  max_num_meta_sg);
2242 }
2243 
2244 int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
2245 {
2246 	struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
2247 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2248 	struct mlx5_ib_mw *mw = to_mmw(ibmw);
2249 	unsigned int ndescs;
2250 	u32 *in = NULL;
2251 	void *mkc;
2252 	int err;
2253 	struct mlx5_ib_alloc_mw req = {};
2254 	struct {
2255 		__u32	comp_mask;
2256 		__u32	response_length;
2257 	} resp = {};
2258 
2259 	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
2260 	if (err)
2261 		return err;
2262 
2263 	if (req.comp_mask || req.reserved1 || req.reserved2)
2264 		return -EOPNOTSUPP;
2265 
2266 	if (udata->inlen > sizeof(req) &&
2267 	    !ib_is_udata_cleared(udata, sizeof(req),
2268 				 udata->inlen - sizeof(req)))
2269 		return -EOPNOTSUPP;
2270 
2271 	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
2272 
2273 	in = kzalloc(inlen, GFP_KERNEL);
2274 	if (!in) {
2275 		err = -ENOMEM;
2276 		goto free;
2277 	}
2278 
2279 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2280 
2281 	MLX5_SET(mkc, mkc, free, 1);
2282 	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2283 	MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
2284 	MLX5_SET(mkc, mkc, umr_en, 1);
2285 	MLX5_SET(mkc, mkc, lr, 1);
2286 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
2287 	MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
2288 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
2289 
2290 	err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
2291 	if (err)
2292 		goto free;
2293 
2294 	mw->mmkey.type = MLX5_MKEY_MW;
2295 	ibmw->rkey = mw->mmkey.key;
2296 	mw->mmkey.ndescs = ndescs;
2297 
2298 	resp.response_length =
2299 		min(offsetofend(typeof(resp), response_length), udata->outlen);
2300 	if (resp.response_length) {
2301 		err = ib_copy_to_udata(udata, &resp, resp.response_length);
2302 		if (err)
2303 			goto free_mkey;
2304 	}
2305 
2306 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
2307 		err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
2308 		if (err)
2309 			goto free_mkey;
2310 	}
2311 
2312 	kfree(in);
2313 	return 0;
2314 
2315 free_mkey:
2316 	mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
2317 free:
2318 	kfree(in);
2319 	return err;
2320 }
2321 
2322 int mlx5_ib_dealloc_mw(struct ib_mw *mw)
2323 {
2324 	struct mlx5_ib_dev *dev = to_mdev(mw->device);
2325 	struct mlx5_ib_mw *mmw = to_mmw(mw);
2326 
2327 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
2328 	    xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
2329 		/*
2330 		 * pagefault_single_data_segment() may be accessing mmw
2331 		 * if the user bound an ODP MR to this MW.
2332 		 */
2333 		mlx5r_deref_wait_odp_mkey(&mmw->mmkey);
2334 
2335 	return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
2336 }
2337 
2338 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
2339 			    struct ib_mr_status *mr_status)
2340 {
2341 	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
2342 	int ret = 0;
2343 
2344 	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
2345 		pr_err("Invalid status check mask\n");
2346 		ret = -EINVAL;
2347 		goto done;
2348 	}
2349 
2350 	mr_status->fail_status = 0;
2351 	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
2352 		if (!mmr->sig) {
2353 			ret = -EINVAL;
2354 			pr_err("signature status check requested on a non-signature enabled MR\n");
2355 			goto done;
2356 		}
2357 
2358 		mmr->sig->sig_status_checked = true;
2359 		if (!mmr->sig->sig_err_exists)
2360 			goto done;
2361 
2362 		if (ibmr->lkey == mmr->sig->err_item.key)
2363 			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
2364 			       sizeof(mr_status->sig_err));
2365 		else {
2366 			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
2367 			mr_status->sig_err.sig_err_offset = 0;
2368 			mr_status->sig_err.key = mmr->sig->err_item.key;
2369 		}
2370 
2371 		mmr->sig->sig_err_exists = false;
2372 		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
2373 	}
2374 
2375 done:
2376 	return ret;
2377 }
2378 
2379 static int
2380 mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2381 			int data_sg_nents, unsigned int *data_sg_offset,
2382 			struct scatterlist *meta_sg, int meta_sg_nents,
2383 			unsigned int *meta_sg_offset)
2384 {
2385 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2386 	unsigned int sg_offset = 0;
2387 	int n = 0;
2388 
2389 	mr->meta_length = 0;
2390 	if (data_sg_nents == 1) {
2391 		n++;
2392 		mr->mmkey.ndescs = 1;
2393 		if (data_sg_offset)
2394 			sg_offset = *data_sg_offset;
2395 		mr->data_length = sg_dma_len(data_sg) - sg_offset;
2396 		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
2397 		if (meta_sg_nents == 1) {
2398 			n++;
2399 			mr->meta_ndescs = 1;
2400 			if (meta_sg_offset)
2401 				sg_offset = *meta_sg_offset;
2402 			else
2403 				sg_offset = 0;
2404 			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
2405 			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
2406 		}
2407 		ibmr->length = mr->data_length + mr->meta_length;
2408 	}
2409 
2410 	return n;
2411 }
2412 
2413 static int
2414 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
2415 		   struct scatterlist *sgl,
2416 		   unsigned short sg_nents,
2417 		   unsigned int *sg_offset_p,
2418 		   struct scatterlist *meta_sgl,
2419 		   unsigned short meta_sg_nents,
2420 		   unsigned int *meta_sg_offset_p)
2421 {
2422 	struct scatterlist *sg = sgl;
2423 	struct mlx5_klm *klms = mr->descs;
2424 	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
2425 	u32 lkey = mr->ibmr.pd->local_dma_lkey;
2426 	int i, j = 0;
2427 
2428 	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
2429 	mr->ibmr.length = 0;
2430 
2431 	for_each_sg(sgl, sg, sg_nents, i) {
2432 		if (unlikely(i >= mr->max_descs))
2433 			break;
2434 		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
2435 		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
2436 		klms[i].key = cpu_to_be32(lkey);
2437 		mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2438 
2439 		sg_offset = 0;
2440 	}
2441 
2442 	if (sg_offset_p)
2443 		*sg_offset_p = sg_offset;
2444 
2445 	mr->mmkey.ndescs = i;
2446 	mr->data_length = mr->ibmr.length;
2447 
2448 	if (meta_sg_nents) {
2449 		sg = meta_sgl;
2450 		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
2451 		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
2452 			if (unlikely(i + j >= mr->max_descs))
2453 				break;
2454 			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
2455 						     sg_offset);
2456 			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
2457 							 sg_offset);
2458 			klms[i + j].key = cpu_to_be32(lkey);
2459 			mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2460 
2461 			sg_offset = 0;
2462 		}
2463 		if (meta_sg_offset_p)
2464 			*meta_sg_offset_p = sg_offset;
2465 
2466 		mr->meta_ndescs = j;
2467 		mr->meta_length = mr->ibmr.length - mr->data_length;
2468 	}
2469 
2470 	return i + j;
2471 }
2472 
2473 static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
2474 {
2475 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2476 	__be64 *descs;
2477 
2478 	if (unlikely(mr->mmkey.ndescs == mr->max_descs))
2479 		return -ENOMEM;
2480 
2481 	descs = mr->descs;
2482 	descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2483 
2484 	return 0;
2485 }
2486 
2487 static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
2488 {
2489 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2490 	__be64 *descs;
2491 
2492 	if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
2493 		return -ENOMEM;
2494 
2495 	descs = mr->descs;
2496 	descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
2497 		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2498 
2499 	return 0;
2500 }
2501 
2502 static int
2503 mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2504 			 int data_sg_nents, unsigned int *data_sg_offset,
2505 			 struct scatterlist *meta_sg, int meta_sg_nents,
2506 			 unsigned int *meta_sg_offset)
2507 {
2508 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2509 	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
2510 	int n;
2511 
2512 	pi_mr->mmkey.ndescs = 0;
2513 	pi_mr->meta_ndescs = 0;
2514 	pi_mr->meta_length = 0;
2515 
2516 	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2517 				   pi_mr->desc_size * pi_mr->max_descs,
2518 				   DMA_TO_DEVICE);
2519 
2520 	pi_mr->ibmr.page_size = ibmr->page_size;
2521 	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
2522 			   mlx5_set_page);
2523 	if (n != data_sg_nents)
2524 		return n;
2525 
2526 	pi_mr->data_iova = pi_mr->ibmr.iova;
2527 	pi_mr->data_length = pi_mr->ibmr.length;
2528 	pi_mr->ibmr.length = pi_mr->data_length;
2529 	ibmr->length = pi_mr->data_length;
2530 
2531 	if (meta_sg_nents) {
2532 		u64 page_mask = ~((u64)ibmr->page_size - 1);
2533 		u64 iova = pi_mr->data_iova;
2534 
2535 		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
2536 				    meta_sg_offset, mlx5_set_page_pi);
2537 
2538 		pi_mr->meta_length = pi_mr->ibmr.length;
2539 		/*
2540 		 * PI address for the HW is the offset of the metadata address
2541 		 * relative to the first data page address.
2542 		 * It equals to first data page address + size of data pages +
2543 		 * metadata offset at the first metadata page
2544 		 */
2545 		pi_mr->pi_iova = (iova & page_mask) +
2546 				 pi_mr->mmkey.ndescs * ibmr->page_size +
2547 				 (pi_mr->ibmr.iova & ~page_mask);
2548 		/*
2549 		 * In order to use one MTT MR for data and metadata, we register
2550 		 * also the gaps between the end of the data and the start of
2551 		 * the metadata (the sig MR will verify that the HW will access
2552 		 * to right addresses). This mapping is safe because we use
2553 		 * internal mkey for the registration.
2554 		 */
2555 		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
2556 		pi_mr->ibmr.iova = iova;
2557 		ibmr->length += pi_mr->meta_length;
2558 	}
2559 
2560 	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2561 				      pi_mr->desc_size * pi_mr->max_descs,
2562 				      DMA_TO_DEVICE);
2563 
2564 	return n;
2565 }
2566 
2567 static int
2568 mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2569 			 int data_sg_nents, unsigned int *data_sg_offset,
2570 			 struct scatterlist *meta_sg, int meta_sg_nents,
2571 			 unsigned int *meta_sg_offset)
2572 {
2573 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2574 	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
2575 	int n;
2576 
2577 	pi_mr->mmkey.ndescs = 0;
2578 	pi_mr->meta_ndescs = 0;
2579 	pi_mr->meta_length = 0;
2580 
2581 	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2582 				   pi_mr->desc_size * pi_mr->max_descs,
2583 				   DMA_TO_DEVICE);
2584 
2585 	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
2586 			       meta_sg, meta_sg_nents, meta_sg_offset);
2587 
2588 	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2589 				      pi_mr->desc_size * pi_mr->max_descs,
2590 				      DMA_TO_DEVICE);
2591 
2592 	/* This is zero-based memory region */
2593 	pi_mr->data_iova = 0;
2594 	pi_mr->ibmr.iova = 0;
2595 	pi_mr->pi_iova = pi_mr->data_length;
2596 	ibmr->length = pi_mr->ibmr.length;
2597 
2598 	return n;
2599 }
2600 
2601 int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2602 			 int data_sg_nents, unsigned int *data_sg_offset,
2603 			 struct scatterlist *meta_sg, int meta_sg_nents,
2604 			 unsigned int *meta_sg_offset)
2605 {
2606 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2607 	struct mlx5_ib_mr *pi_mr = NULL;
2608 	int n;
2609 
2610 	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
2611 
2612 	mr->mmkey.ndescs = 0;
2613 	mr->data_length = 0;
2614 	mr->data_iova = 0;
2615 	mr->meta_ndescs = 0;
2616 	mr->pi_iova = 0;
2617 	/*
2618 	 * As a performance optimization, if possible, there is no need to
2619 	 * perform UMR operation to register the data/metadata buffers.
2620 	 * First try to map the sg lists to PA descriptors with local_dma_lkey.
2621 	 * Fallback to UMR only in case of a failure.
2622 	 */
2623 	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2624 				    data_sg_offset, meta_sg, meta_sg_nents,
2625 				    meta_sg_offset);
2626 	if (n == data_sg_nents + meta_sg_nents)
2627 		goto out;
2628 	/*
2629 	 * As a performance optimization, if possible, there is no need to map
2630 	 * the sg lists to KLM descriptors. First try to map the sg lists to MTT
2631 	 * descriptors and fallback to KLM only in case of a failure.
2632 	 * It's more efficient for the HW to work with MTT descriptors
2633 	 * (especially in high load).
2634 	 * Use KLM (indirect access) only if it's mandatory.
2635 	 */
2636 	pi_mr = mr->mtt_mr;
2637 	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2638 				     data_sg_offset, meta_sg, meta_sg_nents,
2639 				     meta_sg_offset);
2640 	if (n == data_sg_nents + meta_sg_nents)
2641 		goto out;
2642 
2643 	pi_mr = mr->klm_mr;
2644 	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2645 				     data_sg_offset, meta_sg, meta_sg_nents,
2646 				     meta_sg_offset);
2647 	if (unlikely(n != data_sg_nents + meta_sg_nents))
2648 		return -ENOMEM;
2649 
2650 out:
2651 	/* This is zero-based memory region */
2652 	ibmr->iova = 0;
2653 	mr->pi_mr = pi_mr;
2654 	if (pi_mr)
2655 		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
2656 	else
2657 		ibmr->sig_attrs->meta_length = mr->meta_length;
2658 
2659 	return 0;
2660 }
2661 
2662 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
2663 		      unsigned int *sg_offset)
2664 {
2665 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2666 	int n;
2667 
2668 	mr->mmkey.ndescs = 0;
2669 
2670 	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
2671 				   mr->desc_size * mr->max_descs,
2672 				   DMA_TO_DEVICE);
2673 
2674 	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
2675 		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
2676 				       NULL);
2677 	else
2678 		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
2679 				mlx5_set_page);
2680 
2681 	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
2682 				      mr->desc_size * mr->max_descs,
2683 				      DMA_TO_DEVICE);
2684 
2685 	return n;
2686 }
2687