xref: /openbmc/linux/drivers/infiniband/hw/mlx5/mr.c (revision 8010d74b9965b33182651767e9788ed84cf8e5f9)
1 /*
2  * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 
33 
34 #include <linux/kref.h>
35 #include <linux/random.h>
36 #include <linux/debugfs.h>
37 #include <linux/export.h>
38 #include <linux/delay.h>
39 #include <rdma/ib_umem.h>
40 #include <rdma/ib_umem_odp.h>
41 #include <rdma/ib_verbs.h>
42 #include "mlx5_ib.h"
43 
44 /*
45  * We can't use an array for xlt_emergency_page because dma_map_single
46  * doesn't work on kernel module memory.
47  */
48 void *xlt_emergency_page;
49 static DEFINE_MUTEX(xlt_emergency_page_mutex);
50 
51 enum {
52 	MAX_PENDING_REG_MR = 8,
53 };
54 
55 #define MLX5_UMR_ALIGN 2048
56 
57 static void
58 create_mkey_callback(int status, struct mlx5_async_work *context);
59 
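/*
 * Fill the access rights, PD and start address fields that are common to
 * all mkey contexts created by this driver. Relaxed ordering is only
 * requested when the device reports support for it.
 */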
60 static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
61 					  struct ib_pd *pd)
62 {
63 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
64 
65 	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
66 	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
67 	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
68 	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
69 	MLX5_SET(mkc, mkc, lr, 1);
70 
71 	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
72 		MLX5_SET(mkc, mkc, relaxed_ordering_write,
73 			 !!(acc & IB_ACCESS_RELAXED_ORDERING));
74 	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
75 		MLX5_SET(mkc, mkc, relaxed_ordering_read,
76 			 !!(acc & IB_ACCESS_RELAXED_ORDERING));
77 
78 	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
79 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
80 	MLX5_SET64(mkc, mkc, start_addr, start_addr);
81 }
82 
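/*
 * Stamp each new mkey with an 8-bit "variant" taken from a per-device
 * rolling counter. The variant occupies the low byte of the key, so a
 * recycled mkey index is unlikely to reproduce the key value of its
 * previous owner.
 */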
83 static void
84 assign_mkey_variant(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
85 		    u32 *in)
86 {
87 	u8 key = atomic_inc_return(&dev->mkey_var);
88 	void *mkc;
89 
90 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
91 	MLX5_SET(mkc, mkc, mkey_7_0, key);
92 	mkey->key = key;
93 }
94 
95 static int
96 mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
97 		    u32 *in, int inlen)
98 {
99 	assign_mkey_variant(dev, mkey, in);
100 	return mlx5_core_create_mkey(dev->mdev, mkey, in, inlen);
101 }
102 
103 static int
104 mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
105 		       struct mlx5_core_mkey *mkey,
106 		       struct mlx5_async_ctx *async_ctx,
107 		       u32 *in, int inlen, u32 *out, int outlen,
108 		       struct mlx5_async_work *context)
109 {
110 	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
111 	assign_mkey_variant(dev, mkey, in);
112 	return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen,
113 				create_mkey_callback, context);
114 }
115 
116 static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
117 static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
118 static int mr_cache_max_order(struct mlx5_ib_dev *dev);
119 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
120 
121 static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
122 {
123 	return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
124 }
125 
126 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
127 {
128 	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
129 
130 	return mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
131 }
132 
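/*
 * True when the MR came from the cache and the translation table sized for
 * its cache entry is large enough to cover the requested address range.
 */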
133 static inline bool mlx5_ib_pas_fits_in_mr(struct mlx5_ib_mr *mr, u64 start,
134 					  u64 length)
135 {
136 	if (!mr->cache_ent)
137 		return false;
138 	return ((u64)1 << mr->cache_ent->order) * MLX5_ADAPTER_PAGE_SIZE >=
139 		length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1));
140 }
141 
142 static void create_mkey_callback(int status, struct mlx5_async_work *context)
143 {
144 	struct mlx5_ib_mr *mr =
145 		container_of(context, struct mlx5_ib_mr, cb_work);
146 	struct mlx5_ib_dev *dev = mr->dev;
147 	struct mlx5_cache_ent *ent = mr->cache_ent;
148 	unsigned long flags;
149 
150 	if (status) {
151 		mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
152 		kfree(mr);
153 		spin_lock_irqsave(&ent->lock, flags);
154 		ent->pending--;
155 		WRITE_ONCE(dev->fill_delay, 1);
156 		spin_unlock_irqrestore(&ent->lock, flags);
157 		mod_timer(&dev->delay_timer, jiffies + HZ);
158 		return;
159 	}
160 
161 	mr->mmkey.type = MLX5_MKEY_MR;
162 	mr->mmkey.key |= mlx5_idx_to_mkey(
163 		MLX5_GET(create_mkey_out, mr->out, mkey_index));
164 
165 	WRITE_ONCE(dev->cache.last_add, jiffies);
166 
167 	spin_lock_irqsave(&ent->lock, flags);
168 	list_add_tail(&mr->list, &ent->head);
169 	ent->available_mrs++;
170 	ent->total_mrs++;
171 	/* If we are doing fill_to_high_water then keep going. */
172 	queue_adjust_cache_locked(ent);
173 	ent->pending--;
174 	spin_unlock_irqrestore(&ent->lock, flags);
175 }
176 
177 static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
178 {
179 	struct mlx5_ib_mr *mr;
180 
181 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
182 	if (!mr)
183 		return NULL;
184 	mr->cache_ent = ent;
185 	mr->dev = ent->dev;
186 
187 	set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
188 	MLX5_SET(mkc, mkc, free, 1);
189 	MLX5_SET(mkc, mkc, umr_en, 1);
190 	MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
191 	MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7);
192 
193 	MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
194 	MLX5_SET(mkc, mkc, log_page_size, ent->page);
195 	return mr;
196 }
197 
198 /* Asynchronously schedule new MRs to be populated in the cache. */
199 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
200 {
201 	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
202 	struct mlx5_ib_mr *mr;
203 	void *mkc;
204 	u32 *in;
205 	int err = 0;
206 	int i;
207 
208 	in = kzalloc(inlen, GFP_KERNEL);
209 	if (!in)
210 		return -ENOMEM;
211 
212 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
213 	for (i = 0; i < num; i++) {
214 		mr = alloc_cache_mr(ent, mkc);
215 		if (!mr) {
216 			err = -ENOMEM;
217 			break;
218 		}
219 		spin_lock_irq(&ent->lock);
220 		if (ent->pending >= MAX_PENDING_REG_MR) {
221 			err = -EAGAIN;
222 			spin_unlock_irq(&ent->lock);
223 			kfree(mr);
224 			break;
225 		}
226 		ent->pending++;
227 		spin_unlock_irq(&ent->lock);
228 		err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
229 					     &ent->dev->async_ctx, in, inlen,
230 					     mr->out, sizeof(mr->out),
231 					     &mr->cb_work);
232 		if (err) {
233 			spin_lock_irq(&ent->lock);
234 			ent->pending--;
235 			spin_unlock_irq(&ent->lock);
236 			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
237 			kfree(mr);
238 			break;
239 		}
240 	}
241 
242 	kfree(in);
243 	return err;
244 }
245 
246 /* Synchronously create an MR in the cache */
247 static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent)
248 {
249 	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
250 	struct mlx5_ib_mr *mr;
251 	void *mkc;
252 	u32 *in;
253 	int err;
254 
255 	in = kzalloc(inlen, GFP_KERNEL);
256 	if (!in)
257 		return ERR_PTR(-ENOMEM);
258 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
259 
260 	mr = alloc_cache_mr(ent, mkc);
261 	if (!mr) {
262 		err = -ENOMEM;
263 		goto free_in;
264 	}
265 
266 	err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey, in, inlen);
267 	if (err)
268 		goto free_mr;
269 
270 	mr->mmkey.type = MLX5_MKEY_MR;
271 	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
272 	spin_lock_irq(&ent->lock);
273 	ent->total_mrs++;
274 	spin_unlock_irq(&ent->lock);
275 	kfree(in);
276 	return mr;
277 free_mr:
278 	kfree(mr);
279 free_in:
280 	kfree(in);
281 	return ERR_PTR(err);
282 }
283 
284 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
285 {
286 	struct mlx5_ib_mr *mr;
287 
288 	lockdep_assert_held(&ent->lock);
289 	if (list_empty(&ent->head))
290 		return;
291 	mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
292 	list_del(&mr->list);
293 	ent->available_mrs--;
294 	ent->total_mrs--;
295 	spin_unlock_irq(&ent->lock);
296 	mlx5_core_destroy_mkey(ent->dev->mdev, &mr->mmkey);
297 	kfree(mr);
298 	spin_lock_irq(&ent->lock);
299 }
300 
301 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
302 				bool limit_fill)
303 {
304 	int err;
305 
306 	lockdep_assert_held(&ent->lock);
307 
308 	while (true) {
309 		if (limit_fill)
310 			target = ent->limit * 2;
311 		if (target == ent->available_mrs + ent->pending)
312 			return 0;
313 		if (target > ent->available_mrs + ent->pending) {
314 			u32 todo = target - (ent->available_mrs + ent->pending);
315 
316 			spin_unlock_irq(&ent->lock);
317 			err = add_keys(ent, todo);
318 			if (err == -EAGAIN)
319 				usleep_range(3000, 5000);
320 			spin_lock_irq(&ent->lock);
321 			if (err) {
322 				if (err != -EAGAIN)
323 					return err;
324 			} else
325 				return 0;
326 		} else {
327 			remove_cache_mr_locked(ent);
328 		}
329 	}
330 }
331 
332 static ssize_t size_write(struct file *filp, const char __user *buf,
333 			  size_t count, loff_t *pos)
334 {
335 	struct mlx5_cache_ent *ent = filp->private_data;
336 	u32 target;
337 	int err;
338 
339 	err = kstrtou32_from_user(buf, count, 0, &target);
340 	if (err)
341 		return err;
342 
343 	/*
344 	 * Target is the new value of total_mrs the user requests, however we
345 	 * cannot free MRs that are in use. Compute the target value for
346 	 * available_mrs.
347 	 */
348 	spin_lock_irq(&ent->lock);
349 	if (target < ent->total_mrs - ent->available_mrs) {
350 		err = -EINVAL;
351 		goto err_unlock;
352 	}
353 	target = target - (ent->total_mrs - ent->available_mrs);
354 	if (target < ent->limit || target > ent->limit*2) {
355 		err = -EINVAL;
356 		goto err_unlock;
357 	}
358 	err = resize_available_mrs(ent, target, false);
359 	if (err)
360 		goto err_unlock;
361 	spin_unlock_irq(&ent->lock);
362 
363 	return count;
364 
365 err_unlock:
366 	spin_unlock_irq(&ent->lock);
367 	return err;
368 }
369 
370 static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
371 			 loff_t *pos)
372 {
373 	struct mlx5_cache_ent *ent = filp->private_data;
374 	char lbuf[20];
375 	int err;
376 
377 	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs);
378 	if (err < 0)
379 		return err;
380 
381 	return simple_read_from_buffer(buf, count, pos, lbuf, err);
382 }
383 
384 static const struct file_operations size_fops = {
385 	.owner	= THIS_MODULE,
386 	.open	= simple_open,
387 	.write	= size_write,
388 	.read	= size_read,
389 };
390 
391 static ssize_t limit_write(struct file *filp, const char __user *buf,
392 			   size_t count, loff_t *pos)
393 {
394 	struct mlx5_cache_ent *ent = filp->private_data;
395 	u32 var;
396 	int err;
397 
398 	err = kstrtou32_from_user(buf, count, 0, &var);
399 	if (err)
400 		return err;
401 
402 	/*
403 	 * Upon set we immediately fill the cache to high water mark implied by
404 	 * the limit.
405 	 */
406 	spin_lock_irq(&ent->lock);
407 	ent->limit = var;
408 	err = resize_available_mrs(ent, 0, true);
409 	spin_unlock_irq(&ent->lock);
410 	if (err)
411 		return err;
412 	return count;
413 }
414 
415 static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
416 			  loff_t *pos)
417 {
418 	struct mlx5_cache_ent *ent = filp->private_data;
419 	char lbuf[20];
420 	int err;
421 
422 	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
423 	if (err < 0)
424 		return err;
425 
426 	return simple_read_from_buffer(buf, count, pos, lbuf, err);
427 }
428 
429 static const struct file_operations limit_fops = {
430 	.owner	= THIS_MODULE,
431 	.open	= simple_open,
432 	.write	= limit_write,
433 	.read	= limit_read,
434 };
435 
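/* True if any cache entry is still below its configured limit. */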
436 static bool someone_adding(struct mlx5_mr_cache *cache)
437 {
438 	unsigned int i;
439 
440 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
441 		struct mlx5_cache_ent *ent = &cache->ent[i];
442 		bool ret;
443 
444 		spin_lock_irq(&ent->lock);
445 		ret = ent->available_mrs < ent->limit;
446 		spin_unlock_irq(&ent->lock);
447 		if (ret)
448 			return true;
449 	}
450 	return false;
451 }
452 
453 /*
454  * Check if the bucket is outside the high/low water mark and schedule an async
455  * update. The cache refill has hysteresis, once the low water mark is hit it is
456  * refilled up to the high mark.
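 *
 * For example, with limit = 8 the low water mark is 8 available MRs and the
 * high water mark is 16: dropping below 8 starts an asynchronous refill that
 * continues until 16 MRs are available, while growing past 16 queues removal
 * of the excess entries.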
457  */
458 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
459 {
460 	lockdep_assert_held(&ent->lock);
461 
462 	if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
463 		return;
464 	if (ent->available_mrs < ent->limit) {
465 		ent->fill_to_high_water = true;
466 		queue_work(ent->dev->cache.wq, &ent->work);
467 	} else if (ent->fill_to_high_water &&
468 		   ent->available_mrs + ent->pending < 2 * ent->limit) {
469 		/*
470 		 * Once we start populating due to hitting a low water mark
471 		 * continue until we pass the high water mark.
472 		 */
473 		queue_work(ent->dev->cache.wq, &ent->work);
474 	} else if (ent->available_mrs == 2 * ent->limit) {
475 		ent->fill_to_high_water = false;
476 	} else if (ent->available_mrs > 2 * ent->limit) {
477 		/* Queue deletion of excess entries */
478 		ent->fill_to_high_water = false;
479 		if (ent->pending)
480 			queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
481 					   msecs_to_jiffies(1000));
482 		else
483 			queue_work(ent->dev->cache.wq, &ent->work);
484 	}
485 }
486 
487 static void __cache_work_func(struct mlx5_cache_ent *ent)
488 {
489 	struct mlx5_ib_dev *dev = ent->dev;
490 	struct mlx5_mr_cache *cache = &dev->cache;
491 	int err;
492 
493 	spin_lock_irq(&ent->lock);
494 	if (ent->disabled)
495 		goto out;
496 
497 	if (ent->fill_to_high_water &&
498 	    ent->available_mrs + ent->pending < 2 * ent->limit &&
499 	    !READ_ONCE(dev->fill_delay)) {
500 		spin_unlock_irq(&ent->lock);
501 		err = add_keys(ent, 1);
502 		spin_lock_irq(&ent->lock);
503 		if (ent->disabled)
504 			goto out;
505 		if (err) {
506 			/*
507 			 * EAGAIN only happens if pending is positive, so we
508 			 * will be rescheduled from create_mkey_callback(). The
509 			 * only failure path here is ENOMEM.
510 			 */
511 			if (err != -EAGAIN) {
512 				mlx5_ib_warn(
513 					dev,
514 					"command failed order %d, err %d\n",
515 					ent->order, err);
516 				queue_delayed_work(cache->wq, &ent->dwork,
517 						   msecs_to_jiffies(1000));
518 			}
519 		}
520 	} else if (ent->available_mrs > 2 * ent->limit) {
521 		bool need_delay;
522 
523 		/*
524 		 * The remove_cache_mr_locked() logic is performed as a garbage
525 		 * collection task. Such a task is intended to run when no other
526 		 * active processes are running.
527 		 *
528 		 * need_resched() returns TRUE if there are user tasks to be
529 		 * activated in the near future.
530 		 *
531 		 * In such a case, we don't execute remove_cache_mr_locked() and
532 		 * postpone the garbage collection work to the next cycle, in
533 		 * order to free CPU resources to other tasks.
534 		 */
535 		spin_unlock_irq(&ent->lock);
536 		need_delay = need_resched() || someone_adding(cache) ||
537 			     time_after(jiffies,
538 					READ_ONCE(cache->last_add) + 300 * HZ);
539 		spin_lock_irq(&ent->lock);
540 		if (ent->disabled)
541 			goto out;
542 		if (need_delay)
543 			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
544 		remove_cache_mr_locked(ent);
545 		queue_adjust_cache_locked(ent);
546 	}
547 out:
548 	spin_unlock_irq(&ent->lock);
549 }
550 
551 static void delayed_cache_work_func(struct work_struct *work)
552 {
553 	struct mlx5_cache_ent *ent;
554 
555 	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
556 	__cache_work_func(ent);
557 }
558 
559 static void cache_work_func(struct work_struct *work)
560 {
561 	struct mlx5_cache_ent *ent;
562 
563 	ent = container_of(work, struct mlx5_cache_ent, work);
564 	__cache_work_func(ent);
565 }
566 
567 /* Allocate a special entry from the cache */
568 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
569 				       unsigned int entry, int access_flags)
570 {
571 	struct mlx5_mr_cache *cache = &dev->cache;
572 	struct mlx5_cache_ent *ent;
573 	struct mlx5_ib_mr *mr;
574 
575 	if (WARN_ON(entry <= MR_CACHE_LAST_STD_ENTRY ||
576 		    entry >= ARRAY_SIZE(cache->ent)))
577 		return ERR_PTR(-EINVAL);
578 
579 	/* Matches access in alloc_cache_mr() */
580 	if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags))
581 		return ERR_PTR(-EOPNOTSUPP);
582 
583 	ent = &cache->ent[entry];
584 	spin_lock_irq(&ent->lock);
585 	if (list_empty(&ent->head)) {
586 		spin_unlock_irq(&ent->lock);
587 		mr = create_cache_mr(ent);
588 		if (IS_ERR(mr))
589 			return mr;
590 	} else {
591 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
592 		list_del(&mr->list);
593 		ent->available_mrs--;
594 		queue_adjust_cache_locked(ent);
595 		spin_unlock_irq(&ent->lock);
596 	}
597 	mr->access_flags = access_flags;
598 	return mr;
599 }
600 
601 /* Return an MR already available in the cache */
602 static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent)
603 {
604 	struct mlx5_ib_dev *dev = req_ent->dev;
605 	struct mlx5_ib_mr *mr = NULL;
606 	struct mlx5_cache_ent *ent = req_ent;
607 
608 	/* Try larger MR pools from the cache to satisfy the allocation */
609 	for (; ent != &dev->cache.ent[MR_CACHE_LAST_STD_ENTRY + 1]; ent++) {
610 		mlx5_ib_dbg(dev, "order %u, cache index %zu\n", ent->order,
611 			    ent - dev->cache.ent);
612 
613 		spin_lock_irq(&ent->lock);
614 		if (!list_empty(&ent->head)) {
615 			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
616 					      list);
617 			list_del(&mr->list);
618 			ent->available_mrs--;
619 			queue_adjust_cache_locked(ent);
620 			spin_unlock_irq(&ent->lock);
621 			break;
622 		}
623 		queue_adjust_cache_locked(ent);
624 		spin_unlock_irq(&ent->lock);
625 	}
626 
627 	if (!mr)
628 		req_ent->miss++;
629 
630 	return mr;
631 }
632 
633 static void detach_mr_from_cache(struct mlx5_ib_mr *mr)
634 {
635 	struct mlx5_cache_ent *ent = mr->cache_ent;
636 
637 	mr->cache_ent = NULL;
638 	spin_lock_irq(&ent->lock);
639 	ent->total_mrs--;
640 	spin_unlock_irq(&ent->lock);
641 }
642 
643 void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
644 {
645 	struct mlx5_cache_ent *ent = mr->cache_ent;
646 
647 	if (!ent)
648 		return;
649 
650 	if (mlx5_mr_cache_invalidate(mr)) {
651 		detach_mr_from_cache(mr);
652 		destroy_mkey(dev, mr);
653 		return;
654 	}
655 
656 	spin_lock_irq(&ent->lock);
657 	list_add_tail(&mr->list, &ent->head);
658 	ent->available_mrs++;
659 	queue_adjust_cache_locked(ent);
660 	spin_unlock_irq(&ent->lock);
661 }
662 
663 static void clean_keys(struct mlx5_ib_dev *dev, int c)
664 {
665 	struct mlx5_mr_cache *cache = &dev->cache;
666 	struct mlx5_cache_ent *ent = &cache->ent[c];
667 	struct mlx5_ib_mr *tmp_mr;
668 	struct mlx5_ib_mr *mr;
669 	LIST_HEAD(del_list);
670 
671 	cancel_delayed_work(&ent->dwork);
672 	while (1) {
673 		spin_lock_irq(&ent->lock);
674 		if (list_empty(&ent->head)) {
675 			spin_unlock_irq(&ent->lock);
676 			break;
677 		}
678 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
679 		list_move(&mr->list, &del_list);
680 		ent->available_mrs--;
681 		ent->total_mrs--;
682 		spin_unlock_irq(&ent->lock);
683 		mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
684 	}
685 
686 	list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
687 		list_del(&mr->list);
688 		kfree(mr);
689 	}
690 }
691 
692 static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
693 {
694 	if (!mlx5_debugfs_root || dev->is_rep)
695 		return;
696 
697 	debugfs_remove_recursive(dev->cache.root);
698 	dev->cache.root = NULL;
699 }
700 
701 static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
702 {
703 	struct mlx5_mr_cache *cache = &dev->cache;
704 	struct mlx5_cache_ent *ent;
705 	struct dentry *dir;
706 	int i;
707 
708 	if (!mlx5_debugfs_root || dev->is_rep)
709 		return;
710 
711 	cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
712 
713 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
714 		ent = &cache->ent[i];
715 		sprintf(ent->name, "%d", ent->order);
716 		dir = debugfs_create_dir(ent->name, cache->root);
717 		debugfs_create_file("size", 0600, dir, ent, &size_fops);
718 		debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
719 		debugfs_create_u32("cur", 0400, dir, &ent->available_mrs);
720 		debugfs_create_u32("miss", 0600, dir, &ent->miss);
721 	}
722 }
723 
724 static void delay_time_func(struct timer_list *t)
725 {
726 	struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
727 
728 	WRITE_ONCE(dev->fill_delay, 0);
729 }
730 
731 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
732 {
733 	struct mlx5_mr_cache *cache = &dev->cache;
734 	struct mlx5_cache_ent *ent;
735 	int i;
736 
737 	mutex_init(&dev->slow_path_mutex);
738 	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
739 	if (!cache->wq) {
740 		mlx5_ib_warn(dev, "failed to create work queue\n");
741 		return -ENOMEM;
742 	}
743 
744 	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
745 	timer_setup(&dev->delay_timer, delay_time_func, 0);
746 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
747 		ent = &cache->ent[i];
748 		INIT_LIST_HEAD(&ent->head);
749 		spin_lock_init(&ent->lock);
750 		ent->order = i + 2;
751 		ent->dev = dev;
752 		ent->limit = 0;
753 
754 		INIT_WORK(&ent->work, cache_work_func);
755 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
756 
757 		if (i > MR_CACHE_LAST_STD_ENTRY) {
758 			mlx5_odp_init_mr_cache_entry(ent);
759 			continue;
760 		}
761 
762 		if (ent->order > mr_cache_max_order(dev))
763 			continue;
764 
765 		ent->page = PAGE_SHIFT;
766 		ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) /
767 			   MLX5_IB_UMR_OCTOWORD;
768 		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
769 		if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) &&
770 		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
771 		    mlx5_ib_can_load_pas_with_umr(dev, 0))
772 			ent->limit = dev->mdev->profile->mr_cache[i].limit;
773 		else
774 			ent->limit = 0;
775 		spin_lock_irq(&ent->lock);
776 		queue_adjust_cache_locked(ent);
777 		spin_unlock_irq(&ent->lock);
778 	}
779 
780 	mlx5_mr_cache_debugfs_init(dev);
781 
782 	return 0;
783 }
784 
785 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
786 {
787 	unsigned int i;
788 
789 	if (!dev->cache.wq)
790 		return 0;
791 
792 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
793 		struct mlx5_cache_ent *ent = &dev->cache.ent[i];
794 
795 		spin_lock_irq(&ent->lock);
796 		ent->disabled = true;
797 		spin_unlock_irq(&ent->lock);
798 		cancel_work_sync(&ent->work);
799 		cancel_delayed_work_sync(&ent->dwork);
800 	}
801 
802 	mlx5_mr_cache_debugfs_cleanup(dev);
803 	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
804 
805 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
806 		clean_keys(dev, i);
807 
808 	destroy_workqueue(dev->cache.wq);
809 	del_timer_sync(&dev->delay_timer);
810 
811 	return 0;
812 }
813 
814 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
815 {
816 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
817 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
818 	struct mlx5_ib_mr *mr;
819 	void *mkc;
820 	u32 *in;
821 	int err;
822 
823 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
824 	if (!mr)
825 		return ERR_PTR(-ENOMEM);
826 
827 	in = kzalloc(inlen, GFP_KERNEL);
828 	if (!in) {
829 		err = -ENOMEM;
830 		goto err_free;
831 	}
832 
833 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
834 
835 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
836 	MLX5_SET(mkc, mkc, length64, 1);
837 	set_mkc_access_pd_addr_fields(mkc, acc, 0, pd);
838 
839 	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
840 	if (err)
841 		goto err_in;
842 
843 	kfree(in);
844 	mr->mmkey.type = MLX5_MKEY_MR;
845 	mr->ibmr.lkey = mr->mmkey.key;
846 	mr->ibmr.rkey = mr->mmkey.key;
847 	mr->umem = NULL;
848 
849 	return &mr->ibmr;
850 
851 err_in:
852 	kfree(in);
853 
854 err_free:
855 	kfree(mr);
856 
857 	return ERR_PTR(err);
858 }
859 
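/*
 * Number of 16-byte octowords needed to hold one 8-byte translation entry
 * for every page spanned by the range [addr, addr + len).
 */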
860 static int get_octo_len(u64 addr, u64 len, int page_shift)
861 {
862 	u64 page_size = 1ULL << page_shift;
863 	u64 offset;
864 	int npages;
865 
866 	offset = addr & (page_size - 1);
867 	npages = ALIGN(len + offset, page_size) >> page_shift;
868 	return (npages + 1) / 2;
869 }
870 
871 static int mr_cache_max_order(struct mlx5_ib_dev *dev)
872 {
873 	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
874 		return MR_CACHE_LAST_STD_ENTRY + 2;
875 	return MLX5_MAX_UMR_SHIFT;
876 }
877 
878 static struct ib_umem *mr_umem_get(struct mlx5_ib_dev *dev, u64 start,
879 				   u64 length, int access_flags)
880 {
881 	struct ib_umem *u;
882 
883 	if (access_flags & IB_ACCESS_ON_DEMAND) {
884 		struct ib_umem_odp *odp;
885 
886 		odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
887 				      &mlx5_mn_ops);
888 		if (IS_ERR(odp)) {
889 			mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
890 				    PTR_ERR(odp));
891 			return ERR_CAST(odp);
892 		}
893 		return &odp->umem;
894 	}
895 
896 	u = ib_umem_get(&dev->ib_dev, start, length, access_flags);
897 	if (IS_ERR(u)) {
898 		mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u));
899 		return u;
900 	}
901 	return u;
902 }
903 
904 static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc)
905 {
906 	struct mlx5_ib_umr_context *context =
907 		container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
908 
909 	context->status = wc->status;
910 	complete(&context->done);
911 }
912 
913 static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
914 {
915 	context->cqe.done = mlx5_ib_umr_done;
916 	context->status = -1;
917 	init_completion(&context->done);
918 }
919 
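/*
 * Post a UMR work request on the dedicated UMR QP and sleep until its
 * completion is reported; umrc->sem limits the number of UMRs in flight.
 */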
920 static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev,
921 				  struct mlx5_umr_wr *umrwr)
922 {
923 	struct umr_common *umrc = &dev->umrc;
924 	const struct ib_send_wr *bad;
925 	int err;
926 	struct mlx5_ib_umr_context umr_context;
927 
928 	mlx5_ib_init_umr_context(&umr_context);
929 	umrwr->wr.wr_cqe = &umr_context.cqe;
930 
931 	down(&umrc->sem);
932 	err = ib_post_send(umrc->qp, &umrwr->wr, &bad);
933 	if (err) {
934 		mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err);
935 	} else {
936 		wait_for_completion(&umr_context.done);
937 		if (umr_context.status != IB_WC_SUCCESS) {
938 			mlx5_ib_warn(dev, "reg umr failed (%u)\n",
939 				     umr_context.status);
940 			err = -EFAULT;
941 		}
942 	}
943 	up(&umrc->sem);
944 	return err;
945 }
946 
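/*
 * Map a registration order (log2 of the number of pages) to the cache entry
 * that serves it. Orders larger than the last standard entry are not served
 * from the cache.
 */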
947 static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
948 						      unsigned int order)
949 {
950 	struct mlx5_mr_cache *cache = &dev->cache;
951 
952 	if (order < cache->ent[0].order)
953 		return &cache->ent[0];
954 	order = order - cache->ent[0].order;
955 	if (order > MR_CACHE_LAST_STD_ENTRY)
956 		return NULL;
957 	return &cache->ent[order];
958 }
959 
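/*
 * Take an MR from the cache sized for this umem, creating one synchronously
 * if the matching bucket is empty. The translation entries are programmed
 * later via UMR.
 */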
960 static struct mlx5_ib_mr *alloc_mr_from_cache(struct ib_pd *pd,
961 					      struct ib_umem *umem, u64 iova,
962 					      int access_flags)
963 {
964 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
965 	struct mlx5_cache_ent *ent;
966 	struct mlx5_ib_mr *mr;
967 	int page_shift;
968 
969 	mlx5_ib_cont_pages(umem, iova, MLX5_MKEY_PAGE_SHIFT_MASK, &page_shift);
970 	ent = mr_cache_ent_from_order(dev, order_base_2(ib_umem_num_dma_blocks(
971 						   umem, 1UL << page_shift)));
972 	if (!ent)
973 		return ERR_PTR(-E2BIG);
974 
975 	/* Matches access in alloc_cache_mr() */
976 	if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags))
977 		return ERR_PTR(-EOPNOTSUPP);
978 
979 	mr = get_cache_mr(ent);
980 	if (!mr) {
981 		mr = create_cache_mr(ent);
982 		if (IS_ERR(mr))
983 			return mr;
984 	}
985 
986 	mr->ibmr.pd = pd;
987 	mr->umem = umem;
988 	mr->access_flags = access_flags;
989 	mr->desc_size = sizeof(struct mlx5_mtt);
990 	mr->mmkey.iova = iova;
991 	mr->mmkey.size = umem->length;
992 	mr->mmkey.pd = to_mpd(pd)->pdn;
993 	mr->page_shift = page_shift;
994 
995 	return mr;
996 }
997 
998 #define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \
999 			    MLX5_UMR_MTT_ALIGNMENT)
1000 #define MLX5_SPARE_UMR_CHUNK 0x10000
1001 
1002 /*
1003  * Allocate a temporary buffer to hold the per-page information to transfer to
1004  * HW. For efficiency this should be as large as it can be, but buffer
1005  * allocation failure is not allowed, so try smaller sizes.
1006  */
1007 static void *mlx5_ib_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
1008 {
1009 	const size_t xlt_chunk_align =
1010 		MLX5_UMR_MTT_ALIGNMENT / ent_size;
1011 	size_t size;
1012 	void *res = NULL;
1013 
1014 	static_assert(PAGE_SIZE % MLX5_UMR_MTT_ALIGNMENT == 0);
1015 
1016 	/*
1017 	 * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context, just that
1018 	 * the allocation can't trigger any kind of reclaim.
1019 	 */
1020 	might_sleep();
1021 
1022 	gfp_mask |= __GFP_ZERO;
1023 
1024 	/*
1025 	 * If the system already has a suitable high order page then just use
1026 	 * that, but don't try hard to create one. This max is about 1M, so a
1027 	 * free x86 huge page will satisfy it.
1028 	 */
1029 	size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align),
1030 		     MLX5_MAX_UMR_CHUNK);
1031 	*nents = size / ent_size;
1032 	res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
1033 				       get_order(size));
1034 	if (res)
1035 		return res;
1036 
1037 	if (size > MLX5_SPARE_UMR_CHUNK) {
1038 		size = MLX5_SPARE_UMR_CHUNK;
1039 		*nents = size / ent_size;
1040 		res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
1041 					       get_order(size));
1042 		if (res)
1043 			return res;
1044 	}
1045 
1046 	*nents = PAGE_SIZE / ent_size;
1047 	res = (void *)__get_free_page(gfp_mask);
1048 	if (res)
1049 		return res;
1050 
1051 	mutex_lock(&xlt_emergency_page_mutex);
1052 	memset(xlt_emergency_page, 0, PAGE_SIZE);
1053 	return xlt_emergency_page;
1054 }
1055 
1056 static void mlx5_ib_free_xlt(void *xlt, size_t length)
1057 {
1058 	if (xlt == xlt_emergency_page) {
1059 		mutex_unlock(&xlt_emergency_page_mutex);
1060 		return;
1061 	}
1062 
1063 	free_pages((unsigned long)xlt, get_order(length));
1064 }
1065 
1066 /*
1067  * Create a MLX5_IB_SEND_UMR_UPDATE_XLT work request and XLT buffer ready for
1068  * submission.
1069  */
1070 static void *mlx5_ib_create_xlt_wr(struct mlx5_ib_mr *mr,
1071 				   struct mlx5_umr_wr *wr, struct ib_sge *sg,
1072 				   size_t nents, size_t ent_size,
1073 				   unsigned int flags)
1074 {
1075 	struct mlx5_ib_dev *dev = mr->dev;
1076 	struct device *ddev = dev->ib_dev.dev.parent;
1077 	dma_addr_t dma;
1078 	void *xlt;
1079 
1080 	xlt = mlx5_ib_alloc_xlt(&nents, ent_size,
1081 				flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC :
1082 								 GFP_KERNEL);
1083 	sg->length = nents * ent_size;
1084 	dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE);
1085 	if (dma_mapping_error(ddev, dma)) {
1086 		mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
1087 		mlx5_ib_free_xlt(xlt, sg->length);
1088 		return NULL;
1089 	}
1090 	sg->addr = dma;
1091 	sg->lkey = dev->umrc.pd->local_dma_lkey;
1092 
1093 	memset(wr, 0, sizeof(*wr));
1094 	wr->wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT;
1095 	if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
1096 		wr->wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE;
1097 	wr->wr.sg_list = sg;
1098 	wr->wr.num_sge = 1;
1099 	wr->wr.opcode = MLX5_IB_WR_UMR;
1100 	wr->pd = mr->ibmr.pd;
1101 	wr->mkey = mr->mmkey.key;
1102 	wr->length = mr->mmkey.size;
1103 	wr->virt_addr = mr->mmkey.iova;
1104 	wr->access_flags = mr->access_flags;
1105 	wr->page_shift = mr->page_shift;
1106 	wr->xlt_size = sg->length;
1107 	return xlt;
1108 }
1109 
1110 static void mlx5_ib_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
1111 				   struct ib_sge *sg)
1112 {
1113 	struct device *ddev = dev->ib_dev.dev.parent;
1114 
1115 	dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE);
1116 	mlx5_ib_free_xlt(xlt, sg->length);
1117 }
1118 
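/*
 * Write npages translation entries of the MR, starting at index idx, by
 * posting one or more UPDATE_XLT UMR work requests through a DMA-mapped
 * bounce buffer.
 */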
1119 int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
1120 		       int page_shift, int flags)
1121 {
1122 	struct mlx5_ib_dev *dev = mr->dev;
1123 	struct device *ddev = dev->ib_dev.dev.parent;
1124 	void *xlt;
1125 	struct mlx5_umr_wr wr;
1126 	struct ib_sge sg;
1127 	int err = 0;
1128 	int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
1129 			       ? sizeof(struct mlx5_klm)
1130 			       : sizeof(struct mlx5_mtt);
1131 	const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size;
1132 	const int page_mask = page_align - 1;
1133 	size_t pages_mapped = 0;
1134 	size_t pages_to_map = 0;
1135 	size_t pages_iter;
1136 	size_t size_to_map = 0;
1137 	size_t orig_sg_length;
1138 
1139 	if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
1140 	    !umr_can_use_indirect_mkey(dev))
1141 		return -EPERM;
1142 
1143 	/* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
1144 	 * so we need to align the offset and length accordingly
1145 	 */
1146 	if (idx & page_mask) {
1147 		npages += idx & page_mask;
1148 		idx &= ~page_mask;
1149 	}
1150 	pages_to_map = ALIGN(npages, page_align);
1151 
1152 	xlt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, npages, desc_size, flags);
1153 	if (!xlt)
1154 		return -ENOMEM;
1155 	pages_iter = sg.length / desc_size;
1156 	orig_sg_length = sg.length;
1157 
1158 	if (mr->umem->is_odp) {
1159 		if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
1160 			struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
1161 			size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
1162 
1163 			pages_to_map = min_t(size_t, pages_to_map, max_pages);
1164 		}
1165 	}
1166 
1167 	wr.page_shift = page_shift;
1168 
1169 	for (pages_mapped = 0;
1170 	     pages_mapped < pages_to_map && !err;
1171 	     pages_mapped += pages_iter, idx += pages_iter) {
1172 		npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
1173 		size_to_map = npages * desc_size;
1174 		dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
1175 					DMA_TO_DEVICE);
1176 		if (mr->umem->is_odp) {
1177 			mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
1178 		} else {
1179 			__mlx5_ib_populate_pas(dev, mr->umem, page_shift, idx,
1180 					       npages, xlt,
1181 					       MLX5_IB_MTT_PRESENT);
1182 			/* Clear padding after the pages
1183 			 * brought from the umem.
1184 			 */
1185 			memset(xlt + size_to_map, 0, sg.length - size_to_map);
1186 		}
1187 		dma_sync_single_for_device(ddev, sg.addr, sg.length,
1188 					   DMA_TO_DEVICE);
1189 
1190 		sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);
1191 
1192 		if (pages_mapped + pages_iter >= pages_to_map) {
1193 			if (flags & MLX5_IB_UPD_XLT_ENABLE)
1194 				wr.wr.send_flags |=
1195 					MLX5_IB_SEND_UMR_ENABLE_MR |
1196 					MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS |
1197 					MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1198 			if (flags & MLX5_IB_UPD_XLT_PD ||
1199 			    flags & MLX5_IB_UPD_XLT_ACCESS)
1200 				wr.wr.send_flags |=
1201 					MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1202 			if (flags & MLX5_IB_UPD_XLT_ADDR)
1203 				wr.wr.send_flags |=
1204 					MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1205 		}
1206 
1207 		wr.offset = idx * desc_size;
1208 		wr.xlt_size = sg.length;
1209 
1210 		err = mlx5_ib_post_send_wait(dev, &wr);
1211 	}
1212 	sg.length = orig_sg_length;
1213 	mlx5_ib_unmap_free_xlt(dev, xlt, &sg);
1214 	return err;
1215 }
1216 
1217 /*
1218  * If ibmr is NULL it will be allocated by reg_create.
1219  * Else, the given ibmr will be used.
1220  */
1221 static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
1222 				     struct ib_umem *umem, u64 iova,
1223 				     int access_flags, bool populate)
1224 {
1225 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1226 	struct mlx5_ib_mr *mr;
1227 	int page_shift;
1228 	__be64 *pas;
1229 	void *mkc;
1230 	int inlen;
1231 	u32 *in;
1232 	int err;
1233 	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
1234 
1235 	mr = ibmr ? to_mmr(ibmr) : kzalloc(sizeof(*mr), GFP_KERNEL);
1236 	if (!mr)
1237 		return ERR_PTR(-ENOMEM);
1238 
1239 	mlx5_ib_cont_pages(umem, iova, MLX5_MKEY_PAGE_SHIFT_MASK, &page_shift);
1240 
1241 	mr->page_shift = page_shift;
1242 	mr->ibmr.pd = pd;
1243 	mr->access_flags = access_flags;
1244 
1245 	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1246 	if (populate)
1247 		inlen +=
1248 			sizeof(*pas) *
1249 			roundup(ib_umem_num_dma_blocks(umem, 1UL << page_shift),
1250 				2);
1251 	in = kvzalloc(inlen, GFP_KERNEL);
1252 	if (!in) {
1253 		err = -ENOMEM;
1254 		goto err_1;
1255 	}
1256 	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1257 	if (populate) {
1258 		if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
1259 			err = -EINVAL;
1260 			goto err_2;
1261 		}
1262 		mlx5_ib_populate_pas(umem, 1ULL << page_shift, pas,
1263 				     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1264 	}
1265 
1266 	/* The pg_access bit allows setting the access flags
1267 	 * in the page list submitted with the command. */
1268 	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
1269 
1270 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1271 	set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
1272 				      populate ? pd : dev->umrc.pd);
1273 	MLX5_SET(mkc, mkc, free, !populate);
1274 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
1275 	MLX5_SET(mkc, mkc, umr_en, 1);
1276 
1277 	MLX5_SET64(mkc, mkc, len, umem->length);
1278 	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
1279 	MLX5_SET(mkc, mkc, translations_octword_size,
1280 		 get_octo_len(iova, umem->length, page_shift));
1281 	MLX5_SET(mkc, mkc, log_page_size, page_shift);
1282 	if (populate) {
1283 		MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
1284 			 get_octo_len(iova, umem->length, page_shift));
1285 	}
1286 
1287 	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1288 	if (err) {
1289 		mlx5_ib_warn(dev, "create mkey failed\n");
1290 		goto err_2;
1291 	}
1292 	mr->mmkey.type = MLX5_MKEY_MR;
1293 	mr->desc_size = sizeof(struct mlx5_mtt);
1294 	mr->dev = dev;
1295 	kvfree(in);
1296 
1297 	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
1298 
1299 	return mr;
1300 
1301 err_2:
1302 	kvfree(in);
1303 
1304 err_1:
1305 	if (!ibmr)
1306 		kfree(mr);
1307 
1308 	return ERR_PTR(err);
1309 }
1310 
1311 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
1312 			  u64 length, int access_flags)
1313 {
1314 	mr->ibmr.lkey = mr->mmkey.key;
1315 	mr->ibmr.rkey = mr->mmkey.key;
1316 	mr->ibmr.length = length;
1317 	mr->access_flags = access_flags;
1318 }
1319 
1320 static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
1321 				       u64 length, int acc, int mode)
1322 {
1323 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1324 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1325 	struct mlx5_ib_mr *mr;
1326 	void *mkc;
1327 	u32 *in;
1328 	int err;
1329 
1330 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1331 	if (!mr)
1332 		return ERR_PTR(-ENOMEM);
1333 
1334 	in = kzalloc(inlen, GFP_KERNEL);
1335 	if (!in) {
1336 		err = -ENOMEM;
1337 		goto err_free;
1338 	}
1339 
1340 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1341 
1342 	MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
1343 	MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
1344 	MLX5_SET64(mkc, mkc, len, length);
1345 	set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
1346 
1347 	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1348 	if (err)
1349 		goto err_in;
1350 
1351 	kfree(in);
1352 
1353 	set_mr_fields(dev, mr, length, acc);
1354 
1355 	return &mr->ibmr;
1356 
1357 err_in:
1358 	kfree(in);
1359 
1360 err_free:
1361 	kfree(mr);
1362 
1363 	return ERR_PTR(err);
1364 }
1365 
1366 int mlx5_ib_advise_mr(struct ib_pd *pd,
1367 		      enum ib_uverbs_advise_mr_advice advice,
1368 		      u32 flags,
1369 		      struct ib_sge *sg_list,
1370 		      u32 num_sge,
1371 		      struct uverbs_attr_bundle *attrs)
1372 {
1373 	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
1374 	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1375 	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1376 		return -EOPNOTSUPP;
1377 
1378 	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
1379 					 sg_list, num_sge);
1380 }
1381 
1382 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
1383 				struct ib_dm_mr_attr *attr,
1384 				struct uverbs_attr_bundle *attrs)
1385 {
1386 	struct mlx5_ib_dm *mdm = to_mdm(dm);
1387 	struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
1388 	u64 start_addr = mdm->dev_addr + attr->offset;
1389 	int mode;
1390 
1391 	switch (mdm->type) {
1392 	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
1393 		if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
1394 			return ERR_PTR(-EINVAL);
1395 
1396 		mode = MLX5_MKC_ACCESS_MODE_MEMIC;
1397 		start_addr -= pci_resource_start(dev->pdev, 0);
1398 		break;
1399 	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
1400 	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
1401 		if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
1402 			return ERR_PTR(-EINVAL);
1403 
1404 		mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
1405 		break;
1406 	default:
1407 		return ERR_PTR(-EINVAL);
1408 	}
1409 
1410 	return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
1411 				 attr->access_flags, mode);
1412 }
1413 
1414 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1415 				  u64 virt_addr, int access_flags,
1416 				  struct ib_udata *udata)
1417 {
1418 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1419 	struct mlx5_ib_mr *mr = NULL;
1420 	bool xlt_with_umr;
1421 	struct ib_umem *umem;
1422 	int err;
1423 
1424 	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1425 		return ERR_PTR(-EOPNOTSUPP);
1426 
1427 	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
1428 		    start, virt_addr, length, access_flags);
1429 
1430 	xlt_with_umr = mlx5_ib_can_load_pas_with_umr(dev, length);
1431 	/* ODP requires xlt update via umr to work. */
1432 	if (!xlt_with_umr && (access_flags & IB_ACCESS_ON_DEMAND))
1433 		return ERR_PTR(-EINVAL);
1434 
1435 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && !start &&
1436 	    length == U64_MAX) {
1437 		if (virt_addr != start)
1438 			return ERR_PTR(-EINVAL);
1439 		if (!(access_flags & IB_ACCESS_ON_DEMAND) ||
1440 		    !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1441 			return ERR_PTR(-EINVAL);
1442 
1443 		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), udata, access_flags);
1444 		if (IS_ERR(mr))
1445 			return ERR_CAST(mr);
1446 		return &mr->ibmr;
1447 	}
1448 
1449 	umem = mr_umem_get(dev, start, length, access_flags);
1450 	if (IS_ERR(umem))
1451 		return ERR_CAST(umem);
1452 
1453 	if (xlt_with_umr) {
1454 		mr = alloc_mr_from_cache(pd, umem, virt_addr, access_flags);
1455 		if (IS_ERR(mr))
1456 			mr = NULL;
1457 	}
1458 
1459 	if (!mr) {
1460 		mutex_lock(&dev->slow_path_mutex);
1461 		mr = reg_create(NULL, pd, umem, virt_addr, access_flags,
1462 				!xlt_with_umr);
1463 		mutex_unlock(&dev->slow_path_mutex);
1464 	}
1465 
1466 	if (IS_ERR(mr)) {
1467 		err = PTR_ERR(mr);
1468 		goto error;
1469 	}
1470 
1471 	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1472 
1473 	mr->umem = umem;
1474 	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
1475 	set_mr_fields(dev, mr, length, access_flags);
1476 
1477 	if (xlt_with_umr && !(access_flags & IB_ACCESS_ON_DEMAND)) {
1478 		/*
1479 		 * If the MR was created with reg_create then it will be
1480 		 * configured properly but left disabled. It is safe to go ahead
1481 		 * and configure it again via UMR while enabling it.
1482 		 */
1483 		int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE;
1484 
1485 		err = mlx5_ib_update_xlt(
1486 			mr, 0,
1487 			ib_umem_num_dma_blocks(umem, 1UL << mr->page_shift),
1488 			mr->page_shift, update_xlt_flags);
1489 		if (err) {
1490 			dereg_mr(dev, mr);
1491 			return ERR_PTR(err);
1492 		}
1493 	}
1494 
1495 	if (is_odp_mr(mr)) {
1496 		to_ib_umem_odp(mr->umem)->private = mr;
1497 		init_waitqueue_head(&mr->q_deferred_work);
1498 		atomic_set(&mr->num_deferred_work, 0);
1499 		err = xa_err(xa_store(&dev->odp_mkeys,
1500 				      mlx5_base_mkey(mr->mmkey.key), &mr->mmkey,
1501 				      GFP_KERNEL));
1502 		if (err) {
1503 			dereg_mr(dev, mr);
1504 			return ERR_PTR(err);
1505 		}
1506 
1507 		err = mlx5_ib_init_odp_mr(mr, xlt_with_umr);
1508 		if (err) {
1509 			dereg_mr(dev, mr);
1510 			return ERR_PTR(err);
1511 		}
1512 	}
1513 
1514 	return &mr->ibmr;
1515 error:
1516 	ib_umem_release(umem);
1517 	return ERR_PTR(err);
1518 }
1519 
1520 /**
1521  * mlx5_mr_cache_invalidate - Fence all DMA on the MR
1522  * @mr: The MR to fence
1523  *
1524  * Upon return the NIC will not be doing any DMA to the pages under the MR,
1525  * and any DMA in progress will be completed. Failure of this function
1526  * indicates the HW has failed catastrophically.
1527  */
1528 int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr)
1529 {
1530 	struct mlx5_umr_wr umrwr = {};
1531 
1532 	if (mr->dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
1533 		return 0;
1534 
1535 	umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR |
1536 			      MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1537 	umrwr.wr.opcode = MLX5_IB_WR_UMR;
1538 	umrwr.pd = mr->dev->umrc.pd;
1539 	umrwr.mkey = mr->mmkey.key;
1540 	umrwr.ignore_free_state = 1;
1541 
1542 	return mlx5_ib_post_send_wait(mr->dev, &umrwr);
1543 }
1544 
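/*
 * Update the PD and/or access flags of an existing mkey in place with a UMR
 * work request, without touching its translation entries.
 */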
1545 static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1546 		     int access_flags, int flags)
1547 {
1548 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1549 	struct mlx5_umr_wr umrwr = {};
1550 	int err;
1551 
1552 	umrwr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE;
1553 
1554 	umrwr.wr.opcode = MLX5_IB_WR_UMR;
1555 	umrwr.mkey = mr->mmkey.key;
1556 
1557 	if (flags & IB_MR_REREG_PD || flags & IB_MR_REREG_ACCESS) {
1558 		umrwr.pd = pd;
1559 		umrwr.access_flags = access_flags;
1560 		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1561 	}
1562 
1563 	err = mlx5_ib_post_send_wait(dev, &umrwr);
1564 
1565 	return err;
1566 }
1567 
1568 int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1569 			  u64 length, u64 virt_addr, int new_access_flags,
1570 			  struct ib_pd *new_pd, struct ib_udata *udata)
1571 {
1572 	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1573 	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1574 	struct ib_pd *pd = (flags & IB_MR_REREG_PD) ? new_pd : ib_mr->pd;
1575 	int access_flags = flags & IB_MR_REREG_ACCESS ?
1576 			    new_access_flags :
1577 			    mr->access_flags;
1578 	int upd_flags = 0;
1579 	u64 addr, len;
1580 	int err;
1581 
1582 	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
1583 		    start, virt_addr, length, access_flags);
1584 
1585 	if (!mr->umem)
1586 		return -EINVAL;
1587 
1588 	if (is_odp_mr(mr))
1589 		return -EOPNOTSUPP;
1590 
1591 	if (flags & IB_MR_REREG_TRANS) {
1592 		addr = virt_addr;
1593 		len = length;
1594 	} else {
1595 		addr = mr->umem->address;
1596 		len = mr->umem->length;
1597 	}
1598 
1599 	if (flags != IB_MR_REREG_PD) {
1600 		/*
1601 		 * Replace umem. This needs to be done whether or not UMR is
1602 		 * used.
1603 		 */
1604 		flags |= IB_MR_REREG_TRANS;
1605 		atomic_sub(ib_umem_num_pages(mr->umem),
1606 			   &dev->mdev->priv.reg_pages);
1607 		ib_umem_release(mr->umem);
1608 		mr->umem = mr_umem_get(dev, addr, len, access_flags);
1609 		if (IS_ERR(mr->umem)) {
1610 			err = PTR_ERR(mr->umem);
1611 			mr->umem = NULL;
1612 			goto err;
1613 		}
1614 		atomic_add(ib_umem_num_pages(mr->umem),
1615 			   &dev->mdev->priv.reg_pages);
1616 	}
1617 
1618 	if (!mlx5_ib_can_reconfig_with_umr(dev, mr->access_flags,
1619 					   access_flags) ||
1620 	    !mlx5_ib_can_load_pas_with_umr(dev, len) ||
1621 	    (flags & IB_MR_REREG_TRANS &&
1622 	     !mlx5_ib_pas_fits_in_mr(mr, addr, len))) {
1623 		/*
1624 		 * UMR can't be used - MKey needs to be replaced.
1625 		 */
1626 		if (mr->cache_ent)
1627 			detach_mr_from_cache(mr);
1628 		err = destroy_mkey(dev, mr);
1629 		if (err)
1630 			goto err;
1631 
1632 		mr = reg_create(ib_mr, pd, mr->umem, addr, access_flags, true);
1633 		if (IS_ERR(mr)) {
1634 			err = PTR_ERR(mr);
1635 			mr = to_mmr(ib_mr);
1636 			goto err;
1637 		}
1638 	} else {
1639 		/*
1640 		 * Send a UMR WQE
1641 		 */
1642 		mr->ibmr.pd = pd;
1643 		mr->access_flags = access_flags;
1644 		mr->mmkey.iova = addr;
1645 		mr->mmkey.size = len;
1646 		mr->mmkey.pd = to_mpd(pd)->pdn;
1647 
1648 		if (flags & IB_MR_REREG_TRANS) {
1649 			upd_flags = MLX5_IB_UPD_XLT_ADDR;
1650 			if (flags & IB_MR_REREG_PD)
1651 				upd_flags |= MLX5_IB_UPD_XLT_PD;
1652 			if (flags & IB_MR_REREG_ACCESS)
1653 				upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
1654 			err = mlx5_ib_update_xlt(
1655 				mr, 0,
1656 				ib_umem_num_dma_blocks(mr->umem,
1657 						       1UL << mr->page_shift),
1658 				mr->page_shift, upd_flags);
1659 		} else {
1660 			err = rereg_umr(pd, mr, access_flags, flags);
1661 		}
1662 
1663 		if (err)
1664 			goto err;
1665 	}
1666 
1667 	set_mr_fields(dev, mr, len, access_flags);
1668 
1669 	return 0;
1670 
1671 err:
1672 	ib_umem_release(mr->umem);
1673 	mr->umem = NULL;
1674 
1675 	clean_mr(dev, mr);
1676 	return err;
1677 }
1678 
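/*
 * Allocate and DMA-map the descriptor list used by registration work
 * requests. The buffer is over-allocated so that the descriptors themselves
 * can be aligned to MLX5_UMR_ALIGN.
 */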
1679 static int
1680 mlx5_alloc_priv_descs(struct ib_device *device,
1681 		      struct mlx5_ib_mr *mr,
1682 		      int ndescs,
1683 		      int desc_size)
1684 {
1685 	int size = ndescs * desc_size;
1686 	int add_size;
1687 	int ret;
1688 
1689 	add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
1690 
1691 	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1692 	if (!mr->descs_alloc)
1693 		return -ENOMEM;
1694 
1695 	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1696 
1697 	mr->desc_map = dma_map_single(device->dev.parent, mr->descs,
1698 				      size, DMA_TO_DEVICE);
1699 	if (dma_mapping_error(device->dev.parent, mr->desc_map)) {
1700 		ret = -ENOMEM;
1701 		goto err;
1702 	}
1703 
1704 	return 0;
1705 err:
1706 	kfree(mr->descs_alloc);
1707 
1708 	return ret;
1709 }
1710 
1711 static void
1712 mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
1713 {
1714 	if (mr->descs) {
1715 		struct ib_device *device = mr->ibmr.device;
1716 		int size = mr->max_descs * mr->desc_size;
1717 
1718 		dma_unmap_single(device->dev.parent, mr->desc_map,
1719 				 size, DMA_TO_DEVICE);
1720 		kfree(mr->descs_alloc);
1721 		mr->descs = NULL;
1722 	}
1723 }
1724 
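/*
 * Release the resources attached to the MR (signature PSVs and, for
 * non-cache MRs, the mkey and its descriptor list) without freeing the umem
 * or the struct mlx5_ib_mr itself.
 */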
1725 static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
1726 {
1727 	if (mr->sig) {
1728 		if (mlx5_core_destroy_psv(dev->mdev,
1729 					  mr->sig->psv_memory.psv_idx))
1730 			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1731 				     mr->sig->psv_memory.psv_idx);
1732 		if (mlx5_core_destroy_psv(dev->mdev,
1733 					  mr->sig->psv_wire.psv_idx))
1734 			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1735 				     mr->sig->psv_wire.psv_idx);
1736 		xa_erase(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key));
1737 		kfree(mr->sig);
1738 		mr->sig = NULL;
1739 	}
1740 
1741 	if (!mr->cache_ent) {
1742 		destroy_mkey(dev, mr);
1743 		mlx5_free_priv_descs(mr);
1744 	}
1745 }
1746 
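/*
 * Fully destroy an MR: fence any DMA still targeting it, release the umem
 * and either return the mkey to the cache or free it.
 */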
1747 static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
1748 {
1749 	struct ib_umem *umem = mr->umem;
1750 
1751 	/* Stop all DMA */
1752 	if (is_odp_mr(mr))
1753 		mlx5_ib_fence_odp_mr(mr);
1754 	else
1755 		clean_mr(dev, mr);
1756 
1757 	if (umem) {
1758 		if (!is_odp_mr(mr))
1759 			atomic_sub(ib_umem_num_pages(umem),
1760 				   &dev->mdev->priv.reg_pages);
1761 		ib_umem_release(umem);
1762 	}
1763 
1764 	if (mr->cache_ent)
1765 		mlx5_mr_cache_free(dev, mr);
1766 	else
1767 		kfree(mr);
1768 }
1769 
1770 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
1771 {
1772 	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
1773 
1774 	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
1775 		dereg_mr(to_mdev(mmr->mtt_mr->ibmr.device), mmr->mtt_mr);
1776 		dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr);
1777 	}
1778 
1779 	if (is_odp_mr(mmr) && to_ib_umem_odp(mmr->umem)->is_implicit_odp) {
1780 		mlx5_ib_free_implicit_mr(mmr);
1781 		return 0;
1782 	}
1783 
1784 	dereg_mr(to_mdev(ibmr->device), mmr);
1785 
1786 	return 0;
1787 }
1788 
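/*
 * Initialize an mkey context for a UMR-manageable mkey that starts out in
 * the free state; registration work requests enable it when the MR is bound
 * to memory.
 */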
1789 static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
1790 				   int access_mode, int page_shift)
1791 {
1792 	void *mkc;
1793 
1794 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1795 
1796 	/* This is only used from the kernel, so setting the PD is OK. */
1797 	set_mkc_access_pd_addr_fields(mkc, 0, 0, pd);
1798 	MLX5_SET(mkc, mkc, free, 1);
1799 	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
1800 	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
1801 	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
1802 	MLX5_SET(mkc, mkc, umr_en, 1);
1803 	MLX5_SET(mkc, mkc, log_page_size, page_shift);
1804 }
1805 
1806 static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1807 				  int ndescs, int desc_size, int page_shift,
1808 				  int access_mode, u32 *in, int inlen)
1809 {
1810 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1811 	int err;
1812 
1813 	mr->access_mode = access_mode;
1814 	mr->desc_size = desc_size;
1815 	mr->max_descs = ndescs;
1816 
1817 	err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
1818 	if (err)
1819 		return err;
1820 
1821 	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
1822 
1823 	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1824 	if (err)
1825 		goto err_free_descs;
1826 
1827 	mr->mmkey.type = MLX5_MKEY_MR;
1828 	mr->ibmr.lkey = mr->mmkey.key;
1829 	mr->ibmr.rkey = mr->mmkey.key;
1830 
1831 	return 0;
1832 
1833 err_free_descs:
1834 	mlx5_free_priv_descs(mr);
1835 	return err;
1836 }
1837 
1838 static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
1839 				u32 max_num_sg, u32 max_num_meta_sg,
1840 				int desc_size, int access_mode)
1841 {
1842 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1843 	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
1844 	int page_shift = 0;
1845 	struct mlx5_ib_mr *mr;
1846 	u32 *in;
1847 	int err;
1848 
1849 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1850 	if (!mr)
1851 		return ERR_PTR(-ENOMEM);
1852 
1853 	mr->ibmr.pd = pd;
1854 	mr->ibmr.device = pd->device;
1855 
1856 	in = kzalloc(inlen, GFP_KERNEL);
1857 	if (!in) {
1858 		err = -ENOMEM;
1859 		goto err_free;
1860 	}
1861 
1862 	if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
1863 		page_shift = PAGE_SHIFT;
1864 
1865 	err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
1866 				     access_mode, in, inlen);
1867 	if (err)
1868 		goto err_free_in;
1869 
1870 	mr->umem = NULL;
1871 	kfree(in);
1872 
1873 	return mr;
1874 
1875 err_free_in:
1876 	kfree(in);
1877 err_free:
1878 	kfree(mr);
1879 	return ERR_PTR(err);
1880 }
1881 
1882 static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1883 				    int ndescs, u32 *in, int inlen)
1884 {
1885 	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
1886 				      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
1887 				      inlen);
1888 }
1889 
1890 static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1891 				    int ndescs, u32 *in, int inlen)
1892 {
1893 	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
1894 				      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
1895 }
1896 
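/*
 * Set up an IB_MR_TYPE_INTEGRITY MR: allocate the signature context and the
 * memory/wire PSVs, create the internal KLM and MTT MRs used to map the
 * data/metadata buffers, and create the BSF-enabled parent mkey, which is
 * tracked in dev->sig_mrs.
 */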
1897 static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1898 				      int max_num_sg, int max_num_meta_sg,
1899 				      u32 *in, int inlen)
1900 {
1901 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1902 	u32 psv_index[2];
1903 	void *mkc;
1904 	int err;
1905 
1906 	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
1907 	if (!mr->sig)
1908 		return -ENOMEM;
1909 
1910 	/* create mem & wire PSVs */
1911 	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
1912 	if (err)
1913 		goto err_free_sig;
1914 
1915 	mr->sig->psv_memory.psv_idx = psv_index[0];
1916 	mr->sig->psv_wire.psv_idx = psv_index[1];
1917 
1918 	mr->sig->sig_status_checked = true;
1919 	mr->sig->sig_err_exists = false;
1920 	/* Arm SIGERR on the next UMR */
1921 	++mr->sig->sigerr_count;
1922 	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
1923 					 sizeof(struct mlx5_klm),
1924 					 MLX5_MKC_ACCESS_MODE_KLMS);
1925 	if (IS_ERR(mr->klm_mr)) {
1926 		err = PTR_ERR(mr->klm_mr);
1927 		goto err_destroy_psv;
1928 	}
1929 	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
1930 					 sizeof(struct mlx5_mtt),
1931 					 MLX5_MKC_ACCESS_MODE_MTT);
1932 	if (IS_ERR(mr->mtt_mr)) {
1933 		err = PTR_ERR(mr->mtt_mr);
1934 		goto err_free_klm_mr;
1935 	}
1936 
1937 	/* Enable BSF for the mkey and set the BSF descriptor size */
1938 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1939 	MLX5_SET(mkc, mkc, bsf_en, 1);
1940 	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
1941 
1942 	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
1943 				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
1944 	if (err)
1945 		goto err_free_mtt_mr;
1946 
1947 	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
1948 			      mr->sig, GFP_KERNEL));
1949 	if (err)
1950 		goto err_free_descs;
1951 	return 0;
1952 
1953 err_free_descs:
1954 	destroy_mkey(dev, mr);
1955 	mlx5_free_priv_descs(mr);
1956 err_free_mtt_mr:
1957 	dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), mr->mtt_mr);
1958 	mr->mtt_mr = NULL;
1959 err_free_klm_mr:
1960 	dereg_mr(to_mdev(mr->klm_mr->ibmr.device), mr->klm_mr);
1961 	mr->klm_mr = NULL;
1962 err_destroy_psv:
1963 	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
1964 		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1965 			     mr->sig->psv_memory.psv_idx);
1966 	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
1967 		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1968 			     mr->sig->psv_wire.psv_idx);
1969 err_free_sig:
1970 	kfree(mr->sig);
1971 
1972 	return err;
1973 }
1974 
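/*
 * Common implementation for mlx5_ib_alloc_mr() and
 * mlx5_ib_alloc_mr_integrity(): dispatch on @mr_type to build the matching
 * descriptor layout (MTT for MEM_REG, KLM for SG_GAPS, and a signature
 * context plus internal MRs for INTEGRITY).
 */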
1975 static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
1976 					enum ib_mr_type mr_type, u32 max_num_sg,
1977 					u32 max_num_meta_sg)
1978 {
1979 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1980 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1981 	int ndescs = ALIGN(max_num_sg, 4);
1982 	struct mlx5_ib_mr *mr;
1983 	u32 *in;
1984 	int err;
1985 
1986 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1987 	if (!mr)
1988 		return ERR_PTR(-ENOMEM);
1989 
1990 	in = kzalloc(inlen, GFP_KERNEL);
1991 	if (!in) {
1992 		err = -ENOMEM;
1993 		goto err_free;
1994 	}
1995 
1996 	mr->ibmr.device = pd->device;
1997 	mr->umem = NULL;
1998 
1999 	switch (mr_type) {
2000 	case IB_MR_TYPE_MEM_REG:
2001 		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
2002 		break;
2003 	case IB_MR_TYPE_SG_GAPS:
2004 		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
2005 		break;
2006 	case IB_MR_TYPE_INTEGRITY:
2007 		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
2008 						 max_num_meta_sg, in, inlen);
2009 		break;
2010 	default:
2011 		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
2012 		err = -EINVAL;
2013 	}
2014 
2015 	if (err)
2016 		goto err_free_in;
2017 
2018 	kfree(in);
2019 
2020 	return &mr->ibmr;
2021 
2022 err_free_in:
2023 	kfree(in);
2024 err_free:
2025 	kfree(mr);
2026 	return ERR_PTR(err);
2027 }
2028 
2029 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
2030 			       u32 max_num_sg)
2031 {
2032 	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
2033 }
2034 
2035 struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
2036 					 u32 max_num_sg, u32 max_num_meta_sg)
2037 {
2038 	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
2039 				  max_num_meta_sg);
2040 }
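/*
 * For reference, and not specific to mlx5: a kernel ULP typically reaches
 * these entry points through the core verbs, i.e. ib_alloc_mr() or
 * ib_alloc_mr_integrity() for allocation, ib_map_mr_sg() or
 * ib_map_mr_sg_pi() for mapping, and, for integrity MRs,
 * ib_check_mr_status() with IB_MR_CHECK_SIG_STATUS after I/O completion to
 * collect signature errors.
 */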
2041 
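/*
 * Allocate a memory window: validate the user's request, create a free
 * KLM-mode, UMR-enabled mkey bound to the MW's PD, and (when ODP is
 * enabled) publish the mkey in dev->odp_mkeys so page faults can find it.
 */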
2042 int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
2043 {
2044 	struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
2045 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2046 	struct mlx5_ib_mw *mw = to_mmw(ibmw);
2047 	u32 *in = NULL;
2048 	void *mkc;
2049 	int ndescs;
2050 	int err;
2051 	struct mlx5_ib_alloc_mw req = {};
2052 	struct {
2053 		__u32	comp_mask;
2054 		__u32	response_length;
2055 	} resp = {};
2056 
2057 	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
2058 	if (err)
2059 		return err;
2060 
2061 	if (req.comp_mask || req.reserved1 || req.reserved2)
2062 		return -EOPNOTSUPP;
2063 
2064 	if (udata->inlen > sizeof(req) &&
2065 	    !ib_is_udata_cleared(udata, sizeof(req),
2066 				 udata->inlen - sizeof(req)))
2067 		return -EOPNOTSUPP;
2068 
2069 	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
2070 
2071 	in = kzalloc(inlen, GFP_KERNEL);
2072 	if (!in) {
2073 		err = -ENOMEM;
2074 		goto free;
2075 	}
2076 
2077 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2078 
2079 	MLX5_SET(mkc, mkc, free, 1);
2080 	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2081 	MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
2082 	MLX5_SET(mkc, mkc, umr_en, 1);
2083 	MLX5_SET(mkc, mkc, lr, 1);
2084 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
2085 	MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
2086 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
2087 
2088 	err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
2089 	if (err)
2090 		goto free;
2091 
2092 	mw->mmkey.type = MLX5_MKEY_MW;
2093 	ibmw->rkey = mw->mmkey.key;
2094 	mw->ndescs = ndescs;
2095 
2096 	resp.response_length =
2097 		min(offsetofend(typeof(resp), response_length), udata->outlen);
2098 	if (resp.response_length) {
2099 		err = ib_copy_to_udata(udata, &resp, resp.response_length);
2100 		if (err)
2101 			goto free_mkey;
2102 	}
2103 
2104 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
2105 		err = xa_err(xa_store(&dev->odp_mkeys,
2106 				      mlx5_base_mkey(mw->mmkey.key), &mw->mmkey,
2107 				      GFP_KERNEL));
2108 		if (err)
2109 			goto free_mkey;
2110 	}
2111 
2112 	kfree(in);
2113 	return 0;
2114 
2115 free_mkey:
2116 	mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
2117 free:
2118 	kfree(in);
2119 	return err;
2120 }
2121 
2122 int mlx5_ib_dealloc_mw(struct ib_mw *mw)
2123 {
2124 	struct mlx5_ib_dev *dev = to_mdev(mw->device);
2125 	struct mlx5_ib_mw *mmw = to_mmw(mw);
2126 
2127 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
2128 		xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key));
2129 		/*
2130 		 * pagefault_single_data_segment() may be accessing mmw under
2131 		 * SRCU if the user bound an ODP MR to this MW.
2132 		 */
2133 		synchronize_srcu(&dev->odp_srcu);
2134 	}
2135 
2136 	return mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
2137 }
2138 
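/*
 * Report (and clear) a pending signature error on an integrity-enabled MR.
 * Only IB_MR_CHECK_SIG_STATUS is supported in @check_mask.
 */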
2139 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
2140 			    struct ib_mr_status *mr_status)
2141 {
2142 	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
2143 	int ret = 0;
2144 
2145 	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
2146 		pr_err("Invalid status check mask\n");
2147 		ret = -EINVAL;
2148 		goto done;
2149 	}
2150 
2151 	mr_status->fail_status = 0;
2152 	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
2153 		if (!mmr->sig) {
2154 			ret = -EINVAL;
2155 			pr_err("signature status check requested on a non-signature enabled MR\n");
2156 			goto done;
2157 		}
2158 
2159 		mmr->sig->sig_status_checked = true;
2160 		if (!mmr->sig->sig_err_exists)
2161 			goto done;
2162 
2163 		if (ibmr->lkey == mmr->sig->err_item.key)
2164 			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
2165 			       sizeof(mr_status->sig_err));
2166 		else {
2167 			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
2168 			mr_status->sig_err.sig_err_offset = 0;
2169 			mr_status->sig_err.key = mmr->sig->err_item.key;
2170 		}
2171 
2172 		mmr->sig->sig_err_exists = false;
2173 		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
2174 	}
2175 
2176 done:
2177 	return ret;
2178 }
2179 
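/*
 * Fast path for map_mr_sg_pi: when the data scatterlist (and the metadata
 * scatterlist, if any) has a single DMA-mapped entry, describe it directly
 * with its physical address so no UMR of an internal MR is needed. Returns
 * the number of entries that could be handled this way.
 */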
2180 static int
2181 mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2182 			int data_sg_nents, unsigned int *data_sg_offset,
2183 			struct scatterlist *meta_sg, int meta_sg_nents,
2184 			unsigned int *meta_sg_offset)
2185 {
2186 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2187 	unsigned int sg_offset = 0;
2188 	int n = 0;
2189 
2190 	mr->meta_length = 0;
2191 	if (data_sg_nents == 1) {
2192 		n++;
2193 		mr->ndescs = 1;
2194 		if (data_sg_offset)
2195 			sg_offset = *data_sg_offset;
2196 		mr->data_length = sg_dma_len(data_sg) - sg_offset;
2197 		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
2198 		if (meta_sg_nents == 1) {
2199 			n++;
2200 			mr->meta_ndescs = 1;
2201 			if (meta_sg_offset)
2202 				sg_offset = *meta_sg_offset;
2203 			else
2204 				sg_offset = 0;
2205 			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
2206 			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
2207 		}
2208 		ibmr->length = mr->data_length + mr->meta_length;
2209 	}
2210 
2211 	return n;
2212 }
2213 
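/*
 * Translate the data (and optional metadata) scatterlists into KLM
 * descriptors referencing the PD's local_dma_lkey, updating the MR's iova,
 * lengths and descriptor counts. Returns the number of descriptors consumed.
 */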
2214 static int
2215 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
2216 		   struct scatterlist *sgl,
2217 		   unsigned short sg_nents,
2218 		   unsigned int *sg_offset_p,
2219 		   struct scatterlist *meta_sgl,
2220 		   unsigned short meta_sg_nents,
2221 		   unsigned int *meta_sg_offset_p)
2222 {
2223 	struct scatterlist *sg = sgl;
2224 	struct mlx5_klm *klms = mr->descs;
2225 	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
2226 	u32 lkey = mr->ibmr.pd->local_dma_lkey;
2227 	int i, j = 0;
2228 
2229 	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
2230 	mr->ibmr.length = 0;
2231 
2232 	for_each_sg(sgl, sg, sg_nents, i) {
2233 		if (unlikely(i >= mr->max_descs))
2234 			break;
2235 		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
2236 		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
2237 		klms[i].key = cpu_to_be32(lkey);
2238 		mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2239 
2240 		sg_offset = 0;
2241 	}
2242 
2243 	if (sg_offset_p)
2244 		*sg_offset_p = sg_offset;
2245 
2246 	mr->ndescs = i;
2247 	mr->data_length = mr->ibmr.length;
2248 
2249 	if (meta_sg_nents) {
2250 		sg = meta_sgl;
2251 		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
2252 		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
2253 			if (unlikely(i + j >= mr->max_descs))
2254 				break;
2255 			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
2256 						     sg_offset);
2257 			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
2258 							 sg_offset);
2259 			klms[i + j].key = cpu_to_be32(lkey);
2260 			mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2261 
2262 			sg_offset = 0;
2263 		}
2264 		if (meta_sg_offset_p)
2265 			*meta_sg_offset_p = sg_offset;
2266 
2267 		mr->meta_ndescs = j;
2268 		mr->meta_length = mr->ibmr.length - mr->data_length;
2269 	}
2270 
2271 	return i + j;
2272 }
2273 
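/*
 * ib_sg_to_pages() callbacks: store one page address per MTT descriptor,
 * with the read/write enable bits set. mlx5_set_page_pi() appends after the
 * already mapped data descriptors when adding metadata pages.
 */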
2274 static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
2275 {
2276 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2277 	__be64 *descs;
2278 
2279 	if (unlikely(mr->ndescs == mr->max_descs))
2280 		return -ENOMEM;
2281 
2282 	descs = mr->descs;
2283 	descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2284 
2285 	return 0;
2286 }
2287 
2288 static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
2289 {
2290 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2291 	__be64 *descs;
2292 
2293 	if (unlikely(mr->ndescs + mr->meta_ndescs == mr->max_descs))
2294 		return -ENOMEM;
2295 
2296 	descs = mr->descs;
2297 	descs[mr->ndescs + mr->meta_ndescs++] =
2298 		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2299 
2300 	return 0;
2301 }
2302 
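/*
 * Map the data and metadata scatterlists with the internal MTT MR: the
 * metadata pages are placed right after the data pages in the MR's address
 * space, and pi_iova records where the metadata starts within that mapping.
 */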
2303 static int
2304 mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2305 			 int data_sg_nents, unsigned int *data_sg_offset,
2306 			 struct scatterlist *meta_sg, int meta_sg_nents,
2307 			 unsigned int *meta_sg_offset)
2308 {
2309 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2310 	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
2311 	int n;
2312 
2313 	pi_mr->ndescs = 0;
2314 	pi_mr->meta_ndescs = 0;
2315 	pi_mr->meta_length = 0;
2316 
2317 	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2318 				   pi_mr->desc_size * pi_mr->max_descs,
2319 				   DMA_TO_DEVICE);
2320 
2321 	pi_mr->ibmr.page_size = ibmr->page_size;
2322 	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
2323 			   mlx5_set_page);
2324 	if (n != data_sg_nents)
2325 		return n;
2326 
2327 	pi_mr->data_iova = pi_mr->ibmr.iova;
2328 	pi_mr->data_length = pi_mr->ibmr.length;
2329 	pi_mr->ibmr.length = pi_mr->data_length;
2330 	ibmr->length = pi_mr->data_length;
2331 
2332 	if (meta_sg_nents) {
2333 		u64 page_mask = ~((u64)ibmr->page_size - 1);
2334 		u64 iova = pi_mr->data_iova;
2335 
2336 		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
2337 				    meta_sg_offset, mlx5_set_page_pi);
2338 
2339 		pi_mr->meta_length = pi_mr->ibmr.length;
2340 		/*
2341 		 * The PI address programmed to the HW is the metadata address
2342 		 * as mapped by this MTT MR: the page-aligned first data page
2343 		 * address + the size of the data pages + the metadata offset
2344 		 * within the first metadata page.
2345 		 */
2346 		pi_mr->pi_iova = (iova & page_mask) +
2347 				 pi_mr->ndescs * ibmr->page_size +
2348 				 (pi_mr->ibmr.iova & ~page_mask);
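		/*
		 * Worked example with hypothetical values: a 4KiB page size,
		 * data at iova 0x10000 covered by 3 descriptors, and metadata
		 * starting at offset 0x200 within its first page give
		 * pi_iova = 0x10000 + 3 * 0x1000 + 0x200 = 0x13200.
		 */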
2349 		/*
2350 		 * In order to use a single MTT MR for both data and metadata,
2351 		 * we also register the gap between the end of the data and the
2352 		 * start of the metadata (the sig MR verifies that the HW only
2353 		 * accesses the right addresses). This mapping is safe because
2354 		 * an internal mkey is used for the registration.
2355 		 */
2356 		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
2357 		pi_mr->ibmr.iova = iova;
2358 		ibmr->length += pi_mr->meta_length;
2359 	}
2360 
2361 	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2362 				      pi_mr->desc_size * pi_mr->max_descs,
2363 				      DMA_TO_DEVICE);
2364 
2365 	return n;
2366 }
2367 
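/*
 * Map the data and metadata scatterlists with the internal KLM MR. KLM
 * descriptors can describe arbitrary, unaligned buffers, so this is the
 * fallback when an MTT mapping is not possible. The resulting MR is
 * zero-based, with the metadata placed immediately after the data.
 */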
2368 static int
2369 mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2370 			 int data_sg_nents, unsigned int *data_sg_offset,
2371 			 struct scatterlist *meta_sg, int meta_sg_nents,
2372 			 unsigned int *meta_sg_offset)
2373 {
2374 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2375 	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
2376 	int n;
2377 
2378 	pi_mr->ndescs = 0;
2379 	pi_mr->meta_ndescs = 0;
2380 	pi_mr->meta_length = 0;
2381 
2382 	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2383 				   pi_mr->desc_size * pi_mr->max_descs,
2384 				   DMA_TO_DEVICE);
2385 
2386 	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
2387 			       meta_sg, meta_sg_nents, meta_sg_offset);
2388 
2389 	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2390 				      pi_mr->desc_size * pi_mr->max_descs,
2391 				      DMA_TO_DEVICE);
2392 
2393 	/* This is a zero-based memory region */
2394 	pi_mr->data_iova = 0;
2395 	pi_mr->ibmr.iova = 0;
2396 	pi_mr->pi_iova = pi_mr->data_length;
2397 	ibmr->length = pi_mr->ibmr.length;
2398 
2399 	return n;
2400 }
2401 
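/*
 * Map data + protection information for an IB_MR_TYPE_INTEGRITY MR. The
 * mapping is attempted in order of decreasing efficiency: direct PA
 * descriptors, then the internal MTT MR, and finally the internal KLM MR
 * (see the comments in the function body).
 */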
2402 int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2403 			 int data_sg_nents, unsigned int *data_sg_offset,
2404 			 struct scatterlist *meta_sg, int meta_sg_nents,
2405 			 unsigned int *meta_sg_offset)
2406 {
2407 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2408 	struct mlx5_ib_mr *pi_mr = NULL;
2409 	int n;
2410 
2411 	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
2412 
2413 	mr->ndescs = 0;
2414 	mr->data_length = 0;
2415 	mr->data_iova = 0;
2416 	mr->meta_ndescs = 0;
2417 	mr->pi_iova = 0;
2418 	/*
2419 	 * As a performance optimization, avoid a UMR operation to register
2420 	 * the data/metadata buffers when possible: first try to map the sg
2421 	 * lists to PA descriptors with local_dma_lkey, and fall back to UMR
2422 	 * only if that fails.
2423 	 */
2424 	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2425 				    data_sg_offset, meta_sg, meta_sg_nents,
2426 				    meta_sg_offset);
2427 	if (n == data_sg_nents + meta_sg_nents)
2428 		goto out;
2429 	/*
2430 	 * As a performance optimization, avoid KLM descriptors when
2431 	 * possible: first try to map the sg lists to MTT descriptors and
2432 	 * fall back to KLM only if that fails.
2433 	 * The HW handles MTT descriptors more efficiently
2434 	 * (especially under high load).
2435 	 * Use KLM (indirect access) only when it is mandatory.
2436 	 */
2437 	pi_mr = mr->mtt_mr;
2438 	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2439 				     data_sg_offset, meta_sg, meta_sg_nents,
2440 				     meta_sg_offset);
2441 	if (n == data_sg_nents + meta_sg_nents)
2442 		goto out;
2443 
2444 	pi_mr = mr->klm_mr;
2445 	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2446 				     data_sg_offset, meta_sg, meta_sg_nents,
2447 				     meta_sg_offset);
2448 	if (unlikely(n != data_sg_nents + meta_sg_nents))
2449 		return -ENOMEM;
2450 
2451 out:
2452 	/* This is a zero-based memory region */
2453 	ibmr->iova = 0;
2454 	mr->pi_mr = pi_mr;
2455 	if (pi_mr)
2456 		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
2457 	else
2458 		ibmr->sig_attrs->meta_length = mr->meta_length;
2459 
2460 	return 0;
2461 }
2462 
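/*
 * Standard ib_map_mr_sg() implementation: translate the scatterlist into
 * KLM or MTT descriptors according to the MR's access mode, syncing the
 * descriptor buffer around the CPU update.
 */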
2463 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
2464 		      unsigned int *sg_offset)
2465 {
2466 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2467 	int n;
2468 
2469 	mr->ndescs = 0;
2470 
2471 	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
2472 				   mr->desc_size * mr->max_descs,
2473 				   DMA_TO_DEVICE);
2474 
2475 	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
2476 		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
2477 				       NULL);
2478 	else
2479 		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
2480 				mlx5_set_page);
2481 
2482 	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
2483 				      mr->desc_size * mr->max_descs,
2484 				      DMA_TO_DEVICE);
2485 
2486 	return n;
2487 }
2488