xref: /openbmc/linux/drivers/infiniband/hw/mlx5/mr.c (revision 0edbfea5)
1 /*
2  * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 
33 
34 #include <linux/kref.h>
35 #include <linux/random.h>
36 #include <linux/debugfs.h>
37 #include <linux/export.h>
38 #include <linux/delay.h>
39 #include <rdma/ib_umem.h>
40 #include <rdma/ib_umem_odp.h>
41 #include <rdma/ib_verbs.h>
42 #include "mlx5_ib.h"
43 #include "user.h"
44 
/*
 * Ceiling on asynchronous mkey creations that may be outstanding for one
 * cache entry; add_keys() returns -EAGAIN once it is reached.
 */
enum {
	MAX_PENDING_REG_MR = 8,
};
48 
/* Byte alignment applied to buffers that are handed to UMR operations. */
#define MLX5_UMR_ALIGN 2048
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
/*
 * Static fallback buffer that mlx5_ib_update_mtt() switches to when it cannot
 * allocate a page. The mutex below is held for as long as the buffer is used.
 */
static __be64 mlx5_ib_update_mtt_emergency_buffer[
		MLX5_UMR_MTT_MIN_CHUNK_SIZE / sizeof(__be64)]
	__aligned(MLX5_UMR_ALIGN);
static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex);
#endif
56 
57 static int clean_mr(struct mlx5_ib_mr *mr);
58 
59 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
60 {
61 	int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
62 
63 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
64 	/* Wait until all page fault handlers using the mr complete. */
65 	synchronize_srcu(&dev->mr_srcu);
66 #endif
67 
68 	return err;
69 }
70 
71 static int order2idx(struct mlx5_ib_dev *dev, int order)
72 {
73 	struct mlx5_mr_cache *cache = &dev->cache;
74 
75 	if (order < cache->ent[0].order)
76 		return 0;
77 	else
78 		return order - cache->ent[0].order;
79 }
80 
81 static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length)
82 {
83 	return ((u64)1 << mr->order) * MLX5_ADAPTER_PAGE_SIZE >=
84 		length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1));
85 }
86 
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
/*
 * Publish @mr through umem->odp_data->private when the umem carries ODP data.
 * Nothing is done for a umem without odp_data.
 */
static void update_odp_mr(struct mlx5_ib_mr *mr)
{
	if (!mr->umem->odp_data)
		return;

	/*
	 * Keep the store to umem->odp_data->private from being moved ahead
	 * of the completion of reg_umr, so that MR initialization is done
	 * before invalidations start being handled for it.
	 */
	smp_wmb();
	mr->umem->odp_data->private = mr;
	/*
	 * The invalidation routines must observe the new
	 * umem->odp_data->private value before page faults can arrive on the
	 * MR, which becomes possible once the MR is put in the tree after
	 * this function. Without this barrier a fault could be handled and
	 * an invalidation could run before umem->odp_data->private == mr is
	 * visible to the invalidation handler.
	 */
	smp_wmb();
}
#endif
114 
115 static void reg_mr_callback(int status, void *context)
116 {
117 	struct mlx5_ib_mr *mr = context;
118 	struct mlx5_ib_dev *dev = mr->dev;
119 	struct mlx5_mr_cache *cache = &dev->cache;
120 	int c = order2idx(dev, mr->order);
121 	struct mlx5_cache_ent *ent = &cache->ent[c];
122 	u8 key;
123 	unsigned long flags;
124 	struct mlx5_mkey_table *table = &dev->mdev->priv.mkey_table;
125 	int err;
126 
127 	spin_lock_irqsave(&ent->lock, flags);
128 	ent->pending--;
129 	spin_unlock_irqrestore(&ent->lock, flags);
130 	if (status) {
131 		mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
132 		kfree(mr);
133 		dev->fill_delay = 1;
134 		mod_timer(&dev->delay_timer, jiffies + HZ);
135 		return;
136 	}
137 
138 	if (mr->out.hdr.status) {
139 		mlx5_ib_warn(dev, "failed - status %d, syndorme 0x%x\n",
140 			     mr->out.hdr.status,
141 			     be32_to_cpu(mr->out.hdr.syndrome));
142 		kfree(mr);
143 		dev->fill_delay = 1;
144 		mod_timer(&dev->delay_timer, jiffies + HZ);
145 		return;
146 	}
147 
148 	spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags);
149 	key = dev->mdev->priv.mkey_key++;
150 	spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags);
151 	mr->mmkey.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key;
152 
153 	cache->last_add = jiffies;
154 
155 	spin_lock_irqsave(&ent->lock, flags);
156 	list_add_tail(&mr->list, &ent->head);
157 	ent->cur++;
158 	ent->size++;
159 	spin_unlock_irqrestore(&ent->lock, flags);
160 
161 	write_lock_irqsave(&table->lock, flags);
162 	err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmkey.key),
163 				&mr->mmkey);
164 	if (err)
165 		pr_err("Error inserting to mkey tree. 0x%x\n", -err);
166 	write_unlock_irqrestore(&table->lock, flags);
167 }
168 
169 static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
170 {
171 	struct mlx5_mr_cache *cache = &dev->cache;
172 	struct mlx5_cache_ent *ent = &cache->ent[c];
173 	struct mlx5_create_mkey_mbox_in *in;
174 	struct mlx5_ib_mr *mr;
175 	int npages = 1 << ent->order;
176 	int err = 0;
177 	int i;
178 
179 	in = kzalloc(sizeof(*in), GFP_KERNEL);
180 	if (!in)
181 		return -ENOMEM;
182 
183 	for (i = 0; i < num; i++) {
184 		if (ent->pending >= MAX_PENDING_REG_MR) {
185 			err = -EAGAIN;
186 			break;
187 		}
188 
189 		mr = kzalloc(sizeof(*mr), GFP_KERNEL);
190 		if (!mr) {
191 			err = -ENOMEM;
192 			break;
193 		}
194 		mr->order = ent->order;
195 		mr->umred = 1;
196 		mr->dev = dev;
197 		in->seg.status = MLX5_MKEY_STATUS_FREE;
198 		in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2);
199 		in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
200 		in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN;
201 		in->seg.log2_page_size = 12;
202 
203 		spin_lock_irq(&ent->lock);
204 		ent->pending++;
205 		spin_unlock_irq(&ent->lock);
206 		err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in,
207 					    sizeof(*in), reg_mr_callback,
208 					    mr, &mr->out);
209 		if (err) {
210 			spin_lock_irq(&ent->lock);
211 			ent->pending--;
212 			spin_unlock_irq(&ent->lock);
213 			mlx5_ib_warn(dev, "create mkey failed %d\n", err);
214 			kfree(mr);
215 			break;
216 		}
217 	}
218 
219 	kfree(in);
220 	return err;
221 }
222 
223 static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
224 {
225 	struct mlx5_mr_cache *cache = &dev->cache;
226 	struct mlx5_cache_ent *ent = &cache->ent[c];
227 	struct mlx5_ib_mr *mr;
228 	int err;
229 	int i;
230 
231 	for (i = 0; i < num; i++) {
232 		spin_lock_irq(&ent->lock);
233 		if (list_empty(&ent->head)) {
234 			spin_unlock_irq(&ent->lock);
235 			return;
236 		}
237 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
238 		list_del(&mr->list);
239 		ent->cur--;
240 		ent->size--;
241 		spin_unlock_irq(&ent->lock);
242 		err = destroy_mkey(dev, mr);
243 		if (err)
244 			mlx5_ib_warn(dev, "failed destroy mkey\n");
245 		else
246 			kfree(mr);
247 	}
248 }
249 
250 static ssize_t size_write(struct file *filp, const char __user *buf,
251 			  size_t count, loff_t *pos)
252 {
253 	struct mlx5_cache_ent *ent = filp->private_data;
254 	struct mlx5_ib_dev *dev = ent->dev;
255 	char lbuf[20];
256 	u32 var;
257 	int err;
258 	int c;
259 
260 	if (copy_from_user(lbuf, buf, sizeof(lbuf)))
261 		return -EFAULT;
262 
263 	c = order2idx(dev, ent->order);
264 	lbuf[sizeof(lbuf) - 1] = 0;
265 
266 	if (sscanf(lbuf, "%u", &var) != 1)
267 		return -EINVAL;
268 
269 	if (var < ent->limit)
270 		return -EINVAL;
271 
272 	if (var > ent->size) {
273 		do {
274 			err = add_keys(dev, c, var - ent->size);
275 			if (err && err != -EAGAIN)
276 				return err;
277 
278 			usleep_range(3000, 5000);
279 		} while (err);
280 	} else if (var < ent->size) {
281 		remove_keys(dev, c, ent->size - var);
282 	}
283 
284 	return count;
285 }
286 
287 static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
288 			 loff_t *pos)
289 {
290 	struct mlx5_cache_ent *ent = filp->private_data;
291 	char lbuf[20];
292 	int err;
293 
294 	if (*pos)
295 		return 0;
296 
297 	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size);
298 	if (err < 0)
299 		return err;
300 
301 	if (copy_to_user(buf, lbuf, err))
302 		return -EFAULT;
303 
304 	*pos += err;
305 
306 	return err;
307 }
308 
309 static const struct file_operations size_fops = {
310 	.owner	= THIS_MODULE,
311 	.open	= simple_open,
312 	.write	= size_write,
313 	.read	= size_read,
314 };
315 
316 static ssize_t limit_write(struct file *filp, const char __user *buf,
317 			   size_t count, loff_t *pos)
318 {
319 	struct mlx5_cache_ent *ent = filp->private_data;
320 	struct mlx5_ib_dev *dev = ent->dev;
321 	char lbuf[20];
322 	u32 var;
323 	int err;
324 	int c;
325 
326 	if (copy_from_user(lbuf, buf, sizeof(lbuf)))
327 		return -EFAULT;
328 
329 	c = order2idx(dev, ent->order);
330 	lbuf[sizeof(lbuf) - 1] = 0;
331 
332 	if (sscanf(lbuf, "%u", &var) != 1)
333 		return -EINVAL;
334 
335 	if (var > ent->size)
336 		return -EINVAL;
337 
338 	ent->limit = var;
339 
340 	if (ent->cur < ent->limit) {
341 		err = add_keys(dev, c, 2 * ent->limit - ent->cur);
342 		if (err)
343 			return err;
344 	}
345 
346 	return count;
347 }
348 
349 static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
350 			  loff_t *pos)
351 {
352 	struct mlx5_cache_ent *ent = filp->private_data;
353 	char lbuf[20];
354 	int err;
355 
356 	if (*pos)
357 		return 0;
358 
359 	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
360 	if (err < 0)
361 		return err;
362 
363 	if (copy_to_user(buf, lbuf, err))
364 		return -EFAULT;
365 
366 	*pos += err;
367 
368 	return err;
369 }
370 
371 static const struct file_operations limit_fops = {
372 	.owner	= THIS_MODULE,
373 	.open	= simple_open,
374 	.write	= limit_write,
375 	.read	= limit_read,
376 };
377 
378 static int someone_adding(struct mlx5_mr_cache *cache)
379 {
380 	int i;
381 
382 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
383 		if (cache->ent[i].cur < cache->ent[i].limit)
384 			return 1;
385 	}
386 
387 	return 0;
388 }
389 
390 static void __cache_work_func(struct mlx5_cache_ent *ent)
391 {
392 	struct mlx5_ib_dev *dev = ent->dev;
393 	struct mlx5_mr_cache *cache = &dev->cache;
394 	int i = order2idx(dev, ent->order);
395 	int err;
396 
397 	if (cache->stopped)
398 		return;
399 
400 	ent = &dev->cache.ent[i];
401 	if (ent->cur < 2 * ent->limit && !dev->fill_delay) {
402 		err = add_keys(dev, i, 1);
403 		if (ent->cur < 2 * ent->limit) {
404 			if (err == -EAGAIN) {
405 				mlx5_ib_dbg(dev, "returned eagain, order %d\n",
406 					    i + 2);
407 				queue_delayed_work(cache->wq, &ent->dwork,
408 						   msecs_to_jiffies(3));
409 			} else if (err) {
410 				mlx5_ib_warn(dev, "command failed order %d, err %d\n",
411 					     i + 2, err);
412 				queue_delayed_work(cache->wq, &ent->dwork,
413 						   msecs_to_jiffies(1000));
414 			} else {
415 				queue_work(cache->wq, &ent->work);
416 			}
417 		}
418 	} else if (ent->cur > 2 * ent->limit) {
419 		/*
420 		 * The remove_keys() logic is performed as garbage collection
421 		 * task. Such task is intended to be run when no other active
422 		 * processes are running.
423 		 *
424 		 * The need_resched() will return TRUE if there are user tasks
425 		 * to be activated in near future.
426 		 *
427 		 * In such case, we don't execute remove_keys() and postpone
428 		 * the garbage collection work to try to run in next cycle,
429 		 * in order to free CPU resources to other tasks.
430 		 */
431 		if (!need_resched() && !someone_adding(cache) &&
432 		    time_after(jiffies, cache->last_add + 300 * HZ)) {
433 			remove_keys(dev, i, 1);
434 			if (ent->cur > ent->limit)
435 				queue_work(cache->wq, &ent->work);
436 		} else {
437 			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
438 		}
439 	}
440 }
441 
442 static void delayed_cache_work_func(struct work_struct *work)
443 {
444 	struct mlx5_cache_ent *ent;
445 
446 	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
447 	__cache_work_func(ent);
448 }
449 
450 static void cache_work_func(struct work_struct *work)
451 {
452 	struct mlx5_cache_ent *ent;
453 
454 	ent = container_of(work, struct mlx5_cache_ent, work);
455 	__cache_work_func(ent);
456 }
457 
458 static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
459 {
460 	struct mlx5_mr_cache *cache = &dev->cache;
461 	struct mlx5_ib_mr *mr = NULL;
462 	struct mlx5_cache_ent *ent;
463 	int c;
464 	int i;
465 
466 	c = order2idx(dev, order);
467 	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
468 		mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c);
469 		return NULL;
470 	}
471 
472 	for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) {
473 		ent = &cache->ent[i];
474 
475 		mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
476 
477 		spin_lock_irq(&ent->lock);
478 		if (!list_empty(&ent->head)) {
479 			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
480 					      list);
481 			list_del(&mr->list);
482 			ent->cur--;
483 			spin_unlock_irq(&ent->lock);
484 			if (ent->cur < ent->limit)
485 				queue_work(cache->wq, &ent->work);
486 			break;
487 		}
488 		spin_unlock_irq(&ent->lock);
489 
490 		queue_work(cache->wq, &ent->work);
491 	}
492 
493 	if (!mr)
494 		cache->ent[c].miss++;
495 
496 	return mr;
497 }
498 
499 static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
500 {
501 	struct mlx5_mr_cache *cache = &dev->cache;
502 	struct mlx5_cache_ent *ent;
503 	int shrink = 0;
504 	int c;
505 
506 	c = order2idx(dev, mr->order);
507 	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
508 		mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c);
509 		return;
510 	}
511 	ent = &cache->ent[c];
512 	spin_lock_irq(&ent->lock);
513 	list_add_tail(&mr->list, &ent->head);
514 	ent->cur++;
515 	if (ent->cur > 2 * ent->limit)
516 		shrink = 1;
517 	spin_unlock_irq(&ent->lock);
518 
519 	if (shrink)
520 		queue_work(cache->wq, &ent->work);
521 }
522 
523 static void clean_keys(struct mlx5_ib_dev *dev, int c)
524 {
525 	struct mlx5_mr_cache *cache = &dev->cache;
526 	struct mlx5_cache_ent *ent = &cache->ent[c];
527 	struct mlx5_ib_mr *mr;
528 	int err;
529 
530 	cancel_delayed_work(&ent->dwork);
531 	while (1) {
532 		spin_lock_irq(&ent->lock);
533 		if (list_empty(&ent->head)) {
534 			spin_unlock_irq(&ent->lock);
535 			return;
536 		}
537 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
538 		list_del(&mr->list);
539 		ent->cur--;
540 		ent->size--;
541 		spin_unlock_irq(&ent->lock);
542 		err = destroy_mkey(dev, mr);
543 		if (err)
544 			mlx5_ib_warn(dev, "failed destroy mkey\n");
545 		else
546 			kfree(mr);
547 	}
548 }
549 
550 static int mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
551 {
552 	struct mlx5_mr_cache *cache = &dev->cache;
553 	struct mlx5_cache_ent *ent;
554 	int i;
555 
556 	if (!mlx5_debugfs_root)
557 		return 0;
558 
559 	cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
560 	if (!cache->root)
561 		return -ENOMEM;
562 
563 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
564 		ent = &cache->ent[i];
565 		sprintf(ent->name, "%d", ent->order);
566 		ent->dir = debugfs_create_dir(ent->name,  cache->root);
567 		if (!ent->dir)
568 			return -ENOMEM;
569 
570 		ent->fsize = debugfs_create_file("size", 0600, ent->dir, ent,
571 						 &size_fops);
572 		if (!ent->fsize)
573 			return -ENOMEM;
574 
575 		ent->flimit = debugfs_create_file("limit", 0600, ent->dir, ent,
576 						  &limit_fops);
577 		if (!ent->flimit)
578 			return -ENOMEM;
579 
580 		ent->fcur = debugfs_create_u32("cur", 0400, ent->dir,
581 					       &ent->cur);
582 		if (!ent->fcur)
583 			return -ENOMEM;
584 
585 		ent->fmiss = debugfs_create_u32("miss", 0600, ent->dir,
586 						&ent->miss);
587 		if (!ent->fmiss)
588 			return -ENOMEM;
589 	}
590 
591 	return 0;
592 }
593 
594 static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
595 {
596 	if (!mlx5_debugfs_root)
597 		return;
598 
599 	debugfs_remove_recursive(dev->cache.root);
600 }
601 
602 static void delay_time_func(unsigned long ctx)
603 {
604 	struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx;
605 
606 	dev->fill_delay = 0;
607 }
608 
609 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
610 {
611 	struct mlx5_mr_cache *cache = &dev->cache;
612 	struct mlx5_cache_ent *ent;
613 	int limit;
614 	int err;
615 	int i;
616 
617 	cache->wq = create_singlethread_workqueue("mkey_cache");
618 	if (!cache->wq) {
619 		mlx5_ib_warn(dev, "failed to create work queue\n");
620 		return -ENOMEM;
621 	}
622 
623 	setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev);
624 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
625 		INIT_LIST_HEAD(&cache->ent[i].head);
626 		spin_lock_init(&cache->ent[i].lock);
627 
628 		ent = &cache->ent[i];
629 		INIT_LIST_HEAD(&ent->head);
630 		spin_lock_init(&ent->lock);
631 		ent->order = i + 2;
632 		ent->dev = dev;
633 
634 		if (dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE)
635 			limit = dev->mdev->profile->mr_cache[i].limit;
636 		else
637 			limit = 0;
638 
639 		INIT_WORK(&ent->work, cache_work_func);
640 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
641 		ent->limit = limit;
642 		queue_work(cache->wq, &ent->work);
643 	}
644 
645 	err = mlx5_mr_cache_debugfs_init(dev);
646 	if (err)
647 		mlx5_ib_warn(dev, "cache debugfs failure\n");
648 
649 	return 0;
650 }
651 
652 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
653 {
654 	int i;
655 
656 	dev->cache.stopped = 1;
657 	flush_workqueue(dev->cache.wq);
658 
659 	mlx5_mr_cache_debugfs_cleanup(dev);
660 
661 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
662 		clean_keys(dev, i);
663 
664 	destroy_workqueue(dev->cache.wq);
665 	del_timer_sync(&dev->delay_timer);
666 
667 	return 0;
668 }
669 
670 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
671 {
672 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
673 	struct mlx5_core_dev *mdev = dev->mdev;
674 	struct mlx5_create_mkey_mbox_in *in;
675 	struct mlx5_mkey_seg *seg;
676 	struct mlx5_ib_mr *mr;
677 	int err;
678 
679 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
680 	if (!mr)
681 		return ERR_PTR(-ENOMEM);
682 
683 	in = kzalloc(sizeof(*in), GFP_KERNEL);
684 	if (!in) {
685 		err = -ENOMEM;
686 		goto err_free;
687 	}
688 
689 	seg = &in->seg;
690 	seg->flags = convert_access(acc) | MLX5_ACCESS_MODE_PA;
691 	seg->flags_pd = cpu_to_be32(to_mpd(pd)->pdn | MLX5_MKEY_LEN64);
692 	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
693 	seg->start_addr = 0;
694 
695 	err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, sizeof(*in), NULL, NULL,
696 				    NULL);
697 	if (err)
698 		goto err_in;
699 
700 	kfree(in);
701 	mr->ibmr.lkey = mr->mmkey.key;
702 	mr->ibmr.rkey = mr->mmkey.key;
703 	mr->umem = NULL;
704 
705 	return &mr->ibmr;
706 
707 err_in:
708 	kfree(in);
709 
710 err_free:
711 	kfree(mr);
712 
713 	return ERR_PTR(err);
714 }
715 
716 static int get_octo_len(u64 addr, u64 len, int page_size)
717 {
718 	u64 offset;
719 	int npages;
720 
721 	offset = addr & (page_size - 1);
722 	npages = ALIGN(len + offset, page_size) >> ilog2(page_size);
723 	return (npages + 1) / 2;
724 }
725 
726 static int use_umr(int order)
727 {
728 	return order <= MLX5_MAX_UMR_SHIFT;
729 }
730 
731 static int dma_map_mr_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
732 			  int npages, int page_shift, int *size,
733 			  __be64 **mr_pas, dma_addr_t *dma)
734 {
735 	__be64 *pas;
736 	struct device *ddev = dev->ib_dev.dma_device;
737 
738 	/*
739 	 * UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes.
740 	 * To avoid copying garbage after the pas array, we allocate
741 	 * a little more.
742 	 */
743 	*size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT);
744 	*mr_pas = kmalloc(*size + MLX5_UMR_ALIGN - 1, GFP_KERNEL);
745 	if (!(*mr_pas))
746 		return -ENOMEM;
747 
748 	pas = PTR_ALIGN(*mr_pas, MLX5_UMR_ALIGN);
749 	mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT);
750 	/* Clear padding after the actual pages. */
751 	memset(pas + npages, 0, *size - npages * sizeof(u64));
752 
753 	*dma = dma_map_single(ddev, pas, *size, DMA_TO_DEVICE);
754 	if (dma_mapping_error(ddev, *dma)) {
755 		kfree(*mr_pas);
756 		return -ENOMEM;
757 	}
758 
759 	return 0;
760 }
761 
762 static void prep_umr_wqe_common(struct ib_pd *pd, struct ib_send_wr *wr,
763 				struct ib_sge *sg, u64 dma, int n, u32 key,
764 				int page_shift)
765 {
766 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
767 	struct mlx5_umr_wr *umrwr = umr_wr(wr);
768 
769 	sg->addr = dma;
770 	sg->length = ALIGN(sizeof(u64) * n, 64);
771 	sg->lkey = dev->umrc.pd->local_dma_lkey;
772 
773 	wr->next = NULL;
774 	wr->sg_list = sg;
775 	if (n)
776 		wr->num_sge = 1;
777 	else
778 		wr->num_sge = 0;
779 
780 	wr->opcode = MLX5_IB_WR_UMR;
781 
782 	umrwr->npages = n;
783 	umrwr->page_shift = page_shift;
784 	umrwr->mkey = key;
785 }
786 
787 static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
788 			     struct ib_sge *sg, u64 dma, int n, u32 key,
789 			     int page_shift, u64 virt_addr, u64 len,
790 			     int access_flags)
791 {
792 	struct mlx5_umr_wr *umrwr = umr_wr(wr);
793 
794 	prep_umr_wqe_common(pd, wr, sg, dma, n, key, page_shift);
795 
796 	wr->send_flags = 0;
797 
798 	umrwr->target.virt_addr = virt_addr;
799 	umrwr->length = len;
800 	umrwr->access_flags = access_flags;
801 	umrwr->pd = pd;
802 }
803 
804 static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev,
805 			       struct ib_send_wr *wr, u32 key)
806 {
807 	struct mlx5_umr_wr *umrwr = umr_wr(wr);
808 
809 	wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE;
810 	wr->opcode = MLX5_IB_WR_UMR;
811 	umrwr->mkey = key;
812 }
813 
814 static struct ib_umem *mr_umem_get(struct ib_pd *pd, u64 start, u64 length,
815 				   int access_flags, int *npages,
816 				   int *page_shift, int *ncont, int *order)
817 {
818 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
819 	struct ib_umem *umem = ib_umem_get(pd->uobject->context, start, length,
820 					   access_flags, 0);
821 	if (IS_ERR(umem)) {
822 		mlx5_ib_err(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
823 		return (void *)umem;
824 	}
825 
826 	mlx5_ib_cont_pages(umem, start, npages, page_shift, ncont, order);
827 	if (!*npages) {
828 		mlx5_ib_warn(dev, "avoid zero region\n");
829 		ib_umem_release(umem);
830 		return ERR_PTR(-EINVAL);
831 	}
832 
833 	mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
834 		    *npages, *ncont, *order, *page_shift);
835 
836 	return umem;
837 }
838 
839 static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc)
840 {
841 	struct mlx5_ib_umr_context *context =
842 		container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
843 
844 	context->status = wc->status;
845 	complete(&context->done);
846 }
847 
848 static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
849 {
850 	context->cqe.done = mlx5_ib_umr_done;
851 	context->status = -1;
852 	init_completion(&context->done);
853 }
854 
855 static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
856 				  u64 virt_addr, u64 len, int npages,
857 				  int page_shift, int order, int access_flags)
858 {
859 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
860 	struct device *ddev = dev->ib_dev.dma_device;
861 	struct umr_common *umrc = &dev->umrc;
862 	struct mlx5_ib_umr_context umr_context;
863 	struct mlx5_umr_wr umrwr = {};
864 	struct ib_send_wr *bad;
865 	struct mlx5_ib_mr *mr;
866 	struct ib_sge sg;
867 	int size;
868 	__be64 *mr_pas;
869 	dma_addr_t dma;
870 	int err = 0;
871 	int i;
872 
873 	for (i = 0; i < 1; i++) {
874 		mr = alloc_cached_mr(dev, order);
875 		if (mr)
876 			break;
877 
878 		err = add_keys(dev, order2idx(dev, order), 1);
879 		if (err && err != -EAGAIN) {
880 			mlx5_ib_warn(dev, "add_keys failed, err %d\n", err);
881 			break;
882 		}
883 	}
884 
885 	if (!mr)
886 		return ERR_PTR(-EAGAIN);
887 
888 	err = dma_map_mr_pas(dev, umem, npages, page_shift, &size, &mr_pas,
889 			     &dma);
890 	if (err)
891 		goto free_mr;
892 
893 	mlx5_ib_init_umr_context(&umr_context);
894 
895 	umrwr.wr.wr_cqe = &umr_context.cqe;
896 	prep_umr_reg_wqe(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key,
897 			 page_shift, virt_addr, len, access_flags);
898 
899 	down(&umrc->sem);
900 	err = ib_post_send(umrc->qp, &umrwr.wr, &bad);
901 	if (err) {
902 		mlx5_ib_warn(dev, "post send failed, err %d\n", err);
903 		goto unmap_dma;
904 	} else {
905 		wait_for_completion(&umr_context.done);
906 		if (umr_context.status != IB_WC_SUCCESS) {
907 			mlx5_ib_warn(dev, "reg umr failed\n");
908 			err = -EFAULT;
909 		}
910 	}
911 
912 	mr->mmkey.iova = virt_addr;
913 	mr->mmkey.size = len;
914 	mr->mmkey.pd = to_mpd(pd)->pdn;
915 
916 	mr->live = 1;
917 
918 unmap_dma:
919 	up(&umrc->sem);
920 	dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
921 
922 	kfree(mr_pas);
923 
924 free_mr:
925 	if (err) {
926 		free_cached_mr(dev, mr);
927 		return ERR_PTR(err);
928 	}
929 
930 	return mr;
931 }
932 
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
/*
 * Update (or, with @zap set, clear) a range of MTT entries of @mr by means
 * of UMR work requests.
 *
 * @start_page_index: first page of the range; aligned down internally.
 * @npages:           number of pages in the range; enlarged accordingly.
 * @zap:              non-zero to write zeroed entries instead of the umem's
 *                    page addresses.
 *
 * The entries are staged in one zeroed page, or, if that page cannot be
 * allocated, in the static emergency buffer under its mutex. The range is
 * processed in chunks of what the staging buffer can hold; each chunk is
 * posted on the UMR QP and waited for before the next one is prepared.
 *
 * Returns 0 on success, -EINVAL when the range exceeds MLX5_MAX_UMR_PAGES,
 * -ENOMEM when the staging buffer cannot be DMA-mapped, the ib_post_send()
 * error, or -EFAULT when a UMR completes with an error status.
 */
int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
		       int zap)
{
	const int idx_align = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64);
	const int idx_mask = idx_align - 1;
	struct mlx5_ib_dev *dev = mr->dev;
	struct device *ddev = dev->ib_dev.dma_device;
	struct umr_common *umrc = &dev->umrc;
	struct ib_umem *umem = mr->umem;
	struct mlx5_ib_umr_context umr_context;
	struct mlx5_umr_wr wr;
	struct ib_send_wr *bad;
	struct ib_sge sg;
	dma_addr_t dma;
	__be64 *pas;
	size_t total_pages;
	size_t chunk_pages;
	size_t done_pages;
	bool emergency = false;
	int size;
	int err = 0;

	/*
	 * UMR copies MTTs in MLX5_UMR_MTT_ALIGNMENT byte units, so offset and
	 * length are aligned to that. Both statements are no-ops when the
	 * start index is already aligned.
	 */
	npages += start_page_index & idx_mask;
	start_page_index &= ~idx_mask;

	total_pages = ALIGN(npages, idx_align);

	if (start_page_index + total_pages > MLX5_MAX_UMR_PAGES)
		return -EINVAL;

	size = min_t(int, PAGE_SIZE, sizeof(u64) * total_pages);

	/*
	 * GFP_ATOMIC keeps us from recursing into page reclaim when called
	 * from an invalidation. The pas buffer must be 2k-aligned for
	 * Connect-IB.
	 */
	pas = (__be64 *)get_zeroed_page(GFP_ATOMIC);
	if (!pas) {
		mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n");
		pas = mlx5_ib_update_mtt_emergency_buffer;
		size = MLX5_UMR_MTT_MIN_CHUNK_SIZE;
		emergency = true;
		mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
		memset(pas, 0, size);
	}

	chunk_pages = size / sizeof(u64);
	dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, dma)) {
		mlx5_ib_err(dev, "unable to map DMA during MTT update.\n");
		err = -ENOMEM;
		goto free_pas;
	}

	done_pages = 0;
	while (done_pages < total_pages && !err) {
		dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);

		/* Never go past the end of the umem. */
		npages = min_t(size_t,
			       chunk_pages,
			       ib_umem_num_pages(umem) - start_page_index);

		if (!zap) {
			__mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT,
					       start_page_index, npages, pas,
					       MLX5_IB_MTT_PRESENT);
			/* Zero the padding behind the entries just filled. */
			memset(pas + npages, 0, size - npages * sizeof(u64));
		}

		dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);

		mlx5_ib_init_umr_context(&umr_context);

		sg.addr = dma;
		sg.length = ALIGN(npages * sizeof(u64),
				  MLX5_UMR_MTT_ALIGNMENT);
		sg.lkey = umrc->pd->local_dma_lkey;

		memset(&wr, 0, sizeof(wr));
		wr.wr.wr_cqe = &umr_context.cqe;
		wr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
				   MLX5_IB_SEND_UMR_UPDATE_MTT;
		wr.wr.sg_list = &sg;
		wr.wr.num_sge = 1;
		wr.wr.opcode = MLX5_IB_WR_UMR;
		wr.npages = sg.length / sizeof(u64);
		wr.page_shift = PAGE_SHIFT;
		wr.mkey = mr->mmkey.key;
		wr.target.offset = start_page_index;

		down(&umrc->sem);
		err = ib_post_send(umrc->qp, &wr.wr, &bad);
		if (err) {
			mlx5_ib_err(dev, "UMR post send failed, err %d\n", err);
		} else {
			wait_for_completion(&umr_context.done);
			if (umr_context.status != IB_WC_SUCCESS) {
				mlx5_ib_err(dev, "UMR completion failed, code %d\n",
					    umr_context.status);
				err = -EFAULT;
			}
		}
		up(&umrc->sem);

		done_pages += chunk_pages;
		start_page_index += chunk_pages;
	}
	dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);

free_pas:
	if (emergency)
		mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
	else
		free_page((unsigned long)pas);

	return err;
}
#endif
1055 
1056 /*
1057  * If ibmr is NULL it will be allocated by reg_create.
1058  * Else, the given ibmr will be used.
1059  */
1060 static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
1061 				     u64 virt_addr, u64 length,
1062 				     struct ib_umem *umem, int npages,
1063 				     int page_shift, int access_flags)
1064 {
1065 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1066 	struct mlx5_create_mkey_mbox_in *in;
1067 	struct mlx5_ib_mr *mr;
1068 	int inlen;
1069 	int err;
1070 	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
1071 
1072 	mr = ibmr ? to_mmr(ibmr) : kzalloc(sizeof(*mr), GFP_KERNEL);
1073 	if (!mr)
1074 		return ERR_PTR(-ENOMEM);
1075 
1076 	inlen = sizeof(*in) + sizeof(*in->pas) * ((npages + 1) / 2) * 2;
1077 	in = mlx5_vzalloc(inlen);
1078 	if (!in) {
1079 		err = -ENOMEM;
1080 		goto err_1;
1081 	}
1082 	mlx5_ib_populate_pas(dev, umem, page_shift, in->pas,
1083 			     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1084 
1085 	/* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags
1086 	 * in the page list submitted with the command. */
1087 	in->flags = pg_cap ? cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0;
1088 	in->seg.flags = convert_access(access_flags) |
1089 		MLX5_ACCESS_MODE_MTT;
1090 	in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
1091 	in->seg.start_addr = cpu_to_be64(virt_addr);
1092 	in->seg.len = cpu_to_be64(length);
1093 	in->seg.bsfs_octo_size = 0;
1094 	in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift));
1095 	in->seg.log2_page_size = page_shift;
1096 	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
1097 	in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length,
1098 							 1 << page_shift));
1099 	err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen, NULL,
1100 				    NULL, NULL);
1101 	if (err) {
1102 		mlx5_ib_warn(dev, "create mkey failed\n");
1103 		goto err_2;
1104 	}
1105 	mr->umem = umem;
1106 	mr->dev = dev;
1107 	mr->live = 1;
1108 	kvfree(in);
1109 
1110 	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
1111 
1112 	return mr;
1113 
1114 err_2:
1115 	kvfree(in);
1116 
1117 err_1:
1118 	if (!ibmr)
1119 		kfree(mr);
1120 
1121 	return ERR_PTR(err);
1122 }
1123 
1124 static void set_mr_fileds(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
1125 			  int npages, u64 length, int access_flags)
1126 {
1127 	mr->npages = npages;
1128 	atomic_add(npages, &dev->mdev->priv.reg_pages);
1129 	mr->ibmr.lkey = mr->mmkey.key;
1130 	mr->ibmr.rkey = mr->mmkey.key;
1131 	mr->ibmr.length = length;
1132 	mr->access_flags = access_flags;
1133 }
1134 
1135 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1136 				  u64 virt_addr, int access_flags,
1137 				  struct ib_udata *udata)
1138 {
1139 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1140 	struct mlx5_ib_mr *mr = NULL;
1141 	struct ib_umem *umem;
1142 	int page_shift;
1143 	int npages;
1144 	int ncont;
1145 	int order;
1146 	int err;
1147 
1148 	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
1149 		    start, virt_addr, length, access_flags);
1150 	umem = mr_umem_get(pd, start, length, access_flags, &npages,
1151 			   &page_shift, &ncont, &order);
1152 
1153 	if (IS_ERR(umem))
1154 		return (void *)umem;
1155 
1156 	if (use_umr(order)) {
1157 		mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift,
1158 			     order, access_flags);
1159 		if (PTR_ERR(mr) == -EAGAIN) {
1160 			mlx5_ib_dbg(dev, "cache empty for order %d", order);
1161 			mr = NULL;
1162 		}
1163 	} else if (access_flags & IB_ACCESS_ON_DEMAND) {
1164 		err = -EINVAL;
1165 		pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB");
1166 		goto error;
1167 	}
1168 
1169 	if (!mr)
1170 		mr = reg_create(NULL, pd, virt_addr, length, umem, ncont,
1171 				page_shift, access_flags);
1172 
1173 	if (IS_ERR(mr)) {
1174 		err = PTR_ERR(mr);
1175 		goto error;
1176 	}
1177 
1178 	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1179 
1180 	mr->umem = umem;
1181 	set_mr_fileds(dev, mr, npages, length, access_flags);
1182 
1183 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1184 	update_odp_mr(mr);
1185 #endif
1186 
1187 	return &mr->ibmr;
1188 
1189 error:
1190 	ib_umem_release(umem);
1191 	return ERR_PTR(err);
1192 }
1193 
1194 static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
1195 {
1196 	struct umr_common *umrc = &dev->umrc;
1197 	struct mlx5_ib_umr_context umr_context;
1198 	struct mlx5_umr_wr umrwr = {};
1199 	struct ib_send_wr *bad;
1200 	int err;
1201 
1202 	mlx5_ib_init_umr_context(&umr_context);
1203 
1204 	umrwr.wr.wr_cqe = &umr_context.cqe;
1205 	prep_umr_unreg_wqe(dev, &umrwr.wr, mr->mmkey.key);
1206 
1207 	down(&umrc->sem);
1208 	err = ib_post_send(umrc->qp, &umrwr.wr, &bad);
1209 	if (err) {
1210 		up(&umrc->sem);
1211 		mlx5_ib_dbg(dev, "err %d\n", err);
1212 		goto error;
1213 	} else {
1214 		wait_for_completion(&umr_context.done);
1215 		up(&umrc->sem);
1216 	}
1217 	if (umr_context.status != IB_WC_SUCCESS) {
1218 		mlx5_ib_warn(dev, "unreg umr failed\n");
1219 		err = -EFAULT;
1220 		goto error;
1221 	}
1222 	return 0;
1223 
1224 error:
1225 	return err;
1226 }
1227 
1228 static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr,
1229 		     u64 length, int npages, int page_shift, int order,
1230 		     int access_flags, int flags)
1231 {
1232 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1233 	struct device *ddev = dev->ib_dev.dma_device;
1234 	struct mlx5_ib_umr_context umr_context;
1235 	struct ib_send_wr *bad;
1236 	struct mlx5_umr_wr umrwr = {};
1237 	struct ib_sge sg;
1238 	struct umr_common *umrc = &dev->umrc;
1239 	dma_addr_t dma = 0;
1240 	__be64 *mr_pas = NULL;
1241 	int size;
1242 	int err;
1243 
1244 	mlx5_ib_init_umr_context(&umr_context);
1245 
1246 	umrwr.wr.wr_cqe = &umr_context.cqe;
1247 	umrwr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE;
1248 
1249 	if (flags & IB_MR_REREG_TRANS) {
1250 		err = dma_map_mr_pas(dev, mr->umem, npages, page_shift, &size,
1251 				     &mr_pas, &dma);
1252 		if (err)
1253 			return err;
1254 
1255 		umrwr.target.virt_addr = virt_addr;
1256 		umrwr.length = length;
1257 		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1258 	}
1259 
1260 	prep_umr_wqe_common(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key,
1261 			    page_shift);
1262 
1263 	if (flags & IB_MR_REREG_PD) {
1264 		umrwr.pd = pd;
1265 		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD;
1266 	}
1267 
1268 	if (flags & IB_MR_REREG_ACCESS) {
1269 		umrwr.access_flags = access_flags;
1270 		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_ACCESS;
1271 	}
1272 
1273 	/* post send request to UMR QP */
1274 	down(&umrc->sem);
1275 	err = ib_post_send(umrc->qp, &umrwr.wr, &bad);
1276 
1277 	if (err) {
1278 		mlx5_ib_warn(dev, "post send failed, err %d\n", err);
1279 	} else {
1280 		wait_for_completion(&umr_context.done);
1281 		if (umr_context.status != IB_WC_SUCCESS) {
1282 			mlx5_ib_warn(dev, "reg umr failed (%u)\n",
1283 				     umr_context.status);
1284 			err = -EFAULT;
1285 		}
1286 	}
1287 
1288 	up(&umrc->sem);
1289 	if (flags & IB_MR_REREG_TRANS) {
1290 		dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
1291 		kfree(mr_pas);
1292 	}
1293 	return err;
1294 }
1295 
1296 int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1297 			  u64 length, u64 virt_addr, int new_access_flags,
1298 			  struct ib_pd *new_pd, struct ib_udata *udata)
1299 {
1300 	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1301 	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1302 	struct ib_pd *pd = (flags & IB_MR_REREG_PD) ? new_pd : ib_mr->pd;
1303 	int access_flags = flags & IB_MR_REREG_ACCESS ?
1304 			    new_access_flags :
1305 			    mr->access_flags;
1306 	u64 addr = (flags & IB_MR_REREG_TRANS) ? virt_addr : mr->umem->address;
1307 	u64 len = (flags & IB_MR_REREG_TRANS) ? length : mr->umem->length;
1308 	int page_shift = 0;
1309 	int npages = 0;
1310 	int ncont = 0;
1311 	int order = 0;
1312 	int err;
1313 
1314 	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
1315 		    start, virt_addr, length, access_flags);
1316 
1317 	if (flags != IB_MR_REREG_PD) {
1318 		/*
1319 		 * Replace umem. This needs to be done whether or not UMR is
1320 		 * used.
1321 		 */
1322 		flags |= IB_MR_REREG_TRANS;
1323 		ib_umem_release(mr->umem);
1324 		mr->umem = mr_umem_get(pd, addr, len, access_flags, &npages,
1325 				       &page_shift, &ncont, &order);
1326 		if (IS_ERR(mr->umem)) {
1327 			err = PTR_ERR(mr->umem);
1328 			mr->umem = NULL;
1329 			return err;
1330 		}
1331 	}
1332 
1333 	if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) {
1334 		/*
1335 		 * UMR can't be used - MKey needs to be replaced.
1336 		 */
1337 		if (mr->umred) {
1338 			err = unreg_umr(dev, mr);
1339 			if (err)
1340 				mlx5_ib_warn(dev, "Failed to unregister MR\n");
1341 		} else {
1342 			err = destroy_mkey(dev, mr);
1343 			if (err)
1344 				mlx5_ib_warn(dev, "Failed to destroy MKey\n");
1345 		}
1346 		if (err)
1347 			return err;
1348 
1349 		mr = reg_create(ib_mr, pd, addr, len, mr->umem, ncont,
1350 				page_shift, access_flags);
1351 
1352 		if (IS_ERR(mr))
1353 			return PTR_ERR(mr);
1354 
1355 		mr->umred = 0;
1356 	} else {
1357 		/*
1358 		 * Send a UMR WQE
1359 		 */
1360 		err = rereg_umr(pd, mr, addr, len, npages, page_shift,
1361 				order, access_flags, flags);
1362 		if (err) {
1363 			mlx5_ib_warn(dev, "Failed to rereg UMR\n");
1364 			return err;
1365 		}
1366 	}
1367 
1368 	if (flags & IB_MR_REREG_PD) {
1369 		ib_mr->pd = pd;
1370 		mr->mmkey.pd = to_mpd(pd)->pdn;
1371 	}
1372 
1373 	if (flags & IB_MR_REREG_ACCESS)
1374 		mr->access_flags = access_flags;
1375 
1376 	if (flags & IB_MR_REREG_TRANS) {
1377 		atomic_sub(mr->npages, &dev->mdev->priv.reg_pages);
1378 		set_mr_fileds(dev, mr, npages, len, access_flags);
1379 		mr->mmkey.iova = addr;
1380 		mr->mmkey.size = len;
1381 	}
1382 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1383 	update_odp_mr(mr);
1384 #endif
1385 
1386 	return 0;
1387 }
1388 
1389 static int
1390 mlx5_alloc_priv_descs(struct ib_device *device,
1391 		      struct mlx5_ib_mr *mr,
1392 		      int ndescs,
1393 		      int desc_size)
1394 {
1395 	int size = ndescs * desc_size;
1396 	int add_size;
1397 	int ret;
1398 
1399 	add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
1400 
1401 	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1402 	if (!mr->descs_alloc)
1403 		return -ENOMEM;
1404 
1405 	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1406 
1407 	mr->desc_map = dma_map_single(device->dma_device, mr->descs,
1408 				      size, DMA_TO_DEVICE);
1409 	if (dma_mapping_error(device->dma_device, mr->desc_map)) {
1410 		ret = -ENOMEM;
1411 		goto err;
1412 	}
1413 
1414 	return 0;
1415 err:
1416 	kfree(mr->descs_alloc);
1417 
1418 	return ret;
1419 }
1420 
1421 static void
1422 mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
1423 {
1424 	if (mr->descs) {
1425 		struct ib_device *device = mr->ibmr.device;
1426 		int size = mr->max_descs * mr->desc_size;
1427 
1428 		dma_unmap_single(device->dma_device, mr->desc_map,
1429 				 size, DMA_TO_DEVICE);
1430 		kfree(mr->descs_alloc);
1431 		mr->descs = NULL;
1432 	}
1433 }
1434 
1435 static int clean_mr(struct mlx5_ib_mr *mr)
1436 {
1437 	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1438 	int umred = mr->umred;
1439 	int err;
1440 
1441 	if (mr->sig) {
1442 		if (mlx5_core_destroy_psv(dev->mdev,
1443 					  mr->sig->psv_memory.psv_idx))
1444 			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1445 				     mr->sig->psv_memory.psv_idx);
1446 		if (mlx5_core_destroy_psv(dev->mdev,
1447 					  mr->sig->psv_wire.psv_idx))
1448 			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1449 				     mr->sig->psv_wire.psv_idx);
1450 		kfree(mr->sig);
1451 		mr->sig = NULL;
1452 	}
1453 
1454 	mlx5_free_priv_descs(mr);
1455 
1456 	if (!umred) {
1457 		err = destroy_mkey(dev, mr);
1458 		if (err) {
1459 			mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
1460 				     mr->mmkey.key, err);
1461 			return err;
1462 		}
1463 	} else {
1464 		err = unreg_umr(dev, mr);
1465 		if (err) {
1466 			mlx5_ib_warn(dev, "failed unregister\n");
1467 			return err;
1468 		}
1469 		free_cached_mr(dev, mr);
1470 	}
1471 
1472 	if (!umred)
1473 		kfree(mr);
1474 
1475 	return 0;
1476 }
1477 
1478 int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
1479 {
1480 	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
1481 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
1482 	int npages = mr->npages;
1483 	struct ib_umem *umem = mr->umem;
1484 
1485 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1486 	if (umem && umem->odp_data) {
1487 		/* Prevent new page faults from succeeding */
1488 		mr->live = 0;
1489 		/* Wait for all running page-fault handlers to finish. */
1490 		synchronize_srcu(&dev->mr_srcu);
1491 		/* Destroy all page mappings */
1492 		mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
1493 					 ib_umem_end(umem));
1494 		/*
1495 		 * We kill the umem before the MR for ODP,
1496 		 * so that there will not be any invalidations in
1497 		 * flight, looking at the *mr struct.
1498 		 */
1499 		ib_umem_release(umem);
1500 		atomic_sub(npages, &dev->mdev->priv.reg_pages);
1501 
1502 		/* Avoid double-freeing the umem. */
1503 		umem = NULL;
1504 	}
1505 #endif
1506 
1507 	clean_mr(mr);
1508 
1509 	if (umem) {
1510 		ib_umem_release(umem);
1511 		atomic_sub(npages, &dev->mdev->priv.reg_pages);
1512 	}
1513 
1514 	return 0;
1515 }
1516 
1517 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
1518 			       enum ib_mr_type mr_type,
1519 			       u32 max_num_sg)
1520 {
1521 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1522 	struct mlx5_create_mkey_mbox_in *in;
1523 	struct mlx5_ib_mr *mr;
1524 	int ndescs = ALIGN(max_num_sg, 4);
1525 	int err;
1526 
1527 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1528 	if (!mr)
1529 		return ERR_PTR(-ENOMEM);
1530 
1531 	in = kzalloc(sizeof(*in), GFP_KERNEL);
1532 	if (!in) {
1533 		err = -ENOMEM;
1534 		goto err_free;
1535 	}
1536 
1537 	in->seg.status = MLX5_MKEY_STATUS_FREE;
1538 	in->seg.xlt_oct_size = cpu_to_be32(ndescs);
1539 	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
1540 	in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
1541 
1542 	if (mr_type == IB_MR_TYPE_MEM_REG) {
1543 		mr->access_mode = MLX5_ACCESS_MODE_MTT;
1544 		in->seg.log2_page_size = PAGE_SHIFT;
1545 
1546 		err = mlx5_alloc_priv_descs(pd->device, mr,
1547 					    ndescs, sizeof(u64));
1548 		if (err)
1549 			goto err_free_in;
1550 
1551 		mr->desc_size = sizeof(u64);
1552 		mr->max_descs = ndescs;
1553 	} else if (mr_type == IB_MR_TYPE_SG_GAPS) {
1554 		mr->access_mode = MLX5_ACCESS_MODE_KLM;
1555 
1556 		err = mlx5_alloc_priv_descs(pd->device, mr,
1557 					    ndescs, sizeof(struct mlx5_klm));
1558 		if (err)
1559 			goto err_free_in;
1560 		mr->desc_size = sizeof(struct mlx5_klm);
1561 		mr->max_descs = ndescs;
1562 	} else if (mr_type == IB_MR_TYPE_SIGNATURE) {
1563 		u32 psv_index[2];
1564 
1565 		in->seg.flags_pd = cpu_to_be32(be32_to_cpu(in->seg.flags_pd) |
1566 							   MLX5_MKEY_BSF_EN);
1567 		in->seg.bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE);
1568 		mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
1569 		if (!mr->sig) {
1570 			err = -ENOMEM;
1571 			goto err_free_in;
1572 		}
1573 
1574 		/* create mem & wire PSVs */
1575 		err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn,
1576 					   2, psv_index);
1577 		if (err)
1578 			goto err_free_sig;
1579 
1580 		mr->access_mode = MLX5_ACCESS_MODE_KLM;
1581 		mr->sig->psv_memory.psv_idx = psv_index[0];
1582 		mr->sig->psv_wire.psv_idx = psv_index[1];
1583 
1584 		mr->sig->sig_status_checked = true;
1585 		mr->sig->sig_err_exists = false;
1586 		/* Next UMR, Arm SIGERR */
1587 		++mr->sig->sigerr_count;
1588 	} else {
1589 		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
1590 		err = -EINVAL;
1591 		goto err_free_in;
1592 	}
1593 
1594 	in->seg.flags = MLX5_PERM_UMR_EN | mr->access_mode;
1595 	err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, sizeof(*in),
1596 				    NULL, NULL, NULL);
1597 	if (err)
1598 		goto err_destroy_psv;
1599 
1600 	mr->ibmr.lkey = mr->mmkey.key;
1601 	mr->ibmr.rkey = mr->mmkey.key;
1602 	mr->umem = NULL;
1603 	kfree(in);
1604 
1605 	return &mr->ibmr;
1606 
1607 err_destroy_psv:
1608 	if (mr->sig) {
1609 		if (mlx5_core_destroy_psv(dev->mdev,
1610 					  mr->sig->psv_memory.psv_idx))
1611 			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1612 				     mr->sig->psv_memory.psv_idx);
1613 		if (mlx5_core_destroy_psv(dev->mdev,
1614 					  mr->sig->psv_wire.psv_idx))
1615 			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1616 				     mr->sig->psv_wire.psv_idx);
1617 	}
1618 	mlx5_free_priv_descs(mr);
1619 err_free_sig:
1620 	kfree(mr->sig);
1621 err_free_in:
1622 	kfree(in);
1623 err_free:
1624 	kfree(mr);
1625 	return ERR_PTR(err);
1626 }
1627 
1628 struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
1629 			       struct ib_udata *udata)
1630 {
1631 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1632 	struct mlx5_create_mkey_mbox_in *in = NULL;
1633 	struct mlx5_ib_mw *mw = NULL;
1634 	int ndescs;
1635 	int err;
1636 	struct mlx5_ib_alloc_mw req = {};
1637 	struct {
1638 		__u32	comp_mask;
1639 		__u32	response_length;
1640 	} resp = {};
1641 
1642 	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
1643 	if (err)
1644 		return ERR_PTR(err);
1645 
1646 	if (req.comp_mask || req.reserved1 || req.reserved2)
1647 		return ERR_PTR(-EOPNOTSUPP);
1648 
1649 	if (udata->inlen > sizeof(req) &&
1650 	    !ib_is_udata_cleared(udata, sizeof(req),
1651 				 udata->inlen - sizeof(req)))
1652 		return ERR_PTR(-EOPNOTSUPP);
1653 
1654 	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
1655 
1656 	mw = kzalloc(sizeof(*mw), GFP_KERNEL);
1657 	in = kzalloc(sizeof(*in), GFP_KERNEL);
1658 	if (!mw || !in) {
1659 		err = -ENOMEM;
1660 		goto free;
1661 	}
1662 
1663 	in->seg.status = MLX5_MKEY_STATUS_FREE;
1664 	in->seg.xlt_oct_size = cpu_to_be32(ndescs);
1665 	in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
1666 	in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_KLM |
1667 		MLX5_PERM_LOCAL_READ;
1668 	if (type == IB_MW_TYPE_2)
1669 		in->seg.flags_pd |= cpu_to_be32(MLX5_MKEY_REMOTE_INVAL);
1670 	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
1671 
1672 	err = mlx5_core_create_mkey(dev->mdev, &mw->mmkey, in, sizeof(*in),
1673 				    NULL, NULL, NULL);
1674 	if (err)
1675 		goto free;
1676 
1677 	mw->ibmw.rkey = mw->mmkey.key;
1678 
1679 	resp.response_length = min(offsetof(typeof(resp), response_length) +
1680 				   sizeof(resp.response_length), udata->outlen);
1681 	if (resp.response_length) {
1682 		err = ib_copy_to_udata(udata, &resp, resp.response_length);
1683 		if (err) {
1684 			mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
1685 			goto free;
1686 		}
1687 	}
1688 
1689 	kfree(in);
1690 	return &mw->ibmw;
1691 
1692 free:
1693 	kfree(mw);
1694 	kfree(in);
1695 	return ERR_PTR(err);
1696 }
1697 
1698 int mlx5_ib_dealloc_mw(struct ib_mw *mw)
1699 {
1700 	struct mlx5_ib_mw *mmw = to_mmw(mw);
1701 	int err;
1702 
1703 	err =  mlx5_core_destroy_mkey((to_mdev(mw->device))->mdev,
1704 				      &mmw->mmkey);
1705 	if (!err)
1706 		kfree(mmw);
1707 	return err;
1708 }
1709 
1710 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
1711 			    struct ib_mr_status *mr_status)
1712 {
1713 	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
1714 	int ret = 0;
1715 
1716 	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
1717 		pr_err("Invalid status check mask\n");
1718 		ret = -EINVAL;
1719 		goto done;
1720 	}
1721 
1722 	mr_status->fail_status = 0;
1723 	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
1724 		if (!mmr->sig) {
1725 			ret = -EINVAL;
1726 			pr_err("signature status check requested on a non-signature enabled MR\n");
1727 			goto done;
1728 		}
1729 
1730 		mmr->sig->sig_status_checked = true;
1731 		if (!mmr->sig->sig_err_exists)
1732 			goto done;
1733 
1734 		if (ibmr->lkey == mmr->sig->err_item.key)
1735 			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
1736 			       sizeof(mr_status->sig_err));
1737 		else {
1738 			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
1739 			mr_status->sig_err.sig_err_offset = 0;
1740 			mr_status->sig_err.key = mmr->sig->err_item.key;
1741 		}
1742 
1743 		mmr->sig->sig_err_exists = false;
1744 		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
1745 	}
1746 
1747 done:
1748 	return ret;
1749 }
1750 
1751 static int
1752 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
1753 		   struct scatterlist *sgl,
1754 		   unsigned short sg_nents,
1755 		   unsigned int *sg_offset_p)
1756 {
1757 	struct scatterlist *sg = sgl;
1758 	struct mlx5_klm *klms = mr->descs;
1759 	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
1760 	u32 lkey = mr->ibmr.pd->local_dma_lkey;
1761 	int i;
1762 
1763 	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
1764 	mr->ibmr.length = 0;
1765 	mr->ndescs = sg_nents;
1766 
1767 	for_each_sg(sgl, sg, sg_nents, i) {
1768 		if (unlikely(i > mr->max_descs))
1769 			break;
1770 		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
1771 		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
1772 		klms[i].key = cpu_to_be32(lkey);
1773 		mr->ibmr.length += sg_dma_len(sg);
1774 
1775 		sg_offset = 0;
1776 	}
1777 
1778 	if (sg_offset_p)
1779 		*sg_offset_p = sg_offset;
1780 
1781 	return i;
1782 }
1783 
1784 static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
1785 {
1786 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
1787 	__be64 *descs;
1788 
1789 	if (unlikely(mr->ndescs == mr->max_descs))
1790 		return -ENOMEM;
1791 
1792 	descs = mr->descs;
1793 	descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
1794 
1795 	return 0;
1796 }
1797 
1798 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
1799 		      unsigned int *sg_offset)
1800 {
1801 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
1802 	int n;
1803 
1804 	mr->ndescs = 0;
1805 
1806 	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
1807 				   mr->desc_size * mr->max_descs,
1808 				   DMA_TO_DEVICE);
1809 
1810 	if (mr->access_mode == MLX5_ACCESS_MODE_KLM)
1811 		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset);
1812 	else
1813 		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
1814 				mlx5_set_page);
1815 
1816 	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
1817 				      mr->desc_size * mr->max_descs,
1818 				      DMA_TO_DEVICE);
1819 
1820 	return n;
1821 }
1822