xref: /openbmc/linux/drivers/infiniband/hw/mlx5/mr.c (revision e23feb16)
1 /*
2  * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 
33 
34 #include <linux/kref.h>
35 #include <linux/random.h>
36 #include <linux/debugfs.h>
37 #include <linux/export.h>
38 #include <rdma/ib_umem.h>
39 #include "mlx5_ib.h"
40 
41 enum {
42 	DEF_CACHE_SIZE	= 10,
43 };
44 
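/*
 * Editor's note (added comment): round @ptr up to an @align byte boundary;
 * @align must be a power of two.  Used to give the PAS buffer the 64-byte
 * alignment the hardware page list apparently requires (see the 0x40
 * alignment and the WARN_ON(map & 0x3f) elsewhere in this file).
 */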
45 static __be64 *mr_align(__be64 *ptr, int align)
46 {
47 	unsigned long mask = align - 1;
48 
49 	return (__be64 *)(((unsigned long)ptr + mask) & ~mask);
50 }
51 
52 static int order2idx(struct mlx5_ib_dev *dev, int order)
53 {
54 	struct mlx5_mr_cache *cache = &dev->cache;
55 
56 	if (order < cache->ent[0].order)
57 		return 0;
58 	else
59 		return order - cache->ent[0].order;
60 }
61 
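/*
 * Editor's note (added comment): pre-create @num mkeys for cache entry @c.
 * Each mkey is created in the free state (status bit 6) with UMR enabled so
 * it can later be bound to a user registration through a UMR work request.
 * The PAS buffer is over-allocated by 0x3f bytes so it can be aligned to 64
 * bytes before DMA mapping.
 */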
62 static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
63 {
64 	struct device *ddev = dev->ib_dev.dma_device;
65 	struct mlx5_mr_cache *cache = &dev->cache;
66 	struct mlx5_cache_ent *ent = &cache->ent[c];
67 	struct mlx5_create_mkey_mbox_in *in;
68 	struct mlx5_ib_mr *mr;
69 	int npages = 1 << ent->order;
70 	int size = sizeof(u64) * npages;
71 	int err = 0;
72 	int i;
73 
74 	in = kzalloc(sizeof(*in), GFP_KERNEL);
75 	if (!in)
76 		return -ENOMEM;
77 
78 	for (i = 0; i < num; i++) {
79 		mr = kzalloc(sizeof(*mr), GFP_KERNEL);
80 		if (!mr) {
81 			err = -ENOMEM;
82 			goto out;
83 		}
84 		mr->order = ent->order;
85 		mr->umred = 1;
86 		mr->pas = kmalloc(size + 0x3f, GFP_KERNEL);
87 		if (!mr->pas) {
88 			kfree(mr);
89 			err = -ENOMEM;
90 			goto out;
91 		}
92 		mr->dma = dma_map_single(ddev, mr_align(mr->pas, 0x40), size,
93 					 DMA_TO_DEVICE);
94 		if (dma_mapping_error(ddev, mr->dma)) {
95 			kfree(mr->pas);
96 			kfree(mr);
97 			err = -ENOMEM;
98 			goto out;
99 		}
100 
101 		in->seg.status = 1 << 6; /* free */
102 		in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2);
103 		in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
104 		in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN;
105 		in->seg.log2_page_size = 12;
106 
107 		err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in,
108 					    sizeof(*in));
109 		if (err) {
110 			mlx5_ib_warn(dev, "create mkey failed %d\n", err);
111 			dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE);
112 			kfree(mr->pas);
113 			kfree(mr);
114 			goto out;
115 		}
116 		cache->last_add = jiffies;
117 
118 		spin_lock(&ent->lock);
119 		list_add_tail(&mr->list, &ent->head);
120 		ent->cur++;
121 		ent->size++;
122 		spin_unlock(&ent->lock);
123 	}
124 
125 out:
126 	kfree(in);
127 	return err;
128 }
129 
130 static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
131 {
132 	struct device *ddev = dev->ib_dev.dma_device;
133 	struct mlx5_mr_cache *cache = &dev->cache;
134 	struct mlx5_cache_ent *ent = &cache->ent[c];
135 	struct mlx5_ib_mr *mr;
136 	int size;
137 	int err;
138 	int i;
139 
140 	for (i = 0; i < num; i++) {
141 		spin_lock(&ent->lock);
142 		if (list_empty(&ent->head)) {
143 			spin_unlock(&ent->lock);
144 			return;
145 		}
146 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
147 		list_del(&mr->list);
148 		ent->cur--;
149 		ent->size--;
150 		spin_unlock(&ent->lock);
151 		err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr);
152 		if (err) {
153 			mlx5_ib_warn(dev, "failed to destroy mkey\n");
154 		} else {
155 			size = ALIGN(sizeof(u64) * (1 << mr->order), 0x40);
156 			dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE);
157 			kfree(mr->pas);
158 			kfree(mr);
159 		}
160 	}
161 }
162 
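/*
 * Editor's note (added comment): debugfs interface.  Every cache entry
 * exposes a writable "size" (total number of mkeys owned by the entry) and
 * "limit" (watermark used by the background work to decide when to grow or
 * shrink the entry), plus "cur" and "miss" counters.
 */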
163 static ssize_t size_write(struct file *filp, const char __user *buf,
164 			  size_t count, loff_t *pos)
165 {
166 	struct mlx5_cache_ent *ent = filp->private_data;
167 	struct mlx5_ib_dev *dev = ent->dev;
168 	char lbuf[20];
169 	u32 var;
170 	int err;
171 	int c;
172 
173 	if (copy_from_user(lbuf, buf, min(count, sizeof(lbuf) - 1)))
174 		return -EFAULT;
175 
176 	c = order2idx(dev, ent->order);
177 	lbuf[min(count, sizeof(lbuf) - 1)] = 0;
178 
179 	if (sscanf(lbuf, "%u", &var) != 1)
180 		return -EINVAL;
181 
182 	if (var < ent->limit)
183 		return -EINVAL;
184 
185 	if (var > ent->size) {
186 		err = add_keys(dev, c, var - ent->size);
187 		if (err)
188 			return err;
189 	} else if (var < ent->size) {
190 		remove_keys(dev, c, ent->size - var);
191 	}
192 
193 	return count;
194 }
195 
196 static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
197 			 loff_t *pos)
198 {
199 	struct mlx5_cache_ent *ent = filp->private_data;
200 	char lbuf[20];
201 	int err;
202 
203 	if (*pos)
204 		return 0;
205 
206 	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size);
207 	if (err < 0)
208 		return err;
209 
210 	if (copy_to_user(buf, lbuf, err))
211 		return -EFAULT;
212 
213 	*pos += err;
214 
215 	return err;
216 }
217 
218 static const struct file_operations size_fops = {
219 	.owner	= THIS_MODULE,
220 	.open	= simple_open,
221 	.write	= size_write,
222 	.read	= size_read,
223 };
224 
225 static ssize_t limit_write(struct file *filp, const char __user *buf,
226 			   size_t count, loff_t *pos)
227 {
228 	struct mlx5_cache_ent *ent = filp->private_data;
229 	struct mlx5_ib_dev *dev = ent->dev;
230 	char lbuf[20];
231 	u32 var;
232 	int err;
233 	int c;
234 
235 	if (copy_from_user(lbuf, buf, min(count, sizeof(lbuf) - 1)))
236 		return -EFAULT;
237 
238 	c = order2idx(dev, ent->order);
239 	lbuf[min(count, sizeof(lbuf) - 1)] = 0;
240 
241 	if (sscanf(lbuf, "%u", &var) != 1)
242 		return -EINVAL;
243 
244 	if (var > ent->size)
245 		return -EINVAL;
246 
247 	ent->limit = var;
248 
249 	if (ent->cur < ent->limit) {
250 		err = add_keys(dev, c, 2 * ent->limit - ent->cur);
251 		if (err)
252 			return err;
253 	}
254 
255 	return count;
256 }
257 
258 static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
259 			  loff_t *pos)
260 {
261 	struct mlx5_cache_ent *ent = filp->private_data;
262 	char lbuf[20];
263 	int err;
264 
265 	if (*pos)
266 		return 0;
267 
268 	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
269 	if (err < 0)
270 		return err;
271 
272 	if (copy_to_user(buf, lbuf, err))
273 		return -EFAULT;
274 
275 	*pos += err;
276 
277 	return err;
278 }
279 
280 static const struct file_operations limit_fops = {
281 	.owner	= THIS_MODULE,
282 	.open	= simple_open,
283 	.write	= limit_write,
284 	.read	= limit_read,
285 };
286 
287 static int someone_adding(struct mlx5_mr_cache *cache)
288 {
289 	int i;
290 
291 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
292 		if (cache->ent[i].cur < cache->ent[i].limit)
293 			return 1;
294 	}
295 
296 	return 0;
297 }
298 
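/*
 * Editor's note (added comment): background maintenance for one cache entry.
 * Grow it one mkey at a time while it holds fewer than 2 * limit mkeys, and
 * shrink it once it holds more than 2 * limit, but only when no entry is
 * currently being refilled and nothing has been added for 60 seconds;
 * otherwise retry after a 60 second delay.
 */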
299 static void __cache_work_func(struct mlx5_cache_ent *ent)
300 {
301 	struct mlx5_ib_dev *dev = ent->dev;
302 	struct mlx5_mr_cache *cache = &dev->cache;
303 	int i = order2idx(dev, ent->order);
304 
305 	if (cache->stopped)
306 		return;
307 
308 	ent = &dev->cache.ent[i];
309 	if (ent->cur < 2 * ent->limit) {
310 		add_keys(dev, i, 1);
311 		if (ent->cur < 2 * ent->limit)
312 			queue_work(cache->wq, &ent->work);
313 	} else if (ent->cur > 2 * ent->limit) {
314 		if (!someone_adding(cache) &&
315 		    time_after(jiffies, cache->last_add + 60 * HZ)) {
316 			remove_keys(dev, i, 1);
317 			if (ent->cur > ent->limit)
318 				queue_work(cache->wq, &ent->work);
319 		} else {
320 			queue_delayed_work(cache->wq, &ent->dwork, 60 * HZ);
321 		}
322 	}
323 }
324 
325 static void delayed_cache_work_func(struct work_struct *work)
326 {
327 	struct mlx5_cache_ent *ent;
328 
329 	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
330 	__cache_work_func(ent);
331 }
332 
333 static void cache_work_func(struct work_struct *work)
334 {
335 	struct mlx5_cache_ent *ent;
336 
337 	ent = container_of(work, struct mlx5_cache_ent, work);
338 	__cache_work_func(ent);
339 }
340 
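/*
 * Editor's note (added comment): take an MR from the cache, starting at the
 * entry matching @order and falling back to larger orders if that entry is
 * empty.  Entries found empty, or left below their limit, get their
 * background work kicked so they refill, and a miss is counted when nothing
 * suitable is cached.
 */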
341 static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
342 {
343 	struct mlx5_mr_cache *cache = &dev->cache;
344 	struct mlx5_ib_mr *mr = NULL;
345 	struct mlx5_cache_ent *ent;
346 	int c;
347 	int i;
348 
349 	c = order2idx(dev, order);
350 	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
351 		mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c);
352 		return NULL;
353 	}
354 
355 	for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) {
356 		ent = &cache->ent[i];
357 
358 		mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
359 
360 		spin_lock(&ent->lock);
361 		if (!list_empty(&ent->head)) {
362 			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
363 					      list);
364 			list_del(&mr->list);
365 			ent->cur--;
366 			spin_unlock(&ent->lock);
367 			if (ent->cur < ent->limit)
368 				queue_work(cache->wq, &ent->work);
369 			break;
370 		}
371 		spin_unlock(&ent->lock);
372 
373 		queue_work(cache->wq, &ent->work);
374 
375 		if (mr)
376 			break;
377 	}
378 
379 	if (!mr)
380 		cache->ent[c].miss++;
381 
382 	return mr;
383 }
384 
385 static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
386 {
387 	struct mlx5_mr_cache *cache = &dev->cache;
388 	struct mlx5_cache_ent *ent;
389 	int shrink = 0;
390 	int c;
391 
392 	c = order2idx(dev, mr->order);
393 	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
394 		mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c);
395 		return;
396 	}
397 	ent = &cache->ent[c];
398 	spin_lock(&ent->lock);
399 	list_add_tail(&mr->list, &ent->head);
400 	ent->cur++;
401 	if (ent->cur > 2 * ent->limit)
402 		shrink = 1;
403 	spin_unlock(&ent->lock);
404 
405 	if (shrink)
406 		queue_work(cache->wq, &ent->work);
407 }
408 
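/*
 * Editor's note (added comment): destroy every mkey in cache entry @c; like
 * remove_keys() but without an upper bound.  Used when the cache is torn
 * down.
 */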
409 static void clean_keys(struct mlx5_ib_dev *dev, int c)
410 {
411 	struct device *ddev = dev->ib_dev.dma_device;
412 	struct mlx5_mr_cache *cache = &dev->cache;
413 	struct mlx5_cache_ent *ent = &cache->ent[c];
414 	struct mlx5_ib_mr *mr;
415 	int size;
416 	int err;
417 
418 	while (1) {
419 		spin_lock(&ent->lock);
420 		if (list_empty(&ent->head)) {
421 			spin_unlock(&ent->lock);
422 			return;
423 		}
424 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
425 		list_del(&mr->list);
426 		ent->cur--;
427 		ent->size--;
428 		spin_unlock(&ent->lock);
429 		err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr);
430 		if (err) {
431 			mlx5_ib_warn(dev, "failed to destroy mkey\n");
432 		} else {
433 			size = ALIGN(sizeof(u64) * (1 << mr->order), 0x40);
434 			dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE);
435 			kfree(mr->pas);
436 			kfree(mr);
437 		}
438 	}
439 }
440 
441 static int mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
442 {
443 	struct mlx5_mr_cache *cache = &dev->cache;
444 	struct mlx5_cache_ent *ent;
445 	int i;
446 
447 	if (!mlx5_debugfs_root)
448 		return 0;
449 
450 	cache->root = debugfs_create_dir("mr_cache", dev->mdev.priv.dbg_root);
451 	if (!cache->root)
452 		return -ENOMEM;
453 
454 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
455 		ent = &cache->ent[i];
456 		sprintf(ent->name, "%d", ent->order);
457 		ent->dir = debugfs_create_dir(ent->name,  cache->root);
458 		if (!ent->dir)
459 			return -ENOMEM;
460 
461 		ent->fsize = debugfs_create_file("size", 0600, ent->dir, ent,
462 						 &size_fops);
463 		if (!ent->fsize)
464 			return -ENOMEM;
465 
466 		ent->flimit = debugfs_create_file("limit", 0600, ent->dir, ent,
467 						  &limit_fops);
468 		if (!ent->flimit)
469 			return -ENOMEM;
470 
471 		ent->fcur = debugfs_create_u32("cur", 0400, ent->dir,
472 					       &ent->cur);
473 		if (!ent->fcur)
474 			return -ENOMEM;
475 
476 		ent->fmiss = debugfs_create_u32("miss", 0600, ent->dir,
477 						&ent->miss);
478 		if (!ent->fmiss)
479 			return -ENOMEM;
480 	}
481 
482 	return 0;
483 }
484 
485 static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
486 {
487 	if (!mlx5_debugfs_root)
488 		return;
489 
490 	debugfs_remove_recursive(dev->cache.root);
491 }
492 
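/*
 * Editor's note (added comment): set up the MR cache: a single-threaded
 * workqueue plus MAX_MR_CACHE_ENTRIES entries covering orders 2 through
 * MAX_MR_CACHE_ENTRIES + 1.  Entry sizes and limits come from the device
 * profile when MLX5_PROF_MASK_MR_CACHE is set, otherwise a default size of
 * 10 with a limit of 0 is used.  (The per-entry list head and lock are
 * initialized twice in the loop below; the repetition is redundant but
 * harmless.)
 */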
493 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
494 {
495 	struct mlx5_mr_cache *cache = &dev->cache;
496 	struct mlx5_cache_ent *ent;
497 	int limit;
498 	int size;
499 	int err;
500 	int i;
501 
502 	cache->wq = create_singlethread_workqueue("mkey_cache");
503 	if (!cache->wq) {
504 		mlx5_ib_warn(dev, "failed to create work queue\n");
505 		return -ENOMEM;
506 	}
507 
508 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
509 		INIT_LIST_HEAD(&cache->ent[i].head);
510 		spin_lock_init(&cache->ent[i].lock);
511 
512 		ent = &cache->ent[i];
513 		INIT_LIST_HEAD(&ent->head);
514 		spin_lock_init(&ent->lock);
515 		ent->order = i + 2;
516 		ent->dev = dev;
517 
518 		if (dev->mdev.profile->mask & MLX5_PROF_MASK_MR_CACHE) {
519 			size = dev->mdev.profile->mr_cache[i].size;
520 			limit = dev->mdev.profile->mr_cache[i].limit;
521 		} else {
522 			size = DEF_CACHE_SIZE;
523 			limit = 0;
524 		}
525 		INIT_WORK(&ent->work, cache_work_func);
526 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
527 		ent->limit = limit;
528 		queue_work(cache->wq, &ent->work);
529 	}
530 
531 	err = mlx5_mr_cache_debugfs_init(dev);
532 	if (err)
533 		mlx5_ib_warn(dev, "cache debugfs failure\n");
534 
535 	return 0;
536 }
537 
538 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
539 {
540 	int i;
541 
542 	dev->cache.stopped = 1;
543 	destroy_workqueue(dev->cache.wq);
544 
545 	mlx5_mr_cache_debugfs_cleanup(dev);
546 
547 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
548 		clean_keys(dev, i);
549 
550 	return 0;
551 }
552 
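/*
 * Editor's note (added comment): allocate a DMA MR in physical-address
 * access mode: start address 0 with the MLX5_MKEY_LEN64 flag, i.e. a key
 * that effectively covers the whole address space without a translation
 * table.
 */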
553 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
554 {
555 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
556 	struct mlx5_core_dev *mdev = &dev->mdev;
557 	struct mlx5_create_mkey_mbox_in *in;
558 	struct mlx5_mkey_seg *seg;
559 	struct mlx5_ib_mr *mr;
560 	int err;
561 
562 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
563 	if (!mr)
564 		return ERR_PTR(-ENOMEM);
565 
566 	in = kzalloc(sizeof(*in), GFP_KERNEL);
567 	if (!in) {
568 		err = -ENOMEM;
569 		goto err_free;
570 	}
571 
572 	seg = &in->seg;
573 	seg->flags = convert_access(acc) | MLX5_ACCESS_MODE_PA;
574 	seg->flags_pd = cpu_to_be32(to_mpd(pd)->pdn | MLX5_MKEY_LEN64);
575 	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
576 	seg->start_addr = 0;
577 
578 	err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in));
579 	if (err)
580 		goto err_in;
581 
582 	kfree(in);
583 	mr->ibmr.lkey = mr->mmr.key;
584 	mr->ibmr.rkey = mr->mmr.key;
585 	mr->umem = NULL;
586 
587 	return &mr->ibmr;
588 
589 err_in:
590 	kfree(in);
591 
592 err_free:
593 	kfree(mr);
594 
595 	return ERR_PTR(err);
596 }
597 
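/*
 * Editor's note (added comment): number of 16-byte octowords needed to hold
 * the page list of a region of @len bytes starting at @addr with the given
 * page size; each octoword holds two 8-byte page addresses.
 */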
598 static int get_octo_len(u64 addr, u64 len, int page_size)
599 {
600 	u64 offset;
601 	int npages;
602 
603 	offset = addr & (page_size - 1);
604 	npages = ALIGN(len + offset, page_size) >> ilog2(page_size);
605 	return (npages + 1) / 2;
606 }
607 
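/*
 * Editor's note (added comment): the UMR/cache path is only used for
 * registrations of at most 2^17 pages; anything larger goes through
 * reg_create() and a plain CREATE_MKEY command.
 */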
608 static int use_umr(int order)
609 {
610 	return order <= 17;
611 }
612 
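/*
 * Editor's note (added comment): build the UMR work request that binds a
 * cached mkey to a user registration.  The ib_send_wr fast_reg fields are
 * reused to carry the UMR parameters, and the page_list pointer is (ab)used
 * to carry the PD, presumably so the WQE builder can recover the pdn.
 */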
613 static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
614 			     struct ib_sge *sg, u64 dma, int n, u32 key,
615 			     int page_shift, u64 virt_addr, u64 len,
616 			     int access_flags)
617 {
618 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
619 	struct ib_mr *mr = dev->umrc.mr;
620 
621 	sg->addr = dma;
622 	sg->length = ALIGN(sizeof(u64) * n, 64);
623 	sg->lkey = mr->lkey;
624 
625 	wr->next = NULL;
626 	wr->send_flags = 0;
627 	wr->sg_list = sg;
628 	if (n)
629 		wr->num_sge = 1;
630 	else
631 		wr->num_sge = 0;
632 
633 	wr->opcode = MLX5_IB_WR_UMR;
634 	wr->wr.fast_reg.page_list_len = n;
635 	wr->wr.fast_reg.page_shift = page_shift;
636 	wr->wr.fast_reg.rkey = key;
637 	wr->wr.fast_reg.iova_start = virt_addr;
638 	wr->wr.fast_reg.length = len;
639 	wr->wr.fast_reg.access_flags = access_flags;
640 	wr->wr.fast_reg.page_list = (struct ib_fast_reg_page_list *)pd;
641 }
642 
643 static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev,
644 			       struct ib_send_wr *wr, u32 key)
645 {
646 	wr->send_flags = MLX5_IB_SEND_UMR_UNREG;
647 	wr->opcode = MLX5_IB_WR_UMR;
648 	wr->wr.fast_reg.rkey = key;
649 }
650 
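/*
 * Editor's note (added comment): completion handler for the UMR CQ.  Each
 * completion carries the mlx5_ib_mr pointer in its wr_id; record the status
 * and complete mr->done so the waiter in reg_umr()/unreg_umr() can proceed.
 */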
651 void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context)
652 {
653 	struct mlx5_ib_mr *mr;
654 	struct ib_wc wc;
655 	int err;
656 
657 	while (1) {
658 		err = ib_poll_cq(cq, 1, &wc);
659 		if (err < 0) {
660 			pr_warn("poll cq error %d\n", err);
661 			return;
662 		}
663 		if (err == 0)
664 			break;
665 
666 		mr = (struct mlx5_ib_mr *)(unsigned long)wc.wr_id;
667 		mr->status = wc.status;
668 		complete(&mr->done);
669 	}
670 	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
671 }
672 
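/*
 * Editor's note (added comment): register a user memory region through the
 * UMR path: take a cached mkey (topping the cache up on a miss), fill in its
 * pre-mapped page list, post a UMR work request on the dedicated QP and
 * sleep until the completion is reported by mlx5_umr_cq_handler().
 */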
673 static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
674 				  u64 virt_addr, u64 len, int npages,
675 				  int page_shift, int order, int access_flags)
676 {
677 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
678 	struct umr_common *umrc = &dev->umrc;
679 	struct ib_send_wr wr, *bad;
680 	struct mlx5_ib_mr *mr;
681 	struct ib_sge sg;
682 	int err;
683 	int i;
684 
685 	for (i = 0; i < 10; i++) {
686 		mr = alloc_cached_mr(dev, order);
687 		if (mr)
688 			break;
689 
690 		err = add_keys(dev, order2idx(dev, order), 1);
691 		if (err) {
692 			mlx5_ib_warn(dev, "add_keys failed\n");
693 			break;
694 		}
695 	}
696 
697 	if (!mr)
698 		return ERR_PTR(-EAGAIN);
699 
700 	mlx5_ib_populate_pas(dev, umem, page_shift, mr_align(mr->pas, 0x40), 1);
701 
702 	memset(&wr, 0, sizeof(wr));
703 	wr.wr_id = (u64)(unsigned long)mr;
704 	prep_umr_reg_wqe(pd, &wr, &sg, mr->dma, npages, mr->mmr.key, page_shift, virt_addr, len, access_flags);
705 
706 	/* We serialize polls so that one process does not kidnap another's
707 	 * completion.  This is not a problem since a wr is completed in
708 	 * around 1 usec.
709 	 */
710 	down(&umrc->sem);
711 	init_completion(&mr->done);
712 	err = ib_post_send(umrc->qp, &wr, &bad);
713 	if (err) {
714 		mlx5_ib_warn(dev, "post send failed, err %d\n", err);
715 		up(&umrc->sem);
716 		goto error;
717 	}
718 	wait_for_completion(&mr->done);
719 	up(&umrc->sem);
720 
721 	if (mr->status != IB_WC_SUCCESS) {
722 		mlx5_ib_warn(dev, "reg umr failed\n");
723 		err = -EFAULT;
724 		goto error;
725 	}
726 
727 	return mr;
728 
729 error:
730 	free_cached_mr(dev, mr);
731 	return ERR_PTR(err);
732 }
733 
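/*
 * Editor's note (added comment): fallback registration path.  Build a
 * complete CREATE_MKEY mailbox, page list included, and create the mkey
 * directly through the command interface instead of the UMR QP.
 */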
734 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
735 				     u64 length, struct ib_umem *umem,
736 				     int npages, int page_shift,
737 				     int access_flags)
738 {
739 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
740 	struct mlx5_create_mkey_mbox_in *in;
741 	struct mlx5_ib_mr *mr;
742 	int inlen;
743 	int err;
744 
745 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
746 	if (!mr)
747 		return ERR_PTR(-ENOMEM);
748 
749 	inlen = sizeof(*in) + sizeof(*in->pas) * ((npages + 1) / 2) * 2;
750 	in = mlx5_vzalloc(inlen);
751 	if (!in) {
752 		err = -ENOMEM;
753 		goto err_1;
754 	}
755 	mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, 0);
756 
757 	in->seg.flags = convert_access(access_flags) |
758 		MLX5_ACCESS_MODE_MTT;
759 	in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
760 	in->seg.start_addr = cpu_to_be64(virt_addr);
761 	in->seg.len = cpu_to_be64(length);
762 	in->seg.bsfs_octo_size = 0;
763 	in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift));
764 	in->seg.log2_page_size = page_shift;
765 	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
766 	in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift));
767 	err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, inlen);
768 	if (err) {
769 		mlx5_ib_warn(dev, "create mkey failed\n");
770 		goto err_2;
771 	}
772 	mr->umem = umem;
773 	mlx5_vfree(in);
774 
775 	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key);
776 
777 	return mr;
778 
779 err_2:
780 	mlx5_vfree(in);
781 
782 err_1:
783 	kfree(mr);
784 
785 	return ERR_PTR(err);
786 }
787 
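/*
 * Editor's note (added comment): ib_reg_user_mr entry point.  Pin the user
 * memory, then register it through the UMR cache when the region is small
 * enough for use_umr(), falling back to reg_create() for large regions or
 * when the cache is empty.
 */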
788 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
789 				  u64 virt_addr, int access_flags,
790 				  struct ib_udata *udata)
791 {
792 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
793 	struct mlx5_ib_mr *mr = NULL;
794 	struct ib_umem *umem;
795 	int page_shift;
796 	int npages;
797 	int ncont;
798 	int order;
799 	int err;
800 
801 	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx\n",
802 		    start, virt_addr, length);
803 	umem = ib_umem_get(pd->uobject->context, start, length, access_flags,
804 			   0);
805 	if (IS_ERR(umem)) {
806 		mlx5_ib_dbg(dev, "umem get failed\n");
807 		return ERR_CAST(umem);
808 	}
809 
810 	mlx5_ib_cont_pages(umem, start, &npages, &page_shift, &ncont, &order);
811 	if (!npages) {
812 		mlx5_ib_warn(dev, "avoid zero region\n");
813 		err = -EINVAL;
814 		goto error;
815 	}
816 
817 	mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
818 		    npages, ncont, order, page_shift);
819 
820 	if (use_umr(order)) {
821 		mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift,
822 			     order, access_flags);
823 		if (PTR_ERR(mr) == -EAGAIN) {
824 			mlx5_ib_dbg(dev, "cache empty for order %d\n", order);
825 			mr = NULL;
826 		}
827 	}
828 
829 	if (!mr)
830 		mr = reg_create(pd, virt_addr, length, umem, ncont, page_shift,
831 				access_flags);
832 
833 	if (IS_ERR(mr)) {
834 		err = PTR_ERR(mr);
835 		goto error;
836 	}
837 
838 	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmr.key);
839 
840 	mr->umem = umem;
841 	mr->npages = npages;
842 	spin_lock(&dev->mr_lock);
843 	dev->mdev.priv.reg_pages += npages;
844 	spin_unlock(&dev->mr_lock);
845 	mr->ibmr.lkey = mr->mmr.key;
846 	mr->ibmr.rkey = mr->mmr.key;
847 
848 	return &mr->ibmr;
849 
850 error:
851 	ib_umem_release(umem);
852 	return ERR_PTR(err);
853 }
854 
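/*
 * Editor's note (added comment): post a MLX5_IB_SEND_UMR_UNREG work request
 * that un-registers the cached mkey and wait for its completion; the caller
 * can then return the MR to the cache.
 */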
855 static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
856 {
857 	struct umr_common *umrc = &dev->umrc;
858 	struct ib_send_wr wr, *bad;
859 	int err;
860 
861 	memset(&wr, 0, sizeof(wr));
862 	wr.wr_id = (u64)(unsigned long)mr;
863 	prep_umr_unreg_wqe(dev, &wr, mr->mmr.key);
864 
865 	down(&umrc->sem);
866 	init_completion(&mr->done);
867 	err = ib_post_send(umrc->qp, &wr, &bad);
868 	if (err) {
869 		up(&umrc->sem);
870 		mlx5_ib_dbg(dev, "err %d\n", err);
871 		goto error;
872 	}
873 	wait_for_completion(&mr->done);
874 	up(&umrc->sem);
875 	if (mr->status != IB_WC_SUCCESS) {
876 		mlx5_ib_warn(dev, "unreg umr failed\n");
877 		err = -EFAULT;
878 		goto error;
879 	}
880 	return 0;
881 
882 error:
883 	return err;
884 }
885 
886 int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
887 {
888 	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
889 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
890 	struct ib_umem *umem = mr->umem;
891 	int npages = mr->npages;
892 	int umred = mr->umred;
893 	int err;
894 
895 	if (!umred) {
896 		err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr);
897 		if (err) {
898 			mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
899 				     mr->mmr.key, err);
900 			return err;
901 		}
902 	} else {
903 		err = unreg_umr(dev, mr);
904 		if (err) {
905 			mlx5_ib_warn(dev, "failed to unregister\n");
906 			return err;
907 		}
908 		free_cached_mr(dev, mr);
909 	}
910 
911 	if (umem) {
912 		ib_umem_release(umem);
913 		spin_lock(&dev->mr_lock);
914 		dev->mdev.priv.reg_pages -= npages;
915 		spin_unlock(&dev->mr_lock);
916 	}
917 
918 	if (!umred)
919 		kfree(mr);
920 
921 	return 0;
922 }
923 
924 struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
925 					int max_page_list_len)
926 {
927 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
928 	struct mlx5_create_mkey_mbox_in *in;
929 	struct mlx5_ib_mr *mr;
930 	int err;
931 
932 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
933 	if (!mr)
934 		return ERR_PTR(-ENOMEM);
935 
936 	in = kzalloc(sizeof(*in), GFP_KERNEL);
937 	if (!in) {
938 		err = -ENOMEM;
939 		goto err_free;
940 	}
941 
942 	in->seg.status = 1 << 6; /* free */
943 	in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2);
944 	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
945 	in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT;
946 	in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
947 	/* TBD: setting the page size here may not be needed;
948 	 * see issue 197292. */
949 	in->seg.log2_page_size = PAGE_SHIFT;
950 
951 	err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in));
952 	kfree(in);
953 	if (err)
954 		goto err_free;
955 
956 	mr->ibmr.lkey = mr->mmr.key;
957 	mr->ibmr.rkey = mr->mmr.key;
958 	mr->umem = NULL;
959 
960 	return &mr->ibmr;
961 
962 err_free:
963 	kfree(mr);
964 	return ERR_PTR(err);
965 }
966 
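/*
 * Editor's note (added comment): fast-register page lists keep a second,
 * DMA-coherent copy of the page array (mapped_page_list) for the device; it
 * is expected to be 64-byte aligned, hence the WARN_ON(mfrpl->map & 0x3f)
 * below.
 */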
967 struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
968 							       int page_list_len)
969 {
970 	struct mlx5_ib_fast_reg_page_list *mfrpl;
971 	int size = page_list_len * sizeof(u64);
972 
973 	mfrpl = kmalloc(sizeof(*mfrpl), GFP_KERNEL);
974 	if (!mfrpl)
975 		return ERR_PTR(-ENOMEM);
976 
977 	mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL);
978 	if (!mfrpl->ibfrpl.page_list)
979 		goto err_free;
980 
981 	mfrpl->mapped_page_list = dma_alloc_coherent(ibdev->dma_device,
982 						     size, &mfrpl->map,
983 						     GFP_KERNEL);
984 	if (!mfrpl->mapped_page_list)
985 		goto err_free;
986 
987 	WARN_ON(mfrpl->map & 0x3f);
988 
989 	return &mfrpl->ibfrpl;
990 
991 err_free:
992 	kfree(mfrpl->ibfrpl.page_list);
993 	kfree(mfrpl);
994 	return ERR_PTR(-ENOMEM);
995 }
996 
997 void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
998 {
999 	struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list);
1000 	struct mlx5_ib_dev *dev = to_mdev(page_list->device);
1001 	int size = page_list->max_page_list_len * sizeof(u64);
1002 
1003 	dma_free_coherent(&dev->mdev.pdev->dev, size, mfrpl->mapped_page_list,
1004 			  mfrpl->map);
1005 	kfree(mfrpl->ibfrpl.page_list);
1006 	kfree(mfrpl);
1007 }
1008