xref: /openbmc/linux/drivers/infiniband/hw/mlx5/mr.c (revision 34d6f206a88c2651d216bd3487ac956a40b2ba8e)
1  /*
2   * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3   * Copyright (c) 2020, Intel Corporation. All rights reserved.
4   *
5   * This software is available to you under a choice of one of two
6   * licenses.  You may choose to be licensed under the terms of the GNU
7   * General Public License (GPL) Version 2, available from the file
8   * COPYING in the main directory of this source tree, or the
9   * OpenIB.org BSD license below:
10   *
11   *     Redistribution and use in source and binary forms, with or
12   *     without modification, are permitted provided that the following
13   *     conditions are met:
14   *
15   *      - Redistributions of source code must retain the above
16   *        copyright notice, this list of conditions and the following
17   *        disclaimer.
18   *
19   *      - Redistributions in binary form must reproduce the above
20   *        copyright notice, this list of conditions and the following
21   *        disclaimer in the documentation and/or other materials
22   *        provided with the distribution.
23   *
24   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27   * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28   * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29   * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30   * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31   * SOFTWARE.
32   */
33  
34  
35  #include <linux/kref.h>
36  #include <linux/random.h>
37  #include <linux/debugfs.h>
38  #include <linux/export.h>
39  #include <linux/delay.h>
40  #include <linux/dma-buf.h>
41  #include <linux/dma-resv.h>
42  #include <rdma/ib_umem_odp.h>
43  #include "dm.h"
44  #include "mlx5_ib.h"
45  #include "umr.h"
46  
47  enum {
48  	MAX_PENDING_REG_MR = 8,
49  };
50  
51  #define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4
52  #define MLX5_UMR_ALIGN 2048
53  
54  static void
55  create_mkey_callback(int status, struct mlx5_async_work *context);
56  static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
57  				     u64 iova, int access_flags,
58  				     unsigned int page_size, bool populate);
59  
60  static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
61  					  struct ib_pd *pd)
62  {
63  	struct mlx5_ib_dev *dev = to_mdev(pd->device);
64  
65  	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
66  	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
67  	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
68  	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
69  	MLX5_SET(mkc, mkc, lr, 1);
70  
71  	if (acc & IB_ACCESS_RELAXED_ORDERING) {
72  		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
73  			MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);
74  
75  		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
76  		    (MLX5_CAP_GEN(dev->mdev,
77  				  relaxed_ordering_read_pci_enabled) &&
78  		     pcie_relaxed_ordering_enabled(dev->mdev->pdev)))
79  			MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
80  	}
81  
82  	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
83  	MLX5_SET(mkc, mkc, qpn, 0xffffff);
84  	MLX5_SET64(mkc, mkc, start_addr, start_addr);
85  }
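
/*
 * Illustrative example (the access mask below is hypothetical, not from any
 * particular caller): for acc = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ
 * the helper above sets lr, lw and rr, leaves a and rw clear, and then binds
 * the mkey to the given PD and start address.  The relaxed_ordering_* bits
 * are only set when IB_ACCESS_RELAXED_ORDERING is requested and the matching
 * device capabilities are present.
 */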
86  
87  static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
88  {
89  	u8 key = atomic_inc_return(&dev->mkey_var);
90  	void *mkc;
91  
92  	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
93  	MLX5_SET(mkc, mkc, mkey_7_0, key);
94  	*mkey = key;
95  }
96  
97  static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
98  			       struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
99  {
100  	int ret;
101  
102  	assign_mkey_variant(dev, &mkey->key, in);
103  	ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
104  	if (!ret)
105  		init_waitqueue_head(&mkey->wait);
106  
107  	return ret;
108  }
109  
110  static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
111  {
112  	struct mlx5_ib_dev *dev = async_create->ent->dev;
113  	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
114  	size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out);
115  
116  	MLX5_SET(create_mkey_in, async_create->in, opcode,
117  		 MLX5_CMD_OP_CREATE_MKEY);
118  	assign_mkey_variant(dev, &async_create->mkey, async_create->in);
119  	return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen,
120  				async_create->out, outlen, create_mkey_callback,
121  				&async_create->cb_work);
122  }
123  
124  static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
125  static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
126  
127  static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
128  {
129  	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
130  
131  	return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
132  }
133  
134  static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
135  {
136  	if (status == -ENXIO) /* core driver is not available */
137  		return;
138  
139  	mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
140  	if (status != -EREMOTEIO) /* driver specific failure */
141  		return;
142  
143  	/* Failed in FW, print cmd out failure details */
144  	mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
145  }
146  
147  static int push_mkey_locked(struct mlx5_cache_ent *ent, bool limit_pendings,
148  			    void *to_store)
149  {
150  	XA_STATE(xas, &ent->mkeys, 0);
151  	void *curr;
152  
153  	if (limit_pendings &&
154  	    (ent->reserved - ent->stored) > MAX_PENDING_REG_MR)
155  		return -EAGAIN;
156  
157  	while (1) {
158  		/*
159  		 * This is cmpxchg(NULL, XA_ZERO_ENTRY); however, this version
160  		 * doesn't transparently unlock. Instead we set the xas index to
161  		 * the current value of reserved on every iteration.
162  		 */
163  		xas_set(&xas, ent->reserved);
164  		curr = xas_load(&xas);
165  		if (!curr) {
166  			if (to_store && ent->stored == ent->reserved)
167  				xas_store(&xas, to_store);
168  			else
169  				xas_store(&xas, XA_ZERO_ENTRY);
170  			if (xas_valid(&xas)) {
171  				ent->reserved++;
172  				if (to_store) {
173  					if (ent->stored != ent->reserved)
174  						__xa_store(&ent->mkeys,
175  							   ent->stored,
176  							   to_store,
177  							   GFP_KERNEL);
178  					ent->stored++;
179  					queue_adjust_cache_locked(ent);
180  					WRITE_ONCE(ent->dev->cache.last_add,
181  						   jiffies);
182  				}
183  			}
184  		}
185  		xa_unlock_irq(&ent->mkeys);
186  
187  		/*
188  		 * Notice xas_nomem() must always be called as it cleans
189  		 * up any cached allocation.
190  		 */
191  		if (!xas_nomem(&xas, GFP_KERNEL))
192  			break;
193  		xa_lock_irq(&ent->mkeys);
194  	}
195  	xa_lock_irq(&ent->mkeys);
196  	if (xas_error(&xas))
197  		return xas_error(&xas);
198  	if (WARN_ON(curr))
199  		return -EINVAL;
200  	return 0;
201  }
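
/*
 * Reservation flow sketch (counters are illustrative):
 *
 *   stored = 3, reserved = 3
 *   push_mkey(ent, true, NULL)        reserves index 3 as XA_ZERO_ENTRY and
 *                                     bumps reserved to 4
 *   create_mkey_callback() success    push_to_reserved() stores the new mkey
 *                                     at index 3 and bumps stored to 4
 *   create_mkey_callback() failure    undo_push_reserve_mkey() erases index 3
 *                                     and drops reserved back to 3
 */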
202  
203  static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
204  		     void *to_store)
205  {
206  	int ret;
207  
208  	xa_lock_irq(&ent->mkeys);
209  	ret = push_mkey_locked(ent, limit_pendings, to_store);
210  	xa_unlock_irq(&ent->mkeys);
211  	return ret;
212  }
213  
214  static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent)
215  {
216  	void *old;
217  
218  	ent->reserved--;
219  	old = __xa_erase(&ent->mkeys, ent->reserved);
220  	WARN_ON(old);
221  }
222  
223  static void push_to_reserved(struct mlx5_cache_ent *ent, u32 mkey)
224  {
225  	void *old;
226  
227  	old = __xa_store(&ent->mkeys, ent->stored, xa_mk_value(mkey), 0);
228  	WARN_ON(old);
229  	ent->stored++;
230  }
231  
232  static u32 pop_stored_mkey(struct mlx5_cache_ent *ent)
233  {
234  	void *old, *xa_mkey;
235  
236  	ent->stored--;
237  	ent->reserved--;
238  
239  	if (ent->stored == ent->reserved) {
240  		xa_mkey = __xa_erase(&ent->mkeys, ent->stored);
241  		WARN_ON(!xa_mkey);
242  		return (u32)xa_to_value(xa_mkey);
243  	}
244  
245  	xa_mkey = __xa_store(&ent->mkeys, ent->stored, XA_ZERO_ENTRY,
246  			     GFP_KERNEL);
247  	WARN_ON(!xa_mkey || xa_is_err(xa_mkey));
248  	old = __xa_erase(&ent->mkeys, ent->reserved);
249  	WARN_ON(old);
250  	return (u32)xa_to_value(xa_mkey);
251  }
252  
253  static void create_mkey_callback(int status, struct mlx5_async_work *context)
254  {
255  	struct mlx5r_async_create_mkey *mkey_out =
256  		container_of(context, struct mlx5r_async_create_mkey, cb_work);
257  	struct mlx5_cache_ent *ent = mkey_out->ent;
258  	struct mlx5_ib_dev *dev = ent->dev;
259  	unsigned long flags;
260  
261  	if (status) {
262  		create_mkey_warn(dev, status, mkey_out->out);
263  		kfree(mkey_out);
264  		xa_lock_irqsave(&ent->mkeys, flags);
265  		undo_push_reserve_mkey(ent);
266  		WRITE_ONCE(dev->fill_delay, 1);
267  		xa_unlock_irqrestore(&ent->mkeys, flags);
268  		mod_timer(&dev->delay_timer, jiffies + HZ);
269  		return;
270  	}
271  
272  	mkey_out->mkey |= mlx5_idx_to_mkey(
273  		MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
274  	WRITE_ONCE(dev->cache.last_add, jiffies);
275  
276  	xa_lock_irqsave(&ent->mkeys, flags);
277  	push_to_reserved(ent, mkey_out->mkey);
278  	/* If we are doing fill_to_high_water then keep going. */
279  	queue_adjust_cache_locked(ent);
280  	xa_unlock_irqrestore(&ent->mkeys, flags);
281  	kfree(mkey_out);
282  }
283  
284  static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
285  {
286  	int ret = 0;
287  
288  	switch (access_mode) {
289  	case MLX5_MKC_ACCESS_MODE_MTT:
290  		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
291  						   sizeof(struct mlx5_mtt));
292  		break;
293  	case MLX5_MKC_ACCESS_MODE_KSM:
294  		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
295  						   sizeof(struct mlx5_klm));
296  		break;
297  	default:
298  		WARN_ON(1);
299  	}
300  	return ret;
301  }
302  
303  static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
304  {
305  	set_mkc_access_pd_addr_fields(mkc, ent->rb_key.access_flags, 0,
306  				      ent->dev->umrc.pd);
307  	MLX5_SET(mkc, mkc, free, 1);
308  	MLX5_SET(mkc, mkc, umr_en, 1);
309  	MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
310  	MLX5_SET(mkc, mkc, access_mode_4_2,
311  		(ent->rb_key.access_mode >> 2) & 0x7);
312  	MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats);
313  
314  	MLX5_SET(mkc, mkc, translations_octword_size,
315  		 get_mkc_octo_size(ent->rb_key.access_mode,
316  				   ent->rb_key.ndescs));
317  	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
318  }
319  
320  /* Asynchronously schedule new MRs to be populated in the cache. */
321  static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
322  {
323  	struct mlx5r_async_create_mkey *async_create;
324  	void *mkc;
325  	int err = 0;
326  	int i;
327  
328  	for (i = 0; i < num; i++) {
329  		async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey),
330  				       GFP_KERNEL);
331  		if (!async_create)
332  			return -ENOMEM;
333  		mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in,
334  				   memory_key_mkey_entry);
335  		set_cache_mkc(ent, mkc);
336  		async_create->ent = ent;
337  
338  		err = push_mkey(ent, true, NULL);
339  		if (err)
340  			goto free_async_create;
341  
342  		err = mlx5_ib_create_mkey_cb(async_create);
343  		if (err) {
344  			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
345  			goto err_undo_reserve;
346  		}
347  	}
348  
349  	return 0;
350  
351  err_undo_reserve:
352  	xa_lock_irq(&ent->mkeys);
353  	undo_push_reserve_mkey(ent);
354  	xa_unlock_irq(&ent->mkeys);
355  free_async_create:
356  	kfree(async_create);
357  	return err;
358  }
359  
360  /* Synchronously create an MR in the cache */
361  static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
362  {
363  	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
364  	void *mkc;
365  	u32 *in;
366  	int err;
367  
368  	in = kzalloc(inlen, GFP_KERNEL);
369  	if (!in)
370  		return -ENOMEM;
371  	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
372  	set_cache_mkc(ent, mkc);
373  
374  	err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
375  	if (err)
376  		goto free_in;
377  
378  	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
379  free_in:
380  	kfree(in);
381  	return err;
382  }
383  
384  static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
385  {
386  	u32 mkey;
387  
388  	lockdep_assert_held(&ent->mkeys.xa_lock);
389  	if (!ent->stored)
390  		return;
391  	mkey = pop_stored_mkey(ent);
392  	xa_unlock_irq(&ent->mkeys);
393  	mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
394  	xa_lock_irq(&ent->mkeys);
395  }
396  
397  static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
398  				bool limit_fill)
399  	 __acquires(&ent->mkeys) __releases(&ent->mkeys)
400  {
401  	int err;
402  
403  	lockdep_assert_held(&ent->mkeys.xa_lock);
404  
405  	while (true) {
406  		if (limit_fill)
407  			target = ent->limit * 2;
408  		if (target == ent->reserved)
409  			return 0;
410  		if (target > ent->reserved) {
411  			u32 todo = target - ent->reserved;
412  
413  			xa_unlock_irq(&ent->mkeys);
414  			err = add_keys(ent, todo);
415  			if (err == -EAGAIN)
416  				usleep_range(3000, 5000);
417  			xa_lock_irq(&ent->mkeys);
418  			if (err) {
419  				if (err != -EAGAIN)
420  					return err;
421  			} else
422  				return 0;
423  		} else {
424  			remove_cache_mr_locked(ent);
425  		}
426  	}
427  }
428  
429  static ssize_t size_write(struct file *filp, const char __user *buf,
430  			  size_t count, loff_t *pos)
431  {
432  	struct mlx5_cache_ent *ent = filp->private_data;
433  	u32 target;
434  	int err;
435  
436  	err = kstrtou32_from_user(buf, count, 0, &target);
437  	if (err)
438  		return err;
439  
440  	/*
441  	 * Target is the new value of total_mrs the user requests; however, we
442  	 * cannot free MRs that are in use. Compute the target value for stored
443  	 * mkeys.
444  	 */
445  	xa_lock_irq(&ent->mkeys);
446  	if (target < ent->in_use) {
447  		err = -EINVAL;
448  		goto err_unlock;
449  	}
450  	target = target - ent->in_use;
451  	if (target < ent->limit || target > ent->limit*2) {
452  		err = -EINVAL;
453  		goto err_unlock;
454  	}
455  	err = resize_available_mrs(ent, target, false);
456  	if (err)
457  		goto err_unlock;
458  	xa_unlock_irq(&ent->mkeys);
459  
460  	return count;
461  
462  err_unlock:
463  	xa_unlock_irq(&ent->mkeys);
464  	return err;
465  }
466  
467  static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
468  			 loff_t *pos)
469  {
470  	struct mlx5_cache_ent *ent = filp->private_data;
471  	char lbuf[20];
472  	int err;
473  
474  	err = snprintf(lbuf, sizeof(lbuf), "%ld\n", ent->stored + ent->in_use);
475  	if (err < 0)
476  		return err;
477  
478  	return simple_read_from_buffer(buf, count, pos, lbuf, err);
479  }
480  
481  static const struct file_operations size_fops = {
482  	.owner	= THIS_MODULE,
483  	.open	= simple_open,
484  	.write	= size_write,
485  	.read	= size_read,
486  };
487  
488  static ssize_t limit_write(struct file *filp, const char __user *buf,
489  			   size_t count, loff_t *pos)
490  {
491  	struct mlx5_cache_ent *ent = filp->private_data;
492  	u32 var;
493  	int err;
494  
495  	err = kstrtou32_from_user(buf, count, 0, &var);
496  	if (err)
497  		return err;
498  
499  	/*
500  	 * Upon set we immediately fill the cache to the high water mark implied
501  	 * by the limit.
502  	 */
503  	xa_lock_irq(&ent->mkeys);
504  	ent->limit = var;
505  	err = resize_available_mrs(ent, 0, true);
506  	xa_unlock_irq(&ent->mkeys);
507  	if (err)
508  		return err;
509  	return count;
510  }
511  
512  static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
513  			  loff_t *pos)
514  {
515  	struct mlx5_cache_ent *ent = filp->private_data;
516  	char lbuf[20];
517  	int err;
518  
519  	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
520  	if (err < 0)
521  		return err;
522  
523  	return simple_read_from_buffer(buf, count, pos, lbuf, err);
524  }
525  
526  static const struct file_operations limit_fops = {
527  	.owner	= THIS_MODULE,
528  	.open	= simple_open,
529  	.write	= limit_write,
530  	.read	= limit_read,
531  };
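
/*
 * Both "size" and "limit" are exposed per cache entry through the debugfs
 * directory created by mlx5_mkey_cache_debugfs_add_ent() below, e.g. (path
 * shown for illustration only)
 * <debugfs>/mlx5/<device>/mr_cache/<order>/{size,limit}.  Writing "size"
 * resizes the pool via resize_available_mrs(); writing "limit" also refills
 * the cache to the high water mark implied by the new limit.
 */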
532  
533  static bool someone_adding(struct mlx5_mkey_cache *cache)
534  {
535  	struct mlx5_cache_ent *ent;
536  	struct rb_node *node;
537  	bool ret;
538  
539  	mutex_lock(&cache->rb_lock);
540  	for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
541  		ent = rb_entry(node, struct mlx5_cache_ent, node);
542  		xa_lock_irq(&ent->mkeys);
543  		ret = ent->stored < ent->limit;
544  		xa_unlock_irq(&ent->mkeys);
545  		if (ret) {
546  			mutex_unlock(&cache->rb_lock);
547  			return true;
548  		}
549  	}
550  	mutex_unlock(&cache->rb_lock);
551  	return false;
552  }
553  
554  /*
555   * Check if the bucket is outside the high/low water mark and schedule an async
556   * update. The cache refill has hysteresis; once the low water mark is hit,
557   * it is refilled up to the high mark.
558   */
559  static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
560  {
561  	lockdep_assert_held(&ent->mkeys.xa_lock);
562  
563  	if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
564  		return;
565  	if (ent->stored < ent->limit) {
566  		ent->fill_to_high_water = true;
567  		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
568  	} else if (ent->fill_to_high_water &&
569  		   ent->reserved < 2 * ent->limit) {
570  		/*
571  		 * Once we start populating due to hitting a low water mark
572  		 * continue until we pass the high water mark.
573  		 */
574  		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
575  	} else if (ent->stored == 2 * ent->limit) {
576  		ent->fill_to_high_water = false;
577  	} else if (ent->stored > 2 * ent->limit) {
578  		/* Queue deletion of excess entries */
579  		ent->fill_to_high_water = false;
580  		if (ent->stored != ent->reserved)
581  			queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
582  					   msecs_to_jiffies(1000));
583  		else
584  			mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
585  	}
586  }
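
/*
 * Hysteresis sketch (limit = 8 chosen for illustration):
 *   stored falls below 8   -> fill_to_high_water is set and work is queued
 *   refill keeps going     -> until reserved reaches 2 * limit = 16
 *   stored exceeds 16      -> fill_to_high_water is cleared and deletion of
 *                             the excess mkeys is queued instead
 */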
587  
588  static void __cache_work_func(struct mlx5_cache_ent *ent)
589  {
590  	struct mlx5_ib_dev *dev = ent->dev;
591  	struct mlx5_mkey_cache *cache = &dev->cache;
592  	int err;
593  
594  	xa_lock_irq(&ent->mkeys);
595  	if (ent->disabled)
596  		goto out;
597  
598  	if (ent->fill_to_high_water && ent->reserved < 2 * ent->limit &&
599  	    !READ_ONCE(dev->fill_delay)) {
600  		xa_unlock_irq(&ent->mkeys);
601  		err = add_keys(ent, 1);
602  		xa_lock_irq(&ent->mkeys);
603  		if (ent->disabled)
604  			goto out;
605  		if (err) {
606  			/*
607  			 * EAGAIN only happens if there are pending MRs, so we
608  			 * will be rescheduled when storing them. The only
609  			 * failure path here is ENOMEM.
610  			 */
611  			if (err != -EAGAIN) {
612  				mlx5_ib_warn(
613  					dev,
614  					"add keys command failed, err %d\n",
615  					err);
616  				queue_delayed_work(cache->wq, &ent->dwork,
617  						   msecs_to_jiffies(1000));
618  			}
619  		}
620  	} else if (ent->stored > 2 * ent->limit) {
621  		bool need_delay;
622  
623  		/*
624  		 * The remove_cache_mr() logic is performed as a garbage
625  		 * collection task. Such a task is intended to run when no
626  		 * other active processes are running.
627  		 *
628  		 * need_resched() will return TRUE if there are user tasks
629  		 * to be activated in the near future.
630  		 *
631  		 * In that case, we don't execute remove_cache_mr() and postpone
632  		 * the garbage collection work to try again in the next cycle,
633  		 * in order to free CPU resources for other tasks.
634  		 */
635  		xa_unlock_irq(&ent->mkeys);
636  		need_delay = need_resched() || someone_adding(cache) ||
637  			     !time_after(jiffies,
638  					 READ_ONCE(cache->last_add) + 300 * HZ);
639  		xa_lock_irq(&ent->mkeys);
640  		if (ent->disabled)
641  			goto out;
642  		if (need_delay) {
643  			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
644  			goto out;
645  		}
646  		remove_cache_mr_locked(ent);
647  		queue_adjust_cache_locked(ent);
648  	}
649  out:
650  	xa_unlock_irq(&ent->mkeys);
651  }
652  
653  static void delayed_cache_work_func(struct work_struct *work)
654  {
655  	struct mlx5_cache_ent *ent;
656  
657  	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
658  	__cache_work_func(ent);
659  }
660  
661  static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
662  			     struct mlx5r_cache_rb_key key2)
663  {
664  	int res;
665  
666  	res = key1.ats - key2.ats;
667  	if (res)
668  		return res;
669  
670  	res = key1.access_mode - key2.access_mode;
671  	if (res)
672  		return res;
673  
674  	res = key1.access_flags - key2.access_flags;
675  	if (res)
676  		return res;
677  
678  	/*
679  	 * Keep ndescs last in the compare order since the find function
680  	 * searches for an exact match on all other properties and only the
681  	 * closest match in size.
682  	 */
683  	return key1.ndescs - key2.ndescs;
684  }
685  
686  static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
687  				 struct mlx5_cache_ent *ent)
688  {
689  	struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
690  	struct mlx5_cache_ent *cur;
691  	int cmp;
692  
693  	/* Figure out where to put new node */
694  	while (*new) {
695  		cur = rb_entry(*new, struct mlx5_cache_ent, node);
696  		parent = *new;
697  		cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key);
698  		if (cmp > 0)
699  			new = &((*new)->rb_left);
700  		if (cmp < 0)
701  			new = &((*new)->rb_right);
702  		if (cmp == 0)
703  			return -EEXIST;
704  	}
705  
706  	/* Add new node and rebalance tree. */
707  	rb_link_node(&ent->node, parent, new);
708  	rb_insert_color(&ent->node, &cache->rb_root);
709  
710  	return 0;
711  }
712  
713  static struct mlx5_cache_ent *
714  mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
715  			   struct mlx5r_cache_rb_key rb_key)
716  {
717  	struct rb_node *node = dev->cache.rb_root.rb_node;
718  	struct mlx5_cache_ent *cur, *smallest = NULL;
719  	u64 ndescs_limit;
720  	int cmp;
721  
722  	/*
723  	 * Find the smallest ent with order >= requested_order.
724  	 */
725  	while (node) {
726  		cur = rb_entry(node, struct mlx5_cache_ent, node);
727  		cmp = cache_ent_key_cmp(cur->rb_key, rb_key);
728  		if (cmp > 0) {
729  			smallest = cur;
730  			node = node->rb_left;
731  		}
732  		if (cmp < 0)
733  			node = node->rb_right;
734  		if (cmp == 0)
735  			return cur;
736  	}
737  
738  	/*
739  	 * Limit the usage of mkeys larger than twice the required size while
740  	 * also allowing the use of the smallest cache entry for small MRs.
741  	 */
742  	ndescs_limit = max_t(u64, rb_key.ndescs * 2,
743  			     MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS);
744  
745  	return (smallest &&
746  		smallest->rb_key.access_mode == rb_key.access_mode &&
747  		smallest->rb_key.access_flags == rb_key.access_flags &&
748  		smallest->rb_key.ats == rb_key.ats &&
749  		smallest->rb_key.ndescs <= ndescs_limit) ?
750  		       smallest :
751  		       NULL;
752  }
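
/*
 * Lookup sketch (entry sizes are illustrative): with persistent entries of
 * 8, 16 and 32 descriptors, a request for rb_key.ndescs = 10 finds no exact
 * match, "smallest" ends up at the 16-descriptor entry, and the ndescs_limit
 * check (max(2 * 10, 4) = 20 >= 16) accepts it.  The same request would not
 * be served from a 32-descriptor entry.
 */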
753  
754  static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
755  					struct mlx5_cache_ent *ent,
756  					int access_flags)
757  {
758  	struct mlx5_ib_mr *mr;
759  	int err;
760  
761  	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
762  	if (!mr)
763  		return ERR_PTR(-ENOMEM);
764  
765  	xa_lock_irq(&ent->mkeys);
766  	ent->in_use++;
767  
768  	if (!ent->stored) {
769  		queue_adjust_cache_locked(ent);
770  		ent->miss++;
771  		xa_unlock_irq(&ent->mkeys);
772  		err = create_cache_mkey(ent, &mr->mmkey.key);
773  		if (err) {
774  			xa_lock_irq(&ent->mkeys);
775  			ent->in_use--;
776  			xa_unlock_irq(&ent->mkeys);
777  			kfree(mr);
778  			return ERR_PTR(err);
779  		}
780  	} else {
781  		mr->mmkey.key = pop_stored_mkey(ent);
782  		queue_adjust_cache_locked(ent);
783  		xa_unlock_irq(&ent->mkeys);
784  	}
785  	mr->mmkey.cache_ent = ent;
786  	mr->mmkey.type = MLX5_MKEY_MR;
787  	init_waitqueue_head(&mr->mmkey.wait);
788  	return mr;
789  }
790  
791  static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
792  					 int access_flags)
793  {
794  	int ret = 0;
795  
796  	if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
797  	    MLX5_CAP_GEN(dev->mdev, atomic) &&
798  	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
799  		ret |= IB_ACCESS_REMOTE_ATOMIC;
800  
801  	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
802  	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
803  	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
804  		ret |= IB_ACCESS_RELAXED_ORDERING;
805  
806  	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
807  	    (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
808  	     MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_pci_enabled)) &&
809  	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
810  		ret |= IB_ACCESS_RELAXED_ORDERING;
811  
812  	return ret;
813  }
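
/*
 * The flags returned above are the ones UMR cannot rewrite on this device
 * (remote atomic when umr_modify_atomic_disabled is set, relaxed ordering
 * when the relaxed_ordering_*_umr capabilities are missing).  They feed
 * rb_key.access_flags, so a cached mkey is only recycled for requests that
 * need exactly the same unchangeable flags.
 */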
814  
815  struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
816  				       int access_flags, int access_mode,
817  				       int ndescs)
818  {
819  	struct mlx5r_cache_rb_key rb_key = {
820  		.ndescs = ndescs,
821  		.access_mode = access_mode,
822  		.access_flags = get_unchangeable_access_flags(dev, access_flags)
823  	};
824  	struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key);
825  
826  	if (!ent)
827  		return ERR_PTR(-EOPNOTSUPP);
828  
829  	return _mlx5_mr_cache_alloc(dev, ent, access_flags);
830  }
831  
832  static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
833  {
834  	u32 mkey;
835  
836  	cancel_delayed_work(&ent->dwork);
837  	xa_lock_irq(&ent->mkeys);
838  	while (ent->stored) {
839  		mkey = pop_stored_mkey(ent);
840  		xa_unlock_irq(&ent->mkeys);
841  		mlx5_core_destroy_mkey(dev->mdev, mkey);
842  		xa_lock_irq(&ent->mkeys);
843  	}
844  	xa_unlock_irq(&ent->mkeys);
845  }
846  
847  static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
848  {
849  	if (!mlx5_debugfs_root || dev->is_rep)
850  		return;
851  
852  	debugfs_remove_recursive(dev->cache.fs_root);
853  	dev->cache.fs_root = NULL;
854  }
855  
856  static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev,
857  					    struct mlx5_cache_ent *ent)
858  {
859  	int order = order_base_2(ent->rb_key.ndescs);
860  	struct dentry *dir;
861  
862  	if (!mlx5_debugfs_root || dev->is_rep)
863  		return;
864  
865  	if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
866  		order = MLX5_IMR_KSM_CACHE_ENTRY + 2;
867  
868  	sprintf(ent->name, "%d", order);
869  	dir = debugfs_create_dir(ent->name, dev->cache.fs_root);
870  	debugfs_create_file("size", 0600, dir, ent, &size_fops);
871  	debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
872  	debugfs_create_ulong("cur", 0400, dir, &ent->stored);
873  	debugfs_create_u32("miss", 0600, dir, &ent->miss);
874  }
875  
876  static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
877  {
878  	struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev);
879  	struct mlx5_mkey_cache *cache = &dev->cache;
880  
881  	if (!mlx5_debugfs_root || dev->is_rep)
882  		return;
883  
884  	cache->fs_root = debugfs_create_dir("mr_cache", dbg_root);
885  }
886  
887  static void delay_time_func(struct timer_list *t)
888  {
889  	struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
890  
891  	WRITE_ONCE(dev->fill_delay, 0);
892  }
893  
894  struct mlx5_cache_ent *
895  mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
896  			      struct mlx5r_cache_rb_key rb_key,
897  			      bool persistent_entry)
898  {
899  	struct mlx5_cache_ent *ent;
900  	int order;
901  	int ret;
902  
903  	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
904  	if (!ent)
905  		return ERR_PTR(-ENOMEM);
906  
907  	xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
908  	ent->rb_key = rb_key;
909  	ent->dev = dev;
910  	ent->is_tmp = !persistent_entry;
911  
912  	INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
913  
914  	ret = mlx5_cache_ent_insert(&dev->cache, ent);
915  	if (ret) {
916  		kfree(ent);
917  		return ERR_PTR(ret);
918  	}
919  
920  	if (persistent_entry) {
921  		if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
922  			order = MLX5_IMR_KSM_CACHE_ENTRY;
923  		else
924  			order = order_base_2(rb_key.ndescs) - 2;
925  
926  		if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
927  		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
928  		    mlx5r_umr_can_load_pas(dev, 0))
929  			ent->limit = dev->mdev->profile.mr_cache[order].limit;
930  		else
931  			ent->limit = 0;
932  
933  		mlx5_mkey_cache_debugfs_add_ent(dev, ent);
934  	} else {
935  		mod_delayed_work(ent->dev->cache.wq,
936  				 &ent->dev->cache.remove_ent_dwork,
937  				 msecs_to_jiffies(30 * 1000));
938  	}
939  
940  	return ent;
941  }
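
/*
 * Example (numbers are illustrative): a persistent MTT entry created with
 * rb_key.ndescs = 16 gets order = order_base_2(16) - 2 = 2 and takes its
 * limit from profile.mr_cache[2]; a temporary entry gets no limit and
 * instead arms remove_ent_dwork, so temporary entries are reaped roughly
 * 30 seconds later by remove_ent_work_func().
 */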
942  
943  static void remove_ent_work_func(struct work_struct *work)
944  {
945  	struct mlx5_mkey_cache *cache;
946  	struct mlx5_cache_ent *ent;
947  	struct rb_node *cur;
948  
949  	cache = container_of(work, struct mlx5_mkey_cache,
950  			     remove_ent_dwork.work);
951  	mutex_lock(&cache->rb_lock);
952  	cur = rb_last(&cache->rb_root);
953  	while (cur) {
954  		ent = rb_entry(cur, struct mlx5_cache_ent, node);
955  		cur = rb_prev(cur);
956  		mutex_unlock(&cache->rb_lock);
957  
958  		xa_lock_irq(&ent->mkeys);
959  		if (!ent->is_tmp) {
960  			xa_unlock_irq(&ent->mkeys);
961  			mutex_lock(&cache->rb_lock);
962  			continue;
963  		}
964  		xa_unlock_irq(&ent->mkeys);
965  
966  		clean_keys(ent->dev, ent);
967  		mutex_lock(&cache->rb_lock);
968  	}
969  	mutex_unlock(&cache->rb_lock);
970  }
971  
972  int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
973  {
974  	struct mlx5_mkey_cache *cache = &dev->cache;
975  	struct rb_root *root = &dev->cache.rb_root;
976  	struct mlx5r_cache_rb_key rb_key = {
977  		.access_mode = MLX5_MKC_ACCESS_MODE_MTT,
978  	};
979  	struct mlx5_cache_ent *ent;
980  	struct rb_node *node;
981  	int ret;
982  	int i;
983  
984  	mutex_init(&dev->slow_path_mutex);
985  	mutex_init(&dev->cache.rb_lock);
986  	dev->cache.rb_root = RB_ROOT;
987  	INIT_DELAYED_WORK(&dev->cache.remove_ent_dwork, remove_ent_work_func);
988  	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
989  	if (!cache->wq) {
990  		mlx5_ib_warn(dev, "failed to create work queue\n");
991  		return -ENOMEM;
992  	}
993  
994  	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
995  	timer_setup(&dev->delay_timer, delay_time_func, 0);
996  	mlx5_mkey_cache_debugfs_init(dev);
997  	mutex_lock(&cache->rb_lock);
998  	for (i = 0; i <= mkey_cache_max_order(dev); i++) {
999  		rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i;
1000  		ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
1001  		if (IS_ERR(ent)) {
1002  			ret = PTR_ERR(ent);
1003  			goto err;
1004  		}
1005  	}
1006  
1007  	ret = mlx5_odp_init_mkey_cache(dev);
1008  	if (ret)
1009  		goto err;
1010  
1011  	mutex_unlock(&cache->rb_lock);
1012  	for (node = rb_first(root); node; node = rb_next(node)) {
1013  		ent = rb_entry(node, struct mlx5_cache_ent, node);
1014  		xa_lock_irq(&ent->mkeys);
1015  		queue_adjust_cache_locked(ent);
1016  		xa_unlock_irq(&ent->mkeys);
1017  	}
1018  
1019  	return 0;
1020  
1021  err:
1022  	mutex_unlock(&cache->rb_lock);
1023  	mlx5_mkey_cache_debugfs_cleanup(dev);
1024  	mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
1025  	return ret;
1026  }
1027  
1028  void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
1029  {
1030  	struct rb_root *root = &dev->cache.rb_root;
1031  	struct mlx5_cache_ent *ent;
1032  	struct rb_node *node;
1033  
1034  	if (!dev->cache.wq)
1035  		return;
1036  
1037  	mutex_lock(&dev->cache.rb_lock);
1038  	cancel_delayed_work(&dev->cache.remove_ent_dwork);
1039  	for (node = rb_first(root); node; node = rb_next(node)) {
1040  		ent = rb_entry(node, struct mlx5_cache_ent, node);
1041  		xa_lock_irq(&ent->mkeys);
1042  		ent->disabled = true;
1043  		xa_unlock_irq(&ent->mkeys);
1044  		cancel_delayed_work(&ent->dwork);
1045  	}
1046  	mutex_unlock(&dev->cache.rb_lock);
1047  
1048  	/*
1049  	 * After all entries are disabled and will not reschedule on WQ,
1050  	 * flush it and all async commands.
1051  	 */
1052  	flush_workqueue(dev->cache.wq);
1053  
1054  	mlx5_mkey_cache_debugfs_cleanup(dev);
1055  	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
1056  
1057  	/* At this point all entries are disabled and have no concurrent work. */
1058  	mutex_lock(&dev->cache.rb_lock);
1059  	node = rb_first(root);
1060  	while (node) {
1061  		ent = rb_entry(node, struct mlx5_cache_ent, node);
1062  		node = rb_next(node);
1063  		clean_keys(dev, ent);
1064  		rb_erase(&ent->node, root);
1065  		kfree(ent);
1066  	}
1067  	mutex_unlock(&dev->cache.rb_lock);
1068  
1069  	destroy_workqueue(dev->cache.wq);
1070  	del_timer_sync(&dev->delay_timer);
1071  }
1072  
1073  struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
1074  {
1075  	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1076  	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1077  	struct mlx5_ib_mr *mr;
1078  	void *mkc;
1079  	u32 *in;
1080  	int err;
1081  
1082  	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1083  	if (!mr)
1084  		return ERR_PTR(-ENOMEM);
1085  
1086  	in = kzalloc(inlen, GFP_KERNEL);
1087  	if (!in) {
1088  		err = -ENOMEM;
1089  		goto err_free;
1090  	}
1091  
1092  	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1093  
1094  	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
1095  	MLX5_SET(mkc, mkc, length64, 1);
1096  	set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
1097  				      pd);
1098  
1099  	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1100  	if (err)
1101  		goto err_in;
1102  
1103  	kfree(in);
1104  	mr->mmkey.type = MLX5_MKEY_MR;
1105  	mr->ibmr.lkey = mr->mmkey.key;
1106  	mr->ibmr.rkey = mr->mmkey.key;
1107  	mr->umem = NULL;
1108  
1109  	return &mr->ibmr;
1110  
1111  err_in:
1112  	kfree(in);
1113  
1114  err_free:
1115  	kfree(mr);
1116  
1117  	return ERR_PTR(err);
1118  }
1119  
1120  static int get_octo_len(u64 addr, u64 len, int page_shift)
1121  {
1122  	u64 page_size = 1ULL << page_shift;
1123  	u64 offset;
1124  	int npages;
1125  
1126  	offset = addr & (page_size - 1);
1127  	npages = ALIGN(len + offset, page_size) >> page_shift;
1128  	return (npages + 1) / 2;
1129  }
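
/*
 * Worked example (values are illustrative): addr = 0x1800, len = 0x3000 and
 * page_shift = 12 give offset = 0x800, npages = ALIGN(0x3800, 0x1000) >> 12
 * = 4 and (4 + 1) / 2 = 2 octowords (each octoword holds two MTT entries).
 */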
1130  
1131  static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
1132  {
1133  	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
1134  		return MKEY_CACHE_LAST_STD_ENTRY;
1135  	return MLX5_MAX_UMR_SHIFT;
1136  }
1137  
1138  static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
1139  			  u64 length, int access_flags, u64 iova)
1140  {
1141  	mr->ibmr.lkey = mr->mmkey.key;
1142  	mr->ibmr.rkey = mr->mmkey.key;
1143  	mr->ibmr.length = length;
1144  	mr->ibmr.device = &dev->ib_dev;
1145  	mr->ibmr.iova = iova;
1146  	mr->access_flags = access_flags;
1147  }
1148  
1149  static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
1150  						  u64 iova)
1151  {
1152  	/*
1153  	 * The alignment of iova has already been checked upon entering
1154  	 * UVERBS_METHOD_REG_DMABUF_MR
1155  	 */
1156  	umem->iova = iova;
1157  	return PAGE_SIZE;
1158  }
1159  
1160  static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
1161  					     struct ib_umem *umem, u64 iova,
1162  					     int access_flags)
1163  {
1164  	struct mlx5r_cache_rb_key rb_key = {
1165  		.access_mode = MLX5_MKC_ACCESS_MODE_MTT,
1166  	};
1167  	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1168  	struct mlx5_cache_ent *ent;
1169  	struct mlx5_ib_mr *mr;
1170  	unsigned int page_size;
1171  
1172  	if (umem->is_dmabuf)
1173  		page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
1174  	else
1175  		page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
1176  						     0, iova);
1177  	if (WARN_ON(!page_size))
1178  		return ERR_PTR(-EINVAL);
1179  
1180  	rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
1181  	rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
1182  	rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
1183  	ent = mkey_cache_ent_from_rb_key(dev, rb_key);
1184  	/*
1185  	 * If the MR can't come from the cache then synchronously create an uncached
1186  	 * one.
1187  	 */
1188  	if (!ent) {
1189  		mutex_lock(&dev->slow_path_mutex);
1190  		mr = reg_create(pd, umem, iova, access_flags, page_size, false);
1191  		mutex_unlock(&dev->slow_path_mutex);
1192  		if (IS_ERR(mr))
1193  			return mr;
1194  		mr->mmkey.rb_key = rb_key;
1195  		return mr;
1196  	}
1197  
1198  	mr = _mlx5_mr_cache_alloc(dev, ent, access_flags);
1199  	if (IS_ERR(mr))
1200  		return mr;
1201  
1202  	mr->ibmr.pd = pd;
1203  	mr->umem = umem;
1204  	mr->page_shift = order_base_2(page_size);
1205  	set_mr_fields(dev, mr, umem->length, access_flags, iova);
1206  
1207  	return mr;
1208  }
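
/*
 * Summary of the two paths above (descriptive only): when an entry with a
 * compatible rb_key exists the mkey comes from the cache and the caller is
 * expected to program its translations via UMR afterwards; otherwise
 * reg_create() builds an uncached mkey under slow_path_mutex with
 * populate == false, so the PAS list is likewise loaded by a later UMR
 * update.
 */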
1209  
1210  /*
1211   * If ibmr is NULL it will be allocated by reg_create.
1212   * Else, the given ibmr will be used.
1213   */
1214  static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
1215  				     u64 iova, int access_flags,
1216  				     unsigned int page_size, bool populate)
1217  {
1218  	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1219  	struct mlx5_ib_mr *mr;
1220  	__be64 *pas;
1221  	void *mkc;
1222  	int inlen;
1223  	u32 *in;
1224  	int err;
1225  	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
1226  
1227  	if (!page_size)
1228  		return ERR_PTR(-EINVAL);
1229  	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1230  	if (!mr)
1231  		return ERR_PTR(-ENOMEM);
1232  
1233  	mr->ibmr.pd = pd;
1234  	mr->access_flags = access_flags;
1235  	mr->page_shift = order_base_2(page_size);
1236  
1237  	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1238  	if (populate)
1239  		inlen += sizeof(*pas) *
1240  			 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
1241  	in = kvzalloc(inlen, GFP_KERNEL);
1242  	if (!in) {
1243  		err = -ENOMEM;
1244  		goto err_1;
1245  	}
1246  	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1247  	if (populate) {
1248  		if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
1249  			err = -EINVAL;
1250  			goto err_2;
1251  		}
1252  		mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
1253  				     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1254  	}
1255  
1256  	/* The pg_access bit allows setting the access flags
1257  	 * in the page list submitted with the command.
1258  	 */
1259  	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
1260  
1261  	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1262  	set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
1263  				      populate ? pd : dev->umrc.pd);
1264  	MLX5_SET(mkc, mkc, free, !populate);
1265  	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
1266  	MLX5_SET(mkc, mkc, umr_en, 1);
1267  
1268  	MLX5_SET64(mkc, mkc, len, umem->length);
1269  	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
1270  	MLX5_SET(mkc, mkc, translations_octword_size,
1271  		 get_octo_len(iova, umem->length, mr->page_shift));
1272  	MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
1273  	if (mlx5_umem_needs_ats(dev, umem, access_flags))
1274  		MLX5_SET(mkc, mkc, ma_translation_mode, 1);
1275  	if (populate) {
1276  		MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
1277  			 get_octo_len(iova, umem->length, mr->page_shift));
1278  	}
1279  
1280  	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1281  	if (err) {
1282  		mlx5_ib_warn(dev, "create mkey failed\n");
1283  		goto err_2;
1284  	}
1285  	mr->mmkey.type = MLX5_MKEY_MR;
1286  	mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift);
1287  	mr->umem = umem;
1288  	set_mr_fields(dev, mr, umem->length, access_flags, iova);
1289  	kvfree(in);
1290  
1291  	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
1292  
1293  	return mr;
1294  
1295  err_2:
1296  	kvfree(in);
1297  err_1:
1298  	kfree(mr);
1299  	return ERR_PTR(err);
1300  }
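
/*
 * Usage sketch for reg_create() (descriptive only): with populate == true
 * the PAS array is copied into the CREATE_MKEY command and the mkey is
 * created enabled (free == 0); with populate == false only the mkey shell is
 * created (free == 1) and the translations are expected to be filled in
 * later through UMR, as alloc_cacheable_mr() does.
 */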
1301  
1302  static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
1303  				       u64 length, int acc, int mode)
1304  {
1305  	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1306  	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1307  	struct mlx5_ib_mr *mr;
1308  	void *mkc;
1309  	u32 *in;
1310  	int err;
1311  
1312  	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1313  	if (!mr)
1314  		return ERR_PTR(-ENOMEM);
1315  
1316  	in = kzalloc(inlen, GFP_KERNEL);
1317  	if (!in) {
1318  		err = -ENOMEM;
1319  		goto err_free;
1320  	}
1321  
1322  	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1323  
1324  	MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
1325  	MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
1326  	MLX5_SET64(mkc, mkc, len, length);
1327  	set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
1328  
1329  	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1330  	if (err)
1331  		goto err_in;
1332  
1333  	kfree(in);
1334  
1335  	set_mr_fields(dev, mr, length, acc, start_addr);
1336  
1337  	return &mr->ibmr;
1338  
1339  err_in:
1340  	kfree(in);
1341  
1342  err_free:
1343  	kfree(mr);
1344  
1345  	return ERR_PTR(err);
1346  }
1347  
1348  int mlx5_ib_advise_mr(struct ib_pd *pd,
1349  		      enum ib_uverbs_advise_mr_advice advice,
1350  		      u32 flags,
1351  		      struct ib_sge *sg_list,
1352  		      u32 num_sge,
1353  		      struct uverbs_attr_bundle *attrs)
1354  {
1355  	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
1356  	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1357  	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1358  		return -EOPNOTSUPP;
1359  
1360  	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
1361  					 sg_list, num_sge);
1362  }
1363  
1364  struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
1365  				struct ib_dm_mr_attr *attr,
1366  				struct uverbs_attr_bundle *attrs)
1367  {
1368  	struct mlx5_ib_dm *mdm = to_mdm(dm);
1369  	struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
1370  	u64 start_addr = mdm->dev_addr + attr->offset;
1371  	int mode;
1372  
1373  	switch (mdm->type) {
1374  	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
1375  		if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
1376  			return ERR_PTR(-EINVAL);
1377  
1378  		mode = MLX5_MKC_ACCESS_MODE_MEMIC;
1379  		start_addr -= pci_resource_start(dev->pdev, 0);
1380  		break;
1381  	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
1382  	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
1383  	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
1384  		if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
1385  			return ERR_PTR(-EINVAL);
1386  
1387  		mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
1388  		break;
1389  	default:
1390  		return ERR_PTR(-EINVAL);
1391  	}
1392  
1393  	return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
1394  				 attr->access_flags, mode);
1395  }
1396  
1397  static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
1398  				    u64 iova, int access_flags)
1399  {
1400  	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1401  	struct mlx5_ib_mr *mr = NULL;
1402  	bool xlt_with_umr;
1403  	int err;
1404  
1405  	xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
1406  	if (xlt_with_umr) {
1407  		mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
1408  	} else {
1409  		unsigned int page_size = mlx5_umem_find_best_pgsz(
1410  			umem, mkc, log_page_size, 0, iova);
1411  
1412  		mutex_lock(&dev->slow_path_mutex);
1413  		mr = reg_create(pd, umem, iova, access_flags, page_size, true);
1414  		mutex_unlock(&dev->slow_path_mutex);
1415  	}
1416  	if (IS_ERR(mr)) {
1417  		ib_umem_release(umem);
1418  		return ERR_CAST(mr);
1419  	}
1420  
1421  	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1422  
1423  	atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1424  
1425  	if (xlt_with_umr) {
1426  		/*
1427  		 * If the MR was created with reg_create then it will be
1428  		 * configured properly but left disabled. It is safe to go ahead
1429  		 * and configure it again via UMR while enabling it.
1430  		 */
1431  		err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
1432  		if (err) {
1433  			mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1434  			return ERR_PTR(err);
1435  		}
1436  	}
1437  	return &mr->ibmr;
1438  }
1439  
1440  static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
1441  					u64 iova, int access_flags,
1442  					struct ib_udata *udata)
1443  {
1444  	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1445  	struct ib_umem_odp *odp;
1446  	struct mlx5_ib_mr *mr;
1447  	int err;
1448  
1449  	if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1450  		return ERR_PTR(-EOPNOTSUPP);
1451  
1452  	err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
1453  	if (err)
1454  		return ERR_PTR(err);
1455  	if (!start && length == U64_MAX) {
1456  		if (iova != 0)
1457  			return ERR_PTR(-EINVAL);
1458  		if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1459  			return ERR_PTR(-EINVAL);
1460  
1461  		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
1462  		if (IS_ERR(mr))
1463  			return ERR_CAST(mr);
1464  		return &mr->ibmr;
1465  	}
1466  
1467  	/* ODP requires xlt update via umr to work. */
1468  	if (!mlx5r_umr_can_load_pas(dev, length))
1469  		return ERR_PTR(-EINVAL);
1470  
1471  	odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
1472  			      &mlx5_mn_ops);
1473  	if (IS_ERR(odp))
1474  		return ERR_CAST(odp);
1475  
1476  	mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags);
1477  	if (IS_ERR(mr)) {
1478  		ib_umem_release(&odp->umem);
1479  		return ERR_CAST(mr);
1480  	}
1481  	xa_init(&mr->implicit_children);
1482  
1483  	odp->private = mr;
1484  	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1485  	if (err)
1486  		goto err_dereg_mr;
1487  
1488  	err = mlx5_ib_init_odp_mr(mr);
1489  	if (err)
1490  		goto err_dereg_mr;
1491  	return &mr->ibmr;
1492  
1493  err_dereg_mr:
1494  	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1495  	return ERR_PTR(err);
1496  }
1497  
1498  struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1499  				  u64 iova, int access_flags,
1500  				  struct ib_udata *udata)
1501  {
1502  	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1503  	struct ib_umem *umem;
1504  
1505  	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1506  		return ERR_PTR(-EOPNOTSUPP);
1507  
1508  	mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1509  		    start, iova, length, access_flags);
1510  
1511  	if (access_flags & IB_ACCESS_ON_DEMAND)
1512  		return create_user_odp_mr(pd, start, length, iova, access_flags,
1513  					  udata);
1514  	umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
1515  	if (IS_ERR(umem))
1516  		return ERR_CAST(umem);
1517  	return create_real_mr(pd, umem, iova, access_flags);
1518  }
1519  
1520  static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
1521  {
1522  	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
1523  	struct mlx5_ib_mr *mr = umem_dmabuf->private;
1524  
1525  	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
1526  
1527  	if (!umem_dmabuf->sgt)
1528  		return;
1529  
1530  	mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
1531  	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
1532  }
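
/*
 * move_notify flow (descriptive, the fault-side handling lives outside this
 * file): when the exporter moves the buffer, the callback above zaps the
 * mkey translations via UMR (MLX5_IB_UPD_XLT_ZAP) and unmaps the sg table;
 * a later access is expected to fault and remap the dma-buf pages.
 */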
1533  
1534  static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
1535  	.allow_peer2peer = 1,
1536  	.move_notify = mlx5_ib_dmabuf_invalidate_cb,
1537  };
1538  
1539  struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
1540  					 u64 length, u64 virt_addr,
1541  					 int fd, int access_flags,
1542  					 struct ib_udata *udata)
1543  {
1544  	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1545  	struct mlx5_ib_mr *mr = NULL;
1546  	struct ib_umem_dmabuf *umem_dmabuf;
1547  	int err;
1548  
1549  	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
1550  	    !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1551  		return ERR_PTR(-EOPNOTSUPP);
1552  
1553  	mlx5_ib_dbg(dev,
1554  		    "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
1555  		    offset, virt_addr, length, fd, access_flags);
1556  
1557  	/* dmabuf requires xlt update via umr to work. */
1558  	if (!mlx5r_umr_can_load_pas(dev, length))
1559  		return ERR_PTR(-EINVAL);
1560  
1561  	umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd,
1562  					 access_flags,
1563  					 &mlx5_ib_dmabuf_attach_ops);
1564  	if (IS_ERR(umem_dmabuf)) {
1565  		mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
1566  			    PTR_ERR(umem_dmabuf));
1567  		return ERR_CAST(umem_dmabuf);
1568  	}
1569  
1570  	mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
1571  				access_flags);
1572  	if (IS_ERR(mr)) {
1573  		ib_umem_release(&umem_dmabuf->umem);
1574  		return ERR_CAST(mr);
1575  	}
1576  
1577  	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1578  
1579  	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
1580  	umem_dmabuf->private = mr;
1581  	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1582  	if (err)
1583  		goto err_dereg_mr;
1584  
1585  	err = mlx5_ib_init_dmabuf_mr(mr);
1586  	if (err)
1587  		goto err_dereg_mr;
1588  	return &mr->ibmr;
1589  
1590  err_dereg_mr:
1591  	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1592  	return ERR_PTR(err);
1593  }
1594  
1595  /*
1596   * True if the change in access flags can be done via UMR; only some access
1597   * flags can be updated.
1598   */
1599  static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
1600  				     unsigned int current_access_flags,
1601  				     unsigned int target_access_flags)
1602  {
1603  	unsigned int diffs = current_access_flags ^ target_access_flags;
1604  
1605  	if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
1606  		      IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING |
1607  		      IB_ACCESS_REMOTE_ATOMIC))
1608  		return false;
1609  	return mlx5r_umr_can_reconfig(dev, current_access_flags,
1610  				      target_access_flags);
1611  }
1612  
1613  static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
1614  				  struct ib_umem *new_umem,
1615  				  int new_access_flags, u64 iova,
1616  				  unsigned long *page_size)
1617  {
1618  	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1619  
1620  	/* We only track the allocated sizes of MRs from the cache */
1621  	if (!mr->mmkey.cache_ent)
1622  		return false;
1623  	if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
1624  		return false;
1625  
1626  	*page_size =
1627  		mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
1628  	if (WARN_ON(!*page_size))
1629  		return false;
1630  	return (mr->mmkey.cache_ent->rb_key.ndescs) >=
1631  	       ib_umem_num_dma_blocks(new_umem, *page_size);
1632  }
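
/*
 * Example (sizes are illustrative): an MR taken from a 16-descriptor cache
 * entry can be re-registered in place over a new umem needing at most 16 DMA
 * blocks at the chosen page size; anything larger is handled by the recreate
 * path in mlx5_ib_rereg_user_mr() instead.
 */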
1633  
1634  static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
1635  			 int access_flags, int flags, struct ib_umem *new_umem,
1636  			 u64 iova, unsigned long page_size)
1637  {
1638  	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1639  	int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
1640  	struct ib_umem *old_umem = mr->umem;
1641  	int err;
1642  
1643  	/*
1644  	 * To keep everything simple the MR is revoked before we start to mess
1645  	 * with it. This ensures the change is atomic relative to any use of the
1646  	 * MR.
1647  	 */
1648  	err = mlx5r_umr_revoke_mr(mr);
1649  	if (err)
1650  		return err;
1651  
1652  	if (flags & IB_MR_REREG_PD) {
1653  		mr->ibmr.pd = pd;
1654  		upd_flags |= MLX5_IB_UPD_XLT_PD;
1655  	}
1656  	if (flags & IB_MR_REREG_ACCESS) {
1657  		mr->access_flags = access_flags;
1658  		upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
1659  	}
1660  
1661  	mr->ibmr.iova = iova;
1662  	mr->ibmr.length = new_umem->length;
1663  	mr->page_shift = order_base_2(page_size);
1664  	mr->umem = new_umem;
1665  	err = mlx5r_umr_update_mr_pas(mr, upd_flags);
1666  	if (err) {
1667  		/*
1668  		 * The MR is revoked at this point so it is safe for the caller
1669  		 * to free new_umem.
1670  		 */
1671  		mr->umem = old_umem;
1672  		return err;
1673  	}
1674  
1675  	atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
1676  	ib_umem_release(old_umem);
1677  	atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
1678  	return 0;
1679  }
1680  
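/*
 * Re-registration tries progressively cheaper paths: a pure PD/access change
 * is done with a UMR on the existing mkey, a translation change reuses the
 * mkey when the new umem still fits its cached descriptors, and everything
 * else falls back to creating a new MR.  Returning NULL means the existing
 * ib_mr was updated in place; otherwise a freshly created ib_mr (or an error
 * pointer) is returned.
 */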
1681  struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1682  				    u64 length, u64 iova, int new_access_flags,
1683  				    struct ib_pd *new_pd,
1684  				    struct ib_udata *udata)
1685  {
1686  	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1687  	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1688  	int err;
1689  
1690  	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1691  		return ERR_PTR(-EOPNOTSUPP);
1692  
1693  	mlx5_ib_dbg(
1694  		dev,
1695  		"start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1696  		start, iova, length, new_access_flags);
1697  
1698  	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
1699  		return ERR_PTR(-EOPNOTSUPP);
1700  
1701  	if (!(flags & IB_MR_REREG_ACCESS))
1702  		new_access_flags = mr->access_flags;
1703  	if (!(flags & IB_MR_REREG_PD))
1704  		new_pd = ib_mr->pd;
1705  
1706  	if (!(flags & IB_MR_REREG_TRANS)) {
1707  		struct ib_umem *umem;
1708  
1709  		/* Fast path for PD/access change */
1710  		if (can_use_umr_rereg_access(dev, mr->access_flags,
1711  					     new_access_flags)) {
1712  			err = mlx5r_umr_rereg_pd_access(mr, new_pd,
1713  							new_access_flags);
1714  			if (err)
1715  				return ERR_PTR(err);
1716  			return NULL;
1717  		}
1718  		/* DM or ODP MRs don't have a normal umem so we can't re-use it */
1719  		if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1720  			goto recreate;
1721  
1722  		/*
1723  		 * Only one active MR can refer to a umem at one time; revoke
1724  		 * the old MR before assigning the umem to the new one.
1725  		 */
1726  		err = mlx5r_umr_revoke_mr(mr);
1727  		if (err)
1728  			return ERR_PTR(err);
1729  		umem = mr->umem;
1730  		mr->umem = NULL;
1731  		atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1732  
1733  		return create_real_mr(new_pd, umem, mr->ibmr.iova,
1734  				      new_access_flags);
1735  	}
1736  
1737  	/*
1738  	 * DM doesn't have a PAS list so we can't re-use it; ODP/dmabuf do,
1739  	 * but the logic around releasing the umem is different.
1740  	 */
1741  	if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1742  		goto recreate;
1743  
1744  	if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
1745  	    can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
1746  		struct ib_umem *new_umem;
1747  		unsigned long page_size;
1748  
1749  		new_umem = ib_umem_get(&dev->ib_dev, start, length,
1750  				       new_access_flags);
1751  		if (IS_ERR(new_umem))
1752  			return ERR_CAST(new_umem);
1753  
1754  		/* Fast path for PAS change */
1755  		if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
1756  					  &page_size)) {
1757  			err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
1758  					    new_umem, iova, page_size);
1759  			if (err) {
1760  				ib_umem_release(new_umem);
1761  				return ERR_PTR(err);
1762  			}
1763  			return NULL;
1764  		}
1765  		return create_real_mr(new_pd, new_umem, iova, new_access_flags);
1766  	}
1767  
1768  	/*
1769  	 * Everything else has no state we can preserve; just create a new MR
1770  	 * from scratch.
1771  	 */
1772  recreate:
1773  	return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
1774  				   new_access_flags, udata);
1775  }
1776  
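/*
 * Allocate the private descriptor buffer and DMA-map it.  mr->descs must be
 * aligned to MLX5_UMR_ALIGN (2048), so extra bytes are requested and the
 * pointer is rounded up with PTR_ALIGN.  Illustrative arithmetic, assuming
 * ARCH_KMALLOC_MINALIGN == 8 and size == 256 bytes of descriptors:
 *   add_size = max(2048 - 8, 0)                   = 2040
 *   end      = max(2048, roundup_pow_of_two(256)) = 2048
 *   add_size = min(end - size, add_size)          = 1792
 * so kzalloc(256 + 1792) = kzalloc(2048) is requested instead of the full
 * worst-case 256 + 2040 bytes.
 */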
1777  static int
1778  mlx5_alloc_priv_descs(struct ib_device *device,
1779  		      struct mlx5_ib_mr *mr,
1780  		      int ndescs,
1781  		      int desc_size)
1782  {
1783  	struct mlx5_ib_dev *dev = to_mdev(device);
1784  	struct device *ddev = &dev->mdev->pdev->dev;
1785  	int size = ndescs * desc_size;
1786  	int add_size;
1787  	int ret;
1788  
1789  	add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
1790  	if (is_power_of_2(MLX5_UMR_ALIGN) && add_size) {
1791  		int end = max_t(int, MLX5_UMR_ALIGN, roundup_pow_of_two(size));
1792  
1793  		add_size = min_t(int, end - size, add_size);
1794  	}
1795  
1796  	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1797  	if (!mr->descs_alloc)
1798  		return -ENOMEM;
1799  
1800  	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1801  
1802  	mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
1803  	if (dma_mapping_error(ddev, mr->desc_map)) {
1804  		ret = -ENOMEM;
1805  		goto err;
1806  	}
1807  
1808  	return 0;
1809  err:
1810  	kfree(mr->descs_alloc);
1811  
1812  	return ret;
1813  }
1814  
1815  static void
1816  mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
1817  {
1818  	if (!mr->umem && mr->descs) {
1819  		struct ib_device *device = mr->ibmr.device;
1820  		int size = mr->max_descs * mr->desc_size;
1821  		struct mlx5_ib_dev *dev = to_mdev(device);
1822  
1823  		dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
1824  				 DMA_TO_DEVICE);
1825  		kfree(mr->descs_alloc);
1826  		mr->descs = NULL;
1827  	}
1828  }
1829  
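/*
 * Try to return the mkey to the cache on deregistration.  If the MR came
 * from a cache entry, it is pushed back there; otherwise a matching rb-tree
 * entry is looked up (or created) for this mkey's rb_key so the mkey can be
 * reused later.  Returns 0 on success or a negative errno, in which case the
 * caller destroys the mkey instead.
 */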
1830  static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
1831  				    struct mlx5_ib_mr *mr)
1832  {
1833  	struct mlx5_mkey_cache *cache = &dev->cache;
1834  	struct mlx5_cache_ent *ent;
1835  	int ret;
1836  
1837  	if (mr->mmkey.cache_ent) {
1838  		xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
1839  		mr->mmkey.cache_ent->in_use--;
1840  		goto end;
1841  	}
1842  
1843  	mutex_lock(&cache->rb_lock);
1844  	ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
1845  	if (ent) {
1846  		if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
1847  			if (ent->disabled) {
1848  				mutex_unlock(&cache->rb_lock);
1849  				return -EOPNOTSUPP;
1850  			}
1851  			mr->mmkey.cache_ent = ent;
1852  			xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
1853  			mutex_unlock(&cache->rb_lock);
1854  			goto end;
1855  		}
1856  	}
1857  
1858  	ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false);
1859  	mutex_unlock(&cache->rb_lock);
1860  	if (IS_ERR(ent))
1861  		return PTR_ERR(ent);
1862  
1863  	mr->mmkey.cache_ent = ent;
1864  	xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
1865  
1866  end:
1867  	ret = push_mkey_locked(mr->mmkey.cache_ent, false,
1868  			       xa_mk_value(mr->mmkey.key));
1869  	xa_unlock_irq(&mr->mmkey.cache_ent->mkeys);
1870  	return ret;
1871  }
1872  
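/*
 * MR destruction in outline:
 *  1. Remove the mkey from odp_mkeys and wait for any ODP/prefetch users.
 *  2. For integrity MRs, drop the sig context, the internal mtt/klm PI MRs
 *     and both PSVs.
 *  3. Revoke DMA access and try to park the mkey back in the cache; if that
 *     fails, destroy the mkey.
 *  4. Release the umem (or the implicit ODP tree) and, for non-cached mkeys,
 *     the private descriptors.
 */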
1873  int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
1874  {
1875  	struct mlx5_ib_mr *mr = to_mmr(ibmr);
1876  	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
1877  	int rc;
1878  
1879  	/*
1880  	 * Any async use of the mr must hold the refcount. Once the refcount
1881  	 * goes to zero no other thread (ODP page faults, prefetch, any UMR
1882  	 * activity, etc.) can touch the mkey, so it is safe to destroy it.
1883  	 */
1884  	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
1885  	    refcount_read(&mr->mmkey.usecount) != 0 &&
1886  	    xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
1887  		mlx5r_deref_wait_odp_mkey(&mr->mmkey);
1888  
1889  	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
1890  		xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
1891  			   mr->sig, NULL, GFP_KERNEL);
1892  
1893  		if (mr->mtt_mr) {
1894  			rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
1895  			if (rc)
1896  				return rc;
1897  			mr->mtt_mr = NULL;
1898  		}
1899  		if (mr->klm_mr) {
1900  			rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
1901  			if (rc)
1902  				return rc;
1903  			mr->klm_mr = NULL;
1904  		}
1905  
1906  		if (mlx5_core_destroy_psv(dev->mdev,
1907  					  mr->sig->psv_memory.psv_idx))
1908  			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1909  				     mr->sig->psv_memory.psv_idx);
1910  		if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
1911  			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1912  				     mr->sig->psv_wire.psv_idx);
1913  		kfree(mr->sig);
1914  		mr->sig = NULL;
1915  	}
1916  
1917  	/* Stop DMA */
1918  	if (mr->umem && mlx5r_umr_can_load_pas(dev, mr->umem->length))
1919  		if (mlx5r_umr_revoke_mr(mr) ||
1920  		    cache_ent_find_and_store(dev, mr))
1921  			mr->mmkey.cache_ent = NULL;
1922  
1923  	if (!mr->mmkey.cache_ent) {
1924  		rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
1925  		if (rc)
1926  			return rc;
1927  	}
1928  
1929  	if (mr->umem) {
1930  		bool is_odp = is_odp_mr(mr);
1931  
1932  		if (!is_odp)
1933  			atomic_sub(ib_umem_num_pages(mr->umem),
1934  				   &dev->mdev->priv.reg_pages);
1935  		ib_umem_release(mr->umem);
1936  		if (is_odp)
1937  			mlx5_ib_free_odp_mr(mr);
1938  	}
1939  
1940  	if (!mr->mmkey.cache_ent)
1941  		mlx5_free_priv_descs(mr);
1942  
1943  	kfree(mr);
1944  	return 0;
1945  }
1946  
1947  static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
1948  				   int access_mode, int page_shift)
1949  {
1950  	void *mkc;
1951  
1952  	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1953  
1954  	/* This is only used from the kernel, so setting the PD is OK. */
1955  	set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
1956  	MLX5_SET(mkc, mkc, free, 1);
1957  	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
1958  	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
1959  	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
1960  	MLX5_SET(mkc, mkc, umr_en, 1);
1961  	MLX5_SET(mkc, mkc, log_page_size, page_shift);
1962  }
1963  
1964  static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1965  				  int ndescs, int desc_size, int page_shift,
1966  				  int access_mode, u32 *in, int inlen)
1967  {
1968  	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1969  	int err;
1970  
1971  	mr->access_mode = access_mode;
1972  	mr->desc_size = desc_size;
1973  	mr->max_descs = ndescs;
1974  
1975  	err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
1976  	if (err)
1977  		return err;
1978  
1979  	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
1980  
1981  	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1982  	if (err)
1983  		goto err_free_descs;
1984  
1985  	mr->mmkey.type = MLX5_MKEY_MR;
1986  	mr->ibmr.lkey = mr->mmkey.key;
1987  	mr->ibmr.rkey = mr->mmkey.key;
1988  
1989  	return 0;
1990  
1991  err_free_descs:
1992  	mlx5_free_priv_descs(mr);
1993  	return err;
1994  }
1995  
1996  static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
1997  				u32 max_num_sg, u32 max_num_meta_sg,
1998  				int desc_size, int access_mode)
1999  {
2000  	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2001  	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
2002  	int page_shift = 0;
2003  	struct mlx5_ib_mr *mr;
2004  	u32 *in;
2005  	int err;
2006  
2007  	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2008  	if (!mr)
2009  		return ERR_PTR(-ENOMEM);
2010  
2011  	mr->ibmr.pd = pd;
2012  	mr->ibmr.device = pd->device;
2013  
2014  	in = kzalloc(inlen, GFP_KERNEL);
2015  	if (!in) {
2016  		err = -ENOMEM;
2017  		goto err_free;
2018  	}
2019  
2020  	if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
2021  		page_shift = PAGE_SHIFT;
2022  
2023  	err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
2024  				     access_mode, in, inlen);
2025  	if (err)
2026  		goto err_free_in;
2027  
2028  	mr->umem = NULL;
2029  	kfree(in);
2030  
2031  	return mr;
2032  
2033  err_free_in:
2034  	kfree(in);
2035  err_free:
2036  	kfree(mr);
2037  	return ERR_PTR(err);
2038  }
2039  
2040  static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2041  				    int ndescs, u32 *in, int inlen)
2042  {
2043  	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
2044  				      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
2045  				      inlen);
2046  }
2047  
2048  static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2049  				    int ndescs, u32 *in, int inlen)
2050  {
2051  	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
2052  				      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2053  }
2054  
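/*
 * Build an IB_MR_TYPE_INTEGRITY MR: allocate the signature context, create
 * the memory and wire PSVs, create two internal PI MRs (KLM and MTT) used to
 * map data + metadata, and finally create the top-level KLM mkey with BSF
 * enabled.  The signature context is stored in dev->sig_mrs keyed by the
 * base mkey.
 */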
2055  static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2056  				      int max_num_sg, int max_num_meta_sg,
2057  				      u32 *in, int inlen)
2058  {
2059  	struct mlx5_ib_dev *dev = to_mdev(pd->device);
2060  	u32 psv_index[2];
2061  	void *mkc;
2062  	int err;
2063  
2064  	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
2065  	if (!mr->sig)
2066  		return -ENOMEM;
2067  
2068  	/* create mem & wire PSVs */
2069  	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
2070  	if (err)
2071  		goto err_free_sig;
2072  
2073  	mr->sig->psv_memory.psv_idx = psv_index[0];
2074  	mr->sig->psv_wire.psv_idx = psv_index[1];
2075  
2076  	mr->sig->sig_status_checked = true;
2077  	mr->sig->sig_err_exists = false;
2078  	/* Next UMR, Arm SIGERR */
2079  	++mr->sig->sigerr_count;
2080  	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2081  					 sizeof(struct mlx5_klm),
2082  					 MLX5_MKC_ACCESS_MODE_KLMS);
2083  	if (IS_ERR(mr->klm_mr)) {
2084  		err = PTR_ERR(mr->klm_mr);
2085  		goto err_destroy_psv;
2086  	}
2087  	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2088  					 sizeof(struct mlx5_mtt),
2089  					 MLX5_MKC_ACCESS_MODE_MTT);
2090  	if (IS_ERR(mr->mtt_mr)) {
2091  		err = PTR_ERR(mr->mtt_mr);
2092  		goto err_free_klm_mr;
2093  	}
2094  
2095  	/* Set bsf descriptors for mkey */
2096  	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2097  	MLX5_SET(mkc, mkc, bsf_en, 1);
2098  	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
2099  
2100  	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
2101  				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2102  	if (err)
2103  		goto err_free_mtt_mr;
2104  
2105  	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
2106  			      mr->sig, GFP_KERNEL));
2107  	if (err)
2108  		goto err_free_descs;
2109  	return 0;
2110  
2111  err_free_descs:
2112  	destroy_mkey(dev, mr);
2113  	mlx5_free_priv_descs(mr);
2114  err_free_mtt_mr:
2115  	mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
2116  	mr->mtt_mr = NULL;
2117  err_free_klm_mr:
2118  	mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
2119  	mr->klm_mr = NULL;
2120  err_destroy_psv:
2121  	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
2122  		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
2123  			     mr->sig->psv_memory.psv_idx);
2124  	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
2125  		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
2126  			     mr->sig->psv_wire.psv_idx);
2127  err_free_sig:
2128  	kfree(mr->sig);
2129  
2130  	return err;
2131  }
2132  
2133  static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
2134  					enum ib_mr_type mr_type, u32 max_num_sg,
2135  					u32 max_num_meta_sg)
2136  {
2137  	struct mlx5_ib_dev *dev = to_mdev(pd->device);
2138  	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2139  	int ndescs = ALIGN(max_num_sg, 4);
2140  	struct mlx5_ib_mr *mr;
2141  	u32 *in;
2142  	int err;
2143  
2144  	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2145  	if (!mr)
2146  		return ERR_PTR(-ENOMEM);
2147  
2148  	in = kzalloc(inlen, GFP_KERNEL);
2149  	if (!in) {
2150  		err = -ENOMEM;
2151  		goto err_free;
2152  	}
2153  
2154  	mr->ibmr.device = pd->device;
2155  	mr->umem = NULL;
2156  
2157  	switch (mr_type) {
2158  	case IB_MR_TYPE_MEM_REG:
2159  		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
2160  		break;
2161  	case IB_MR_TYPE_SG_GAPS:
2162  		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
2163  		break;
2164  	case IB_MR_TYPE_INTEGRITY:
2165  		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
2166  						 max_num_meta_sg, in, inlen);
2167  		break;
2168  	default:
2169  		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
2170  		err = -EINVAL;
2171  	}
2172  
2173  	if (err)
2174  		goto err_free_in;
2175  
2176  	kfree(in);
2177  
2178  	return &mr->ibmr;
2179  
2180  err_free_in:
2181  	kfree(in);
2182  err_free:
2183  	kfree(mr);
2184  	return ERR_PTR(err);
2185  }
2186  
2187  struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
2188  			       u32 max_num_sg)
2189  {
2190  	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
2191  }
2192  
2193  struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
2194  					 u32 max_num_sg, u32 max_num_meta_sg)
2195  {
2196  	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
2197  				  max_num_meta_sg);
2198  }
2199  
2200  int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
2201  {
2202  	struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
2203  	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2204  	struct mlx5_ib_mw *mw = to_mmw(ibmw);
2205  	unsigned int ndescs;
2206  	u32 *in = NULL;
2207  	void *mkc;
2208  	int err;
2209  	struct mlx5_ib_alloc_mw req = {};
2210  	struct {
2211  		__u32	comp_mask;
2212  		__u32	response_length;
2213  	} resp = {};
2214  
2215  	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
2216  	if (err)
2217  		return err;
2218  
2219  	if (req.comp_mask || req.reserved1 || req.reserved2)
2220  		return -EOPNOTSUPP;
2221  
2222  	if (udata->inlen > sizeof(req) &&
2223  	    !ib_is_udata_cleared(udata, sizeof(req),
2224  				 udata->inlen - sizeof(req)))
2225  		return -EOPNOTSUPP;
2226  
2227  	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
2228  
2229  	in = kzalloc(inlen, GFP_KERNEL);
2230  	if (!in)
2231  		return -ENOMEM;
2232  
2233  	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2234  
2235  	MLX5_SET(mkc, mkc, free, 1);
2236  	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2237  	MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
2238  	MLX5_SET(mkc, mkc, umr_en, 1);
2239  	MLX5_SET(mkc, mkc, lr, 1);
2240  	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
2241  	MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
2242  	MLX5_SET(mkc, mkc, qpn, 0xffffff);
2243  
2244  	err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
2245  	if (err)
2246  		goto free;
2247  
2248  	mw->mmkey.type = MLX5_MKEY_MW;
2249  	ibmw->rkey = mw->mmkey.key;
2250  	mw->mmkey.ndescs = ndescs;
2251  
2252  	resp.response_length =
2253  		min(offsetofend(typeof(resp), response_length), udata->outlen);
2254  	if (resp.response_length) {
2255  		err = ib_copy_to_udata(udata, &resp, resp.response_length);
2256  		if (err)
2257  			goto free_mkey;
2258  	}
2259  
2260  	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
2261  		err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
2262  		if (err)
2263  			goto free_mkey;
2264  	}
2265  
2266  	kfree(in);
2267  	return 0;
2268  
2269  free_mkey:
2270  	mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
2271  free:
2272  	kfree(in);
2273  	return err;
2274  }
2275  
2276  int mlx5_ib_dealloc_mw(struct ib_mw *mw)
2277  {
2278  	struct mlx5_ib_dev *dev = to_mdev(mw->device);
2279  	struct mlx5_ib_mw *mmw = to_mmw(mw);
2280  
2281  	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
2282  	    xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
2283  		/*
2284  		 * pagefault_single_data_segment() may be accessing mmw
2285  		 * if the user bound an ODP MR to this MW.
2286  		 */
2287  		mlx5r_deref_wait_odp_mkey(&mmw->mmkey);
2288  
2289  	return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
2290  }
2291  
2292  int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
2293  			    struct ib_mr_status *mr_status)
2294  {
2295  	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
2296  	int ret = 0;
2297  
2298  	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
2299  		pr_err("Invalid status check mask\n");
2300  		ret = -EINVAL;
2301  		goto done;
2302  	}
2303  
2304  	mr_status->fail_status = 0;
2305  	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
2306  		if (!mmr->sig) {
2307  			ret = -EINVAL;
2308  			pr_err("signature status check requested on a non-signature enabled MR\n");
2309  			goto done;
2310  		}
2311  
2312  		mmr->sig->sig_status_checked = true;
2313  		if (!mmr->sig->sig_err_exists)
2314  			goto done;
2315  
2316  		if (ibmr->lkey == mmr->sig->err_item.key)
2317  			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
2318  			       sizeof(mr_status->sig_err));
2319  		else {
2320  			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
2321  			mr_status->sig_err.sig_err_offset = 0;
2322  			mr_status->sig_err.key = mmr->sig->err_item.key;
2323  		}
2324  
2325  		mmr->sig->sig_err_exists = false;
2326  		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
2327  	}
2328  
2329  done:
2330  	return ret;
2331  }
2332  
2333  static int
2334  mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2335  			int data_sg_nents, unsigned int *data_sg_offset,
2336  			struct scatterlist *meta_sg, int meta_sg_nents,
2337  			unsigned int *meta_sg_offset)
2338  {
2339  	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2340  	unsigned int sg_offset = 0;
2341  	int n = 0;
2342  
2343  	mr->meta_length = 0;
2344  	if (data_sg_nents == 1) {
2345  		n++;
2346  		mr->mmkey.ndescs = 1;
2347  		if (data_sg_offset)
2348  			sg_offset = *data_sg_offset;
2349  		mr->data_length = sg_dma_len(data_sg) - sg_offset;
2350  		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
2351  		if (meta_sg_nents == 1) {
2352  			n++;
2353  			mr->meta_ndescs = 1;
2354  			if (meta_sg_offset)
2355  				sg_offset = *meta_sg_offset;
2356  			else
2357  				sg_offset = 0;
2358  			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
2359  			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
2360  		}
2361  		ibmr->length = mr->data_length + mr->meta_length;
2362  	}
2363  
2364  	return n;
2365  }
2366  
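/*
 * Translate a data SG list (and optionally a metadata SG list) directly into
 * KLM entries in mr->descs.  Each KLM entry uses the PD's local_dma_lkey and
 * points at one SG element, so arbitrary gaps between elements are supported
 * at the cost of indirect (KLM) access by the HW.
 */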
2367  static int
2368  mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
2369  		   struct scatterlist *sgl,
2370  		   unsigned short sg_nents,
2371  		   unsigned int *sg_offset_p,
2372  		   struct scatterlist *meta_sgl,
2373  		   unsigned short meta_sg_nents,
2374  		   unsigned int *meta_sg_offset_p)
2375  {
2376  	struct scatterlist *sg = sgl;
2377  	struct mlx5_klm *klms = mr->descs;
2378  	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
2379  	u32 lkey = mr->ibmr.pd->local_dma_lkey;
2380  	int i, j = 0;
2381  
2382  	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
2383  	mr->ibmr.length = 0;
2384  
2385  	for_each_sg(sgl, sg, sg_nents, i) {
2386  		if (unlikely(i >= mr->max_descs))
2387  			break;
2388  		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
2389  		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
2390  		klms[i].key = cpu_to_be32(lkey);
2391  		mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2392  
2393  		sg_offset = 0;
2394  	}
2395  
2396  	if (sg_offset_p)
2397  		*sg_offset_p = sg_offset;
2398  
2399  	mr->mmkey.ndescs = i;
2400  	mr->data_length = mr->ibmr.length;
2401  
2402  	if (meta_sg_nents) {
2403  		sg = meta_sgl;
2404  		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
2405  		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
2406  			if (unlikely(i + j >= mr->max_descs))
2407  				break;
2408  			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
2409  						     sg_offset);
2410  			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
2411  							 sg_offset);
2412  			klms[i + j].key = cpu_to_be32(lkey);
2413  			mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2414  
2415  			sg_offset = 0;
2416  		}
2417  		if (meta_sg_offset_p)
2418  			*meta_sg_offset_p = sg_offset;
2419  
2420  		mr->meta_ndescs = j;
2421  		mr->meta_length = mr->ibmr.length - mr->data_length;
2422  	}
2423  
2424  	return i + j;
2425  }
2426  
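/*
 * ib_sg_to_pages() callbacks: each call appends one page address, tagged with
 * the read/write enable bits, to the MTT descriptor array.  mlx5_set_page()
 * fills data descriptors; mlx5_set_page_pi() below appends metadata
 * descriptors after them.
 */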
2427  static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
2428  {
2429  	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2430  	__be64 *descs;
2431  
2432  	if (unlikely(mr->mmkey.ndescs == mr->max_descs))
2433  		return -ENOMEM;
2434  
2435  	descs = mr->descs;
2436  	descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2437  
2438  	return 0;
2439  }
2440  
2441  static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
2442  {
2443  	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2444  	__be64 *descs;
2445  
2446  	if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
2447  		return -ENOMEM;
2448  
2449  	descs = mr->descs;
2450  	descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
2451  		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2452  
2453  	return 0;
2454  }
2455  
2456  static int
2457  mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2458  			 int data_sg_nents, unsigned int *data_sg_offset,
2459  			 struct scatterlist *meta_sg, int meta_sg_nents,
2460  			 unsigned int *meta_sg_offset)
2461  {
2462  	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2463  	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
2464  	int n;
2465  
2466  	pi_mr->mmkey.ndescs = 0;
2467  	pi_mr->meta_ndescs = 0;
2468  	pi_mr->meta_length = 0;
2469  
2470  	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2471  				   pi_mr->desc_size * pi_mr->max_descs,
2472  				   DMA_TO_DEVICE);
2473  
2474  	pi_mr->ibmr.page_size = ibmr->page_size;
2475  	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
2476  			   mlx5_set_page);
2477  	if (n != data_sg_nents)
2478  		return n;
2479  
2480  	pi_mr->data_iova = pi_mr->ibmr.iova;
2481  	pi_mr->data_length = pi_mr->ibmr.length;
2482  	pi_mr->ibmr.length = pi_mr->data_length;
2483  	ibmr->length = pi_mr->data_length;
2484  
2485  	if (meta_sg_nents) {
2486  		u64 page_mask = ~((u64)ibmr->page_size - 1);
2487  		u64 iova = pi_mr->data_iova;
2488  
2489  		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
2490  				    meta_sg_offset, mlx5_set_page_pi);
2491  
2492  		pi_mr->meta_length = pi_mr->ibmr.length;
2493  		/*
2494  		 * PI address for the HW is the offset of the metadata address
2495  		 * relative to the first data page address.
2496  		 * It equals the first data page address + the size of the data
2497  		 * pages + the metadata offset within the first metadata page.
2498  		 */
2499  		pi_mr->pi_iova = (iova & page_mask) +
2500  				 pi_mr->mmkey.ndescs * ibmr->page_size +
2501  				 (pi_mr->ibmr.iova & ~page_mask);
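		/*
		 * Illustrative values: with a 4KiB page_size, a page-aligned
		 * data iova of 0x10000, four data pages mapped
		 * (mmkey.ndescs == 4) and the metadata starting 0x200 bytes
		 * into its page, pi_iova becomes
		 * 0x10000 + 4 * 0x1000 + 0x200 = 0x14200.
		 */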
2502  		/*
2503  		 * In order to use one MTT MR for data and metadata, we also
2504  		 * register the gaps between the end of the data and the start
2505  		 * of the metadata (the sig MR will verify that the HW accesses
2506  		 * the right addresses). This mapping is safe because we use an
2507  		 * internal mkey for the registration.
2508  		 */
2509  		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
2510  		pi_mr->ibmr.iova = iova;
2511  		ibmr->length += pi_mr->meta_length;
2512  	}
2513  
2514  	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2515  				      pi_mr->desc_size * pi_mr->max_descs,
2516  				      DMA_TO_DEVICE);
2517  
2518  	return n;
2519  }
2520  
2521  static int
2522  mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2523  			 int data_sg_nents, unsigned int *data_sg_offset,
2524  			 struct scatterlist *meta_sg, int meta_sg_nents,
2525  			 unsigned int *meta_sg_offset)
2526  {
2527  	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2528  	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
2529  	int n;
2530  
2531  	pi_mr->mmkey.ndescs = 0;
2532  	pi_mr->meta_ndescs = 0;
2533  	pi_mr->meta_length = 0;
2534  
2535  	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2536  				   pi_mr->desc_size * pi_mr->max_descs,
2537  				   DMA_TO_DEVICE);
2538  
2539  	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
2540  			       meta_sg, meta_sg_nents, meta_sg_offset);
2541  
2542  	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2543  				      pi_mr->desc_size * pi_mr->max_descs,
2544  				      DMA_TO_DEVICE);
2545  
2546  	/* This is a zero-based memory region */
2547  	pi_mr->data_iova = 0;
2548  	pi_mr->ibmr.iova = 0;
2549  	pi_mr->pi_iova = pi_mr->data_length;
2550  	ibmr->length = pi_mr->ibmr.length;
2551  
2552  	return n;
2553  }
2554  
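/*
 * Map data + protection information SG lists for an integrity MR.  Three
 * strategies are tried, cheapest first: plain PA mapping with the local DMA
 * lkey, an MTT-based internal PI MR, and finally a KLM-based PI MR.  Returns
 * 0 on success or -ENOMEM if even the KLM mapping could not cover all SG
 * entries.
 */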
2555  int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2556  			 int data_sg_nents, unsigned int *data_sg_offset,
2557  			 struct scatterlist *meta_sg, int meta_sg_nents,
2558  			 unsigned int *meta_sg_offset)
2559  {
2560  	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2561  	struct mlx5_ib_mr *pi_mr = NULL;
2562  	int n;
2563  
2564  	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
2565  
2566  	mr->mmkey.ndescs = 0;
2567  	mr->data_length = 0;
2568  	mr->data_iova = 0;
2569  	mr->meta_ndescs = 0;
2570  	mr->pi_iova = 0;
2571  	/*
2572  	 * As a performance optimization, if possible, there is no need to
2573  	 * perform a UMR operation to register the data/metadata buffers.
2574  	 * First try to map the sg lists to PA descriptors with local_dma_lkey.
2575  	 * Fall back to UMR only in case of a failure.
2576  	 */
2577  	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2578  				    data_sg_offset, meta_sg, meta_sg_nents,
2579  				    meta_sg_offset);
2580  	if (n == data_sg_nents + meta_sg_nents)
2581  		goto out;
2582  	/*
2583  	 * As a performance optimization, if possible, there is no need to map
2584  	 * the sg lists to KLM descriptors. First try to map the sg lists to MTT
2585  	 * descriptors and fall back to KLM only in case of a failure.
2586  	 * It's more efficient for the HW to work with MTT descriptors
2587  	 * (especially under high load).
2588  	 * Use KLM (indirect access) only if it's mandatory.
2589  	 */
2590  	pi_mr = mr->mtt_mr;
2591  	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2592  				     data_sg_offset, meta_sg, meta_sg_nents,
2593  				     meta_sg_offset);
2594  	if (n == data_sg_nents + meta_sg_nents)
2595  		goto out;
2596  
2597  	pi_mr = mr->klm_mr;
2598  	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2599  				     data_sg_offset, meta_sg, meta_sg_nents,
2600  				     meta_sg_offset);
2601  	if (unlikely(n != data_sg_nents + meta_sg_nents))
2602  		return -ENOMEM;
2603  
2604  out:
2605  	/* This is a zero-based memory region */
2606  	ibmr->iova = 0;
2607  	mr->pi_mr = pi_mr;
2608  	if (pi_mr)
2609  		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
2610  	else
2611  		ibmr->sig_attrs->meta_length = mr->meta_length;
2612  
2613  	return 0;
2614  }
2615  
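/*
 * Standard (non-PI) fast-reg mapping: KLM-mode MRs (e.g. IB_MR_TYPE_SG_GAPS)
 * are mapped via mlx5_ib_sg_to_klms(); everything else goes through
 * ib_sg_to_pages() into MTT descriptors.
 */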
2616  int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
2617  		      unsigned int *sg_offset)
2618  {
2619  	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2620  	int n;
2621  
2622  	mr->mmkey.ndescs = 0;
2623  
2624  	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
2625  				   mr->desc_size * mr->max_descs,
2626  				   DMA_TO_DEVICE);
2627  
2628  	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
2629  		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
2630  				       NULL);
2631  	else
2632  		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
2633  				mlx5_set_page);
2634  
2635  	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
2636  				      mr->desc_size * mr->max_descs,
2637  				      DMA_TO_DEVICE);
2638  
2639  	return n;
2640  }
2641