1 /*
2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3 * Copyright (c) 2020, Intel Corporation. All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses. You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
10 *
11 * Redistribution and use in source and binary forms, with or
12 * without modification, are permitted provided that the following
13 * conditions are met:
14 *
15 * - Redistributions of source code must retain the above
16 * copyright notice, this list of conditions and the following
17 * disclaimer.
18 *
19 * - Redistributions in binary form must reproduce the above
20 * copyright notice, this list of conditions and the following
21 * disclaimer in the documentation and/or other materials
22 * provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
32 */
33
34
35 #include <linux/kref.h>
36 #include <linux/random.h>
37 #include <linux/debugfs.h>
38 #include <linux/export.h>
39 #include <linux/delay.h>
40 #include <linux/dma-buf.h>
41 #include <linux/dma-resv.h>
42 #include <rdma/ib_umem_odp.h>
43 #include "dm.h"
44 #include "mlx5_ib.h"
45 #include "umr.h"
46
47 enum {
48 MAX_PENDING_REG_MR = 8,
49 };
50
51 #define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4
52 #define MLX5_UMR_ALIGN 2048
53
54 static void
55 create_mkey_callback(int status, struct mlx5_async_work *context);
56 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
57 u64 iova, int access_flags,
58 unsigned int page_size, bool populate);
59
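/*
* Fill the access-rights, PD and start-address fields of an mkey context.
* Relaxed ordering is only turned on when the device capabilities allow it:
* write needs relaxed_ordering_write, read needs relaxed_ordering_read or the
* PCI-enabled variant together with relaxed ordering enabled on the PCI
* function.
*/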
60 static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
61 struct ib_pd *pd)
62 {
63 struct mlx5_ib_dev *dev = to_mdev(pd->device);
64
65 MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
66 MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
67 MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
68 MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
69 MLX5_SET(mkc, mkc, lr, 1);
70
71 if (acc & IB_ACCESS_RELAXED_ORDERING) {
72 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
73 MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);
74
75 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
76 (MLX5_CAP_GEN(dev->mdev,
77 relaxed_ordering_read_pci_enabled) &&
78 pcie_relaxed_ordering_enabled(dev->mdev->pdev)))
79 MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
80 }
81
82 MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
83 MLX5_SET(mkc, mkc, qpn, 0xffffff);
84 MLX5_SET64(mkc, mkc, start_addr, start_addr);
85 }
86
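/*
* Stamp the low 8 bits of the mkey with a per-device rolling variant so that
* reuse of the same mkey index produces a different key value.
*/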
87 static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
88 {
89 u8 key = atomic_inc_return(&dev->mkey_var);
90 void *mkc;
91
92 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
93 MLX5_SET(mkc, mkc, mkey_7_0, key);
94 *mkey = key;
95 }
96
97 static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
98 struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
99 {
100 int ret;
101
102 assign_mkey_variant(dev, &mkey->key, in);
103 ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
104 if (!ret)
105 init_waitqueue_head(&mkey->wait);
106
107 return ret;
108 }
109
110 static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
111 {
112 struct mlx5_ib_dev *dev = async_create->ent->dev;
113 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
114 size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out);
115
116 MLX5_SET(create_mkey_in, async_create->in, opcode,
117 MLX5_CMD_OP_CREATE_MKEY);
118 assign_mkey_variant(dev, &async_create->mkey, async_create->in);
119 return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen,
120 async_create->out, outlen, create_mkey_callback,
121 &async_create->cb_work);
122 }
123
124 static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
125 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
126
127 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
128 {
129 WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
130
131 return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
132 }
133
134 static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
135 {
136 if (status == -ENXIO) /* core driver is not available */
137 return;
138
139 mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
140 if (status != -EREMOTEIO) /* driver specific failure */
141 return;
142
143 /* Failed in FW, print cmd out failure details */
144 mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
145 }
146
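/*
* Reserve the next free slot in the mkeys xarray, or store @to_store in it
* when nothing is pending. "reserved" counts allocated slots, "stored" counts
* slots holding an actual mkey; the difference is the number of mkey
* creations currently in flight.
*/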
147 static int push_mkey_locked(struct mlx5_cache_ent *ent, bool limit_pendings,
148 void *to_store)
149 {
150 XA_STATE(xas, &ent->mkeys, 0);
151 void *curr;
152
153 if (limit_pendings &&
154 (ent->reserved - ent->stored) > MAX_PENDING_REG_MR)
155 return -EAGAIN;
156
157 while (1) {
158 /*
159 * This is cmpxchg (NULL, XA_ZERO_ENTRY); however, this version
160 * doesn't transparently unlock. Instead we set the xas index to
161 * the current value of reserved every iteration.
162 */
163 xas_set(&xas, ent->reserved);
164 curr = xas_load(&xas);
165 if (!curr) {
166 if (to_store && ent->stored == ent->reserved)
167 xas_store(&xas, to_store);
168 else
169 xas_store(&xas, XA_ZERO_ENTRY);
170 if (xas_valid(&xas)) {
171 ent->reserved++;
172 if (to_store) {
173 if (ent->stored != ent->reserved)
174 __xa_store(&ent->mkeys,
175 ent->stored,
176 to_store,
177 GFP_KERNEL);
178 ent->stored++;
179 queue_adjust_cache_locked(ent);
180 WRITE_ONCE(ent->dev->cache.last_add,
181 jiffies);
182 }
183 }
184 }
185 xa_unlock_irq(&ent->mkeys);
186
187 /*
188 * Notice xas_nomem() must always be called as it cleans
189 * up any cached allocation.
190 */
191 if (!xas_nomem(&xas, GFP_KERNEL))
192 break;
193 xa_lock_irq(&ent->mkeys);
194 }
195 xa_lock_irq(&ent->mkeys);
196 if (xas_error(&xas))
197 return xas_error(&xas);
198 if (WARN_ON(curr))
199 return -EINVAL;
200 return 0;
201 }
202
203 static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
204 void *to_store)
205 {
206 int ret;
207
208 xa_lock_irq(&ent->mkeys);
209 ret = push_mkey_locked(ent, limit_pendings, to_store);
210 xa_unlock_irq(&ent->mkeys);
211 return ret;
212 }
213
214 static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent)
215 {
216 void *old;
217
218 ent->reserved--;
219 old = __xa_erase(&ent->mkeys, ent->reserved);
220 WARN_ON(old);
221 }
222
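/* Place a freshly created mkey into the oldest reserved slot. */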
223 static void push_to_reserved(struct mlx5_cache_ent *ent, u32 mkey)
224 {
225 void *old;
226
227 old = __xa_store(&ent->mkeys, ent->stored, xa_mk_value(mkey), 0);
228 WARN_ON(old);
229 ent->stored++;
230 }
231
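/*
* Remove the most recently stored mkey from the xarray. If creations are
* still pending (stored != reserved), the vacated slot is turned back into a
* reservation so the in-flight work keeps its place.
*/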
232 static u32 pop_stored_mkey(struct mlx5_cache_ent *ent)
233 {
234 void *old, *xa_mkey;
235
236 ent->stored--;
237 ent->reserved--;
238
239 if (ent->stored == ent->reserved) {
240 xa_mkey = __xa_erase(&ent->mkeys, ent->stored);
241 WARN_ON(!xa_mkey);
242 return (u32)xa_to_value(xa_mkey);
243 }
244
245 xa_mkey = __xa_store(&ent->mkeys, ent->stored, XA_ZERO_ENTRY,
246 GFP_KERNEL);
247 WARN_ON(!xa_mkey || xa_is_err(xa_mkey));
248 old = __xa_erase(&ent->mkeys, ent->reserved);
249 WARN_ON(old);
250 return (u32)xa_to_value(xa_mkey);
251 }
252
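/*
* Completion handler for asynchronous mkey creation: on failure undo the
* reservation and throttle further filling via fill_delay, on success push
* the new mkey into the cache entry.
*/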
253 static void create_mkey_callback(int status, struct mlx5_async_work *context)
254 {
255 struct mlx5r_async_create_mkey *mkey_out =
256 container_of(context, struct mlx5r_async_create_mkey, cb_work);
257 struct mlx5_cache_ent *ent = mkey_out->ent;
258 struct mlx5_ib_dev *dev = ent->dev;
259 unsigned long flags;
260
261 if (status) {
262 create_mkey_warn(dev, status, mkey_out->out);
263 kfree(mkey_out);
264 xa_lock_irqsave(&ent->mkeys, flags);
265 undo_push_reserve_mkey(ent);
266 WRITE_ONCE(dev->fill_delay, 1);
267 xa_unlock_irqrestore(&ent->mkeys, flags);
268 mod_timer(&dev->delay_timer, jiffies + HZ);
269 return;
270 }
271
272 mkey_out->mkey |= mlx5_idx_to_mkey(
273 MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
274 WRITE_ONCE(dev->cache.last_add, jiffies);
275
276 xa_lock_irqsave(&ent->mkeys, flags);
277 push_to_reserved(ent, mkey_out->mkey);
278 /* If we are doing fill_to_high_water then keep going. */
279 queue_adjust_cache_locked(ent);
280 xa_unlock_irqrestore(&ent->mkeys, flags);
281 kfree(mkey_out);
282 }
283
284 static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
285 {
286 int ret = 0;
287
288 switch (access_mode) {
289 case MLX5_MKC_ACCESS_MODE_MTT:
290 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
291 sizeof(struct mlx5_mtt));
292 break;
293 case MLX5_MKC_ACCESS_MODE_KSM:
294 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
295 sizeof(struct mlx5_klm));
296 break;
297 default:
298 WARN_ON(1);
299 }
300 return ret;
301 }
302
303 static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
304 {
305 set_mkc_access_pd_addr_fields(mkc, ent->rb_key.access_flags, 0,
306 ent->dev->umrc.pd);
307 MLX5_SET(mkc, mkc, free, 1);
308 MLX5_SET(mkc, mkc, umr_en, 1);
309 MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
310 MLX5_SET(mkc, mkc, access_mode_4_2,
311 (ent->rb_key.access_mode >> 2) & 0x7);
312 MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats);
313
314 MLX5_SET(mkc, mkc, translations_octword_size,
315 get_mkc_octo_size(ent->rb_key.access_mode,
316 ent->rb_key.ndescs));
317 MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
318 }
319
320 /* Asynchronously schedule new MRs to be populated in the cache. */
321 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
322 {
323 struct mlx5r_async_create_mkey *async_create;
324 void *mkc;
325 int err = 0;
326 int i;
327
328 for (i = 0; i < num; i++) {
329 async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey),
330 GFP_KERNEL);
331 if (!async_create)
332 return -ENOMEM;
333 mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in,
334 memory_key_mkey_entry);
335 set_cache_mkc(ent, mkc);
336 async_create->ent = ent;
337
338 err = push_mkey(ent, true, NULL);
339 if (err)
340 goto free_async_create;
341
342 err = mlx5_ib_create_mkey_cb(async_create);
343 if (err) {
344 mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
345 goto err_undo_reserve;
346 }
347 }
348
349 return 0;
350
351 err_undo_reserve:
352 xa_lock_irq(&ent->mkeys);
353 undo_push_reserve_mkey(ent);
354 xa_unlock_irq(&ent->mkeys);
355 free_async_create:
356 kfree(async_create);
357 return err;
358 }
359
360 /* Synchronously create a MR in the cache */
361 static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
362 {
363 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
364 void *mkc;
365 u32 *in;
366 int err;
367
368 in = kzalloc(inlen, GFP_KERNEL);
369 if (!in)
370 return -ENOMEM;
371 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
372 set_cache_mkc(ent, mkc);
373
374 err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
375 if (err)
376 goto free_in;
377
378 WRITE_ONCE(ent->dev->cache.last_add, jiffies);
379 free_in:
380 kfree(in);
381 return err;
382 }
383
384 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
385 {
386 u32 mkey;
387
388 lockdep_assert_held(&ent->mkeys.xa_lock);
389 if (!ent->stored)
390 return;
391 mkey = pop_stored_mkey(ent);
392 xa_unlock_irq(&ent->mkeys);
393 mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
394 xa_lock_irq(&ent->mkeys);
395 }
396
397 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
398 bool limit_fill)
399 __acquires(&ent->mkeys) __releases(&ent->mkeys)
400 {
401 int err;
402
403 lockdep_assert_held(&ent->mkeys.xa_lock);
404
405 while (true) {
406 if (limit_fill)
407 target = ent->limit * 2;
408 if (target == ent->reserved)
409 return 0;
410 if (target > ent->reserved) {
411 u32 todo = target - ent->reserved;
412
413 xa_unlock_irq(&ent->mkeys);
414 err = add_keys(ent, todo);
415 if (err == -EAGAIN)
416 usleep_range(3000, 5000);
417 xa_lock_irq(&ent->mkeys);
418 if (err) {
419 if (err != -EAGAIN)
420 return err;
421 } else
422 return 0;
423 } else {
424 remove_cache_mr_locked(ent);
425 }
426 }
427 }
428
429 static ssize_t size_write(struct file *filp, const char __user *buf,
430 size_t count, loff_t *pos)
431 {
432 struct mlx5_cache_ent *ent = filp->private_data;
433 u32 target;
434 int err;
435
436 err = kstrtou32_from_user(buf, count, 0, &target);
437 if (err)
438 return err;
439
440 /*
441 * Target is the new value of total_mrs the user requests; however, we
442 * cannot free MRs that are in use. Compute the target value for stored
443 * mkeys.
444 */
445 xa_lock_irq(&ent->mkeys);
446 if (target < ent->in_use) {
447 err = -EINVAL;
448 goto err_unlock;
449 }
450 target = target - ent->in_use;
451 if (target < ent->limit || target > ent->limit*2) {
452 err = -EINVAL;
453 goto err_unlock;
454 }
455 err = resize_available_mrs(ent, target, false);
456 if (err)
457 goto err_unlock;
458 xa_unlock_irq(&ent->mkeys);
459
460 return count;
461
462 err_unlock:
463 xa_unlock_irq(&ent->mkeys);
464 return err;
465 }
466
467 static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
468 loff_t *pos)
469 {
470 struct mlx5_cache_ent *ent = filp->private_data;
471 char lbuf[20];
472 int err;
473
474 err = snprintf(lbuf, sizeof(lbuf), "%ld\n", ent->stored + ent->in_use);
475 if (err < 0)
476 return err;
477
478 return simple_read_from_buffer(buf, count, pos, lbuf, err);
479 }
480
481 static const struct file_operations size_fops = {
482 .owner = THIS_MODULE,
483 .open = simple_open,
484 .write = size_write,
485 .read = size_read,
486 };
487
488 static ssize_t limit_write(struct file *filp, const char __user *buf,
489 size_t count, loff_t *pos)
490 {
491 struct mlx5_cache_ent *ent = filp->private_data;
492 u32 var;
493 int err;
494
495 err = kstrtou32_from_user(buf, count, 0, &var);
496 if (err)
497 return err;
498
499 /*
500 * Upon set we immediately fill the cache to the high water mark implied by
501 * the limit.
502 */
503 xa_lock_irq(&ent->mkeys);
504 ent->limit = var;
505 err = resize_available_mrs(ent, 0, true);
506 xa_unlock_irq(&ent->mkeys);
507 if (err)
508 return err;
509 return count;
510 }
511
512 static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
513 loff_t *pos)
514 {
515 struct mlx5_cache_ent *ent = filp->private_data;
516 char lbuf[20];
517 int err;
518
519 err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
520 if (err < 0)
521 return err;
522
523 return simple_read_from_buffer(buf, count, pos, lbuf, err);
524 }
525
526 static const struct file_operations limit_fops = {
527 .owner = THIS_MODULE,
528 .open = simple_open,
529 .write = limit_write,
530 .read = limit_read,
531 };
532
533 static bool someone_adding(struct mlx5_mkey_cache *cache)
534 {
535 struct mlx5_cache_ent *ent;
536 struct rb_node *node;
537 bool ret;
538
539 mutex_lock(&cache->rb_lock);
540 for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
541 ent = rb_entry(node, struct mlx5_cache_ent, node);
542 xa_lock_irq(&ent->mkeys);
543 ret = ent->stored < ent->limit;
544 xa_unlock_irq(&ent->mkeys);
545 if (ret) {
546 mutex_unlock(&cache->rb_lock);
547 return true;
548 }
549 }
550 mutex_unlock(&cache->rb_lock);
551 return false;
552 }
553
554 /*
555 * Check if the bucket is outside the high/low water mark and schedule an async
556 * update. The cache refill has hysteresis: once the low water mark is hit it is
557 * refilled up to the high mark.
558 */
559 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
560 {
561 lockdep_assert_held(&ent->mkeys.xa_lock);
562
563 if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
564 return;
565 if (ent->stored < ent->limit) {
566 ent->fill_to_high_water = true;
567 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
568 } else if (ent->fill_to_high_water &&
569 ent->reserved < 2 * ent->limit) {
570 /*
571 * Once we start populating due to hitting a low water mark
572 * continue until we pass the high water mark.
573 */
574 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
575 } else if (ent->stored == 2 * ent->limit) {
576 ent->fill_to_high_water = false;
577 } else if (ent->stored > 2 * ent->limit) {
578 /* Queue deletion of excess entries */
579 ent->fill_to_high_water = false;
580 if (ent->stored != ent->reserved)
581 queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
582 msecs_to_jiffies(1000));
583 else
584 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
585 }
586 }
587
588 static void __cache_work_func(struct mlx5_cache_ent *ent)
589 {
590 struct mlx5_ib_dev *dev = ent->dev;
591 struct mlx5_mkey_cache *cache = &dev->cache;
592 int err;
593
594 xa_lock_irq(&ent->mkeys);
595 if (ent->disabled)
596 goto out;
597
598 if (ent->fill_to_high_water && ent->reserved < 2 * ent->limit &&
599 !READ_ONCE(dev->fill_delay)) {
600 xa_unlock_irq(&ent->mkeys);
601 err = add_keys(ent, 1);
602 xa_lock_irq(&ent->mkeys);
603 if (ent->disabled)
604 goto out;
605 if (err) {
606 /*
607 * EAGAIN only happens if there are pending MRs, so we
608 * will be rescheduled when storing them. The only
609 * failure path here is ENOMEM.
610 */
611 if (err != -EAGAIN) {
612 mlx5_ib_warn(
613 dev,
614 "add keys command failed, err %d\n",
615 err);
616 queue_delayed_work(cache->wq, &ent->dwork,
617 msecs_to_jiffies(1000));
618 }
619 }
620 } else if (ent->stored > 2 * ent->limit) {
621 bool need_delay;
622
623 /*
624 * The remove_cache_mr() logic is performed as a garbage
625 * collection task. Such a task is intended to be run when no
626 * other active processes are running.
627 *
628 * The need_resched() will return TRUE if there are user tasks
629 * to be activated in the near future.
630 *
631 * In such a case, we don't execute remove_cache_mr() and postpone
632 * the garbage collection work to try to run in the next cycle, in
633 * order to free CPU resources to other tasks.
634 */
635 xa_unlock_irq(&ent->mkeys);
636 need_delay = need_resched() || someone_adding(cache) ||
637 !time_after(jiffies,
638 READ_ONCE(cache->last_add) + 300 * HZ);
639 xa_lock_irq(&ent->mkeys);
640 if (ent->disabled)
641 goto out;
642 if (need_delay) {
643 queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
644 goto out;
645 }
646 remove_cache_mr_locked(ent);
647 queue_adjust_cache_locked(ent);
648 }
649 out:
650 xa_unlock_irq(&ent->mkeys);
651 }
652
653 static void delayed_cache_work_func(struct work_struct *work)
654 {
655 struct mlx5_cache_ent *ent;
656
657 ent = container_of(work, struct mlx5_cache_ent, dwork.work);
658 __cache_work_func(ent);
659 }
660
661 static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
662 struct mlx5r_cache_rb_key key2)
663 {
664 int res;
665
666 res = key1.ats - key2.ats;
667 if (res)
668 return res;
669
670 res = key1.access_mode - key2.access_mode;
671 if (res)
672 return res;
673
674 res = key1.access_flags - key2.access_flags;
675 if (res)
676 return res;
677
678 /*
679 * Keep ndescs last in the compare since the find function searches for an
680 * exact match on all properties and only the closest match in size.
682 */
683 return key1.ndescs - key2.ndescs;
684 }
685
686 static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
687 struct mlx5_cache_ent *ent)
688 {
689 struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
690 struct mlx5_cache_ent *cur;
691 int cmp;
692
693 /* Figure out where to put new node */
694 while (*new) {
695 cur = rb_entry(*new, struct mlx5_cache_ent, node);
696 parent = *new;
697 cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key);
698 if (cmp > 0)
699 new = &((*new)->rb_left);
700 if (cmp < 0)
701 new = &((*new)->rb_right);
702 if (cmp == 0)
703 return -EEXIST;
704 }
705
706 /* Add new node and rebalance tree. */
707 rb_link_node(&ent->node, parent, new);
708 rb_insert_color(&ent->node, &cache->rb_root);
709
710 return 0;
711 }
712
713 static struct mlx5_cache_ent *
714 mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
715 struct mlx5r_cache_rb_key rb_key)
716 {
717 struct rb_node *node = dev->cache.rb_root.rb_node;
718 struct mlx5_cache_ent *cur, *smallest = NULL;
719 u64 ndescs_limit;
720 int cmp;
721
722 /*
723 * Find the smallest ent with order >= requested_order.
724 */
725 while (node) {
726 cur = rb_entry(node, struct mlx5_cache_ent, node);
727 cmp = cache_ent_key_cmp(cur->rb_key, rb_key);
728 if (cmp > 0) {
729 smallest = cur;
730 node = node->rb_left;
731 }
732 if (cmp < 0)
733 node = node->rb_right;
734 if (cmp == 0)
735 return cur;
736 }
737
738 /*
739 * Limit the usage of mkeys larger than twice the required size while
740 * also allowing the usage of the smallest cache entry for small MRs.
741 */
742 ndescs_limit = max_t(u64, rb_key.ndescs * 2,
743 MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS);
744
745 return (smallest &&
746 smallest->rb_key.access_mode == rb_key.access_mode &&
747 smallest->rb_key.access_flags == rb_key.access_flags &&
748 smallest->rb_key.ats == rb_key.ats &&
749 smallest->rb_key.ndescs <= ndescs_limit) ?
750 smallest :
751 NULL;
752 }
753
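/*
* Take an MR from the given cache entry; on a cache miss the mkey is created
* synchronously rather than failing the allocation.
*/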
754 static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
755 struct mlx5_cache_ent *ent,
756 int access_flags)
757 {
758 struct mlx5_ib_mr *mr;
759 int err;
760
761 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
762 if (!mr)
763 return ERR_PTR(-ENOMEM);
764
765 xa_lock_irq(&ent->mkeys);
766 ent->in_use++;
767
768 if (!ent->stored) {
769 queue_adjust_cache_locked(ent);
770 ent->miss++;
771 xa_unlock_irq(&ent->mkeys);
772 err = create_cache_mkey(ent, &mr->mmkey.key);
773 if (err) {
774 xa_lock_irq(&ent->mkeys);
775 ent->in_use--;
776 xa_unlock_irq(&ent->mkeys);
777 kfree(mr);
778 return ERR_PTR(err);
779 }
780 } else {
781 mr->mmkey.key = pop_stored_mkey(ent);
782 queue_adjust_cache_locked(ent);
783 xa_unlock_irq(&ent->mkeys);
784 }
785 mr->mmkey.cache_ent = ent;
786 mr->mmkey.type = MLX5_MKEY_MR;
787 init_waitqueue_head(&mr->mmkey.wait);
788 return mr;
789 }
790
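/*
* Return the subset of @access_flags that UMR cannot rewrite on this device;
* cache entries are keyed on these flags.
*/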
791 static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
792 int access_flags)
793 {
794 int ret = 0;
795
796 if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
797 MLX5_CAP_GEN(dev->mdev, atomic) &&
798 MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
799 ret |= IB_ACCESS_REMOTE_ATOMIC;
800
801 if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
802 MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
803 !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
804 ret |= IB_ACCESS_RELAXED_ORDERING;
805
806 if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
807 (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
808 MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_pci_enabled)) &&
809 !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
810 ret |= IB_ACCESS_RELAXED_ORDERING;
811
812 return ret;
813 }
814
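/*
* Allocate an MR backed by a cached mkey matching the requested access mode,
* unchangeable access flags and descriptor count. Returns -EOPNOTSUPP when no
* suitable cache entry exists.
*/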
815 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
816 int access_flags, int access_mode,
817 int ndescs)
818 {
819 struct mlx5r_cache_rb_key rb_key = {
820 .ndescs = ndescs,
821 .access_mode = access_mode,
822 .access_flags = get_unchangeable_access_flags(dev, access_flags)
823 };
824 struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key);
825
826 if (!ent)
827 return ERR_PTR(-EOPNOTSUPP);
828
829 return _mlx5_mr_cache_alloc(dev, ent, access_flags);
830 }
831
832 static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
833 {
834 u32 mkey;
835
836 cancel_delayed_work(&ent->dwork);
837 xa_lock_irq(&ent->mkeys);
838 while (ent->stored) {
839 mkey = pop_stored_mkey(ent);
840 xa_unlock_irq(&ent->mkeys);
841 mlx5_core_destroy_mkey(dev->mdev, mkey);
842 xa_lock_irq(&ent->mkeys);
843 }
844 xa_unlock_irq(&ent->mkeys);
845 }
846
847 static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
848 {
849 if (!mlx5_debugfs_root || dev->is_rep)
850 return;
851
852 debugfs_remove_recursive(dev->cache.fs_root);
853 dev->cache.fs_root = NULL;
854 }
855
856 static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev,
857 struct mlx5_cache_ent *ent)
858 {
859 int order = order_base_2(ent->rb_key.ndescs);
860 struct dentry *dir;
861
862 if (!mlx5_debugfs_root || dev->is_rep)
863 return;
864
865 if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
866 order = MLX5_IMR_KSM_CACHE_ENTRY + 2;
867
868 sprintf(ent->name, "%d", order);
869 dir = debugfs_create_dir(ent->name, dev->cache.fs_root);
870 debugfs_create_file("size", 0600, dir, ent, &size_fops);
871 debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
872 debugfs_create_ulong("cur", 0400, dir, &ent->stored);
873 debugfs_create_u32("miss", 0600, dir, &ent->miss);
874 }
875
876 static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
877 {
878 struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev);
879 struct mlx5_mkey_cache *cache = &dev->cache;
880
881 if (!mlx5_debugfs_root || dev->is_rep)
882 return;
883
884 cache->fs_root = debugfs_create_dir("mr_cache", dbg_root);
885 }
886
887 static void delay_time_func(struct timer_list *t)
888 {
889 struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
890
891 WRITE_ONCE(dev->fill_delay, 0);
892 }
893
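/*
* Create a cache entry for @rb_key and insert it into the RB tree. Persistent
* entries get a limit and a debugfs directory; temporary entries are reaped
* later by remove_ent_work_func().
*/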
894 struct mlx5_cache_ent *
895 mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
896 struct mlx5r_cache_rb_key rb_key,
897 bool persistent_entry)
898 {
899 struct mlx5_cache_ent *ent;
900 int order;
901 int ret;
902
903 ent = kzalloc(sizeof(*ent), GFP_KERNEL);
904 if (!ent)
905 return ERR_PTR(-ENOMEM);
906
907 xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
908 ent->rb_key = rb_key;
909 ent->dev = dev;
910 ent->is_tmp = !persistent_entry;
911
912 INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
913
914 ret = mlx5_cache_ent_insert(&dev->cache, ent);
915 if (ret) {
916 kfree(ent);
917 return ERR_PTR(ret);
918 }
919
920 if (persistent_entry) {
921 if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
922 order = MLX5_IMR_KSM_CACHE_ENTRY;
923 else
924 order = order_base_2(rb_key.ndescs) - 2;
925
926 if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
927 !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
928 mlx5r_umr_can_load_pas(dev, 0))
929 ent->limit = dev->mdev->profile.mr_cache[order].limit;
930 else
931 ent->limit = 0;
932
933 mlx5_mkey_cache_debugfs_add_ent(dev, ent);
934 } else {
935 mod_delayed_work(ent->dev->cache.wq,
936 &ent->dev->cache.remove_ent_dwork,
937 msecs_to_jiffies(30 * 1000));
938 }
939
940 return ent;
941 }
942
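/*
* Delayed work that reaps temporary cache entries: walk the tree from the
* largest key backwards and destroy the stored mkeys of every is_tmp entry.
*/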
943 static void remove_ent_work_func(struct work_struct *work)
944 {
945 struct mlx5_mkey_cache *cache;
946 struct mlx5_cache_ent *ent;
947 struct rb_node *cur;
948
949 cache = container_of(work, struct mlx5_mkey_cache,
950 remove_ent_dwork.work);
951 mutex_lock(&cache->rb_lock);
952 cur = rb_last(&cache->rb_root);
953 while (cur) {
954 ent = rb_entry(cur, struct mlx5_cache_ent, node);
955 cur = rb_prev(cur);
956 mutex_unlock(&cache->rb_lock);
957
958 xa_lock_irq(&ent->mkeys);
959 if (!ent->is_tmp) {
960 xa_unlock_irq(&ent->mkeys);
961 mutex_lock(&cache->rb_lock);
962 continue;
963 }
964 xa_unlock_irq(&ent->mkeys);
965
966 clean_keys(ent->dev, ent);
967 mutex_lock(&cache->rb_lock);
968 }
969 mutex_unlock(&cache->rb_lock);
970 }
971
972 int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
973 {
974 struct mlx5_mkey_cache *cache = &dev->cache;
975 struct rb_root *root = &dev->cache.rb_root;
976 struct mlx5r_cache_rb_key rb_key = {
977 .access_mode = MLX5_MKC_ACCESS_MODE_MTT,
978 };
979 struct mlx5_cache_ent *ent;
980 struct rb_node *node;
981 int ret;
982 int i;
983
984 mutex_init(&dev->slow_path_mutex);
985 mutex_init(&dev->cache.rb_lock);
986 dev->cache.rb_root = RB_ROOT;
987 INIT_DELAYED_WORK(&dev->cache.remove_ent_dwork, remove_ent_work_func);
988 cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
989 if (!cache->wq) {
990 mlx5_ib_warn(dev, "failed to create work queue\n");
991 return -ENOMEM;
992 }
993
994 mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
995 timer_setup(&dev->delay_timer, delay_time_func, 0);
996 mlx5_mkey_cache_debugfs_init(dev);
997 mutex_lock(&cache->rb_lock);
998 for (i = 0; i <= mkey_cache_max_order(dev); i++) {
999 rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i;
1000 ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
1001 if (IS_ERR(ent)) {
1002 ret = PTR_ERR(ent);
1003 goto err;
1004 }
1005 }
1006
1007 ret = mlx5_odp_init_mkey_cache(dev);
1008 if (ret)
1009 goto err;
1010
1011 mutex_unlock(&cache->rb_lock);
1012 for (node = rb_first(root); node; node = rb_next(node)) {
1013 ent = rb_entry(node, struct mlx5_cache_ent, node);
1014 xa_lock_irq(&ent->mkeys);
1015 queue_adjust_cache_locked(ent);
1016 xa_unlock_irq(&ent->mkeys);
1017 }
1018
1019 return 0;
1020
1021 err:
1022 mutex_unlock(&cache->rb_lock);
1023 mlx5_mkey_cache_debugfs_cleanup(dev);
1024 mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
1025 return ret;
1026 }
1027
1028 void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
1029 {
1030 struct rb_root *root = &dev->cache.rb_root;
1031 struct mlx5_cache_ent *ent;
1032 struct rb_node *node;
1033
1034 if (!dev->cache.wq)
1035 return;
1036
1037 mutex_lock(&dev->cache.rb_lock);
1038 cancel_delayed_work(&dev->cache.remove_ent_dwork);
1039 for (node = rb_first(root); node; node = rb_next(node)) {
1040 ent = rb_entry(node, struct mlx5_cache_ent, node);
1041 xa_lock_irq(&ent->mkeys);
1042 ent->disabled = true;
1043 xa_unlock_irq(&ent->mkeys);
1044 cancel_delayed_work(&ent->dwork);
1045 }
1046 mutex_unlock(&dev->cache.rb_lock);
1047
1048 /*
1049 * After all entries are disabled and will not reschedule on WQ,
1050 * flush it and all async commands.
1051 */
1052 flush_workqueue(dev->cache.wq);
1053
1054 mlx5_mkey_cache_debugfs_cleanup(dev);
1055 mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
1056
1057 /* At this point all entries are disabled and have no concurrent work. */
1058 mutex_lock(&dev->cache.rb_lock);
1059 node = rb_first(root);
1060 while (node) {
1061 ent = rb_entry(node, struct mlx5_cache_ent, node);
1062 node = rb_next(node);
1063 clean_keys(dev, ent);
1064 rb_erase(&ent->node, root);
1065 kfree(ent);
1066 }
1067 mutex_unlock(&dev->cache.rb_lock);
1068
1069 destroy_workqueue(dev->cache.wq);
1070 del_timer_sync(&dev->delay_timer);
1071 }
1072
1073 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
1074 {
1075 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1076 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1077 struct mlx5_ib_mr *mr;
1078 void *mkc;
1079 u32 *in;
1080 int err;
1081
1082 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1083 if (!mr)
1084 return ERR_PTR(-ENOMEM);
1085
1086 in = kzalloc(inlen, GFP_KERNEL);
1087 if (!in) {
1088 err = -ENOMEM;
1089 goto err_free;
1090 }
1091
1092 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1093
1094 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
1095 MLX5_SET(mkc, mkc, length64, 1);
1096 set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
1097 pd);
1098
1099 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1100 if (err)
1101 goto err_in;
1102
1103 kfree(in);
1104 mr->mmkey.type = MLX5_MKEY_MR;
1105 mr->ibmr.lkey = mr->mmkey.key;
1106 mr->ibmr.rkey = mr->mmkey.key;
1107 mr->umem = NULL;
1108
1109 return &mr->ibmr;
1110
1111 err_in:
1112 kfree(in);
1113
1114 err_free:
1115 kfree(mr);
1116
1117 return ERR_PTR(err);
1118 }
1119
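/*
* Number of octowords (two 8-byte translation entries each) needed to map
* @len bytes starting at @addr with the given page shift.
*/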
1120 static int get_octo_len(u64 addr, u64 len, int page_shift)
1121 {
1122 u64 page_size = 1ULL << page_shift;
1123 u64 offset;
1124 int npages;
1125
1126 offset = addr & (page_size - 1);
1127 npages = ALIGN(len + offset, page_size) >> page_shift;
1128 return (npages + 1) / 2;
1129 }
1130
1131 static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
1132 {
1133 if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
1134 return MKEY_CACHE_LAST_STD_ENTRY;
1135 return MLX5_MAX_UMR_SHIFT;
1136 }
1137
1138 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
1139 u64 length, int access_flags, u64 iova)
1140 {
1141 mr->ibmr.lkey = mr->mmkey.key;
1142 mr->ibmr.rkey = mr->mmkey.key;
1143 mr->ibmr.length = length;
1144 mr->ibmr.device = &dev->ib_dev;
1145 mr->ibmr.iova = iova;
1146 mr->access_flags = access_flags;
1147 }
1148
1149 static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
1150 u64 iova)
1151 {
1152 /*
1153 * The alignment of iova has already been checked upon entering
1154 * UVERBS_METHOD_REG_DMABUF_MR
1155 */
1156 umem->iova = iova;
1157 return PAGE_SIZE;
1158 }
1159
1160 static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
1161 struct ib_umem *umem, u64 iova,
1162 int access_flags)
1163 {
1164 struct mlx5r_cache_rb_key rb_key = {
1165 .access_mode = MLX5_MKC_ACCESS_MODE_MTT,
1166 };
1167 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1168 struct mlx5_cache_ent *ent;
1169 struct mlx5_ib_mr *mr;
1170 unsigned int page_size;
1171
1172 if (umem->is_dmabuf)
1173 page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
1174 else
1175 page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
1176 0, iova);
1177 if (WARN_ON(!page_size))
1178 return ERR_PTR(-EINVAL);
1179
1180 rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
1181 rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
1182 rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
1183 ent = mkey_cache_ent_from_rb_key(dev, rb_key);
1184 /*
1185 * If the MR can't come from the cache then synchronously create an uncached
1186 * one.
1187 */
1188 if (!ent) {
1189 mutex_lock(&dev->slow_path_mutex);
1190 mr = reg_create(pd, umem, iova, access_flags, page_size, false);
1191 mutex_unlock(&dev->slow_path_mutex);
1192 if (IS_ERR(mr))
1193 return mr;
1194 mr->mmkey.rb_key = rb_key;
1195 return mr;
1196 }
1197
1198 mr = _mlx5_mr_cache_alloc(dev, ent, access_flags);
1199 if (IS_ERR(mr))
1200 return mr;
1201
1202 mr->ibmr.pd = pd;
1203 mr->umem = umem;
1204 mr->page_shift = order_base_2(page_size);
1205 set_mr_fields(dev, mr, umem->length, access_flags, iova);
1206
1207 return mr;
1208 }
1209
1210 /*
1211 * Synchronously create an MR via CREATE_MKEY. When "populate" is set the page
1212 * list is written into the command itself; otherwise the mkey is created free.
1213 */
1214 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
1215 u64 iova, int access_flags,
1216 unsigned int page_size, bool populate)
1217 {
1218 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1219 struct mlx5_ib_mr *mr;
1220 __be64 *pas;
1221 void *mkc;
1222 int inlen;
1223 u32 *in;
1224 int err;
1225 bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
1226
1227 if (!page_size)
1228 return ERR_PTR(-EINVAL);
1229 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1230 if (!mr)
1231 return ERR_PTR(-ENOMEM);
1232
1233 mr->ibmr.pd = pd;
1234 mr->access_flags = access_flags;
1235 mr->page_shift = order_base_2(page_size);
1236
1237 inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1238 if (populate)
1239 inlen += sizeof(*pas) *
1240 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
1241 in = kvzalloc(inlen, GFP_KERNEL);
1242 if (!in) {
1243 err = -ENOMEM;
1244 goto err_1;
1245 }
1246 pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1247 if (populate) {
1248 if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
1249 err = -EINVAL;
1250 goto err_2;
1251 }
1252 mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
1253 pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1254 }
1255
1256 /* The pg_access bit allows setting the access flags
1257 * in the page list submitted with the command.
1258 */
1259 MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
1260
1261 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1262 set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
1263 populate ? pd : dev->umrc.pd);
1264 MLX5_SET(mkc, mkc, free, !populate);
1265 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
1266 MLX5_SET(mkc, mkc, umr_en, 1);
1267
1268 MLX5_SET64(mkc, mkc, len, umem->length);
1269 MLX5_SET(mkc, mkc, bsf_octword_size, 0);
1270 MLX5_SET(mkc, mkc, translations_octword_size,
1271 get_octo_len(iova, umem->length, mr->page_shift));
1272 MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
1273 if (mlx5_umem_needs_ats(dev, umem, access_flags))
1274 MLX5_SET(mkc, mkc, ma_translation_mode, 1);
1275 if (populate) {
1276 MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
1277 get_octo_len(iova, umem->length, mr->page_shift));
1278 }
1279
1280 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1281 if (err) {
1282 mlx5_ib_warn(dev, "create mkey failed\n");
1283 goto err_2;
1284 }
1285 mr->mmkey.type = MLX5_MKEY_MR;
1286 mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift);
1287 mr->umem = umem;
1288 set_mr_fields(dev, mr, umem->length, access_flags, iova);
1289 kvfree(in);
1290
1291 mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
1292
1293 return mr;
1294
1295 err_2:
1296 kvfree(in);
1297 err_1:
1298 kfree(mr);
1299 return ERR_PTR(err);
1300 }
1301
1302 static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
1303 u64 length, int acc, int mode)
1304 {
1305 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1306 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1307 struct mlx5_ib_mr *mr;
1308 void *mkc;
1309 u32 *in;
1310 int err;
1311
1312 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1313 if (!mr)
1314 return ERR_PTR(-ENOMEM);
1315
1316 in = kzalloc(inlen, GFP_KERNEL);
1317 if (!in) {
1318 err = -ENOMEM;
1319 goto err_free;
1320 }
1321
1322 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1323
1324 MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
1325 MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
1326 MLX5_SET64(mkc, mkc, len, length);
1327 set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
1328
1329 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1330 if (err)
1331 goto err_in;
1332
1333 kfree(in);
1334
1335 set_mr_fields(dev, mr, length, acc, start_addr);
1336
1337 return &mr->ibmr;
1338
1339 err_in:
1340 kfree(in);
1341
1342 err_free:
1343 kfree(mr);
1344
1345 return ERR_PTR(err);
1346 }
1347
1348 int mlx5_ib_advise_mr(struct ib_pd *pd,
1349 enum ib_uverbs_advise_mr_advice advice,
1350 u32 flags,
1351 struct ib_sge *sg_list,
1352 u32 num_sge,
1353 struct uverbs_attr_bundle *attrs)
1354 {
1355 if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
1356 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1357 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1358 return -EOPNOTSUPP;
1359
1360 return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
1361 sg_list, num_sge);
1362 }
1363
1364 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
1365 struct ib_dm_mr_attr *attr,
1366 struct uverbs_attr_bundle *attrs)
1367 {
1368 struct mlx5_ib_dm *mdm = to_mdm(dm);
1369 struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
1370 u64 start_addr = mdm->dev_addr + attr->offset;
1371 int mode;
1372
1373 switch (mdm->type) {
1374 case MLX5_IB_UAPI_DM_TYPE_MEMIC:
1375 if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
1376 return ERR_PTR(-EINVAL);
1377
1378 mode = MLX5_MKC_ACCESS_MODE_MEMIC;
1379 start_addr -= pci_resource_start(dev->pdev, 0);
1380 break;
1381 case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
1382 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
1383 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
1384 if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
1385 return ERR_PTR(-EINVAL);
1386
1387 mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
1388 break;
1389 default:
1390 return ERR_PTR(-EINVAL);
1391 }
1392
1393 return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
1394 attr->access_flags, mode);
1395 }
1396
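/*
* Register a regular user MR. Prefer a cacheable mkey whose page list is
* loaded through UMR; otherwise fall back to a synchronous CREATE_MKEY with
* the page list inlined in the command.
*/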
1397 static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
1398 u64 iova, int access_flags)
1399 {
1400 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1401 struct mlx5_ib_mr *mr = NULL;
1402 bool xlt_with_umr;
1403 int err;
1404
1405 xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
1406 if (xlt_with_umr) {
1407 mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
1408 } else {
1409 unsigned int page_size = mlx5_umem_find_best_pgsz(
1410 umem, mkc, log_page_size, 0, iova);
1411
1412 mutex_lock(&dev->slow_path_mutex);
1413 mr = reg_create(pd, umem, iova, access_flags, page_size, true);
1414 mutex_unlock(&dev->slow_path_mutex);
1415 }
1416 if (IS_ERR(mr)) {
1417 ib_umem_release(umem);
1418 return ERR_CAST(mr);
1419 }
1420
1421 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1422
1423 atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1424
1425 if (xlt_with_umr) {
1426 /*
1427 * If the MR was created with reg_create then it will be
1428 * configured properly but left disabled. It is safe to go ahead
1429 * and configure it again via UMR while enabling it.
1430 */
1431 err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
1432 if (err) {
1433 mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1434 return ERR_PTR(err);
1435 }
1436 }
1437 return &mr->ibmr;
1438 }
1439
1440 static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
1441 u64 iova, int access_flags,
1442 struct ib_udata *udata)
1443 {
1444 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1445 struct ib_umem_odp *odp;
1446 struct mlx5_ib_mr *mr;
1447 int err;
1448
1449 if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1450 return ERR_PTR(-EOPNOTSUPP);
1451
1452 err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
1453 if (err)
1454 return ERR_PTR(err);
1455 if (!start && length == U64_MAX) {
1456 if (iova != 0)
1457 return ERR_PTR(-EINVAL);
1458 if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1459 return ERR_PTR(-EINVAL);
1460
1461 mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
1462 if (IS_ERR(mr))
1463 return ERR_CAST(mr);
1464 return &mr->ibmr;
1465 }
1466
1467 /* ODP requires xlt update via umr to work. */
1468 if (!mlx5r_umr_can_load_pas(dev, length))
1469 return ERR_PTR(-EINVAL);
1470
1471 odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
1472 &mlx5_mn_ops);
1473 if (IS_ERR(odp))
1474 return ERR_CAST(odp);
1475
1476 mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags);
1477 if (IS_ERR(mr)) {
1478 ib_umem_release(&odp->umem);
1479 return ERR_CAST(mr);
1480 }
1481 xa_init(&mr->implicit_children);
1482
1483 odp->private = mr;
1484 err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1485 if (err)
1486 goto err_dereg_mr;
1487
1488 err = mlx5_ib_init_odp_mr(mr);
1489 if (err)
1490 goto err_dereg_mr;
1491 return &mr->ibmr;
1492
1493 err_dereg_mr:
1494 mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1495 return ERR_PTR(err);
1496 }
1497
1498 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1499 u64 iova, int access_flags,
1500 struct ib_udata *udata)
1501 {
1502 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1503 struct ib_umem *umem;
1504
1505 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1506 return ERR_PTR(-EOPNOTSUPP);
1507
1508 mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1509 start, iova, length, access_flags);
1510
1511 if (access_flags & IB_ACCESS_ON_DEMAND)
1512 return create_user_odp_mr(pd, start, length, iova, access_flags,
1513 udata);
1514 umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
1515 if (IS_ERR(umem))
1516 return ERR_CAST(umem);
1517 return create_real_mr(pd, umem, iova, access_flags);
1518 }
1519
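/*
* move_notify callback for dma-buf MRs: zap the MR's translation entries and
* unmap the pages before the exporter moves the buffer.
*/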
1520 static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
1521 {
1522 struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
1523 struct mlx5_ib_mr *mr = umem_dmabuf->private;
1524
1525 dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
1526
1527 if (!umem_dmabuf->sgt)
1528 return;
1529
1530 mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
1531 ib_umem_dmabuf_unmap_pages(umem_dmabuf);
1532 }
1533
1534 static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
1535 .allow_peer2peer = 1,
1536 .move_notify = mlx5_ib_dmabuf_invalidate_cb,
1537 };
1538
1539 struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
1540 u64 length, u64 virt_addr,
1541 int fd, int access_flags,
1542 struct ib_udata *udata)
1543 {
1544 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1545 struct mlx5_ib_mr *mr = NULL;
1546 struct ib_umem_dmabuf *umem_dmabuf;
1547 int err;
1548
1549 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
1550 !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1551 return ERR_PTR(-EOPNOTSUPP);
1552
1553 mlx5_ib_dbg(dev,
1554 "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
1555 offset, virt_addr, length, fd, access_flags);
1556
1557 /* dmabuf requires xlt update via umr to work. */
1558 if (!mlx5r_umr_can_load_pas(dev, length))
1559 return ERR_PTR(-EINVAL);
1560
1561 umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd,
1562 access_flags,
1563 &mlx5_ib_dmabuf_attach_ops);
1564 if (IS_ERR(umem_dmabuf)) {
1565 mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
1566 PTR_ERR(umem_dmabuf));
1567 return ERR_CAST(umem_dmabuf);
1568 }
1569
1570 mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
1571 access_flags);
1572 if (IS_ERR(mr)) {
1573 ib_umem_release(&umem_dmabuf->umem);
1574 return ERR_CAST(mr);
1575 }
1576
1577 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1578
1579 atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
1580 umem_dmabuf->private = mr;
1581 err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1582 if (err)
1583 goto err_dereg_mr;
1584
1585 err = mlx5_ib_init_dmabuf_mr(mr);
1586 if (err)
1587 goto err_dereg_mr;
1588 return &mr->ibmr;
1589
1590 err_dereg_mr:
1591 mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1592 return ERR_PTR(err);
1593 }
1594
1595 /*
1596 * True if the change in access flags can be done via UMR, only some access
1597 * flags can be updated.
1598 */
1599 static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
1600 unsigned int current_access_flags,
1601 unsigned int target_access_flags)
1602 {
1603 unsigned int diffs = current_access_flags ^ target_access_flags;
1604
1605 if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
1606 IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING |
1607 IB_ACCESS_REMOTE_ATOMIC))
1608 return false;
1609 return mlx5r_umr_can_reconfig(dev, current_access_flags,
1610 target_access_flags);
1611 }
1612
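/*
* True if the MR's cached mkey has room for the new umem's page list, so the
* PAS can be replaced through UMR instead of recreating the MR.
*/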
1613 static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
1614 struct ib_umem *new_umem,
1615 int new_access_flags, u64 iova,
1616 unsigned long *page_size)
1617 {
1618 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1619
1620 /* We only track the allocated sizes of MRs from the cache */
1621 if (!mr->mmkey.cache_ent)
1622 return false;
1623 if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
1624 return false;
1625
1626 *page_size =
1627 mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
1628 if (WARN_ON(!*page_size))
1629 return false;
1630 return (mr->mmkey.cache_ent->rb_key.ndescs) >=
1631 ib_umem_num_dma_blocks(new_umem, *page_size);
1632 }
1633
1634 static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
1635 int access_flags, int flags, struct ib_umem *new_umem,
1636 u64 iova, unsigned long page_size)
1637 {
1638 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1639 int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
1640 struct ib_umem *old_umem = mr->umem;
1641 int err;
1642
1643 /*
1644 * To keep everything simple the MR is revoked before we start to mess
1645 * with it. This ensures the change is atomic relative to any use of the
1646 * MR.
1647 */
1648 err = mlx5r_umr_revoke_mr(mr);
1649 if (err)
1650 return err;
1651
1652 if (flags & IB_MR_REREG_PD) {
1653 mr->ibmr.pd = pd;
1654 upd_flags |= MLX5_IB_UPD_XLT_PD;
1655 }
1656 if (flags & IB_MR_REREG_ACCESS) {
1657 mr->access_flags = access_flags;
1658 upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
1659 }
1660
1661 mr->ibmr.iova = iova;
1662 mr->ibmr.length = new_umem->length;
1663 mr->page_shift = order_base_2(page_size);
1664 mr->umem = new_umem;
1665 err = mlx5r_umr_update_mr_pas(mr, upd_flags);
1666 if (err) {
1667 /*
1668 * The MR is revoked at this point so it is safe to free
1669 * new_umem.
1670 */
1671 mr->umem = old_umem;
1672 return err;
1673 }
1674
1675 atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
1676 ib_umem_release(old_umem);
1677 atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
1678 return 0;
1679 }
1680
1681 struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1682 u64 length, u64 iova, int new_access_flags,
1683 struct ib_pd *new_pd,
1684 struct ib_udata *udata)
1685 {
1686 struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1687 struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1688 int err;
1689
1690 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1691 return ERR_PTR(-EOPNOTSUPP);
1692
1693 mlx5_ib_dbg(
1694 dev,
1695 "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1696 start, iova, length, new_access_flags);
1697
1698 if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
1699 return ERR_PTR(-EOPNOTSUPP);
1700
1701 if (!(flags & IB_MR_REREG_ACCESS))
1702 new_access_flags = mr->access_flags;
1703 if (!(flags & IB_MR_REREG_PD))
1704 new_pd = ib_mr->pd;
1705
1706 if (!(flags & IB_MR_REREG_TRANS)) {
1707 struct ib_umem *umem;
1708
1709 /* Fast path for PD/access change */
1710 if (can_use_umr_rereg_access(dev, mr->access_flags,
1711 new_access_flags)) {
1712 err = mlx5r_umr_rereg_pd_access(mr, new_pd,
1713 new_access_flags);
1714 if (err)
1715 return ERR_PTR(err);
1716 return NULL;
1717 }
1718 /* DM or ODP MRs don't have a normal umem so we can't re-use it */
1719 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1720 goto recreate;
1721
1722 /*
1723 * Only one active MR can refer to a umem at one time; revoke
1724 * the old MR before assigning the umem to the new one.
1725 */
1726 err = mlx5r_umr_revoke_mr(mr);
1727 if (err)
1728 return ERR_PTR(err);
1729 umem = mr->umem;
1730 mr->umem = NULL;
1731 atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1732
1733 return create_real_mr(new_pd, umem, mr->ibmr.iova,
1734 new_access_flags);
1735 }
1736
1737 /*
1738 * DM doesn't have a PAS list so we can't re-use it; odp/dmabuf does,
1739 * but the logic around releasing the umem is different.
1740 */
1741 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1742 goto recreate;
1743
1744 if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
1745 can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
1746 struct ib_umem *new_umem;
1747 unsigned long page_size;
1748
1749 new_umem = ib_umem_get(&dev->ib_dev, start, length,
1750 new_access_flags);
1751 if (IS_ERR(new_umem))
1752 return ERR_CAST(new_umem);
1753
1754 /* Fast path for PAS change */
1755 if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
1756 &page_size)) {
1757 err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
1758 new_umem, iova, page_size);
1759 if (err) {
1760 ib_umem_release(new_umem);
1761 return ERR_PTR(err);
1762 }
1763 return NULL;
1764 }
1765 return create_real_mr(new_pd, new_umem, iova, new_access_flags);
1766 }
1767
1768 /*
1769 * Everything else has no state we can preserve, just create a new MR
1770 * from scratch
1771 */
1772 recreate:
1773 return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
1774 new_access_flags, udata);
1775 }
1776
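/*
* Allocate the private descriptor array for this MR, aligned to
* MLX5_UMR_ALIGN, and DMA-map it towards the device.
*/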
1777 static int
1778 mlx5_alloc_priv_descs(struct ib_device *device,
1779 struct mlx5_ib_mr *mr,
1780 int ndescs,
1781 int desc_size)
1782 {
1783 struct mlx5_ib_dev *dev = to_mdev(device);
1784 struct device *ddev = &dev->mdev->pdev->dev;
1785 int size = ndescs * desc_size;
1786 int add_size;
1787 int ret;
1788
1789 add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
1790 if (is_power_of_2(MLX5_UMR_ALIGN) && add_size) {
1791 int end = max_t(int, MLX5_UMR_ALIGN, roundup_pow_of_two(size));
1792
1793 add_size = min_t(int, end - size, add_size);
1794 }
1795
1796 mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1797 if (!mr->descs_alloc)
1798 return -ENOMEM;
1799
1800 mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1801
1802 mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
1803 if (dma_mapping_error(ddev, mr->desc_map)) {
1804 ret = -ENOMEM;
1805 goto err;
1806 }
1807
1808 return 0;
1809 err:
1810 kfree(mr->descs_alloc);
1811
1812 return ret;
1813 }
1814
1815 static void
1816 mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
1817 {
1818 if (!mr->umem && mr->descs) {
1819 struct ib_device *device = mr->ibmr.device;
1820 int size = mr->max_descs * mr->desc_size;
1821 struct mlx5_ib_dev *dev = to_mdev(device);
1822
1823 dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
1824 DMA_TO_DEVICE);
1825 kfree(mr->descs_alloc);
1826 mr->descs = NULL;
1827 }
1828 }
1829
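/*
 * Try to return the mkey to the mkey cache on deregistration: if the MR
 * originally came from a cache entry, push it back there; otherwise look up
 * (or create) an entry matching mmkey.rb_key and store the mkey on it.
 * A non-zero return tells the caller to destroy the mkey instead.
 */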
1830 static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
1831 struct mlx5_ib_mr *mr)
1832 {
1833 struct mlx5_mkey_cache *cache = &dev->cache;
1834 struct mlx5_cache_ent *ent;
1835 int ret;
1836
1837 if (mr->mmkey.cache_ent) {
1838 xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
1839 mr->mmkey.cache_ent->in_use--;
1840 goto end;
1841 }
1842
1843 mutex_lock(&cache->rb_lock);
1844 ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
1845 if (ent) {
1846 if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
1847 if (ent->disabled) {
1848 mutex_unlock(&cache->rb_lock);
1849 return -EOPNOTSUPP;
1850 }
1851 mr->mmkey.cache_ent = ent;
1852 xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
1853 mutex_unlock(&cache->rb_lock);
1854 goto end;
1855 }
1856 }
1857
1858 ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false);
1859 mutex_unlock(&cache->rb_lock);
1860 if (IS_ERR(ent))
1861 return PTR_ERR(ent);
1862
1863 mr->mmkey.cache_ent = ent;
1864 xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
1865
1866 end:
1867 ret = push_mkey_locked(mr->mmkey.cache_ent, false,
1868 xa_mk_value(mr->mmkey.key));
1869 xa_unlock_irq(&mr->mmkey.cache_ent->mkeys);
1870 return ret;
1871 }
1872
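/*
 * Deregistration path: wait out any ODP users of the mkey, tear down
 * integrity resources (PI MRs and PSVs), then either return the mkey to the
 * cache or destroy it, release the umem and free any private descriptors.
 */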
1873 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
1874 {
1875 struct mlx5_ib_mr *mr = to_mmr(ibmr);
1876 struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
1877 int rc;
1878
1879 /*
1880 * Any async use of the MR must hold the refcount. Once the refcount
1881 * goes to zero, no other thread (ODP page faults, prefetch, UMR
1882 * activity, etc.) can touch the mkey, so it is safe to destroy it.
1883 */
1884 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
1885 refcount_read(&mr->mmkey.usecount) != 0 &&
1886 xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
1887 mlx5r_deref_wait_odp_mkey(&mr->mmkey);
1888
1889 if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
1890 xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
1891 mr->sig, NULL, GFP_KERNEL);
1892
1893 if (mr->mtt_mr) {
1894 rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
1895 if (rc)
1896 return rc;
1897 mr->mtt_mr = NULL;
1898 }
1899 if (mr->klm_mr) {
1900 rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
1901 if (rc)
1902 return rc;
1903 mr->klm_mr = NULL;
1904 }
1905
1906 if (mlx5_core_destroy_psv(dev->mdev,
1907 mr->sig->psv_memory.psv_idx))
1908 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1909 mr->sig->psv_memory.psv_idx);
1910 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
1911 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1912 mr->sig->psv_wire.psv_idx);
1913 kfree(mr->sig);
1914 mr->sig = NULL;
1915 }
1916
1917 /* Stop DMA */
1918 if (mr->umem && mlx5r_umr_can_load_pas(dev, mr->umem->length))
1919 if (mlx5r_umr_revoke_mr(mr) ||
1920 cache_ent_find_and_store(dev, mr))
1921 mr->mmkey.cache_ent = NULL;
1922
1923 if (!mr->mmkey.cache_ent) {
1924 rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
1925 if (rc)
1926 return rc;
1927 }
1928
1929 if (mr->umem) {
1930 bool is_odp = is_odp_mr(mr);
1931
1932 if (!is_odp)
1933 atomic_sub(ib_umem_num_pages(mr->umem),
1934 &dev->mdev->priv.reg_pages);
1935 ib_umem_release(mr->umem);
1936 if (is_odp)
1937 mlx5_ib_free_odp_mr(mr);
1938 }
1939
1940 if (!mr->mmkey.cache_ent)
1941 mlx5_free_priv_descs(mr);
1942
1943 kfree(mr);
1944 return 0;
1945 }
1946
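/*
 * Initialize the mkey context for a kernel-owned mkey that is created in
 * the "free" state and later populated through UMR.
 */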
1947 static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
1948 int access_mode, int page_shift)
1949 {
1950 void *mkc;
1951
1952 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1953
1954 /* This is only used from the kernel, so setting the PD is OK. */
1955 set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
1956 MLX5_SET(mkc, mkc, free, 1);
1957 MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
1958 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
1959 MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
1960 MLX5_SET(mkc, mkc, umr_en, 1);
1961 MLX5_SET(mkc, mkc, log_page_size, page_shift);
1962 }
1963
1964 static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1965 int ndescs, int desc_size, int page_shift,
1966 int access_mode, u32 *in, int inlen)
1967 {
1968 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1969 int err;
1970
1971 mr->access_mode = access_mode;
1972 mr->desc_size = desc_size;
1973 mr->max_descs = ndescs;
1974
1975 err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
1976 if (err)
1977 return err;
1978
1979 mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
1980
1981 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1982 if (err)
1983 goto err_free_descs;
1984
1985 mr->mmkey.type = MLX5_MKEY_MR;
1986 mr->ibmr.lkey = mr->mmkey.key;
1987 mr->ibmr.rkey = mr->mmkey.key;
1988
1989 return 0;
1990
1991 err_free_descs:
1992 mlx5_free_priv_descs(mr);
1993 return err;
1994 }
1995
1996 static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
1997 u32 max_num_sg, u32 max_num_meta_sg,
1998 int desc_size, int access_mode)
1999 {
2000 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2001 int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
2002 int page_shift = 0;
2003 struct mlx5_ib_mr *mr;
2004 u32 *in;
2005 int err;
2006
2007 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2008 if (!mr)
2009 return ERR_PTR(-ENOMEM);
2010
2011 mr->ibmr.pd = pd;
2012 mr->ibmr.device = pd->device;
2013
2014 in = kzalloc(inlen, GFP_KERNEL);
2015 if (!in) {
2016 err = -ENOMEM;
2017 goto err_free;
2018 }
2019
2020 if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
2021 page_shift = PAGE_SHIFT;
2022
2023 err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
2024 access_mode, in, inlen);
2025 if (err)
2026 goto err_free_in;
2027
2028 mr->umem = NULL;
2029 kfree(in);
2030
2031 return mr;
2032
2033 err_free_in:
2034 kfree(in);
2035 err_free:
2036 kfree(mr);
2037 return ERR_PTR(err);
2038 }
2039
2040 static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2041 int ndescs, u32 *in, int inlen)
2042 {
2043 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
2044 PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
2045 inlen);
2046 }
2047
2048 static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2049 int ndescs, u32 *in, int inlen)
2050 {
2051 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
2052 0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2053 }
2054
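/*
 * Integrity MRs need three mkeys: internal KLM and MTT PI MRs for mapping
 * data plus protection information, and the BSF-enabled KLM mkey exposed to
 * the caller, together with a memory/wire PSV pair for signature tracking.
 * The signature context is indexed in dev->sig_mrs so errors can later be
 * looked up by mkey.
 */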
2055 static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2056 int max_num_sg, int max_num_meta_sg,
2057 u32 *in, int inlen)
2058 {
2059 struct mlx5_ib_dev *dev = to_mdev(pd->device);
2060 u32 psv_index[2];
2061 void *mkc;
2062 int err;
2063
2064 mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
2065 if (!mr->sig)
2066 return -ENOMEM;
2067
2068 /* create mem & wire PSVs */
2069 err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
2070 if (err)
2071 goto err_free_sig;
2072
2073 mr->sig->psv_memory.psv_idx = psv_index[0];
2074 mr->sig->psv_wire.psv_idx = psv_index[1];
2075
2076 mr->sig->sig_status_checked = true;
2077 mr->sig->sig_err_exists = false;
2078 /* Next UMR, Arm SIGERR */
2079 ++mr->sig->sigerr_count;
2080 mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2081 sizeof(struct mlx5_klm),
2082 MLX5_MKC_ACCESS_MODE_KLMS);
2083 if (IS_ERR(mr->klm_mr)) {
2084 err = PTR_ERR(mr->klm_mr);
2085 goto err_destroy_psv;
2086 }
2087 mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2088 sizeof(struct mlx5_mtt),
2089 MLX5_MKC_ACCESS_MODE_MTT);
2090 if (IS_ERR(mr->mtt_mr)) {
2091 err = PTR_ERR(mr->mtt_mr);
2092 goto err_free_klm_mr;
2093 }
2094
2095 /* Set bsf descriptors for mkey */
2096 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2097 MLX5_SET(mkc, mkc, bsf_en, 1);
2098 MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
2099
2100 err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
2101 MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2102 if (err)
2103 goto err_free_mtt_mr;
2104
2105 err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
2106 mr->sig, GFP_KERNEL));
2107 if (err)
2108 goto err_free_descs;
2109 return 0;
2110
2111 err_free_descs:
2112 destroy_mkey(dev, mr);
2113 mlx5_free_priv_descs(mr);
2114 err_free_mtt_mr:
2115 mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
2116 mr->mtt_mr = NULL;
2117 err_free_klm_mr:
2118 mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
2119 mr->klm_mr = NULL;
2120 err_destroy_psv:
2121 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
2122 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
2123 mr->sig->psv_memory.psv_idx);
2124 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
2125 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
2126 mr->sig->psv_wire.psv_idx);
2127 err_free_sig:
2128 kfree(mr->sig);
2129
2130 return err;
2131 }
2132
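/*
 * Common allocation path for kernel fast-registration MRs; the descriptor
 * layout (MTT vs. KLM) is chosen per mr_type.
 */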
2133 static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
2134 enum ib_mr_type mr_type, u32 max_num_sg,
2135 u32 max_num_meta_sg)
2136 {
2137 struct mlx5_ib_dev *dev = to_mdev(pd->device);
2138 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2139 int ndescs = ALIGN(max_num_sg, 4);
2140 struct mlx5_ib_mr *mr;
2141 u32 *in;
2142 int err;
2143
2144 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2145 if (!mr)
2146 return ERR_PTR(-ENOMEM);
2147
2148 in = kzalloc(inlen, GFP_KERNEL);
2149 if (!in) {
2150 err = -ENOMEM;
2151 goto err_free;
2152 }
2153
2154 mr->ibmr.device = pd->device;
2155 mr->umem = NULL;
2156
2157 switch (mr_type) {
2158 case IB_MR_TYPE_MEM_REG:
2159 err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
2160 break;
2161 case IB_MR_TYPE_SG_GAPS:
2162 err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
2163 break;
2164 case IB_MR_TYPE_INTEGRITY:
2165 err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
2166 max_num_meta_sg, in, inlen);
2167 break;
2168 default:
2169 mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
2170 err = -EINVAL;
2171 }
2172
2173 if (err)
2174 goto err_free_in;
2175
2176 kfree(in);
2177
2178 return &mr->ibmr;
2179
2180 err_free_in:
2181 kfree(in);
2182 err_free:
2183 kfree(mr);
2184 return ERR_PTR(err);
2185 }
2186
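/*
 * Usage sketch (illustrative only, not part of this driver): a kernel ULP
 * reaches this .alloc_mr hook through the core verbs, roughly as below.
 * 'pd', 'sgl' and 'nents' are assumed to be prepared by the caller and all
 * error handling is omitted:
 *
 *	struct ib_mr *mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, nents);
 *	int n = ib_map_mr_sg(mr, sgl, nents, NULL, PAGE_SIZE);
 *	struct ib_reg_wr reg = {
 *		.wr.opcode = IB_WR_REG_MR,
 *		.mr = mr,
 *		.key = mr->rkey,
 *		.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ,
 *	};
 *	post 'reg' on a QP, use mr->rkey, then ib_dereg_mr(mr).
 */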
2187 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
2188 u32 max_num_sg)
2189 {
2190 return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
2191 }
2192
2193 struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
2194 u32 max_num_sg, u32 max_num_meta_sg)
2195 {
2196 return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
2197 max_num_meta_sg);
2198 }
2199
2200 int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
2201 {
2202 struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
2203 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2204 struct mlx5_ib_mw *mw = to_mmw(ibmw);
2205 unsigned int ndescs;
2206 u32 *in = NULL;
2207 void *mkc;
2208 int err;
2209 struct mlx5_ib_alloc_mw req = {};
2210 struct {
2211 __u32 comp_mask;
2212 __u32 response_length;
2213 } resp = {};
2214
2215 err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
2216 if (err)
2217 return err;
2218
2219 if (req.comp_mask || req.reserved1 || req.reserved2)
2220 return -EOPNOTSUPP;
2221
2222 if (udata->inlen > sizeof(req) &&
2223 !ib_is_udata_cleared(udata, sizeof(req),
2224 udata->inlen - sizeof(req)))
2225 return -EOPNOTSUPP;
2226
2227 ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
2228
2229 in = kzalloc(inlen, GFP_KERNEL);
2230 if (!in)
2231 return -ENOMEM;
2232
2233 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2234
2235 MLX5_SET(mkc, mkc, free, 1);
2236 MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2237 MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
2238 MLX5_SET(mkc, mkc, umr_en, 1);
2239 MLX5_SET(mkc, mkc, lr, 1);
2240 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
2241 MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
2242 MLX5_SET(mkc, mkc, qpn, 0xffffff);
2243
2244 err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
2245 if (err)
2246 goto free;
2247
2248 mw->mmkey.type = MLX5_MKEY_MW;
2249 ibmw->rkey = mw->mmkey.key;
2250 mw->mmkey.ndescs = ndescs;
2251
2252 resp.response_length =
2253 min(offsetofend(typeof(resp), response_length), udata->outlen);
2254 if (resp.response_length) {
2255 err = ib_copy_to_udata(udata, &resp, resp.response_length);
2256 if (err)
2257 goto free_mkey;
2258 }
2259
2260 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
2261 err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
2262 if (err)
2263 goto free_mkey;
2264 }
2265
2266 kfree(in);
2267 return 0;
2268
2269 free_mkey:
2270 mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
2271 free:
2272 kfree(in);
2273 return err;
2274 }
2275
2276 int mlx5_ib_dealloc_mw(struct ib_mw *mw)
2277 {
2278 struct mlx5_ib_dev *dev = to_mdev(mw->device);
2279 struct mlx5_ib_mw *mmw = to_mmw(mw);
2280
2281 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
2282 xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
2283 /*
2284 * pagefault_single_data_segment() may be accessing mmw
2285 * if the user bound an ODP MR to this MW.
2286 */
2287 mlx5r_deref_wait_odp_mkey(&mmw->mmkey);
2288
2289 return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
2290 }
2291
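/*
 * Usage sketch (illustrative only, not part of this driver): after an
 * integrity operation completes, a ULP typically checks the signature
 * status through the core verb, roughly as below ('mr' is assumed to be an
 * IB_MR_TYPE_INTEGRITY MR, error handling omitted):
 *
 *	struct ib_mr_status st;
 *
 *	if (!ib_check_mr_status(mr, IB_MR_CHECK_SIG_STATUS, &st) &&
 *	    (st.fail_status & IB_MR_CHECK_SIG_STATUS))
 *		pr_err("sig error type %d at offset %llu, key 0x%x\n",
 *		       st.sig_err.err_type, st.sig_err.sig_err_offset,
 *		       st.sig_err.key);
 */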
2292 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
2293 struct ib_mr_status *mr_status)
2294 {
2295 struct mlx5_ib_mr *mmr = to_mmr(ibmr);
2296 int ret = 0;
2297
2298 if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
2299 pr_err("Invalid status check mask\n");
2300 ret = -EINVAL;
2301 goto done;
2302 }
2303
2304 mr_status->fail_status = 0;
2305 if (check_mask & IB_MR_CHECK_SIG_STATUS) {
2306 if (!mmr->sig) {
2307 ret = -EINVAL;
2308 pr_err("signature status check requested on a non-signature enabled MR\n");
2309 goto done;
2310 }
2311
2312 mmr->sig->sig_status_checked = true;
2313 if (!mmr->sig->sig_err_exists)
2314 goto done;
2315
2316 if (ibmr->lkey == mmr->sig->err_item.key)
2317 memcpy(&mr_status->sig_err, &mmr->sig->err_item,
2318 sizeof(mr_status->sig_err));
2319 else {
2320 mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
2321 mr_status->sig_err.sig_err_offset = 0;
2322 mr_status->sig_err.key = mmr->sig->err_item.key;
2323 }
2324
2325 mmr->sig->sig_err_exists = false;
2326 mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
2327 }
2328
2329 done:
2330 return ret;
2331 }
2332
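/*
 * Fast path for the PI mapping: when the data (and optionally the metadata)
 * scatterlist is a single DMA-contiguous element, address it directly with
 * the local DMA lkey instead of building descriptors. Returns how many of
 * the two lists were consumed this way.
 */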
2333 static int
2334 mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2335 int data_sg_nents, unsigned int *data_sg_offset,
2336 struct scatterlist *meta_sg, int meta_sg_nents,
2337 unsigned int *meta_sg_offset)
2338 {
2339 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2340 unsigned int sg_offset = 0;
2341 int n = 0;
2342
2343 mr->meta_length = 0;
2344 if (data_sg_nents == 1) {
2345 n++;
2346 mr->mmkey.ndescs = 1;
2347 if (data_sg_offset)
2348 sg_offset = *data_sg_offset;
2349 mr->data_length = sg_dma_len(data_sg) - sg_offset;
2350 mr->data_iova = sg_dma_address(data_sg) + sg_offset;
2351 if (meta_sg_nents == 1) {
2352 n++;
2353 mr->meta_ndescs = 1;
2354 if (meta_sg_offset)
2355 sg_offset = *meta_sg_offset;
2356 else
2357 sg_offset = 0;
2358 mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
2359 mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
2360 }
2361 ibmr->length = mr->data_length + mr->meta_length;
2362 }
2363
2364 return n;
2365 }
2366
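/*
 * Translate the data and (optional) metadata scatterlists into KLM
 * descriptors, one per SG element, each referencing the PD's
 * local_dma_lkey. Entries beyond max_descs are dropped; the caller detects
 * that from the short return count.
 */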
2367 static int
2368 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
2369 struct scatterlist *sgl,
2370 unsigned short sg_nents,
2371 unsigned int *sg_offset_p,
2372 struct scatterlist *meta_sgl,
2373 unsigned short meta_sg_nents,
2374 unsigned int *meta_sg_offset_p)
2375 {
2376 struct scatterlist *sg = sgl;
2377 struct mlx5_klm *klms = mr->descs;
2378 unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
2379 u32 lkey = mr->ibmr.pd->local_dma_lkey;
2380 int i, j = 0;
2381
2382 mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
2383 mr->ibmr.length = 0;
2384
2385 for_each_sg(sgl, sg, sg_nents, i) {
2386 if (unlikely(i >= mr->max_descs))
2387 break;
2388 klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
2389 klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
2390 klms[i].key = cpu_to_be32(lkey);
2391 mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2392
2393 sg_offset = 0;
2394 }
2395
2396 if (sg_offset_p)
2397 *sg_offset_p = sg_offset;
2398
2399 mr->mmkey.ndescs = i;
2400 mr->data_length = mr->ibmr.length;
2401
2402 if (meta_sg_nents) {
2403 sg = meta_sgl;
2404 sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
2405 for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
2406 if (unlikely(i + j >= mr->max_descs))
2407 break;
2408 klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
2409 sg_offset);
2410 klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
2411 sg_offset);
2412 klms[i + j].key = cpu_to_be32(lkey);
2413 mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2414
2415 sg_offset = 0;
2416 }
2417 if (meta_sg_offset_p)
2418 *meta_sg_offset_p = sg_offset;
2419
2420 mr->meta_ndescs = j;
2421 mr->meta_length = mr->ibmr.length - mr->data_length;
2422 }
2423
2424 return i + j;
2425 }
2426
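/*
 * Page-list callback handed to ib_sg_to_pages(): append one page address,
 * with the read/write enable bits set, to the private descriptor array.
 */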
2427 static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
2428 {
2429 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2430 __be64 *descs;
2431
2432 if (unlikely(mr->mmkey.ndescs == mr->max_descs))
2433 return -ENOMEM;
2434
2435 descs = mr->descs;
2436 descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2437
2438 return 0;
2439 }
2440
2441 static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
2442 {
2443 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2444 __be64 *descs;
2445
2446 if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
2447 return -ENOMEM;
2448
2449 descs = mr->descs;
2450 descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
2451 cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2452
2453 return 0;
2454 }
2455
2456 static int
2457 mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2458 int data_sg_nents, unsigned int *data_sg_offset,
2459 struct scatterlist *meta_sg, int meta_sg_nents,
2460 unsigned int *meta_sg_offset)
2461 {
2462 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2463 struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
2464 int n;
2465
2466 pi_mr->mmkey.ndescs = 0;
2467 pi_mr->meta_ndescs = 0;
2468 pi_mr->meta_length = 0;
2469
2470 ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2471 pi_mr->desc_size * pi_mr->max_descs,
2472 DMA_TO_DEVICE);
2473
2474 pi_mr->ibmr.page_size = ibmr->page_size;
2475 n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
2476 mlx5_set_page);
2477 if (n != data_sg_nents)
2478 return n;
2479
2480 pi_mr->data_iova = pi_mr->ibmr.iova;
2481 pi_mr->data_length = pi_mr->ibmr.length;
2482 pi_mr->ibmr.length = pi_mr->data_length;
2483 ibmr->length = pi_mr->data_length;
2484
2485 if (meta_sg_nents) {
2486 u64 page_mask = ~((u64)ibmr->page_size - 1);
2487 u64 iova = pi_mr->data_iova;
2488
2489 n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
2490 meta_sg_offset, mlx5_set_page_pi);
2491
2492 pi_mr->meta_length = pi_mr->ibmr.length;
2493 /*
2494 * The PI address for the HW is the offset of the metadata address
2495 * relative to the first data page address. It equals the first data
2496 * page address + the size of the data pages + the metadata offset
2497 * within the first metadata page.
2498 */
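/*
 * For example (hypothetical values): with a 4K page_size, data starting
 * at iova 0x10000 and covering 3 MTT entries, and metadata beginning at
 * page offset 0x100, this gives
 * pi_iova = 0x10000 + 3 * 4096 + 0x100 = 0x13100.
 */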
2499 pi_mr->pi_iova = (iova & page_mask) +
2500 pi_mr->mmkey.ndescs * ibmr->page_size +
2501 (pi_mr->ibmr.iova & ~page_mask);
2502 /*
2503 * To use one MTT MR for both data and metadata, we also register
2504 * the gaps between the end of the data and the start of the
2505 * metadata (the sig MR verifies that the HW accesses the right
2506 * addresses). This mapping is safe because we use an internal mkey
2507 * for the registration.
2508 */
2509 pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
2510 pi_mr->ibmr.iova = iova;
2511 ibmr->length += pi_mr->meta_length;
2512 }
2513
2514 ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2515 pi_mr->desc_size * pi_mr->max_descs,
2516 DMA_TO_DEVICE);
2517
2518 return n;
2519 }
2520
2521 static int
2522 mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2523 int data_sg_nents, unsigned int *data_sg_offset,
2524 struct scatterlist *meta_sg, int meta_sg_nents,
2525 unsigned int *meta_sg_offset)
2526 {
2527 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2528 struct mlx5_ib_mr *pi_mr = mr->klm_mr;
2529 int n;
2530
2531 pi_mr->mmkey.ndescs = 0;
2532 pi_mr->meta_ndescs = 0;
2533 pi_mr->meta_length = 0;
2534
2535 ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2536 pi_mr->desc_size * pi_mr->max_descs,
2537 DMA_TO_DEVICE);
2538
2539 n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
2540 meta_sg, meta_sg_nents, meta_sg_offset);
2541
2542 ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2543 pi_mr->desc_size * pi_mr->max_descs,
2544 DMA_TO_DEVICE);
2545
2546 /* This is a zero-based memory region */
2547 pi_mr->data_iova = 0;
2548 pi_mr->ibmr.iova = 0;
2549 pi_mr->pi_iova = pi_mr->data_length;
2550 ibmr->length = pi_mr->ibmr.length;
2551
2552 return n;
2553 }
2554
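/*
 * Usage sketch (illustrative only, not part of this driver): integrity
 * consumers reach this hook via ib_map_mr_sg_pi(), roughly as below.
 * 'pd', the SG lists and their counts are assumed to be prepared by the
 * caller and error handling is omitted:
 *
 *	mr = ib_alloc_mr_integrity(pd, max_num_sg, max_num_meta_sg);
 *	n = ib_map_mr_sg_pi(mr, data_sg, data_nents, NULL,
 *			    meta_sg, meta_nents, NULL, PAGE_SIZE);
 *	post an IB_WR_REG_MR_INTEGRITY work request, run the I/O, then
 *	check the result with ib_check_mr_status().
 */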
2555 int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2556 int data_sg_nents, unsigned int *data_sg_offset,
2557 struct scatterlist *meta_sg, int meta_sg_nents,
2558 unsigned int *meta_sg_offset)
2559 {
2560 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2561 struct mlx5_ib_mr *pi_mr = NULL;
2562 int n;
2563
2564 WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
2565
2566 mr->mmkey.ndescs = 0;
2567 mr->data_length = 0;
2568 mr->data_iova = 0;
2569 mr->meta_ndescs = 0;
2570 mr->pi_iova = 0;
2571 /*
2572 * As a performance optimization, avoid a UMR operation to register
2573 * the data/metadata buffers when possible. First try to map the sg
2574 * lists to PA descriptors with the local_dma_lkey, and fall back to
2575 * UMR only on failure.
2576 */
2577 n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2578 data_sg_offset, meta_sg, meta_sg_nents,
2579 meta_sg_offset);
2580 if (n == data_sg_nents + meta_sg_nents)
2581 goto out;
2582 /*
2583 * As a performance optimization, avoid mapping the sg lists to KLM
2584 * descriptors when possible. First try to map them to MTT
2585 * descriptors and fall back to KLM only on failure.
2586 * The HW works more efficiently with MTT descriptors
2587 * (especially under high load).
2588 * Use KLM (indirect access) only when it's mandatory.
2589 */
2590 pi_mr = mr->mtt_mr;
2591 n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2592 data_sg_offset, meta_sg, meta_sg_nents,
2593 meta_sg_offset);
2594 if (n == data_sg_nents + meta_sg_nents)
2595 goto out;
2596
2597 pi_mr = mr->klm_mr;
2598 n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2599 data_sg_offset, meta_sg, meta_sg_nents,
2600 meta_sg_offset);
2601 if (unlikely(n != data_sg_nents + meta_sg_nents))
2602 return -ENOMEM;
2603
2604 out:
2605 /* This is a zero-based memory region */
2606 ibmr->iova = 0;
2607 mr->pi_mr = pi_mr;
2608 if (pi_mr)
2609 ibmr->sig_attrs->meta_length = pi_mr->meta_length;
2610 else
2611 ibmr->sig_attrs->meta_length = mr->meta_length;
2612
2613 return 0;
2614 }
2615
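/*
 * .map_mr_sg hook reached via ib_map_mr_sg(): sync the descriptor buffer
 * for the CPU, fill MTTs (or KLMs for SG_GAPS MRs), sync it back for the
 * device, and return the number of SG entries that were mapped.
 */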
2616 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
2617 unsigned int *sg_offset)
2618 {
2619 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2620 int n;
2621
2622 mr->mmkey.ndescs = 0;
2623
2624 ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
2625 mr->desc_size * mr->max_descs,
2626 DMA_TO_DEVICE);
2627
2628 if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
2629 n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
2630 NULL);
2631 else
2632 n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
2633 mlx5_set_page);
2634
2635 ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
2636 mr->desc_size * mr->max_descs,
2637 DMA_TO_DEVICE);
2638
2639 return n;
2640 }
2641