1 /* 2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. 3 * Copyright (c) 2020, Intel Corporation. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 35 #include <linux/kref.h> 36 #include <linux/random.h> 37 #include <linux/debugfs.h> 38 #include <linux/export.h> 39 #include <linux/delay.h> 40 #include <linux/dma-buf.h> 41 #include <linux/dma-resv.h> 42 #include <rdma/ib_umem.h> 43 #include <rdma/ib_umem_odp.h> 44 #include <rdma/ib_verbs.h> 45 #include "dm.h" 46 #include "mlx5_ib.h" 47 48 /* 49 * We can't use an array for xlt_emergency_page because dma_map_single doesn't 50 * work on kernel modules memory 51 */ 52 void *xlt_emergency_page; 53 static DEFINE_MUTEX(xlt_emergency_page_mutex); 54 55 enum { 56 MAX_PENDING_REG_MR = 8, 57 }; 58 59 #define MLX5_UMR_ALIGN 2048 60 61 static void 62 create_mkey_callback(int status, struct mlx5_async_work *context); 63 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, 64 u64 iova, int access_flags, 65 unsigned int page_size, bool populate); 66 67 static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr, 68 struct ib_pd *pd) 69 { 70 struct mlx5_ib_dev *dev = to_mdev(pd->device); 71 72 MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC)); 73 MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE)); 74 MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ)); 75 MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE)); 76 MLX5_SET(mkc, mkc, lr, 1); 77 78 if ((acc & IB_ACCESS_RELAXED_ORDERING) && 79 pcie_relaxed_ordering_enabled(dev->mdev->pdev)) { 80 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write)) 81 MLX5_SET(mkc, mkc, relaxed_ordering_write, 1); 82 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read)) 83 MLX5_SET(mkc, mkc, relaxed_ordering_read, 1); 84 } 85 86 MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); 87 MLX5_SET(mkc, mkc, qpn, 0xffffff); 88 MLX5_SET64(mkc, mkc, start_addr, start_addr); 89 } 90 91 static void assign_mkey_variant(struct mlx5_ib_dev *dev, 92 struct mlx5_ib_mkey *mkey, u32 *in) 93 { 94 u8 key = atomic_inc_return(&dev->mkey_var); 95 void *mkc; 96 97 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 98 MLX5_SET(mkc, mkc, mkey_7_0, key); 99 mkey->key 
= key; 100 } 101 102 static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, 103 struct mlx5_ib_mkey *mkey, u32 *in, int inlen) 104 { 105 int ret; 106 107 assign_mkey_variant(dev, mkey, in); 108 ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen); 109 if (!ret) 110 init_waitqueue_head(&mkey->wait); 111 112 return ret; 113 } 114 115 static int 116 mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev, 117 struct mlx5_ib_mkey *mkey, 118 struct mlx5_async_ctx *async_ctx, 119 u32 *in, int inlen, u32 *out, int outlen, 120 struct mlx5_async_work *context) 121 { 122 MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY); 123 assign_mkey_variant(dev, mkey, in); 124 return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen, 125 create_mkey_callback, context); 126 } 127 128 static int mr_cache_max_order(struct mlx5_ib_dev *dev); 129 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent); 130 131 static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) 132 { 133 return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled); 134 } 135 136 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) 137 { 138 WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key))); 139 140 return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key); 141 } 142 143 static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out) 144 { 145 if (status == -ENXIO) /* core driver is not available */ 146 return; 147 148 mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); 149 if (status != -EREMOTEIO) /* driver specific failure */ 150 return; 151 152 /* Failed in FW, print cmd out failure details */ 153 mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out); 154 } 155 156 static void create_mkey_callback(int status, struct mlx5_async_work *context) 157 { 158 struct mlx5_ib_mr *mr = 159 container_of(context, struct mlx5_ib_mr, cb_work); 160 struct mlx5_cache_ent *ent = mr->cache_ent; 161 struct mlx5_ib_dev *dev = ent->dev; 162 unsigned long flags; 163 164 if (status) { 165 create_mkey_warn(dev, status, mr->out); 166 kfree(mr); 167 spin_lock_irqsave(&ent->lock, flags); 168 ent->pending--; 169 WRITE_ONCE(dev->fill_delay, 1); 170 spin_unlock_irqrestore(&ent->lock, flags); 171 mod_timer(&dev->delay_timer, jiffies + HZ); 172 return; 173 } 174 175 mr->mmkey.type = MLX5_MKEY_MR; 176 mr->mmkey.key |= mlx5_idx_to_mkey( 177 MLX5_GET(create_mkey_out, mr->out, mkey_index)); 178 init_waitqueue_head(&mr->mmkey.wait); 179 180 WRITE_ONCE(dev->cache.last_add, jiffies); 181 182 spin_lock_irqsave(&ent->lock, flags); 183 list_add_tail(&mr->list, &ent->head); 184 ent->available_mrs++; 185 ent->total_mrs++; 186 /* If we are doing fill_to_high_water then keep going. 
*/ 187 queue_adjust_cache_locked(ent); 188 ent->pending--; 189 spin_unlock_irqrestore(&ent->lock, flags); 190 } 191 192 static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs) 193 { 194 int ret = 0; 195 196 switch (access_mode) { 197 case MLX5_MKC_ACCESS_MODE_MTT: 198 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD / 199 sizeof(struct mlx5_mtt)); 200 break; 201 case MLX5_MKC_ACCESS_MODE_KSM: 202 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD / 203 sizeof(struct mlx5_klm)); 204 break; 205 default: 206 WARN_ON(1); 207 } 208 return ret; 209 } 210 211 static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc) 212 { 213 struct mlx5_ib_mr *mr; 214 215 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 216 if (!mr) 217 return NULL; 218 mr->cache_ent = ent; 219 220 set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd); 221 MLX5_SET(mkc, mkc, free, 1); 222 MLX5_SET(mkc, mkc, umr_en, 1); 223 MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3); 224 MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7); 225 226 MLX5_SET(mkc, mkc, translations_octword_size, 227 get_mkc_octo_size(ent->access_mode, ent->ndescs)); 228 MLX5_SET(mkc, mkc, log_page_size, ent->page); 229 return mr; 230 } 231 232 /* Asynchronously schedule new MRs to be populated in the cache. */ 233 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num) 234 { 235 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 236 struct mlx5_ib_mr *mr; 237 void *mkc; 238 u32 *in; 239 int err = 0; 240 int i; 241 242 in = kzalloc(inlen, GFP_KERNEL); 243 if (!in) 244 return -ENOMEM; 245 246 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 247 for (i = 0; i < num; i++) { 248 mr = alloc_cache_mr(ent, mkc); 249 if (!mr) { 250 err = -ENOMEM; 251 break; 252 } 253 spin_lock_irq(&ent->lock); 254 if (ent->pending >= MAX_PENDING_REG_MR) { 255 err = -EAGAIN; 256 spin_unlock_irq(&ent->lock); 257 kfree(mr); 258 break; 259 } 260 ent->pending++; 261 spin_unlock_irq(&ent->lock); 262 err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey, 263 &ent->dev->async_ctx, in, inlen, 264 mr->out, sizeof(mr->out), 265 &mr->cb_work); 266 if (err) { 267 spin_lock_irq(&ent->lock); 268 ent->pending--; 269 spin_unlock_irq(&ent->lock); 270 mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err); 271 kfree(mr); 272 break; 273 } 274 } 275 276 kfree(in); 277 return err; 278 } 279 280 /* Synchronously create a MR in the cache */ 281 static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent) 282 { 283 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 284 struct mlx5_ib_mr *mr; 285 void *mkc; 286 u32 *in; 287 int err; 288 289 in = kzalloc(inlen, GFP_KERNEL); 290 if (!in) 291 return ERR_PTR(-ENOMEM); 292 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 293 294 mr = alloc_cache_mr(ent, mkc); 295 if (!mr) { 296 err = -ENOMEM; 297 goto free_in; 298 } 299 300 err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey.key, in, inlen); 301 if (err) 302 goto free_mr; 303 304 init_waitqueue_head(&mr->mmkey.wait); 305 mr->mmkey.type = MLX5_MKEY_MR; 306 WRITE_ONCE(ent->dev->cache.last_add, jiffies); 307 spin_lock_irq(&ent->lock); 308 ent->total_mrs++; 309 spin_unlock_irq(&ent->lock); 310 kfree(in); 311 return mr; 312 free_mr: 313 kfree(mr); 314 free_in: 315 kfree(in); 316 return ERR_PTR(err); 317 } 318 319 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent) 320 { 321 struct mlx5_ib_mr *mr; 322 323 lockdep_assert_held(&ent->lock); 324 if (list_empty(&ent->head)) 325 return; 326 mr = 
list_first_entry(&ent->head, struct mlx5_ib_mr, list); 327 list_del(&mr->list); 328 ent->available_mrs--; 329 ent->total_mrs--; 330 spin_unlock_irq(&ent->lock); 331 mlx5_core_destroy_mkey(ent->dev->mdev, mr->mmkey.key); 332 kfree(mr); 333 spin_lock_irq(&ent->lock); 334 } 335 336 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target, 337 bool limit_fill) 338 { 339 int err; 340 341 lockdep_assert_held(&ent->lock); 342 343 while (true) { 344 if (limit_fill) 345 target = ent->limit * 2; 346 if (target == ent->available_mrs + ent->pending) 347 return 0; 348 if (target > ent->available_mrs + ent->pending) { 349 u32 todo = target - (ent->available_mrs + ent->pending); 350 351 spin_unlock_irq(&ent->lock); 352 err = add_keys(ent, todo); 353 if (err == -EAGAIN) 354 usleep_range(3000, 5000); 355 spin_lock_irq(&ent->lock); 356 if (err) { 357 if (err != -EAGAIN) 358 return err; 359 } else 360 return 0; 361 } else { 362 remove_cache_mr_locked(ent); 363 } 364 } 365 } 366 367 static ssize_t size_write(struct file *filp, const char __user *buf, 368 size_t count, loff_t *pos) 369 { 370 struct mlx5_cache_ent *ent = filp->private_data; 371 u32 target; 372 int err; 373 374 err = kstrtou32_from_user(buf, count, 0, &target); 375 if (err) 376 return err; 377 378 /* 379 * Target is the new value of total_mrs the user requests, however we 380 * cannot free MRs that are in use. Compute the target value for 381 * available_mrs. 382 */ 383 spin_lock_irq(&ent->lock); 384 if (target < ent->total_mrs - ent->available_mrs) { 385 err = -EINVAL; 386 goto err_unlock; 387 } 388 target = target - (ent->total_mrs - ent->available_mrs); 389 if (target < ent->limit || target > ent->limit*2) { 390 err = -EINVAL; 391 goto err_unlock; 392 } 393 err = resize_available_mrs(ent, target, false); 394 if (err) 395 goto err_unlock; 396 spin_unlock_irq(&ent->lock); 397 398 return count; 399 400 err_unlock: 401 spin_unlock_irq(&ent->lock); 402 return err; 403 } 404 405 static ssize_t size_read(struct file *filp, char __user *buf, size_t count, 406 loff_t *pos) 407 { 408 struct mlx5_cache_ent *ent = filp->private_data; 409 char lbuf[20]; 410 int err; 411 412 err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs); 413 if (err < 0) 414 return err; 415 416 return simple_read_from_buffer(buf, count, pos, lbuf, err); 417 } 418 419 static const struct file_operations size_fops = { 420 .owner = THIS_MODULE, 421 .open = simple_open, 422 .write = size_write, 423 .read = size_read, 424 }; 425 426 static ssize_t limit_write(struct file *filp, const char __user *buf, 427 size_t count, loff_t *pos) 428 { 429 struct mlx5_cache_ent *ent = filp->private_data; 430 u32 var; 431 int err; 432 433 err = kstrtou32_from_user(buf, count, 0, &var); 434 if (err) 435 return err; 436 437 /* 438 * Upon set we immediately fill the cache to high water mark implied by 439 * the limit. 
440 */ 441 spin_lock_irq(&ent->lock); 442 ent->limit = var; 443 err = resize_available_mrs(ent, 0, true); 444 spin_unlock_irq(&ent->lock); 445 if (err) 446 return err; 447 return count; 448 } 449 450 static ssize_t limit_read(struct file *filp, char __user *buf, size_t count, 451 loff_t *pos) 452 { 453 struct mlx5_cache_ent *ent = filp->private_data; 454 char lbuf[20]; 455 int err; 456 457 err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit); 458 if (err < 0) 459 return err; 460 461 return simple_read_from_buffer(buf, count, pos, lbuf, err); 462 } 463 464 static const struct file_operations limit_fops = { 465 .owner = THIS_MODULE, 466 .open = simple_open, 467 .write = limit_write, 468 .read = limit_read, 469 }; 470 471 static bool someone_adding(struct mlx5_mr_cache *cache) 472 { 473 unsigned int i; 474 475 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 476 struct mlx5_cache_ent *ent = &cache->ent[i]; 477 bool ret; 478 479 spin_lock_irq(&ent->lock); 480 ret = ent->available_mrs < ent->limit; 481 spin_unlock_irq(&ent->lock); 482 if (ret) 483 return true; 484 } 485 return false; 486 } 487 488 /* 489 * Check if the bucket is outside the high/low water mark and schedule an async 490 * update. The cache refill has hysteresis, once the low water mark is hit it is 491 * refilled up to the high mark. 492 */ 493 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) 494 { 495 lockdep_assert_held(&ent->lock); 496 497 if (ent->disabled || READ_ONCE(ent->dev->fill_delay)) 498 return; 499 if (ent->available_mrs < ent->limit) { 500 ent->fill_to_high_water = true; 501 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 502 } else if (ent->fill_to_high_water && 503 ent->available_mrs + ent->pending < 2 * ent->limit) { 504 /* 505 * Once we start populating due to hitting a low water mark 506 * continue until we pass the high water mark. 507 */ 508 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 509 } else if (ent->available_mrs == 2 * ent->limit) { 510 ent->fill_to_high_water = false; 511 } else if (ent->available_mrs > 2 * ent->limit) { 512 /* Queue deletion of excess entries */ 513 ent->fill_to_high_water = false; 514 if (ent->pending) 515 queue_delayed_work(ent->dev->cache.wq, &ent->dwork, 516 msecs_to_jiffies(1000)); 517 else 518 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 519 } 520 } 521 522 static void __cache_work_func(struct mlx5_cache_ent *ent) 523 { 524 struct mlx5_ib_dev *dev = ent->dev; 525 struct mlx5_mr_cache *cache = &dev->cache; 526 int err; 527 528 spin_lock_irq(&ent->lock); 529 if (ent->disabled) 530 goto out; 531 532 if (ent->fill_to_high_water && 533 ent->available_mrs + ent->pending < 2 * ent->limit && 534 !READ_ONCE(dev->fill_delay)) { 535 spin_unlock_irq(&ent->lock); 536 err = add_keys(ent, 1); 537 spin_lock_irq(&ent->lock); 538 if (ent->disabled) 539 goto out; 540 if (err) { 541 /* 542 * EAGAIN only happens if pending is positive, so we 543 * will be rescheduled from reg_mr_callback(). The only 544 * failure path here is ENOMEM. 545 */ 546 if (err != -EAGAIN) { 547 mlx5_ib_warn( 548 dev, 549 "command failed order %d, err %d\n", 550 ent->order, err); 551 queue_delayed_work(cache->wq, &ent->dwork, 552 msecs_to_jiffies(1000)); 553 } 554 } 555 } else if (ent->available_mrs > 2 * ent->limit) { 556 bool need_delay; 557 558 /* 559 * The remove_cache_mr() logic is performed as garbage 560 * collection task. Such task is intended to be run when no 561 * other active processes are running. 
562 * 563 * The need_resched() will return TRUE if there are user tasks 564 * to be activated in near future. 565 * 566 * In such case, we don't execute remove_cache_mr() and postpone 567 * the garbage collection work to try to run in next cycle, in 568 * order to free CPU resources to other tasks. 569 */ 570 spin_unlock_irq(&ent->lock); 571 need_delay = need_resched() || someone_adding(cache) || 572 !time_after(jiffies, 573 READ_ONCE(cache->last_add) + 300 * HZ); 574 spin_lock_irq(&ent->lock); 575 if (ent->disabled) 576 goto out; 577 if (need_delay) 578 queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); 579 remove_cache_mr_locked(ent); 580 queue_adjust_cache_locked(ent); 581 } 582 out: 583 spin_unlock_irq(&ent->lock); 584 } 585 586 static void delayed_cache_work_func(struct work_struct *work) 587 { 588 struct mlx5_cache_ent *ent; 589 590 ent = container_of(work, struct mlx5_cache_ent, dwork.work); 591 __cache_work_func(ent); 592 } 593 594 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, 595 struct mlx5_cache_ent *ent, 596 int access_flags) 597 { 598 struct mlx5_ib_mr *mr; 599 600 /* Matches access in alloc_cache_mr() */ 601 if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) 602 return ERR_PTR(-EOPNOTSUPP); 603 604 spin_lock_irq(&ent->lock); 605 if (list_empty(&ent->head)) { 606 queue_adjust_cache_locked(ent); 607 ent->miss++; 608 spin_unlock_irq(&ent->lock); 609 mr = create_cache_mr(ent); 610 if (IS_ERR(mr)) 611 return mr; 612 } else { 613 mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); 614 list_del(&mr->list); 615 ent->available_mrs--; 616 queue_adjust_cache_locked(ent); 617 spin_unlock_irq(&ent->lock); 618 619 mlx5_clear_mr(mr); 620 } 621 return mr; 622 } 623 624 static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) 625 { 626 struct mlx5_cache_ent *ent = mr->cache_ent; 627 628 spin_lock_irq(&ent->lock); 629 list_add_tail(&mr->list, &ent->head); 630 ent->available_mrs++; 631 queue_adjust_cache_locked(ent); 632 spin_unlock_irq(&ent->lock); 633 } 634 635 static void clean_keys(struct mlx5_ib_dev *dev, int c) 636 { 637 struct mlx5_mr_cache *cache = &dev->cache; 638 struct mlx5_cache_ent *ent = &cache->ent[c]; 639 struct mlx5_ib_mr *tmp_mr; 640 struct mlx5_ib_mr *mr; 641 LIST_HEAD(del_list); 642 643 cancel_delayed_work(&ent->dwork); 644 while (1) { 645 spin_lock_irq(&ent->lock); 646 if (list_empty(&ent->head)) { 647 spin_unlock_irq(&ent->lock); 648 break; 649 } 650 mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); 651 list_move(&mr->list, &del_list); 652 ent->available_mrs--; 653 ent->total_mrs--; 654 spin_unlock_irq(&ent->lock); 655 mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key); 656 } 657 658 list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { 659 list_del(&mr->list); 660 kfree(mr); 661 } 662 } 663 664 static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) 665 { 666 if (!mlx5_debugfs_root || dev->is_rep) 667 return; 668 669 debugfs_remove_recursive(dev->cache.root); 670 dev->cache.root = NULL; 671 } 672 673 static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev) 674 { 675 struct mlx5_mr_cache *cache = &dev->cache; 676 struct mlx5_cache_ent *ent; 677 struct dentry *dir; 678 int i; 679 680 if (!mlx5_debugfs_root || dev->is_rep) 681 return; 682 683 cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev)); 684 685 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 686 ent = &cache->ent[i]; 687 sprintf(ent->name, "%d", ent->order); 688 dir = debugfs_create_dir(ent->name, 
cache->root); 689 debugfs_create_file("size", 0600, dir, ent, &size_fops); 690 debugfs_create_file("limit", 0600, dir, ent, &limit_fops); 691 debugfs_create_u32("cur", 0400, dir, &ent->available_mrs); 692 debugfs_create_u32("miss", 0600, dir, &ent->miss); 693 } 694 } 695 696 static void delay_time_func(struct timer_list *t) 697 { 698 struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer); 699 700 WRITE_ONCE(dev->fill_delay, 0); 701 } 702 703 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) 704 { 705 struct mlx5_mr_cache *cache = &dev->cache; 706 struct mlx5_cache_ent *ent; 707 int i; 708 709 mutex_init(&dev->slow_path_mutex); 710 cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); 711 if (!cache->wq) { 712 mlx5_ib_warn(dev, "failed to create work queue\n"); 713 return -ENOMEM; 714 } 715 716 mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx); 717 timer_setup(&dev->delay_timer, delay_time_func, 0); 718 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 719 ent = &cache->ent[i]; 720 INIT_LIST_HEAD(&ent->head); 721 spin_lock_init(&ent->lock); 722 ent->order = i + 2; 723 ent->dev = dev; 724 ent->limit = 0; 725 726 INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); 727 728 if (i > MR_CACHE_LAST_STD_ENTRY) { 729 mlx5_odp_init_mr_cache_entry(ent); 730 continue; 731 } 732 733 if (ent->order > mr_cache_max_order(dev)) 734 continue; 735 736 ent->page = PAGE_SHIFT; 737 ent->ndescs = 1 << ent->order; 738 ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; 739 if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) && 740 !dev->is_rep && mlx5_core_is_pf(dev->mdev) && 741 mlx5_ib_can_load_pas_with_umr(dev, 0)) 742 ent->limit = dev->mdev->profile.mr_cache[i].limit; 743 else 744 ent->limit = 0; 745 spin_lock_irq(&ent->lock); 746 queue_adjust_cache_locked(ent); 747 spin_unlock_irq(&ent->lock); 748 } 749 750 mlx5_mr_cache_debugfs_init(dev); 751 752 return 0; 753 } 754 755 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) 756 { 757 unsigned int i; 758 759 if (!dev->cache.wq) 760 return 0; 761 762 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 763 struct mlx5_cache_ent *ent = &dev->cache.ent[i]; 764 765 spin_lock_irq(&ent->lock); 766 ent->disabled = true; 767 spin_unlock_irq(&ent->lock); 768 cancel_delayed_work_sync(&ent->dwork); 769 } 770 771 mlx5_mr_cache_debugfs_cleanup(dev); 772 mlx5_cmd_cleanup_async_ctx(&dev->async_ctx); 773 774 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) 775 clean_keys(dev, i); 776 777 destroy_workqueue(dev->cache.wq); 778 del_timer_sync(&dev->delay_timer); 779 780 return 0; 781 } 782 783 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) 784 { 785 struct mlx5_ib_dev *dev = to_mdev(pd->device); 786 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 787 struct mlx5_ib_mr *mr; 788 void *mkc; 789 u32 *in; 790 int err; 791 792 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 793 if (!mr) 794 return ERR_PTR(-ENOMEM); 795 796 in = kzalloc(inlen, GFP_KERNEL); 797 if (!in) { 798 err = -ENOMEM; 799 goto err_free; 800 } 801 802 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 803 804 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); 805 MLX5_SET(mkc, mkc, length64, 1); 806 set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0, 807 pd); 808 809 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 810 if (err) 811 goto err_in; 812 813 kfree(in); 814 mr->mmkey.type = MLX5_MKEY_MR; 815 mr->ibmr.lkey = mr->mmkey.key; 816 mr->ibmr.rkey = mr->mmkey.key; 817 mr->umem = NULL; 818 819 return &mr->ibmr; 820 821 err_in: 822 kfree(in); 823 824 err_free: 825 
kfree(mr); 826 827 return ERR_PTR(err); 828 } 829 830 static int get_octo_len(u64 addr, u64 len, int page_shift) 831 { 832 u64 page_size = 1ULL << page_shift; 833 u64 offset; 834 int npages; 835 836 offset = addr & (page_size - 1); 837 npages = ALIGN(len + offset, page_size) >> page_shift; 838 return (npages + 1) / 2; 839 } 840 841 static int mr_cache_max_order(struct mlx5_ib_dev *dev) 842 { 843 if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) 844 return MR_CACHE_LAST_STD_ENTRY + 2; 845 return MLX5_MAX_UMR_SHIFT; 846 } 847 848 static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc) 849 { 850 struct mlx5_ib_umr_context *context = 851 container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe); 852 853 context->status = wc->status; 854 complete(&context->done); 855 } 856 857 static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) 858 { 859 context->cqe.done = mlx5_ib_umr_done; 860 context->status = -1; 861 init_completion(&context->done); 862 } 863 864 static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev, 865 struct mlx5_umr_wr *umrwr) 866 { 867 struct umr_common *umrc = &dev->umrc; 868 const struct ib_send_wr *bad; 869 int err; 870 struct mlx5_ib_umr_context umr_context; 871 872 mlx5_ib_init_umr_context(&umr_context); 873 umrwr->wr.wr_cqe = &umr_context.cqe; 874 875 down(&umrc->sem); 876 err = ib_post_send(umrc->qp, &umrwr->wr, &bad); 877 if (err) { 878 mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err); 879 } else { 880 wait_for_completion(&umr_context.done); 881 if (umr_context.status != IB_WC_SUCCESS) { 882 mlx5_ib_warn(dev, "reg umr failed (%u)\n", 883 umr_context.status); 884 err = -EFAULT; 885 } 886 } 887 up(&umrc->sem); 888 return err; 889 } 890 891 static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev, 892 unsigned int order) 893 { 894 struct mlx5_mr_cache *cache = &dev->cache; 895 896 if (order < cache->ent[0].order) 897 return &cache->ent[0]; 898 order = order - cache->ent[0].order; 899 if (order > MR_CACHE_LAST_STD_ENTRY) 900 return NULL; 901 return &cache->ent[order]; 902 } 903 904 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, 905 u64 length, int access_flags, u64 iova) 906 { 907 mr->ibmr.lkey = mr->mmkey.key; 908 mr->ibmr.rkey = mr->mmkey.key; 909 mr->ibmr.length = length; 910 mr->ibmr.device = &dev->ib_dev; 911 mr->ibmr.iova = iova; 912 mr->access_flags = access_flags; 913 } 914 915 static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem, 916 u64 iova) 917 { 918 /* 919 * The alignment of iova has already been checked upon entering 920 * UVERBS_METHOD_REG_DMABUF_MR 921 */ 922 umem->iova = iova; 923 return PAGE_SIZE; 924 } 925 926 static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, 927 struct ib_umem *umem, u64 iova, 928 int access_flags) 929 { 930 struct mlx5_ib_dev *dev = to_mdev(pd->device); 931 struct mlx5_cache_ent *ent; 932 struct mlx5_ib_mr *mr; 933 unsigned int page_size; 934 935 if (umem->is_dmabuf) 936 page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova); 937 else 938 page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, 939 0, iova); 940 if (WARN_ON(!page_size)) 941 return ERR_PTR(-EINVAL); 942 ent = mr_cache_ent_from_order( 943 dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size))); 944 /* 945 * Matches access in alloc_cache_mr(). If the MR can't come from the 946 * cache then synchronously create an uncached one. 
947 */ 948 if (!ent || ent->limit == 0 || 949 !mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) { 950 mutex_lock(&dev->slow_path_mutex); 951 mr = reg_create(pd, umem, iova, access_flags, page_size, false); 952 mutex_unlock(&dev->slow_path_mutex); 953 return mr; 954 } 955 956 mr = mlx5_mr_cache_alloc(dev, ent, access_flags); 957 if (IS_ERR(mr)) 958 return mr; 959 960 mr->ibmr.pd = pd; 961 mr->umem = umem; 962 mr->page_shift = order_base_2(page_size); 963 set_mr_fields(dev, mr, umem->length, access_flags, iova); 964 965 return mr; 966 } 967 968 #define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \ 969 MLX5_UMR_MTT_ALIGNMENT) 970 #define MLX5_SPARE_UMR_CHUNK 0x10000 971 972 /* 973 * Allocate a temporary buffer to hold the per-page information to transfer to 974 * HW. For efficiency this should be as large as it can be, but buffer 975 * allocation failure is not allowed, so try smaller sizes. 976 */ 977 static void *mlx5_ib_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask) 978 { 979 const size_t xlt_chunk_align = 980 MLX5_UMR_MTT_ALIGNMENT / ent_size; 981 size_t size; 982 void *res = NULL; 983 984 static_assert(PAGE_SIZE % MLX5_UMR_MTT_ALIGNMENT == 0); 985 986 /* 987 * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context just that the 988 * allocation can't trigger any kind of reclaim. 989 */ 990 might_sleep(); 991 992 gfp_mask |= __GFP_ZERO | __GFP_NORETRY; 993 994 /* 995 * If the system already has a suitable high order page then just use 996 * that, but don't try hard to create one. This max is about 1M, so a 997 * free x86 huge page will satisfy it. 998 */ 999 size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align), 1000 MLX5_MAX_UMR_CHUNK); 1001 *nents = size / ent_size; 1002 res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, 1003 get_order(size)); 1004 if (res) 1005 return res; 1006 1007 if (size > MLX5_SPARE_UMR_CHUNK) { 1008 size = MLX5_SPARE_UMR_CHUNK; 1009 *nents = size / ent_size; 1010 res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, 1011 get_order(size)); 1012 if (res) 1013 return res; 1014 } 1015 1016 *nents = PAGE_SIZE / ent_size; 1017 res = (void *)__get_free_page(gfp_mask); 1018 if (res) 1019 return res; 1020 1021 mutex_lock(&xlt_emergency_page_mutex); 1022 memset(xlt_emergency_page, 0, PAGE_SIZE); 1023 return xlt_emergency_page; 1024 } 1025 1026 static void mlx5_ib_free_xlt(void *xlt, size_t length) 1027 { 1028 if (xlt == xlt_emergency_page) { 1029 mutex_unlock(&xlt_emergency_page_mutex); 1030 return; 1031 } 1032 1033 free_pages((unsigned long)xlt, get_order(length)); 1034 } 1035 1036 /* 1037 * Create a MLX5_IB_SEND_UMR_UPDATE_XLT work request and XLT buffer ready for 1038 * submission. 1039 */ 1040 static void *mlx5_ib_create_xlt_wr(struct mlx5_ib_mr *mr, 1041 struct mlx5_umr_wr *wr, struct ib_sge *sg, 1042 size_t nents, size_t ent_size, 1043 unsigned int flags) 1044 { 1045 struct mlx5_ib_dev *dev = mr_to_mdev(mr); 1046 struct device *ddev = &dev->mdev->pdev->dev; 1047 dma_addr_t dma; 1048 void *xlt; 1049 1050 xlt = mlx5_ib_alloc_xlt(&nents, ent_size, 1051 flags & MLX5_IB_UPD_XLT_ATOMIC ? 
GFP_ATOMIC : 1052 GFP_KERNEL); 1053 sg->length = nents * ent_size; 1054 dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE); 1055 if (dma_mapping_error(ddev, dma)) { 1056 mlx5_ib_err(dev, "unable to map DMA during XLT update.\n"); 1057 mlx5_ib_free_xlt(xlt, sg->length); 1058 return NULL; 1059 } 1060 sg->addr = dma; 1061 sg->lkey = dev->umrc.pd->local_dma_lkey; 1062 1063 memset(wr, 0, sizeof(*wr)); 1064 wr->wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT; 1065 if (!(flags & MLX5_IB_UPD_XLT_ENABLE)) 1066 wr->wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE; 1067 wr->wr.sg_list = sg; 1068 wr->wr.num_sge = 1; 1069 wr->wr.opcode = MLX5_IB_WR_UMR; 1070 wr->pd = mr->ibmr.pd; 1071 wr->mkey = mr->mmkey.key; 1072 wr->length = mr->ibmr.length; 1073 wr->virt_addr = mr->ibmr.iova; 1074 wr->access_flags = mr->access_flags; 1075 wr->page_shift = mr->page_shift; 1076 wr->xlt_size = sg->length; 1077 return xlt; 1078 } 1079 1080 static void mlx5_ib_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt, 1081 struct ib_sge *sg) 1082 { 1083 struct device *ddev = &dev->mdev->pdev->dev; 1084 1085 dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE); 1086 mlx5_ib_free_xlt(xlt, sg->length); 1087 } 1088 1089 static unsigned int xlt_wr_final_send_flags(unsigned int flags) 1090 { 1091 unsigned int res = 0; 1092 1093 if (flags & MLX5_IB_UPD_XLT_ENABLE) 1094 res |= MLX5_IB_SEND_UMR_ENABLE_MR | 1095 MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS | 1096 MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; 1097 if (flags & MLX5_IB_UPD_XLT_PD || flags & MLX5_IB_UPD_XLT_ACCESS) 1098 res |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; 1099 if (flags & MLX5_IB_UPD_XLT_ADDR) 1100 res |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; 1101 return res; 1102 } 1103 1104 int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, 1105 int page_shift, int flags) 1106 { 1107 struct mlx5_ib_dev *dev = mr_to_mdev(mr); 1108 struct device *ddev = &dev->mdev->pdev->dev; 1109 void *xlt; 1110 struct mlx5_umr_wr wr; 1111 struct ib_sge sg; 1112 int err = 0; 1113 int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT) 1114 ? 
sizeof(struct mlx5_klm) 1115 : sizeof(struct mlx5_mtt); 1116 const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size; 1117 const int page_mask = page_align - 1; 1118 size_t pages_mapped = 0; 1119 size_t pages_to_map = 0; 1120 size_t pages_iter; 1121 size_t size_to_map = 0; 1122 size_t orig_sg_length; 1123 1124 if ((flags & MLX5_IB_UPD_XLT_INDIRECT) && 1125 !umr_can_use_indirect_mkey(dev)) 1126 return -EPERM; 1127 1128 if (WARN_ON(!mr->umem->is_odp)) 1129 return -EINVAL; 1130 1131 /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, 1132 * so we need to align the offset and length accordingly 1133 */ 1134 if (idx & page_mask) { 1135 npages += idx & page_mask; 1136 idx &= ~page_mask; 1137 } 1138 pages_to_map = ALIGN(npages, page_align); 1139 1140 xlt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, npages, desc_size, flags); 1141 if (!xlt) 1142 return -ENOMEM; 1143 pages_iter = sg.length / desc_size; 1144 orig_sg_length = sg.length; 1145 1146 if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) { 1147 struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); 1148 size_t max_pages = ib_umem_odp_num_pages(odp) - idx; 1149 1150 pages_to_map = min_t(size_t, pages_to_map, max_pages); 1151 } 1152 1153 wr.page_shift = page_shift; 1154 1155 for (pages_mapped = 0; 1156 pages_mapped < pages_to_map && !err; 1157 pages_mapped += pages_iter, idx += pages_iter) { 1158 npages = min_t(int, pages_iter, pages_to_map - pages_mapped); 1159 size_to_map = npages * desc_size; 1160 dma_sync_single_for_cpu(ddev, sg.addr, sg.length, 1161 DMA_TO_DEVICE); 1162 mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags); 1163 dma_sync_single_for_device(ddev, sg.addr, sg.length, 1164 DMA_TO_DEVICE); 1165 1166 sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT); 1167 1168 if (pages_mapped + pages_iter >= pages_to_map) 1169 wr.wr.send_flags |= xlt_wr_final_send_flags(flags); 1170 1171 wr.offset = idx * desc_size; 1172 wr.xlt_size = sg.length; 1173 1174 err = mlx5_ib_post_send_wait(dev, &wr); 1175 } 1176 sg.length = orig_sg_length; 1177 mlx5_ib_unmap_free_xlt(dev, xlt, &sg); 1178 return err; 1179 } 1180 1181 /* 1182 * Send the DMA list to the HW for a normal MR using UMR. 1183 * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP 1184 * flag may be used. 
1185 */ 1186 int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) 1187 { 1188 struct mlx5_ib_dev *dev = mr_to_mdev(mr); 1189 struct device *ddev = &dev->mdev->pdev->dev; 1190 struct ib_block_iter biter; 1191 struct mlx5_mtt *cur_mtt; 1192 struct mlx5_umr_wr wr; 1193 size_t orig_sg_length; 1194 struct mlx5_mtt *mtt; 1195 size_t final_size; 1196 struct ib_sge sg; 1197 int err = 0; 1198 1199 if (WARN_ON(mr->umem->is_odp)) 1200 return -EINVAL; 1201 1202 mtt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, 1203 ib_umem_num_dma_blocks(mr->umem, 1204 1 << mr->page_shift), 1205 sizeof(*mtt), flags); 1206 if (!mtt) 1207 return -ENOMEM; 1208 orig_sg_length = sg.length; 1209 1210 cur_mtt = mtt; 1211 rdma_for_each_block (mr->umem->sgt_append.sgt.sgl, &biter, 1212 mr->umem->sgt_append.sgt.nents, 1213 BIT(mr->page_shift)) { 1214 if (cur_mtt == (void *)mtt + sg.length) { 1215 dma_sync_single_for_device(ddev, sg.addr, sg.length, 1216 DMA_TO_DEVICE); 1217 err = mlx5_ib_post_send_wait(dev, &wr); 1218 if (err) 1219 goto err; 1220 dma_sync_single_for_cpu(ddev, sg.addr, sg.length, 1221 DMA_TO_DEVICE); 1222 wr.offset += sg.length; 1223 cur_mtt = mtt; 1224 } 1225 1226 cur_mtt->ptag = 1227 cpu_to_be64(rdma_block_iter_dma_address(&biter) | 1228 MLX5_IB_MTT_PRESENT); 1229 1230 if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) 1231 cur_mtt->ptag = 0; 1232 1233 cur_mtt++; 1234 } 1235 1236 final_size = (void *)cur_mtt - (void *)mtt; 1237 sg.length = ALIGN(final_size, MLX5_UMR_MTT_ALIGNMENT); 1238 memset(cur_mtt, 0, sg.length - final_size); 1239 wr.wr.send_flags |= xlt_wr_final_send_flags(flags); 1240 wr.xlt_size = sg.length; 1241 1242 dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); 1243 err = mlx5_ib_post_send_wait(dev, &wr); 1244 1245 err: 1246 sg.length = orig_sg_length; 1247 mlx5_ib_unmap_free_xlt(dev, mtt, &sg); 1248 return err; 1249 } 1250 1251 /* 1252 * If ibmr is NULL it will be allocated by reg_create. 1253 * Else, the given ibmr will be used. 1254 */ 1255 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, 1256 u64 iova, int access_flags, 1257 unsigned int page_size, bool populate) 1258 { 1259 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1260 struct mlx5_ib_mr *mr; 1261 __be64 *pas; 1262 void *mkc; 1263 int inlen; 1264 u32 *in; 1265 int err; 1266 bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); 1267 1268 if (!page_size) 1269 return ERR_PTR(-EINVAL); 1270 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1271 if (!mr) 1272 return ERR_PTR(-ENOMEM); 1273 1274 mr->ibmr.pd = pd; 1275 mr->access_flags = access_flags; 1276 mr->page_shift = order_base_2(page_size); 1277 1278 inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1279 if (populate) 1280 inlen += sizeof(*pas) * 1281 roundup(ib_umem_num_dma_blocks(umem, page_size), 2); 1282 in = kvzalloc(inlen, GFP_KERNEL); 1283 if (!in) { 1284 err = -ENOMEM; 1285 goto err_1; 1286 } 1287 pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); 1288 if (populate) { 1289 if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) { 1290 err = -EINVAL; 1291 goto err_2; 1292 } 1293 mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas, 1294 pg_cap ? MLX5_IB_MTT_PRESENT : 0); 1295 } 1296 1297 /* The pg_access bit allows setting the access flags 1298 * in the page list submitted with the command. */ 1299 MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap)); 1300 1301 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1302 set_mkc_access_pd_addr_fields(mkc, access_flags, iova, 1303 populate ? 
pd : dev->umrc.pd); 1304 MLX5_SET(mkc, mkc, free, !populate); 1305 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); 1306 MLX5_SET(mkc, mkc, umr_en, 1); 1307 1308 MLX5_SET64(mkc, mkc, len, umem->length); 1309 MLX5_SET(mkc, mkc, bsf_octword_size, 0); 1310 MLX5_SET(mkc, mkc, translations_octword_size, 1311 get_octo_len(iova, umem->length, mr->page_shift)); 1312 MLX5_SET(mkc, mkc, log_page_size, mr->page_shift); 1313 if (populate) { 1314 MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 1315 get_octo_len(iova, umem->length, mr->page_shift)); 1316 } 1317 1318 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 1319 if (err) { 1320 mlx5_ib_warn(dev, "create mkey failed\n"); 1321 goto err_2; 1322 } 1323 mr->mmkey.type = MLX5_MKEY_MR; 1324 mr->umem = umem; 1325 set_mr_fields(dev, mr, umem->length, access_flags, iova); 1326 kvfree(in); 1327 1328 mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key); 1329 1330 return mr; 1331 1332 err_2: 1333 kvfree(in); 1334 err_1: 1335 kfree(mr); 1336 return ERR_PTR(err); 1337 } 1338 1339 static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr, 1340 u64 length, int acc, int mode) 1341 { 1342 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1343 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1344 struct mlx5_ib_mr *mr; 1345 void *mkc; 1346 u32 *in; 1347 int err; 1348 1349 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1350 if (!mr) 1351 return ERR_PTR(-ENOMEM); 1352 1353 in = kzalloc(inlen, GFP_KERNEL); 1354 if (!in) { 1355 err = -ENOMEM; 1356 goto err_free; 1357 } 1358 1359 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1360 1361 MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3); 1362 MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7); 1363 MLX5_SET64(mkc, mkc, len, length); 1364 set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd); 1365 1366 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 1367 if (err) 1368 goto err_in; 1369 1370 kfree(in); 1371 1372 set_mr_fields(dev, mr, length, acc, start_addr); 1373 1374 return &mr->ibmr; 1375 1376 err_in: 1377 kfree(in); 1378 1379 err_free: 1380 kfree(mr); 1381 1382 return ERR_PTR(err); 1383 } 1384 1385 int mlx5_ib_advise_mr(struct ib_pd *pd, 1386 enum ib_uverbs_advise_mr_advice advice, 1387 u32 flags, 1388 struct ib_sge *sg_list, 1389 u32 num_sge, 1390 struct uverbs_attr_bundle *attrs) 1391 { 1392 if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH && 1393 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE && 1394 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT) 1395 return -EOPNOTSUPP; 1396 1397 return mlx5_ib_advise_mr_prefetch(pd, advice, flags, 1398 sg_list, num_sge); 1399 } 1400 1401 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, 1402 struct ib_dm_mr_attr *attr, 1403 struct uverbs_attr_bundle *attrs) 1404 { 1405 struct mlx5_ib_dm *mdm = to_mdm(dm); 1406 struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev; 1407 u64 start_addr = mdm->dev_addr + attr->offset; 1408 int mode; 1409 1410 switch (mdm->type) { 1411 case MLX5_IB_UAPI_DM_TYPE_MEMIC: 1412 if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS) 1413 return ERR_PTR(-EINVAL); 1414 1415 mode = MLX5_MKC_ACCESS_MODE_MEMIC; 1416 start_addr -= pci_resource_start(dev->pdev, 0); 1417 break; 1418 case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM: 1419 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM: 1420 if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS) 1421 return ERR_PTR(-EINVAL); 1422 1423 mode = MLX5_MKC_ACCESS_MODE_SW_ICM; 1424 break; 1425 default: 1426 return ERR_PTR(-EINVAL); 
1427 } 1428 1429 return mlx5_ib_get_dm_mr(pd, start_addr, attr->length, 1430 attr->access_flags, mode); 1431 } 1432 1433 static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, 1434 u64 iova, int access_flags) 1435 { 1436 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1437 struct mlx5_ib_mr *mr = NULL; 1438 bool xlt_with_umr; 1439 int err; 1440 1441 xlt_with_umr = mlx5_ib_can_load_pas_with_umr(dev, umem->length); 1442 if (xlt_with_umr) { 1443 mr = alloc_cacheable_mr(pd, umem, iova, access_flags); 1444 } else { 1445 unsigned int page_size = mlx5_umem_find_best_pgsz( 1446 umem, mkc, log_page_size, 0, iova); 1447 1448 mutex_lock(&dev->slow_path_mutex); 1449 mr = reg_create(pd, umem, iova, access_flags, page_size, true); 1450 mutex_unlock(&dev->slow_path_mutex); 1451 } 1452 if (IS_ERR(mr)) { 1453 ib_umem_release(umem); 1454 return ERR_CAST(mr); 1455 } 1456 1457 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); 1458 1459 atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages); 1460 1461 if (xlt_with_umr) { 1462 /* 1463 * If the MR was created with reg_create then it will be 1464 * configured properly but left disabled. It is safe to go ahead 1465 * and configure it again via UMR while enabling it. 1466 */ 1467 err = mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE); 1468 if (err) { 1469 mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1470 return ERR_PTR(err); 1471 } 1472 } 1473 return &mr->ibmr; 1474 } 1475 1476 static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, 1477 u64 iova, int access_flags, 1478 struct ib_udata *udata) 1479 { 1480 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1481 struct ib_umem_odp *odp; 1482 struct mlx5_ib_mr *mr; 1483 int err; 1484 1485 if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) 1486 return ERR_PTR(-EOPNOTSUPP); 1487 1488 err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq); 1489 if (err) 1490 return ERR_PTR(err); 1491 if (!start && length == U64_MAX) { 1492 if (iova != 0) 1493 return ERR_PTR(-EINVAL); 1494 if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) 1495 return ERR_PTR(-EINVAL); 1496 1497 mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags); 1498 if (IS_ERR(mr)) 1499 return ERR_CAST(mr); 1500 return &mr->ibmr; 1501 } 1502 1503 /* ODP requires xlt update via umr to work. 
*/ 1504 if (!mlx5_ib_can_load_pas_with_umr(dev, length)) 1505 return ERR_PTR(-EINVAL); 1506 1507 odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags, 1508 &mlx5_mn_ops); 1509 if (IS_ERR(odp)) 1510 return ERR_CAST(odp); 1511 1512 mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags); 1513 if (IS_ERR(mr)) { 1514 ib_umem_release(&odp->umem); 1515 return ERR_CAST(mr); 1516 } 1517 xa_init(&mr->implicit_children); 1518 1519 odp->private = mr; 1520 err = mlx5r_store_odp_mkey(dev, &mr->mmkey); 1521 if (err) 1522 goto err_dereg_mr; 1523 1524 err = mlx5_ib_init_odp_mr(mr); 1525 if (err) 1526 goto err_dereg_mr; 1527 return &mr->ibmr; 1528 1529 err_dereg_mr: 1530 mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1531 return ERR_PTR(err); 1532 } 1533 1534 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, 1535 u64 iova, int access_flags, 1536 struct ib_udata *udata) 1537 { 1538 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1539 struct ib_umem *umem; 1540 1541 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) 1542 return ERR_PTR(-EOPNOTSUPP); 1543 1544 mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", 1545 start, iova, length, access_flags); 1546 1547 if (access_flags & IB_ACCESS_ON_DEMAND) 1548 return create_user_odp_mr(pd, start, length, iova, access_flags, 1549 udata); 1550 umem = ib_umem_get(&dev->ib_dev, start, length, access_flags); 1551 if (IS_ERR(umem)) 1552 return ERR_CAST(umem); 1553 return create_real_mr(pd, umem, iova, access_flags); 1554 } 1555 1556 static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) 1557 { 1558 struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; 1559 struct mlx5_ib_mr *mr = umem_dmabuf->private; 1560 1561 dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); 1562 1563 if (!umem_dmabuf->sgt) 1564 return; 1565 1566 mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP); 1567 ib_umem_dmabuf_unmap_pages(umem_dmabuf); 1568 } 1569 1570 static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { 1571 .allow_peer2peer = 1, 1572 .move_notify = mlx5_ib_dmabuf_invalidate_cb, 1573 }; 1574 1575 struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, 1576 u64 length, u64 virt_addr, 1577 int fd, int access_flags, 1578 struct ib_udata *udata) 1579 { 1580 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1581 struct mlx5_ib_mr *mr = NULL; 1582 struct ib_umem_dmabuf *umem_dmabuf; 1583 int err; 1584 1585 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || 1586 !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) 1587 return ERR_PTR(-EOPNOTSUPP); 1588 1589 mlx5_ib_dbg(dev, 1590 "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n", 1591 offset, virt_addr, length, fd, access_flags); 1592 1593 /* dmabuf requires xlt update via umr to work. 
*/ 1594 if (!mlx5_ib_can_load_pas_with_umr(dev, length)) 1595 return ERR_PTR(-EINVAL); 1596 1597 umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, 1598 access_flags, 1599 &mlx5_ib_dmabuf_attach_ops); 1600 if (IS_ERR(umem_dmabuf)) { 1601 mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n", 1602 PTR_ERR(umem_dmabuf)); 1603 return ERR_CAST(umem_dmabuf); 1604 } 1605 1606 mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, 1607 access_flags); 1608 if (IS_ERR(mr)) { 1609 ib_umem_release(&umem_dmabuf->umem); 1610 return ERR_CAST(mr); 1611 } 1612 1613 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); 1614 1615 atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages); 1616 umem_dmabuf->private = mr; 1617 err = mlx5r_store_odp_mkey(dev, &mr->mmkey); 1618 if (err) 1619 goto err_dereg_mr; 1620 1621 err = mlx5_ib_init_dmabuf_mr(mr); 1622 if (err) 1623 goto err_dereg_mr; 1624 return &mr->ibmr; 1625 1626 err_dereg_mr: 1627 mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1628 return ERR_PTR(err); 1629 } 1630 1631 /** 1632 * revoke_mr - Fence all DMA on the MR 1633 * @mr: The MR to fence 1634 * 1635 * Upon return the NIC will not be doing any DMA to the pages under the MR, 1636 * and any DMA in progress will be completed. Failure of this function 1637 * indicates the HW has failed catastrophically. 1638 */ 1639 static int revoke_mr(struct mlx5_ib_mr *mr) 1640 { 1641 struct mlx5_umr_wr umrwr = {}; 1642 1643 if (mr_to_mdev(mr)->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) 1644 return 0; 1645 1646 umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR | 1647 MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; 1648 umrwr.wr.opcode = MLX5_IB_WR_UMR; 1649 umrwr.pd = mr_to_mdev(mr)->umrc.pd; 1650 umrwr.mkey = mr->mmkey.key; 1651 umrwr.ignore_free_state = 1; 1652 1653 return mlx5_ib_post_send_wait(mr_to_mdev(mr), &umrwr); 1654 } 1655 1656 /* 1657 * True if the change in access flags can be done via UMR, only some access 1658 * flags can be updated. 
1659 */ 1660 static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev, 1661 unsigned int current_access_flags, 1662 unsigned int target_access_flags) 1663 { 1664 unsigned int diffs = current_access_flags ^ target_access_flags; 1665 1666 if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | 1667 IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING)) 1668 return false; 1669 return mlx5_ib_can_reconfig_with_umr(dev, current_access_flags, 1670 target_access_flags); 1671 } 1672 1673 static int umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd, 1674 int access_flags) 1675 { 1676 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1677 struct mlx5_umr_wr umrwr = { 1678 .wr = { 1679 .send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE | 1680 MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS, 1681 .opcode = MLX5_IB_WR_UMR, 1682 }, 1683 .mkey = mr->mmkey.key, 1684 .pd = pd, 1685 .access_flags = access_flags, 1686 }; 1687 int err; 1688 1689 err = mlx5_ib_post_send_wait(dev, &umrwr); 1690 if (err) 1691 return err; 1692 1693 mr->access_flags = access_flags; 1694 return 0; 1695 } 1696 1697 static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, 1698 struct ib_umem *new_umem, 1699 int new_access_flags, u64 iova, 1700 unsigned long *page_size) 1701 { 1702 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1703 1704 /* We only track the allocated sizes of MRs from the cache */ 1705 if (!mr->cache_ent) 1706 return false; 1707 if (!mlx5_ib_can_load_pas_with_umr(dev, new_umem->length)) 1708 return false; 1709 1710 *page_size = 1711 mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova); 1712 if (WARN_ON(!*page_size)) 1713 return false; 1714 return (1ULL << mr->cache_ent->order) >= 1715 ib_umem_num_dma_blocks(new_umem, *page_size); 1716 } 1717 1718 static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd, 1719 int access_flags, int flags, struct ib_umem *new_umem, 1720 u64 iova, unsigned long page_size) 1721 { 1722 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1723 int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE; 1724 struct ib_umem *old_umem = mr->umem; 1725 int err; 1726 1727 /* 1728 * To keep everything simple the MR is revoked before we start to mess 1729 * with it. This ensure the change is atomic relative to any use of the 1730 * MR. 1731 */ 1732 err = revoke_mr(mr); 1733 if (err) 1734 return err; 1735 1736 if (flags & IB_MR_REREG_PD) { 1737 mr->ibmr.pd = pd; 1738 upd_flags |= MLX5_IB_UPD_XLT_PD; 1739 } 1740 if (flags & IB_MR_REREG_ACCESS) { 1741 mr->access_flags = access_flags; 1742 upd_flags |= MLX5_IB_UPD_XLT_ACCESS; 1743 } 1744 1745 mr->ibmr.length = new_umem->length; 1746 mr->ibmr.iova = iova; 1747 mr->ibmr.length = new_umem->length; 1748 mr->page_shift = order_base_2(page_size); 1749 mr->umem = new_umem; 1750 err = mlx5_ib_update_mr_pas(mr, upd_flags); 1751 if (err) { 1752 /* 1753 * The MR is revoked at this point so there is no issue to free 1754 * new_umem. 
1755 */ 1756 mr->umem = old_umem; 1757 return err; 1758 } 1759 1760 atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages); 1761 ib_umem_release(old_umem); 1762 atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages); 1763 return 0; 1764 } 1765 1766 struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, 1767 u64 length, u64 iova, int new_access_flags, 1768 struct ib_pd *new_pd, 1769 struct ib_udata *udata) 1770 { 1771 struct mlx5_ib_dev *dev = to_mdev(ib_mr->device); 1772 struct mlx5_ib_mr *mr = to_mmr(ib_mr); 1773 int err; 1774 1775 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) 1776 return ERR_PTR(-EOPNOTSUPP); 1777 1778 mlx5_ib_dbg( 1779 dev, 1780 "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", 1781 start, iova, length, new_access_flags); 1782 1783 if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) 1784 return ERR_PTR(-EOPNOTSUPP); 1785 1786 if (!(flags & IB_MR_REREG_ACCESS)) 1787 new_access_flags = mr->access_flags; 1788 if (!(flags & IB_MR_REREG_PD)) 1789 new_pd = ib_mr->pd; 1790 1791 if (!(flags & IB_MR_REREG_TRANS)) { 1792 struct ib_umem *umem; 1793 1794 /* Fast path for PD/access change */ 1795 if (can_use_umr_rereg_access(dev, mr->access_flags, 1796 new_access_flags)) { 1797 err = umr_rereg_pd_access(mr, new_pd, new_access_flags); 1798 if (err) 1799 return ERR_PTR(err); 1800 return NULL; 1801 } 1802 /* DM or ODP MR's don't have a normal umem so we can't re-use it */ 1803 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr)) 1804 goto recreate; 1805 1806 /* 1807 * Only one active MR can refer to a umem at one time, revoke 1808 * the old MR before assigning the umem to the new one. 1809 */ 1810 err = revoke_mr(mr); 1811 if (err) 1812 return ERR_PTR(err); 1813 umem = mr->umem; 1814 mr->umem = NULL; 1815 atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages); 1816 1817 return create_real_mr(new_pd, umem, mr->ibmr.iova, 1818 new_access_flags); 1819 } 1820 1821 /* 1822 * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does 1823 * but the logic around releasing the umem is different 1824 */ 1825 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr)) 1826 goto recreate; 1827 1828 if (!(new_access_flags & IB_ACCESS_ON_DEMAND) && 1829 can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) { 1830 struct ib_umem *new_umem; 1831 unsigned long page_size; 1832 1833 new_umem = ib_umem_get(&dev->ib_dev, start, length, 1834 new_access_flags); 1835 if (IS_ERR(new_umem)) 1836 return ERR_CAST(new_umem); 1837 1838 /* Fast path for PAS change */ 1839 if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova, 1840 &page_size)) { 1841 err = umr_rereg_pas(mr, new_pd, new_access_flags, flags, 1842 new_umem, iova, page_size); 1843 if (err) { 1844 ib_umem_release(new_umem); 1845 return ERR_PTR(err); 1846 } 1847 return NULL; 1848 } 1849 return create_real_mr(new_pd, new_umem, iova, new_access_flags); 1850 } 1851 1852 /* 1853 * Everything else has no state we can preserve, just create a new MR 1854 * from scratch 1855 */ 1856 recreate: 1857 return mlx5_ib_reg_user_mr(new_pd, start, length, iova, 1858 new_access_flags, udata); 1859 } 1860 1861 static int 1862 mlx5_alloc_priv_descs(struct ib_device *device, 1863 struct mlx5_ib_mr *mr, 1864 int ndescs, 1865 int desc_size) 1866 { 1867 struct mlx5_ib_dev *dev = to_mdev(device); 1868 struct device *ddev = &dev->mdev->pdev->dev; 1869 int size = ndescs * desc_size; 1870 int add_size; 1871 int ret; 1872 1873 add_size = max_t(int, MLX5_UMR_ALIGN - 
ARCH_KMALLOC_MINALIGN, 0); 1874 1875 mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL); 1876 if (!mr->descs_alloc) 1877 return -ENOMEM; 1878 1879 mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN); 1880 1881 mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE); 1882 if (dma_mapping_error(ddev, mr->desc_map)) { 1883 ret = -ENOMEM; 1884 goto err; 1885 } 1886 1887 return 0; 1888 err: 1889 kfree(mr->descs_alloc); 1890 1891 return ret; 1892 } 1893 1894 static void 1895 mlx5_free_priv_descs(struct mlx5_ib_mr *mr) 1896 { 1897 if (!mr->umem && mr->descs) { 1898 struct ib_device *device = mr->ibmr.device; 1899 int size = mr->max_descs * mr->desc_size; 1900 struct mlx5_ib_dev *dev = to_mdev(device); 1901 1902 dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size, 1903 DMA_TO_DEVICE); 1904 kfree(mr->descs_alloc); 1905 mr->descs = NULL; 1906 } 1907 } 1908 1909 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) 1910 { 1911 struct mlx5_ib_mr *mr = to_mmr(ibmr); 1912 struct mlx5_ib_dev *dev = to_mdev(ibmr->device); 1913 int rc; 1914 1915 /* 1916 * Any async use of the mr must hold the refcount, once the refcount 1917 * goes to zero no other thread, such as ODP page faults, prefetch, any 1918 * UMR activity, etc can touch the mkey. Thus it is safe to destroy it. 1919 */ 1920 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && 1921 refcount_read(&mr->mmkey.usecount) != 0 && 1922 xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key))) 1923 mlx5r_deref_wait_odp_mkey(&mr->mmkey); 1924 1925 if (ibmr->type == IB_MR_TYPE_INTEGRITY) { 1926 xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key), 1927 mr->sig, NULL, GFP_KERNEL); 1928 1929 if (mr->mtt_mr) { 1930 rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL); 1931 if (rc) 1932 return rc; 1933 mr->mtt_mr = NULL; 1934 } 1935 if (mr->klm_mr) { 1936 rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL); 1937 if (rc) 1938 return rc; 1939 mr->klm_mr = NULL; 1940 } 1941 1942 if (mlx5_core_destroy_psv(dev->mdev, 1943 mr->sig->psv_memory.psv_idx)) 1944 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", 1945 mr->sig->psv_memory.psv_idx); 1946 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx)) 1947 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", 1948 mr->sig->psv_wire.psv_idx); 1949 kfree(mr->sig); 1950 mr->sig = NULL; 1951 } 1952 1953 /* Stop DMA */ 1954 if (mr->cache_ent) { 1955 if (revoke_mr(mr)) { 1956 spin_lock_irq(&mr->cache_ent->lock); 1957 mr->cache_ent->total_mrs--; 1958 spin_unlock_irq(&mr->cache_ent->lock); 1959 mr->cache_ent = NULL; 1960 } 1961 } 1962 if (!mr->cache_ent) { 1963 rc = destroy_mkey(to_mdev(mr->ibmr.device), mr); 1964 if (rc) 1965 return rc; 1966 } 1967 1968 if (mr->umem) { 1969 bool is_odp = is_odp_mr(mr); 1970 1971 if (!is_odp) 1972 atomic_sub(ib_umem_num_pages(mr->umem), 1973 &dev->mdev->priv.reg_pages); 1974 ib_umem_release(mr->umem); 1975 if (is_odp) 1976 mlx5_ib_free_odp_mr(mr); 1977 } 1978 1979 if (mr->cache_ent) { 1980 mlx5_mr_cache_free(dev, mr); 1981 } else { 1982 mlx5_free_priv_descs(mr); 1983 kfree(mr); 1984 } 1985 return 0; 1986 } 1987 1988 static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, 1989 int access_mode, int page_shift) 1990 { 1991 void *mkc; 1992 1993 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1994 1995 /* This is only used from the kernel, so setting the PD is OK. 
static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
				   int access_mode, int page_shift)
{
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	/* This is only used from the kernel, so setting the PD is OK. */
	set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, log_page_size, page_shift);
}

static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				  int ndescs, int desc_size, int page_shift,
				  int access_mode, u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int err;

	mr->access_mode = access_mode;
	mr->desc_size = desc_size;
	mr->max_descs = ndescs;

	err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
	if (err)
		return err;

	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_free_descs;

	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;

	return 0;

err_free_descs:
	mlx5_free_priv_descs(mr);
	return err;
}

static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
					      u32 max_num_sg,
					      u32 max_num_meta_sg,
					      int desc_size, int access_mode)
{
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
	int page_shift = 0;
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.pd = pd;
	mr->ibmr.device = pd->device;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
		page_shift = PAGE_SHIFT;

	err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
				     access_mode, in, inlen);
	if (err)
		goto err_free_in;

	mr->umem = NULL;
	kfree(in);

	return mr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
				      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
				      inlen);
}

static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
				      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
}

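/*
 * An integrity MR is a composite object: a signature context with memory and
 * wire PSVs plus two internal PI MRs (one KLM-based, one MTT-based) used to
 * map the data/metadata scatterlists, all fronted by a KLM mkey with BSF
 * enabled.
 */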
static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				      int max_num_sg, int max_num_meta_sg,
				      u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 psv_index[2];
	void *mkc;
	int err;

	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
	if (!mr->sig)
		return -ENOMEM;

	/* create mem & wire PSVs */
	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
	if (err)
		goto err_free_sig;

	mr->sig->psv_memory.psv_idx = psv_index[0];
	mr->sig->psv_wire.psv_idx = psv_index[1];

	mr->sig->sig_status_checked = true;
	mr->sig->sig_err_exists = false;
	/* Next UMR, Arm SIGERR */
	++mr->sig->sigerr_count;
	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_klm),
					 MLX5_MKC_ACCESS_MODE_KLMS);
	if (IS_ERR(mr->klm_mr)) {
		err = PTR_ERR(mr->klm_mr);
		goto err_destroy_psv;
	}
	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_mtt),
					 MLX5_MKC_ACCESS_MODE_MTT);
	if (IS_ERR(mr->mtt_mr)) {
		err = PTR_ERR(mr->mtt_mr);
		goto err_free_klm_mr;
	}

	/* Set bsf descriptors for mkey */
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, bsf_en, 1);
	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);

	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
	if (err)
		goto err_free_mtt_mr;

	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			      mr->sig, GFP_KERNEL));
	if (err)
		goto err_free_descs;
	return 0;

err_free_descs:
	destroy_mkey(dev, mr);
	mlx5_free_priv_descs(mr);
err_free_mtt_mr:
	mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
	mr->mtt_mr = NULL;
err_free_klm_mr:
	mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
	mr->klm_mr = NULL;
err_destroy_psv:
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
			     mr->sig->psv_memory.psv_idx);
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
			     mr->sig->psv_wire.psv_idx);
err_free_sig:
	kfree(mr->sig);

	return err;
}

static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
					enum ib_mr_type mr_type, u32 max_num_sg,
					u32 max_num_meta_sg)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg, 4);
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mr->ibmr.device = pd->device;
	mr->umem = NULL;

	switch (mr_type) {
	case IB_MR_TYPE_MEM_REG:
		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_SG_GAPS:
		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_INTEGRITY:
		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
						 max_num_meta_sg, in, inlen);
		break;
	default:
		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
		err = -EINVAL;
	}

	if (err)
		goto err_free_in;

	kfree(in);

	return &mr->ibmr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
			       u32 max_num_sg)
{
	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
}

struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
					 u32 max_num_sg, u32 max_num_meta_sg)
{
	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
				  max_num_meta_sg);
}

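/*
 * Memory windows are backed by a free KLM mkey with umr_en set; the mkey key
 * is returned to userspace as the MW rkey, and on ODP-capable builds the MW
 * is also tracked so that page-fault handling can look it up.
 */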
int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mw *mw = to_mmw(ibmw);
	unsigned int ndescs;
	u32 *in = NULL;
	void *mkc;
	int err;
	struct mlx5_ib_alloc_mw req = {};
	struct {
		__u32 comp_mask;
		__u32 response_length;
	} resp = {};

	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
	if (err)
		return err;

	if (req.comp_mask || req.reserved1 || req.reserved2)
		return -EOPNOTSUPP;

	if (udata->inlen > sizeof(req) &&
	    !ib_is_udata_cleared(udata, sizeof(req),
				 udata->inlen - sizeof(req)))
		return -EOPNOTSUPP;

	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
	MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
	if (err)
		goto free;

	mw->mmkey.type = MLX5_MKEY_MW;
	ibmw->rkey = mw->mmkey.key;
	mw->mmkey.ndescs = ndescs;

	resp.response_length =
		min(offsetofend(typeof(resp), response_length), udata->outlen);
	if (resp.response_length) {
		err = ib_copy_to_udata(udata, &resp, resp.response_length);
		if (err)
			goto free_mkey;
	}

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
		err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
		if (err)
			goto free_mkey;
	}

	kfree(in);
	return 0;

free_mkey:
	mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
free:
	kfree(in);
	return err;
}

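/*
 * Tear-down order matters: the MW is removed from the ODP mkey tracking and
 * any in-flight page-fault users are waited out before the mkey itself is
 * destroyed.
 */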
int mlx5_ib_dealloc_mw(struct ib_mw *mw)
{
	struct mlx5_ib_dev *dev = to_mdev(mw->device);
	struct mlx5_ib_mw *mmw = to_mmw(mw);

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
		/*
		 * pagefault_single_data_segment() may be accessing mmw
		 * if the user bound an ODP MR to this MW.
		 */
		mlx5r_deref_wait_odp_mkey(&mmw->mmkey);

	return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
}

int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
			    struct ib_mr_status *mr_status)
{
	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
	int ret = 0;

	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
		pr_err("Invalid status check mask\n");
		ret = -EINVAL;
		goto done;
	}

	mr_status->fail_status = 0;
	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
		if (!mmr->sig) {
			ret = -EINVAL;
			pr_err("signature status check requested on a non-signature enabled MR\n");
			goto done;
		}

		mmr->sig->sig_status_checked = true;
		if (!mmr->sig->sig_err_exists)
			goto done;

		if (ibmr->lkey == mmr->sig->err_item.key)
			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
			       sizeof(mr_status->sig_err));
		else {
			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
			mr_status->sig_err.sig_err_offset = 0;
			mr_status->sig_err.key = mmr->sig->err_item.key;
		}

		mmr->sig->sig_err_exists = false;
		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
	}

done:
	return ret;
}

static int
mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			int data_sg_nents, unsigned int *data_sg_offset,
			struct scatterlist *meta_sg, int meta_sg_nents,
			unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	unsigned int sg_offset = 0;
	int n = 0;

	mr->meta_length = 0;
	if (data_sg_nents == 1) {
		n++;
		mr->mmkey.ndescs = 1;
		if (data_sg_offset)
			sg_offset = *data_sg_offset;
		mr->data_length = sg_dma_len(data_sg) - sg_offset;
		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
		if (meta_sg_nents == 1) {
			n++;
			mr->meta_ndescs = 1;
			if (meta_sg_offset)
				sg_offset = *meta_sg_offset;
			else
				sg_offset = 0;
			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
		}
		ibmr->length = mr->data_length + mr->meta_length;
	}

	return n;
}

static int
mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
		   struct scatterlist *sgl,
		   unsigned short sg_nents,
		   unsigned int *sg_offset_p,
		   struct scatterlist *meta_sgl,
		   unsigned short meta_sg_nents,
		   unsigned int *meta_sg_offset_p)
{
	struct scatterlist *sg = sgl;
	struct mlx5_klm *klms = mr->descs;
	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
	u32 lkey = mr->ibmr.pd->local_dma_lkey;
	int i, j = 0;

	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
	mr->ibmr.length = 0;

	for_each_sg(sgl, sg, sg_nents, i) {
		if (unlikely(i >= mr->max_descs))
			break;
		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
		klms[i].key = cpu_to_be32(lkey);
		mr->ibmr.length += sg_dma_len(sg) - sg_offset;

		sg_offset = 0;
	}

	if (sg_offset_p)
		*sg_offset_p = sg_offset;

	mr->mmkey.ndescs = i;
	mr->data_length = mr->ibmr.length;

	if (meta_sg_nents) {
		sg = meta_sgl;
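		/*
		 * Metadata entries are appended after the data entries in the
		 * same KLM table, so they share mr->max_descs with the data.
		 */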
		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
			if (unlikely(i + j >= mr->max_descs))
				break;
			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
						     sg_offset);
			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
							 sg_offset);
			klms[i + j].key = cpu_to_be32(lkey);
			mr->ibmr.length += sg_dma_len(sg) - sg_offset;

			sg_offset = 0;
		}
		if (meta_sg_offset_p)
			*meta_sg_offset_p = sg_offset;

		mr->meta_ndescs = j;
		mr->meta_length = mr->ibmr.length - mr->data_length;
	}

	return i + j;
}

static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int
mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	pi_mr->ibmr.page_size = ibmr->page_size;
	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
			   mlx5_set_page);
	if (n != data_sg_nents)
		return n;

	pi_mr->data_iova = pi_mr->ibmr.iova;
	pi_mr->data_length = pi_mr->ibmr.length;
	pi_mr->ibmr.length = pi_mr->data_length;
	ibmr->length = pi_mr->data_length;

	if (meta_sg_nents) {
		u64 page_mask = ~((u64)ibmr->page_size - 1);
		u64 iova = pi_mr->data_iova;

		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
				    meta_sg_offset, mlx5_set_page_pi);

		pi_mr->meta_length = pi_mr->ibmr.length;
		/*
		 * PI address for the HW is the offset of the metadata address
		 * relative to the first data page address.
		 * It equals the first data page address + the size of the data
		 * pages + the metadata offset within the first metadata page.
		 */
		pi_mr->pi_iova = (iova & page_mask) +
				 pi_mr->mmkey.ndescs * ibmr->page_size +
				 (pi_mr->ibmr.iova & ~page_mask);
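		/*
		 * Illustrative example (made-up numbers): with a 4K page size,
		 * iova = 0x10000, 16 data pages (mmkey.ndescs == 16) and a
		 * metadata iova whose in-page offset is 0x200, this yields
		 * pi_iova = 0x10000 + 16 * 0x1000 + 0x200 = 0x20200.
		 */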
		/*
		 * In order to use one MTT MR for data and metadata, we also
		 * register the gaps between the end of the data and the start
		 * of the metadata (the sig MR verifies that the HW accesses
		 * the right addresses). This mapping is safe because we use an
		 * internal mkey for the registration.
		 */
		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
		pi_mr->ibmr.iova = iova;
		ibmr->length += pi_mr->meta_length;
	}

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}

static int
mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
			       meta_sg, meta_sg_nents, meta_sg_offset);

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	/* This is zero-based memory region */
	pi_mr->data_iova = 0;
	pi_mr->ibmr.iova = 0;
	pi_mr->pi_iova = pi_mr->data_length;
	ibmr->length = pi_mr->ibmr.length;

	return n;
}

int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = NULL;
	int n;

	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);

	mr->mmkey.ndescs = 0;
	mr->data_length = 0;
	mr->data_iova = 0;
	mr->meta_ndescs = 0;
	mr->pi_iova = 0;
	/*
	 * As a performance optimization, if possible, there is no need to
	 * perform a UMR operation to register the data/metadata buffers.
	 * First try to map the sg lists to PA descriptors with local_dma_lkey.
	 * Fall back to UMR only in case of failure.
	 */
	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				    data_sg_offset, meta_sg, meta_sg_nents,
				    meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;
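	/*
	 * The PA fast path above only succeeds when the data (and, if present,
	 * the metadata) fits in a single DMA-contiguous SG entry; anything
	 * longer falls through to the MTT/KLM paths below.
	 */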
	/*
	 * As a performance optimization, if possible, there is no need to map
	 * the sg lists to KLM descriptors. First try to map the sg lists to
	 * MTT descriptors and fall back to KLM only in case of failure.
	 * It's more efficient for the HW to work with MTT descriptors
	 * (especially under high load).
	 * Use KLM (indirect access) only if it's mandatory.
	 */
	pi_mr = mr->mtt_mr;
	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;

	pi_mr = mr->klm_mr;
	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (unlikely(n != data_sg_nents + meta_sg_nents))
		return -ENOMEM;

out:
	/* This is zero-based memory region */
	ibmr->iova = 0;
	mr->pi_mr = pi_mr;
	if (pi_mr)
		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
	else
		ibmr->sig_attrs->meta_length = mr->meta_length;

	return 0;
}

int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
		      unsigned int *sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	int n;

	mr->mmkey.ndescs = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
				   mr->desc_size * mr->max_descs,
				   DMA_TO_DEVICE);

	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
				       NULL);
	else
		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
				   mlx5_set_page);

	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
				      mr->desc_size * mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}