1 /* 2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. 3 * Copyright (c) 2020, Intel Corporation. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 35 #include <linux/kref.h> 36 #include <linux/random.h> 37 #include <linux/debugfs.h> 38 #include <linux/export.h> 39 #include <linux/delay.h> 40 #include <linux/dma-buf.h> 41 #include <linux/dma-resv.h> 42 #include <rdma/ib_umem.h> 43 #include <rdma/ib_umem_odp.h> 44 #include <rdma/ib_verbs.h> 45 #include "dm.h" 46 #include "mlx5_ib.h" 47 48 /* 49 * We can't use an array for xlt_emergency_page because dma_map_single doesn't 50 * work on kernel modules memory 51 */ 52 void *xlt_emergency_page; 53 static DEFINE_MUTEX(xlt_emergency_page_mutex); 54 55 enum { 56 MAX_PENDING_REG_MR = 8, 57 }; 58 59 #define MLX5_UMR_ALIGN 2048 60 61 static void 62 create_mkey_callback(int status, struct mlx5_async_work *context); 63 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, 64 u64 iova, int access_flags, 65 unsigned int page_size, bool populate); 66 67 static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr, 68 struct ib_pd *pd) 69 { 70 struct mlx5_ib_dev *dev = to_mdev(pd->device); 71 72 MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC)); 73 MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE)); 74 MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ)); 75 MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE)); 76 MLX5_SET(mkc, mkc, lr, 1); 77 78 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write)) 79 MLX5_SET(mkc, mkc, relaxed_ordering_write, 80 !!(acc & IB_ACCESS_RELAXED_ORDERING)); 81 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read)) 82 MLX5_SET(mkc, mkc, relaxed_ordering_read, 83 !!(acc & IB_ACCESS_RELAXED_ORDERING)); 84 85 MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); 86 MLX5_SET(mkc, mkc, qpn, 0xffffff); 87 MLX5_SET64(mkc, mkc, start_addr, start_addr); 88 } 89 90 static void 91 assign_mkey_variant(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey, 92 u32 *in) 93 { 94 u8 key = atomic_inc_return(&dev->mkey_var); 95 void *mkc; 96 97 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 98 MLX5_SET(mkc, mkc, mkey_7_0, key); 99 mkey->key = key; 100 } 101 102 static int 103 mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey, 104 u32 *in, int inlen) 105 { 106 assign_mkey_variant(dev, mkey, in); 107 return mlx5_core_create_mkey(dev->mdev, mkey, in, inlen); 108 } 109 110 static int 111 mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev, 112 struct mlx5_core_mkey *mkey, 113 struct mlx5_async_ctx *async_ctx, 114 u32 *in, int inlen, u32 *out, int outlen, 115 struct mlx5_async_work *context) 116 { 117 MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY); 118 assign_mkey_variant(dev, mkey, in); 119 return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen, 120 create_mkey_callback, context); 121 } 122 123 static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); 124 static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); 125 static int mr_cache_max_order(struct mlx5_ib_dev *dev); 126 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent); 127 128 static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) 129 { 130 return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled); 131 } 132 133 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) 134 { 135 WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key))); 136 137 return mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); 138 } 139 140 static void create_mkey_callback(int status, struct mlx5_async_work *context) 141 { 142 struct mlx5_ib_mr *mr = 143 container_of(context, struct mlx5_ib_mr, cb_work); 144 struct mlx5_cache_ent *ent = mr->cache_ent; 145 struct mlx5_ib_dev *dev = ent->dev; 146 unsigned long flags; 147 148 if (status) { 149 mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); 150 kfree(mr); 151 spin_lock_irqsave(&ent->lock, flags); 152 ent->pending--; 153 WRITE_ONCE(dev->fill_delay, 1); 154 spin_unlock_irqrestore(&ent->lock, flags); 155 mod_timer(&dev->delay_timer, jiffies + HZ); 156 return; 157 } 158 159 mr->mmkey.type = MLX5_MKEY_MR; 160 mr->mmkey.key |= mlx5_idx_to_mkey( 161 MLX5_GET(create_mkey_out, mr->out, mkey_index)); 162 init_waitqueue_head(&mr->mmkey.wait); 163 164 WRITE_ONCE(dev->cache.last_add, jiffies); 165 166 spin_lock_irqsave(&ent->lock, flags); 167 list_add_tail(&mr->list, &ent->head); 168 ent->available_mrs++; 169 ent->total_mrs++; 170 /* If we are doing fill_to_high_water then keep going. */ 171 queue_adjust_cache_locked(ent); 172 ent->pending--; 173 spin_unlock_irqrestore(&ent->lock, flags); 174 } 175 176 static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc) 177 { 178 struct mlx5_ib_mr *mr; 179 180 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 181 if (!mr) 182 return NULL; 183 mr->cache_ent = ent; 184 185 set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd); 186 MLX5_SET(mkc, mkc, free, 1); 187 MLX5_SET(mkc, mkc, umr_en, 1); 188 MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3); 189 MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7); 190 191 MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt); 192 MLX5_SET(mkc, mkc, log_page_size, ent->page); 193 return mr; 194 } 195 196 /* Asynchronously schedule new MRs to be populated in the cache. */ 197 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num) 198 { 199 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 200 struct mlx5_ib_mr *mr; 201 void *mkc; 202 u32 *in; 203 int err = 0; 204 int i; 205 206 in = kzalloc(inlen, GFP_KERNEL); 207 if (!in) 208 return -ENOMEM; 209 210 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 211 for (i = 0; i < num; i++) { 212 mr = alloc_cache_mr(ent, mkc); 213 if (!mr) { 214 err = -ENOMEM; 215 break; 216 } 217 spin_lock_irq(&ent->lock); 218 if (ent->pending >= MAX_PENDING_REG_MR) { 219 err = -EAGAIN; 220 spin_unlock_irq(&ent->lock); 221 kfree(mr); 222 break; 223 } 224 ent->pending++; 225 spin_unlock_irq(&ent->lock); 226 err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey, 227 &ent->dev->async_ctx, in, inlen, 228 mr->out, sizeof(mr->out), 229 &mr->cb_work); 230 if (err) { 231 spin_lock_irq(&ent->lock); 232 ent->pending--; 233 spin_unlock_irq(&ent->lock); 234 mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err); 235 kfree(mr); 236 break; 237 } 238 } 239 240 kfree(in); 241 return err; 242 } 243 244 /* Synchronously create a MR in the cache */ 245 static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent) 246 { 247 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 248 struct mlx5_ib_mr *mr; 249 void *mkc; 250 u32 *in; 251 int err; 252 253 in = kzalloc(inlen, GFP_KERNEL); 254 if (!in) 255 return ERR_PTR(-ENOMEM); 256 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 257 258 mr = alloc_cache_mr(ent, mkc); 259 if (!mr) { 260 err = -ENOMEM; 261 goto free_in; 262 } 263 264 err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey, in, inlen); 265 if (err) 266 goto free_mr; 267 268 mr->mmkey.type = MLX5_MKEY_MR; 269 WRITE_ONCE(ent->dev->cache.last_add, jiffies); 270 spin_lock_irq(&ent->lock); 271 ent->total_mrs++; 272 spin_unlock_irq(&ent->lock); 273 kfree(in); 274 return mr; 275 free_mr: 276 kfree(mr); 277 free_in: 278 kfree(in); 279 return ERR_PTR(err); 280 } 281 282 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent) 283 { 284 struct mlx5_ib_mr *mr; 285 286 lockdep_assert_held(&ent->lock); 287 if (list_empty(&ent->head)) 288 return; 289 mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); 290 list_del(&mr->list); 291 ent->available_mrs--; 292 ent->total_mrs--; 293 spin_unlock_irq(&ent->lock); 294 mlx5_core_destroy_mkey(ent->dev->mdev, &mr->mmkey); 295 kfree(mr); 296 spin_lock_irq(&ent->lock); 297 } 298 299 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target, 300 bool limit_fill) 301 { 302 int err; 303 304 lockdep_assert_held(&ent->lock); 305 306 while (true) { 307 if (limit_fill) 308 target = ent->limit * 2; 309 if (target == ent->available_mrs + ent->pending) 310 return 0; 311 if (target > ent->available_mrs + ent->pending) { 312 u32 todo = target - (ent->available_mrs + ent->pending); 313 314 spin_unlock_irq(&ent->lock); 315 err = add_keys(ent, todo); 316 if (err == -EAGAIN) 317 usleep_range(3000, 5000); 318 spin_lock_irq(&ent->lock); 319 if (err) { 320 if (err != -EAGAIN) 321 return err; 322 } else 323 return 0; 324 } else { 325 remove_cache_mr_locked(ent); 326 } 327 } 328 } 329 330 static ssize_t size_write(struct file *filp, const char __user *buf, 331 size_t count, loff_t *pos) 332 { 333 struct mlx5_cache_ent *ent = filp->private_data; 334 u32 target; 335 int err; 336 337 err = kstrtou32_from_user(buf, count, 0, &target); 338 if (err) 339 return err; 340 341 /* 342 * Target is the new value of total_mrs the user requests, however we 343 * cannot free MRs that are in use. Compute the target value for 344 * available_mrs. 345 */ 346 spin_lock_irq(&ent->lock); 347 if (target < ent->total_mrs - ent->available_mrs) { 348 err = -EINVAL; 349 goto err_unlock; 350 } 351 target = target - (ent->total_mrs - ent->available_mrs); 352 if (target < ent->limit || target > ent->limit*2) { 353 err = -EINVAL; 354 goto err_unlock; 355 } 356 err = resize_available_mrs(ent, target, false); 357 if (err) 358 goto err_unlock; 359 spin_unlock_irq(&ent->lock); 360 361 return count; 362 363 err_unlock: 364 spin_unlock_irq(&ent->lock); 365 return err; 366 } 367 368 static ssize_t size_read(struct file *filp, char __user *buf, size_t count, 369 loff_t *pos) 370 { 371 struct mlx5_cache_ent *ent = filp->private_data; 372 char lbuf[20]; 373 int err; 374 375 err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs); 376 if (err < 0) 377 return err; 378 379 return simple_read_from_buffer(buf, count, pos, lbuf, err); 380 } 381 382 static const struct file_operations size_fops = { 383 .owner = THIS_MODULE, 384 .open = simple_open, 385 .write = size_write, 386 .read = size_read, 387 }; 388 389 static ssize_t limit_write(struct file *filp, const char __user *buf, 390 size_t count, loff_t *pos) 391 { 392 struct mlx5_cache_ent *ent = filp->private_data; 393 u32 var; 394 int err; 395 396 err = kstrtou32_from_user(buf, count, 0, &var); 397 if (err) 398 return err; 399 400 /* 401 * Upon set we immediately fill the cache to high water mark implied by 402 * the limit. 403 */ 404 spin_lock_irq(&ent->lock); 405 ent->limit = var; 406 err = resize_available_mrs(ent, 0, true); 407 spin_unlock_irq(&ent->lock); 408 if (err) 409 return err; 410 return count; 411 } 412 413 static ssize_t limit_read(struct file *filp, char __user *buf, size_t count, 414 loff_t *pos) 415 { 416 struct mlx5_cache_ent *ent = filp->private_data; 417 char lbuf[20]; 418 int err; 419 420 err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit); 421 if (err < 0) 422 return err; 423 424 return simple_read_from_buffer(buf, count, pos, lbuf, err); 425 } 426 427 static const struct file_operations limit_fops = { 428 .owner = THIS_MODULE, 429 .open = simple_open, 430 .write = limit_write, 431 .read = limit_read, 432 }; 433 434 static bool someone_adding(struct mlx5_mr_cache *cache) 435 { 436 unsigned int i; 437 438 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 439 struct mlx5_cache_ent *ent = &cache->ent[i]; 440 bool ret; 441 442 spin_lock_irq(&ent->lock); 443 ret = ent->available_mrs < ent->limit; 444 spin_unlock_irq(&ent->lock); 445 if (ret) 446 return true; 447 } 448 return false; 449 } 450 451 /* 452 * Check if the bucket is outside the high/low water mark and schedule an async 453 * update. The cache refill has hysteresis, once the low water mark is hit it is 454 * refilled up to the high mark. 455 */ 456 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) 457 { 458 lockdep_assert_held(&ent->lock); 459 460 if (ent->disabled || READ_ONCE(ent->dev->fill_delay)) 461 return; 462 if (ent->available_mrs < ent->limit) { 463 ent->fill_to_high_water = true; 464 queue_work(ent->dev->cache.wq, &ent->work); 465 } else if (ent->fill_to_high_water && 466 ent->available_mrs + ent->pending < 2 * ent->limit) { 467 /* 468 * Once we start populating due to hitting a low water mark 469 * continue until we pass the high water mark. 470 */ 471 queue_work(ent->dev->cache.wq, &ent->work); 472 } else if (ent->available_mrs == 2 * ent->limit) { 473 ent->fill_to_high_water = false; 474 } else if (ent->available_mrs > 2 * ent->limit) { 475 /* Queue deletion of excess entries */ 476 ent->fill_to_high_water = false; 477 if (ent->pending) 478 queue_delayed_work(ent->dev->cache.wq, &ent->dwork, 479 msecs_to_jiffies(1000)); 480 else 481 queue_work(ent->dev->cache.wq, &ent->work); 482 } 483 } 484 485 static void __cache_work_func(struct mlx5_cache_ent *ent) 486 { 487 struct mlx5_ib_dev *dev = ent->dev; 488 struct mlx5_mr_cache *cache = &dev->cache; 489 int err; 490 491 spin_lock_irq(&ent->lock); 492 if (ent->disabled) 493 goto out; 494 495 if (ent->fill_to_high_water && 496 ent->available_mrs + ent->pending < 2 * ent->limit && 497 !READ_ONCE(dev->fill_delay)) { 498 spin_unlock_irq(&ent->lock); 499 err = add_keys(ent, 1); 500 spin_lock_irq(&ent->lock); 501 if (ent->disabled) 502 goto out; 503 if (err) { 504 /* 505 * EAGAIN only happens if pending is positive, so we 506 * will be rescheduled from reg_mr_callback(). The only 507 * failure path here is ENOMEM. 508 */ 509 if (err != -EAGAIN) { 510 mlx5_ib_warn( 511 dev, 512 "command failed order %d, err %d\n", 513 ent->order, err); 514 queue_delayed_work(cache->wq, &ent->dwork, 515 msecs_to_jiffies(1000)); 516 } 517 } 518 } else if (ent->available_mrs > 2 * ent->limit) { 519 bool need_delay; 520 521 /* 522 * The remove_cache_mr() logic is performed as garbage 523 * collection task. Such task is intended to be run when no 524 * other active processes are running. 525 * 526 * The need_resched() will return TRUE if there are user tasks 527 * to be activated in near future. 528 * 529 * In such case, we don't execute remove_cache_mr() and postpone 530 * the garbage collection work to try to run in next cycle, in 531 * order to free CPU resources to other tasks. 532 */ 533 spin_unlock_irq(&ent->lock); 534 need_delay = need_resched() || someone_adding(cache) || 535 time_after(jiffies, 536 READ_ONCE(cache->last_add) + 300 * HZ); 537 spin_lock_irq(&ent->lock); 538 if (ent->disabled) 539 goto out; 540 if (need_delay) 541 queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); 542 remove_cache_mr_locked(ent); 543 queue_adjust_cache_locked(ent); 544 } 545 out: 546 spin_unlock_irq(&ent->lock); 547 } 548 549 static void delayed_cache_work_func(struct work_struct *work) 550 { 551 struct mlx5_cache_ent *ent; 552 553 ent = container_of(work, struct mlx5_cache_ent, dwork.work); 554 __cache_work_func(ent); 555 } 556 557 static void cache_work_func(struct work_struct *work) 558 { 559 struct mlx5_cache_ent *ent; 560 561 ent = container_of(work, struct mlx5_cache_ent, work); 562 __cache_work_func(ent); 563 } 564 565 /* Allocate a special entry from the cache */ 566 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, 567 unsigned int entry, int access_flags) 568 { 569 struct mlx5_mr_cache *cache = &dev->cache; 570 struct mlx5_cache_ent *ent; 571 struct mlx5_ib_mr *mr; 572 573 if (WARN_ON(entry <= MR_CACHE_LAST_STD_ENTRY || 574 entry >= ARRAY_SIZE(cache->ent))) 575 return ERR_PTR(-EINVAL); 576 577 /* Matches access in alloc_cache_mr() */ 578 if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) 579 return ERR_PTR(-EOPNOTSUPP); 580 581 ent = &cache->ent[entry]; 582 spin_lock_irq(&ent->lock); 583 if (list_empty(&ent->head)) { 584 spin_unlock_irq(&ent->lock); 585 mr = create_cache_mr(ent); 586 if (IS_ERR(mr)) 587 return mr; 588 } else { 589 mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); 590 list_del(&mr->list); 591 ent->available_mrs--; 592 queue_adjust_cache_locked(ent); 593 spin_unlock_irq(&ent->lock); 594 } 595 mr->access_flags = access_flags; 596 return mr; 597 } 598 599 /* Return a MR already available in the cache */ 600 static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent) 601 { 602 struct mlx5_ib_dev *dev = req_ent->dev; 603 struct mlx5_ib_mr *mr = NULL; 604 struct mlx5_cache_ent *ent = req_ent; 605 606 /* Try larger MR pools from the cache to satisfy the allocation */ 607 for (; ent != &dev->cache.ent[MR_CACHE_LAST_STD_ENTRY + 1]; ent++) { 608 mlx5_ib_dbg(dev, "order %u, cache index %zu\n", ent->order, 609 ent - dev->cache.ent); 610 611 spin_lock_irq(&ent->lock); 612 if (!list_empty(&ent->head)) { 613 mr = list_first_entry(&ent->head, struct mlx5_ib_mr, 614 list); 615 list_del(&mr->list); 616 ent->available_mrs--; 617 queue_adjust_cache_locked(ent); 618 spin_unlock_irq(&ent->lock); 619 break; 620 } 621 queue_adjust_cache_locked(ent); 622 spin_unlock_irq(&ent->lock); 623 } 624 625 if (!mr) 626 req_ent->miss++; 627 628 return mr; 629 } 630 631 static void detach_mr_from_cache(struct mlx5_ib_mr *mr) 632 { 633 struct mlx5_cache_ent *ent = mr->cache_ent; 634 635 mr->cache_ent = NULL; 636 spin_lock_irq(&ent->lock); 637 ent->total_mrs--; 638 spin_unlock_irq(&ent->lock); 639 } 640 641 void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) 642 { 643 struct mlx5_cache_ent *ent = mr->cache_ent; 644 645 if (!ent) 646 return; 647 648 if (mlx5_mr_cache_invalidate(mr)) { 649 detach_mr_from_cache(mr); 650 destroy_mkey(dev, mr); 651 kfree(mr); 652 return; 653 } 654 655 spin_lock_irq(&ent->lock); 656 list_add_tail(&mr->list, &ent->head); 657 ent->available_mrs++; 658 queue_adjust_cache_locked(ent); 659 spin_unlock_irq(&ent->lock); 660 } 661 662 static void clean_keys(struct mlx5_ib_dev *dev, int c) 663 { 664 struct mlx5_mr_cache *cache = &dev->cache; 665 struct mlx5_cache_ent *ent = &cache->ent[c]; 666 struct mlx5_ib_mr *tmp_mr; 667 struct mlx5_ib_mr *mr; 668 LIST_HEAD(del_list); 669 670 cancel_delayed_work(&ent->dwork); 671 while (1) { 672 spin_lock_irq(&ent->lock); 673 if (list_empty(&ent->head)) { 674 spin_unlock_irq(&ent->lock); 675 break; 676 } 677 mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); 678 list_move(&mr->list, &del_list); 679 ent->available_mrs--; 680 ent->total_mrs--; 681 spin_unlock_irq(&ent->lock); 682 mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); 683 } 684 685 list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { 686 list_del(&mr->list); 687 kfree(mr); 688 } 689 } 690 691 static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) 692 { 693 if (!mlx5_debugfs_root || dev->is_rep) 694 return; 695 696 debugfs_remove_recursive(dev->cache.root); 697 dev->cache.root = NULL; 698 } 699 700 static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev) 701 { 702 struct mlx5_mr_cache *cache = &dev->cache; 703 struct mlx5_cache_ent *ent; 704 struct dentry *dir; 705 int i; 706 707 if (!mlx5_debugfs_root || dev->is_rep) 708 return; 709 710 cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root); 711 712 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 713 ent = &cache->ent[i]; 714 sprintf(ent->name, "%d", ent->order); 715 dir = debugfs_create_dir(ent->name, cache->root); 716 debugfs_create_file("size", 0600, dir, ent, &size_fops); 717 debugfs_create_file("limit", 0600, dir, ent, &limit_fops); 718 debugfs_create_u32("cur", 0400, dir, &ent->available_mrs); 719 debugfs_create_u32("miss", 0600, dir, &ent->miss); 720 } 721 } 722 723 static void delay_time_func(struct timer_list *t) 724 { 725 struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer); 726 727 WRITE_ONCE(dev->fill_delay, 0); 728 } 729 730 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) 731 { 732 struct mlx5_mr_cache *cache = &dev->cache; 733 struct mlx5_cache_ent *ent; 734 int i; 735 736 mutex_init(&dev->slow_path_mutex); 737 cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); 738 if (!cache->wq) { 739 mlx5_ib_warn(dev, "failed to create work queue\n"); 740 return -ENOMEM; 741 } 742 743 mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx); 744 timer_setup(&dev->delay_timer, delay_time_func, 0); 745 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 746 ent = &cache->ent[i]; 747 INIT_LIST_HEAD(&ent->head); 748 spin_lock_init(&ent->lock); 749 ent->order = i + 2; 750 ent->dev = dev; 751 ent->limit = 0; 752 753 INIT_WORK(&ent->work, cache_work_func); 754 INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); 755 756 if (i > MR_CACHE_LAST_STD_ENTRY) { 757 mlx5_odp_init_mr_cache_entry(ent); 758 continue; 759 } 760 761 if (ent->order > mr_cache_max_order(dev)) 762 continue; 763 764 ent->page = PAGE_SHIFT; 765 ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) / 766 MLX5_IB_UMR_OCTOWORD; 767 ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; 768 if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) && 769 !dev->is_rep && mlx5_core_is_pf(dev->mdev) && 770 mlx5_ib_can_load_pas_with_umr(dev, 0)) 771 ent->limit = dev->mdev->profile->mr_cache[i].limit; 772 else 773 ent->limit = 0; 774 spin_lock_irq(&ent->lock); 775 queue_adjust_cache_locked(ent); 776 spin_unlock_irq(&ent->lock); 777 } 778 779 mlx5_mr_cache_debugfs_init(dev); 780 781 return 0; 782 } 783 784 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) 785 { 786 unsigned int i; 787 788 if (!dev->cache.wq) 789 return 0; 790 791 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 792 struct mlx5_cache_ent *ent = &dev->cache.ent[i]; 793 794 spin_lock_irq(&ent->lock); 795 ent->disabled = true; 796 spin_unlock_irq(&ent->lock); 797 cancel_work_sync(&ent->work); 798 cancel_delayed_work_sync(&ent->dwork); 799 } 800 801 mlx5_mr_cache_debugfs_cleanup(dev); 802 mlx5_cmd_cleanup_async_ctx(&dev->async_ctx); 803 804 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) 805 clean_keys(dev, i); 806 807 destroy_workqueue(dev->cache.wq); 808 del_timer_sync(&dev->delay_timer); 809 810 return 0; 811 } 812 813 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) 814 { 815 struct mlx5_ib_dev *dev = to_mdev(pd->device); 816 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 817 struct mlx5_ib_mr *mr; 818 void *mkc; 819 u32 *in; 820 int err; 821 822 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 823 if (!mr) 824 return ERR_PTR(-ENOMEM); 825 826 in = kzalloc(inlen, GFP_KERNEL); 827 if (!in) { 828 err = -ENOMEM; 829 goto err_free; 830 } 831 832 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 833 834 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); 835 MLX5_SET(mkc, mkc, length64, 1); 836 set_mkc_access_pd_addr_fields(mkc, acc, 0, pd); 837 838 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 839 if (err) 840 goto err_in; 841 842 kfree(in); 843 mr->mmkey.type = MLX5_MKEY_MR; 844 mr->ibmr.lkey = mr->mmkey.key; 845 mr->ibmr.rkey = mr->mmkey.key; 846 mr->umem = NULL; 847 848 return &mr->ibmr; 849 850 err_in: 851 kfree(in); 852 853 err_free: 854 kfree(mr); 855 856 return ERR_PTR(err); 857 } 858 859 static int get_octo_len(u64 addr, u64 len, int page_shift) 860 { 861 u64 page_size = 1ULL << page_shift; 862 u64 offset; 863 int npages; 864 865 offset = addr & (page_size - 1); 866 npages = ALIGN(len + offset, page_size) >> page_shift; 867 return (npages + 1) / 2; 868 } 869 870 static int mr_cache_max_order(struct mlx5_ib_dev *dev) 871 { 872 if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) 873 return MR_CACHE_LAST_STD_ENTRY + 2; 874 return MLX5_MAX_UMR_SHIFT; 875 } 876 877 static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc) 878 { 879 struct mlx5_ib_umr_context *context = 880 container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe); 881 882 context->status = wc->status; 883 complete(&context->done); 884 } 885 886 static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) 887 { 888 context->cqe.done = mlx5_ib_umr_done; 889 context->status = -1; 890 init_completion(&context->done); 891 } 892 893 static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev, 894 struct mlx5_umr_wr *umrwr) 895 { 896 struct umr_common *umrc = &dev->umrc; 897 const struct ib_send_wr *bad; 898 int err; 899 struct mlx5_ib_umr_context umr_context; 900 901 mlx5_ib_init_umr_context(&umr_context); 902 umrwr->wr.wr_cqe = &umr_context.cqe; 903 904 down(&umrc->sem); 905 err = ib_post_send(umrc->qp, &umrwr->wr, &bad); 906 if (err) { 907 mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err); 908 } else { 909 wait_for_completion(&umr_context.done); 910 if (umr_context.status != IB_WC_SUCCESS) { 911 mlx5_ib_warn(dev, "reg umr failed (%u)\n", 912 umr_context.status); 913 err = -EFAULT; 914 } 915 } 916 up(&umrc->sem); 917 return err; 918 } 919 920 static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev, 921 unsigned int order) 922 { 923 struct mlx5_mr_cache *cache = &dev->cache; 924 925 if (order < cache->ent[0].order) 926 return &cache->ent[0]; 927 order = order - cache->ent[0].order; 928 if (order > MR_CACHE_LAST_STD_ENTRY) 929 return NULL; 930 return &cache->ent[order]; 931 } 932 933 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, 934 u64 length, int access_flags) 935 { 936 mr->ibmr.lkey = mr->mmkey.key; 937 mr->ibmr.rkey = mr->mmkey.key; 938 mr->ibmr.length = length; 939 mr->ibmr.device = &dev->ib_dev; 940 mr->access_flags = access_flags; 941 } 942 943 static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem, 944 u64 iova) 945 { 946 /* 947 * The alignment of iova has already been checked upon entering 948 * UVERBS_METHOD_REG_DMABUF_MR 949 */ 950 umem->iova = iova; 951 return PAGE_SIZE; 952 } 953 954 static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, 955 struct ib_umem *umem, u64 iova, 956 int access_flags) 957 { 958 struct mlx5_ib_dev *dev = to_mdev(pd->device); 959 struct mlx5_cache_ent *ent; 960 struct mlx5_ib_mr *mr; 961 unsigned int page_size; 962 963 if (umem->is_dmabuf) 964 page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova); 965 else 966 page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, 967 0, iova); 968 if (WARN_ON(!page_size)) 969 return ERR_PTR(-EINVAL); 970 ent = mr_cache_ent_from_order( 971 dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size))); 972 /* 973 * Matches access in alloc_cache_mr(). If the MR can't come from the 974 * cache then synchronously create an uncached one. 975 */ 976 if (!ent || ent->limit == 0 || 977 !mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) { 978 mutex_lock(&dev->slow_path_mutex); 979 mr = reg_create(pd, umem, iova, access_flags, page_size, false); 980 mutex_unlock(&dev->slow_path_mutex); 981 return mr; 982 } 983 984 mr = get_cache_mr(ent); 985 if (!mr) { 986 mr = create_cache_mr(ent); 987 /* 988 * The above already tried to do the same stuff as reg_create(), 989 * no reason to try it again. 990 */ 991 if (IS_ERR(mr)) 992 return mr; 993 } 994 995 mr->ibmr.pd = pd; 996 mr->umem = umem; 997 mr->access_flags = access_flags; 998 mr->desc_size = sizeof(struct mlx5_mtt); 999 mr->mmkey.iova = iova; 1000 mr->mmkey.size = umem->length; 1001 mr->mmkey.pd = to_mpd(pd)->pdn; 1002 mr->page_shift = order_base_2(page_size); 1003 set_mr_fields(dev, mr, umem->length, access_flags); 1004 1005 return mr; 1006 } 1007 1008 #define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \ 1009 MLX5_UMR_MTT_ALIGNMENT) 1010 #define MLX5_SPARE_UMR_CHUNK 0x10000 1011 1012 /* 1013 * Allocate a temporary buffer to hold the per-page information to transfer to 1014 * HW. For efficiency this should be as large as it can be, but buffer 1015 * allocation failure is not allowed, so try smaller sizes. 1016 */ 1017 static void *mlx5_ib_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask) 1018 { 1019 const size_t xlt_chunk_align = 1020 MLX5_UMR_MTT_ALIGNMENT / sizeof(ent_size); 1021 size_t size; 1022 void *res = NULL; 1023 1024 static_assert(PAGE_SIZE % MLX5_UMR_MTT_ALIGNMENT == 0); 1025 1026 /* 1027 * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context just that the 1028 * allocation can't trigger any kind of reclaim. 1029 */ 1030 might_sleep(); 1031 1032 gfp_mask |= __GFP_ZERO; 1033 1034 /* 1035 * If the system already has a suitable high order page then just use 1036 * that, but don't try hard to create one. This max is about 1M, so a 1037 * free x86 huge page will satisfy it. 1038 */ 1039 size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align), 1040 MLX5_MAX_UMR_CHUNK); 1041 *nents = size / ent_size; 1042 res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, 1043 get_order(size)); 1044 if (res) 1045 return res; 1046 1047 if (size > MLX5_SPARE_UMR_CHUNK) { 1048 size = MLX5_SPARE_UMR_CHUNK; 1049 *nents = get_order(size) / ent_size; 1050 res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, 1051 get_order(size)); 1052 if (res) 1053 return res; 1054 } 1055 1056 *nents = PAGE_SIZE / ent_size; 1057 res = (void *)__get_free_page(gfp_mask); 1058 if (res) 1059 return res; 1060 1061 mutex_lock(&xlt_emergency_page_mutex); 1062 memset(xlt_emergency_page, 0, PAGE_SIZE); 1063 return xlt_emergency_page; 1064 } 1065 1066 static void mlx5_ib_free_xlt(void *xlt, size_t length) 1067 { 1068 if (xlt == xlt_emergency_page) { 1069 mutex_unlock(&xlt_emergency_page_mutex); 1070 return; 1071 } 1072 1073 free_pages((unsigned long)xlt, get_order(length)); 1074 } 1075 1076 /* 1077 * Create a MLX5_IB_SEND_UMR_UPDATE_XLT work request and XLT buffer ready for 1078 * submission. 1079 */ 1080 static void *mlx5_ib_create_xlt_wr(struct mlx5_ib_mr *mr, 1081 struct mlx5_umr_wr *wr, struct ib_sge *sg, 1082 size_t nents, size_t ent_size, 1083 unsigned int flags) 1084 { 1085 struct mlx5_ib_dev *dev = mr_to_mdev(mr); 1086 struct device *ddev = &dev->mdev->pdev->dev; 1087 dma_addr_t dma; 1088 void *xlt; 1089 1090 xlt = mlx5_ib_alloc_xlt(&nents, ent_size, 1091 flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC : 1092 GFP_KERNEL); 1093 sg->length = nents * ent_size; 1094 dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE); 1095 if (dma_mapping_error(ddev, dma)) { 1096 mlx5_ib_err(dev, "unable to map DMA during XLT update.\n"); 1097 mlx5_ib_free_xlt(xlt, sg->length); 1098 return NULL; 1099 } 1100 sg->addr = dma; 1101 sg->lkey = dev->umrc.pd->local_dma_lkey; 1102 1103 memset(wr, 0, sizeof(*wr)); 1104 wr->wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT; 1105 if (!(flags & MLX5_IB_UPD_XLT_ENABLE)) 1106 wr->wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE; 1107 wr->wr.sg_list = sg; 1108 wr->wr.num_sge = 1; 1109 wr->wr.opcode = MLX5_IB_WR_UMR; 1110 wr->pd = mr->ibmr.pd; 1111 wr->mkey = mr->mmkey.key; 1112 wr->length = mr->mmkey.size; 1113 wr->virt_addr = mr->mmkey.iova; 1114 wr->access_flags = mr->access_flags; 1115 wr->page_shift = mr->page_shift; 1116 wr->xlt_size = sg->length; 1117 return xlt; 1118 } 1119 1120 static void mlx5_ib_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt, 1121 struct ib_sge *sg) 1122 { 1123 struct device *ddev = &dev->mdev->pdev->dev; 1124 1125 dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE); 1126 mlx5_ib_free_xlt(xlt, sg->length); 1127 } 1128 1129 static unsigned int xlt_wr_final_send_flags(unsigned int flags) 1130 { 1131 unsigned int res = 0; 1132 1133 if (flags & MLX5_IB_UPD_XLT_ENABLE) 1134 res |= MLX5_IB_SEND_UMR_ENABLE_MR | 1135 MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS | 1136 MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; 1137 if (flags & MLX5_IB_UPD_XLT_PD || flags & MLX5_IB_UPD_XLT_ACCESS) 1138 res |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; 1139 if (flags & MLX5_IB_UPD_XLT_ADDR) 1140 res |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; 1141 return res; 1142 } 1143 1144 int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, 1145 int page_shift, int flags) 1146 { 1147 struct mlx5_ib_dev *dev = mr_to_mdev(mr); 1148 struct device *ddev = &dev->mdev->pdev->dev; 1149 void *xlt; 1150 struct mlx5_umr_wr wr; 1151 struct ib_sge sg; 1152 int err = 0; 1153 int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT) 1154 ? sizeof(struct mlx5_klm) 1155 : sizeof(struct mlx5_mtt); 1156 const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size; 1157 const int page_mask = page_align - 1; 1158 size_t pages_mapped = 0; 1159 size_t pages_to_map = 0; 1160 size_t pages_iter; 1161 size_t size_to_map = 0; 1162 size_t orig_sg_length; 1163 1164 if ((flags & MLX5_IB_UPD_XLT_INDIRECT) && 1165 !umr_can_use_indirect_mkey(dev)) 1166 return -EPERM; 1167 1168 if (WARN_ON(!mr->umem->is_odp)) 1169 return -EINVAL; 1170 1171 /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, 1172 * so we need to align the offset and length accordingly 1173 */ 1174 if (idx & page_mask) { 1175 npages += idx & page_mask; 1176 idx &= ~page_mask; 1177 } 1178 pages_to_map = ALIGN(npages, page_align); 1179 1180 xlt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, npages, desc_size, flags); 1181 if (!xlt) 1182 return -ENOMEM; 1183 pages_iter = sg.length / desc_size; 1184 orig_sg_length = sg.length; 1185 1186 if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) { 1187 struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); 1188 size_t max_pages = ib_umem_odp_num_pages(odp) - idx; 1189 1190 pages_to_map = min_t(size_t, pages_to_map, max_pages); 1191 } 1192 1193 wr.page_shift = page_shift; 1194 1195 for (pages_mapped = 0; 1196 pages_mapped < pages_to_map && !err; 1197 pages_mapped += pages_iter, idx += pages_iter) { 1198 npages = min_t(int, pages_iter, pages_to_map - pages_mapped); 1199 size_to_map = npages * desc_size; 1200 dma_sync_single_for_cpu(ddev, sg.addr, sg.length, 1201 DMA_TO_DEVICE); 1202 mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags); 1203 dma_sync_single_for_device(ddev, sg.addr, sg.length, 1204 DMA_TO_DEVICE); 1205 1206 sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT); 1207 1208 if (pages_mapped + pages_iter >= pages_to_map) 1209 wr.wr.send_flags |= xlt_wr_final_send_flags(flags); 1210 1211 wr.offset = idx * desc_size; 1212 wr.xlt_size = sg.length; 1213 1214 err = mlx5_ib_post_send_wait(dev, &wr); 1215 } 1216 sg.length = orig_sg_length; 1217 mlx5_ib_unmap_free_xlt(dev, xlt, &sg); 1218 return err; 1219 } 1220 1221 /* 1222 * Send the DMA list to the HW for a normal MR using UMR. 1223 * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP 1224 * flag may be used. 1225 */ 1226 int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) 1227 { 1228 struct mlx5_ib_dev *dev = mr_to_mdev(mr); 1229 struct device *ddev = &dev->mdev->pdev->dev; 1230 struct ib_block_iter biter; 1231 struct mlx5_mtt *cur_mtt; 1232 struct mlx5_umr_wr wr; 1233 size_t orig_sg_length; 1234 struct mlx5_mtt *mtt; 1235 size_t final_size; 1236 struct ib_sge sg; 1237 int err = 0; 1238 1239 if (WARN_ON(mr->umem->is_odp)) 1240 return -EINVAL; 1241 1242 mtt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, 1243 ib_umem_num_dma_blocks(mr->umem, 1244 1 << mr->page_shift), 1245 sizeof(*mtt), flags); 1246 if (!mtt) 1247 return -ENOMEM; 1248 orig_sg_length = sg.length; 1249 1250 cur_mtt = mtt; 1251 rdma_for_each_block (mr->umem->sg_head.sgl, &biter, mr->umem->nmap, 1252 BIT(mr->page_shift)) { 1253 if (cur_mtt == (void *)mtt + sg.length) { 1254 dma_sync_single_for_device(ddev, sg.addr, sg.length, 1255 DMA_TO_DEVICE); 1256 err = mlx5_ib_post_send_wait(dev, &wr); 1257 if (err) 1258 goto err; 1259 dma_sync_single_for_cpu(ddev, sg.addr, sg.length, 1260 DMA_TO_DEVICE); 1261 wr.offset += sg.length; 1262 cur_mtt = mtt; 1263 } 1264 1265 cur_mtt->ptag = 1266 cpu_to_be64(rdma_block_iter_dma_address(&biter) | 1267 MLX5_IB_MTT_PRESENT); 1268 1269 if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) 1270 cur_mtt->ptag = 0; 1271 1272 cur_mtt++; 1273 } 1274 1275 final_size = (void *)cur_mtt - (void *)mtt; 1276 sg.length = ALIGN(final_size, MLX5_UMR_MTT_ALIGNMENT); 1277 memset(cur_mtt, 0, sg.length - final_size); 1278 wr.wr.send_flags |= xlt_wr_final_send_flags(flags); 1279 wr.xlt_size = sg.length; 1280 1281 dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); 1282 err = mlx5_ib_post_send_wait(dev, &wr); 1283 1284 err: 1285 sg.length = orig_sg_length; 1286 mlx5_ib_unmap_free_xlt(dev, mtt, &sg); 1287 return err; 1288 } 1289 1290 /* 1291 * If ibmr is NULL it will be allocated by reg_create. 1292 * Else, the given ibmr will be used. 1293 */ 1294 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, 1295 u64 iova, int access_flags, 1296 unsigned int page_size, bool populate) 1297 { 1298 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1299 struct mlx5_ib_mr *mr; 1300 __be64 *pas; 1301 void *mkc; 1302 int inlen; 1303 u32 *in; 1304 int err; 1305 bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); 1306 1307 if (!page_size) 1308 return ERR_PTR(-EINVAL); 1309 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1310 if (!mr) 1311 return ERR_PTR(-ENOMEM); 1312 1313 mr->ibmr.pd = pd; 1314 mr->access_flags = access_flags; 1315 mr->page_shift = order_base_2(page_size); 1316 1317 inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1318 if (populate) 1319 inlen += sizeof(*pas) * 1320 roundup(ib_umem_num_dma_blocks(umem, page_size), 2); 1321 in = kvzalloc(inlen, GFP_KERNEL); 1322 if (!in) { 1323 err = -ENOMEM; 1324 goto err_1; 1325 } 1326 pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); 1327 if (populate) { 1328 if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) { 1329 err = -EINVAL; 1330 goto err_2; 1331 } 1332 mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas, 1333 pg_cap ? MLX5_IB_MTT_PRESENT : 0); 1334 } 1335 1336 /* The pg_access bit allows setting the access flags 1337 * in the page list submitted with the command. */ 1338 MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap)); 1339 1340 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1341 set_mkc_access_pd_addr_fields(mkc, access_flags, iova, 1342 populate ? pd : dev->umrc.pd); 1343 MLX5_SET(mkc, mkc, free, !populate); 1344 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); 1345 MLX5_SET(mkc, mkc, umr_en, 1); 1346 1347 MLX5_SET64(mkc, mkc, len, umem->length); 1348 MLX5_SET(mkc, mkc, bsf_octword_size, 0); 1349 MLX5_SET(mkc, mkc, translations_octword_size, 1350 get_octo_len(iova, umem->length, mr->page_shift)); 1351 MLX5_SET(mkc, mkc, log_page_size, mr->page_shift); 1352 if (populate) { 1353 MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 1354 get_octo_len(iova, umem->length, mr->page_shift)); 1355 } 1356 1357 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 1358 if (err) { 1359 mlx5_ib_warn(dev, "create mkey failed\n"); 1360 goto err_2; 1361 } 1362 mr->mmkey.type = MLX5_MKEY_MR; 1363 mr->desc_size = sizeof(struct mlx5_mtt); 1364 mr->umem = umem; 1365 set_mr_fields(dev, mr, umem->length, access_flags); 1366 kvfree(in); 1367 1368 mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key); 1369 1370 return mr; 1371 1372 err_2: 1373 kvfree(in); 1374 err_1: 1375 kfree(mr); 1376 return ERR_PTR(err); 1377 } 1378 1379 static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr, 1380 u64 length, int acc, int mode) 1381 { 1382 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1383 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1384 struct mlx5_ib_mr *mr; 1385 void *mkc; 1386 u32 *in; 1387 int err; 1388 1389 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1390 if (!mr) 1391 return ERR_PTR(-ENOMEM); 1392 1393 in = kzalloc(inlen, GFP_KERNEL); 1394 if (!in) { 1395 err = -ENOMEM; 1396 goto err_free; 1397 } 1398 1399 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1400 1401 MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3); 1402 MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7); 1403 MLX5_SET64(mkc, mkc, len, length); 1404 set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd); 1405 1406 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 1407 if (err) 1408 goto err_in; 1409 1410 kfree(in); 1411 1412 set_mr_fields(dev, mr, length, acc); 1413 1414 return &mr->ibmr; 1415 1416 err_in: 1417 kfree(in); 1418 1419 err_free: 1420 kfree(mr); 1421 1422 return ERR_PTR(err); 1423 } 1424 1425 int mlx5_ib_advise_mr(struct ib_pd *pd, 1426 enum ib_uverbs_advise_mr_advice advice, 1427 u32 flags, 1428 struct ib_sge *sg_list, 1429 u32 num_sge, 1430 struct uverbs_attr_bundle *attrs) 1431 { 1432 if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH && 1433 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE && 1434 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT) 1435 return -EOPNOTSUPP; 1436 1437 return mlx5_ib_advise_mr_prefetch(pd, advice, flags, 1438 sg_list, num_sge); 1439 } 1440 1441 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, 1442 struct ib_dm_mr_attr *attr, 1443 struct uverbs_attr_bundle *attrs) 1444 { 1445 struct mlx5_ib_dm *mdm = to_mdm(dm); 1446 struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev; 1447 u64 start_addr = mdm->dev_addr + attr->offset; 1448 int mode; 1449 1450 switch (mdm->type) { 1451 case MLX5_IB_UAPI_DM_TYPE_MEMIC: 1452 if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS) 1453 return ERR_PTR(-EINVAL); 1454 1455 mode = MLX5_MKC_ACCESS_MODE_MEMIC; 1456 start_addr -= pci_resource_start(dev->pdev, 0); 1457 break; 1458 case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM: 1459 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM: 1460 if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS) 1461 return ERR_PTR(-EINVAL); 1462 1463 mode = MLX5_MKC_ACCESS_MODE_SW_ICM; 1464 break; 1465 default: 1466 return ERR_PTR(-EINVAL); 1467 } 1468 1469 return mlx5_ib_get_dm_mr(pd, start_addr, attr->length, 1470 attr->access_flags, mode); 1471 } 1472 1473 static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, 1474 u64 iova, int access_flags) 1475 { 1476 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1477 struct mlx5_ib_mr *mr = NULL; 1478 bool xlt_with_umr; 1479 int err; 1480 1481 xlt_with_umr = mlx5_ib_can_load_pas_with_umr(dev, umem->length); 1482 if (xlt_with_umr) { 1483 mr = alloc_cacheable_mr(pd, umem, iova, access_flags); 1484 } else { 1485 unsigned int page_size = mlx5_umem_find_best_pgsz( 1486 umem, mkc, log_page_size, 0, iova); 1487 1488 mutex_lock(&dev->slow_path_mutex); 1489 mr = reg_create(pd, umem, iova, access_flags, page_size, true); 1490 mutex_unlock(&dev->slow_path_mutex); 1491 } 1492 if (IS_ERR(mr)) { 1493 ib_umem_release(umem); 1494 return ERR_CAST(mr); 1495 } 1496 1497 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); 1498 1499 atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages); 1500 1501 if (xlt_with_umr) { 1502 /* 1503 * If the MR was created with reg_create then it will be 1504 * configured properly but left disabled. It is safe to go ahead 1505 * and configure it again via UMR while enabling it. 1506 */ 1507 err = mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE); 1508 if (err) { 1509 dereg_mr(dev, mr); 1510 return ERR_PTR(err); 1511 } 1512 } 1513 return &mr->ibmr; 1514 } 1515 1516 static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, 1517 u64 iova, int access_flags, 1518 struct ib_udata *udata) 1519 { 1520 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1521 struct ib_umem_odp *odp; 1522 struct mlx5_ib_mr *mr; 1523 int err; 1524 1525 if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) 1526 return ERR_PTR(-EOPNOTSUPP); 1527 1528 if (!start && length == U64_MAX) { 1529 if (iova != 0) 1530 return ERR_PTR(-EINVAL); 1531 if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) 1532 return ERR_PTR(-EINVAL); 1533 1534 mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), udata, access_flags); 1535 if (IS_ERR(mr)) 1536 return ERR_CAST(mr); 1537 return &mr->ibmr; 1538 } 1539 1540 /* ODP requires xlt update via umr to work. */ 1541 if (!mlx5_ib_can_load_pas_with_umr(dev, length)) 1542 return ERR_PTR(-EINVAL); 1543 1544 odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags, 1545 &mlx5_mn_ops); 1546 if (IS_ERR(odp)) 1547 return ERR_CAST(odp); 1548 1549 mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags); 1550 if (IS_ERR(mr)) { 1551 ib_umem_release(&odp->umem); 1552 return ERR_CAST(mr); 1553 } 1554 1555 odp->private = mr; 1556 err = mlx5r_store_odp_mkey(dev, &mr->mmkey); 1557 if (err) 1558 goto err_dereg_mr; 1559 1560 err = mlx5_ib_init_odp_mr(mr); 1561 if (err) 1562 goto err_dereg_mr; 1563 return &mr->ibmr; 1564 1565 err_dereg_mr: 1566 dereg_mr(dev, mr); 1567 return ERR_PTR(err); 1568 } 1569 1570 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, 1571 u64 iova, int access_flags, 1572 struct ib_udata *udata) 1573 { 1574 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1575 struct ib_umem *umem; 1576 1577 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) 1578 return ERR_PTR(-EOPNOTSUPP); 1579 1580 mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", 1581 start, iova, length, access_flags); 1582 1583 if (access_flags & IB_ACCESS_ON_DEMAND) 1584 return create_user_odp_mr(pd, start, length, iova, access_flags, 1585 udata); 1586 umem = ib_umem_get(&dev->ib_dev, start, length, access_flags); 1587 if (IS_ERR(umem)) 1588 return ERR_CAST(umem); 1589 return create_real_mr(pd, umem, iova, access_flags); 1590 } 1591 1592 static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) 1593 { 1594 struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; 1595 struct mlx5_ib_mr *mr = umem_dmabuf->private; 1596 1597 dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); 1598 1599 if (!umem_dmabuf->sgt) 1600 return; 1601 1602 mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP); 1603 ib_umem_dmabuf_unmap_pages(umem_dmabuf); 1604 } 1605 1606 static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { 1607 .allow_peer2peer = 1, 1608 .move_notify = mlx5_ib_dmabuf_invalidate_cb, 1609 }; 1610 1611 struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, 1612 u64 length, u64 virt_addr, 1613 int fd, int access_flags, 1614 struct ib_udata *udata) 1615 { 1616 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1617 struct mlx5_ib_mr *mr = NULL; 1618 struct ib_umem_dmabuf *umem_dmabuf; 1619 int err; 1620 1621 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || 1622 !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) 1623 return ERR_PTR(-EOPNOTSUPP); 1624 1625 mlx5_ib_dbg(dev, 1626 "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n", 1627 offset, virt_addr, length, fd, access_flags); 1628 1629 /* dmabuf requires xlt update via umr to work. */ 1630 if (!mlx5_ib_can_load_pas_with_umr(dev, length)) 1631 return ERR_PTR(-EINVAL); 1632 1633 umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, 1634 access_flags, 1635 &mlx5_ib_dmabuf_attach_ops); 1636 if (IS_ERR(umem_dmabuf)) { 1637 mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n", 1638 PTR_ERR(umem_dmabuf)); 1639 return ERR_CAST(umem_dmabuf); 1640 } 1641 1642 mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, 1643 access_flags); 1644 if (IS_ERR(mr)) { 1645 ib_umem_release(&umem_dmabuf->umem); 1646 return ERR_CAST(mr); 1647 } 1648 1649 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); 1650 1651 atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages); 1652 umem_dmabuf->private = mr; 1653 err = mlx5r_store_odp_mkey(dev, &mr->mmkey); 1654 if (err) 1655 goto err_dereg_mr; 1656 1657 err = mlx5_ib_init_dmabuf_mr(mr); 1658 if (err) 1659 goto err_dereg_mr; 1660 return &mr->ibmr; 1661 1662 err_dereg_mr: 1663 dereg_mr(dev, mr); 1664 return ERR_PTR(err); 1665 } 1666 1667 /** 1668 * mlx5_mr_cache_invalidate - Fence all DMA on the MR 1669 * @mr: The MR to fence 1670 * 1671 * Upon return the NIC will not be doing any DMA to the pages under the MR, 1672 * and any DMA inprogress will be completed. Failure of this function 1673 * indicates the HW has failed catastrophically. 1674 */ 1675 int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr) 1676 { 1677 struct mlx5_umr_wr umrwr = {}; 1678 1679 if (mr_to_mdev(mr)->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) 1680 return 0; 1681 1682 umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR | 1683 MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; 1684 umrwr.wr.opcode = MLX5_IB_WR_UMR; 1685 umrwr.pd = mr_to_mdev(mr)->umrc.pd; 1686 umrwr.mkey = mr->mmkey.key; 1687 umrwr.ignore_free_state = 1; 1688 1689 return mlx5_ib_post_send_wait(mr_to_mdev(mr), &umrwr); 1690 } 1691 1692 /* 1693 * True if the change in access flags can be done via UMR, only some access 1694 * flags can be updated. 1695 */ 1696 static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev, 1697 unsigned int current_access_flags, 1698 unsigned int target_access_flags) 1699 { 1700 unsigned int diffs = current_access_flags ^ target_access_flags; 1701 1702 if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | 1703 IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING)) 1704 return false; 1705 return mlx5_ib_can_reconfig_with_umr(dev, current_access_flags, 1706 target_access_flags); 1707 } 1708 1709 static int umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd, 1710 int access_flags) 1711 { 1712 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1713 struct mlx5_umr_wr umrwr = { 1714 .wr = { 1715 .send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE | 1716 MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS, 1717 .opcode = MLX5_IB_WR_UMR, 1718 }, 1719 .mkey = mr->mmkey.key, 1720 .pd = pd, 1721 .access_flags = access_flags, 1722 }; 1723 int err; 1724 1725 err = mlx5_ib_post_send_wait(dev, &umrwr); 1726 if (err) 1727 return err; 1728 1729 mr->access_flags = access_flags; 1730 mr->mmkey.pd = to_mpd(pd)->pdn; 1731 return 0; 1732 } 1733 1734 static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, 1735 struct ib_umem *new_umem, 1736 int new_access_flags, u64 iova, 1737 unsigned long *page_size) 1738 { 1739 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1740 1741 /* We only track the allocated sizes of MRs from the cache */ 1742 if (!mr->cache_ent) 1743 return false; 1744 if (!mlx5_ib_can_load_pas_with_umr(dev, new_umem->length)) 1745 return false; 1746 1747 *page_size = 1748 mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova); 1749 if (WARN_ON(!*page_size)) 1750 return false; 1751 return (1ULL << mr->cache_ent->order) >= 1752 ib_umem_num_dma_blocks(new_umem, *page_size); 1753 } 1754 1755 static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd, 1756 int access_flags, int flags, struct ib_umem *new_umem, 1757 u64 iova, unsigned long page_size) 1758 { 1759 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1760 int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE; 1761 struct ib_umem *old_umem = mr->umem; 1762 int err; 1763 1764 /* 1765 * To keep everything simple the MR is revoked before we start to mess 1766 * with it. This ensure the change is atomic relative to any use of the 1767 * MR. 1768 */ 1769 err = mlx5_mr_cache_invalidate(mr); 1770 if (err) 1771 return err; 1772 1773 if (flags & IB_MR_REREG_PD) { 1774 mr->ibmr.pd = pd; 1775 mr->mmkey.pd = to_mpd(pd)->pdn; 1776 upd_flags |= MLX5_IB_UPD_XLT_PD; 1777 } 1778 if (flags & IB_MR_REREG_ACCESS) { 1779 mr->access_flags = access_flags; 1780 upd_flags |= MLX5_IB_UPD_XLT_ACCESS; 1781 } 1782 1783 mr->ibmr.length = new_umem->length; 1784 mr->mmkey.iova = iova; 1785 mr->mmkey.size = new_umem->length; 1786 mr->page_shift = order_base_2(page_size); 1787 mr->umem = new_umem; 1788 err = mlx5_ib_update_mr_pas(mr, upd_flags); 1789 if (err) { 1790 /* 1791 * The MR is revoked at this point so there is no issue to free 1792 * new_umem. 1793 */ 1794 mr->umem = old_umem; 1795 return err; 1796 } 1797 1798 atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages); 1799 ib_umem_release(old_umem); 1800 atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages); 1801 return 0; 1802 } 1803 1804 struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, 1805 u64 length, u64 iova, int new_access_flags, 1806 struct ib_pd *new_pd, 1807 struct ib_udata *udata) 1808 { 1809 struct mlx5_ib_dev *dev = to_mdev(ib_mr->device); 1810 struct mlx5_ib_mr *mr = to_mmr(ib_mr); 1811 int err; 1812 1813 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) 1814 return ERR_PTR(-EOPNOTSUPP); 1815 1816 mlx5_ib_dbg( 1817 dev, 1818 "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", 1819 start, iova, length, new_access_flags); 1820 1821 if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) 1822 return ERR_PTR(-EOPNOTSUPP); 1823 1824 if (!(flags & IB_MR_REREG_ACCESS)) 1825 new_access_flags = mr->access_flags; 1826 if (!(flags & IB_MR_REREG_PD)) 1827 new_pd = ib_mr->pd; 1828 1829 if (!(flags & IB_MR_REREG_TRANS)) { 1830 struct ib_umem *umem; 1831 1832 /* Fast path for PD/access change */ 1833 if (can_use_umr_rereg_access(dev, mr->access_flags, 1834 new_access_flags)) { 1835 err = umr_rereg_pd_access(mr, new_pd, new_access_flags); 1836 if (err) 1837 return ERR_PTR(err); 1838 return NULL; 1839 } 1840 /* DM or ODP MR's don't have a normal umem so we can't re-use it */ 1841 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr)) 1842 goto recreate; 1843 1844 /* 1845 * Only one active MR can refer to a umem at one time, revoke 1846 * the old MR before assigning the umem to the new one. 1847 */ 1848 err = mlx5_mr_cache_invalidate(mr); 1849 if (err) 1850 return ERR_PTR(err); 1851 umem = mr->umem; 1852 mr->umem = NULL; 1853 atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages); 1854 1855 return create_real_mr(new_pd, umem, mr->mmkey.iova, 1856 new_access_flags); 1857 } 1858 1859 /* 1860 * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does 1861 * but the logic around releasing the umem is different 1862 */ 1863 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr)) 1864 goto recreate; 1865 1866 if (!(new_access_flags & IB_ACCESS_ON_DEMAND) && 1867 can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) { 1868 struct ib_umem *new_umem; 1869 unsigned long page_size; 1870 1871 new_umem = ib_umem_get(&dev->ib_dev, start, length, 1872 new_access_flags); 1873 if (IS_ERR(new_umem)) 1874 return ERR_CAST(new_umem); 1875 1876 /* Fast path for PAS change */ 1877 if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova, 1878 &page_size)) { 1879 err = umr_rereg_pas(mr, new_pd, new_access_flags, flags, 1880 new_umem, iova, page_size); 1881 if (err) { 1882 ib_umem_release(new_umem); 1883 return ERR_PTR(err); 1884 } 1885 return NULL; 1886 } 1887 return create_real_mr(new_pd, new_umem, iova, new_access_flags); 1888 } 1889 1890 /* 1891 * Everything else has no state we can preserve, just create a new MR 1892 * from scratch 1893 */ 1894 recreate: 1895 return mlx5_ib_reg_user_mr(new_pd, start, length, iova, 1896 new_access_flags, udata); 1897 } 1898 1899 static int 1900 mlx5_alloc_priv_descs(struct ib_device *device, 1901 struct mlx5_ib_mr *mr, 1902 int ndescs, 1903 int desc_size) 1904 { 1905 struct mlx5_ib_dev *dev = to_mdev(device); 1906 struct device *ddev = &dev->mdev->pdev->dev; 1907 int size = ndescs * desc_size; 1908 int add_size; 1909 int ret; 1910 1911 add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0); 1912 1913 mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL); 1914 if (!mr->descs_alloc) 1915 return -ENOMEM; 1916 1917 mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN); 1918 1919 mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE); 1920 if (dma_mapping_error(ddev, mr->desc_map)) { 1921 ret = -ENOMEM; 1922 goto err; 1923 } 1924 1925 return 0; 1926 err: 1927 kfree(mr->descs_alloc); 1928 1929 return ret; 1930 } 1931 1932 static void 1933 mlx5_free_priv_descs(struct mlx5_ib_mr *mr) 1934 { 1935 if (mr->descs) { 1936 struct ib_device *device = mr->ibmr.device; 1937 int size = mr->max_descs * mr->desc_size; 1938 struct mlx5_ib_dev *dev = to_mdev(device); 1939 1940 dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size, 1941 DMA_TO_DEVICE); 1942 kfree(mr->descs_alloc); 1943 mr->descs = NULL; 1944 } 1945 } 1946 1947 static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) 1948 { 1949 if (mr->sig) { 1950 if (mlx5_core_destroy_psv(dev->mdev, 1951 mr->sig->psv_memory.psv_idx)) 1952 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", 1953 mr->sig->psv_memory.psv_idx); 1954 if (mlx5_core_destroy_psv(dev->mdev, 1955 mr->sig->psv_wire.psv_idx)) 1956 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", 1957 mr->sig->psv_wire.psv_idx); 1958 xa_erase(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key)); 1959 kfree(mr->sig); 1960 mr->sig = NULL; 1961 } 1962 1963 if (!mr->cache_ent) { 1964 destroy_mkey(dev, mr); 1965 mlx5_free_priv_descs(mr); 1966 } 1967 } 1968 1969 static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) 1970 { 1971 struct ib_umem *umem = mr->umem; 1972 1973 /* Stop all DMA */ 1974 if (is_odp_mr(mr)) 1975 mlx5_ib_fence_odp_mr(mr); 1976 else if (is_dmabuf_mr(mr)) 1977 mlx5_ib_fence_dmabuf_mr(mr); 1978 else 1979 clean_mr(dev, mr); 1980 1981 if (umem) { 1982 if (!is_odp_mr(mr)) 1983 atomic_sub(ib_umem_num_pages(umem), 1984 &dev->mdev->priv.reg_pages); 1985 ib_umem_release(umem); 1986 } 1987 1988 if (mr->cache_ent) 1989 mlx5_mr_cache_free(dev, mr); 1990 else 1991 kfree(mr); 1992 } 1993 1994 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) 1995 { 1996 struct mlx5_ib_mr *mmr = to_mmr(ibmr); 1997 1998 if (ibmr->type == IB_MR_TYPE_INTEGRITY) { 1999 dereg_mr(to_mdev(mmr->mtt_mr->ibmr.device), mmr->mtt_mr); 2000 dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr); 2001 } 2002 2003 if (is_odp_mr(mmr) && to_ib_umem_odp(mmr->umem)->is_implicit_odp) { 2004 mlx5_ib_free_implicit_mr(mmr); 2005 return 0; 2006 } 2007 2008 dereg_mr(to_mdev(ibmr->device), mmr); 2009 2010 return 0; 2011 } 2012 2013 static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, 2014 int access_mode, int page_shift) 2015 { 2016 void *mkc; 2017 2018 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 2019 2020 /* This is only used from the kernel, so setting the PD is OK. */ 2021 set_mkc_access_pd_addr_fields(mkc, 0, 0, pd); 2022 MLX5_SET(mkc, mkc, free, 1); 2023 MLX5_SET(mkc, mkc, translations_octword_size, ndescs); 2024 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3); 2025 MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); 2026 MLX5_SET(mkc, mkc, umr_en, 1); 2027 MLX5_SET(mkc, mkc, log_page_size, page_shift); 2028 } 2029 2030 static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 2031 int ndescs, int desc_size, int page_shift, 2032 int access_mode, u32 *in, int inlen) 2033 { 2034 struct mlx5_ib_dev *dev = to_mdev(pd->device); 2035 int err; 2036 2037 mr->access_mode = access_mode; 2038 mr->desc_size = desc_size; 2039 mr->max_descs = ndescs; 2040 2041 err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size); 2042 if (err) 2043 return err; 2044 2045 mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift); 2046 2047 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 2048 if (err) 2049 goto err_free_descs; 2050 2051 mr->mmkey.type = MLX5_MKEY_MR; 2052 mr->ibmr.lkey = mr->mmkey.key; 2053 mr->ibmr.rkey = mr->mmkey.key; 2054 2055 return 0; 2056 2057 err_free_descs: 2058 mlx5_free_priv_descs(mr); 2059 return err; 2060 } 2061 2062 static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd, 2063 u32 max_num_sg, u32 max_num_meta_sg, 2064 int desc_size, int access_mode) 2065 { 2066 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 2067 int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4); 2068 int page_shift = 0; 2069 struct mlx5_ib_mr *mr; 2070 u32 *in; 2071 int err; 2072 2073 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 2074 if (!mr) 2075 return ERR_PTR(-ENOMEM); 2076 2077 mr->ibmr.pd = pd; 2078 mr->ibmr.device = pd->device; 2079 2080 in = kzalloc(inlen, GFP_KERNEL); 2081 if (!in) { 2082 err = -ENOMEM; 2083 goto err_free; 2084 } 2085 2086 if (access_mode == MLX5_MKC_ACCESS_MODE_MTT) 2087 page_shift = PAGE_SHIFT; 2088 2089 err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift, 2090 access_mode, in, inlen); 2091 if (err) 2092 goto err_free_in; 2093 2094 mr->umem = NULL; 2095 kfree(in); 2096 2097 return mr; 2098 2099 err_free_in: 2100 kfree(in); 2101 err_free: 2102 kfree(mr); 2103 return ERR_PTR(err); 2104 } 2105 2106 static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 2107 int ndescs, u32 *in, int inlen) 2108 { 2109 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt), 2110 PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in, 2111 inlen); 2112 } 2113 2114 static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 2115 int ndescs, u32 *in, int inlen) 2116 { 2117 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm), 2118 0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen); 2119 } 2120 2121 static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 2122 int max_num_sg, int max_num_meta_sg, 2123 u32 *in, int inlen) 2124 { 2125 struct mlx5_ib_dev *dev = to_mdev(pd->device); 2126 u32 psv_index[2]; 2127 void *mkc; 2128 int err; 2129 2130 mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); 2131 if (!mr->sig) 2132 return -ENOMEM; 2133 2134 /* create mem & wire PSVs */ 2135 err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index); 2136 if (err) 2137 goto err_free_sig; 2138 2139 mr->sig->psv_memory.psv_idx = psv_index[0]; 2140 mr->sig->psv_wire.psv_idx = psv_index[1]; 2141 2142 mr->sig->sig_status_checked = true; 2143 mr->sig->sig_err_exists = false; 2144 /* Next UMR, Arm SIGERR */ 2145 ++mr->sig->sigerr_count; 2146 mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg, 2147 sizeof(struct mlx5_klm), 2148 MLX5_MKC_ACCESS_MODE_KLMS); 2149 if (IS_ERR(mr->klm_mr)) { 2150 err = PTR_ERR(mr->klm_mr); 2151 goto err_destroy_psv; 2152 } 2153 mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg, 2154 sizeof(struct mlx5_mtt), 2155 MLX5_MKC_ACCESS_MODE_MTT); 2156 if (IS_ERR(mr->mtt_mr)) { 2157 err = PTR_ERR(mr->mtt_mr); 2158 goto err_free_klm_mr; 2159 } 2160 2161 /* Set bsf descriptors for mkey */ 2162 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 2163 MLX5_SET(mkc, mkc, bsf_en, 1); 2164 MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE); 2165 2166 err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0, 2167 MLX5_MKC_ACCESS_MODE_KLMS, in, inlen); 2168 if (err) 2169 goto err_free_mtt_mr; 2170 2171 err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key), 2172 mr->sig, GFP_KERNEL)); 2173 if (err) 2174 goto err_free_descs; 2175 return 0; 2176 2177 err_free_descs: 2178 destroy_mkey(dev, mr); 2179 mlx5_free_priv_descs(mr); 2180 err_free_mtt_mr: 2181 dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), mr->mtt_mr); 2182 mr->mtt_mr = NULL; 2183 err_free_klm_mr: 2184 dereg_mr(to_mdev(mr->klm_mr->ibmr.device), mr->klm_mr); 2185 mr->klm_mr = NULL; 2186 err_destroy_psv: 2187 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx)) 2188 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", 2189 mr->sig->psv_memory.psv_idx); 2190 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx)) 2191 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", 2192 mr->sig->psv_wire.psv_idx); 2193 err_free_sig: 2194 kfree(mr->sig); 2195 2196 return err; 2197 } 2198 2199 static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd, 2200 enum ib_mr_type mr_type, u32 max_num_sg, 2201 u32 max_num_meta_sg) 2202 { 2203 struct mlx5_ib_dev *dev = to_mdev(pd->device); 2204 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 2205 int ndescs = ALIGN(max_num_sg, 4); 2206 struct mlx5_ib_mr *mr; 2207 u32 *in; 2208 int err; 2209 2210 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 2211 if (!mr) 2212 return ERR_PTR(-ENOMEM); 2213 2214 in = kzalloc(inlen, GFP_KERNEL); 2215 if (!in) { 2216 err = -ENOMEM; 2217 goto err_free; 2218 } 2219 2220 mr->ibmr.device = pd->device; 2221 mr->umem = NULL; 2222 2223 switch (mr_type) { 2224 case IB_MR_TYPE_MEM_REG: 2225 err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen); 2226 break; 2227 case IB_MR_TYPE_SG_GAPS: 2228 err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen); 2229 break; 2230 case IB_MR_TYPE_INTEGRITY: 2231 err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg, 2232 max_num_meta_sg, in, inlen); 2233 break; 2234 default: 2235 mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type); 2236 err = -EINVAL; 2237 } 2238 2239 if (err) 2240 goto err_free_in; 2241 2242 kfree(in); 2243 2244 return &mr->ibmr; 2245 2246 err_free_in: 2247 kfree(in); 2248 err_free: 2249 kfree(mr); 2250 return ERR_PTR(err); 2251 } 2252 2253 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, 2254 u32 max_num_sg) 2255 { 2256 return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0); 2257 } 2258 2259 struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd, 2260 u32 max_num_sg, u32 max_num_meta_sg) 2261 { 2262 return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg, 2263 max_num_meta_sg); 2264 } 2265 2266 int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata) 2267 { 2268 struct mlx5_ib_dev *dev = to_mdev(ibmw->device); 2269 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 2270 struct mlx5_ib_mw *mw = to_mmw(ibmw); 2271 u32 *in = NULL; 2272 void *mkc; 2273 int ndescs; 2274 int err; 2275 struct mlx5_ib_alloc_mw req = {}; 2276 struct { 2277 __u32 comp_mask; 2278 __u32 response_length; 2279 } resp = {}; 2280 2281 err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); 2282 if (err) 2283 return err; 2284 2285 if (req.comp_mask || req.reserved1 || req.reserved2) 2286 return -EOPNOTSUPP; 2287 2288 if (udata->inlen > sizeof(req) && 2289 !ib_is_udata_cleared(udata, sizeof(req), 2290 udata->inlen - sizeof(req))) 2291 return -EOPNOTSUPP; 2292 2293 ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4); 2294 2295 in = kzalloc(inlen, GFP_KERNEL); 2296 if (!in) { 2297 err = -ENOMEM; 2298 goto free; 2299 } 2300 2301 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 2302 2303 MLX5_SET(mkc, mkc, free, 1); 2304 MLX5_SET(mkc, mkc, translations_octword_size, ndescs); 2305 MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn); 2306 MLX5_SET(mkc, mkc, umr_en, 1); 2307 MLX5_SET(mkc, mkc, lr, 1); 2308 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS); 2309 MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2))); 2310 MLX5_SET(mkc, mkc, qpn, 0xffffff); 2311 2312 err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen); 2313 if (err) 2314 goto free; 2315 2316 mw->mmkey.type = MLX5_MKEY_MW; 2317 ibmw->rkey = mw->mmkey.key; 2318 mw->ndescs = ndescs; 2319 2320 resp.response_length = 2321 min(offsetofend(typeof(resp), response_length), udata->outlen); 2322 if (resp.response_length) { 2323 err = ib_copy_to_udata(udata, &resp, resp.response_length); 2324 if (err) 2325 goto free_mkey; 2326 } 2327 2328 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { 2329 err = mlx5r_store_odp_mkey(dev, &mw->mmkey); 2330 if (err) 2331 goto free_mkey; 2332 } 2333 2334 kfree(in); 2335 return 0; 2336 2337 free_mkey: 2338 mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey); 2339 free: 2340 kfree(in); 2341 return err; 2342 } 2343 2344 int mlx5_ib_dealloc_mw(struct ib_mw *mw) 2345 { 2346 struct mlx5_ib_dev *dev = to_mdev(mw->device); 2347 struct mlx5_ib_mw *mmw = to_mmw(mw); 2348 2349 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && 2350 xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key))) 2351 /* 2352 * pagefault_single_data_segment() may be accessing mmw 2353 * if the user bound an ODP MR to this MW. 2354 */ 2355 mlx5r_deref_wait_odp_mkey(&mmw->mmkey); 2356 2357 return mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey); 2358 } 2359 2360 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, 2361 struct ib_mr_status *mr_status) 2362 { 2363 struct mlx5_ib_mr *mmr = to_mmr(ibmr); 2364 int ret = 0; 2365 2366 if (check_mask & ~IB_MR_CHECK_SIG_STATUS) { 2367 pr_err("Invalid status check mask\n"); 2368 ret = -EINVAL; 2369 goto done; 2370 } 2371 2372 mr_status->fail_status = 0; 2373 if (check_mask & IB_MR_CHECK_SIG_STATUS) { 2374 if (!mmr->sig) { 2375 ret = -EINVAL; 2376 pr_err("signature status check requested on a non-signature enabled MR\n"); 2377 goto done; 2378 } 2379 2380 mmr->sig->sig_status_checked = true; 2381 if (!mmr->sig->sig_err_exists) 2382 goto done; 2383 2384 if (ibmr->lkey == mmr->sig->err_item.key) 2385 memcpy(&mr_status->sig_err, &mmr->sig->err_item, 2386 sizeof(mr_status->sig_err)); 2387 else { 2388 mr_status->sig_err.err_type = IB_SIG_BAD_GUARD; 2389 mr_status->sig_err.sig_err_offset = 0; 2390 mr_status->sig_err.key = mmr->sig->err_item.key; 2391 } 2392 2393 mmr->sig->sig_err_exists = false; 2394 mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS; 2395 } 2396 2397 done: 2398 return ret; 2399 } 2400 2401 static int 2402 mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, 2403 int data_sg_nents, unsigned int *data_sg_offset, 2404 struct scatterlist *meta_sg, int meta_sg_nents, 2405 unsigned int *meta_sg_offset) 2406 { 2407 struct mlx5_ib_mr *mr = to_mmr(ibmr); 2408 unsigned int sg_offset = 0; 2409 int n = 0; 2410 2411 mr->meta_length = 0; 2412 if (data_sg_nents == 1) { 2413 n++; 2414 mr->ndescs = 1; 2415 if (data_sg_offset) 2416 sg_offset = *data_sg_offset; 2417 mr->data_length = sg_dma_len(data_sg) - sg_offset; 2418 mr->data_iova = sg_dma_address(data_sg) + sg_offset; 2419 if (meta_sg_nents == 1) { 2420 n++; 2421 mr->meta_ndescs = 1; 2422 if (meta_sg_offset) 2423 sg_offset = *meta_sg_offset; 2424 else 2425 sg_offset = 0; 2426 mr->meta_length = sg_dma_len(meta_sg) - sg_offset; 2427 mr->pi_iova = sg_dma_address(meta_sg) + sg_offset; 2428 } 2429 ibmr->length = mr->data_length + mr->meta_length; 2430 } 2431 2432 return n; 2433 } 2434 2435 static int 2436 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, 2437 struct scatterlist *sgl, 2438 unsigned short sg_nents, 2439 unsigned int *sg_offset_p, 2440 struct scatterlist *meta_sgl, 2441 unsigned short meta_sg_nents, 2442 unsigned int *meta_sg_offset_p) 2443 { 2444 struct scatterlist *sg = sgl; 2445 struct mlx5_klm *klms = mr->descs; 2446 unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; 2447 u32 lkey = mr->ibmr.pd->local_dma_lkey; 2448 int i, j = 0; 2449 2450 mr->ibmr.iova = sg_dma_address(sg) + sg_offset; 2451 mr->ibmr.length = 0; 2452 2453 for_each_sg(sgl, sg, sg_nents, i) { 2454 if (unlikely(i >= mr->max_descs)) 2455 break; 2456 klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset); 2457 klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset); 2458 klms[i].key = cpu_to_be32(lkey); 2459 mr->ibmr.length += sg_dma_len(sg) - sg_offset; 2460 2461 sg_offset = 0; 2462 } 2463 2464 if (sg_offset_p) 2465 *sg_offset_p = sg_offset; 2466 2467 mr->ndescs = i; 2468 mr->data_length = mr->ibmr.length; 2469 2470 if (meta_sg_nents) { 2471 sg = meta_sgl; 2472 sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0; 2473 for_each_sg(meta_sgl, sg, meta_sg_nents, j) { 2474 if (unlikely(i + j >= mr->max_descs)) 2475 break; 2476 klms[i + j].va = cpu_to_be64(sg_dma_address(sg) + 2477 sg_offset); 2478 klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) - 2479 sg_offset); 2480 klms[i + j].key = cpu_to_be32(lkey); 2481 mr->ibmr.length += sg_dma_len(sg) - sg_offset; 2482 2483 sg_offset = 0; 2484 } 2485 if (meta_sg_offset_p) 2486 *meta_sg_offset_p = sg_offset; 2487 2488 mr->meta_ndescs = j; 2489 mr->meta_length = mr->ibmr.length - mr->data_length; 2490 } 2491 2492 return i + j; 2493 } 2494 2495 static int mlx5_set_page(struct ib_mr *ibmr, u64 addr) 2496 { 2497 struct mlx5_ib_mr *mr = to_mmr(ibmr); 2498 __be64 *descs; 2499 2500 if (unlikely(mr->ndescs == mr->max_descs)) 2501 return -ENOMEM; 2502 2503 descs = mr->descs; 2504 descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR); 2505 2506 return 0; 2507 } 2508 2509 static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr) 2510 { 2511 struct mlx5_ib_mr *mr = to_mmr(ibmr); 2512 __be64 *descs; 2513 2514 if (unlikely(mr->ndescs + mr->meta_ndescs == mr->max_descs)) 2515 return -ENOMEM; 2516 2517 descs = mr->descs; 2518 descs[mr->ndescs + mr->meta_ndescs++] = 2519 cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR); 2520 2521 return 0; 2522 } 2523 2524 static int 2525 mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, 2526 int data_sg_nents, unsigned int *data_sg_offset, 2527 struct scatterlist *meta_sg, int meta_sg_nents, 2528 unsigned int *meta_sg_offset) 2529 { 2530 struct mlx5_ib_mr *mr = to_mmr(ibmr); 2531 struct mlx5_ib_mr *pi_mr = mr->mtt_mr; 2532 int n; 2533 2534 pi_mr->ndescs = 0; 2535 pi_mr->meta_ndescs = 0; 2536 pi_mr->meta_length = 0; 2537 2538 ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map, 2539 pi_mr->desc_size * pi_mr->max_descs, 2540 DMA_TO_DEVICE); 2541 2542 pi_mr->ibmr.page_size = ibmr->page_size; 2543 n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset, 2544 mlx5_set_page); 2545 if (n != data_sg_nents) 2546 return n; 2547 2548 pi_mr->data_iova = pi_mr->ibmr.iova; 2549 pi_mr->data_length = pi_mr->ibmr.length; 2550 pi_mr->ibmr.length = pi_mr->data_length; 2551 ibmr->length = pi_mr->data_length; 2552 2553 if (meta_sg_nents) { 2554 u64 page_mask = ~((u64)ibmr->page_size - 1); 2555 u64 iova = pi_mr->data_iova; 2556 2557 n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents, 2558 meta_sg_offset, mlx5_set_page_pi); 2559 2560 pi_mr->meta_length = pi_mr->ibmr.length; 2561 /* 2562 * PI address for the HW is the offset of the metadata address 2563 * relative to the first data page address. 2564 * It equals to first data page address + size of data pages + 2565 * metadata offset at the first metadata page 2566 */ 2567 pi_mr->pi_iova = (iova & page_mask) + 2568 pi_mr->ndescs * ibmr->page_size + 2569 (pi_mr->ibmr.iova & ~page_mask); 2570 /* 2571 * In order to use one MTT MR for data and metadata, we register 2572 * also the gaps between the end of the data and the start of 2573 * the metadata (the sig MR will verify that the HW will access 2574 * to right addresses). This mapping is safe because we use 2575 * internal mkey for the registration. 2576 */ 2577 pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova; 2578 pi_mr->ibmr.iova = iova; 2579 ibmr->length += pi_mr->meta_length; 2580 } 2581 2582 ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map, 2583 pi_mr->desc_size * pi_mr->max_descs, 2584 DMA_TO_DEVICE); 2585 2586 return n; 2587 } 2588 2589 static int 2590 mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, 2591 int data_sg_nents, unsigned int *data_sg_offset, 2592 struct scatterlist *meta_sg, int meta_sg_nents, 2593 unsigned int *meta_sg_offset) 2594 { 2595 struct mlx5_ib_mr *mr = to_mmr(ibmr); 2596 struct mlx5_ib_mr *pi_mr = mr->klm_mr; 2597 int n; 2598 2599 pi_mr->ndescs = 0; 2600 pi_mr->meta_ndescs = 0; 2601 pi_mr->meta_length = 0; 2602 2603 ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map, 2604 pi_mr->desc_size * pi_mr->max_descs, 2605 DMA_TO_DEVICE); 2606 2607 n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset, 2608 meta_sg, meta_sg_nents, meta_sg_offset); 2609 2610 ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map, 2611 pi_mr->desc_size * pi_mr->max_descs, 2612 DMA_TO_DEVICE); 2613 2614 /* This is zero-based memory region */ 2615 pi_mr->data_iova = 0; 2616 pi_mr->ibmr.iova = 0; 2617 pi_mr->pi_iova = pi_mr->data_length; 2618 ibmr->length = pi_mr->ibmr.length; 2619 2620 return n; 2621 } 2622 2623 int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, 2624 int data_sg_nents, unsigned int *data_sg_offset, 2625 struct scatterlist *meta_sg, int meta_sg_nents, 2626 unsigned int *meta_sg_offset) 2627 { 2628 struct mlx5_ib_mr *mr = to_mmr(ibmr); 2629 struct mlx5_ib_mr *pi_mr = NULL; 2630 int n; 2631 2632 WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY); 2633 2634 mr->ndescs = 0; 2635 mr->data_length = 0; 2636 mr->data_iova = 0; 2637 mr->meta_ndescs = 0; 2638 mr->pi_iova = 0; 2639 /* 2640 * As a performance optimization, if possible, there is no need to 2641 * perform UMR operation to register the data/metadata buffers. 2642 * First try to map the sg lists to PA descriptors with local_dma_lkey. 2643 * Fallback to UMR only in case of a failure. 2644 */ 2645 n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents, 2646 data_sg_offset, meta_sg, meta_sg_nents, 2647 meta_sg_offset); 2648 if (n == data_sg_nents + meta_sg_nents) 2649 goto out; 2650 /* 2651 * As a performance optimization, if possible, there is no need to map 2652 * the sg lists to KLM descriptors. First try to map the sg lists to MTT 2653 * descriptors and fallback to KLM only in case of a failure. 2654 * It's more efficient for the HW to work with MTT descriptors 2655 * (especially in high load). 2656 * Use KLM (indirect access) only if it's mandatory. 2657 */ 2658 pi_mr = mr->mtt_mr; 2659 n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents, 2660 data_sg_offset, meta_sg, meta_sg_nents, 2661 meta_sg_offset); 2662 if (n == data_sg_nents + meta_sg_nents) 2663 goto out; 2664 2665 pi_mr = mr->klm_mr; 2666 n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents, 2667 data_sg_offset, meta_sg, meta_sg_nents, 2668 meta_sg_offset); 2669 if (unlikely(n != data_sg_nents + meta_sg_nents)) 2670 return -ENOMEM; 2671 2672 out: 2673 /* This is zero-based memory region */ 2674 ibmr->iova = 0; 2675 mr->pi_mr = pi_mr; 2676 if (pi_mr) 2677 ibmr->sig_attrs->meta_length = pi_mr->meta_length; 2678 else 2679 ibmr->sig_attrs->meta_length = mr->meta_length; 2680 2681 return 0; 2682 } 2683 2684 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, 2685 unsigned int *sg_offset) 2686 { 2687 struct mlx5_ib_mr *mr = to_mmr(ibmr); 2688 int n; 2689 2690 mr->ndescs = 0; 2691 2692 ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map, 2693 mr->desc_size * mr->max_descs, 2694 DMA_TO_DEVICE); 2695 2696 if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS) 2697 n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0, 2698 NULL); 2699 else 2700 n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, 2701 mlx5_set_page); 2702 2703 ib_dma_sync_single_for_device(ibmr->device, mr->desc_map, 2704 mr->desc_size * mr->max_descs, 2705 DMA_TO_DEVICE); 2706 2707 return n; 2708 } 2709