// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "cmd.h"

enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };

static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
				  u16 *vhca_id);
static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);

int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * When PRE_COPY is used, saving_migf is exposed while the device is
	 * running. Make sure to run only once there is no active save command:
	 * running both in parallel might fail the save command once it tries
	 * to turn on 'tracking' on a suspended device.
	 */
	if (migf) {
		err = wait_for_completion_interruptible(&migf->save_comp);
		if (err)
			return err;
	}

	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);

	err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
	if (migf)
		complete(&migf->save_comp);

	return err;
}

int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
	u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
	MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(resume_vhca_in, in, op_mod, op_mod);

	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
}

int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
					  size_t *state_size, u8 query_flags)
{
	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
	bool inc = query_flags & MLX5VF_QUERY_INC;
	int ret;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * When PRE_COPY is used, saving_migf is exposed while the device is
	 * running. Make sure to run only once there is no active save command:
	 * running both in parallel might fail the incremental query command
	 * on an untracked vhca.
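	 * (save_comp is completed again below, once the query command has
	 * been executed.)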
82 */ 83 if (inc) { 84 ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp); 85 if (ret) 86 return ret; 87 if (mvdev->saving_migf->state == 88 MLX5_MIGF_STATE_PRE_COPY_ERROR) { 89 /* 90 * In case we had a PRE_COPY error, only query full 91 * image for final image 92 */ 93 if (!(query_flags & MLX5VF_QUERY_FINAL)) { 94 *state_size = 0; 95 complete(&mvdev->saving_migf->save_comp); 96 return 0; 97 } 98 query_flags &= ~MLX5VF_QUERY_INC; 99 } 100 } 101 102 MLX5_SET(query_vhca_migration_state_in, in, opcode, 103 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE); 104 MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id); 105 MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0); 106 MLX5_SET(query_vhca_migration_state_in, in, incremental, 107 query_flags & MLX5VF_QUERY_INC); 108 109 ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, 110 out); 111 if (inc) 112 complete(&mvdev->saving_migf->save_comp); 113 114 if (ret) 115 return ret; 116 117 *state_size = MLX5_GET(query_vhca_migration_state_out, out, 118 required_umem_size); 119 return 0; 120 } 121 122 static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev) 123 { 124 /* Mark the tracker under an error and wake it up if it's running */ 125 mvdev->tracker.is_err = true; 126 complete(&mvdev->tracker_comp); 127 } 128 129 static int mlx5fv_vf_event(struct notifier_block *nb, 130 unsigned long event, void *data) 131 { 132 struct mlx5vf_pci_core_device *mvdev = 133 container_of(nb, struct mlx5vf_pci_core_device, nb); 134 135 switch (event) { 136 case MLX5_PF_NOTIFY_ENABLE_VF: 137 mutex_lock(&mvdev->state_mutex); 138 mvdev->mdev_detach = false; 139 mlx5vf_state_mutex_unlock(mvdev); 140 break; 141 case MLX5_PF_NOTIFY_DISABLE_VF: 142 mlx5vf_cmd_close_migratable(mvdev); 143 mutex_lock(&mvdev->state_mutex); 144 mvdev->mdev_detach = true; 145 mlx5vf_state_mutex_unlock(mvdev); 146 break; 147 default: 148 break; 149 } 150 151 return 0; 152 } 153 154 void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev) 155 { 156 if (!mvdev->migrate_cap) 157 return; 158 159 /* Must be done outside the lock to let it progress */ 160 set_tracker_error(mvdev); 161 mutex_lock(&mvdev->state_mutex); 162 mlx5vf_disable_fds(mvdev); 163 _mlx5vf_free_page_tracker_resources(mvdev); 164 mlx5vf_state_mutex_unlock(mvdev); 165 } 166 167 void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev) 168 { 169 if (!mvdev->migrate_cap) 170 return; 171 172 mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id, 173 &mvdev->nb); 174 destroy_workqueue(mvdev->cb_wq); 175 } 176 177 void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, 178 const struct vfio_migration_ops *mig_ops, 179 const struct vfio_log_ops *log_ops) 180 { 181 struct pci_dev *pdev = mvdev->core_device.pdev; 182 int ret; 183 184 if (!pdev->is_virtfn) 185 return; 186 187 mvdev->mdev = mlx5_vf_get_core_dev(pdev); 188 if (!mvdev->mdev) 189 return; 190 191 if (!MLX5_CAP_GEN(mvdev->mdev, migration)) 192 goto end; 193 194 mvdev->vf_id = pci_iov_vf_id(pdev); 195 if (mvdev->vf_id < 0) 196 goto end; 197 198 if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1, 199 &mvdev->vhca_id)) 200 goto end; 201 202 mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0); 203 if (!mvdev->cb_wq) 204 goto end; 205 206 mutex_init(&mvdev->state_mutex); 207 spin_lock_init(&mvdev->reset_lock); 208 mvdev->nb.notifier_call = mlx5fv_vf_event; 209 ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id, 210 &mvdev->nb); 211 if (ret) { 212 
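		/* Notifier registration failed: tear down the ordered workqueue and leave migration support disabled. */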
destroy_workqueue(mvdev->cb_wq); 213 goto end; 214 } 215 216 mvdev->migrate_cap = 1; 217 mvdev->core_device.vdev.migration_flags = 218 VFIO_MIGRATION_STOP_COPY | 219 VFIO_MIGRATION_P2P; 220 mvdev->core_device.vdev.mig_ops = mig_ops; 221 init_completion(&mvdev->tracker_comp); 222 if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization)) 223 mvdev->core_device.vdev.log_ops = log_ops; 224 225 if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) && 226 MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state)) 227 mvdev->core_device.vdev.migration_flags |= 228 VFIO_MIGRATION_PRE_COPY; 229 230 end: 231 mlx5_vf_put_core_dev(mvdev->mdev); 232 } 233 234 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id, 235 u16 *vhca_id) 236 { 237 u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; 238 int out_size; 239 void *out; 240 int ret; 241 242 out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out); 243 out = kzalloc(out_size, GFP_KERNEL); 244 if (!out) 245 return -ENOMEM; 246 247 MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); 248 MLX5_SET(query_hca_cap_in, in, other_function, 1); 249 MLX5_SET(query_hca_cap_in, in, function_id, function_id); 250 MLX5_SET(query_hca_cap_in, in, op_mod, 251 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 | 252 HCA_CAP_OPMOD_GET_CUR); 253 254 ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out); 255 if (ret) 256 goto err_exec; 257 258 *vhca_id = MLX5_GET(query_hca_cap_out, out, 259 capability.cmd_hca_cap.vhca_id); 260 261 err_exec: 262 kfree(out); 263 return ret; 264 } 265 266 static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, 267 struct mlx5_vhca_data_buffer *buf, 268 struct mlx5_vhca_recv_buf *recv_buf, 269 u32 *mkey) 270 { 271 size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) : 272 recv_buf->npages; 273 int err = 0, inlen; 274 __be64 *mtt; 275 void *mkc; 276 u32 *in; 277 278 inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + 279 sizeof(*mtt) * round_up(npages, 2); 280 281 in = kvzalloc(inlen, GFP_KERNEL); 282 if (!in) 283 return -ENOMEM; 284 285 MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 286 DIV_ROUND_UP(npages, 2)); 287 mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); 288 289 if (buf) { 290 struct sg_dma_page_iter dma_iter; 291 292 for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0) 293 *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); 294 } else { 295 int i; 296 297 for (i = 0; i < npages; i++) 298 *mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]); 299 } 300 301 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 302 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); 303 MLX5_SET(mkc, mkc, lr, 1); 304 MLX5_SET(mkc, mkc, lw, 1); 305 MLX5_SET(mkc, mkc, rr, 1); 306 MLX5_SET(mkc, mkc, rw, 1); 307 MLX5_SET(mkc, mkc, pd, pdn); 308 MLX5_SET(mkc, mkc, bsf_octword_size, 0); 309 MLX5_SET(mkc, mkc, qpn, 0xffffff); 310 MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); 311 MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2)); 312 MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE); 313 err = mlx5_core_create_mkey(mdev, mkey, in, inlen); 314 kvfree(in); 315 return err; 316 } 317 318 static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) 319 { 320 struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev; 321 struct mlx5_core_dev *mdev = mvdev->mdev; 322 int ret; 323 324 lockdep_assert_held(&mvdev->state_mutex); 325 if (mvdev->mdev_detach) 326 return -ENOTCONN; 327 328 if (buf->dmaed || !buf->allocated_length) 329 return -EINVAL; 330 331 ret = 
dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
	if (ret)
		return ret;

	ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
	if (ret)
		goto err;

	buf->dmaed = true;

	return 0;
err:
	dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
	return ret;
}

void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	struct mlx5_vf_migration_file *migf = buf->migf;
	struct sg_page_iter sg_iter;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	WARN_ON(migf->mvdev->mdev_detach);

	if (buf->dmaed) {
		mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
		dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
				  buf->dma_dir, 0);
	}

	/* Undo alloc_pages_bulk_array() */
	for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
		__free_page(sg_page_iter_page(&sg_iter));
	sg_free_append_table(&buf->table);
	kfree(buf);
}

struct mlx5_vhca_data_buffer *
mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
			 size_t length,
			 enum dma_data_direction dma_dir)
{
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
	if (!buf)
		return ERR_PTR(-ENOMEM);

	buf->dma_dir = dma_dir;
	buf->migf = migf;
	if (length) {
		ret = mlx5vf_add_migration_pages(buf,
				DIV_ROUND_UP_ULL(length, PAGE_SIZE));
		if (ret)
			goto end;

		if (dma_dir != DMA_NONE) {
			ret = mlx5vf_dma_data_buffer(buf);
			if (ret)
				goto end;
		}
	}

	return buf;
end:
	mlx5vf_free_data_buffer(buf);
	return ERR_PTR(ret);
}

void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	spin_lock_irq(&buf->migf->list_lock);
	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
	spin_unlock_irq(&buf->migf->list_lock);
}

struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
		       size_t length, enum dma_data_direction dma_dir)
{
	struct mlx5_vhca_data_buffer *buf, *temp_buf;
	struct list_head free_list;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return ERR_PTR(-ENOTCONN);

	INIT_LIST_HEAD(&free_list);

	spin_lock_irq(&migf->list_lock);
	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
		if (buf->dma_dir == dma_dir) {
			list_del_init(&buf->buf_elm);
			if (buf->allocated_length >= length) {
				spin_unlock_irq(&migf->list_lock);
				goto found;
			}
			/*
			 * Prevent holding redundant buffers: put them on a
			 * local free list and hand them to
			 * mlx5vf_free_data_buffer(), which might sleep, only
			 * at the end, after the spin lock (&migf->list_lock)
			 * has been dropped.
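			 * (The buffer actually returned is either a large
			 * enough one reused from avail_list above or a newly
			 * allocated one below.)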
434 */ 435 list_add(&buf->buf_elm, &free_list); 436 } 437 } 438 spin_unlock_irq(&migf->list_lock); 439 buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir); 440 441 found: 442 while ((temp_buf = list_first_entry_or_null(&free_list, 443 struct mlx5_vhca_data_buffer, buf_elm))) { 444 list_del(&temp_buf->buf_elm); 445 mlx5vf_free_data_buffer(temp_buf); 446 } 447 448 return buf; 449 } 450 451 void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) 452 { 453 struct mlx5vf_async_data *async_data = container_of(_work, 454 struct mlx5vf_async_data, work); 455 struct mlx5_vf_migration_file *migf = container_of(async_data, 456 struct mlx5_vf_migration_file, async_data); 457 458 mutex_lock(&migf->lock); 459 if (async_data->status) { 460 mlx5vf_put_data_buffer(async_data->buf); 461 if (async_data->header_buf) 462 mlx5vf_put_data_buffer(async_data->header_buf); 463 if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR) 464 migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR; 465 else 466 migf->state = MLX5_MIGF_STATE_ERROR; 467 wake_up_interruptible(&migf->poll_wait); 468 } 469 mutex_unlock(&migf->lock); 470 kvfree(async_data->out); 471 complete(&migf->save_comp); 472 fput(migf->filp); 473 } 474 475 static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, 476 size_t image_size) 477 { 478 struct mlx5_vf_migration_file *migf = header_buf->migf; 479 struct mlx5_vf_migration_header header = {}; 480 unsigned long flags; 481 struct page *page; 482 u8 *to_buff; 483 484 header.image_size = cpu_to_le64(image_size); 485 page = mlx5vf_get_migration_page(header_buf, 0); 486 if (!page) 487 return -EINVAL; 488 to_buff = kmap_local_page(page); 489 memcpy(to_buff, &header, sizeof(header)); 490 kunmap_local(to_buff); 491 header_buf->length = sizeof(header); 492 header_buf->header_image_size = image_size; 493 header_buf->start_pos = header_buf->migf->max_pos; 494 migf->max_pos += header_buf->length; 495 spin_lock_irqsave(&migf->list_lock, flags); 496 list_add_tail(&header_buf->buf_elm, &migf->buf_list); 497 spin_unlock_irqrestore(&migf->list_lock, flags); 498 return 0; 499 } 500 501 static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) 502 { 503 struct mlx5vf_async_data *async_data = container_of(context, 504 struct mlx5vf_async_data, cb_work); 505 struct mlx5_vf_migration_file *migf = container_of(async_data, 506 struct mlx5_vf_migration_file, async_data); 507 508 if (!status) { 509 size_t image_size; 510 unsigned long flags; 511 512 image_size = MLX5_GET(save_vhca_state_out, async_data->out, 513 actual_image_size); 514 if (async_data->header_buf) { 515 status = add_buf_header(async_data->header_buf, image_size); 516 if (status) 517 goto err; 518 } 519 async_data->buf->length = image_size; 520 async_data->buf->start_pos = migf->max_pos; 521 migf->max_pos += async_data->buf->length; 522 spin_lock_irqsave(&migf->list_lock, flags); 523 list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); 524 spin_unlock_irqrestore(&migf->list_lock, flags); 525 migf->state = async_data->last_chunk ? 
526 MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY; 527 wake_up_interruptible(&migf->poll_wait); 528 } 529 530 err: 531 /* 532 * The error and the cleanup flows can't run from an 533 * interrupt context 534 */ 535 if (status == -EREMOTEIO) 536 status = MLX5_GET(save_vhca_state_out, async_data->out, status); 537 async_data->status = status; 538 queue_work(migf->mvdev->cb_wq, &async_data->work); 539 } 540 541 int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, 542 struct mlx5_vf_migration_file *migf, 543 struct mlx5_vhca_data_buffer *buf, bool inc, 544 bool track) 545 { 546 u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); 547 u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; 548 struct mlx5_vhca_data_buffer *header_buf = NULL; 549 struct mlx5vf_async_data *async_data; 550 int err; 551 552 lockdep_assert_held(&mvdev->state_mutex); 553 if (mvdev->mdev_detach) 554 return -ENOTCONN; 555 556 err = wait_for_completion_interruptible(&migf->save_comp); 557 if (err) 558 return err; 559 560 if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) 561 /* 562 * In case we had a PRE_COPY error, SAVE is triggered only for 563 * the final image, read device full image. 564 */ 565 inc = false; 566 567 MLX5_SET(save_vhca_state_in, in, opcode, 568 MLX5_CMD_OP_SAVE_VHCA_STATE); 569 MLX5_SET(save_vhca_state_in, in, op_mod, 0); 570 MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); 571 MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); 572 MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); 573 MLX5_SET(save_vhca_state_in, in, incremental, inc); 574 MLX5_SET(save_vhca_state_in, in, set_track, track); 575 576 async_data = &migf->async_data; 577 async_data->buf = buf; 578 async_data->last_chunk = !track; 579 async_data->out = kvzalloc(out_size, GFP_KERNEL); 580 if (!async_data->out) { 581 err = -ENOMEM; 582 goto err_out; 583 } 584 585 if (MLX5VF_PRE_COPY_SUPP(mvdev)) { 586 header_buf = mlx5vf_get_data_buffer(migf, 587 sizeof(struct mlx5_vf_migration_header), DMA_NONE); 588 if (IS_ERR(header_buf)) { 589 err = PTR_ERR(header_buf); 590 goto err_free; 591 } 592 } 593 594 if (async_data->last_chunk) 595 migf->state = MLX5_MIGF_STATE_SAVE_LAST; 596 597 async_data->header_buf = header_buf; 598 get_file(migf->filp); 599 err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), 600 async_data->out, 601 out_size, mlx5vf_save_callback, 602 &async_data->cb_work); 603 if (err) 604 goto err_exec; 605 606 return 0; 607 608 err_exec: 609 if (header_buf) 610 mlx5vf_put_data_buffer(header_buf); 611 fput(migf->filp); 612 err_free: 613 kvfree(async_data->out); 614 err_out: 615 complete(&migf->save_comp); 616 return err; 617 } 618 619 int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, 620 struct mlx5_vf_migration_file *migf, 621 struct mlx5_vhca_data_buffer *buf) 622 { 623 u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; 624 u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; 625 int err; 626 627 lockdep_assert_held(&mvdev->state_mutex); 628 if (mvdev->mdev_detach) 629 return -ENOTCONN; 630 631 if (!buf->dmaed) { 632 err = mlx5vf_dma_data_buffer(buf); 633 if (err) 634 return err; 635 } 636 637 MLX5_SET(load_vhca_state_in, in, opcode, 638 MLX5_CMD_OP_LOAD_VHCA_STATE); 639 MLX5_SET(load_vhca_state_in, in, op_mod, 0); 640 MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id); 641 MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey); 642 MLX5_SET(load_vhca_state_in, in, size, buf->length); 643 return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out); 644 } 645 646 int 
mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf) 647 { 648 int err; 649 650 lockdep_assert_held(&migf->mvdev->state_mutex); 651 if (migf->mvdev->mdev_detach) 652 return -ENOTCONN; 653 654 err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn); 655 return err; 656 } 657 658 void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) 659 { 660 lockdep_assert_held(&migf->mvdev->state_mutex); 661 if (migf->mvdev->mdev_detach) 662 return; 663 664 mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn); 665 } 666 667 void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) 668 { 669 struct mlx5_vhca_data_buffer *entry; 670 671 lockdep_assert_held(&migf->mvdev->state_mutex); 672 WARN_ON(migf->mvdev->mdev_detach); 673 674 if (migf->buf) { 675 mlx5vf_free_data_buffer(migf->buf); 676 migf->buf = NULL; 677 } 678 679 if (migf->buf_header) { 680 mlx5vf_free_data_buffer(migf->buf_header); 681 migf->buf_header = NULL; 682 } 683 684 list_splice(&migf->avail_list, &migf->buf_list); 685 686 while ((entry = list_first_entry_or_null(&migf->buf_list, 687 struct mlx5_vhca_data_buffer, buf_elm))) { 688 list_del(&entry->buf_elm); 689 mlx5vf_free_data_buffer(entry); 690 } 691 692 mlx5vf_cmd_dealloc_pd(migf); 693 } 694 695 static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes, 696 u32 req_nodes) 697 { 698 struct interval_tree_node *prev, *curr, *comb_start, *comb_end; 699 unsigned long min_gap; 700 unsigned long curr_gap; 701 702 /* Special shortcut when a single range is required */ 703 if (req_nodes == 1) { 704 unsigned long last; 705 706 curr = comb_start = interval_tree_iter_first(root, 0, ULONG_MAX); 707 while (curr) { 708 last = curr->last; 709 prev = curr; 710 curr = interval_tree_iter_next(curr, 0, ULONG_MAX); 711 if (prev != comb_start) 712 interval_tree_remove(prev, root); 713 } 714 comb_start->last = last; 715 return; 716 } 717 718 /* Combine ranges which have the smallest gap */ 719 while (cur_nodes > req_nodes) { 720 prev = NULL; 721 min_gap = ULONG_MAX; 722 curr = interval_tree_iter_first(root, 0, ULONG_MAX); 723 while (curr) { 724 if (prev) { 725 curr_gap = curr->start - prev->last; 726 if (curr_gap < min_gap) { 727 min_gap = curr_gap; 728 comb_start = prev; 729 comb_end = curr; 730 } 731 } 732 prev = curr; 733 curr = interval_tree_iter_next(curr, 0, ULONG_MAX); 734 } 735 comb_start->last = comb_end->last; 736 interval_tree_remove(comb_end, root); 737 cur_nodes--; 738 } 739 } 740 741 static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev, 742 struct mlx5vf_pci_core_device *mvdev, 743 struct rb_root_cached *ranges, u32 nnodes) 744 { 745 int max_num_range = 746 MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range); 747 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker; 748 int record_size = MLX5_ST_SZ_BYTES(page_track_range); 749 u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; 750 struct interval_tree_node *node = NULL; 751 u64 total_ranges_len = 0; 752 u32 num_ranges = nnodes; 753 u8 log_addr_space_size; 754 void *range_list_ptr; 755 void *obj_context; 756 void *cmd_hdr; 757 int inlen; 758 void *in; 759 int err; 760 int i; 761 762 if (num_ranges > max_num_range) { 763 combine_ranges(ranges, nnodes, max_num_range); 764 num_ranges = max_num_range; 765 } 766 767 inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) + 768 record_size * num_ranges; 769 in = kzalloc(inlen, GFP_KERNEL); 770 if (!in) 771 return -ENOMEM; 772 773 cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in, 774 general_obj_in_cmd_hdr); 775 MLX5_SET(general_obj_in_cmd_hdr, 
cmd_hdr, opcode, 776 MLX5_CMD_OP_CREATE_GENERAL_OBJECT); 777 MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, 778 MLX5_OBJ_TYPE_PAGE_TRACK); 779 obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context); 780 MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id); 781 MLX5_SET(page_track, obj_context, track_type, 1); 782 MLX5_SET(page_track, obj_context, log_page_size, 783 ilog2(tracker->host_qp->tracked_page_size)); 784 MLX5_SET(page_track, obj_context, log_msg_size, 785 ilog2(tracker->host_qp->max_msg_size)); 786 MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn); 787 MLX5_SET(page_track, obj_context, num_ranges, num_ranges); 788 789 range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range); 790 node = interval_tree_iter_first(ranges, 0, ULONG_MAX); 791 for (i = 0; i < num_ranges; i++) { 792 void *addr_range_i_base = range_list_ptr + record_size * i; 793 unsigned long length = node->last - node->start; 794 795 MLX5_SET64(page_track_range, addr_range_i_base, start_address, 796 node->start); 797 MLX5_SET64(page_track_range, addr_range_i_base, length, length); 798 total_ranges_len += length; 799 node = interval_tree_iter_next(node, 0, ULONG_MAX); 800 } 801 802 WARN_ON(node); 803 log_addr_space_size = ilog2(total_ranges_len); 804 if (log_addr_space_size < 805 (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) || 806 log_addr_space_size > 807 (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) { 808 err = -EOPNOTSUPP; 809 goto out; 810 } 811 812 MLX5_SET(page_track, obj_context, log_addr_space_size, 813 log_addr_space_size); 814 err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); 815 if (err) 816 goto out; 817 818 tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); 819 out: 820 kfree(in); 821 return err; 822 } 823 824 static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev, 825 u32 tracker_id) 826 { 827 u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; 828 u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; 829 830 MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); 831 MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK); 832 MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id); 833 834 return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); 835 } 836 837 static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev, 838 u32 tracker_id, unsigned long iova, 839 unsigned long length, u32 tracker_state) 840 { 841 u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {}; 842 u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; 843 void *obj_context; 844 void *cmd_hdr; 845 846 cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr); 847 MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT); 848 MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK); 849 MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id); 850 851 obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context); 852 MLX5_SET64(page_track, obj_context, modify_field_select, 0x3); 853 MLX5_SET64(page_track, obj_context, range_start_address, iova); 854 MLX5_SET64(page_track, obj_context, length, length); 855 MLX5_SET(page_track, obj_context, state, tracker_state); 856 857 return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); 858 } 859 860 static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev, 861 struct mlx5_vhca_cq_buf *buf, int nent, 862 int cqe_size) 863 { 864 struct 
mlx5_frag_buf *frag_buf = &buf->frag_buf; 865 u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0); 866 u8 log_wq_sz = ilog2(cqe_size); 867 int err; 868 869 err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf, 870 mdev->priv.numa_node); 871 if (err) 872 return err; 873 874 mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc); 875 buf->cqe_size = cqe_size; 876 buf->nent = nent; 877 return 0; 878 } 879 880 static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf) 881 { 882 struct mlx5_cqe64 *cqe64; 883 void *cqe; 884 int i; 885 886 for (i = 0; i < buf->nent; i++) { 887 cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i); 888 cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64; 889 cqe64->op_own = MLX5_CQE_INVALID << 4; 890 } 891 } 892 893 static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev, 894 struct mlx5_vhca_cq *cq) 895 { 896 mlx5_core_destroy_cq(mdev, &cq->mcq); 897 mlx5_frag_buf_free(mdev, &cq->buf.frag_buf); 898 mlx5_db_free(mdev, &cq->db); 899 } 900 901 static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type) 902 { 903 if (type != MLX5_EVENT_TYPE_CQ_ERROR) 904 return; 905 906 set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device, 907 tracker.cq.mcq)); 908 } 909 910 static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type, 911 void *data) 912 { 913 struct mlx5_vhca_page_tracker *tracker = 914 mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb); 915 struct mlx5vf_pci_core_device *mvdev = container_of( 916 tracker, struct mlx5vf_pci_core_device, tracker); 917 struct mlx5_eqe *eqe = data; 918 u8 event_type = (u8)type; 919 u8 queue_type; 920 int qp_num; 921 922 switch (event_type) { 923 case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: 924 case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: 925 case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: 926 queue_type = eqe->data.qp_srq.type; 927 if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP) 928 break; 929 qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; 930 if (qp_num != tracker->host_qp->qpn && 931 qp_num != tracker->fw_qp->qpn) 932 break; 933 set_tracker_error(mvdev); 934 break; 935 default: 936 break; 937 } 938 939 return NOTIFY_OK; 940 } 941 942 static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq, 943 struct mlx5_eqe *eqe) 944 { 945 struct mlx5vf_pci_core_device *mvdev = 946 container_of(mcq, struct mlx5vf_pci_core_device, 947 tracker.cq.mcq); 948 949 complete(&mvdev->tracker_comp); 950 } 951 952 static int mlx5vf_create_cq(struct mlx5_core_dev *mdev, 953 struct mlx5_vhca_page_tracker *tracker, 954 size_t ncqe) 955 { 956 int cqe_size = cache_line_size() == 128 ? 
128 : 64; 957 u32 out[MLX5_ST_SZ_DW(create_cq_out)]; 958 struct mlx5_vhca_cq *cq; 959 int inlen, err, eqn; 960 void *cqc, *in; 961 __be64 *pas; 962 int vector; 963 964 cq = &tracker->cq; 965 ncqe = roundup_pow_of_two(ncqe); 966 err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node); 967 if (err) 968 return err; 969 970 cq->ncqe = ncqe; 971 cq->mcq.set_ci_db = cq->db.db; 972 cq->mcq.arm_db = cq->db.db + 1; 973 cq->mcq.cqe_sz = cqe_size; 974 err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size); 975 if (err) 976 goto err_db_free; 977 978 init_cq_frag_buf(&cq->buf); 979 inlen = MLX5_ST_SZ_BYTES(create_cq_in) + 980 MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * 981 cq->buf.frag_buf.npages; 982 in = kvzalloc(inlen, GFP_KERNEL); 983 if (!in) { 984 err = -ENOMEM; 985 goto err_buff; 986 } 987 988 vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev); 989 err = mlx5_vector2eqn(mdev, vector, &eqn); 990 if (err) 991 goto err_vec; 992 993 cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); 994 MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe)); 995 MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); 996 MLX5_SET(cqc, cqc, uar_page, tracker->uar->index); 997 MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift - 998 MLX5_ADAPTER_PAGE_SHIFT); 999 MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma); 1000 pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas); 1001 mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas); 1002 cq->mcq.comp = mlx5vf_cq_complete; 1003 cq->mcq.event = mlx5vf_cq_event; 1004 err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out)); 1005 if (err) 1006 goto err_vec; 1007 1008 mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map, 1009 cq->mcq.cons_index); 1010 kvfree(in); 1011 return 0; 1012 1013 err_vec: 1014 kvfree(in); 1015 err_buff: 1016 mlx5_frag_buf_free(mdev, &cq->buf.frag_buf); 1017 err_db_free: 1018 mlx5_db_free(mdev, &cq->db); 1019 return err; 1020 } 1021 1022 static struct mlx5_vhca_qp * 1023 mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev, 1024 struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr) 1025 { 1026 u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; 1027 struct mlx5_vhca_qp *qp; 1028 u8 log_rq_stride; 1029 u8 log_rq_sz; 1030 void *qpc; 1031 int inlen; 1032 void *in; 1033 int err; 1034 1035 qp = kzalloc(sizeof(*qp), GFP_KERNEL); 1036 if (!qp) 1037 return ERR_PTR(-ENOMEM); 1038 1039 qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr); 1040 log_rq_stride = ilog2(MLX5_SEND_WQE_DS); 1041 log_rq_sz = ilog2(qp->rq.wqe_cnt); 1042 err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node); 1043 if (err) 1044 goto err_free; 1045 1046 if (max_recv_wr) { 1047 err = mlx5_frag_buf_alloc_node(mdev, 1048 wq_get_byte_sz(log_rq_sz, log_rq_stride), 1049 &qp->buf, mdev->priv.numa_node); 1050 if (err) 1051 goto err_db_free; 1052 mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc); 1053 } 1054 1055 qp->rq.db = &qp->db.db[MLX5_RCV_DBR]; 1056 inlen = MLX5_ST_SZ_BYTES(create_qp_in) + 1057 MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * 1058 qp->buf.npages; 1059 in = kvzalloc(inlen, GFP_KERNEL); 1060 if (!in) { 1061 err = -ENOMEM; 1062 goto err_in; 1063 } 1064 1065 qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); 1066 MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC); 1067 MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); 1068 MLX5_SET(qpc, qpc, pd, tracker->pdn); 1069 MLX5_SET(qpc, qpc, uar_page, tracker->uar->index); 1070 MLX5_SET(qpc, qpc, log_page_size, 1071 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); 1072 MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev)); 1073 if 
(MLX5_CAP_GEN(mdev, cqe_version) == 1) 1074 MLX5_SET(qpc, qpc, user_index, 0xFFFFFF); 1075 MLX5_SET(qpc, qpc, no_sq, 1); 1076 if (max_recv_wr) { 1077 MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn); 1078 MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4); 1079 MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz); 1080 MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ); 1081 MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma); 1082 mlx5_fill_page_frag_array(&qp->buf, 1083 (__be64 *)MLX5_ADDR_OF(create_qp_in, 1084 in, pas)); 1085 } else { 1086 MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ); 1087 } 1088 1089 MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); 1090 err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); 1091 kvfree(in); 1092 if (err) 1093 goto err_in; 1094 1095 qp->qpn = MLX5_GET(create_qp_out, out, qpn); 1096 return qp; 1097 1098 err_in: 1099 if (max_recv_wr) 1100 mlx5_frag_buf_free(mdev, &qp->buf); 1101 err_db_free: 1102 mlx5_db_free(mdev, &qp->db); 1103 err_free: 1104 kfree(qp); 1105 return ERR_PTR(err); 1106 } 1107 1108 static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp) 1109 { 1110 struct mlx5_wqe_data_seg *data; 1111 unsigned int ix; 1112 1113 WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt); 1114 ix = qp->rq.pc & (qp->rq.wqe_cnt - 1); 1115 data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix); 1116 data->byte_count = cpu_to_be32(qp->max_msg_size); 1117 data->lkey = cpu_to_be32(qp->recv_buf.mkey); 1118 data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset); 1119 qp->rq.pc++; 1120 /* Make sure that descriptors are written before doorbell record. */ 1121 dma_wmb(); 1122 *qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff); 1123 } 1124 1125 static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev, 1126 struct mlx5_vhca_qp *qp, u32 remote_qpn, 1127 bool host_qp) 1128 { 1129 u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {}; 1130 u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {}; 1131 u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {}; 1132 void *qpc; 1133 int ret; 1134 1135 /* Init */ 1136 qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc); 1137 MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1); 1138 MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED); 1139 MLX5_SET(qpc, qpc, rre, 1); 1140 MLX5_SET(qpc, qpc, rwe, 1); 1141 MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP); 1142 MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn); 1143 ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in); 1144 if (ret) 1145 return ret; 1146 1147 if (host_qp) { 1148 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; 1149 int i; 1150 1151 for (i = 0; i < qp->rq.wqe_cnt; i++) { 1152 mlx5vf_post_recv(qp); 1153 recv_buf->next_rq_offset += qp->max_msg_size; 1154 } 1155 } 1156 1157 /* RTR */ 1158 qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc); 1159 MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn); 1160 MLX5_SET(qpc, qpc, mtu, IB_MTU_4096); 1161 MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg)); 1162 MLX5_SET(qpc, qpc, remote_qpn, remote_qpn); 1163 MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1); 1164 MLX5_SET(qpc, qpc, primary_address_path.fl, 1); 1165 MLX5_SET(qpc, qpc, min_rnr_nak, 1); 1166 MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP); 1167 MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn); 1168 ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in); 1169 if (ret || host_qp) 1170 return ret; 1171 1172 /* RTS */ 1173 qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc); 1174 MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn); 1175 MLX5_SET(qpc, qpc, retry_count, 7); 1176 MLX5_SET(qpc, qpc, 
rnr_retry, 7); /* Infinite retry if RNR NACK */ 1177 MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */ 1178 MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP); 1179 MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn); 1180 1181 return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in); 1182 } 1183 1184 static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev, 1185 struct mlx5_vhca_qp *qp) 1186 { 1187 u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; 1188 1189 MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); 1190 MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); 1191 mlx5_cmd_exec_in(mdev, destroy_qp, in); 1192 1193 mlx5_frag_buf_free(mdev, &qp->buf); 1194 mlx5_db_free(mdev, &qp->db); 1195 kfree(qp); 1196 } 1197 1198 static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf) 1199 { 1200 int i; 1201 1202 /* Undo alloc_pages_bulk_array() */ 1203 for (i = 0; i < recv_buf->npages; i++) 1204 __free_page(recv_buf->page_list[i]); 1205 1206 kvfree(recv_buf->page_list); 1207 } 1208 1209 static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, 1210 unsigned int npages) 1211 { 1212 unsigned int filled = 0, done = 0; 1213 int i; 1214 1215 recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list), 1216 GFP_KERNEL); 1217 if (!recv_buf->page_list) 1218 return -ENOMEM; 1219 1220 for (;;) { 1221 filled = alloc_pages_bulk_array(GFP_KERNEL, npages - done, 1222 recv_buf->page_list + done); 1223 if (!filled) 1224 goto err; 1225 1226 done += filled; 1227 if (done == npages) 1228 break; 1229 } 1230 1231 recv_buf->npages = npages; 1232 return 0; 1233 1234 err: 1235 for (i = 0; i < npages; i++) { 1236 if (recv_buf->page_list[i]) 1237 __free_page(recv_buf->page_list[i]); 1238 } 1239 1240 kvfree(recv_buf->page_list); 1241 return -ENOMEM; 1242 } 1243 1244 static int register_dma_recv_pages(struct mlx5_core_dev *mdev, 1245 struct mlx5_vhca_recv_buf *recv_buf) 1246 { 1247 int i, j; 1248 1249 recv_buf->dma_addrs = kvcalloc(recv_buf->npages, 1250 sizeof(*recv_buf->dma_addrs), 1251 GFP_KERNEL); 1252 if (!recv_buf->dma_addrs) 1253 return -ENOMEM; 1254 1255 for (i = 0; i < recv_buf->npages; i++) { 1256 recv_buf->dma_addrs[i] = dma_map_page(mdev->device, 1257 recv_buf->page_list[i], 1258 0, PAGE_SIZE, 1259 DMA_FROM_DEVICE); 1260 if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i])) 1261 goto error; 1262 } 1263 return 0; 1264 1265 error: 1266 for (j = 0; j < i; j++) 1267 dma_unmap_single(mdev->device, recv_buf->dma_addrs[j], 1268 PAGE_SIZE, DMA_FROM_DEVICE); 1269 1270 kvfree(recv_buf->dma_addrs); 1271 return -ENOMEM; 1272 } 1273 1274 static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev, 1275 struct mlx5_vhca_recv_buf *recv_buf) 1276 { 1277 int i; 1278 1279 for (i = 0; i < recv_buf->npages; i++) 1280 dma_unmap_single(mdev->device, recv_buf->dma_addrs[i], 1281 PAGE_SIZE, DMA_FROM_DEVICE); 1282 1283 kvfree(recv_buf->dma_addrs); 1284 } 1285 1286 static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev, 1287 struct mlx5_vhca_qp *qp) 1288 { 1289 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; 1290 1291 mlx5_core_destroy_mkey(mdev, recv_buf->mkey); 1292 unregister_dma_recv_pages(mdev, recv_buf); 1293 free_recv_pages(&qp->recv_buf); 1294 } 1295 1296 static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev, 1297 struct mlx5_vhca_qp *qp, u32 pdn, 1298 u64 rq_size) 1299 { 1300 unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE); 1301 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; 1302 int err; 1303 1304 err = alloc_recv_pages(recv_buf, 
npages); 1305 if (err < 0) 1306 return err; 1307 1308 err = register_dma_recv_pages(mdev, recv_buf); 1309 if (err) 1310 goto end; 1311 1312 err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey); 1313 if (err) 1314 goto err_create_mkey; 1315 1316 return 0; 1317 1318 err_create_mkey: 1319 unregister_dma_recv_pages(mdev, recv_buf); 1320 end: 1321 free_recv_pages(recv_buf); 1322 return err; 1323 } 1324 1325 static void 1326 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev) 1327 { 1328 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker; 1329 struct mlx5_core_dev *mdev = mvdev->mdev; 1330 1331 lockdep_assert_held(&mvdev->state_mutex); 1332 1333 if (!mvdev->log_active) 1334 return; 1335 1336 WARN_ON(mvdev->mdev_detach); 1337 1338 mlx5_eq_notifier_unregister(mdev, &tracker->nb); 1339 mlx5vf_cmd_destroy_tracker(mdev, tracker->id); 1340 mlx5vf_destroy_qp(mdev, tracker->fw_qp); 1341 mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp); 1342 mlx5vf_destroy_qp(mdev, tracker->host_qp); 1343 mlx5vf_destroy_cq(mdev, &tracker->cq); 1344 mlx5_core_dealloc_pd(mdev, tracker->pdn); 1345 mlx5_put_uars_page(mdev, tracker->uar); 1346 mvdev->log_active = false; 1347 } 1348 1349 int mlx5vf_stop_page_tracker(struct vfio_device *vdev) 1350 { 1351 struct mlx5vf_pci_core_device *mvdev = container_of( 1352 vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1353 1354 mutex_lock(&mvdev->state_mutex); 1355 if (!mvdev->log_active) 1356 goto end; 1357 1358 _mlx5vf_free_page_tracker_resources(mvdev); 1359 mvdev->log_active = false; 1360 end: 1361 mlx5vf_state_mutex_unlock(mvdev); 1362 return 0; 1363 } 1364 1365 int mlx5vf_start_page_tracker(struct vfio_device *vdev, 1366 struct rb_root_cached *ranges, u32 nnodes, 1367 u64 *page_size) 1368 { 1369 struct mlx5vf_pci_core_device *mvdev = container_of( 1370 vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1371 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker; 1372 u8 log_tracked_page = ilog2(*page_size); 1373 struct mlx5_vhca_qp *host_qp; 1374 struct mlx5_vhca_qp *fw_qp; 1375 struct mlx5_core_dev *mdev; 1376 u32 max_msg_size = PAGE_SIZE; 1377 u64 rq_size = SZ_2M; 1378 u32 max_recv_wr; 1379 int err; 1380 1381 mutex_lock(&mvdev->state_mutex); 1382 if (mvdev->mdev_detach) { 1383 err = -ENOTCONN; 1384 goto end; 1385 } 1386 1387 if (mvdev->log_active) { 1388 err = -EINVAL; 1389 goto end; 1390 } 1391 1392 mdev = mvdev->mdev; 1393 memset(tracker, 0, sizeof(*tracker)); 1394 tracker->uar = mlx5_get_uars_page(mdev); 1395 if (IS_ERR(tracker->uar)) { 1396 err = PTR_ERR(tracker->uar); 1397 goto end; 1398 } 1399 1400 err = mlx5_core_alloc_pd(mdev, &tracker->pdn); 1401 if (err) 1402 goto err_uar; 1403 1404 max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size); 1405 err = mlx5vf_create_cq(mdev, tracker, max_recv_wr); 1406 if (err) 1407 goto err_dealloc_pd; 1408 1409 host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr); 1410 if (IS_ERR(host_qp)) { 1411 err = PTR_ERR(host_qp); 1412 goto err_cq; 1413 } 1414 1415 host_qp->max_msg_size = max_msg_size; 1416 if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev, 1417 pg_track_log_min_page_size)) { 1418 log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev, 1419 pg_track_log_min_page_size); 1420 } else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev, 1421 pg_track_log_max_page_size)) { 1422 log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev, 1423 pg_track_log_max_page_size); 1424 } 1425 1426 host_qp->tracked_page_size = (1ULL << log_tracked_page); 1427 err = 
mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn, 1428 rq_size); 1429 if (err) 1430 goto err_host_qp; 1431 1432 fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0); 1433 if (IS_ERR(fw_qp)) { 1434 err = PTR_ERR(fw_qp); 1435 goto err_recv_resources; 1436 } 1437 1438 err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true); 1439 if (err) 1440 goto err_activate; 1441 1442 err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false); 1443 if (err) 1444 goto err_activate; 1445 1446 tracker->host_qp = host_qp; 1447 tracker->fw_qp = fw_qp; 1448 err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes); 1449 if (err) 1450 goto err_activate; 1451 1452 MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY); 1453 mlx5_eq_notifier_register(mdev, &tracker->nb); 1454 *page_size = host_qp->tracked_page_size; 1455 mvdev->log_active = true; 1456 mlx5vf_state_mutex_unlock(mvdev); 1457 return 0; 1458 1459 err_activate: 1460 mlx5vf_destroy_qp(mdev, fw_qp); 1461 err_recv_resources: 1462 mlx5vf_free_qp_recv_resources(mdev, host_qp); 1463 err_host_qp: 1464 mlx5vf_destroy_qp(mdev, host_qp); 1465 err_cq: 1466 mlx5vf_destroy_cq(mdev, &tracker->cq); 1467 err_dealloc_pd: 1468 mlx5_core_dealloc_pd(mdev, tracker->pdn); 1469 err_uar: 1470 mlx5_put_uars_page(mdev, tracker->uar); 1471 end: 1472 mlx5vf_state_mutex_unlock(mvdev); 1473 return err; 1474 } 1475 1476 static void 1477 set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp, 1478 struct iova_bitmap *dirty) 1479 { 1480 u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry); 1481 u32 nent = size / entry_size; 1482 struct page *page; 1483 u64 addr; 1484 u64 *buf; 1485 int i; 1486 1487 if (WARN_ON(index >= qp->recv_buf.npages || 1488 (nent > qp->max_msg_size / entry_size))) 1489 return; 1490 1491 page = qp->recv_buf.page_list[index]; 1492 buf = kmap_local_page(page); 1493 for (i = 0; i < nent; i++) { 1494 addr = MLX5_GET(page_track_report_entry, buf + i, 1495 dirty_address_low); 1496 addr |= (u64)MLX5_GET(page_track_report_entry, buf + i, 1497 dirty_address_high) << 32; 1498 iova_bitmap_set(dirty, addr, qp->tracked_page_size); 1499 } 1500 kunmap_local(buf); 1501 } 1502 1503 static void 1504 mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe, 1505 struct iova_bitmap *dirty, int *tracker_status) 1506 { 1507 u32 size; 1508 int ix; 1509 1510 qp->rq.cc++; 1511 *tracker_status = be32_to_cpu(cqe->immediate) >> 28; 1512 size = be32_to_cpu(cqe->byte_cnt); 1513 ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1); 1514 1515 /* zero length CQE, no data */ 1516 WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING); 1517 if (size) 1518 set_report_output(size, ix, qp, dirty); 1519 1520 qp->recv_buf.next_rq_offset = ix * qp->max_msg_size; 1521 mlx5vf_post_recv(qp); 1522 } 1523 1524 static void *get_cqe(struct mlx5_vhca_cq *cq, int n) 1525 { 1526 return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n); 1527 } 1528 1529 static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n) 1530 { 1531 void *cqe = get_cqe(cq, n & (cq->ncqe - 1)); 1532 struct mlx5_cqe64 *cqe64; 1533 1534 cqe64 = (cq->mcq.cqe_sz == 64) ? 
cqe : cqe + 64; 1535 1536 if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) && 1537 !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) { 1538 return cqe64; 1539 } else { 1540 return NULL; 1541 } 1542 } 1543 1544 static int 1545 mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp, 1546 struct iova_bitmap *dirty, int *tracker_status) 1547 { 1548 struct mlx5_cqe64 *cqe; 1549 u8 opcode; 1550 1551 cqe = get_sw_cqe(cq, cq->mcq.cons_index); 1552 if (!cqe) 1553 return CQ_EMPTY; 1554 1555 ++cq->mcq.cons_index; 1556 /* 1557 * Make sure we read CQ entry contents after we've checked the 1558 * ownership bit. 1559 */ 1560 rmb(); 1561 opcode = get_cqe_opcode(cqe); 1562 switch (opcode) { 1563 case MLX5_CQE_RESP_SEND_IMM: 1564 mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status); 1565 return CQ_OK; 1566 default: 1567 return CQ_POLL_ERR; 1568 } 1569 } 1570 1571 int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova, 1572 unsigned long length, 1573 struct iova_bitmap *dirty) 1574 { 1575 struct mlx5vf_pci_core_device *mvdev = container_of( 1576 vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1577 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker; 1578 struct mlx5_vhca_cq *cq = &tracker->cq; 1579 struct mlx5_core_dev *mdev; 1580 int poll_err, err; 1581 1582 mutex_lock(&mvdev->state_mutex); 1583 if (!mvdev->log_active) { 1584 err = -EINVAL; 1585 goto end; 1586 } 1587 1588 if (mvdev->mdev_detach) { 1589 err = -ENOTCONN; 1590 goto end; 1591 } 1592 1593 mdev = mvdev->mdev; 1594 err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length, 1595 MLX5_PAGE_TRACK_STATE_REPORTING); 1596 if (err) 1597 goto end; 1598 1599 tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING; 1600 while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING && 1601 !tracker->is_err) { 1602 poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty, 1603 &tracker->status); 1604 if (poll_err == CQ_EMPTY) { 1605 mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map, 1606 cq->mcq.cons_index); 1607 poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, 1608 dirty, &tracker->status); 1609 if (poll_err == CQ_EMPTY) { 1610 wait_for_completion(&mvdev->tracker_comp); 1611 continue; 1612 } 1613 } 1614 if (poll_err == CQ_POLL_ERR) { 1615 err = -EIO; 1616 goto end; 1617 } 1618 mlx5_cq_set_ci(&cq->mcq); 1619 } 1620 1621 if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR) 1622 tracker->is_err = true; 1623 1624 if (tracker->is_err) 1625 err = -EIO; 1626 end: 1627 mlx5vf_state_mutex_unlock(mvdev); 1628 return err; 1629 } 1630