1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved 4 */ 5 6 #include <linux/device.h> 7 #include <linux/eventfd.h> 8 #include <linux/file.h> 9 #include <linux/interrupt.h> 10 #include <linux/iommu.h> 11 #include <linux/module.h> 12 #include <linux/mutex.h> 13 #include <linux/notifier.h> 14 #include <linux/pci.h> 15 #include <linux/pm_runtime.h> 16 #include <linux/types.h> 17 #include <linux/uaccess.h> 18 #include <linux/vfio.h> 19 #include <linux/sched/mm.h> 20 #include <linux/anon_inodes.h> 21 22 #include "cmd.h" 23 24 /* Arbitrary to prevent userspace from consuming endless memory */ 25 #define MAX_MIGRATION_SIZE (512*1024*1024) 26 27 static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) 28 { 29 struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); 30 31 return container_of(core_device, struct mlx5vf_pci_core_device, 32 core_device); 33 } 34 35 struct page * 36 mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, 37 unsigned long offset) 38 { 39 unsigned long cur_offset = 0; 40 struct scatterlist *sg; 41 unsigned int i; 42 43 /* All accesses are sequential */ 44 if (offset < buf->last_offset || !buf->last_offset_sg) { 45 buf->last_offset = 0; 46 buf->last_offset_sg = buf->table.sgt.sgl; 47 buf->sg_last_entry = 0; 48 } 49 50 cur_offset = buf->last_offset; 51 52 for_each_sg(buf->last_offset_sg, sg, 53 buf->table.sgt.orig_nents - buf->sg_last_entry, i) { 54 if (offset < sg->length + cur_offset) { 55 buf->last_offset_sg = sg; 56 buf->sg_last_entry += i; 57 buf->last_offset = cur_offset; 58 return nth_page(sg_page(sg), 59 (offset - cur_offset) / PAGE_SIZE); 60 } 61 cur_offset += sg->length; 62 } 63 return NULL; 64 } 65 66 int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, 67 unsigned int npages) 68 { 69 unsigned int to_alloc = npages; 70 struct page **page_list; 71 unsigned long filled; 72 unsigned int to_fill; 73 int ret; 74 75 to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list)); 76 page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL); 77 if (!page_list) 78 return -ENOMEM; 79 80 do { 81 filled = alloc_pages_bulk_array(GFP_KERNEL, to_fill, page_list); 82 if (!filled) { 83 ret = -ENOMEM; 84 goto err; 85 } 86 to_alloc -= filled; 87 ret = sg_alloc_append_table_from_pages( 88 &buf->table, page_list, filled, 0, 89 filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, 90 GFP_KERNEL); 91 92 if (ret) 93 goto err; 94 buf->allocated_length += filled * PAGE_SIZE; 95 /* clean input for another bulk allocation */ 96 memset(page_list, 0, filled * sizeof(*page_list)); 97 to_fill = min_t(unsigned int, to_alloc, 98 PAGE_SIZE / sizeof(*page_list)); 99 } while (to_alloc > 0); 100 101 kvfree(page_list); 102 return 0; 103 104 err: 105 kvfree(page_list); 106 return ret; 107 } 108 109 static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) 110 { 111 mutex_lock(&migf->lock); 112 migf->state = MLX5_MIGF_STATE_ERROR; 113 migf->filp->f_pos = 0; 114 mutex_unlock(&migf->lock); 115 } 116 117 static int mlx5vf_release_file(struct inode *inode, struct file *filp) 118 { 119 struct mlx5_vf_migration_file *migf = filp->private_data; 120 121 mlx5vf_disable_fd(migf); 122 mutex_destroy(&migf->lock); 123 kfree(migf); 124 return 0; 125 } 126 127 static struct mlx5_vhca_data_buffer * 128 mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos, 129 bool *end_of_data) 130 { 131 struct mlx5_vhca_data_buffer *buf; 132 bool found = false; 133 134 *end_of_data = false; 135 spin_lock_irq(&migf->list_lock); 136 if (list_empty(&migf->buf_list)) { 137 *end_of_data = true; 138 goto end; 139 } 140 141 buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer, 142 buf_elm); 143 if (pos >= buf->start_pos && 144 pos < buf->start_pos + buf->length) { 145 found = true; 146 goto end; 147 } 148 149 /* 150 * As we use a stream based FD we may expect having the data always 151 * on first chunk 152 */ 153 migf->state = MLX5_MIGF_STATE_ERROR; 154 155 end: 156 spin_unlock_irq(&migf->list_lock); 157 return found ? buf : NULL; 158 } 159 160 static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, 161 char __user **buf, size_t *len, loff_t *pos) 162 { 163 unsigned long offset; 164 ssize_t done = 0; 165 size_t copy_len; 166 167 copy_len = min_t(size_t, 168 vhca_buf->start_pos + vhca_buf->length - *pos, *len); 169 while (copy_len) { 170 size_t page_offset; 171 struct page *page; 172 size_t page_len; 173 u8 *from_buff; 174 int ret; 175 176 offset = *pos - vhca_buf->start_pos; 177 page_offset = offset % PAGE_SIZE; 178 offset -= page_offset; 179 page = mlx5vf_get_migration_page(vhca_buf, offset); 180 if (!page) 181 return -EINVAL; 182 page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset); 183 from_buff = kmap_local_page(page); 184 ret = copy_to_user(*buf, from_buff + page_offset, page_len); 185 kunmap_local(from_buff); 186 if (ret) 187 return -EFAULT; 188 *pos += page_len; 189 *len -= page_len; 190 *buf += page_len; 191 done += page_len; 192 copy_len -= page_len; 193 } 194 195 if (*pos >= vhca_buf->start_pos + vhca_buf->length) { 196 spin_lock_irq(&vhca_buf->migf->list_lock); 197 list_del_init(&vhca_buf->buf_elm); 198 list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); 199 spin_unlock_irq(&vhca_buf->migf->list_lock); 200 } 201 202 return done; 203 } 204 205 static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, 206 loff_t *pos) 207 { 208 struct mlx5_vf_migration_file *migf = filp->private_data; 209 struct mlx5_vhca_data_buffer *vhca_buf; 210 bool first_loop_call = true; 211 bool end_of_data; 212 ssize_t done = 0; 213 214 if (pos) 215 return -ESPIPE; 216 pos = &filp->f_pos; 217 218 if (!(filp->f_flags & O_NONBLOCK)) { 219 if (wait_event_interruptible(migf->poll_wait, 220 !list_empty(&migf->buf_list) || 221 migf->state == MLX5_MIGF_STATE_ERROR || 222 migf->state == MLX5_MIGF_STATE_COMPLETE)) 223 return -ERESTARTSYS; 224 } 225 226 mutex_lock(&migf->lock); 227 if (migf->state == MLX5_MIGF_STATE_ERROR) { 228 done = -ENODEV; 229 goto out_unlock; 230 } 231 232 while (len) { 233 ssize_t count; 234 235 vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos, 236 &end_of_data); 237 if (first_loop_call) { 238 first_loop_call = false; 239 if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) { 240 if (filp->f_flags & O_NONBLOCK) { 241 done = -EAGAIN; 242 goto out_unlock; 243 } 244 } 245 } 246 247 if (end_of_data) 248 goto out_unlock; 249 250 if (!vhca_buf) { 251 done = -EINVAL; 252 goto out_unlock; 253 } 254 255 count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos); 256 if (count < 0) { 257 done = count; 258 goto out_unlock; 259 } 260 done += count; 261 } 262 263 out_unlock: 264 mutex_unlock(&migf->lock); 265 return done; 266 } 267 268 static __poll_t mlx5vf_save_poll(struct file *filp, 269 struct poll_table_struct *wait) 270 { 271 struct mlx5_vf_migration_file *migf = filp->private_data; 272 __poll_t pollflags = 0; 273 274 poll_wait(filp, &migf->poll_wait, wait); 275 276 mutex_lock(&migf->lock); 277 if (migf->state == MLX5_MIGF_STATE_ERROR) 278 pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; 279 else if (!list_empty(&migf->buf_list) || 280 migf->state == MLX5_MIGF_STATE_COMPLETE) 281 pollflags = EPOLLIN | EPOLLRDNORM; 282 mutex_unlock(&migf->lock); 283 284 return pollflags; 285 } 286 287 /* 288 * FD is exposed and user can use it after receiving an error. 289 * Mark migf in error, and wake the user. 290 */ 291 static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) 292 { 293 migf->state = MLX5_MIGF_STATE_ERROR; 294 wake_up_interruptible(&migf->poll_wait); 295 } 296 297 static const struct file_operations mlx5vf_save_fops = { 298 .owner = THIS_MODULE, 299 .read = mlx5vf_save_read, 300 .poll = mlx5vf_save_poll, 301 .release = mlx5vf_release_file, 302 .llseek = no_llseek, 303 }; 304 305 static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) 306 { 307 struct mlx5_vf_migration_file *migf = mvdev->saving_migf; 308 struct mlx5_vhca_data_buffer *buf; 309 size_t length; 310 int ret; 311 312 if (migf->state == MLX5_MIGF_STATE_ERROR) 313 return -ENODEV; 314 315 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 316 MLX5VF_QUERY_INC); 317 if (ret) 318 goto err; 319 320 buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE); 321 if (IS_ERR(buf)) { 322 ret = PTR_ERR(buf); 323 goto err; 324 } 325 326 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false); 327 if (ret) 328 goto err_save; 329 330 return 0; 331 332 err_save: 333 mlx5vf_put_data_buffer(buf); 334 err: 335 mlx5vf_mark_err(migf); 336 return ret; 337 } 338 339 static struct mlx5_vf_migration_file * 340 mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) 341 { 342 struct mlx5_vf_migration_file *migf; 343 struct mlx5_vhca_data_buffer *buf; 344 size_t length; 345 int ret; 346 347 migf = kzalloc(sizeof(*migf), GFP_KERNEL); 348 if (!migf) 349 return ERR_PTR(-ENOMEM); 350 351 migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf, 352 O_RDONLY); 353 if (IS_ERR(migf->filp)) { 354 ret = PTR_ERR(migf->filp); 355 goto end; 356 } 357 358 migf->mvdev = mvdev; 359 ret = mlx5vf_cmd_alloc_pd(migf); 360 if (ret) 361 goto out_free; 362 363 stream_open(migf->filp->f_inode, migf->filp); 364 mutex_init(&migf->lock); 365 init_waitqueue_head(&migf->poll_wait); 366 init_completion(&migf->save_comp); 367 /* 368 * save_comp is being used as a binary semaphore built from 369 * a completion. A normal mutex cannot be used because the lock is 370 * passed between kernel threads and lockdep can't model this. 371 */ 372 complete(&migf->save_comp); 373 mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); 374 INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); 375 INIT_LIST_HEAD(&migf->buf_list); 376 INIT_LIST_HEAD(&migf->avail_list); 377 spin_lock_init(&migf->list_lock); 378 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0); 379 if (ret) 380 goto out_pd; 381 382 buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE); 383 if (IS_ERR(buf)) { 384 ret = PTR_ERR(buf); 385 goto out_pd; 386 } 387 388 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track); 389 if (ret) 390 goto out_save; 391 return migf; 392 out_save: 393 mlx5vf_free_data_buffer(buf); 394 out_pd: 395 mlx5vf_cmd_dealloc_pd(migf); 396 out_free: 397 fput(migf->filp); 398 end: 399 kfree(migf); 400 return ERR_PTR(ret); 401 } 402 403 static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, 404 size_t len, loff_t *pos) 405 { 406 struct mlx5_vf_migration_file *migf = filp->private_data; 407 struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; 408 loff_t requested_length; 409 ssize_t done = 0; 410 411 if (pos) 412 return -ESPIPE; 413 pos = &filp->f_pos; 414 415 if (*pos < 0 || 416 check_add_overflow((loff_t)len, *pos, &requested_length)) 417 return -EINVAL; 418 419 if (requested_length > MAX_MIGRATION_SIZE) 420 return -ENOMEM; 421 422 mutex_lock(&migf->lock); 423 if (migf->state == MLX5_MIGF_STATE_ERROR) { 424 done = -ENODEV; 425 goto out_unlock; 426 } 427 428 if (vhca_buf->allocated_length < requested_length) { 429 done = mlx5vf_add_migration_pages( 430 vhca_buf, 431 DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, 432 PAGE_SIZE)); 433 if (done) 434 goto out_unlock; 435 } 436 437 while (len) { 438 size_t page_offset; 439 struct page *page; 440 size_t page_len; 441 u8 *to_buff; 442 int ret; 443 444 page_offset = (*pos) % PAGE_SIZE; 445 page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); 446 if (!page) { 447 if (done == 0) 448 done = -EINVAL; 449 goto out_unlock; 450 } 451 452 page_len = min_t(size_t, len, PAGE_SIZE - page_offset); 453 to_buff = kmap_local_page(page); 454 ret = copy_from_user(to_buff + page_offset, buf, page_len); 455 kunmap_local(to_buff); 456 if (ret) { 457 done = -EFAULT; 458 goto out_unlock; 459 } 460 *pos += page_len; 461 len -= page_len; 462 done += page_len; 463 buf += page_len; 464 vhca_buf->length += page_len; 465 } 466 out_unlock: 467 mutex_unlock(&migf->lock); 468 return done; 469 } 470 471 static const struct file_operations mlx5vf_resume_fops = { 472 .owner = THIS_MODULE, 473 .write = mlx5vf_resume_write, 474 .release = mlx5vf_release_file, 475 .llseek = no_llseek, 476 }; 477 478 static struct mlx5_vf_migration_file * 479 mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) 480 { 481 struct mlx5_vf_migration_file *migf; 482 struct mlx5_vhca_data_buffer *buf; 483 int ret; 484 485 migf = kzalloc(sizeof(*migf), GFP_KERNEL); 486 if (!migf) 487 return ERR_PTR(-ENOMEM); 488 489 migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf, 490 O_WRONLY); 491 if (IS_ERR(migf->filp)) { 492 ret = PTR_ERR(migf->filp); 493 goto end; 494 } 495 496 migf->mvdev = mvdev; 497 ret = mlx5vf_cmd_alloc_pd(migf); 498 if (ret) 499 goto out_free; 500 501 buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE); 502 if (IS_ERR(buf)) { 503 ret = PTR_ERR(buf); 504 goto out_pd; 505 } 506 507 migf->buf = buf; 508 stream_open(migf->filp->f_inode, migf->filp); 509 mutex_init(&migf->lock); 510 INIT_LIST_HEAD(&migf->buf_list); 511 INIT_LIST_HEAD(&migf->avail_list); 512 spin_lock_init(&migf->list_lock); 513 return migf; 514 out_pd: 515 mlx5vf_cmd_dealloc_pd(migf); 516 out_free: 517 fput(migf->filp); 518 end: 519 kfree(migf); 520 return ERR_PTR(ret); 521 } 522 523 void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) 524 { 525 if (mvdev->resuming_migf) { 526 mlx5vf_disable_fd(mvdev->resuming_migf); 527 mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf); 528 fput(mvdev->resuming_migf->filp); 529 mvdev->resuming_migf = NULL; 530 } 531 if (mvdev->saving_migf) { 532 mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx); 533 cancel_work_sync(&mvdev->saving_migf->async_data.work); 534 mlx5vf_disable_fd(mvdev->saving_migf); 535 mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf); 536 fput(mvdev->saving_migf->filp); 537 mvdev->saving_migf = NULL; 538 } 539 } 540 541 static struct file * 542 mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, 543 u32 new) 544 { 545 u32 cur = mvdev->mig_state; 546 int ret; 547 548 if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) { 549 ret = mlx5vf_cmd_suspend_vhca(mvdev, 550 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER); 551 if (ret) 552 return ERR_PTR(ret); 553 return NULL; 554 } 555 556 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) { 557 ret = mlx5vf_cmd_resume_vhca(mvdev, 558 MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER); 559 if (ret) 560 return ERR_PTR(ret); 561 return NULL; 562 } 563 564 if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) || 565 (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { 566 ret = mlx5vf_cmd_suspend_vhca(mvdev, 567 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR); 568 if (ret) 569 return ERR_PTR(ret); 570 return NULL; 571 } 572 573 if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) || 574 (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) { 575 ret = mlx5vf_cmd_resume_vhca(mvdev, 576 MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR); 577 if (ret) 578 return ERR_PTR(ret); 579 return NULL; 580 } 581 582 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { 583 struct mlx5_vf_migration_file *migf; 584 585 migf = mlx5vf_pci_save_device_data(mvdev, false); 586 if (IS_ERR(migf)) 587 return ERR_CAST(migf); 588 get_file(migf->filp); 589 mvdev->saving_migf = migf; 590 return migf->filp; 591 } 592 593 if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) || 594 (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || 595 (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && 596 new == VFIO_DEVICE_STATE_RUNNING_P2P)) { 597 mlx5vf_disable_fds(mvdev); 598 return NULL; 599 } 600 601 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) { 602 struct mlx5_vf_migration_file *migf; 603 604 migf = mlx5vf_pci_resume_device_data(mvdev); 605 if (IS_ERR(migf)) 606 return ERR_CAST(migf); 607 get_file(migf->filp); 608 mvdev->resuming_migf = migf; 609 return migf->filp; 610 } 611 612 if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { 613 ret = mlx5vf_cmd_load_vhca_state(mvdev, 614 mvdev->resuming_migf, 615 mvdev->resuming_migf->buf); 616 if (ret) 617 return ERR_PTR(ret); 618 mlx5vf_disable_fds(mvdev); 619 return NULL; 620 } 621 622 if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) || 623 (cur == VFIO_DEVICE_STATE_RUNNING_P2P && 624 new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { 625 struct mlx5_vf_migration_file *migf; 626 627 migf = mlx5vf_pci_save_device_data(mvdev, true); 628 if (IS_ERR(migf)) 629 return ERR_CAST(migf); 630 get_file(migf->filp); 631 mvdev->saving_migf = migf; 632 return migf->filp; 633 } 634 635 if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) { 636 ret = mlx5vf_cmd_suspend_vhca(mvdev, 637 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER); 638 if (ret) 639 return ERR_PTR(ret); 640 ret = mlx5vf_pci_save_device_inc_data(mvdev); 641 return ret ? ERR_PTR(ret) : NULL; 642 } 643 644 /* 645 * vfio_mig_get_next_state() does not use arcs other than the above 646 */ 647 WARN_ON(true); 648 return ERR_PTR(-EINVAL); 649 } 650 651 /* 652 * This function is called in all state_mutex unlock cases to 653 * handle a 'deferred_reset' if exists. 654 */ 655 void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev) 656 { 657 again: 658 spin_lock(&mvdev->reset_lock); 659 if (mvdev->deferred_reset) { 660 mvdev->deferred_reset = false; 661 spin_unlock(&mvdev->reset_lock); 662 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 663 mlx5vf_disable_fds(mvdev); 664 goto again; 665 } 666 mutex_unlock(&mvdev->state_mutex); 667 spin_unlock(&mvdev->reset_lock); 668 } 669 670 static struct file * 671 mlx5vf_pci_set_device_state(struct vfio_device *vdev, 672 enum vfio_device_mig_state new_state) 673 { 674 struct mlx5vf_pci_core_device *mvdev = container_of( 675 vdev, struct mlx5vf_pci_core_device, core_device.vdev); 676 enum vfio_device_mig_state next_state; 677 struct file *res = NULL; 678 int ret; 679 680 mutex_lock(&mvdev->state_mutex); 681 while (new_state != mvdev->mig_state) { 682 ret = vfio_mig_get_next_state(vdev, mvdev->mig_state, 683 new_state, &next_state); 684 if (ret) { 685 res = ERR_PTR(ret); 686 break; 687 } 688 res = mlx5vf_pci_step_device_state_locked(mvdev, next_state); 689 if (IS_ERR(res)) 690 break; 691 mvdev->mig_state = next_state; 692 if (WARN_ON(res && new_state != mvdev->mig_state)) { 693 fput(res); 694 res = ERR_PTR(-EINVAL); 695 break; 696 } 697 } 698 mlx5vf_state_mutex_unlock(mvdev); 699 return res; 700 } 701 702 static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, 703 unsigned long *stop_copy_length) 704 { 705 struct mlx5vf_pci_core_device *mvdev = container_of( 706 vdev, struct mlx5vf_pci_core_device, core_device.vdev); 707 size_t state_size; 708 int ret; 709 710 mutex_lock(&mvdev->state_mutex); 711 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, 712 &state_size, 0); 713 if (!ret) 714 *stop_copy_length = state_size; 715 mlx5vf_state_mutex_unlock(mvdev); 716 return ret; 717 } 718 719 static int mlx5vf_pci_get_device_state(struct vfio_device *vdev, 720 enum vfio_device_mig_state *curr_state) 721 { 722 struct mlx5vf_pci_core_device *mvdev = container_of( 723 vdev, struct mlx5vf_pci_core_device, core_device.vdev); 724 725 mutex_lock(&mvdev->state_mutex); 726 *curr_state = mvdev->mig_state; 727 mlx5vf_state_mutex_unlock(mvdev); 728 return 0; 729 } 730 731 static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev) 732 { 733 struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev); 734 735 if (!mvdev->migrate_cap) 736 return; 737 738 /* 739 * As the higher VFIO layers are holding locks across reset and using 740 * those same locks with the mm_lock we need to prevent ABBA deadlock 741 * with the state_mutex and mm_lock. 742 * In case the state_mutex was taken already we defer the cleanup work 743 * to the unlock flow of the other running context. 744 */ 745 spin_lock(&mvdev->reset_lock); 746 mvdev->deferred_reset = true; 747 if (!mutex_trylock(&mvdev->state_mutex)) { 748 spin_unlock(&mvdev->reset_lock); 749 return; 750 } 751 spin_unlock(&mvdev->reset_lock); 752 mlx5vf_state_mutex_unlock(mvdev); 753 } 754 755 static int mlx5vf_pci_open_device(struct vfio_device *core_vdev) 756 { 757 struct mlx5vf_pci_core_device *mvdev = container_of( 758 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev); 759 struct vfio_pci_core_device *vdev = &mvdev->core_device; 760 int ret; 761 762 ret = vfio_pci_core_enable(vdev); 763 if (ret) 764 return ret; 765 766 if (mvdev->migrate_cap) 767 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 768 vfio_pci_core_finish_enable(vdev); 769 return 0; 770 } 771 772 static void mlx5vf_pci_close_device(struct vfio_device *core_vdev) 773 { 774 struct mlx5vf_pci_core_device *mvdev = container_of( 775 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev); 776 777 mlx5vf_cmd_close_migratable(mvdev); 778 vfio_pci_core_close_device(core_vdev); 779 } 780 781 static const struct vfio_migration_ops mlx5vf_pci_mig_ops = { 782 .migration_set_state = mlx5vf_pci_set_device_state, 783 .migration_get_state = mlx5vf_pci_get_device_state, 784 .migration_get_data_size = mlx5vf_pci_get_data_size, 785 }; 786 787 static const struct vfio_log_ops mlx5vf_pci_log_ops = { 788 .log_start = mlx5vf_start_page_tracker, 789 .log_stop = mlx5vf_stop_page_tracker, 790 .log_read_and_clear = mlx5vf_tracker_read_and_clear, 791 }; 792 793 static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev) 794 { 795 struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev, 796 struct mlx5vf_pci_core_device, core_device.vdev); 797 int ret; 798 799 ret = vfio_pci_core_init_dev(core_vdev); 800 if (ret) 801 return ret; 802 803 mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops, 804 &mlx5vf_pci_log_ops); 805 806 return 0; 807 } 808 809 static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev) 810 { 811 struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev, 812 struct mlx5vf_pci_core_device, core_device.vdev); 813 814 mlx5vf_cmd_remove_migratable(mvdev); 815 vfio_pci_core_release_dev(core_vdev); 816 } 817 818 static const struct vfio_device_ops mlx5vf_pci_ops = { 819 .name = "mlx5-vfio-pci", 820 .init = mlx5vf_pci_init_dev, 821 .release = mlx5vf_pci_release_dev, 822 .open_device = mlx5vf_pci_open_device, 823 .close_device = mlx5vf_pci_close_device, 824 .ioctl = vfio_pci_core_ioctl, 825 .device_feature = vfio_pci_core_ioctl_feature, 826 .read = vfio_pci_core_read, 827 .write = vfio_pci_core_write, 828 .mmap = vfio_pci_core_mmap, 829 .request = vfio_pci_core_request, 830 .match = vfio_pci_core_match, 831 }; 832 833 static int mlx5vf_pci_probe(struct pci_dev *pdev, 834 const struct pci_device_id *id) 835 { 836 struct mlx5vf_pci_core_device *mvdev; 837 int ret; 838 839 mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev, 840 &pdev->dev, &mlx5vf_pci_ops); 841 if (IS_ERR(mvdev)) 842 return PTR_ERR(mvdev); 843 844 dev_set_drvdata(&pdev->dev, &mvdev->core_device); 845 ret = vfio_pci_core_register_device(&mvdev->core_device); 846 if (ret) 847 goto out_put_vdev; 848 return 0; 849 850 out_put_vdev: 851 vfio_put_device(&mvdev->core_device.vdev); 852 return ret; 853 } 854 855 static void mlx5vf_pci_remove(struct pci_dev *pdev) 856 { 857 struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev); 858 859 vfio_pci_core_unregister_device(&mvdev->core_device); 860 vfio_put_device(&mvdev->core_device.vdev); 861 } 862 863 static const struct pci_device_id mlx5vf_pci_table[] = { 864 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */ 865 {} 866 }; 867 868 MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table); 869 870 static const struct pci_error_handlers mlx5vf_err_handlers = { 871 .reset_done = mlx5vf_pci_aer_reset_done, 872 .error_detected = vfio_pci_core_aer_err_detected, 873 }; 874 875 static struct pci_driver mlx5vf_pci_driver = { 876 .name = KBUILD_MODNAME, 877 .id_table = mlx5vf_pci_table, 878 .probe = mlx5vf_pci_probe, 879 .remove = mlx5vf_pci_remove, 880 .err_handler = &mlx5vf_err_handlers, 881 .driver_managed_dma = true, 882 }; 883 884 module_pci_driver(mlx5vf_pci_driver); 885 886 MODULE_LICENSE("GPL"); 887 MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>"); 888 MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>"); 889 MODULE_DESCRIPTION( 890 "MLX5 VFIO PCI - User Level meta-driver for MLX5 device family"); 891