1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved 4 */ 5 6 #include <linux/device.h> 7 #include <linux/eventfd.h> 8 #include <linux/file.h> 9 #include <linux/interrupt.h> 10 #include <linux/iommu.h> 11 #include <linux/module.h> 12 #include <linux/mutex.h> 13 #include <linux/notifier.h> 14 #include <linux/pci.h> 15 #include <linux/pm_runtime.h> 16 #include <linux/types.h> 17 #include <linux/uaccess.h> 18 #include <linux/vfio.h> 19 #include <linux/sched/mm.h> 20 #include <linux/anon_inodes.h> 21 22 #include "cmd.h" 23 24 /* Arbitrary to prevent userspace from consuming endless memory */ 25 #define MAX_MIGRATION_SIZE (512*1024*1024) 26 27 static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) 28 { 29 struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); 30 31 return container_of(core_device, struct mlx5vf_pci_core_device, 32 core_device); 33 } 34 35 struct page * 36 mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, 37 unsigned long offset) 38 { 39 unsigned long cur_offset = 0; 40 struct scatterlist *sg; 41 unsigned int i; 42 43 /* All accesses are sequential */ 44 if (offset < buf->last_offset || !buf->last_offset_sg) { 45 buf->last_offset = 0; 46 buf->last_offset_sg = buf->table.sgt.sgl; 47 buf->sg_last_entry = 0; 48 } 49 50 cur_offset = buf->last_offset; 51 52 for_each_sg(buf->last_offset_sg, sg, 53 buf->table.sgt.orig_nents - buf->sg_last_entry, i) { 54 if (offset < sg->length + cur_offset) { 55 buf->last_offset_sg = sg; 56 buf->sg_last_entry += i; 57 buf->last_offset = cur_offset; 58 return nth_page(sg_page(sg), 59 (offset - cur_offset) / PAGE_SIZE); 60 } 61 cur_offset += sg->length; 62 } 63 return NULL; 64 } 65 66 int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, 67 unsigned int npages) 68 { 69 unsigned int to_alloc = npages; 70 struct page **page_list; 71 unsigned long filled; 72 unsigned int to_fill; 73 int ret; 74 75 to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list)); 76 page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL); 77 if (!page_list) 78 return -ENOMEM; 79 80 do { 81 filled = alloc_pages_bulk_array(GFP_KERNEL, to_fill, page_list); 82 if (!filled) { 83 ret = -ENOMEM; 84 goto err; 85 } 86 to_alloc -= filled; 87 ret = sg_alloc_append_table_from_pages( 88 &buf->table, page_list, filled, 0, 89 filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, 90 GFP_KERNEL); 91 92 if (ret) 93 goto err; 94 buf->allocated_length += filled * PAGE_SIZE; 95 /* clean input for another bulk allocation */ 96 memset(page_list, 0, filled * sizeof(*page_list)); 97 to_fill = min_t(unsigned int, to_alloc, 98 PAGE_SIZE / sizeof(*page_list)); 99 } while (to_alloc > 0); 100 101 kvfree(page_list); 102 return 0; 103 104 err: 105 kvfree(page_list); 106 return ret; 107 } 108 109 static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) 110 { 111 mutex_lock(&migf->lock); 112 migf->state = MLX5_MIGF_STATE_ERROR; 113 migf->filp->f_pos = 0; 114 mutex_unlock(&migf->lock); 115 } 116 117 static int mlx5vf_release_file(struct inode *inode, struct file *filp) 118 { 119 struct mlx5_vf_migration_file *migf = filp->private_data; 120 121 mlx5vf_disable_fd(migf); 122 mutex_destroy(&migf->lock); 123 kfree(migf); 124 return 0; 125 } 126 127 static struct mlx5_vhca_data_buffer * 128 mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos, 129 bool *end_of_data) 130 { 131 struct mlx5_vhca_data_buffer *buf; 132 bool found = false; 133 134 *end_of_data = false; 135 spin_lock_irq(&migf->list_lock); 136 if (list_empty(&migf->buf_list)) { 137 *end_of_data = true; 138 goto end; 139 } 140 141 buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer, 142 buf_elm); 143 if (pos >= buf->start_pos && 144 pos < buf->start_pos + buf->length) { 145 found = true; 146 goto end; 147 } 148 149 /* 150 * As we use a stream based FD we may expect having the data always 151 * on first chunk 152 */ 153 migf->state = MLX5_MIGF_STATE_ERROR; 154 155 end: 156 spin_unlock_irq(&migf->list_lock); 157 return found ? buf : NULL; 158 } 159 160 static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, 161 char __user **buf, size_t *len, loff_t *pos) 162 { 163 unsigned long offset; 164 ssize_t done = 0; 165 size_t copy_len; 166 167 copy_len = min_t(size_t, 168 vhca_buf->start_pos + vhca_buf->length - *pos, *len); 169 while (copy_len) { 170 size_t page_offset; 171 struct page *page; 172 size_t page_len; 173 u8 *from_buff; 174 int ret; 175 176 offset = *pos - vhca_buf->start_pos; 177 page_offset = offset % PAGE_SIZE; 178 offset -= page_offset; 179 page = mlx5vf_get_migration_page(vhca_buf, offset); 180 if (!page) 181 return -EINVAL; 182 page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset); 183 from_buff = kmap_local_page(page); 184 ret = copy_to_user(*buf, from_buff + page_offset, page_len); 185 kunmap_local(from_buff); 186 if (ret) 187 return -EFAULT; 188 *pos += page_len; 189 *len -= page_len; 190 *buf += page_len; 191 done += page_len; 192 copy_len -= page_len; 193 } 194 195 if (*pos >= vhca_buf->start_pos + vhca_buf->length) { 196 spin_lock_irq(&vhca_buf->migf->list_lock); 197 list_del_init(&vhca_buf->buf_elm); 198 list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); 199 spin_unlock_irq(&vhca_buf->migf->list_lock); 200 } 201 202 return done; 203 } 204 205 static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, 206 loff_t *pos) 207 { 208 struct mlx5_vf_migration_file *migf = filp->private_data; 209 struct mlx5_vhca_data_buffer *vhca_buf; 210 bool first_loop_call = true; 211 bool end_of_data; 212 ssize_t done = 0; 213 214 if (pos) 215 return -ESPIPE; 216 pos = &filp->f_pos; 217 218 if (!(filp->f_flags & O_NONBLOCK)) { 219 if (wait_event_interruptible(migf->poll_wait, 220 !list_empty(&migf->buf_list) || 221 migf->state == MLX5_MIGF_STATE_ERROR || 222 migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR || 223 migf->state == MLX5_MIGF_STATE_PRE_COPY || 224 migf->state == MLX5_MIGF_STATE_COMPLETE)) 225 return -ERESTARTSYS; 226 } 227 228 mutex_lock(&migf->lock); 229 if (migf->state == MLX5_MIGF_STATE_ERROR) { 230 done = -ENODEV; 231 goto out_unlock; 232 } 233 234 while (len) { 235 ssize_t count; 236 237 vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos, 238 &end_of_data); 239 if (first_loop_call) { 240 first_loop_call = false; 241 /* Temporary end of file as part of PRE_COPY */ 242 if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY || 243 migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) { 244 done = -ENOMSG; 245 goto out_unlock; 246 } 247 248 if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) { 249 if (filp->f_flags & O_NONBLOCK) { 250 done = -EAGAIN; 251 goto out_unlock; 252 } 253 } 254 } 255 256 if (end_of_data) 257 goto out_unlock; 258 259 if (!vhca_buf) { 260 done = -EINVAL; 261 goto out_unlock; 262 } 263 264 count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos); 265 if (count < 0) { 266 done = count; 267 goto out_unlock; 268 } 269 done += count; 270 } 271 272 out_unlock: 273 mutex_unlock(&migf->lock); 274 return done; 275 } 276 277 static __poll_t mlx5vf_save_poll(struct file *filp, 278 struct poll_table_struct *wait) 279 { 280 struct mlx5_vf_migration_file *migf = filp->private_data; 281 __poll_t pollflags = 0; 282 283 poll_wait(filp, &migf->poll_wait, wait); 284 285 mutex_lock(&migf->lock); 286 if (migf->state == MLX5_MIGF_STATE_ERROR) 287 pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; 288 else if (!list_empty(&migf->buf_list) || 289 migf->state == MLX5_MIGF_STATE_COMPLETE) 290 pollflags = EPOLLIN | EPOLLRDNORM; 291 mutex_unlock(&migf->lock); 292 293 return pollflags; 294 } 295 296 /* 297 * FD is exposed and user can use it after receiving an error. 298 * Mark migf in error, and wake the user. 299 */ 300 static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) 301 { 302 migf->state = MLX5_MIGF_STATE_ERROR; 303 wake_up_interruptible(&migf->poll_wait); 304 } 305 306 static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, 307 unsigned long arg) 308 { 309 struct mlx5_vf_migration_file *migf = filp->private_data; 310 struct mlx5vf_pci_core_device *mvdev = migf->mvdev; 311 struct mlx5_vhca_data_buffer *buf; 312 struct vfio_precopy_info info = {}; 313 loff_t *pos = &filp->f_pos; 314 unsigned long minsz; 315 size_t inc_length = 0; 316 bool end_of_data; 317 int ret; 318 319 if (cmd != VFIO_MIG_GET_PRECOPY_INFO) 320 return -ENOTTY; 321 322 minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); 323 324 if (copy_from_user(&info, (void __user *)arg, minsz)) 325 return -EFAULT; 326 327 if (info.argsz < minsz) 328 return -EINVAL; 329 330 mutex_lock(&mvdev->state_mutex); 331 if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY && 332 mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) { 333 ret = -EINVAL; 334 goto err_state_unlock; 335 } 336 337 /* 338 * We can't issue a SAVE command when the device is suspended, so as 339 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra 340 * bytes that can't be read. 341 */ 342 if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) { 343 /* 344 * Once the query returns it's guaranteed that there is no 345 * active SAVE command. 346 * As so, the other code below is safe with the proper locks. 347 */ 348 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length, 349 MLX5VF_QUERY_INC); 350 if (ret) 351 goto err_state_unlock; 352 } 353 354 mutex_lock(&migf->lock); 355 if (migf->state == MLX5_MIGF_STATE_ERROR) { 356 ret = -ENODEV; 357 goto err_migf_unlock; 358 } 359 360 buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data); 361 if (buf) { 362 if (buf->start_pos == 0) { 363 info.initial_bytes = buf->header_image_size - *pos; 364 } else if (buf->start_pos == 365 sizeof(struct mlx5_vf_migration_header)) { 366 /* First data buffer following the header */ 367 info.initial_bytes = buf->start_pos + 368 buf->length - *pos; 369 } else { 370 info.dirty_bytes = buf->start_pos + buf->length - *pos; 371 } 372 } else { 373 if (!end_of_data) { 374 ret = -EINVAL; 375 goto err_migf_unlock; 376 } 377 378 info.dirty_bytes = inc_length; 379 } 380 381 if (!end_of_data || !inc_length) { 382 mutex_unlock(&migf->lock); 383 goto done; 384 } 385 386 mutex_unlock(&migf->lock); 387 /* 388 * We finished transferring the current state and the device has a 389 * dirty state, save a new state to be ready for. 390 */ 391 buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE); 392 if (IS_ERR(buf)) { 393 ret = PTR_ERR(buf); 394 mlx5vf_mark_err(migf); 395 goto err_state_unlock; 396 } 397 398 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true); 399 if (ret) { 400 mlx5vf_mark_err(migf); 401 mlx5vf_put_data_buffer(buf); 402 goto err_state_unlock; 403 } 404 405 done: 406 mlx5vf_state_mutex_unlock(mvdev); 407 return copy_to_user((void __user *)arg, &info, minsz); 408 err_migf_unlock: 409 mutex_unlock(&migf->lock); 410 err_state_unlock: 411 mlx5vf_state_mutex_unlock(mvdev); 412 return ret; 413 } 414 415 static const struct file_operations mlx5vf_save_fops = { 416 .owner = THIS_MODULE, 417 .read = mlx5vf_save_read, 418 .poll = mlx5vf_save_poll, 419 .unlocked_ioctl = mlx5vf_precopy_ioctl, 420 .compat_ioctl = compat_ptr_ioctl, 421 .release = mlx5vf_release_file, 422 .llseek = no_llseek, 423 }; 424 425 static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) 426 { 427 struct mlx5_vf_migration_file *migf = mvdev->saving_migf; 428 struct mlx5_vhca_data_buffer *buf; 429 size_t length; 430 int ret; 431 432 if (migf->state == MLX5_MIGF_STATE_ERROR) 433 return -ENODEV; 434 435 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 436 MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL); 437 if (ret) 438 goto err; 439 440 buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE); 441 if (IS_ERR(buf)) { 442 ret = PTR_ERR(buf); 443 goto err; 444 } 445 446 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false); 447 if (ret) 448 goto err_save; 449 450 return 0; 451 452 err_save: 453 mlx5vf_put_data_buffer(buf); 454 err: 455 mlx5vf_mark_err(migf); 456 return ret; 457 } 458 459 static struct mlx5_vf_migration_file * 460 mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) 461 { 462 struct mlx5_vf_migration_file *migf; 463 struct mlx5_vhca_data_buffer *buf; 464 size_t length; 465 int ret; 466 467 migf = kzalloc(sizeof(*migf), GFP_KERNEL); 468 if (!migf) 469 return ERR_PTR(-ENOMEM); 470 471 migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf, 472 O_RDONLY); 473 if (IS_ERR(migf->filp)) { 474 ret = PTR_ERR(migf->filp); 475 goto end; 476 } 477 478 migf->mvdev = mvdev; 479 ret = mlx5vf_cmd_alloc_pd(migf); 480 if (ret) 481 goto out_free; 482 483 stream_open(migf->filp->f_inode, migf->filp); 484 mutex_init(&migf->lock); 485 init_waitqueue_head(&migf->poll_wait); 486 init_completion(&migf->save_comp); 487 /* 488 * save_comp is being used as a binary semaphore built from 489 * a completion. A normal mutex cannot be used because the lock is 490 * passed between kernel threads and lockdep can't model this. 491 */ 492 complete(&migf->save_comp); 493 mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); 494 INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); 495 INIT_LIST_HEAD(&migf->buf_list); 496 INIT_LIST_HEAD(&migf->avail_list); 497 spin_lock_init(&migf->list_lock); 498 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0); 499 if (ret) 500 goto out_pd; 501 502 buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE); 503 if (IS_ERR(buf)) { 504 ret = PTR_ERR(buf); 505 goto out_pd; 506 } 507 508 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track); 509 if (ret) 510 goto out_save; 511 return migf; 512 out_save: 513 mlx5vf_free_data_buffer(buf); 514 out_pd: 515 mlx5vf_cmd_dealloc_pd(migf); 516 out_free: 517 fput(migf->filp); 518 end: 519 kfree(migf); 520 return ERR_PTR(ret); 521 } 522 523 static int 524 mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf, 525 const char __user **buf, size_t *len, 526 loff_t *pos, ssize_t *done) 527 { 528 unsigned long offset; 529 size_t page_offset; 530 struct page *page; 531 size_t page_len; 532 u8 *to_buff; 533 int ret; 534 535 offset = *pos - vhca_buf->start_pos; 536 page_offset = offset % PAGE_SIZE; 537 538 page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset); 539 if (!page) 540 return -EINVAL; 541 page_len = min_t(size_t, *len, PAGE_SIZE - page_offset); 542 to_buff = kmap_local_page(page); 543 ret = copy_from_user(to_buff + page_offset, *buf, page_len); 544 kunmap_local(to_buff); 545 if (ret) 546 return -EFAULT; 547 548 *pos += page_len; 549 *done += page_len; 550 *buf += page_len; 551 *len -= page_len; 552 vhca_buf->length += page_len; 553 return 0; 554 } 555 556 static int 557 mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf, 558 loff_t requested_length, 559 const char __user **buf, size_t *len, 560 loff_t *pos, ssize_t *done) 561 { 562 int ret; 563 564 if (requested_length > MAX_MIGRATION_SIZE) 565 return -ENOMEM; 566 567 if (vhca_buf->allocated_length < requested_length) { 568 ret = mlx5vf_add_migration_pages( 569 vhca_buf, 570 DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, 571 PAGE_SIZE)); 572 if (ret) 573 return ret; 574 } 575 576 while (*len) { 577 ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos, 578 done); 579 if (ret) 580 return ret; 581 } 582 583 return 0; 584 } 585 586 static ssize_t 587 mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf, 588 struct mlx5_vhca_data_buffer *vhca_buf, 589 size_t image_size, const char __user **buf, 590 size_t *len, loff_t *pos, ssize_t *done, 591 bool *has_work) 592 { 593 size_t copy_len, to_copy; 594 int ret; 595 596 to_copy = min_t(size_t, *len, image_size - vhca_buf->length); 597 copy_len = to_copy; 598 while (to_copy) { 599 ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos, 600 done); 601 if (ret) 602 return ret; 603 } 604 605 *len -= copy_len; 606 if (vhca_buf->length == image_size) { 607 migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE; 608 migf->max_pos += image_size; 609 *has_work = true; 610 } 611 612 return 0; 613 } 614 615 static int 616 mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf, 617 struct mlx5_vhca_data_buffer *vhca_buf, 618 const char __user **buf, 619 size_t *len, loff_t *pos, 620 ssize_t *done, bool *has_work) 621 { 622 struct page *page; 623 size_t copy_len; 624 u8 *to_buff; 625 int ret; 626 627 copy_len = min_t(size_t, *len, 628 sizeof(struct mlx5_vf_migration_header) - vhca_buf->length); 629 page = mlx5vf_get_migration_page(vhca_buf, 0); 630 if (!page) 631 return -EINVAL; 632 to_buff = kmap_local_page(page); 633 ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len); 634 if (ret) { 635 ret = -EFAULT; 636 goto end; 637 } 638 639 *buf += copy_len; 640 *pos += copy_len; 641 *done += copy_len; 642 *len -= copy_len; 643 vhca_buf->length += copy_len; 644 if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) { 645 u64 flags; 646 647 vhca_buf->header_image_size = le64_to_cpup((__le64 *)to_buff); 648 if (vhca_buf->header_image_size > MAX_MIGRATION_SIZE) { 649 ret = -ENOMEM; 650 goto end; 651 } 652 653 flags = le64_to_cpup((__le64 *)(to_buff + 654 offsetof(struct mlx5_vf_migration_header, flags))); 655 if (flags) { 656 ret = -EOPNOTSUPP; 657 goto end; 658 } 659 660 migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE; 661 migf->max_pos += vhca_buf->length; 662 *has_work = true; 663 } 664 end: 665 kunmap_local(to_buff); 666 return ret; 667 } 668 669 static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, 670 size_t len, loff_t *pos) 671 { 672 struct mlx5_vf_migration_file *migf = filp->private_data; 673 struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; 674 struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header; 675 loff_t requested_length; 676 bool has_work = false; 677 ssize_t done = 0; 678 int ret = 0; 679 680 if (pos) 681 return -ESPIPE; 682 pos = &filp->f_pos; 683 684 if (*pos < 0 || 685 check_add_overflow((loff_t)len, *pos, &requested_length)) 686 return -EINVAL; 687 688 mutex_lock(&migf->mvdev->state_mutex); 689 mutex_lock(&migf->lock); 690 if (migf->state == MLX5_MIGF_STATE_ERROR) { 691 ret = -ENODEV; 692 goto out_unlock; 693 } 694 695 while (len || has_work) { 696 has_work = false; 697 switch (migf->load_state) { 698 case MLX5_VF_LOAD_STATE_READ_HEADER: 699 ret = mlx5vf_resume_read_header(migf, vhca_buf_header, 700 &buf, &len, pos, 701 &done, &has_work); 702 if (ret) 703 goto out_unlock; 704 break; 705 case MLX5_VF_LOAD_STATE_PREP_IMAGE: 706 { 707 u64 size = vhca_buf_header->header_image_size; 708 709 if (vhca_buf->allocated_length < size) { 710 mlx5vf_free_data_buffer(vhca_buf); 711 712 migf->buf = mlx5vf_alloc_data_buffer(migf, 713 size, DMA_TO_DEVICE); 714 if (IS_ERR(migf->buf)) { 715 ret = PTR_ERR(migf->buf); 716 migf->buf = NULL; 717 goto out_unlock; 718 } 719 720 vhca_buf = migf->buf; 721 } 722 723 vhca_buf->start_pos = migf->max_pos; 724 migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE; 725 break; 726 } 727 case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER: 728 ret = mlx5vf_resume_read_image_no_header(vhca_buf, 729 requested_length, 730 &buf, &len, pos, &done); 731 if (ret) 732 goto out_unlock; 733 break; 734 case MLX5_VF_LOAD_STATE_READ_IMAGE: 735 ret = mlx5vf_resume_read_image(migf, vhca_buf, 736 vhca_buf_header->header_image_size, 737 &buf, &len, pos, &done, &has_work); 738 if (ret) 739 goto out_unlock; 740 break; 741 case MLX5_VF_LOAD_STATE_LOAD_IMAGE: 742 ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf); 743 if (ret) 744 goto out_unlock; 745 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; 746 747 /* prep header buf for next image */ 748 vhca_buf_header->length = 0; 749 vhca_buf_header->header_image_size = 0; 750 /* prep data buf for next image */ 751 vhca_buf->length = 0; 752 753 break; 754 default: 755 break; 756 } 757 } 758 759 out_unlock: 760 if (ret) 761 migf->state = MLX5_MIGF_STATE_ERROR; 762 mutex_unlock(&migf->lock); 763 mlx5vf_state_mutex_unlock(migf->mvdev); 764 return ret ? ret : done; 765 } 766 767 static const struct file_operations mlx5vf_resume_fops = { 768 .owner = THIS_MODULE, 769 .write = mlx5vf_resume_write, 770 .release = mlx5vf_release_file, 771 .llseek = no_llseek, 772 }; 773 774 static struct mlx5_vf_migration_file * 775 mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) 776 { 777 struct mlx5_vf_migration_file *migf; 778 struct mlx5_vhca_data_buffer *buf; 779 int ret; 780 781 migf = kzalloc(sizeof(*migf), GFP_KERNEL); 782 if (!migf) 783 return ERR_PTR(-ENOMEM); 784 785 migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf, 786 O_WRONLY); 787 if (IS_ERR(migf->filp)) { 788 ret = PTR_ERR(migf->filp); 789 goto end; 790 } 791 792 migf->mvdev = mvdev; 793 ret = mlx5vf_cmd_alloc_pd(migf); 794 if (ret) 795 goto out_free; 796 797 buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE); 798 if (IS_ERR(buf)) { 799 ret = PTR_ERR(buf); 800 goto out_pd; 801 } 802 803 migf->buf = buf; 804 if (MLX5VF_PRE_COPY_SUPP(mvdev)) { 805 buf = mlx5vf_alloc_data_buffer(migf, 806 sizeof(struct mlx5_vf_migration_header), DMA_NONE); 807 if (IS_ERR(buf)) { 808 ret = PTR_ERR(buf); 809 goto out_buf; 810 } 811 812 migf->buf_header = buf; 813 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; 814 } else { 815 /* Initial state will be to read the image */ 816 migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER; 817 } 818 819 stream_open(migf->filp->f_inode, migf->filp); 820 mutex_init(&migf->lock); 821 INIT_LIST_HEAD(&migf->buf_list); 822 INIT_LIST_HEAD(&migf->avail_list); 823 spin_lock_init(&migf->list_lock); 824 return migf; 825 out_buf: 826 mlx5vf_free_data_buffer(buf); 827 out_pd: 828 mlx5vf_cmd_dealloc_pd(migf); 829 out_free: 830 fput(migf->filp); 831 end: 832 kfree(migf); 833 return ERR_PTR(ret); 834 } 835 836 void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) 837 { 838 if (mvdev->resuming_migf) { 839 mlx5vf_disable_fd(mvdev->resuming_migf); 840 mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf); 841 fput(mvdev->resuming_migf->filp); 842 mvdev->resuming_migf = NULL; 843 } 844 if (mvdev->saving_migf) { 845 mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx); 846 cancel_work_sync(&mvdev->saving_migf->async_data.work); 847 mlx5vf_disable_fd(mvdev->saving_migf); 848 mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf); 849 fput(mvdev->saving_migf->filp); 850 mvdev->saving_migf = NULL; 851 } 852 } 853 854 static struct file * 855 mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, 856 u32 new) 857 { 858 u32 cur = mvdev->mig_state; 859 int ret; 860 861 if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) { 862 ret = mlx5vf_cmd_suspend_vhca(mvdev, 863 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER); 864 if (ret) 865 return ERR_PTR(ret); 866 return NULL; 867 } 868 869 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) { 870 ret = mlx5vf_cmd_resume_vhca(mvdev, 871 MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER); 872 if (ret) 873 return ERR_PTR(ret); 874 return NULL; 875 } 876 877 if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) || 878 (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { 879 ret = mlx5vf_cmd_suspend_vhca(mvdev, 880 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR); 881 if (ret) 882 return ERR_PTR(ret); 883 return NULL; 884 } 885 886 if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) || 887 (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) { 888 ret = mlx5vf_cmd_resume_vhca(mvdev, 889 MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR); 890 if (ret) 891 return ERR_PTR(ret); 892 return NULL; 893 } 894 895 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { 896 struct mlx5_vf_migration_file *migf; 897 898 migf = mlx5vf_pci_save_device_data(mvdev, false); 899 if (IS_ERR(migf)) 900 return ERR_CAST(migf); 901 get_file(migf->filp); 902 mvdev->saving_migf = migf; 903 return migf->filp; 904 } 905 906 if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) || 907 (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || 908 (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && 909 new == VFIO_DEVICE_STATE_RUNNING_P2P)) { 910 mlx5vf_disable_fds(mvdev); 911 return NULL; 912 } 913 914 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) { 915 struct mlx5_vf_migration_file *migf; 916 917 migf = mlx5vf_pci_resume_device_data(mvdev); 918 if (IS_ERR(migf)) 919 return ERR_CAST(migf); 920 get_file(migf->filp); 921 mvdev->resuming_migf = migf; 922 return migf->filp; 923 } 924 925 if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { 926 if (!MLX5VF_PRE_COPY_SUPP(mvdev)) { 927 ret = mlx5vf_cmd_load_vhca_state(mvdev, 928 mvdev->resuming_migf, 929 mvdev->resuming_migf->buf); 930 if (ret) 931 return ERR_PTR(ret); 932 } 933 mlx5vf_disable_fds(mvdev); 934 return NULL; 935 } 936 937 if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) || 938 (cur == VFIO_DEVICE_STATE_RUNNING_P2P && 939 new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { 940 struct mlx5_vf_migration_file *migf; 941 942 migf = mlx5vf_pci_save_device_data(mvdev, true); 943 if (IS_ERR(migf)) 944 return ERR_CAST(migf); 945 get_file(migf->filp); 946 mvdev->saving_migf = migf; 947 return migf->filp; 948 } 949 950 if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) { 951 ret = mlx5vf_cmd_suspend_vhca(mvdev, 952 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER); 953 if (ret) 954 return ERR_PTR(ret); 955 ret = mlx5vf_pci_save_device_inc_data(mvdev); 956 return ret ? ERR_PTR(ret) : NULL; 957 } 958 959 /* 960 * vfio_mig_get_next_state() does not use arcs other than the above 961 */ 962 WARN_ON(true); 963 return ERR_PTR(-EINVAL); 964 } 965 966 /* 967 * This function is called in all state_mutex unlock cases to 968 * handle a 'deferred_reset' if exists. 969 */ 970 void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev) 971 { 972 again: 973 spin_lock(&mvdev->reset_lock); 974 if (mvdev->deferred_reset) { 975 mvdev->deferred_reset = false; 976 spin_unlock(&mvdev->reset_lock); 977 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 978 mlx5vf_disable_fds(mvdev); 979 goto again; 980 } 981 mutex_unlock(&mvdev->state_mutex); 982 spin_unlock(&mvdev->reset_lock); 983 } 984 985 static struct file * 986 mlx5vf_pci_set_device_state(struct vfio_device *vdev, 987 enum vfio_device_mig_state new_state) 988 { 989 struct mlx5vf_pci_core_device *mvdev = container_of( 990 vdev, struct mlx5vf_pci_core_device, core_device.vdev); 991 enum vfio_device_mig_state next_state; 992 struct file *res = NULL; 993 int ret; 994 995 mutex_lock(&mvdev->state_mutex); 996 while (new_state != mvdev->mig_state) { 997 ret = vfio_mig_get_next_state(vdev, mvdev->mig_state, 998 new_state, &next_state); 999 if (ret) { 1000 res = ERR_PTR(ret); 1001 break; 1002 } 1003 res = mlx5vf_pci_step_device_state_locked(mvdev, next_state); 1004 if (IS_ERR(res)) 1005 break; 1006 mvdev->mig_state = next_state; 1007 if (WARN_ON(res && new_state != mvdev->mig_state)) { 1008 fput(res); 1009 res = ERR_PTR(-EINVAL); 1010 break; 1011 } 1012 } 1013 mlx5vf_state_mutex_unlock(mvdev); 1014 return res; 1015 } 1016 1017 static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, 1018 unsigned long *stop_copy_length) 1019 { 1020 struct mlx5vf_pci_core_device *mvdev = container_of( 1021 vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1022 size_t state_size; 1023 int ret; 1024 1025 mutex_lock(&mvdev->state_mutex); 1026 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, 1027 &state_size, 0); 1028 if (!ret) 1029 *stop_copy_length = state_size; 1030 mlx5vf_state_mutex_unlock(mvdev); 1031 return ret; 1032 } 1033 1034 static int mlx5vf_pci_get_device_state(struct vfio_device *vdev, 1035 enum vfio_device_mig_state *curr_state) 1036 { 1037 struct mlx5vf_pci_core_device *mvdev = container_of( 1038 vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1039 1040 mutex_lock(&mvdev->state_mutex); 1041 *curr_state = mvdev->mig_state; 1042 mlx5vf_state_mutex_unlock(mvdev); 1043 return 0; 1044 } 1045 1046 static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev) 1047 { 1048 struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev); 1049 1050 if (!mvdev->migrate_cap) 1051 return; 1052 1053 /* 1054 * As the higher VFIO layers are holding locks across reset and using 1055 * those same locks with the mm_lock we need to prevent ABBA deadlock 1056 * with the state_mutex and mm_lock. 1057 * In case the state_mutex was taken already we defer the cleanup work 1058 * to the unlock flow of the other running context. 1059 */ 1060 spin_lock(&mvdev->reset_lock); 1061 mvdev->deferred_reset = true; 1062 if (!mutex_trylock(&mvdev->state_mutex)) { 1063 spin_unlock(&mvdev->reset_lock); 1064 return; 1065 } 1066 spin_unlock(&mvdev->reset_lock); 1067 mlx5vf_state_mutex_unlock(mvdev); 1068 } 1069 1070 static int mlx5vf_pci_open_device(struct vfio_device *core_vdev) 1071 { 1072 struct mlx5vf_pci_core_device *mvdev = container_of( 1073 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1074 struct vfio_pci_core_device *vdev = &mvdev->core_device; 1075 int ret; 1076 1077 ret = vfio_pci_core_enable(vdev); 1078 if (ret) 1079 return ret; 1080 1081 if (mvdev->migrate_cap) 1082 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 1083 vfio_pci_core_finish_enable(vdev); 1084 return 0; 1085 } 1086 1087 static void mlx5vf_pci_close_device(struct vfio_device *core_vdev) 1088 { 1089 struct mlx5vf_pci_core_device *mvdev = container_of( 1090 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1091 1092 mlx5vf_cmd_close_migratable(mvdev); 1093 vfio_pci_core_close_device(core_vdev); 1094 } 1095 1096 static const struct vfio_migration_ops mlx5vf_pci_mig_ops = { 1097 .migration_set_state = mlx5vf_pci_set_device_state, 1098 .migration_get_state = mlx5vf_pci_get_device_state, 1099 .migration_get_data_size = mlx5vf_pci_get_data_size, 1100 }; 1101 1102 static const struct vfio_log_ops mlx5vf_pci_log_ops = { 1103 .log_start = mlx5vf_start_page_tracker, 1104 .log_stop = mlx5vf_stop_page_tracker, 1105 .log_read_and_clear = mlx5vf_tracker_read_and_clear, 1106 }; 1107 1108 static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev) 1109 { 1110 struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev, 1111 struct mlx5vf_pci_core_device, core_device.vdev); 1112 int ret; 1113 1114 ret = vfio_pci_core_init_dev(core_vdev); 1115 if (ret) 1116 return ret; 1117 1118 mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops, 1119 &mlx5vf_pci_log_ops); 1120 1121 return 0; 1122 } 1123 1124 static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev) 1125 { 1126 struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev, 1127 struct mlx5vf_pci_core_device, core_device.vdev); 1128 1129 mlx5vf_cmd_remove_migratable(mvdev); 1130 vfio_pci_core_release_dev(core_vdev); 1131 } 1132 1133 static const struct vfio_device_ops mlx5vf_pci_ops = { 1134 .name = "mlx5-vfio-pci", 1135 .init = mlx5vf_pci_init_dev, 1136 .release = mlx5vf_pci_release_dev, 1137 .open_device = mlx5vf_pci_open_device, 1138 .close_device = mlx5vf_pci_close_device, 1139 .ioctl = vfio_pci_core_ioctl, 1140 .device_feature = vfio_pci_core_ioctl_feature, 1141 .read = vfio_pci_core_read, 1142 .write = vfio_pci_core_write, 1143 .mmap = vfio_pci_core_mmap, 1144 .request = vfio_pci_core_request, 1145 .match = vfio_pci_core_match, 1146 }; 1147 1148 static int mlx5vf_pci_probe(struct pci_dev *pdev, 1149 const struct pci_device_id *id) 1150 { 1151 struct mlx5vf_pci_core_device *mvdev; 1152 int ret; 1153 1154 mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev, 1155 &pdev->dev, &mlx5vf_pci_ops); 1156 if (IS_ERR(mvdev)) 1157 return PTR_ERR(mvdev); 1158 1159 dev_set_drvdata(&pdev->dev, &mvdev->core_device); 1160 ret = vfio_pci_core_register_device(&mvdev->core_device); 1161 if (ret) 1162 goto out_put_vdev; 1163 return 0; 1164 1165 out_put_vdev: 1166 vfio_put_device(&mvdev->core_device.vdev); 1167 return ret; 1168 } 1169 1170 static void mlx5vf_pci_remove(struct pci_dev *pdev) 1171 { 1172 struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev); 1173 1174 vfio_pci_core_unregister_device(&mvdev->core_device); 1175 vfio_put_device(&mvdev->core_device.vdev); 1176 } 1177 1178 static const struct pci_device_id mlx5vf_pci_table[] = { 1179 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */ 1180 {} 1181 }; 1182 1183 MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table); 1184 1185 static const struct pci_error_handlers mlx5vf_err_handlers = { 1186 .reset_done = mlx5vf_pci_aer_reset_done, 1187 .error_detected = vfio_pci_core_aer_err_detected, 1188 }; 1189 1190 static struct pci_driver mlx5vf_pci_driver = { 1191 .name = KBUILD_MODNAME, 1192 .id_table = mlx5vf_pci_table, 1193 .probe = mlx5vf_pci_probe, 1194 .remove = mlx5vf_pci_remove, 1195 .err_handler = &mlx5vf_err_handlers, 1196 .driver_managed_dma = true, 1197 }; 1198 1199 module_pci_driver(mlx5vf_pci_driver); 1200 1201 MODULE_LICENSE("GPL"); 1202 MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>"); 1203 MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>"); 1204 MODULE_DESCRIPTION( 1205 "MLX5 VFIO PCI - User Level meta-driver for MLX5 device family"); 1206