// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/anon_inodes.h>

#include "cmd.h"

/* Device specification max LOAD size */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)

static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	return container_of(core_device, struct mlx5vf_pci_core_device,
			    core_device);
}

struct page *
mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
			  unsigned long offset)
{
	unsigned long cur_offset = 0;
	struct scatterlist *sg;
	unsigned int i;

	/* All accesses are sequential */
	if (offset < buf->last_offset || !buf->last_offset_sg) {
		buf->last_offset = 0;
		buf->last_offset_sg = buf->table.sgt.sgl;
		buf->sg_last_entry = 0;
	}

	cur_offset = buf->last_offset;

	for_each_sg(buf->last_offset_sg, sg,
		    buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
		if (offset < sg->length + cur_offset) {
			buf->last_offset_sg = sg;
			buf->sg_last_entry += i;
			buf->last_offset = cur_offset;
			return nth_page(sg_page(sg),
					(offset - cur_offset) / PAGE_SIZE);
		}
		cur_offset += sg->length;
	}
	return NULL;
}

int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
			       unsigned int npages)
{
	unsigned int to_alloc = npages;
	struct page **page_list;
	unsigned long filled;
	unsigned int to_fill;
	int ret;

	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
	page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
	if (!page_list)
		return -ENOMEM;

	do {
		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
						page_list);
		if (!filled) {
			ret = -ENOMEM;
			goto err;
		}
		to_alloc -= filled;
		ret = sg_alloc_append_table_from_pages(
			&buf->table, page_list, filled, 0,
			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
			GFP_KERNEL_ACCOUNT);

		if (ret)
			goto err;
		buf->allocated_length += filled * PAGE_SIZE;
		/* clean input for another bulk allocation */
		memset(page_list, 0, filled * sizeof(*page_list));
		to_fill = min_t(unsigned int, to_alloc,
				PAGE_SIZE / sizeof(*page_list));
	} while (to_alloc > 0);

	kvfree(page_list);
	return 0;

err:
	kvfree(page_list);
	return ret;
}

static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
	mutex_lock(&migf->lock);
	migf->state = MLX5_MIGF_STATE_ERROR;
	migf->filp->f_pos = 0;
	mutex_unlock(&migf->lock);
}

static int mlx5vf_release_file(struct inode *inode, struct file *filp)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;

	mlx5vf_disable_fd(migf);
	mutex_destroy(&migf->lock);
	kfree(migf);
	return 0;
}
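/*
 * Note: saved migration data is handed to userspace as a stream of
 * mlx5_vhca_data_buffer chunks queued on migf->buf_list. The helper below
 * looks up the chunk backing a given file position; since the FD is a
 * stream, reads are expected to consume the list strictly in order.
 */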
static struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
			      bool *end_of_data)
{
	struct mlx5_vhca_data_buffer *buf;
	bool found = false;

	*end_of_data = false;
	spin_lock_irq(&migf->list_lock);
	if (list_empty(&migf->buf_list)) {
		*end_of_data = true;
		goto end;
	}

	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
			       buf_elm);
	if (pos >= buf->start_pos &&
	    pos < buf->start_pos + buf->length) {
		found = true;
		goto end;
	}

	/*
	 * Since we use a stream-based FD, the data is expected to always be
	 * in the first chunk.
	 */
	migf->state = MLX5_MIGF_STATE_ERROR;

end:
	spin_unlock_irq(&migf->list_lock);
	return found ? buf : NULL;
}

static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
			       char __user **buf, size_t *len, loff_t *pos)
{
	unsigned long offset;
	ssize_t done = 0;
	size_t copy_len;

	copy_len = min_t(size_t,
			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
	while (copy_len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *from_buff;
		int ret;

		offset = *pos - vhca_buf->start_pos;
		page_offset = offset % PAGE_SIZE;
		offset -= page_offset;
		page = mlx5vf_get_migration_page(vhca_buf, offset);
		if (!page)
			return -EINVAL;
		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
		from_buff = kmap_local_page(page);
		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
		kunmap_local(from_buff);
		if (ret)
			return -EFAULT;
		*pos += page_len;
		*len -= page_len;
		*buf += page_len;
		done += page_len;
		copy_len -= page_len;
	}

	if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
		spin_lock_irq(&vhca_buf->migf->list_lock);
		list_del_init(&vhca_buf->buf_elm);
		list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
		spin_unlock_irq(&vhca_buf->migf->list_lock);
	}

	return done;
}
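/*
 * read() handler for the save FD. A chunk fully consumed by the copy loop
 * above is moved to migf->avail_list so that its pages can be reused for
 * later chunks instead of being freed and reallocated.
 */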
static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
				loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf;
	bool first_loop_call = true;
	bool end_of_data;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (!(filp->f_flags & O_NONBLOCK)) {
		if (wait_event_interruptible(migf->poll_wait,
			    !list_empty(&migf->buf_list) ||
			    migf->state == MLX5_MIGF_STATE_ERROR ||
			    migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
			    migf->state == MLX5_MIGF_STATE_PRE_COPY ||
			    migf->state == MLX5_MIGF_STATE_COMPLETE))
			return -ERESTARTSYS;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		done = -ENODEV;
		goto out_unlock;
	}

	while (len) {
		ssize_t count;

		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
							 &end_of_data);
		if (first_loop_call) {
			first_loop_call = false;
			/* Temporary end of file as part of PRE_COPY */
			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
				done = -ENOMSG;
				goto out_unlock;
			}

			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
				if (filp->f_flags & O_NONBLOCK) {
					done = -EAGAIN;
					goto out_unlock;
				}
			}
		}

		if (end_of_data)
			goto out_unlock;

		if (!vhca_buf) {
			done = -EINVAL;
			goto out_unlock;
		}

		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
		if (count < 0) {
			done = count;
			goto out_unlock;
		}
		done += count;
	}

out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}

static __poll_t mlx5vf_save_poll(struct file *filp,
				 struct poll_table_struct *wait)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	__poll_t pollflags = 0;

	poll_wait(filp, &migf->poll_wait, wait);

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
	else if (!list_empty(&migf->buf_list) ||
		 migf->state == MLX5_MIGF_STATE_COMPLETE)
		pollflags = EPOLLIN | EPOLLRDNORM;
	mutex_unlock(&migf->lock);

	return pollflags;
}

/*
 * The FD is exposed and the user can keep using it after receiving an
 * error. Mark migf as being in error, and wake up the user.
 */
static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
{
	migf->state = MLX5_MIGF_STATE_ERROR;
	wake_up_interruptible(&migf->poll_wait);
}

static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
{
	size_t size = sizeof(struct mlx5_vf_migration_header) +
		sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
	struct mlx5_vf_migration_tag_stop_copy_data data = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5_vf_migration_header header = {};
	unsigned long flags;
	struct page *page;
	u8 *to_buff;
	int ret;

	header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
	if (IS_ERR(header_buf))
		return PTR_ERR(header_buf);

	header.record_size = cpu_to_le64(sizeof(data));
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page) {
		ret = -EINVAL;
		goto err;
	}
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	header_buf->length = sizeof(header);
	data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length);
	memcpy(to_buff + sizeof(header), &data, sizeof(data));
	header_buf->length += sizeof(data);
	kunmap_local(to_buff);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	migf->pre_copy_initial_bytes = size;
	return 0;
err:
	mlx5vf_put_data_buffer(header_buf);
	return ret;
}

static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf,
				 size_t state_size)
{
	struct mlx5_vhca_data_buffer *buf;
	size_t inc_state_size;
	int ret;

	/* Be ready for a stop_copy size that may grow by up to 10 percent */
	if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
		inc_state_size = state_size;

	buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
	if (IS_ERR(buf))
		return PTR_ERR(buf);

	migf->buf = buf;
	buf = mlx5vf_get_data_buffer(migf,
		sizeof(struct mlx5_vf_migration_header), DMA_NONE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto err;
	}

	migf->buf_header = buf;
	ret = mlx5vf_add_stop_copy_header(migf);
	if (ret)
		goto err_header;
	return 0;

err_header:
	mlx5vf_put_data_buffer(migf->buf_header);
	migf->buf_header = NULL;
err:
	mlx5vf_put_data_buffer(migf->buf);
	migf->buf = NULL;
	return ret;
}
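/*
 * The stop_copy record queued by the helpers above follows the generic
 * stream layout: each record starts with a mlx5_vf_migration_header
 * carrying an le64 record_size plus le32 flags and tag fields, followed by
 * record_size bytes of payload. A record flagged
 * MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL may be skipped by a loader that does
 * not recognize its tag; this one advertises the expected stop_copy size.
 */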
static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
				 unsigned long arg)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;
	struct vfio_precopy_info info = {};
	loff_t *pos = &filp->f_pos;
	unsigned long minsz;
	size_t inc_length = 0;
	bool end_of_data = false;
	int ret;

	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
		return -ENOTTY;

	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
		ret = -EINVAL;
		goto err_state_unlock;
	}

	/*
	 * We can't issue a SAVE command when the device is suspended, so in
	 * VFIO_DEVICE_STATE_PRE_COPY_P2P there is no reason to query for
	 * extra bytes that can't be read.
	 */
	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
		/*
		 * Once the query returns, it is guaranteed that there is no
		 * active SAVE command. Hence, the code below is safe once
		 * the proper locks are taken.
		 */
		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
							    MLX5VF_QUERY_INC);
		if (ret)
			goto err_state_unlock;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto err_migf_unlock;
	}

	if (migf->pre_copy_initial_bytes > *pos) {
		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
	} else {
		info.dirty_bytes = migf->max_pos - *pos;
		if (!info.dirty_bytes)
			end_of_data = true;
		info.dirty_bytes += inc_length;
	}

	if (!end_of_data || !inc_length) {
		mutex_unlock(&migf->lock);
		goto done;
	}

	mutex_unlock(&migf->lock);
	/*
	 * We finished transferring the current state and the device has
	 * additional dirty state; save a new state to have it ready for the
	 * user on the next read.
	 */
	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		mlx5vf_mark_err(migf);
		goto err_state_unlock;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
	if (ret) {
		mlx5vf_mark_err(migf);
		mlx5vf_put_data_buffer(buf);
		goto err_state_unlock;
	}

done:
	mlx5vf_state_mutex_unlock(mvdev);
	if (copy_to_user((void __user *)arg, &info, minsz))
		return -EFAULT;
	return 0;

err_migf_unlock:
	mutex_unlock(&migf->lock);
err_state_unlock:
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}
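/*
 * fops for the saving side FD handed to userspace on a transition to
 * STOP_COPY or PRE_COPY: a read-only stream (no llseek) with poll support
 * and the VFIO_MIG_GET_PRECOPY_INFO ioctl served by the handler above.
 */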
static const struct file_operations mlx5vf_save_fops = {
	.owner = THIS_MODULE,
	.read = mlx5vf_save_read,
	.poll = mlx5vf_save_poll,
	.unlocked_ioctl = mlx5vf_precopy_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	int ret;

	if (migf->state == MLX5_MIGF_STATE_ERROR)
		return -ENODEV;

	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
	if (ret)
		goto err;

	/* Check whether we have a matching pre-allocated buffer that can fit */
	if (migf->buf && migf->buf->allocated_length >= length) {
		buf = migf->buf;
		migf->buf = NULL;
	} else {
		buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
		}
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
	if (ret)
		goto err_save;

	return 0;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
	return ret;
}

static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
					O_RDONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	init_waitqueue_head(&migf->poll_wait);
	init_completion(&migf->save_comp);
	/*
	 * save_comp is being used as a binary semaphore built from
	 * a completion. A normal mutex cannot be used because the lock is
	 * passed between kernel threads and lockdep can't model this.
	 */
	complete(&migf->save_comp);
	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
	if (ret)
		goto out_pd;

	if (track) {
		ret = mlx5vf_prep_stop_copy(migf, length);
		if (ret)
			goto out_pd;
	}

	buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_pd;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
	if (ret)
		goto out_save;
	return migf;
out_save:
	mlx5vf_free_data_buffer(buf);
out_pd:
	mlx5fv_cmd_clean_migf_resources(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}
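/*
 * The helpers below implement the resuming side. Userspace writes the
 * previously saved stream into the resume FD; the data is staged page by
 * page into a vhca data buffer before being handed to the device via a
 * LOAD command.
 */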
static int
mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
			      const char __user **buf, size_t *len,
			      loff_t *pos, ssize_t *done)
{
	unsigned long offset;
	size_t page_offset;
	struct page *page;
	size_t page_len;
	u8 *to_buff;
	int ret;

	offset = *pos - vhca_buf->start_pos;
	page_offset = offset % PAGE_SIZE;

	page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
	if (!page)
		return -EINVAL;
	page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + page_offset, *buf, page_len);
	kunmap_local(to_buff);
	if (ret)
		return -EFAULT;

	*pos += page_len;
	*done += page_len;
	*buf += page_len;
	*len -= page_len;
	vhca_buf->length += page_len;
	return 0;
}

static int
mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
				   loff_t requested_length,
				   const char __user **buf, size_t *len,
				   loff_t *pos, ssize_t *done)
{
	int ret;

	if (requested_length > MAX_LOAD_SIZE)
		return -ENOMEM;

	if (vhca_buf->allocated_length < requested_length) {
		ret = mlx5vf_add_migration_pages(
			vhca_buf,
			DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
				     PAGE_SIZE));
		if (ret)
			return ret;
	}

	while (*len) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
						    done);
		if (ret)
			return ret;
	}

	return 0;
}

static ssize_t
mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
			 struct mlx5_vhca_data_buffer *vhca_buf,
			 size_t image_size, const char __user **buf,
			 size_t *len, loff_t *pos, ssize_t *done,
			 bool *has_work)
{
	size_t copy_len, to_copy;
	int ret;

	to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == image_size) {
		migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
		migf->max_pos += image_size;
		*has_work = true;
	}

	return 0;
}

static int
mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *vhca_buf,
			       const char __user **buf, size_t *len,
			       loff_t *pos, ssize_t *done)
{
	size_t copy_len, to_copy;
	size_t required_data;
	u8 *to_buff;
	int ret;

	required_data = migf->record_size - vhca_buf->length;
	to_copy = min_t(size_t, *len, required_data);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == migf->record_size) {
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
		{
			struct page *page;

			page = mlx5vf_get_migration_page(vhca_buf, 0);
			if (!page)
				return -EINVAL;
			to_buff = kmap_local_page(page);
			migf->stop_copy_prep_size = min_t(u64,
				le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
			kunmap_local(to_buff);
			break;
		}
		default:
			/* Optional tag */
			break;
		}

		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
		migf->max_pos += migf->record_size;
		vhca_buf->length = 0;
	}

	return 0;
}
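/*
 * Parse one record header from the incoming stream. The fixed-size record
 * header is small enough to fit in the first page of the header buffer,
 * so mapping page 0 is sufficient here.
 */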
static int
mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
			  struct mlx5_vhca_data_buffer *vhca_buf,
			  const char __user **buf,
			  size_t *len, loff_t *pos,
			  ssize_t *done, bool *has_work)
{
	struct page *page;
	size_t copy_len;
	u8 *to_buff;
	int ret;

	copy_len = min_t(size_t, *len,
		sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
	page = mlx5vf_get_migration_page(vhca_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
	if (ret) {
		ret = -EFAULT;
		goto end;
	}

	*buf += copy_len;
	*pos += copy_len;
	*done += copy_len;
	*len -= copy_len;
	vhca_buf->length += copy_len;
	if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
		u64 record_size;
		u32 flags;

		record_size = le64_to_cpup((__le64 *)to_buff);
		if (record_size > MAX_LOAD_SIZE) {
			ret = -ENOMEM;
			goto end;
		}

		migf->record_size = record_size;
		flags = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, flags)));
		migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, tag)));
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_FW_DATA:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
			break;
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
			break;
		default:
			if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
				ret = -EOPNOTSUPP;
				goto end;
			}
			/* We may read and skip this optional record data */
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
		}

		migf->max_pos += vhca_buf->length;
		vhca_buf->length = 0;
		*has_work = true;
	}
end:
	kunmap_local(to_buff);
	return ret;
}
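/*
 * write() handler for the resume FD. The incoming stream is consumed by a
 * small state machine (migf->load_state): READ_HEADER parses a record
 * header, the PREP_* states (re)allocate a buffer large enough for the
 * record, the READ_* states stage the payload, and LOAD_IMAGE issues the
 * device LOAD command once a complete image has been received.
 */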
static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
	loff_t requested_length;
	bool has_work = false;
	ssize_t done = 0;
	int ret = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

	mutex_lock(&migf->mvdev->state_mutex);
	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto out_unlock;
	}

	while (len || has_work) {
		has_work = false;
		switch (migf->load_state) {
		case MLX5_VF_LOAD_STATE_READ_HEADER:
			ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
							&buf, &len, pos,
							&done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
			if (vhca_buf_header->allocated_length < migf->record_size) {
				mlx5vf_free_data_buffer(vhca_buf_header);

				migf->buf_header = mlx5vf_alloc_data_buffer(migf,
						migf->record_size, DMA_NONE);
				if (IS_ERR(migf->buf_header)) {
					ret = PTR_ERR(migf->buf_header);
					migf->buf_header = NULL;
					goto out_unlock;
				}

				vhca_buf_header = migf->buf_header;
			}

			vhca_buf_header->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
			break;
		case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
			ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
							&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
		{
			u64 size = max(migf->record_size,
				       migf->stop_copy_prep_size);

			if (vhca_buf->allocated_length < size) {
				mlx5vf_free_data_buffer(vhca_buf);

				migf->buf = mlx5vf_alloc_data_buffer(migf,
							size, DMA_TO_DEVICE);
				if (IS_ERR(migf->buf)) {
					ret = PTR_ERR(migf->buf);
					migf->buf = NULL;
					goto out_unlock;
				}

				vhca_buf = migf->buf;
			}

			vhca_buf->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
			ret = mlx5vf_resume_read_image_no_header(vhca_buf,
						requested_length,
						&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_READ_IMAGE:
			ret = mlx5vf_resume_read_image(migf, vhca_buf,
						migf->record_size,
						&buf, &len, pos, &done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
			if (ret)
				goto out_unlock;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

			/* prep header buf for next image */
			vhca_buf_header->length = 0;
			/* prep data buf for next image */
			vhca_buf->length = 0;

			break;
		default:
			break;
		}
	}

out_unlock:
	if (ret)
		migf->state = MLX5_MIGF_STATE_ERROR;
	mutex_unlock(&migf->lock);
	mlx5vf_state_mutex_unlock(migf->mvdev);
	return ret ? ret : done;
}
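/*
 * fops for the resuming side FD handed to userspace on a transition to
 * RESUMING: a write-only stream (no llseek), consumed by the state machine
 * above.
 */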
static const struct file_operations mlx5vf_resume_fops = {
	.owner = THIS_MODULE,
	.write = mlx5vf_resume_write,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
					O_WRONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_pd;
	}

	migf->buf = buf;
	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
		buf = mlx5vf_alloc_data_buffer(migf,
			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto out_buf;
		}

		migf->buf_header = buf;
		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
	} else {
		/* Initial state will be to read the image */
		migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
	}

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	return migf;
out_buf:
	mlx5vf_free_data_buffer(migf->buf);
out_pd:
	mlx5vf_cmd_dealloc_pd(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
{
	if (mvdev->resuming_migf) {
		mlx5vf_disable_fd(mvdev->resuming_migf);
		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
		fput(mvdev->resuming_migf->filp);
		mvdev->resuming_migf = NULL;
	}
	if (mvdev->saving_migf) {
		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
		cancel_work_sync(&mvdev->saving_migf->async_data.work);
		mlx5vf_disable_fd(mvdev->saving_migf);
		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
		fput(mvdev->saving_migf->filp);
		mvdev->saving_migf = NULL;
	}
}
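/*
 * Step the device through one arc of the VFIO migration state machine.
 * Called with state_mutex held. Returns a migration FD for the arcs that
 * create one (STOP -> STOP_COPY, STOP -> RESUMING, RUNNING -> PRE_COPY and
 * RUNNING_P2P -> PRE_COPY_P2P), NULL on success otherwise, or an ERR_PTR
 * on failure.
 */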
static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
				    u32 new)
{
	u32 cur = mvdev->mig_state;
	int ret;

	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, false);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_resume_device_data(mvdev);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->resuming_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
		if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
			ret = mlx5vf_cmd_load_vhca_state(mvdev,
							 mvdev->resuming_migf,
							 mvdev->resuming_migf->buf);
			if (ret)
				return ERR_PTR(ret);
		}
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, true);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		ret = mlx5vf_pci_save_device_inc_data(mvdev);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/*
	 * vfio_mig_get_next_state() does not use arcs other than the above
	 */
	WARN_ON(true);
	return ERR_PTR(-EINVAL);
}
/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset' if one exists.
 */
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
again:
	spin_lock(&mvdev->reset_lock);
	if (mvdev->deferred_reset) {
		mvdev->deferred_reset = false;
		spin_unlock(&mvdev->reset_lock);
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
		mlx5vf_disable_fds(mvdev);
		goto again;
	}
	mutex_unlock(&mvdev->state_mutex);
	spin_unlock(&mvdev->reset_lock);
}

static struct file *
mlx5vf_pci_set_device_state(struct vfio_device *vdev,
			    enum vfio_device_mig_state new_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	enum vfio_device_mig_state next_state;
	struct file *res = NULL;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	while (new_state != mvdev->mig_state) {
		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
					      new_state, &next_state);
		if (ret) {
			res = ERR_PTR(ret);
			break;
		}
		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
		if (IS_ERR(res))
			break;
		mvdev->mig_state = next_state;
		if (WARN_ON(res && new_state != mvdev->mig_state)) {
			fput(res);
			res = ERR_PTR(-EINVAL);
			break;
		}
	}
	mlx5vf_state_mutex_unlock(mvdev);
	return res;
}

static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
				    unsigned long *stop_copy_length)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	size_t state_size;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
						    &state_size, 0);
	if (!ret)
		*stop_copy_length = state_size;
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
				       enum vfio_device_mig_state *curr_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mutex_lock(&mvdev->state_mutex);
	*curr_state = mvdev->mig_state;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;
}
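/*
 * Reset handling protocol: the reset_done handler below cannot block on
 * state_mutex, so it sets mvdev->deferred_reset under reset_lock and lets
 * whoever currently holds state_mutex perform the actual cleanup in
 * mlx5vf_state_mutex_unlock().
 */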
static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	if (!mvdev->migrate_cap)
		return;

	/*
	 * As the higher VFIO layers are holding locks across reset and using
	 * those same locks with the mm_lock, we need to prevent an ABBA
	 * deadlock with the state_mutex and mm_lock.
	 * In case the state_mutex was already taken, we defer the cleanup
	 * work to the unlock flow of the other running context.
	 */
	spin_lock(&mvdev->reset_lock);
	mvdev->deferred_reset = true;
	if (!mutex_trylock(&mvdev->state_mutex)) {
		spin_unlock(&mvdev->reset_lock);
		return;
	}
	spin_unlock(&mvdev->reset_lock);
	mlx5vf_state_mutex_unlock(mvdev);
}

static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct vfio_pci_core_device *vdev = &mvdev->core_device;
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	if (mvdev->migrate_cap)
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
	vfio_pci_core_finish_enable(vdev);
	return 0;
}

static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_close_migratable(mvdev);
	vfio_pci_core_close_device(core_vdev);
}

static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
	.migration_set_state = mlx5vf_pci_set_device_state,
	.migration_get_state = mlx5vf_pci_get_device_state,
	.migration_get_data_size = mlx5vf_pci_get_data_size,
};

static const struct vfio_log_ops mlx5vf_pci_log_ops = {
	.log_start = mlx5vf_start_page_tracker,
	.log_stop = mlx5vf_stop_page_tracker,
	.log_read_and_clear = mlx5vf_tracker_read_and_clear,
};

static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);
	int ret;

	ret = vfio_pci_core_init_dev(core_vdev);
	if (ret)
		return ret;

	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
				  &mlx5vf_pci_log_ops);

	return 0;
}

static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_remove_migratable(mvdev);
	vfio_pci_core_release_dev(core_vdev);
}

static const struct vfio_device_ops mlx5vf_pci_ops = {
	.name = "mlx5-vfio-pci",
	.init = mlx5vf_pci_init_dev,
	.release = mlx5vf_pci_release_dev,
	.open_device = mlx5vf_pci_open_device,
	.close_device = mlx5vf_pci_close_device,
	.ioctl = vfio_pci_core_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
	.detach_ioas = vfio_iommufd_physical_detach_ioas,
};
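/*
 * Note: dev_set_drvdata() is done before vfio_pci_core_register_device()
 * so that callbacks such as mlx5vf_pci_aer_reset_done() can resolve the
 * device via mlx5vf_drvdata() as soon as the device becomes visible.
 */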
static int mlx5vf_pci_probe(struct pci_dev *pdev,
			    const struct pci_device_id *id)
{
	struct mlx5vf_pci_core_device *mvdev;
	int ret;

	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
				  &pdev->dev, &mlx5vf_pci_ops);
	if (IS_ERR(mvdev))
		return PTR_ERR(mvdev);

	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
	ret = vfio_pci_core_register_device(&mvdev->core_device);
	if (ret)
		goto out_put_vdev;
	return 0;

out_put_vdev:
	vfio_put_device(&mvdev->core_device.vdev);
	return ret;
}

static void mlx5vf_pci_remove(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	vfio_pci_core_unregister_device(&mvdev->core_device);
	vfio_put_device(&mvdev->core_device.vdev);
}

static const struct pci_device_id mlx5vf_pci_table[] = {
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
	{}
};

MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);

static const struct pci_error_handlers mlx5vf_err_handlers = {
	.reset_done = mlx5vf_pci_aer_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};

static struct pci_driver mlx5vf_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = mlx5vf_pci_table,
	.probe = mlx5vf_pci_probe,
	.remove = mlx5vf_pci_remove,
	.err_handler = &mlx5vf_err_handlers,
	.driver_managed_dma = true,
};

module_pci_driver(mlx5vf_pci_driver);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");