xref: /openbmc/linux/drivers/vfio/pci/mlx5/main.c (revision 844f5ed5)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4  */
5 
6 #include <linux/device.h>
7 #include <linux/eventfd.h>
8 #include <linux/file.h>
9 #include <linux/interrupt.h>
10 #include <linux/iommu.h>
11 #include <linux/module.h>
12 #include <linux/mutex.h>
13 #include <linux/notifier.h>
14 #include <linux/pci.h>
15 #include <linux/pm_runtime.h>
16 #include <linux/types.h>
17 #include <linux/uaccess.h>
18 #include <linux/vfio.h>
19 #include <linux/sched/mm.h>
20 #include <linux/anon_inodes.h>
21 
22 #include "cmd.h"
23 
24 /* Device specification max LOAD size */
25 #define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)
26 
27 static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
28 {
29 	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
30 
31 	return container_of(core_device, struct mlx5vf_pci_core_device,
32 			    core_device);
33 }
34 
35 struct page *
36 mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
37 			  unsigned long offset)
38 {
39 	unsigned long cur_offset = 0;
40 	struct scatterlist *sg;
41 	unsigned int i;
42 
43 	/* All accesses are sequential */
44 	if (offset < buf->last_offset || !buf->last_offset_sg) {
45 		buf->last_offset = 0;
46 		buf->last_offset_sg = buf->table.sgt.sgl;
47 		buf->sg_last_entry = 0;
48 	}
49 
50 	cur_offset = buf->last_offset;
51 
52 	for_each_sg(buf->last_offset_sg, sg,
53 			buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
54 		if (offset < sg->length + cur_offset) {
55 			buf->last_offset_sg = sg;
56 			buf->sg_last_entry += i;
57 			buf->last_offset = cur_offset;
58 			return nth_page(sg_page(sg),
59 					(offset - cur_offset) / PAGE_SIZE);
60 		}
61 		cur_offset += sg->length;
62 	}
63 	return NULL;
64 }
65 
66 int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
67 			       unsigned int npages)
68 {
69 	unsigned int to_alloc = npages;
70 	struct page **page_list;
71 	unsigned long filled;
72 	unsigned int to_fill;
73 	int ret;
74 
75 	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
76 	page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
77 	if (!page_list)
78 		return -ENOMEM;
79 
80 	do {
81 		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
82 						page_list);
83 		if (!filled) {
84 			ret = -ENOMEM;
85 			goto err;
86 		}
87 		to_alloc -= filled;
88 		ret = sg_alloc_append_table_from_pages(
89 			&buf->table, page_list, filled, 0,
90 			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
91 			GFP_KERNEL_ACCOUNT);
92 
93 		if (ret)
94 			goto err;
95 		buf->allocated_length += filled * PAGE_SIZE;
96 		/* clean input for another bulk allocation */
97 		memset(page_list, 0, filled * sizeof(*page_list));
98 		to_fill = min_t(unsigned int, to_alloc,
99 				PAGE_SIZE / sizeof(*page_list));
100 	} while (to_alloc > 0);
101 
102 	kvfree(page_list);
103 	return 0;
104 
105 err:
106 	kvfree(page_list);
107 	return ret;
108 }
109 
110 static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
111 {
112 	mutex_lock(&migf->lock);
113 	migf->state = MLX5_MIGF_STATE_ERROR;
114 	migf->filp->f_pos = 0;
115 	mutex_unlock(&migf->lock);
116 }
117 
118 static int mlx5vf_release_file(struct inode *inode, struct file *filp)
119 {
120 	struct mlx5_vf_migration_file *migf = filp->private_data;
121 
122 	mlx5vf_disable_fd(migf);
123 	mutex_destroy(&migf->lock);
124 	kfree(migf);
125 	return 0;
126 }
127 
128 static struct mlx5_vhca_data_buffer *
129 mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
130 			      bool *end_of_data)
131 {
132 	struct mlx5_vhca_data_buffer *buf;
133 	bool found = false;
134 
135 	*end_of_data = false;
136 	spin_lock_irq(&migf->list_lock);
137 	if (list_empty(&migf->buf_list)) {
138 		*end_of_data = true;
139 		goto end;
140 	}
141 
142 	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
143 			       buf_elm);
144 	if (pos >= buf->start_pos &&
145 	    pos < buf->start_pos + buf->length) {
146 		found = true;
147 		goto end;
148 	}
149 
150 	/*
151 	 * As we use a stream based FD we may expect having the data always
152 	 * on first chunk
153 	 */
154 	migf->state = MLX5_MIGF_STATE_ERROR;
155 
156 end:
157 	spin_unlock_irq(&migf->list_lock);
158 	return found ? buf : NULL;
159 }
160 
161 static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
162 			       char __user **buf, size_t *len, loff_t *pos)
163 {
164 	unsigned long offset;
165 	ssize_t done = 0;
166 	size_t copy_len;
167 
168 	copy_len = min_t(size_t,
169 			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
170 	while (copy_len) {
171 		size_t page_offset;
172 		struct page *page;
173 		size_t page_len;
174 		u8 *from_buff;
175 		int ret;
176 
177 		offset = *pos - vhca_buf->start_pos;
178 		page_offset = offset % PAGE_SIZE;
179 		offset -= page_offset;
180 		page = mlx5vf_get_migration_page(vhca_buf, offset);
181 		if (!page)
182 			return -EINVAL;
183 		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
184 		from_buff = kmap_local_page(page);
185 		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
186 		kunmap_local(from_buff);
187 		if (ret)
188 			return -EFAULT;
189 		*pos += page_len;
190 		*len -= page_len;
191 		*buf += page_len;
192 		done += page_len;
193 		copy_len -= page_len;
194 	}
195 
196 	if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
197 		spin_lock_irq(&vhca_buf->migf->list_lock);
198 		list_del_init(&vhca_buf->buf_elm);
199 		list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
200 		spin_unlock_irq(&vhca_buf->migf->list_lock);
201 	}
202 
203 	return done;
204 }
205 
206 static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
207 			       loff_t *pos)
208 {
209 	struct mlx5_vf_migration_file *migf = filp->private_data;
210 	struct mlx5_vhca_data_buffer *vhca_buf;
211 	bool first_loop_call = true;
212 	bool end_of_data;
213 	ssize_t done = 0;
214 
215 	if (pos)
216 		return -ESPIPE;
217 	pos = &filp->f_pos;
218 
219 	if (!(filp->f_flags & O_NONBLOCK)) {
220 		if (wait_event_interruptible(migf->poll_wait,
221 				!list_empty(&migf->buf_list) ||
222 				migf->state == MLX5_MIGF_STATE_ERROR ||
223 				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
224 				migf->state == MLX5_MIGF_STATE_PRE_COPY ||
225 				migf->state == MLX5_MIGF_STATE_COMPLETE))
226 			return -ERESTARTSYS;
227 	}
228 
229 	mutex_lock(&migf->lock);
230 	if (migf->state == MLX5_MIGF_STATE_ERROR) {
231 		done = -ENODEV;
232 		goto out_unlock;
233 	}
234 
235 	while (len) {
236 		ssize_t count;
237 
238 		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
239 							 &end_of_data);
240 		if (first_loop_call) {
241 			first_loop_call = false;
242 			/* Temporary end of file as part of PRE_COPY */
243 			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
244 				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
245 				done = -ENOMSG;
246 				goto out_unlock;
247 			}
248 
249 			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
250 				if (filp->f_flags & O_NONBLOCK) {
251 					done = -EAGAIN;
252 					goto out_unlock;
253 				}
254 			}
255 		}
256 
257 		if (end_of_data)
258 			goto out_unlock;
259 
260 		if (!vhca_buf) {
261 			done = -EINVAL;
262 			goto out_unlock;
263 		}
264 
265 		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
266 		if (count < 0) {
267 			done = count;
268 			goto out_unlock;
269 		}
270 		done += count;
271 	}
272 
273 out_unlock:
274 	mutex_unlock(&migf->lock);
275 	return done;
276 }
277 
278 static __poll_t mlx5vf_save_poll(struct file *filp,
279 				 struct poll_table_struct *wait)
280 {
281 	struct mlx5_vf_migration_file *migf = filp->private_data;
282 	__poll_t pollflags = 0;
283 
284 	poll_wait(filp, &migf->poll_wait, wait);
285 
286 	mutex_lock(&migf->lock);
287 	if (migf->state == MLX5_MIGF_STATE_ERROR)
288 		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
289 	else if (!list_empty(&migf->buf_list) ||
290 		 migf->state == MLX5_MIGF_STATE_COMPLETE)
291 		pollflags = EPOLLIN | EPOLLRDNORM;
292 	mutex_unlock(&migf->lock);
293 
294 	return pollflags;
295 }
296 
297 /*
298  * FD is exposed and user can use it after receiving an error.
299  * Mark migf in error, and wake the user.
300  */
301 static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
302 {
303 	migf->state = MLX5_MIGF_STATE_ERROR;
304 	wake_up_interruptible(&migf->poll_wait);
305 }
306 
307 static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
308 {
309 	size_t size = sizeof(struct mlx5_vf_migration_header) +
310 		sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
311 	struct mlx5_vf_migration_tag_stop_copy_data data = {};
312 	struct mlx5_vhca_data_buffer *header_buf = NULL;
313 	struct mlx5_vf_migration_header header = {};
314 	unsigned long flags;
315 	struct page *page;
316 	u8 *to_buff;
317 	int ret;
318 
319 	header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
320 	if (IS_ERR(header_buf))
321 		return PTR_ERR(header_buf);
322 
323 	header.record_size = cpu_to_le64(sizeof(data));
324 	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
325 	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
326 	page = mlx5vf_get_migration_page(header_buf, 0);
327 	if (!page) {
328 		ret = -EINVAL;
329 		goto err;
330 	}
331 	to_buff = kmap_local_page(page);
332 	memcpy(to_buff, &header, sizeof(header));
333 	header_buf->length = sizeof(header);
334 	data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length);
335 	memcpy(to_buff + sizeof(header), &data, sizeof(data));
336 	header_buf->length += sizeof(data);
337 	kunmap_local(to_buff);
338 	header_buf->start_pos = header_buf->migf->max_pos;
339 	migf->max_pos += header_buf->length;
340 	spin_lock_irqsave(&migf->list_lock, flags);
341 	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
342 	spin_unlock_irqrestore(&migf->list_lock, flags);
343 	migf->pre_copy_initial_bytes = size;
344 	return 0;
345 err:
346 	mlx5vf_put_data_buffer(header_buf);
347 	return ret;
348 }
349 
350 static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf,
351 				 size_t state_size)
352 {
353 	struct mlx5_vhca_data_buffer *buf;
354 	size_t inc_state_size;
355 	int ret;
356 
357 	/* let's be ready for stop_copy size that might grow by 10 percents */
358 	if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
359 		inc_state_size = state_size;
360 
361 	buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
362 	if (IS_ERR(buf))
363 		return PTR_ERR(buf);
364 
365 	migf->buf = buf;
366 	buf = mlx5vf_get_data_buffer(migf,
367 			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
368 	if (IS_ERR(buf)) {
369 		ret = PTR_ERR(buf);
370 		goto err;
371 	}
372 
373 	migf->buf_header = buf;
374 	ret = mlx5vf_add_stop_copy_header(migf);
375 	if (ret)
376 		goto err_header;
377 	return 0;
378 
379 err_header:
380 	mlx5vf_put_data_buffer(migf->buf_header);
381 	migf->buf_header = NULL;
382 err:
383 	mlx5vf_put_data_buffer(migf->buf);
384 	migf->buf = NULL;
385 	return ret;
386 }
387 
388 static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
389 				 unsigned long arg)
390 {
391 	struct mlx5_vf_migration_file *migf = filp->private_data;
392 	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
393 	struct mlx5_vhca_data_buffer *buf;
394 	struct vfio_precopy_info info = {};
395 	loff_t *pos = &filp->f_pos;
396 	unsigned long minsz;
397 	size_t inc_length = 0;
398 	bool end_of_data = false;
399 	int ret;
400 
401 	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
402 		return -ENOTTY;
403 
404 	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
405 
406 	if (copy_from_user(&info, (void __user *)arg, minsz))
407 		return -EFAULT;
408 
409 	if (info.argsz < minsz)
410 		return -EINVAL;
411 
412 	mutex_lock(&mvdev->state_mutex);
413 	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
414 	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
415 		ret = -EINVAL;
416 		goto err_state_unlock;
417 	}
418 
419 	/*
420 	 * We can't issue a SAVE command when the device is suspended, so as
421 	 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra
422 	 * bytes that can't be read.
423 	 */
424 	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
425 		/*
426 		 * Once the query returns it's guaranteed that there is no
427 		 * active SAVE command.
428 		 * As so, the other code below is safe with the proper locks.
429 		 */
430 		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
431 							    MLX5VF_QUERY_INC);
432 		if (ret)
433 			goto err_state_unlock;
434 	}
435 
436 	mutex_lock(&migf->lock);
437 	if (migf->state == MLX5_MIGF_STATE_ERROR) {
438 		ret = -ENODEV;
439 		goto err_migf_unlock;
440 	}
441 
442 	if (migf->pre_copy_initial_bytes > *pos) {
443 		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
444 	} else {
445 		buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data);
446 		if (buf) {
447 			info.dirty_bytes = buf->start_pos + buf->length - *pos;
448 		} else {
449 			if (!end_of_data) {
450 				ret = -EINVAL;
451 				goto err_migf_unlock;
452 			}
453 			info.dirty_bytes = inc_length;
454 		}
455 	}
456 
457 	if (!end_of_data || !inc_length) {
458 		mutex_unlock(&migf->lock);
459 		goto done;
460 	}
461 
462 	mutex_unlock(&migf->lock);
463 	/*
464 	 * We finished transferring the current state and the device has a
465 	 * dirty state, save a new state to be ready for.
466 	 */
467 	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
468 	if (IS_ERR(buf)) {
469 		ret = PTR_ERR(buf);
470 		mlx5vf_mark_err(migf);
471 		goto err_state_unlock;
472 	}
473 
474 	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
475 	if (ret) {
476 		mlx5vf_mark_err(migf);
477 		mlx5vf_put_data_buffer(buf);
478 		goto err_state_unlock;
479 	}
480 
481 done:
482 	mlx5vf_state_mutex_unlock(mvdev);
483 	if (copy_to_user((void __user *)arg, &info, minsz))
484 		return -EFAULT;
485 	return 0;
486 
487 err_migf_unlock:
488 	mutex_unlock(&migf->lock);
489 err_state_unlock:
490 	mlx5vf_state_mutex_unlock(mvdev);
491 	return ret;
492 }
493 
494 static const struct file_operations mlx5vf_save_fops = {
495 	.owner = THIS_MODULE,
496 	.read = mlx5vf_save_read,
497 	.poll = mlx5vf_save_poll,
498 	.unlocked_ioctl = mlx5vf_precopy_ioctl,
499 	.compat_ioctl = compat_ptr_ioctl,
500 	.release = mlx5vf_release_file,
501 	.llseek = no_llseek,
502 };
503 
504 static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
505 {
506 	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
507 	struct mlx5_vhca_data_buffer *buf;
508 	size_t length;
509 	int ret;
510 
511 	if (migf->state == MLX5_MIGF_STATE_ERROR)
512 		return -ENODEV;
513 
514 	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
515 				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
516 	if (ret)
517 		goto err;
518 
519 	/* Checking whether we have a matching pre-allocated buffer that can fit */
520 	if (migf->buf && migf->buf->allocated_length >= length) {
521 		buf = migf->buf;
522 		migf->buf = NULL;
523 	} else {
524 		buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
525 		if (IS_ERR(buf)) {
526 			ret = PTR_ERR(buf);
527 			goto err;
528 		}
529 	}
530 
531 	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
532 	if (ret)
533 		goto err_save;
534 
535 	return 0;
536 
537 err_save:
538 	mlx5vf_put_data_buffer(buf);
539 err:
540 	mlx5vf_mark_err(migf);
541 	return ret;
542 }
543 
544 static struct mlx5_vf_migration_file *
545 mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
546 {
547 	struct mlx5_vf_migration_file *migf;
548 	struct mlx5_vhca_data_buffer *buf;
549 	size_t length;
550 	int ret;
551 
552 	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
553 	if (!migf)
554 		return ERR_PTR(-ENOMEM);
555 
556 	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
557 					O_RDONLY);
558 	if (IS_ERR(migf->filp)) {
559 		ret = PTR_ERR(migf->filp);
560 		goto end;
561 	}
562 
563 	migf->mvdev = mvdev;
564 	ret = mlx5vf_cmd_alloc_pd(migf);
565 	if (ret)
566 		goto out_free;
567 
568 	stream_open(migf->filp->f_inode, migf->filp);
569 	mutex_init(&migf->lock);
570 	init_waitqueue_head(&migf->poll_wait);
571 	init_completion(&migf->save_comp);
572 	/*
573 	 * save_comp is being used as a binary semaphore built from
574 	 * a completion. A normal mutex cannot be used because the lock is
575 	 * passed between kernel threads and lockdep can't model this.
576 	 */
577 	complete(&migf->save_comp);
578 	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
579 	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
580 	INIT_LIST_HEAD(&migf->buf_list);
581 	INIT_LIST_HEAD(&migf->avail_list);
582 	spin_lock_init(&migf->list_lock);
583 	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
584 	if (ret)
585 		goto out_pd;
586 
587 	if (track) {
588 		ret = mlx5vf_prep_stop_copy(migf, length);
589 		if (ret)
590 			goto out_pd;
591 	}
592 
593 	buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
594 	if (IS_ERR(buf)) {
595 		ret = PTR_ERR(buf);
596 		goto out_pd;
597 	}
598 
599 	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
600 	if (ret)
601 		goto out_save;
602 	return migf;
603 out_save:
604 	mlx5vf_free_data_buffer(buf);
605 out_pd:
606 	mlx5fv_cmd_clean_migf_resources(migf);
607 out_free:
608 	fput(migf->filp);
609 end:
610 	kfree(migf);
611 	return ERR_PTR(ret);
612 }
613 
614 static int
615 mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
616 			      const char __user **buf, size_t *len,
617 			      loff_t *pos, ssize_t *done)
618 {
619 	unsigned long offset;
620 	size_t page_offset;
621 	struct page *page;
622 	size_t page_len;
623 	u8 *to_buff;
624 	int ret;
625 
626 	offset = *pos - vhca_buf->start_pos;
627 	page_offset = offset % PAGE_SIZE;
628 
629 	page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
630 	if (!page)
631 		return -EINVAL;
632 	page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
633 	to_buff = kmap_local_page(page);
634 	ret = copy_from_user(to_buff + page_offset, *buf, page_len);
635 	kunmap_local(to_buff);
636 	if (ret)
637 		return -EFAULT;
638 
639 	*pos += page_len;
640 	*done += page_len;
641 	*buf += page_len;
642 	*len -= page_len;
643 	vhca_buf->length += page_len;
644 	return 0;
645 }
646 
647 static int
648 mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
649 				   loff_t requested_length,
650 				   const char __user **buf, size_t *len,
651 				   loff_t *pos, ssize_t *done)
652 {
653 	int ret;
654 
655 	if (requested_length > MAX_LOAD_SIZE)
656 		return -ENOMEM;
657 
658 	if (vhca_buf->allocated_length < requested_length) {
659 		ret = mlx5vf_add_migration_pages(
660 			vhca_buf,
661 			DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
662 				     PAGE_SIZE));
663 		if (ret)
664 			return ret;
665 	}
666 
667 	while (*len) {
668 		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
669 						    done);
670 		if (ret)
671 			return ret;
672 	}
673 
674 	return 0;
675 }
676 
677 static ssize_t
678 mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
679 			 struct mlx5_vhca_data_buffer *vhca_buf,
680 			 size_t image_size, const char __user **buf,
681 			 size_t *len, loff_t *pos, ssize_t *done,
682 			 bool *has_work)
683 {
684 	size_t copy_len, to_copy;
685 	int ret;
686 
687 	to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
688 	copy_len = to_copy;
689 	while (to_copy) {
690 		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
691 						    done);
692 		if (ret)
693 			return ret;
694 	}
695 
696 	*len -= copy_len;
697 	if (vhca_buf->length == image_size) {
698 		migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
699 		migf->max_pos += image_size;
700 		*has_work = true;
701 	}
702 
703 	return 0;
704 }
705 
706 static int
707 mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
708 			       struct mlx5_vhca_data_buffer *vhca_buf,
709 			       const char __user **buf, size_t *len,
710 			       loff_t *pos, ssize_t *done)
711 {
712 	size_t copy_len, to_copy;
713 	size_t required_data;
714 	u8 *to_buff;
715 	int ret;
716 
717 	required_data = migf->record_size - vhca_buf->length;
718 	to_copy = min_t(size_t, *len, required_data);
719 	copy_len = to_copy;
720 	while (to_copy) {
721 		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
722 						    done);
723 		if (ret)
724 			return ret;
725 	}
726 
727 	*len -= copy_len;
728 	if (vhca_buf->length == migf->record_size) {
729 		switch (migf->record_tag) {
730 		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
731 		{
732 			struct page *page;
733 
734 			page = mlx5vf_get_migration_page(vhca_buf, 0);
735 			if (!page)
736 				return -EINVAL;
737 			to_buff = kmap_local_page(page);
738 			migf->stop_copy_prep_size = min_t(u64,
739 				le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
740 			kunmap_local(to_buff);
741 			break;
742 		}
743 		default:
744 			/* Optional tag */
745 			break;
746 		}
747 
748 		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
749 		migf->max_pos += migf->record_size;
750 		vhca_buf->length = 0;
751 	}
752 
753 	return 0;
754 }
755 
756 static int
757 mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
758 			  struct mlx5_vhca_data_buffer *vhca_buf,
759 			  const char __user **buf,
760 			  size_t *len, loff_t *pos,
761 			  ssize_t *done, bool *has_work)
762 {
763 	struct page *page;
764 	size_t copy_len;
765 	u8 *to_buff;
766 	int ret;
767 
768 	copy_len = min_t(size_t, *len,
769 		sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
770 	page = mlx5vf_get_migration_page(vhca_buf, 0);
771 	if (!page)
772 		return -EINVAL;
773 	to_buff = kmap_local_page(page);
774 	ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
775 	if (ret) {
776 		ret = -EFAULT;
777 		goto end;
778 	}
779 
780 	*buf += copy_len;
781 	*pos += copy_len;
782 	*done += copy_len;
783 	*len -= copy_len;
784 	vhca_buf->length += copy_len;
785 	if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
786 		u64 record_size;
787 		u32 flags;
788 
789 		record_size = le64_to_cpup((__le64 *)to_buff);
790 		if (record_size > MAX_LOAD_SIZE) {
791 			ret = -ENOMEM;
792 			goto end;
793 		}
794 
795 		migf->record_size = record_size;
796 		flags = le32_to_cpup((__le32 *)(to_buff +
797 			    offsetof(struct mlx5_vf_migration_header, flags)));
798 		migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
799 			    offsetof(struct mlx5_vf_migration_header, tag)));
800 		switch (migf->record_tag) {
801 		case MLX5_MIGF_HEADER_TAG_FW_DATA:
802 			migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
803 			break;
804 		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
805 			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
806 			break;
807 		default:
808 			if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
809 				ret = -EOPNOTSUPP;
810 				goto end;
811 			}
812 			/* We may read and skip this optional record data */
813 			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
814 		}
815 
816 		migf->max_pos += vhca_buf->length;
817 		vhca_buf->length = 0;
818 		*has_work = true;
819 	}
820 end:
821 	kunmap_local(to_buff);
822 	return ret;
823 }
824 
825 static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
826 				   size_t len, loff_t *pos)
827 {
828 	struct mlx5_vf_migration_file *migf = filp->private_data;
829 	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
830 	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
831 	loff_t requested_length;
832 	bool has_work = false;
833 	ssize_t done = 0;
834 	int ret = 0;
835 
836 	if (pos)
837 		return -ESPIPE;
838 	pos = &filp->f_pos;
839 
840 	if (*pos < 0 ||
841 	    check_add_overflow((loff_t)len, *pos, &requested_length))
842 		return -EINVAL;
843 
844 	mutex_lock(&migf->mvdev->state_mutex);
845 	mutex_lock(&migf->lock);
846 	if (migf->state == MLX5_MIGF_STATE_ERROR) {
847 		ret = -ENODEV;
848 		goto out_unlock;
849 	}
850 
851 	while (len || has_work) {
852 		has_work = false;
853 		switch (migf->load_state) {
854 		case MLX5_VF_LOAD_STATE_READ_HEADER:
855 			ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
856 							&buf, &len, pos,
857 							&done, &has_work);
858 			if (ret)
859 				goto out_unlock;
860 			break;
861 		case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
862 			if (vhca_buf_header->allocated_length < migf->record_size) {
863 				mlx5vf_free_data_buffer(vhca_buf_header);
864 
865 				migf->buf_header = mlx5vf_alloc_data_buffer(migf,
866 						migf->record_size, DMA_NONE);
867 				if (IS_ERR(migf->buf_header)) {
868 					ret = PTR_ERR(migf->buf_header);
869 					migf->buf_header = NULL;
870 					goto out_unlock;
871 				}
872 
873 				vhca_buf_header = migf->buf_header;
874 			}
875 
876 			vhca_buf_header->start_pos = migf->max_pos;
877 			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
878 			break;
879 		case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
880 			ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
881 							&buf, &len, pos, &done);
882 			if (ret)
883 				goto out_unlock;
884 			break;
885 		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
886 		{
887 			u64 size = max(migf->record_size,
888 				       migf->stop_copy_prep_size);
889 
890 			if (vhca_buf->allocated_length < size) {
891 				mlx5vf_free_data_buffer(vhca_buf);
892 
893 				migf->buf = mlx5vf_alloc_data_buffer(migf,
894 							size, DMA_TO_DEVICE);
895 				if (IS_ERR(migf->buf)) {
896 					ret = PTR_ERR(migf->buf);
897 					migf->buf = NULL;
898 					goto out_unlock;
899 				}
900 
901 				vhca_buf = migf->buf;
902 			}
903 
904 			vhca_buf->start_pos = migf->max_pos;
905 			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
906 			break;
907 		}
908 		case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
909 			ret = mlx5vf_resume_read_image_no_header(vhca_buf,
910 						requested_length,
911 						&buf, &len, pos, &done);
912 			if (ret)
913 				goto out_unlock;
914 			break;
915 		case MLX5_VF_LOAD_STATE_READ_IMAGE:
916 			ret = mlx5vf_resume_read_image(migf, vhca_buf,
917 						migf->record_size,
918 						&buf, &len, pos, &done, &has_work);
919 			if (ret)
920 				goto out_unlock;
921 			break;
922 		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
923 			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
924 			if (ret)
925 				goto out_unlock;
926 			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
927 
928 			/* prep header buf for next image */
929 			vhca_buf_header->length = 0;
930 			/* prep data buf for next image */
931 			vhca_buf->length = 0;
932 
933 			break;
934 		default:
935 			break;
936 		}
937 	}
938 
939 out_unlock:
940 	if (ret)
941 		migf->state = MLX5_MIGF_STATE_ERROR;
942 	mutex_unlock(&migf->lock);
943 	mlx5vf_state_mutex_unlock(migf->mvdev);
944 	return ret ? ret : done;
945 }
946 
947 static const struct file_operations mlx5vf_resume_fops = {
948 	.owner = THIS_MODULE,
949 	.write = mlx5vf_resume_write,
950 	.release = mlx5vf_release_file,
951 	.llseek = no_llseek,
952 };
953 
954 static struct mlx5_vf_migration_file *
955 mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
956 {
957 	struct mlx5_vf_migration_file *migf;
958 	struct mlx5_vhca_data_buffer *buf;
959 	int ret;
960 
961 	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
962 	if (!migf)
963 		return ERR_PTR(-ENOMEM);
964 
965 	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
966 					O_WRONLY);
967 	if (IS_ERR(migf->filp)) {
968 		ret = PTR_ERR(migf->filp);
969 		goto end;
970 	}
971 
972 	migf->mvdev = mvdev;
973 	ret = mlx5vf_cmd_alloc_pd(migf);
974 	if (ret)
975 		goto out_free;
976 
977 	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
978 	if (IS_ERR(buf)) {
979 		ret = PTR_ERR(buf);
980 		goto out_pd;
981 	}
982 
983 	migf->buf = buf;
984 	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
985 		buf = mlx5vf_alloc_data_buffer(migf,
986 			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
987 		if (IS_ERR(buf)) {
988 			ret = PTR_ERR(buf);
989 			goto out_buf;
990 		}
991 
992 		migf->buf_header = buf;
993 		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
994 	} else {
995 		/* Initial state will be to read the image */
996 		migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
997 	}
998 
999 	stream_open(migf->filp->f_inode, migf->filp);
1000 	mutex_init(&migf->lock);
1001 	INIT_LIST_HEAD(&migf->buf_list);
1002 	INIT_LIST_HEAD(&migf->avail_list);
1003 	spin_lock_init(&migf->list_lock);
1004 	return migf;
1005 out_buf:
1006 	mlx5vf_free_data_buffer(migf->buf);
1007 out_pd:
1008 	mlx5vf_cmd_dealloc_pd(migf);
1009 out_free:
1010 	fput(migf->filp);
1011 end:
1012 	kfree(migf);
1013 	return ERR_PTR(ret);
1014 }
1015 
1016 void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
1017 {
1018 	if (mvdev->resuming_migf) {
1019 		mlx5vf_disable_fd(mvdev->resuming_migf);
1020 		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
1021 		fput(mvdev->resuming_migf->filp);
1022 		mvdev->resuming_migf = NULL;
1023 	}
1024 	if (mvdev->saving_migf) {
1025 		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
1026 		cancel_work_sync(&mvdev->saving_migf->async_data.work);
1027 		mlx5vf_disable_fd(mvdev->saving_migf);
1028 		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
1029 		fput(mvdev->saving_migf->filp);
1030 		mvdev->saving_migf = NULL;
1031 	}
1032 }
1033 
1034 static struct file *
1035 mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
1036 				    u32 new)
1037 {
1038 	u32 cur = mvdev->mig_state;
1039 	int ret;
1040 
1041 	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
1042 		ret = mlx5vf_cmd_suspend_vhca(mvdev,
1043 			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
1044 		if (ret)
1045 			return ERR_PTR(ret);
1046 		return NULL;
1047 	}
1048 
1049 	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
1050 		ret = mlx5vf_cmd_resume_vhca(mvdev,
1051 			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
1052 		if (ret)
1053 			return ERR_PTR(ret);
1054 		return NULL;
1055 	}
1056 
1057 	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
1058 	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
1059 		ret = mlx5vf_cmd_suspend_vhca(mvdev,
1060 			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
1061 		if (ret)
1062 			return ERR_PTR(ret);
1063 		return NULL;
1064 	}
1065 
1066 	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
1067 	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
1068 		ret = mlx5vf_cmd_resume_vhca(mvdev,
1069 			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
1070 		if (ret)
1071 			return ERR_PTR(ret);
1072 		return NULL;
1073 	}
1074 
1075 	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
1076 		struct mlx5_vf_migration_file *migf;
1077 
1078 		migf = mlx5vf_pci_save_device_data(mvdev, false);
1079 		if (IS_ERR(migf))
1080 			return ERR_CAST(migf);
1081 		get_file(migf->filp);
1082 		mvdev->saving_migf = migf;
1083 		return migf->filp;
1084 	}
1085 
1086 	if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
1087 	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
1088 	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
1089 	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
1090 		mlx5vf_disable_fds(mvdev);
1091 		return NULL;
1092 	}
1093 
1094 	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
1095 		struct mlx5_vf_migration_file *migf;
1096 
1097 		migf = mlx5vf_pci_resume_device_data(mvdev);
1098 		if (IS_ERR(migf))
1099 			return ERR_CAST(migf);
1100 		get_file(migf->filp);
1101 		mvdev->resuming_migf = migf;
1102 		return migf->filp;
1103 	}
1104 
1105 	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
1106 		if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
1107 			ret = mlx5vf_cmd_load_vhca_state(mvdev,
1108 							 mvdev->resuming_migf,
1109 							 mvdev->resuming_migf->buf);
1110 			if (ret)
1111 				return ERR_PTR(ret);
1112 		}
1113 		mlx5vf_disable_fds(mvdev);
1114 		return NULL;
1115 	}
1116 
1117 	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
1118 	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
1119 	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
1120 		struct mlx5_vf_migration_file *migf;
1121 
1122 		migf = mlx5vf_pci_save_device_data(mvdev, true);
1123 		if (IS_ERR(migf))
1124 			return ERR_CAST(migf);
1125 		get_file(migf->filp);
1126 		mvdev->saving_migf = migf;
1127 		return migf->filp;
1128 	}
1129 
1130 	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
1131 		ret = mlx5vf_cmd_suspend_vhca(mvdev,
1132 			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
1133 		if (ret)
1134 			return ERR_PTR(ret);
1135 		ret = mlx5vf_pci_save_device_inc_data(mvdev);
1136 		return ret ? ERR_PTR(ret) : NULL;
1137 	}
1138 
1139 	/*
1140 	 * vfio_mig_get_next_state() does not use arcs other than the above
1141 	 */
1142 	WARN_ON(true);
1143 	return ERR_PTR(-EINVAL);
1144 }
1145 
1146 /*
1147  * This function is called in all state_mutex unlock cases to
1148  * handle a 'deferred_reset' if exists.
1149  */
1150 void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
1151 {
1152 again:
1153 	spin_lock(&mvdev->reset_lock);
1154 	if (mvdev->deferred_reset) {
1155 		mvdev->deferred_reset = false;
1156 		spin_unlock(&mvdev->reset_lock);
1157 		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1158 		mlx5vf_disable_fds(mvdev);
1159 		goto again;
1160 	}
1161 	mutex_unlock(&mvdev->state_mutex);
1162 	spin_unlock(&mvdev->reset_lock);
1163 }
1164 
1165 static struct file *
1166 mlx5vf_pci_set_device_state(struct vfio_device *vdev,
1167 			    enum vfio_device_mig_state new_state)
1168 {
1169 	struct mlx5vf_pci_core_device *mvdev = container_of(
1170 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1171 	enum vfio_device_mig_state next_state;
1172 	struct file *res = NULL;
1173 	int ret;
1174 
1175 	mutex_lock(&mvdev->state_mutex);
1176 	while (new_state != mvdev->mig_state) {
1177 		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
1178 					      new_state, &next_state);
1179 		if (ret) {
1180 			res = ERR_PTR(ret);
1181 			break;
1182 		}
1183 		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
1184 		if (IS_ERR(res))
1185 			break;
1186 		mvdev->mig_state = next_state;
1187 		if (WARN_ON(res && new_state != mvdev->mig_state)) {
1188 			fput(res);
1189 			res = ERR_PTR(-EINVAL);
1190 			break;
1191 		}
1192 	}
1193 	mlx5vf_state_mutex_unlock(mvdev);
1194 	return res;
1195 }
1196 
1197 static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
1198 				    unsigned long *stop_copy_length)
1199 {
1200 	struct mlx5vf_pci_core_device *mvdev = container_of(
1201 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1202 	size_t state_size;
1203 	int ret;
1204 
1205 	mutex_lock(&mvdev->state_mutex);
1206 	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
1207 						    &state_size, 0);
1208 	if (!ret)
1209 		*stop_copy_length = state_size;
1210 	mlx5vf_state_mutex_unlock(mvdev);
1211 	return ret;
1212 }
1213 
1214 static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
1215 				       enum vfio_device_mig_state *curr_state)
1216 {
1217 	struct mlx5vf_pci_core_device *mvdev = container_of(
1218 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1219 
1220 	mutex_lock(&mvdev->state_mutex);
1221 	*curr_state = mvdev->mig_state;
1222 	mlx5vf_state_mutex_unlock(mvdev);
1223 	return 0;
1224 }
1225 
1226 static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
1227 {
1228 	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1229 
1230 	if (!mvdev->migrate_cap)
1231 		return;
1232 
1233 	/*
1234 	 * As the higher VFIO layers are holding locks across reset and using
1235 	 * those same locks with the mm_lock we need to prevent ABBA deadlock
1236 	 * with the state_mutex and mm_lock.
1237 	 * In case the state_mutex was taken already we defer the cleanup work
1238 	 * to the unlock flow of the other running context.
1239 	 */
1240 	spin_lock(&mvdev->reset_lock);
1241 	mvdev->deferred_reset = true;
1242 	if (!mutex_trylock(&mvdev->state_mutex)) {
1243 		spin_unlock(&mvdev->reset_lock);
1244 		return;
1245 	}
1246 	spin_unlock(&mvdev->reset_lock);
1247 	mlx5vf_state_mutex_unlock(mvdev);
1248 }
1249 
1250 static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
1251 {
1252 	struct mlx5vf_pci_core_device *mvdev = container_of(
1253 		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1254 	struct vfio_pci_core_device *vdev = &mvdev->core_device;
1255 	int ret;
1256 
1257 	ret = vfio_pci_core_enable(vdev);
1258 	if (ret)
1259 		return ret;
1260 
1261 	if (mvdev->migrate_cap)
1262 		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1263 	vfio_pci_core_finish_enable(vdev);
1264 	return 0;
1265 }
1266 
1267 static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
1268 {
1269 	struct mlx5vf_pci_core_device *mvdev = container_of(
1270 		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1271 
1272 	mlx5vf_cmd_close_migratable(mvdev);
1273 	vfio_pci_core_close_device(core_vdev);
1274 }
1275 
1276 static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
1277 	.migration_set_state = mlx5vf_pci_set_device_state,
1278 	.migration_get_state = mlx5vf_pci_get_device_state,
1279 	.migration_get_data_size = mlx5vf_pci_get_data_size,
1280 };
1281 
1282 static const struct vfio_log_ops mlx5vf_pci_log_ops = {
1283 	.log_start = mlx5vf_start_page_tracker,
1284 	.log_stop = mlx5vf_stop_page_tracker,
1285 	.log_read_and_clear = mlx5vf_tracker_read_and_clear,
1286 };
1287 
1288 static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
1289 {
1290 	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1291 			struct mlx5vf_pci_core_device, core_device.vdev);
1292 	int ret;
1293 
1294 	ret = vfio_pci_core_init_dev(core_vdev);
1295 	if (ret)
1296 		return ret;
1297 
1298 	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
1299 				  &mlx5vf_pci_log_ops);
1300 
1301 	return 0;
1302 }
1303 
1304 static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
1305 {
1306 	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1307 			struct mlx5vf_pci_core_device, core_device.vdev);
1308 
1309 	mlx5vf_cmd_remove_migratable(mvdev);
1310 	vfio_pci_core_release_dev(core_vdev);
1311 }
1312 
1313 static const struct vfio_device_ops mlx5vf_pci_ops = {
1314 	.name = "mlx5-vfio-pci",
1315 	.init = mlx5vf_pci_init_dev,
1316 	.release = mlx5vf_pci_release_dev,
1317 	.open_device = mlx5vf_pci_open_device,
1318 	.close_device = mlx5vf_pci_close_device,
1319 	.ioctl = vfio_pci_core_ioctl,
1320 	.device_feature = vfio_pci_core_ioctl_feature,
1321 	.read = vfio_pci_core_read,
1322 	.write = vfio_pci_core_write,
1323 	.mmap = vfio_pci_core_mmap,
1324 	.request = vfio_pci_core_request,
1325 	.match = vfio_pci_core_match,
1326 	.bind_iommufd = vfio_iommufd_physical_bind,
1327 	.unbind_iommufd = vfio_iommufd_physical_unbind,
1328 	.attach_ioas = vfio_iommufd_physical_attach_ioas,
1329 };
1330 
1331 static int mlx5vf_pci_probe(struct pci_dev *pdev,
1332 			    const struct pci_device_id *id)
1333 {
1334 	struct mlx5vf_pci_core_device *mvdev;
1335 	int ret;
1336 
1337 	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
1338 				  &pdev->dev, &mlx5vf_pci_ops);
1339 	if (IS_ERR(mvdev))
1340 		return PTR_ERR(mvdev);
1341 
1342 	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
1343 	ret = vfio_pci_core_register_device(&mvdev->core_device);
1344 	if (ret)
1345 		goto out_put_vdev;
1346 	return 0;
1347 
1348 out_put_vdev:
1349 	vfio_put_device(&mvdev->core_device.vdev);
1350 	return ret;
1351 }
1352 
1353 static void mlx5vf_pci_remove(struct pci_dev *pdev)
1354 {
1355 	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1356 
1357 	vfio_pci_core_unregister_device(&mvdev->core_device);
1358 	vfio_put_device(&mvdev->core_device.vdev);
1359 }
1360 
1361 static const struct pci_device_id mlx5vf_pci_table[] = {
1362 	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
1363 	{}
1364 };
1365 
1366 MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
1367 
1368 static const struct pci_error_handlers mlx5vf_err_handlers = {
1369 	.reset_done = mlx5vf_pci_aer_reset_done,
1370 	.error_detected = vfio_pci_core_aer_err_detected,
1371 };
1372 
1373 static struct pci_driver mlx5vf_pci_driver = {
1374 	.name = KBUILD_MODNAME,
1375 	.id_table = mlx5vf_pci_table,
1376 	.probe = mlx5vf_pci_probe,
1377 	.remove = mlx5vf_pci_remove,
1378 	.err_handler = &mlx5vf_err_handlers,
1379 	.driver_managed_dma = true,
1380 };
1381 
1382 module_pci_driver(mlx5vf_pci_driver);
1383 
1384 MODULE_LICENSE("GPL");
1385 MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
1386 MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
1387 MODULE_DESCRIPTION(
1388 	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");
1389