xref: /openbmc/linux/drivers/vfio/pci/mlx5/main.c (revision 0dce165b)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4  */
5 
6 #include <linux/device.h>
7 #include <linux/eventfd.h>
8 #include <linux/file.h>
9 #include <linux/interrupt.h>
10 #include <linux/iommu.h>
11 #include <linux/module.h>
12 #include <linux/mutex.h>
13 #include <linux/notifier.h>
14 #include <linux/pci.h>
15 #include <linux/pm_runtime.h>
16 #include <linux/types.h>
17 #include <linux/uaccess.h>
18 #include <linux/vfio.h>
19 #include <linux/sched/mm.h>
20 #include <linux/anon_inodes.h>
21 
22 #include "cmd.h"
23 
24 /* Arbitrary limit to prevent userspace from consuming endless memory */
25 #define MAX_MIGRATION_SIZE (512*1024*1024)
26 
27 static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
28 {
29 	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
30 
31 	return container_of(core_device, struct mlx5vf_pci_core_device,
32 			    core_device);
33 }
34 
35 struct page *
36 mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
37 			  unsigned long offset)
38 {
39 	unsigned long cur_offset = 0;
40 	struct scatterlist *sg;
41 	unsigned int i;
42 
43 	/* All accesses are sequential */
44 	if (offset < buf->last_offset || !buf->last_offset_sg) {
45 		buf->last_offset = 0;
46 		buf->last_offset_sg = buf->table.sgt.sgl;
47 		buf->sg_last_entry = 0;
48 	}
49 
50 	cur_offset = buf->last_offset;
51 
52 	for_each_sg(buf->last_offset_sg, sg,
53 			buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
54 		if (offset < sg->length + cur_offset) {
55 			buf->last_offset_sg = sg;
56 			buf->sg_last_entry += i;
57 			buf->last_offset = cur_offset;
58 			return nth_page(sg_page(sg),
59 					(offset - cur_offset) / PAGE_SIZE);
60 		}
61 		cur_offset += sg->length;
62 	}
63 	return NULL;
64 }
65 
66 int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
67 			       unsigned int npages)
68 {
69 	unsigned int to_alloc = npages;
70 	struct page **page_list;
71 	unsigned long filled;
72 	unsigned int to_fill;
73 	int ret;
74 
75 	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
76 	page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL);
77 	if (!page_list)
78 		return -ENOMEM;
79 
80 	do {
81 		filled = alloc_pages_bulk_array(GFP_KERNEL, to_fill, page_list);
82 		if (!filled) {
83 			ret = -ENOMEM;
84 			goto err;
85 		}
86 		to_alloc -= filled;
87 		ret = sg_alloc_append_table_from_pages(
88 			&buf->table, page_list, filled, 0,
89 			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
90 			GFP_KERNEL);
91 
92 		if (ret)
93 			goto err;
94 		buf->allocated_length += filled * PAGE_SIZE;
95 		/* clean input for another bulk allocation */
96 		memset(page_list, 0, filled * sizeof(*page_list));
97 		to_fill = min_t(unsigned int, to_alloc,
98 				PAGE_SIZE / sizeof(*page_list));
99 	} while (to_alloc > 0);
100 
101 	kvfree(page_list);
102 	return 0;
103 
104 err:
105 	kvfree(page_list);
106 	return ret;
107 }
108 
109 static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
110 {
111 	mutex_lock(&migf->lock);
112 	migf->state = MLX5_MIGF_STATE_ERROR;
113 	migf->filp->f_pos = 0;
114 	mutex_unlock(&migf->lock);
115 }
116 
117 static int mlx5vf_release_file(struct inode *inode, struct file *filp)
118 {
119 	struct mlx5_vf_migration_file *migf = filp->private_data;
120 
121 	mlx5vf_disable_fd(migf);
122 	mutex_destroy(&migf->lock);
123 	kfree(migf);
124 	return 0;
125 }
126 
127 static struct mlx5_vhca_data_buffer *
128 mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
129 			      bool *end_of_data)
130 {
131 	struct mlx5_vhca_data_buffer *buf;
132 	bool found = false;
133 
134 	*end_of_data = false;
135 	spin_lock_irq(&migf->list_lock);
136 	if (list_empty(&migf->buf_list)) {
137 		*end_of_data = true;
138 		goto end;
139 	}
140 
141 	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
142 			       buf_elm);
143 	if (pos >= buf->start_pos &&
144 	    pos < buf->start_pos + buf->length) {
145 		found = true;
146 		goto end;
147 	}
148 
149 	/*
150 	 * Since this is a stream-based FD, the data is expected to always be
151 	 * in the first chunk.
152 	 */
153 	migf->state = MLX5_MIGF_STATE_ERROR;
154 
155 end:
156 	spin_unlock_irq(&migf->list_lock);
157 	return found ? buf : NULL;
158 }
159 
160 static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
161 			       char __user **buf, size_t *len, loff_t *pos)
162 {
163 	unsigned long offset;
164 	ssize_t done = 0;
165 	size_t copy_len;
166 
167 	copy_len = min_t(size_t,
168 			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
169 	while (copy_len) {
170 		size_t page_offset;
171 		struct page *page;
172 		size_t page_len;
173 		u8 *from_buff;
174 		int ret;
175 
176 		offset = *pos - vhca_buf->start_pos;
177 		page_offset = offset % PAGE_SIZE;
178 		offset -= page_offset;
179 		page = mlx5vf_get_migration_page(vhca_buf, offset);
180 		if (!page)
181 			return -EINVAL;
182 		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
183 		from_buff = kmap_local_page(page);
184 		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
185 		kunmap_local(from_buff);
186 		if (ret)
187 			return -EFAULT;
188 		*pos += page_len;
189 		*len -= page_len;
190 		*buf += page_len;
191 		done += page_len;
192 		copy_len -= page_len;
193 	}
194 
195 	if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
196 		spin_lock_irq(&vhca_buf->migf->list_lock);
197 		list_del_init(&vhca_buf->buf_elm);
198 		list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
199 		spin_unlock_irq(&vhca_buf->migf->list_lock);
200 	}
201 
202 	return done;
203 }
204 
205 static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
206 			       loff_t *pos)
207 {
208 	struct mlx5_vf_migration_file *migf = filp->private_data;
209 	struct mlx5_vhca_data_buffer *vhca_buf;
210 	bool first_loop_call = true;
211 	bool end_of_data;
212 	ssize_t done = 0;
213 
214 	if (pos)
215 		return -ESPIPE;
216 	pos = &filp->f_pos;
217 
218 	if (!(filp->f_flags & O_NONBLOCK)) {
219 		if (wait_event_interruptible(migf->poll_wait,
220 				!list_empty(&migf->buf_list) ||
221 				migf->state == MLX5_MIGF_STATE_ERROR ||
222 				migf->state == MLX5_MIGF_STATE_COMPLETE))
223 			return -ERESTARTSYS;
224 	}
225 
226 	mutex_lock(&migf->lock);
227 	if (migf->state == MLX5_MIGF_STATE_ERROR) {
228 		done = -ENODEV;
229 		goto out_unlock;
230 	}
231 
232 	while (len) {
233 		ssize_t count;
234 
235 		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
236 							 &end_of_data);
237 		if (first_loop_call) {
238 			first_loop_call = false;
239 			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
240 				if (filp->f_flags & O_NONBLOCK) {
241 					done = -EAGAIN;
242 					goto out_unlock;
243 				}
244 			}
245 		}
246 
247 		if (end_of_data)
248 			goto out_unlock;
249 
250 		if (!vhca_buf) {
251 			done = -EINVAL;
252 			goto out_unlock;
253 		}
254 
255 		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
256 		if (count < 0) {
257 			done = count;
258 			goto out_unlock;
259 		}
260 		done += count;
261 	}
262 
263 out_unlock:
264 	mutex_unlock(&migf->lock);
265 	return done;
266 }
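
/*
 * Hypothetical userspace sketch, not part of this driver: drain the saving FD
 * that the STOP_COPY arc returns. poll() mirrors mlx5vf_save_poll() below:
 * EPOLLIN is raised once data is queued or the save completed, and read()
 * returning 0 means the whole device image has been consumed. The function
 * and buffer names are illustrative assumptions.
 */
#include <errno.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

static ssize_t drain_save_fd(int save_fd, FILE *out)
{
	char chunk[65536];
	ssize_t total = 0;

	for (;;) {
		struct pollfd pfd = { .fd = save_fd, .events = POLLIN };
		ssize_t n;

		if (poll(&pfd, 1, -1) < 0)
			return -1;
		n = read(save_fd, chunk, sizeof(chunk));
		if (n < 0) {
			if (errno == EAGAIN || errno == EINTR)
				continue;	/* O_NONBLOCK or interrupted */
			return -1;
		}
		if (n == 0)
			return total;		/* MLX5_MIGF_STATE_COMPLETE */
		if (fwrite(chunk, 1, n, out) != (size_t)n)
			return -1;
		total += n;
	}
}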
267 
268 static __poll_t mlx5vf_save_poll(struct file *filp,
269 				 struct poll_table_struct *wait)
270 {
271 	struct mlx5_vf_migration_file *migf = filp->private_data;
272 	__poll_t pollflags = 0;
273 
274 	poll_wait(filp, &migf->poll_wait, wait);
275 
276 	mutex_lock(&migf->lock);
277 	if (migf->state == MLX5_MIGF_STATE_ERROR)
278 		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
279 	else if (!list_empty(&migf->buf_list) ||
280 		 migf->state == MLX5_MIGF_STATE_COMPLETE)
281 		pollflags = EPOLLIN | EPOLLRDNORM;
282 	mutex_unlock(&migf->lock);
283 
284 	return pollflags;
285 }
286 
287 /*
288  * The FD is exposed and userspace may keep using it after receiving an error.
289  * Mark the migf as being in error and wake up the user.
290  */
291 static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
292 {
293 	migf->state = MLX5_MIGF_STATE_ERROR;
294 	wake_up_interruptible(&migf->poll_wait);
295 }
296 
297 static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
298 				 unsigned long arg)
299 {
300 	struct mlx5_vf_migration_file *migf = filp->private_data;
301 	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
302 	struct mlx5_vhca_data_buffer *buf;
303 	struct vfio_precopy_info info = {};
304 	loff_t *pos = &filp->f_pos;
305 	unsigned long minsz;
306 	size_t inc_length = 0;
307 	bool end_of_data;
308 	int ret;
309 
310 	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
311 		return -ENOTTY;
312 
313 	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
314 
315 	if (copy_from_user(&info, (void __user *)arg, minsz))
316 		return -EFAULT;
317 
318 	if (info.argsz < minsz)
319 		return -EINVAL;
320 
321 	mutex_lock(&mvdev->state_mutex);
322 	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
323 	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
324 		ret = -EINVAL;
325 		goto err_state_unlock;
326 	}
327 
328 	/*
329 	 * We can't issue a SAVE command while the device is suspended, so as
330 	 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P there is no reason to query
331 	 * for extra bytes that can't be read anyway.
332 	 */
333 	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
334 		/*
335 		 * Once the query returns, it is guaranteed that there is no
336 		 * active SAVE command.
337 		 * Hence, the code below is safe given the proper locks.
338 		 */
339 		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
340 							    MLX5VF_QUERY_INC);
341 		if (ret)
342 			goto err_state_unlock;
343 	}
344 
345 	mutex_lock(&migf->lock);
346 	if (migf->state == MLX5_MIGF_STATE_ERROR) {
347 		ret = -ENODEV;
348 		goto err_migf_unlock;
349 	}
350 
351 	buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data);
352 	if (buf) {
353 		if (buf->start_pos == 0) {
354 			info.initial_bytes = buf->header_image_size - *pos;
355 		} else if (buf->start_pos ==
356 				sizeof(struct mlx5_vf_migration_header)) {
357 			/* First data buffer following the header */
358 			info.initial_bytes = buf->start_pos +
359 						buf->length - *pos;
360 		} else {
361 			info.dirty_bytes = buf->start_pos + buf->length - *pos;
362 		}
363 	} else {
364 		if (!end_of_data) {
365 			ret = -EINVAL;
366 			goto err_migf_unlock;
367 		}
368 
369 		info.dirty_bytes = inc_length;
370 	}
371 
372 	if (!end_of_data || !inc_length) {
373 		mutex_unlock(&migf->lock);
374 		goto done;
375 	}
376 
377 	mutex_unlock(&migf->lock);
378 	/*
379 	 * The current state was fully transferred and the device has extra
380 	 * dirty state; save a new state so it is ready to be read.
381 	 */
382 	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
383 	if (IS_ERR(buf)) {
384 		ret = PTR_ERR(buf);
385 		mlx5vf_mark_err(migf);
386 		goto err_state_unlock;
387 	}
388 
389 	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
390 	if (ret) {
391 		mlx5vf_mark_err(migf);
392 		mlx5vf_put_data_buffer(buf);
393 		goto err_state_unlock;
394 	}
395 
396 done:
397 	mlx5vf_state_mutex_unlock(mvdev);
398 	return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
399 err_migf_unlock:
400 	mutex_unlock(&migf->lock);
401 err_state_unlock:
402 	mlx5vf_state_mutex_unlock(mvdev);
403 	return ret;
404 }
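
/*
 * Hypothetical userspace sketch, not part of this driver: while the device is
 * in PRE_COPY(_P2P), ask the saving FD how many bytes are immediately readable
 * (initial_bytes) and how much extra dirty device state has accumulated
 * (dirty_bytes). The wrapper name is an illustrative assumption.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int query_precopy(int save_fd, uint64_t *initial, uint64_t *dirty)
{
	struct vfio_precopy_info info = {};

	info.argsz = sizeof(info);
	if (ioctl(save_fd, VFIO_MIG_GET_PRECOPY_INFO, &info))
		return -1;
	*initial = info.initial_bytes;
	*dirty = info.dirty_bytes;
	return 0;
}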
405 
406 static const struct file_operations mlx5vf_save_fops = {
407 	.owner = THIS_MODULE,
408 	.read = mlx5vf_save_read,
409 	.poll = mlx5vf_save_poll,
410 	.unlocked_ioctl = mlx5vf_precopy_ioctl,
411 	.compat_ioctl = compat_ptr_ioctl,
412 	.release = mlx5vf_release_file,
413 	.llseek = no_llseek,
414 };
415 
416 static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
417 {
418 	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
419 	struct mlx5_vhca_data_buffer *buf;
420 	size_t length;
421 	int ret;
422 
423 	if (migf->state == MLX5_MIGF_STATE_ERROR)
424 		return -ENODEV;
425 
426 	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
427 						    MLX5VF_QUERY_INC);
428 	if (ret)
429 		goto err;
430 
431 	buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
432 	if (IS_ERR(buf)) {
433 		ret = PTR_ERR(buf);
434 		goto err;
435 	}
436 
437 	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
438 	if (ret)
439 		goto err_save;
440 
441 	return 0;
442 
443 err_save:
444 	mlx5vf_put_data_buffer(buf);
445 err:
446 	mlx5vf_mark_err(migf);
447 	return ret;
448 }
449 
450 static struct mlx5_vf_migration_file *
451 mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
452 {
453 	struct mlx5_vf_migration_file *migf;
454 	struct mlx5_vhca_data_buffer *buf;
455 	size_t length;
456 	int ret;
457 
458 	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
459 	if (!migf)
460 		return ERR_PTR(-ENOMEM);
461 
462 	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
463 					O_RDONLY);
464 	if (IS_ERR(migf->filp)) {
465 		ret = PTR_ERR(migf->filp);
466 		goto end;
467 	}
468 
469 	migf->mvdev = mvdev;
470 	ret = mlx5vf_cmd_alloc_pd(migf);
471 	if (ret)
472 		goto out_free;
473 
474 	stream_open(migf->filp->f_inode, migf->filp);
475 	mutex_init(&migf->lock);
476 	init_waitqueue_head(&migf->poll_wait);
477 	init_completion(&migf->save_comp);
478 	/*
479 	 * save_comp is being used as a binary semaphore built from
480 	 * a completion. A normal mutex cannot be used because the lock is
481 	 * passed between kernel threads and lockdep can't model this.
482 	 */
483 	complete(&migf->save_comp);
484 	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
485 	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
486 	INIT_LIST_HEAD(&migf->buf_list);
487 	INIT_LIST_HEAD(&migf->avail_list);
488 	spin_lock_init(&migf->list_lock);
489 	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
490 	if (ret)
491 		goto out_pd;
492 
493 	buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
494 	if (IS_ERR(buf)) {
495 		ret = PTR_ERR(buf);
496 		goto out_pd;
497 	}
498 
499 	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
500 	if (ret)
501 		goto out_save;
502 	return migf;
503 out_save:
504 	mlx5vf_free_data_buffer(buf);
505 out_pd:
506 	mlx5vf_cmd_dealloc_pd(migf);
507 out_free:
508 	fput(migf->filp);
509 end:
510 	kfree(migf);
511 	return ERR_PTR(ret);
512 }
513 
514 static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
515 				   size_t len, loff_t *pos)
516 {
517 	struct mlx5_vf_migration_file *migf = filp->private_data;
518 	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
519 	loff_t requested_length;
520 	ssize_t done = 0;
521 
522 	if (pos)
523 		return -ESPIPE;
524 	pos = &filp->f_pos;
525 
526 	if (*pos < 0 ||
527 	    check_add_overflow((loff_t)len, *pos, &requested_length))
528 		return -EINVAL;
529 
530 	if (requested_length > MAX_MIGRATION_SIZE)
531 		return -ENOMEM;
532 
533 	mutex_lock(&migf->lock);
534 	if (migf->state == MLX5_MIGF_STATE_ERROR) {
535 		done = -ENODEV;
536 		goto out_unlock;
537 	}
538 
539 	if (vhca_buf->allocated_length < requested_length) {
540 		done = mlx5vf_add_migration_pages(
541 			vhca_buf,
542 			DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
543 				     PAGE_SIZE));
544 		if (done)
545 			goto out_unlock;
546 	}
547 
548 	while (len) {
549 		size_t page_offset;
550 		struct page *page;
551 		size_t page_len;
552 		u8 *to_buff;
553 		int ret;
554 
555 		page_offset = (*pos) % PAGE_SIZE;
556 		page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset);
557 		if (!page) {
558 			if (done == 0)
559 				done = -EINVAL;
560 			goto out_unlock;
561 		}
562 
563 		page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
564 		to_buff = kmap_local_page(page);
565 		ret = copy_from_user(to_buff + page_offset, buf, page_len);
566 		kunmap_local(to_buff);
567 		if (ret) {
568 			done = -EFAULT;
569 			goto out_unlock;
570 		}
571 		*pos += page_len;
572 		len -= page_len;
573 		done += page_len;
574 		buf += page_len;
575 		vhca_buf->length += page_len;
576 	}
577 out_unlock:
578 	mutex_unlock(&migf->lock);
579 	return done;
580 }
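
/*
 * Hypothetical userspace sketch, not part of this driver: push a previously
 * saved device image into the resuming FD while the device is in RESUMING.
 * Partial writes are possible, so loop until the whole image is written. The
 * function name is an illustrative assumption.
 */
#include <errno.h>
#include <stddef.h>
#include <unistd.h>

static int load_resume_fd(int resume_fd, const char *image, size_t len)
{
	size_t off = 0;

	while (off < len) {
		ssize_t n = write(resume_fd, image + off, len - off);

		if (n < 0) {
			if (errno == EINTR)
				continue;
			return -1;	/* e.g. ENOMEM once past MAX_MIGRATION_SIZE */
		}
		off += n;
	}
	return 0;
}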
581 
582 static const struct file_operations mlx5vf_resume_fops = {
583 	.owner = THIS_MODULE,
584 	.write = mlx5vf_resume_write,
585 	.release = mlx5vf_release_file,
586 	.llseek = no_llseek,
587 };
588 
589 static struct mlx5_vf_migration_file *
590 mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
591 {
592 	struct mlx5_vf_migration_file *migf;
593 	struct mlx5_vhca_data_buffer *buf;
594 	int ret;
595 
596 	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
597 	if (!migf)
598 		return ERR_PTR(-ENOMEM);
599 
600 	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
601 					O_WRONLY);
602 	if (IS_ERR(migf->filp)) {
603 		ret = PTR_ERR(migf->filp);
604 		goto end;
605 	}
606 
607 	migf->mvdev = mvdev;
608 	ret = mlx5vf_cmd_alloc_pd(migf);
609 	if (ret)
610 		goto out_free;
611 
612 	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
613 	if (IS_ERR(buf)) {
614 		ret = PTR_ERR(buf);
615 		goto out_pd;
616 	}
617 
618 	migf->buf = buf;
619 	stream_open(migf->filp->f_inode, migf->filp);
620 	mutex_init(&migf->lock);
621 	INIT_LIST_HEAD(&migf->buf_list);
622 	INIT_LIST_HEAD(&migf->avail_list);
623 	spin_lock_init(&migf->list_lock);
624 	return migf;
625 out_pd:
626 	mlx5vf_cmd_dealloc_pd(migf);
627 out_free:
628 	fput(migf->filp);
629 end:
630 	kfree(migf);
631 	return ERR_PTR(ret);
632 }
633 
634 void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
635 {
636 	if (mvdev->resuming_migf) {
637 		mlx5vf_disable_fd(mvdev->resuming_migf);
638 		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
639 		fput(mvdev->resuming_migf->filp);
640 		mvdev->resuming_migf = NULL;
641 	}
642 	if (mvdev->saving_migf) {
643 		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
644 		cancel_work_sync(&mvdev->saving_migf->async_data.work);
645 		mlx5vf_disable_fd(mvdev->saving_migf);
646 		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
647 		fput(mvdev->saving_migf->filp);
648 		mvdev->saving_migf = NULL;
649 	}
650 }
651 
652 static struct file *
653 mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
654 				    u32 new)
655 {
656 	u32 cur = mvdev->mig_state;
657 	int ret;
658 
659 	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
660 		ret = mlx5vf_cmd_suspend_vhca(mvdev,
661 			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
662 		if (ret)
663 			return ERR_PTR(ret);
664 		return NULL;
665 	}
666 
667 	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
668 		ret = mlx5vf_cmd_resume_vhca(mvdev,
669 			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
670 		if (ret)
671 			return ERR_PTR(ret);
672 		return NULL;
673 	}
674 
675 	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
676 	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
677 		ret = mlx5vf_cmd_suspend_vhca(mvdev,
678 			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
679 		if (ret)
680 			return ERR_PTR(ret);
681 		return NULL;
682 	}
683 
684 	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
685 	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
686 		ret = mlx5vf_cmd_resume_vhca(mvdev,
687 			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
688 		if (ret)
689 			return ERR_PTR(ret);
690 		return NULL;
691 	}
692 
693 	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
694 		struct mlx5_vf_migration_file *migf;
695 
696 		migf = mlx5vf_pci_save_device_data(mvdev, false);
697 		if (IS_ERR(migf))
698 			return ERR_CAST(migf);
699 		get_file(migf->filp);
700 		mvdev->saving_migf = migf;
701 		return migf->filp;
702 	}
703 
704 	if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
705 	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
706 	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
707 	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
708 		mlx5vf_disable_fds(mvdev);
709 		return NULL;
710 	}
711 
712 	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
713 		struct mlx5_vf_migration_file *migf;
714 
715 		migf = mlx5vf_pci_resume_device_data(mvdev);
716 		if (IS_ERR(migf))
717 			return ERR_CAST(migf);
718 		get_file(migf->filp);
719 		mvdev->resuming_migf = migf;
720 		return migf->filp;
721 	}
722 
723 	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
724 		ret = mlx5vf_cmd_load_vhca_state(mvdev,
725 						 mvdev->resuming_migf,
726 						 mvdev->resuming_migf->buf);
727 		if (ret)
728 			return ERR_PTR(ret);
729 		mlx5vf_disable_fds(mvdev);
730 		return NULL;
731 	}
732 
733 	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
734 	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
735 	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
736 		struct mlx5_vf_migration_file *migf;
737 
738 		migf = mlx5vf_pci_save_device_data(mvdev, true);
739 		if (IS_ERR(migf))
740 			return ERR_CAST(migf);
741 		get_file(migf->filp);
742 		mvdev->saving_migf = migf;
743 		return migf->filp;
744 	}
745 
746 	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
747 		ret = mlx5vf_cmd_suspend_vhca(mvdev,
748 			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
749 		if (ret)
750 			return ERR_PTR(ret);
751 		ret = mlx5vf_pci_save_device_inc_data(mvdev);
752 		return ret ? ERR_PTR(ret) : NULL;
753 	}
754 
755 	/*
756 	 * vfio_mig_get_next_state() does not use arcs other than the above
757 	 */
758 	WARN_ON(true);
759 	return ERR_PTR(-EINVAL);
760 }
761 
762 /*
763  * This function is called in all state_mutex unlock cases to
764  * handle a 'deferred_reset', if one exists.
765  */
766 void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
767 {
768 again:
769 	spin_lock(&mvdev->reset_lock);
770 	if (mvdev->deferred_reset) {
771 		mvdev->deferred_reset = false;
772 		spin_unlock(&mvdev->reset_lock);
773 		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
774 		mlx5vf_disable_fds(mvdev);
775 		goto again;
776 	}
777 	mutex_unlock(&mvdev->state_mutex);
778 	spin_unlock(&mvdev->reset_lock);
779 }
780 
781 static struct file *
782 mlx5vf_pci_set_device_state(struct vfio_device *vdev,
783 			    enum vfio_device_mig_state new_state)
784 {
785 	struct mlx5vf_pci_core_device *mvdev = container_of(
786 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
787 	enum vfio_device_mig_state next_state;
788 	struct file *res = NULL;
789 	int ret;
790 
791 	mutex_lock(&mvdev->state_mutex);
792 	while (new_state != mvdev->mig_state) {
793 		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
794 					      new_state, &next_state);
795 		if (ret) {
796 			res = ERR_PTR(ret);
797 			break;
798 		}
799 		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
800 		if (IS_ERR(res))
801 			break;
802 		mvdev->mig_state = next_state;
803 		if (WARN_ON(res && new_state != mvdev->mig_state)) {
804 			fput(res);
805 			res = ERR_PTR(-EINVAL);
806 			break;
807 		}
808 	}
809 	mlx5vf_state_mutex_unlock(mvdev);
810 	return res;
811 }
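
/*
 * Hypothetical userspace sketch, not part of this driver: request a migration
 * state change through VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE. The VFIO core
 * walks the arcs via vfio_mig_get_next_state() and this driver's
 * migration_set_state callback; for arcs that create a migration file, the
 * returned data_fd is the FD served by the fops above. The wrapper name is an
 * illustrative assumption.
 */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int set_mig_state(int device_fd, enum vfio_device_mig_state state)
{
	uint64_t buf[(sizeof(struct vfio_device_feature) +
		      sizeof(struct vfio_device_feature_mig_state) + 7) / 8];
	struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
	struct vfio_device_feature_mig_state mig = { .device_state = state };

	memset(buf, 0, sizeof(buf));
	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
	memcpy(feature->data, &mig, sizeof(mig));
	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
		return -1;
	memcpy(&mig, feature->data, sizeof(mig));
	return mig.data_fd;	/* -1 when the arc has no data FD */
}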
812 
813 static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
814 				    unsigned long *stop_copy_length)
815 {
816 	struct mlx5vf_pci_core_device *mvdev = container_of(
817 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
818 	size_t state_size;
819 	int ret;
820 
821 	mutex_lock(&mvdev->state_mutex);
822 	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
823 						    &state_size, 0);
824 	if (!ret)
825 		*stop_copy_length = state_size;
826 	mlx5vf_state_mutex_unlock(mvdev);
827 	return ret;
828 }
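
/*
 * Hypothetical userspace sketch, not part of this driver: query the estimated
 * STOP_COPY image size that mlx5vf_pci_get_data_size() reports, using
 * VFIO_DEVICE_FEATURE_MIG_DATA_SIZE. The wrapper name is an illustrative
 * assumption.
 */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int query_stop_copy_size(int device_fd, uint64_t *size)
{
	struct vfio_device_feature_mig_data_size resp;
	uint64_t buf[(sizeof(struct vfio_device_feature) +
		      sizeof(resp) + 7) / 8];
	struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;

	memset(buf, 0, sizeof(buf));
	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_GET |
			 VFIO_DEVICE_FEATURE_MIG_DATA_SIZE;
	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
		return -1;
	memcpy(&resp, feature->data, sizeof(resp));
	*size = resp.stop_copy_length;
	return 0;
}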
829 
830 static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
831 				       enum vfio_device_mig_state *curr_state)
832 {
833 	struct mlx5vf_pci_core_device *mvdev = container_of(
834 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
835 
836 	mutex_lock(&mvdev->state_mutex);
837 	*curr_state = mvdev->mig_state;
838 	mlx5vf_state_mutex_unlock(mvdev);
839 	return 0;
840 }
841 
842 static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
843 {
844 	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
845 
846 	if (!mvdev->migrate_cap)
847 		return;
848 
849 	/*
850 	 * The higher VFIO layers hold locks across reset and use those same
851 	 * locks together with the mm_lock, so we must prevent an ABBA
852 	 * deadlock between the state_mutex and the mm_lock.
853 	 * If the state_mutex is already taken, defer the cleanup work to the
854 	 * unlock flow of the other running context.
855 	 */
856 	spin_lock(&mvdev->reset_lock);
857 	mvdev->deferred_reset = true;
858 	if (!mutex_trylock(&mvdev->state_mutex)) {
859 		spin_unlock(&mvdev->reset_lock);
860 		return;
861 	}
862 	spin_unlock(&mvdev->reset_lock);
863 	mlx5vf_state_mutex_unlock(mvdev);
864 }
865 
866 static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
867 {
868 	struct mlx5vf_pci_core_device *mvdev = container_of(
869 		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
870 	struct vfio_pci_core_device *vdev = &mvdev->core_device;
871 	int ret;
872 
873 	ret = vfio_pci_core_enable(vdev);
874 	if (ret)
875 		return ret;
876 
877 	if (mvdev->migrate_cap)
878 		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
879 	vfio_pci_core_finish_enable(vdev);
880 	return 0;
881 }
882 
883 static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
884 {
885 	struct mlx5vf_pci_core_device *mvdev = container_of(
886 		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
887 
888 	mlx5vf_cmd_close_migratable(mvdev);
889 	vfio_pci_core_close_device(core_vdev);
890 }
891 
892 static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
893 	.migration_set_state = mlx5vf_pci_set_device_state,
894 	.migration_get_state = mlx5vf_pci_get_device_state,
895 	.migration_get_data_size = mlx5vf_pci_get_data_size,
896 };
897 
898 static const struct vfio_log_ops mlx5vf_pci_log_ops = {
899 	.log_start = mlx5vf_start_page_tracker,
900 	.log_stop = mlx5vf_stop_page_tracker,
901 	.log_read_and_clear = mlx5vf_tracker_read_and_clear,
902 };
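
/*
 * Hypothetical userspace sketch, not part of this driver: start dirty-page
 * (DMA) tracking over one IOVA range via VFIO_DEVICE_FEATURE_DMA_LOGGING_START,
 * which the log_ops above serve through mlx5vf_start_page_tracker(). Dirty
 * bitmaps are then fetched with VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT (not
 * shown). The function name and the 4KiB page size are illustrative
 * assumptions.
 */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int start_dirty_tracking(int device_fd, uint64_t iova, uint64_t length)
{
	struct vfio_device_feature_dma_logging_range range = {
		.iova = iova,
		.length = length,
	};
	struct vfio_device_feature_dma_logging_control ctrl = {
		.page_size = 4096,
		.num_ranges = 1,
		.ranges = (uintptr_t)&range,
	};
	uint64_t buf[(sizeof(struct vfio_device_feature) +
		      sizeof(ctrl) + 7) / 8];
	struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;

	memset(buf, 0, sizeof(buf));
	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
	memcpy(feature->data, &ctrl, sizeof(ctrl));
	return ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);
}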
903 
904 static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
905 {
906 	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
907 			struct mlx5vf_pci_core_device, core_device.vdev);
908 	int ret;
909 
910 	ret = vfio_pci_core_init_dev(core_vdev);
911 	if (ret)
912 		return ret;
913 
914 	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
915 				  &mlx5vf_pci_log_ops);
916 
917 	return 0;
918 }
919 
920 static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
921 {
922 	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
923 			struct mlx5vf_pci_core_device, core_device.vdev);
924 
925 	mlx5vf_cmd_remove_migratable(mvdev);
926 	vfio_pci_core_release_dev(core_vdev);
927 }
928 
929 static const struct vfio_device_ops mlx5vf_pci_ops = {
930 	.name = "mlx5-vfio-pci",
931 	.init = mlx5vf_pci_init_dev,
932 	.release = mlx5vf_pci_release_dev,
933 	.open_device = mlx5vf_pci_open_device,
934 	.close_device = mlx5vf_pci_close_device,
935 	.ioctl = vfio_pci_core_ioctl,
936 	.device_feature = vfio_pci_core_ioctl_feature,
937 	.read = vfio_pci_core_read,
938 	.write = vfio_pci_core_write,
939 	.mmap = vfio_pci_core_mmap,
940 	.request = vfio_pci_core_request,
941 	.match = vfio_pci_core_match,
942 };
943 
944 static int mlx5vf_pci_probe(struct pci_dev *pdev,
945 			    const struct pci_device_id *id)
946 {
947 	struct mlx5vf_pci_core_device *mvdev;
948 	int ret;
949 
950 	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
951 				  &pdev->dev, &mlx5vf_pci_ops);
952 	if (IS_ERR(mvdev))
953 		return PTR_ERR(mvdev);
954 
955 	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
956 	ret = vfio_pci_core_register_device(&mvdev->core_device);
957 	if (ret)
958 		goto out_put_vdev;
959 	return 0;
960 
961 out_put_vdev:
962 	vfio_put_device(&mvdev->core_device.vdev);
963 	return ret;
964 }
965 
966 static void mlx5vf_pci_remove(struct pci_dev *pdev)
967 {
968 	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
969 
970 	vfio_pci_core_unregister_device(&mvdev->core_device);
971 	vfio_put_device(&mvdev->core_device.vdev);
972 }
973 
974 static const struct pci_device_id mlx5vf_pci_table[] = {
975 	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
976 	{}
977 };
978 
979 MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
980 
981 static const struct pci_error_handlers mlx5vf_err_handlers = {
982 	.reset_done = mlx5vf_pci_aer_reset_done,
983 	.error_detected = vfio_pci_core_aer_err_detected,
984 };
985 
986 static struct pci_driver mlx5vf_pci_driver = {
987 	.name = KBUILD_MODNAME,
988 	.id_table = mlx5vf_pci_table,
989 	.probe = mlx5vf_pci_probe,
990 	.remove = mlx5vf_pci_remove,
991 	.err_handler = &mlx5vf_err_handlers,
992 	.driver_managed_dma = true,
993 };
994 
995 module_pci_driver(mlx5vf_pci_driver);
996 
997 MODULE_LICENSE("GPL");
998 MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
999 MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
1000 MODULE_DESCRIPTION(
1001 	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");
1002