xref: /openbmc/linux/drivers/vfio/pci/mlx5/cmd.c (revision 844f5ed5)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /*
3  * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4  */
5 
6 #include "cmd.h"
7 
/*
 * CQ status codes. The users of these values are outside this part of the
 * file; presumably they are the results of polling the tracker CQ.
 */
enum {
	CQ_OK = 0,
	CQ_EMPTY = -1,
	CQ_POLL_ERR = -2,
};
9 
10 static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
11 {
12 	int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
13 	void *query_cap = NULL, *cap;
14 	int ret;
15 
16 	query_cap = kzalloc(query_sz, GFP_KERNEL);
17 	if (!query_cap)
18 		return -ENOMEM;
19 
20 	ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap,
21 					    MLX5_CAP_GENERAL_2);
22 	if (ret)
23 		goto out;
24 
25 	cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability);
26 	if (!MLX5_GET(cmd_hca_cap_2, cap, migratable))
27 		ret = -EOPNOTSUPP;
28 out:
29 	kfree(query_cap);
30 	return ret;
31 }
32 
33 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
34 				  u16 *vhca_id);
35 static void
36 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
37 
38 int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
39 {
40 	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
41 	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
42 	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
43 	int err;
44 
45 	lockdep_assert_held(&mvdev->state_mutex);
46 	if (mvdev->mdev_detach)
47 		return -ENOTCONN;
48 
49 	/*
50 	 * In case PRE_COPY is used, saving_migf is exposed while the device is
51 	 * running. Make sure to run only once there is no active save command.
52 	 * Running both in parallel, might end-up with a failure in the save
53 	 * command once it will try to turn on 'tracking' on a suspended device.
54 	 */
55 	if (migf) {
56 		err = wait_for_completion_interruptible(&migf->save_comp);
57 		if (err)
58 			return err;
59 	}
60 
61 	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
62 	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
63 	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);
64 
65 	err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
66 	if (migf)
67 		complete(&migf->save_comp);
68 
69 	return err;
70 }
71 
72 int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
73 {
74 	u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
75 	u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};
76 
77 	lockdep_assert_held(&mvdev->state_mutex);
78 	if (mvdev->mdev_detach)
79 		return -ENOTCONN;
80 
81 	MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
82 	MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
83 	MLX5_SET(resume_vhca_in, in, op_mod, op_mod);
84 
85 	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
86 }
87 
88 int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
89 					  size_t *state_size, u8 query_flags)
90 {
91 	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
92 	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
93 	bool inc = query_flags & MLX5VF_QUERY_INC;
94 	int ret;
95 
96 	lockdep_assert_held(&mvdev->state_mutex);
97 	if (mvdev->mdev_detach)
98 		return -ENOTCONN;
99 
100 	/*
101 	 * In case PRE_COPY is used, saving_migf is exposed while device is
102 	 * running. Make sure to run only once there is no active save command.
103 	 * Running both in parallel, might end-up with a failure in the
104 	 * incremental query command on un-tracked vhca.
105 	 */
106 	if (inc) {
107 		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
108 		if (ret)
109 			return ret;
110 		if (mvdev->saving_migf->state ==
111 		    MLX5_MIGF_STATE_PRE_COPY_ERROR) {
112 			/*
113 			 * In case we had a PRE_COPY error, only query full
114 			 * image for final image
115 			 */
116 			if (!(query_flags & MLX5VF_QUERY_FINAL)) {
117 				*state_size = 0;
118 				complete(&mvdev->saving_migf->save_comp);
119 				return 0;
120 			}
121 			query_flags &= ~MLX5VF_QUERY_INC;
122 		}
123 	}
124 
125 	MLX5_SET(query_vhca_migration_state_in, in, opcode,
126 		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
127 	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
128 	MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
129 	MLX5_SET(query_vhca_migration_state_in, in, incremental,
130 		 query_flags & MLX5VF_QUERY_INC);
131 
132 	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
133 				  out);
134 	if (inc)
135 		complete(&mvdev->saving_migf->save_comp);
136 
137 	if (ret)
138 		return ret;
139 
140 	*state_size = MLX5_GET(query_vhca_migration_state_out, out,
141 			       required_umem_size);
142 	return 0;
143 }
144 
145 static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
146 {
147 	/* Mark the tracker under an error and wake it up if it's running */
148 	mvdev->tracker.is_err = true;
149 	complete(&mvdev->tracker_comp);
150 }
151 
152 static int mlx5fv_vf_event(struct notifier_block *nb,
153 			   unsigned long event, void *data)
154 {
155 	struct mlx5vf_pci_core_device *mvdev =
156 		container_of(nb, struct mlx5vf_pci_core_device, nb);
157 
158 	switch (event) {
159 	case MLX5_PF_NOTIFY_ENABLE_VF:
160 		mutex_lock(&mvdev->state_mutex);
161 		mvdev->mdev_detach = false;
162 		mlx5vf_state_mutex_unlock(mvdev);
163 		break;
164 	case MLX5_PF_NOTIFY_DISABLE_VF:
165 		mlx5vf_cmd_close_migratable(mvdev);
166 		mutex_lock(&mvdev->state_mutex);
167 		mvdev->mdev_detach = true;
168 		mlx5vf_state_mutex_unlock(mvdev);
169 		break;
170 	default:
171 		break;
172 	}
173 
174 	return 0;
175 }
176 
177 void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
178 {
179 	if (!mvdev->migrate_cap)
180 		return;
181 
182 	/* Must be done outside the lock to let it progress */
183 	set_tracker_error(mvdev);
184 	mutex_lock(&mvdev->state_mutex);
185 	mlx5vf_disable_fds(mvdev);
186 	_mlx5vf_free_page_tracker_resources(mvdev);
187 	mlx5vf_state_mutex_unlock(mvdev);
188 }
189 
190 void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
191 {
192 	if (!mvdev->migrate_cap)
193 		return;
194 
195 	mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
196 						&mvdev->nb);
197 	destroy_workqueue(mvdev->cb_wq);
198 }
199 
200 void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
201 			       const struct vfio_migration_ops *mig_ops,
202 			       const struct vfio_log_ops *log_ops)
203 {
204 	struct pci_dev *pdev = mvdev->core_device.pdev;
205 	int ret;
206 
207 	if (!pdev->is_virtfn)
208 		return;
209 
210 	mvdev->mdev = mlx5_vf_get_core_dev(pdev);
211 	if (!mvdev->mdev)
212 		return;
213 
214 	if (!MLX5_CAP_GEN(mvdev->mdev, migration))
215 		goto end;
216 
217 	mvdev->vf_id = pci_iov_vf_id(pdev);
218 	if (mvdev->vf_id < 0)
219 		goto end;
220 
221 	ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1);
222 	if (ret)
223 		goto end;
224 
225 	if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
226 				   &mvdev->vhca_id))
227 		goto end;
228 
229 	mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
230 	if (!mvdev->cb_wq)
231 		goto end;
232 
233 	mutex_init(&mvdev->state_mutex);
234 	spin_lock_init(&mvdev->reset_lock);
235 	mvdev->nb.notifier_call = mlx5fv_vf_event;
236 	ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
237 						    &mvdev->nb);
238 	if (ret) {
239 		destroy_workqueue(mvdev->cb_wq);
240 		goto end;
241 	}
242 
243 	mvdev->migrate_cap = 1;
244 	mvdev->core_device.vdev.migration_flags =
245 		VFIO_MIGRATION_STOP_COPY |
246 		VFIO_MIGRATION_P2P;
247 	mvdev->core_device.vdev.mig_ops = mig_ops;
248 	init_completion(&mvdev->tracker_comp);
249 	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
250 		mvdev->core_device.vdev.log_ops = log_ops;
251 
252 	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
253 	    MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
254 		mvdev->core_device.vdev.migration_flags |=
255 			VFIO_MIGRATION_PRE_COPY;
256 
257 end:
258 	mlx5_vf_put_core_dev(mvdev->mdev);
259 }
260 
261 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
262 				  u16 *vhca_id)
263 {
264 	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
265 	int out_size;
266 	void *out;
267 	int ret;
268 
269 	out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
270 	out = kzalloc(out_size, GFP_KERNEL);
271 	if (!out)
272 		return -ENOMEM;
273 
274 	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
275 	MLX5_SET(query_hca_cap_in, in, other_function, 1);
276 	MLX5_SET(query_hca_cap_in, in, function_id, function_id);
277 	MLX5_SET(query_hca_cap_in, in, op_mod,
278 		 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
279 		 HCA_CAP_OPMOD_GET_CUR);
280 
281 	ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
282 	if (ret)
283 		goto err_exec;
284 
285 	*vhca_id = MLX5_GET(query_hca_cap_out, out,
286 			    capability.cmd_hca_cap.vhca_id);
287 
288 err_exec:
289 	kfree(out);
290 	return ret;
291 }
292 
293 static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
294 			struct mlx5_vhca_data_buffer *buf,
295 			struct mlx5_vhca_recv_buf *recv_buf,
296 			u32 *mkey)
297 {
298 	size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
299 				recv_buf->npages;
300 	int err = 0, inlen;
301 	__be64 *mtt;
302 	void *mkc;
303 	u32 *in;
304 
305 	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
306 		sizeof(*mtt) * round_up(npages, 2);
307 
308 	in = kvzalloc(inlen, GFP_KERNEL);
309 	if (!in)
310 		return -ENOMEM;
311 
312 	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
313 		 DIV_ROUND_UP(npages, 2));
314 	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
315 
316 	if (buf) {
317 		struct sg_dma_page_iter dma_iter;
318 
319 		for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
320 			*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
321 	} else {
322 		int i;
323 
324 		for (i = 0; i < npages; i++)
325 			*mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
326 	}
327 
328 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
329 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
330 	MLX5_SET(mkc, mkc, lr, 1);
331 	MLX5_SET(mkc, mkc, lw, 1);
332 	MLX5_SET(mkc, mkc, rr, 1);
333 	MLX5_SET(mkc, mkc, rw, 1);
334 	MLX5_SET(mkc, mkc, pd, pdn);
335 	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
336 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
337 	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
338 	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
339 	MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
340 	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
341 	kvfree(in);
342 	return err;
343 }
344 
345 static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
346 {
347 	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
348 	struct mlx5_core_dev *mdev = mvdev->mdev;
349 	int ret;
350 
351 	lockdep_assert_held(&mvdev->state_mutex);
352 	if (mvdev->mdev_detach)
353 		return -ENOTCONN;
354 
355 	if (buf->dmaed || !buf->allocated_length)
356 		return -EINVAL;
357 
358 	ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
359 	if (ret)
360 		return ret;
361 
362 	ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
363 	if (ret)
364 		goto err;
365 
366 	buf->dmaed = true;
367 
368 	return 0;
369 err:
370 	dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
371 	return ret;
372 }
373 
374 void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
375 {
376 	struct mlx5_vf_migration_file *migf = buf->migf;
377 	struct sg_page_iter sg_iter;
378 
379 	lockdep_assert_held(&migf->mvdev->state_mutex);
380 	WARN_ON(migf->mvdev->mdev_detach);
381 
382 	if (buf->dmaed) {
383 		mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
384 		dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
385 				  buf->dma_dir, 0);
386 	}
387 
388 	/* Undo alloc_pages_bulk_array() */
389 	for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
390 		__free_page(sg_page_iter_page(&sg_iter));
391 	sg_free_append_table(&buf->table);
392 	kfree(buf);
393 }
394 
395 struct mlx5_vhca_data_buffer *
396 mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
397 			 size_t length,
398 			 enum dma_data_direction dma_dir)
399 {
400 	struct mlx5_vhca_data_buffer *buf;
401 	int ret;
402 
403 	buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
404 	if (!buf)
405 		return ERR_PTR(-ENOMEM);
406 
407 	buf->dma_dir = dma_dir;
408 	buf->migf = migf;
409 	if (length) {
410 		ret = mlx5vf_add_migration_pages(buf,
411 				DIV_ROUND_UP_ULL(length, PAGE_SIZE));
412 		if (ret)
413 			goto end;
414 
415 		if (dma_dir != DMA_NONE) {
416 			ret = mlx5vf_dma_data_buffer(buf);
417 			if (ret)
418 				goto end;
419 		}
420 	}
421 
422 	return buf;
423 end:
424 	mlx5vf_free_data_buffer(buf);
425 	return ERR_PTR(ret);
426 }
427 
428 void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
429 {
430 	spin_lock_irq(&buf->migf->list_lock);
431 	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
432 	spin_unlock_irq(&buf->migf->list_lock);
433 }
434 
435 struct mlx5_vhca_data_buffer *
436 mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
437 		       size_t length, enum dma_data_direction dma_dir)
438 {
439 	struct mlx5_vhca_data_buffer *buf, *temp_buf;
440 	struct list_head free_list;
441 
442 	lockdep_assert_held(&migf->mvdev->state_mutex);
443 	if (migf->mvdev->mdev_detach)
444 		return ERR_PTR(-ENOTCONN);
445 
446 	INIT_LIST_HEAD(&free_list);
447 
448 	spin_lock_irq(&migf->list_lock);
449 	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
450 		if (buf->dma_dir == dma_dir) {
451 			list_del_init(&buf->buf_elm);
452 			if (buf->allocated_length >= length) {
453 				spin_unlock_irq(&migf->list_lock);
454 				goto found;
455 			}
456 			/*
457 			 * Prevent holding redundant buffers. Put in a free
458 			 * list and call at the end not under the spin lock
459 			 * (&migf->list_lock) to mlx5vf_free_data_buffer which
460 			 * might sleep.
461 			 */
462 			list_add(&buf->buf_elm, &free_list);
463 		}
464 	}
465 	spin_unlock_irq(&migf->list_lock);
466 	buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);
467 
468 found:
469 	while ((temp_buf = list_first_entry_or_null(&free_list,
470 				struct mlx5_vhca_data_buffer, buf_elm))) {
471 		list_del(&temp_buf->buf_elm);
472 		mlx5vf_free_data_buffer(temp_buf);
473 	}
474 
475 	return buf;
476 }
477 
478 void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
479 {
480 	struct mlx5vf_async_data *async_data = container_of(_work,
481 		struct mlx5vf_async_data, work);
482 	struct mlx5_vf_migration_file *migf = container_of(async_data,
483 		struct mlx5_vf_migration_file, async_data);
484 
485 	mutex_lock(&migf->lock);
486 	if (async_data->status) {
487 		mlx5vf_put_data_buffer(async_data->buf);
488 		if (async_data->header_buf)
489 			mlx5vf_put_data_buffer(async_data->header_buf);
490 		if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
491 			migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
492 		else
493 			migf->state = MLX5_MIGF_STATE_ERROR;
494 		wake_up_interruptible(&migf->poll_wait);
495 	}
496 	mutex_unlock(&migf->lock);
497 	kvfree(async_data->out);
498 	complete(&migf->save_comp);
499 	fput(migf->filp);
500 }
501 
502 static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
503 			  size_t image_size, bool initial_pre_copy)
504 {
505 	struct mlx5_vf_migration_file *migf = header_buf->migf;
506 	struct mlx5_vf_migration_header header = {};
507 	unsigned long flags;
508 	struct page *page;
509 	u8 *to_buff;
510 
511 	header.record_size = cpu_to_le64(image_size);
512 	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY);
513 	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA);
514 	page = mlx5vf_get_migration_page(header_buf, 0);
515 	if (!page)
516 		return -EINVAL;
517 	to_buff = kmap_local_page(page);
518 	memcpy(to_buff, &header, sizeof(header));
519 	kunmap_local(to_buff);
520 	header_buf->length = sizeof(header);
521 	header_buf->start_pos = header_buf->migf->max_pos;
522 	migf->max_pos += header_buf->length;
523 	spin_lock_irqsave(&migf->list_lock, flags);
524 	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
525 	spin_unlock_irqrestore(&migf->list_lock, flags);
526 	if (initial_pre_copy)
527 		migf->pre_copy_initial_bytes += sizeof(header);
528 	return 0;
529 }
530 
531 static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
532 {
533 	struct mlx5vf_async_data *async_data = container_of(context,
534 			struct mlx5vf_async_data, cb_work);
535 	struct mlx5_vf_migration_file *migf = container_of(async_data,
536 			struct mlx5_vf_migration_file, async_data);
537 
538 	if (!status) {
539 		size_t image_size;
540 		unsigned long flags;
541 		bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
542 				!async_data->last_chunk;
543 
544 		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
545 				      actual_image_size);
546 		if (async_data->header_buf) {
547 			status = add_buf_header(async_data->header_buf, image_size,
548 						initial_pre_copy);
549 			if (status)
550 				goto err;
551 		}
552 		async_data->buf->length = image_size;
553 		async_data->buf->start_pos = migf->max_pos;
554 		migf->max_pos += async_data->buf->length;
555 		spin_lock_irqsave(&migf->list_lock, flags);
556 		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
557 		spin_unlock_irqrestore(&migf->list_lock, flags);
558 		if (initial_pre_copy)
559 			migf->pre_copy_initial_bytes += image_size;
560 		migf->state = async_data->last_chunk ?
561 			MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY;
562 		wake_up_interruptible(&migf->poll_wait);
563 	}
564 
565 err:
566 	/*
567 	 * The error and the cleanup flows can't run from an
568 	 * interrupt context
569 	 */
570 	if (status == -EREMOTEIO)
571 		status = MLX5_GET(save_vhca_state_out, async_data->out, status);
572 	async_data->status = status;
573 	queue_work(migf->mvdev->cb_wq, &async_data->work);
574 }
575 
576 int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
577 			       struct mlx5_vf_migration_file *migf,
578 			       struct mlx5_vhca_data_buffer *buf, bool inc,
579 			       bool track)
580 {
581 	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
582 	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
583 	struct mlx5_vhca_data_buffer *header_buf = NULL;
584 	struct mlx5vf_async_data *async_data;
585 	int err;
586 
587 	lockdep_assert_held(&mvdev->state_mutex);
588 	if (mvdev->mdev_detach)
589 		return -ENOTCONN;
590 
591 	err = wait_for_completion_interruptible(&migf->save_comp);
592 	if (err)
593 		return err;
594 
595 	if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
596 		/*
597 		 * In case we had a PRE_COPY error, SAVE is triggered only for
598 		 * the final image, read device full image.
599 		 */
600 		inc = false;
601 
602 	MLX5_SET(save_vhca_state_in, in, opcode,
603 		 MLX5_CMD_OP_SAVE_VHCA_STATE);
604 	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
605 	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
606 	MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
607 	MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
608 	MLX5_SET(save_vhca_state_in, in, incremental, inc);
609 	MLX5_SET(save_vhca_state_in, in, set_track, track);
610 
611 	async_data = &migf->async_data;
612 	async_data->buf = buf;
613 	async_data->last_chunk = !track;
614 	async_data->out = kvzalloc(out_size, GFP_KERNEL);
615 	if (!async_data->out) {
616 		err = -ENOMEM;
617 		goto err_out;
618 	}
619 
620 	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
621 		if (async_data->last_chunk && migf->buf_header) {
622 			header_buf = migf->buf_header;
623 			migf->buf_header = NULL;
624 		} else {
625 			header_buf = mlx5vf_get_data_buffer(migf,
626 				sizeof(struct mlx5_vf_migration_header), DMA_NONE);
627 			if (IS_ERR(header_buf)) {
628 				err = PTR_ERR(header_buf);
629 				goto err_free;
630 			}
631 		}
632 	}
633 
634 	if (async_data->last_chunk)
635 		migf->state = MLX5_MIGF_STATE_SAVE_LAST;
636 
637 	async_data->header_buf = header_buf;
638 	get_file(migf->filp);
639 	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
640 			       async_data->out,
641 			       out_size, mlx5vf_save_callback,
642 			       &async_data->cb_work);
643 	if (err)
644 		goto err_exec;
645 
646 	return 0;
647 
648 err_exec:
649 	if (header_buf)
650 		mlx5vf_put_data_buffer(header_buf);
651 	fput(migf->filp);
652 err_free:
653 	kvfree(async_data->out);
654 err_out:
655 	complete(&migf->save_comp);
656 	return err;
657 }
658 
659 int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
660 			       struct mlx5_vf_migration_file *migf,
661 			       struct mlx5_vhca_data_buffer *buf)
662 {
663 	u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
664 	u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
665 	int err;
666 
667 	lockdep_assert_held(&mvdev->state_mutex);
668 	if (mvdev->mdev_detach)
669 		return -ENOTCONN;
670 
671 	if (!buf->dmaed) {
672 		err = mlx5vf_dma_data_buffer(buf);
673 		if (err)
674 			return err;
675 	}
676 
677 	MLX5_SET(load_vhca_state_in, in, opcode,
678 		 MLX5_CMD_OP_LOAD_VHCA_STATE);
679 	MLX5_SET(load_vhca_state_in, in, op_mod, 0);
680 	MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
681 	MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
682 	MLX5_SET(load_vhca_state_in, in, size, buf->length);
683 	return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
684 }
685 
686 int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
687 {
688 	int err;
689 
690 	lockdep_assert_held(&migf->mvdev->state_mutex);
691 	if (migf->mvdev->mdev_detach)
692 		return -ENOTCONN;
693 
694 	err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
695 	return err;
696 }
697 
698 void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
699 {
700 	lockdep_assert_held(&migf->mvdev->state_mutex);
701 	if (migf->mvdev->mdev_detach)
702 		return;
703 
704 	mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
705 }
706 
707 void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
708 {
709 	struct mlx5_vhca_data_buffer *entry;
710 
711 	lockdep_assert_held(&migf->mvdev->state_mutex);
712 	WARN_ON(migf->mvdev->mdev_detach);
713 
714 	if (migf->buf) {
715 		mlx5vf_free_data_buffer(migf->buf);
716 		migf->buf = NULL;
717 	}
718 
719 	if (migf->buf_header) {
720 		mlx5vf_free_data_buffer(migf->buf_header);
721 		migf->buf_header = NULL;
722 	}
723 
724 	list_splice(&migf->avail_list, &migf->buf_list);
725 
726 	while ((entry = list_first_entry_or_null(&migf->buf_list,
727 				struct mlx5_vhca_data_buffer, buf_elm))) {
728 		list_del(&entry->buf_elm);
729 		mlx5vf_free_data_buffer(entry);
730 	}
731 
732 	mlx5vf_cmd_dealloc_pd(migf);
733 }
734 
735 static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes,
736 			   u32 req_nodes)
737 {
738 	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
739 	unsigned long min_gap;
740 	unsigned long curr_gap;
741 
742 	/* Special shortcut when a single range is required */
743 	if (req_nodes == 1) {
744 		unsigned long last;
745 
746 		curr = comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
747 		while (curr) {
748 			last = curr->last;
749 			prev = curr;
750 			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
751 			if (prev != comb_start)
752 				interval_tree_remove(prev, root);
753 		}
754 		comb_start->last = last;
755 		return;
756 	}
757 
758 	/* Combine ranges which have the smallest gap */
759 	while (cur_nodes > req_nodes) {
760 		prev = NULL;
761 		min_gap = ULONG_MAX;
762 		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
763 		while (curr) {
764 			if (prev) {
765 				curr_gap = curr->start - prev->last;
766 				if (curr_gap < min_gap) {
767 					min_gap = curr_gap;
768 					comb_start = prev;
769 					comb_end = curr;
770 				}
771 			}
772 			prev = curr;
773 			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
774 		}
775 		comb_start->last = comb_end->last;
776 		interval_tree_remove(comb_end, root);
777 		cur_nodes--;
778 	}
779 }
780 
781 static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
782 				 struct mlx5vf_pci_core_device *mvdev,
783 				 struct rb_root_cached *ranges, u32 nnodes)
784 {
785 	int max_num_range =
786 		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
787 	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
788 	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
789 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
790 	struct interval_tree_node *node = NULL;
791 	u64 total_ranges_len = 0;
792 	u32 num_ranges = nnodes;
793 	u8 log_addr_space_size;
794 	void *range_list_ptr;
795 	void *obj_context;
796 	void *cmd_hdr;
797 	int inlen;
798 	void *in;
799 	int err;
800 	int i;
801 
802 	if (num_ranges > max_num_range) {
803 		combine_ranges(ranges, nnodes, max_num_range);
804 		num_ranges = max_num_range;
805 	}
806 
807 	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
808 				 record_size * num_ranges;
809 	in = kzalloc(inlen, GFP_KERNEL);
810 	if (!in)
811 		return -ENOMEM;
812 
813 	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
814 			       general_obj_in_cmd_hdr);
815 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
816 		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
817 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
818 		 MLX5_OBJ_TYPE_PAGE_TRACK);
819 	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
820 	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
821 	MLX5_SET(page_track, obj_context, track_type, 1);
822 	MLX5_SET(page_track, obj_context, log_page_size,
823 		 ilog2(tracker->host_qp->tracked_page_size));
824 	MLX5_SET(page_track, obj_context, log_msg_size,
825 		 ilog2(tracker->host_qp->max_msg_size));
826 	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
827 	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);
828 
829 	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
830 	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
831 	for (i = 0; i < num_ranges; i++) {
832 		void *addr_range_i_base = range_list_ptr + record_size * i;
833 		unsigned long length = node->last - node->start + 1;
834 
835 		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
836 			   node->start);
837 		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
838 		total_ranges_len += length;
839 		node = interval_tree_iter_next(node, 0, ULONG_MAX);
840 	}
841 
842 	WARN_ON(node);
843 	log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
844 	if (log_addr_space_size <
845 	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
846 	    log_addr_space_size >
847 	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
848 		err = -EOPNOTSUPP;
849 		goto out;
850 	}
851 
852 	MLX5_SET(page_track, obj_context, log_addr_space_size,
853 		 log_addr_space_size);
854 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
855 	if (err)
856 		goto out;
857 
858 	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
859 out:
860 	kfree(in);
861 	return err;
862 }
863 
864 static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
865 				      u32 tracker_id)
866 {
867 	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
868 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
869 
870 	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
871 	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
872 	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);
873 
874 	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
875 }
876 
877 static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
878 				     u32 tracker_id, unsigned long iova,
879 				     unsigned long length, u32 tracker_state)
880 {
881 	u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
882 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
883 	void *obj_context;
884 	void *cmd_hdr;
885 
886 	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
887 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
888 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
889 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id);
890 
891 	obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);
892 	MLX5_SET64(page_track, obj_context, modify_field_select, 0x3);
893 	MLX5_SET64(page_track, obj_context, range_start_address, iova);
894 	MLX5_SET64(page_track, obj_context, length, length);
895 	MLX5_SET(page_track, obj_context, state, tracker_state);
896 
897 	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
898 }
899 
900 static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
901 			     struct mlx5_vhca_cq_buf *buf, int nent,
902 			     int cqe_size)
903 {
904 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
905 	u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
906 	u8 log_wq_sz = ilog2(cqe_size);
907 	int err;
908 
909 	err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
910 				       mdev->priv.numa_node);
911 	if (err)
912 		return err;
913 
914 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
915 	buf->cqe_size = cqe_size;
916 	buf->nent = nent;
917 	return 0;
918 }
919 
920 static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
921 {
922 	struct mlx5_cqe64 *cqe64;
923 	void *cqe;
924 	int i;
925 
926 	for (i = 0; i < buf->nent; i++) {
927 		cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
928 		cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
929 		cqe64->op_own = MLX5_CQE_INVALID << 4;
930 	}
931 }
932 
933 static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
934 			      struct mlx5_vhca_cq *cq)
935 {
936 	mlx5_core_destroy_cq(mdev, &cq->mcq);
937 	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
938 	mlx5_db_free(mdev, &cq->db);
939 }
940 
941 static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
942 {
943 	if (type != MLX5_EVENT_TYPE_CQ_ERROR)
944 		return;
945 
946 	set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
947 				       tracker.cq.mcq));
948 }
949 
950 static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
951 				 void *data)
952 {
953 	struct mlx5_vhca_page_tracker *tracker =
954 		mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
955 	struct mlx5vf_pci_core_device *mvdev = container_of(
956 		tracker, struct mlx5vf_pci_core_device, tracker);
957 	struct mlx5_eqe *eqe = data;
958 	u8 event_type = (u8)type;
959 	u8 queue_type;
960 	int qp_num;
961 
962 	switch (event_type) {
963 	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
964 	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
965 	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
966 		queue_type = eqe->data.qp_srq.type;
967 		if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
968 			break;
969 		qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
970 		if (qp_num != tracker->host_qp->qpn &&
971 		    qp_num != tracker->fw_qp->qpn)
972 			break;
973 		set_tracker_error(mvdev);
974 		break;
975 	default:
976 		break;
977 	}
978 
979 	return NOTIFY_OK;
980 }
981 
982 static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
983 			       struct mlx5_eqe *eqe)
984 {
985 	struct mlx5vf_pci_core_device *mvdev =
986 		container_of(mcq, struct mlx5vf_pci_core_device,
987 			     tracker.cq.mcq);
988 
989 	complete(&mvdev->tracker_comp);
990 }
991 
992 static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
993 			    struct mlx5_vhca_page_tracker *tracker,
994 			    size_t ncqe)
995 {
996 	int cqe_size = cache_line_size() == 128 ? 128 : 64;
997 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
998 	struct mlx5_vhca_cq *cq;
999 	int inlen, err, eqn;
1000 	void *cqc, *in;
1001 	__be64 *pas;
1002 	int vector;
1003 
1004 	cq = &tracker->cq;
1005 	ncqe = roundup_pow_of_two(ncqe);
1006 	err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
1007 	if (err)
1008 		return err;
1009 
1010 	cq->ncqe = ncqe;
1011 	cq->mcq.set_ci_db = cq->db.db;
1012 	cq->mcq.arm_db = cq->db.db + 1;
1013 	cq->mcq.cqe_sz = cqe_size;
1014 	err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
1015 	if (err)
1016 		goto err_db_free;
1017 
1018 	init_cq_frag_buf(&cq->buf);
1019 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
1020 		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
1021 		cq->buf.frag_buf.npages;
1022 	in = kvzalloc(inlen, GFP_KERNEL);
1023 	if (!in) {
1024 		err = -ENOMEM;
1025 		goto err_buff;
1026 	}
1027 
1028 	vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev);
1029 	err = mlx5_vector2eqn(mdev, vector, &eqn);
1030 	if (err)
1031 		goto err_vec;
1032 
1033 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
1034 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
1035 	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
1036 	MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
1037 	MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
1038 		 MLX5_ADAPTER_PAGE_SHIFT);
1039 	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
1040 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
1041 	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
1042 	cq->mcq.comp = mlx5vf_cq_complete;
1043 	cq->mcq.event = mlx5vf_cq_event;
1044 	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
1045 	if (err)
1046 		goto err_vec;
1047 
1048 	mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
1049 		    cq->mcq.cons_index);
1050 	kvfree(in);
1051 	return 0;
1052 
1053 err_vec:
1054 	kvfree(in);
1055 err_buff:
1056 	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
1057 err_db_free:
1058 	mlx5_db_free(mdev, &cq->db);
1059 	return err;
1060 }
1061 
1062 static struct mlx5_vhca_qp *
1063 mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
1064 		    struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
1065 {
1066 	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
1067 	struct mlx5_vhca_qp *qp;
1068 	u8 log_rq_stride;
1069 	u8 log_rq_sz;
1070 	void *qpc;
1071 	int inlen;
1072 	void *in;
1073 	int err;
1074 
1075 	qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT);
1076 	if (!qp)
1077 		return ERR_PTR(-ENOMEM);
1078 
1079 	err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
1080 	if (err)
1081 		goto err_free;
1082 
1083 	if (max_recv_wr) {
1084 		qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
1085 		log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
1086 		log_rq_sz = ilog2(qp->rq.wqe_cnt);
1087 		err = mlx5_frag_buf_alloc_node(mdev,
1088 			wq_get_byte_sz(log_rq_sz, log_rq_stride),
1089 			&qp->buf, mdev->priv.numa_node);
1090 		if (err)
1091 			goto err_db_free;
1092 		mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
1093 	}
1094 
1095 	qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
1096 	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
1097 		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
1098 		qp->buf.npages;
1099 	in = kvzalloc(inlen, GFP_KERNEL);
1100 	if (!in) {
1101 		err = -ENOMEM;
1102 		goto err_in;
1103 	}
1104 
1105 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
1106 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
1107 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
1108 	MLX5_SET(qpc, qpc, pd, tracker->pdn);
1109 	MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
1110 	MLX5_SET(qpc, qpc, log_page_size,
1111 		 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
1112 	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
1113 	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
1114 		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
1115 	MLX5_SET(qpc, qpc, no_sq, 1);
1116 	if (max_recv_wr) {
1117 		MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
1118 		MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
1119 		MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
1120 		MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
1121 		MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
1122 		mlx5_fill_page_frag_array(&qp->buf,
1123 					  (__be64 *)MLX5_ADDR_OF(create_qp_in,
1124 								 in, pas));
1125 	} else {
1126 		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
1127 	}
1128 
1129 	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
1130 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
1131 	kvfree(in);
1132 	if (err)
1133 		goto err_in;
1134 
1135 	qp->qpn = MLX5_GET(create_qp_out, out, qpn);
1136 	return qp;
1137 
1138 err_in:
1139 	if (max_recv_wr)
1140 		mlx5_frag_buf_free(mdev, &qp->buf);
1141 err_db_free:
1142 	mlx5_db_free(mdev, &qp->db);
1143 err_free:
1144 	kfree(qp);
1145 	return ERR_PTR(err);
1146 }
1147 
1148 static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
1149 {
1150 	struct mlx5_wqe_data_seg *data;
1151 	unsigned int ix;
1152 
1153 	WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
1154 	ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
1155 	data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
1156 	data->byte_count = cpu_to_be32(qp->max_msg_size);
1157 	data->lkey = cpu_to_be32(qp->recv_buf.mkey);
1158 	data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
1159 	qp->rq.pc++;
1160 	/* Make sure that descriptors are written before doorbell record. */
1161 	dma_wmb();
1162 	*qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
1163 }
1164 
1165 static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
1166 			      struct mlx5_vhca_qp *qp, u32 remote_qpn,
1167 			      bool host_qp)
1168 {
1169 	u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
1170 	u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
1171 	u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
1172 	void *qpc;
1173 	int ret;
1174 
1175 	/* Init */
1176 	qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
1177 	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1178 	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
1179 	MLX5_SET(qpc, qpc, rre, 1);
1180 	MLX5_SET(qpc, qpc, rwe, 1);
1181 	MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
1182 	MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
1183 	ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
1184 	if (ret)
1185 		return ret;
1186 
1187 	if (host_qp) {
1188 		struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1189 		int i;
1190 
1191 		for (i = 0; i < qp->rq.wqe_cnt; i++) {
1192 			mlx5vf_post_recv(qp);
1193 			recv_buf->next_rq_offset += qp->max_msg_size;
1194 		}
1195 	}
1196 
1197 	/* RTR */
1198 	qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
1199 	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1200 	MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
1201 	MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
1202 	MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
1203 	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1204 	MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
1205 	MLX5_SET(qpc, qpc, min_rnr_nak, 1);
1206 	MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
1207 	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1208 	ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
1209 	if (ret || host_qp)
1210 		return ret;
1211 
1212 	/* RTS */
1213 	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
1214 	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1215 	MLX5_SET(qpc, qpc, retry_count, 7);
1216 	MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
1217 	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
1218 	MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
1219 	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1220 
1221 	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
1222 }
1223 
1224 static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
1225 			      struct mlx5_vhca_qp *qp)
1226 {
1227 	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
1228 
1229 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
1230 	MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
1231 	mlx5_cmd_exec_in(mdev, destroy_qp, in);
1232 
1233 	mlx5_frag_buf_free(mdev, &qp->buf);
1234 	mlx5_db_free(mdev, &qp->db);
1235 	kfree(qp);
1236 }
1237 
1238 static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
1239 {
1240 	int i;
1241 
1242 	/* Undo alloc_pages_bulk_array() */
1243 	for (i = 0; i < recv_buf->npages; i++)
1244 		__free_page(recv_buf->page_list[i]);
1245 
1246 	kvfree(recv_buf->page_list);
1247 }
1248 
1249 static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
1250 			    unsigned int npages)
1251 {
1252 	unsigned int filled = 0, done = 0;
1253 	int i;
1254 
1255 	recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
1256 				       GFP_KERNEL_ACCOUNT);
1257 	if (!recv_buf->page_list)
1258 		return -ENOMEM;
1259 
1260 	for (;;) {
1261 		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
1262 						npages - done,
1263 						recv_buf->page_list + done);
1264 		if (!filled)
1265 			goto err;
1266 
1267 		done += filled;
1268 		if (done == npages)
1269 			break;
1270 	}
1271 
1272 	recv_buf->npages = npages;
1273 	return 0;
1274 
1275 err:
1276 	for (i = 0; i < npages; i++) {
1277 		if (recv_buf->page_list[i])
1278 			__free_page(recv_buf->page_list[i]);
1279 	}
1280 
1281 	kvfree(recv_buf->page_list);
1282 	return -ENOMEM;
1283 }
1284 
1285 static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
1286 				   struct mlx5_vhca_recv_buf *recv_buf)
1287 {
1288 	int i, j;
1289 
1290 	recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
1291 				       sizeof(*recv_buf->dma_addrs),
1292 				       GFP_KERNEL_ACCOUNT);
1293 	if (!recv_buf->dma_addrs)
1294 		return -ENOMEM;
1295 
1296 	for (i = 0; i < recv_buf->npages; i++) {
1297 		recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
1298 						      recv_buf->page_list[i],
1299 						      0, PAGE_SIZE,
1300 						      DMA_FROM_DEVICE);
1301 		if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
1302 			goto error;
1303 	}
1304 	return 0;
1305 
1306 error:
1307 	for (j = 0; j < i; j++)
1308 		dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
1309 				 PAGE_SIZE, DMA_FROM_DEVICE);
1310 
1311 	kvfree(recv_buf->dma_addrs);
1312 	return -ENOMEM;
1313 }
1314 
1315 static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
1316 				      struct mlx5_vhca_recv_buf *recv_buf)
1317 {
1318 	int i;
1319 
1320 	for (i = 0; i < recv_buf->npages; i++)
1321 		dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
1322 				 PAGE_SIZE, DMA_FROM_DEVICE);
1323 
1324 	kvfree(recv_buf->dma_addrs);
1325 }
1326 
1327 static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
1328 					  struct mlx5_vhca_qp *qp)
1329 {
1330 	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1331 
1332 	mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
1333 	unregister_dma_recv_pages(mdev, recv_buf);
1334 	free_recv_pages(&qp->recv_buf);
1335 }
1336 
1337 static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
1338 					  struct mlx5_vhca_qp *qp, u32 pdn,
1339 					  u64 rq_size)
1340 {
1341 	unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
1342 	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1343 	int err;
1344 
1345 	err = alloc_recv_pages(recv_buf, npages);
1346 	if (err < 0)
1347 		return err;
1348 
1349 	err = register_dma_recv_pages(mdev, recv_buf);
1350 	if (err)
1351 		goto end;
1352 
1353 	err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
1354 	if (err)
1355 		goto err_create_mkey;
1356 
1357 	return 0;
1358 
1359 err_create_mkey:
1360 	unregister_dma_recv_pages(mdev, recv_buf);
1361 end:
1362 	free_recv_pages(recv_buf);
1363 	return err;
1364 }
1365 
1366 static void
1367 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
1368 {
1369 	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1370 	struct mlx5_core_dev *mdev = mvdev->mdev;
1371 
1372 	lockdep_assert_held(&mvdev->state_mutex);
1373 
1374 	if (!mvdev->log_active)
1375 		return;
1376 
1377 	WARN_ON(mvdev->mdev_detach);
1378 
1379 	mlx5_eq_notifier_unregister(mdev, &tracker->nb);
1380 	mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
1381 	mlx5vf_destroy_qp(mdev, tracker->fw_qp);
1382 	mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
1383 	mlx5vf_destroy_qp(mdev, tracker->host_qp);
1384 	mlx5vf_destroy_cq(mdev, &tracker->cq);
1385 	mlx5_core_dealloc_pd(mdev, tracker->pdn);
1386 	mlx5_put_uars_page(mdev, tracker->uar);
1387 	mvdev->log_active = false;
1388 }
1389 
1390 int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
1391 {
1392 	struct mlx5vf_pci_core_device *mvdev = container_of(
1393 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1394 
1395 	mutex_lock(&mvdev->state_mutex);
1396 	if (!mvdev->log_active)
1397 		goto end;
1398 
1399 	_mlx5vf_free_page_tracker_resources(mvdev);
1400 	mvdev->log_active = false;
1401 end:
1402 	mlx5vf_state_mutex_unlock(mvdev);
1403 	return 0;
1404 }
1405 
1406 int mlx5vf_start_page_tracker(struct vfio_device *vdev,
1407 			      struct rb_root_cached *ranges, u32 nnodes,
1408 			      u64 *page_size)
1409 {
1410 	struct mlx5vf_pci_core_device *mvdev = container_of(
1411 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1412 	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1413 	u8 log_tracked_page = ilog2(*page_size);
1414 	struct mlx5_vhca_qp *host_qp;
1415 	struct mlx5_vhca_qp *fw_qp;
1416 	struct mlx5_core_dev *mdev;
1417 	u32 max_msg_size = PAGE_SIZE;
1418 	u64 rq_size = SZ_2M;
1419 	u32 max_recv_wr;
1420 	int err;
1421 
1422 	mutex_lock(&mvdev->state_mutex);
1423 	if (mvdev->mdev_detach) {
1424 		err = -ENOTCONN;
1425 		goto end;
1426 	}
1427 
1428 	if (mvdev->log_active) {
1429 		err = -EINVAL;
1430 		goto end;
1431 	}
1432 
1433 	mdev = mvdev->mdev;
1434 	memset(tracker, 0, sizeof(*tracker));
1435 	tracker->uar = mlx5_get_uars_page(mdev);
1436 	if (IS_ERR(tracker->uar)) {
1437 		err = PTR_ERR(tracker->uar);
1438 		goto end;
1439 	}
1440 
1441 	err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
1442 	if (err)
1443 		goto err_uar;
1444 
1445 	max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
1446 	err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
1447 	if (err)
1448 		goto err_dealloc_pd;
1449 
1450 	host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
1451 	if (IS_ERR(host_qp)) {
1452 		err = PTR_ERR(host_qp);
1453 		goto err_cq;
1454 	}
1455 
1456 	host_qp->max_msg_size = max_msg_size;
1457 	if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1458 				pg_track_log_min_page_size)) {
1459 		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1460 				pg_track_log_min_page_size);
1461 	} else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1462 				pg_track_log_max_page_size)) {
1463 		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1464 				pg_track_log_max_page_size);
1465 	}
1466 
1467 	host_qp->tracked_page_size = (1ULL << log_tracked_page);
1468 	err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
1469 					     rq_size);
1470 	if (err)
1471 		goto err_host_qp;
1472 
1473 	fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
1474 	if (IS_ERR(fw_qp)) {
1475 		err = PTR_ERR(fw_qp);
1476 		goto err_recv_resources;
1477 	}
1478 
1479 	err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
1480 	if (err)
1481 		goto err_activate;
1482 
1483 	err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
1484 	if (err)
1485 		goto err_activate;
1486 
1487 	tracker->host_qp = host_qp;
1488 	tracker->fw_qp = fw_qp;
1489 	err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
1490 	if (err)
1491 		goto err_activate;
1492 
1493 	MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
1494 	mlx5_eq_notifier_register(mdev, &tracker->nb);
1495 	*page_size = host_qp->tracked_page_size;
1496 	mvdev->log_active = true;
1497 	mlx5vf_state_mutex_unlock(mvdev);
1498 	return 0;
1499 
1500 err_activate:
1501 	mlx5vf_destroy_qp(mdev, fw_qp);
1502 err_recv_resources:
1503 	mlx5vf_free_qp_recv_resources(mdev, host_qp);
1504 err_host_qp:
1505 	mlx5vf_destroy_qp(mdev, host_qp);
1506 err_cq:
1507 	mlx5vf_destroy_cq(mdev, &tracker->cq);
1508 err_dealloc_pd:
1509 	mlx5_core_dealloc_pd(mdev, tracker->pdn);
1510 err_uar:
1511 	mlx5_put_uars_page(mdev, tracker->uar);
1512 end:
1513 	mlx5vf_state_mutex_unlock(mvdev);
1514 	return err;
1515 }
1516 
1517 static void
1518 set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
1519 		  struct iova_bitmap *dirty)
1520 {
1521 	u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
1522 	u32 nent = size / entry_size;
1523 	struct page *page;
1524 	u64 addr;
1525 	u64 *buf;
1526 	int i;
1527 
1528 	if (WARN_ON(index >= qp->recv_buf.npages ||
1529 		    (nent > qp->max_msg_size / entry_size)))
1530 		return;
1531 
1532 	page = qp->recv_buf.page_list[index];
1533 	buf = kmap_local_page(page);
1534 	for (i = 0; i < nent; i++) {
1535 		addr = MLX5_GET(page_track_report_entry, buf + i,
1536 				dirty_address_low);
1537 		addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
1538 				      dirty_address_high) << 32;
1539 		iova_bitmap_set(dirty, addr, qp->tracked_page_size);
1540 	}
1541 	kunmap_local(buf);
1542 }
1543 
1544 static void
1545 mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
1546 	      struct iova_bitmap *dirty, int *tracker_status)
1547 {
1548 	u32 size;
1549 	int ix;
1550 
1551 	qp->rq.cc++;
1552 	*tracker_status = be32_to_cpu(cqe->immediate) >> 28;
1553 	size = be32_to_cpu(cqe->byte_cnt);
1554 	ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);
1555 
1556 	/* zero length CQE, no data */
1557 	WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
1558 	if (size)
1559 		set_report_output(size, ix, qp, dirty);
1560 
1561 	qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
1562 	mlx5vf_post_recv(qp);
1563 }
1564 
1565 static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
1566 {
1567 	return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
1568 }
1569 
1570 static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
1571 {
1572 	void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
1573 	struct mlx5_cqe64 *cqe64;
1574 
1575 	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
1576 
1577 	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
1578 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
1579 		return cqe64;
1580 	} else {
1581 		return NULL;
1582 	}
1583 }
1584 
1585 static int
1586 mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
1587 		   struct iova_bitmap *dirty, int *tracker_status)
1588 {
1589 	struct mlx5_cqe64 *cqe;
1590 	u8 opcode;
1591 
1592 	cqe = get_sw_cqe(cq, cq->mcq.cons_index);
1593 	if (!cqe)
1594 		return CQ_EMPTY;
1595 
1596 	++cq->mcq.cons_index;
1597 	/*
1598 	 * Make sure we read CQ entry contents after we've checked the
1599 	 * ownership bit.
1600 	 */
1601 	rmb();
1602 	opcode = get_cqe_opcode(cqe);
1603 	switch (opcode) {
1604 	case MLX5_CQE_RESP_SEND_IMM:
1605 		mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
1606 		return CQ_OK;
1607 	default:
1608 		return CQ_POLL_ERR;
1609 	}
1610 }
1611 
1612 int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
1613 				  unsigned long length,
1614 				  struct iova_bitmap *dirty)
1615 {
1616 	struct mlx5vf_pci_core_device *mvdev = container_of(
1617 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1618 	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1619 	struct mlx5_vhca_cq *cq = &tracker->cq;
1620 	struct mlx5_core_dev *mdev;
1621 	int poll_err, err;
1622 
1623 	mutex_lock(&mvdev->state_mutex);
1624 	if (!mvdev->log_active) {
1625 		err = -EINVAL;
1626 		goto end;
1627 	}
1628 
1629 	if (mvdev->mdev_detach) {
1630 		err = -ENOTCONN;
1631 		goto end;
1632 	}
1633 
1634 	mdev = mvdev->mdev;
1635 	err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
1636 					MLX5_PAGE_TRACK_STATE_REPORTING);
1637 	if (err)
1638 		goto end;
1639 
1640 	tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
1641 	while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
1642 	       !tracker->is_err) {
1643 		poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
1644 					      &tracker->status);
1645 		if (poll_err == CQ_EMPTY) {
1646 			mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
1647 				    cq->mcq.cons_index);
1648 			poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
1649 						      dirty, &tracker->status);
1650 			if (poll_err == CQ_EMPTY) {
1651 				wait_for_completion(&mvdev->tracker_comp);
1652 				continue;
1653 			}
1654 		}
1655 		if (poll_err == CQ_POLL_ERR) {
1656 			err = -EIO;
1657 			goto end;
1658 		}
1659 		mlx5_cq_set_ci(&cq->mcq);
1660 	}
1661 
1662 	if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
1663 		tracker->is_err = true;
1664 
1665 	if (tracker->is_err)
1666 		err = -EIO;
1667 end:
1668 	mlx5vf_state_mutex_unlock(mvdev);
1669 	return err;
1670 }
1671