xref: /openbmc/linux/drivers/vfio/pci/mlx5/cmd.c (revision d2a8e92f)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /*
3  * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4  */
5 
6 #include "cmd.h"
7 
/*
 * CQ status codes. Their users are not visible in this part of the file;
 * presumably they are returned by the CQ polling helpers - TODO confirm.
 */
enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };

/*
 * Forward declarations for helpers that are called in this file before
 * the point where they are defined.
 */
static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
				  u16 *vhca_id);
static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
14 
15 int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
16 {
17 	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
18 	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
19 	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
20 	int err;
21 
22 	lockdep_assert_held(&mvdev->state_mutex);
23 	if (mvdev->mdev_detach)
24 		return -ENOTCONN;
25 
26 	/*
27 	 * In case PRE_COPY is used, saving_migf is exposed while the device is
28 	 * running. Make sure to run only once there is no active save command.
29 	 * Running both in parallel, might end-up with a failure in the save
30 	 * command once it will try to turn on 'tracking' on a suspended device.
31 	 */
32 	if (migf) {
33 		err = wait_for_completion_interruptible(&migf->save_comp);
34 		if (err)
35 			return err;
36 	}
37 
38 	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
39 	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
40 	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);
41 
42 	err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
43 	if (migf)
44 		complete(&migf->save_comp);
45 
46 	return err;
47 }
48 
49 int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
50 {
51 	u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
52 	u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};
53 
54 	lockdep_assert_held(&mvdev->state_mutex);
55 	if (mvdev->mdev_detach)
56 		return -ENOTCONN;
57 
58 	MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
59 	MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
60 	MLX5_SET(resume_vhca_in, in, op_mod, op_mod);
61 
62 	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
63 }
64 
65 int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
66 					  size_t *state_size, u8 query_flags)
67 {
68 	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
69 	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
70 	bool inc = query_flags & MLX5VF_QUERY_INC;
71 	int ret;
72 
73 	lockdep_assert_held(&mvdev->state_mutex);
74 	if (mvdev->mdev_detach)
75 		return -ENOTCONN;
76 
77 	/*
78 	 * In case PRE_COPY is used, saving_migf is exposed while device is
79 	 * running. Make sure to run only once there is no active save command.
80 	 * Running both in parallel, might end-up with a failure in the
81 	 * incremental query command on un-tracked vhca.
82 	 */
83 	if (inc) {
84 		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
85 		if (ret)
86 			return ret;
87 		if (mvdev->saving_migf->state ==
88 		    MLX5_MIGF_STATE_PRE_COPY_ERROR) {
89 			/*
90 			 * In case we had a PRE_COPY error, only query full
91 			 * image for final image
92 			 */
93 			if (!(query_flags & MLX5VF_QUERY_FINAL)) {
94 				*state_size = 0;
95 				complete(&mvdev->saving_migf->save_comp);
96 				return 0;
97 			}
98 			query_flags &= ~MLX5VF_QUERY_INC;
99 		}
100 	}
101 
102 	MLX5_SET(query_vhca_migration_state_in, in, opcode,
103 		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
104 	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
105 	MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
106 	MLX5_SET(query_vhca_migration_state_in, in, incremental,
107 		 query_flags & MLX5VF_QUERY_INC);
108 
109 	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
110 				  out);
111 	if (inc)
112 		complete(&mvdev->saving_migf->save_comp);
113 
114 	if (ret)
115 		return ret;
116 
117 	*state_size = MLX5_GET(query_vhca_migration_state_out, out,
118 			       required_umem_size);
119 	return 0;
120 }
121 
122 static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
123 {
124 	/* Mark the tracker under an error and wake it up if it's running */
125 	mvdev->tracker.is_err = true;
126 	complete(&mvdev->tracker_comp);
127 }
128 
129 static int mlx5fv_vf_event(struct notifier_block *nb,
130 			   unsigned long event, void *data)
131 {
132 	struct mlx5vf_pci_core_device *mvdev =
133 		container_of(nb, struct mlx5vf_pci_core_device, nb);
134 
135 	switch (event) {
136 	case MLX5_PF_NOTIFY_ENABLE_VF:
137 		mutex_lock(&mvdev->state_mutex);
138 		mvdev->mdev_detach = false;
139 		mlx5vf_state_mutex_unlock(mvdev);
140 		break;
141 	case MLX5_PF_NOTIFY_DISABLE_VF:
142 		mlx5vf_cmd_close_migratable(mvdev);
143 		mutex_lock(&mvdev->state_mutex);
144 		mvdev->mdev_detach = true;
145 		mlx5vf_state_mutex_unlock(mvdev);
146 		break;
147 	default:
148 		break;
149 	}
150 
151 	return 0;
152 }
153 
154 void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
155 {
156 	if (!mvdev->migrate_cap)
157 		return;
158 
159 	/* Must be done outside the lock to let it progress */
160 	set_tracker_error(mvdev);
161 	mutex_lock(&mvdev->state_mutex);
162 	mlx5vf_disable_fds(mvdev);
163 	_mlx5vf_free_page_tracker_resources(mvdev);
164 	mlx5vf_state_mutex_unlock(mvdev);
165 }
166 
167 void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
168 {
169 	if (!mvdev->migrate_cap)
170 		return;
171 
172 	mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
173 						&mvdev->nb);
174 	destroy_workqueue(mvdev->cb_wq);
175 }
176 
177 void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
178 			       const struct vfio_migration_ops *mig_ops,
179 			       const struct vfio_log_ops *log_ops)
180 {
181 	struct pci_dev *pdev = mvdev->core_device.pdev;
182 	int ret;
183 
184 	if (!pdev->is_virtfn)
185 		return;
186 
187 	mvdev->mdev = mlx5_vf_get_core_dev(pdev);
188 	if (!mvdev->mdev)
189 		return;
190 
191 	if (!MLX5_CAP_GEN(mvdev->mdev, migration))
192 		goto end;
193 
194 	mvdev->vf_id = pci_iov_vf_id(pdev);
195 	if (mvdev->vf_id < 0)
196 		goto end;
197 
198 	if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
199 				   &mvdev->vhca_id))
200 		goto end;
201 
202 	mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
203 	if (!mvdev->cb_wq)
204 		goto end;
205 
206 	mutex_init(&mvdev->state_mutex);
207 	spin_lock_init(&mvdev->reset_lock);
208 	mvdev->nb.notifier_call = mlx5fv_vf_event;
209 	ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
210 						    &mvdev->nb);
211 	if (ret) {
212 		destroy_workqueue(mvdev->cb_wq);
213 		goto end;
214 	}
215 
216 	mvdev->migrate_cap = 1;
217 	mvdev->core_device.vdev.migration_flags =
218 		VFIO_MIGRATION_STOP_COPY |
219 		VFIO_MIGRATION_P2P;
220 	mvdev->core_device.vdev.mig_ops = mig_ops;
221 	init_completion(&mvdev->tracker_comp);
222 	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
223 		mvdev->core_device.vdev.log_ops = log_ops;
224 
225 	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
226 	    MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
227 		mvdev->core_device.vdev.migration_flags |=
228 			VFIO_MIGRATION_PRE_COPY;
229 
230 end:
231 	mlx5_vf_put_core_dev(mvdev->mdev);
232 }
233 
234 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
235 				  u16 *vhca_id)
236 {
237 	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
238 	int out_size;
239 	void *out;
240 	int ret;
241 
242 	out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
243 	out = kzalloc(out_size, GFP_KERNEL);
244 	if (!out)
245 		return -ENOMEM;
246 
247 	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
248 	MLX5_SET(query_hca_cap_in, in, other_function, 1);
249 	MLX5_SET(query_hca_cap_in, in, function_id, function_id);
250 	MLX5_SET(query_hca_cap_in, in, op_mod,
251 		 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
252 		 HCA_CAP_OPMOD_GET_CUR);
253 
254 	ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
255 	if (ret)
256 		goto err_exec;
257 
258 	*vhca_id = MLX5_GET(query_hca_cap_out, out,
259 			    capability.cmd_hca_cap.vhca_id);
260 
261 err_exec:
262 	kfree(out);
263 	return ret;
264 }
265 
266 static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
267 			struct mlx5_vhca_data_buffer *buf,
268 			struct mlx5_vhca_recv_buf *recv_buf,
269 			u32 *mkey)
270 {
271 	size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
272 				recv_buf->npages;
273 	int err = 0, inlen;
274 	__be64 *mtt;
275 	void *mkc;
276 	u32 *in;
277 
278 	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
279 		sizeof(*mtt) * round_up(npages, 2);
280 
281 	in = kvzalloc(inlen, GFP_KERNEL);
282 	if (!in)
283 		return -ENOMEM;
284 
285 	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
286 		 DIV_ROUND_UP(npages, 2));
287 	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
288 
289 	if (buf) {
290 		struct sg_dma_page_iter dma_iter;
291 
292 		for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
293 			*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
294 	} else {
295 		int i;
296 
297 		for (i = 0; i < npages; i++)
298 			*mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
299 	}
300 
301 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
302 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
303 	MLX5_SET(mkc, mkc, lr, 1);
304 	MLX5_SET(mkc, mkc, lw, 1);
305 	MLX5_SET(mkc, mkc, rr, 1);
306 	MLX5_SET(mkc, mkc, rw, 1);
307 	MLX5_SET(mkc, mkc, pd, pdn);
308 	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
309 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
310 	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
311 	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
312 	MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
313 	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
314 	kvfree(in);
315 	return err;
316 }
317 
318 static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
319 {
320 	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
321 	struct mlx5_core_dev *mdev = mvdev->mdev;
322 	int ret;
323 
324 	lockdep_assert_held(&mvdev->state_mutex);
325 	if (mvdev->mdev_detach)
326 		return -ENOTCONN;
327 
328 	if (buf->dmaed || !buf->allocated_length)
329 		return -EINVAL;
330 
331 	ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
332 	if (ret)
333 		return ret;
334 
335 	ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
336 	if (ret)
337 		goto err;
338 
339 	buf->dmaed = true;
340 
341 	return 0;
342 err:
343 	dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
344 	return ret;
345 }
346 
347 void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
348 {
349 	struct mlx5_vf_migration_file *migf = buf->migf;
350 	struct sg_page_iter sg_iter;
351 
352 	lockdep_assert_held(&migf->mvdev->state_mutex);
353 	WARN_ON(migf->mvdev->mdev_detach);
354 
355 	if (buf->dmaed) {
356 		mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
357 		dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
358 				  buf->dma_dir, 0);
359 	}
360 
361 	/* Undo alloc_pages_bulk_array() */
362 	for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
363 		__free_page(sg_page_iter_page(&sg_iter));
364 	sg_free_append_table(&buf->table);
365 	kfree(buf);
366 }
367 
368 struct mlx5_vhca_data_buffer *
369 mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
370 			 size_t length,
371 			 enum dma_data_direction dma_dir)
372 {
373 	struct mlx5_vhca_data_buffer *buf;
374 	int ret;
375 
376 	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
377 	if (!buf)
378 		return ERR_PTR(-ENOMEM);
379 
380 	buf->dma_dir = dma_dir;
381 	buf->migf = migf;
382 	if (length) {
383 		ret = mlx5vf_add_migration_pages(buf,
384 				DIV_ROUND_UP_ULL(length, PAGE_SIZE));
385 		if (ret)
386 			goto end;
387 
388 		if (dma_dir != DMA_NONE) {
389 			ret = mlx5vf_dma_data_buffer(buf);
390 			if (ret)
391 				goto end;
392 		}
393 	}
394 
395 	return buf;
396 end:
397 	mlx5vf_free_data_buffer(buf);
398 	return ERR_PTR(ret);
399 }
400 
401 void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
402 {
403 	spin_lock_irq(&buf->migf->list_lock);
404 	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
405 	spin_unlock_irq(&buf->migf->list_lock);
406 }
407 
408 struct mlx5_vhca_data_buffer *
409 mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
410 		       size_t length, enum dma_data_direction dma_dir)
411 {
412 	struct mlx5_vhca_data_buffer *buf, *temp_buf;
413 	struct list_head free_list;
414 
415 	lockdep_assert_held(&migf->mvdev->state_mutex);
416 	if (migf->mvdev->mdev_detach)
417 		return ERR_PTR(-ENOTCONN);
418 
419 	INIT_LIST_HEAD(&free_list);
420 
421 	spin_lock_irq(&migf->list_lock);
422 	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
423 		if (buf->dma_dir == dma_dir) {
424 			list_del_init(&buf->buf_elm);
425 			if (buf->allocated_length >= length) {
426 				spin_unlock_irq(&migf->list_lock);
427 				goto found;
428 			}
429 			/*
430 			 * Prevent holding redundant buffers. Put in a free
431 			 * list and call at the end not under the spin lock
432 			 * (&migf->list_lock) to mlx5vf_free_data_buffer which
433 			 * might sleep.
434 			 */
435 			list_add(&buf->buf_elm, &free_list);
436 		}
437 	}
438 	spin_unlock_irq(&migf->list_lock);
439 	buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);
440 
441 found:
442 	while ((temp_buf = list_first_entry_or_null(&free_list,
443 				struct mlx5_vhca_data_buffer, buf_elm))) {
444 		list_del(&temp_buf->buf_elm);
445 		mlx5vf_free_data_buffer(temp_buf);
446 	}
447 
448 	return buf;
449 }
450 
451 void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
452 {
453 	struct mlx5vf_async_data *async_data = container_of(_work,
454 		struct mlx5vf_async_data, work);
455 	struct mlx5_vf_migration_file *migf = container_of(async_data,
456 		struct mlx5_vf_migration_file, async_data);
457 
458 	mutex_lock(&migf->lock);
459 	if (async_data->status) {
460 		mlx5vf_put_data_buffer(async_data->buf);
461 		if (async_data->header_buf)
462 			mlx5vf_put_data_buffer(async_data->header_buf);
463 		if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
464 			migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
465 		else
466 			migf->state = MLX5_MIGF_STATE_ERROR;
467 		wake_up_interruptible(&migf->poll_wait);
468 	}
469 	mutex_unlock(&migf->lock);
470 	kvfree(async_data->out);
471 	complete(&migf->save_comp);
472 	fput(migf->filp);
473 }
474 
475 static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
476 			  size_t image_size)
477 {
478 	struct mlx5_vf_migration_file *migf = header_buf->migf;
479 	struct mlx5_vf_migration_header header = {};
480 	unsigned long flags;
481 	struct page *page;
482 	u8 *to_buff;
483 
484 	header.image_size = cpu_to_le64(image_size);
485 	page = mlx5vf_get_migration_page(header_buf, 0);
486 	if (!page)
487 		return -EINVAL;
488 	to_buff = kmap_local_page(page);
489 	memcpy(to_buff, &header, sizeof(header));
490 	kunmap_local(to_buff);
491 	header_buf->length = sizeof(header);
492 	header_buf->header_image_size = image_size;
493 	header_buf->start_pos = header_buf->migf->max_pos;
494 	migf->max_pos += header_buf->length;
495 	spin_lock_irqsave(&migf->list_lock, flags);
496 	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
497 	spin_unlock_irqrestore(&migf->list_lock, flags);
498 	return 0;
499 }
500 
501 static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
502 {
503 	struct mlx5vf_async_data *async_data = container_of(context,
504 			struct mlx5vf_async_data, cb_work);
505 	struct mlx5_vf_migration_file *migf = container_of(async_data,
506 			struct mlx5_vf_migration_file, async_data);
507 
508 	if (!status) {
509 		size_t image_size;
510 		unsigned long flags;
511 
512 		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
513 				      actual_image_size);
514 		if (async_data->header_buf) {
515 			status = add_buf_header(async_data->header_buf, image_size);
516 			if (status)
517 				goto err;
518 		}
519 		async_data->buf->length = image_size;
520 		async_data->buf->start_pos = migf->max_pos;
521 		migf->max_pos += async_data->buf->length;
522 		spin_lock_irqsave(&migf->list_lock, flags);
523 		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
524 		spin_unlock_irqrestore(&migf->list_lock, flags);
525 		migf->state = async_data->last_chunk ?
526 			MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY;
527 		wake_up_interruptible(&migf->poll_wait);
528 	}
529 
530 err:
531 	/*
532 	 * The error and the cleanup flows can't run from an
533 	 * interrupt context
534 	 */
535 	if (status == -EREMOTEIO)
536 		status = MLX5_GET(save_vhca_state_out, async_data->out, status);
537 	async_data->status = status;
538 	queue_work(migf->mvdev->cb_wq, &async_data->work);
539 }
540 
/*
 * Submit an asynchronous SAVE_VHCA_STATE command that writes the device
 * state into @buf.
 *
 * @inc: request an incremental image; forced to false when the file is in
 *       MLX5_MIGF_STATE_PRE_COPY_ERROR.
 * @track: value of the set_track field; when false this save is treated as
 *         the last chunk.
 *
 * The caller must hold mvdev->state_mutex. The function first takes
 * migf->save_comp. On a successful submission it returns 0 without
 * completing save_comp and while holding an extra reference on migf->filp;
 * mlx5vf_save_callback() is passed as the completion callback. On any
 * failure the resources taken here are released, save_comp is completed and
 * a negative errno is returned (-ENOTCONN when the VF is detached).
 */
int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *buf, bool inc,
			       bool track)
{
	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5vf_async_data *async_data;
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/* Wait until no other save command is in flight */
	err = wait_for_completion_interruptible(&migf->save_comp);
	if (err)
		return err;

	if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
		/*
		 * In case we had a PRE_COPY error, SAVE is triggered only for
		 * the final image, read device full image.
		 */
		inc = false;

	MLX5_SET(save_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_SAVE_VHCA_STATE);
	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
	MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
	MLX5_SET(save_vhca_state_in, in, incremental, inc);
	MLX5_SET(save_vhca_state_in, in, set_track, track);

	/* The async context is embedded in the migration file itself */
	async_data = &migf->async_data;
	async_data->buf = buf;
	async_data->last_chunk = !track;
	async_data->out = kvzalloc(out_size, GFP_KERNEL);
	if (!async_data->out) {
		err = -ENOMEM;
		goto err_out;
	}

	/* A separate header buffer is used only when PRE_COPY is supported */
	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
		header_buf = mlx5vf_get_data_buffer(migf,
			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
		if (IS_ERR(header_buf)) {
			err = PTR_ERR(header_buf);
			goto err_free;
		}
	}

	if (async_data->last_chunk)
		migf->state = MLX5_MIGF_STATE_SAVE_LAST;

	async_data->header_buf = header_buf;
	/* Take a file reference for the duration of the async command */
	get_file(migf->filp);
	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
			       async_data->out,
			       out_size, mlx5vf_save_callback,
			       &async_data->cb_work);
	if (err)
		goto err_exec;

	return 0;

	/* Unwind in reverse order of acquisition */
err_exec:
	if (header_buf)
		mlx5vf_put_data_buffer(header_buf);
	fput(migf->filp);
err_free:
	kvfree(async_data->out);
err_out:
	complete(&migf->save_comp);
	return err;
}
618 
619 int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
620 			       struct mlx5_vf_migration_file *migf,
621 			       struct mlx5_vhca_data_buffer *buf)
622 {
623 	u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
624 	u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
625 	int err;
626 
627 	lockdep_assert_held(&mvdev->state_mutex);
628 	if (mvdev->mdev_detach)
629 		return -ENOTCONN;
630 
631 	if (!buf->dmaed) {
632 		err = mlx5vf_dma_data_buffer(buf);
633 		if (err)
634 			return err;
635 	}
636 
637 	MLX5_SET(load_vhca_state_in, in, opcode,
638 		 MLX5_CMD_OP_LOAD_VHCA_STATE);
639 	MLX5_SET(load_vhca_state_in, in, op_mod, 0);
640 	MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
641 	MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
642 	MLX5_SET(load_vhca_state_in, in, size, buf->length);
643 	return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
644 }
645 
646 int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
647 {
648 	int err;
649 
650 	lockdep_assert_held(&migf->mvdev->state_mutex);
651 	if (migf->mvdev->mdev_detach)
652 		return -ENOTCONN;
653 
654 	err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
655 	return err;
656 }
657 
658 void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
659 {
660 	lockdep_assert_held(&migf->mvdev->state_mutex);
661 	if (migf->mvdev->mdev_detach)
662 		return;
663 
664 	mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
665 }
666 
667 void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
668 {
669 	struct mlx5_vhca_data_buffer *entry;
670 
671 	lockdep_assert_held(&migf->mvdev->state_mutex);
672 	WARN_ON(migf->mvdev->mdev_detach);
673 
674 	if (migf->buf) {
675 		mlx5vf_free_data_buffer(migf->buf);
676 		migf->buf = NULL;
677 	}
678 
679 	if (migf->buf_header) {
680 		mlx5vf_free_data_buffer(migf->buf_header);
681 		migf->buf_header = NULL;
682 	}
683 
684 	list_splice(&migf->avail_list, &migf->buf_list);
685 
686 	while ((entry = list_first_entry_or_null(&migf->buf_list,
687 				struct mlx5_vhca_data_buffer, buf_elm))) {
688 		list_del(&entry->buf_elm);
689 		mlx5vf_free_data_buffer(entry);
690 	}
691 
692 	mlx5vf_cmd_dealloc_pd(migf);
693 }
694 
695 static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes,
696 			   u32 req_nodes)
697 {
698 	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
699 	unsigned long min_gap;
700 	unsigned long curr_gap;
701 
702 	/* Special shortcut when a single range is required */
703 	if (req_nodes == 1) {
704 		unsigned long last;
705 
706 		curr = comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
707 		while (curr) {
708 			last = curr->last;
709 			prev = curr;
710 			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
711 			if (prev != comb_start)
712 				interval_tree_remove(prev, root);
713 		}
714 		comb_start->last = last;
715 		return;
716 	}
717 
718 	/* Combine ranges which have the smallest gap */
719 	while (cur_nodes > req_nodes) {
720 		prev = NULL;
721 		min_gap = ULONG_MAX;
722 		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
723 		while (curr) {
724 			if (prev) {
725 				curr_gap = curr->start - prev->last;
726 				if (curr_gap < min_gap) {
727 					min_gap = curr_gap;
728 					comb_start = prev;
729 					comb_end = curr;
730 				}
731 			}
732 			prev = curr;
733 			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
734 		}
735 		comb_start->last = comb_end->last;
736 		interval_tree_remove(comb_end, root);
737 		cur_nodes--;
738 	}
739 }
740 
741 static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
742 				 struct mlx5vf_pci_core_device *mvdev,
743 				 struct rb_root_cached *ranges, u32 nnodes)
744 {
745 	int max_num_range =
746 		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
747 	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
748 	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
749 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
750 	struct interval_tree_node *node = NULL;
751 	u64 total_ranges_len = 0;
752 	u32 num_ranges = nnodes;
753 	u8 log_addr_space_size;
754 	void *range_list_ptr;
755 	void *obj_context;
756 	void *cmd_hdr;
757 	int inlen;
758 	void *in;
759 	int err;
760 	int i;
761 
762 	if (num_ranges > max_num_range) {
763 		combine_ranges(ranges, nnodes, max_num_range);
764 		num_ranges = max_num_range;
765 	}
766 
767 	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
768 				 record_size * num_ranges;
769 	in = kzalloc(inlen, GFP_KERNEL);
770 	if (!in)
771 		return -ENOMEM;
772 
773 	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
774 			       general_obj_in_cmd_hdr);
775 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
776 		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
777 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
778 		 MLX5_OBJ_TYPE_PAGE_TRACK);
779 	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
780 	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
781 	MLX5_SET(page_track, obj_context, track_type, 1);
782 	MLX5_SET(page_track, obj_context, log_page_size,
783 		 ilog2(tracker->host_qp->tracked_page_size));
784 	MLX5_SET(page_track, obj_context, log_msg_size,
785 		 ilog2(tracker->host_qp->max_msg_size));
786 	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
787 	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);
788 
789 	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
790 	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
791 	for (i = 0; i < num_ranges; i++) {
792 		void *addr_range_i_base = range_list_ptr + record_size * i;
793 		unsigned long length = node->last - node->start;
794 
795 		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
796 			   node->start);
797 		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
798 		total_ranges_len += length;
799 		node = interval_tree_iter_next(node, 0, ULONG_MAX);
800 	}
801 
802 	WARN_ON(node);
803 	log_addr_space_size = ilog2(total_ranges_len);
804 	if (log_addr_space_size <
805 	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
806 	    log_addr_space_size >
807 	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
808 		err = -EOPNOTSUPP;
809 		goto out;
810 	}
811 
812 	MLX5_SET(page_track, obj_context, log_addr_space_size,
813 		 log_addr_space_size);
814 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
815 	if (err)
816 		goto out;
817 
818 	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
819 out:
820 	kfree(in);
821 	return err;
822 }
823 
824 static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
825 				      u32 tracker_id)
826 {
827 	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
828 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
829 
830 	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
831 	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
832 	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);
833 
834 	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
835 }
836 
837 static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
838 				     u32 tracker_id, unsigned long iova,
839 				     unsigned long length, u32 tracker_state)
840 {
841 	u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
842 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
843 	void *obj_context;
844 	void *cmd_hdr;
845 
846 	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
847 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
848 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
849 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id);
850 
851 	obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);
852 	MLX5_SET64(page_track, obj_context, modify_field_select, 0x3);
853 	MLX5_SET64(page_track, obj_context, range_start_address, iova);
854 	MLX5_SET64(page_track, obj_context, length, length);
855 	MLX5_SET(page_track, obj_context, state, tracker_state);
856 
857 	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
858 }
859 
860 static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
861 			     struct mlx5_vhca_cq_buf *buf, int nent,
862 			     int cqe_size)
863 {
864 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
865 	u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
866 	u8 log_wq_sz = ilog2(cqe_size);
867 	int err;
868 
869 	err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
870 				       mdev->priv.numa_node);
871 	if (err)
872 		return err;
873 
874 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
875 	buf->cqe_size = cqe_size;
876 	buf->nent = nent;
877 	return 0;
878 }
879 
880 static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
881 {
882 	struct mlx5_cqe64 *cqe64;
883 	void *cqe;
884 	int i;
885 
886 	for (i = 0; i < buf->nent; i++) {
887 		cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
888 		cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
889 		cqe64->op_own = MLX5_CQE_INVALID << 4;
890 	}
891 }
892 
893 static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
894 			      struct mlx5_vhca_cq *cq)
895 {
896 	mlx5_core_destroy_cq(mdev, &cq->mcq);
897 	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
898 	mlx5_db_free(mdev, &cq->db);
899 }
900 
901 static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
902 {
903 	if (type != MLX5_EVENT_TYPE_CQ_ERROR)
904 		return;
905 
906 	set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
907 				       tracker.cq.mcq));
908 }
909 
910 static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
911 				 void *data)
912 {
913 	struct mlx5_vhca_page_tracker *tracker =
914 		mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
915 	struct mlx5vf_pci_core_device *mvdev = container_of(
916 		tracker, struct mlx5vf_pci_core_device, tracker);
917 	struct mlx5_eqe *eqe = data;
918 	u8 event_type = (u8)type;
919 	u8 queue_type;
920 	int qp_num;
921 
922 	switch (event_type) {
923 	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
924 	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
925 	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
926 		queue_type = eqe->data.qp_srq.type;
927 		if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
928 			break;
929 		qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
930 		if (qp_num != tracker->host_qp->qpn &&
931 		    qp_num != tracker->fw_qp->qpn)
932 			break;
933 		set_tracker_error(mvdev);
934 		break;
935 	default:
936 		break;
937 	}
938 
939 	return NOTIFY_OK;
940 }
941 
942 static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
943 			       struct mlx5_eqe *eqe)
944 {
945 	struct mlx5vf_pci_core_device *mvdev =
946 		container_of(mcq, struct mlx5vf_pci_core_device,
947 			     tracker.cq.mcq);
948 
949 	complete(&mvdev->tracker_comp);
950 }
951 
/*
 * Create and arm the completion queue of the page tracker.
 *
 * @ncqe: requested number of CQEs; rounded up to a power of two.
 *
 * Allocates the doorbell record and the CQE buffer, builds the CREATE_CQ
 * command and creates the CQ with mlx5vf_cq_complete() / mlx5vf_cq_event()
 * as its handlers. Returns 0 on success; on failure everything allocated
 * here is released and a negative errno is returned.
 */
static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
			    struct mlx5_vhca_page_tracker *tracker,
			    size_t ncqe)
{
	/* 128-byte CQEs are used only when the cache line is 128 bytes */
	int cqe_size = cache_line_size() == 128 ? 128 : 64;
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	struct mlx5_vhca_cq *cq;
	int inlen, err, eqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;

	cq = &tracker->cq;
	ncqe = roundup_pow_of_two(ncqe);
	err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
	if (err)
		return err;

	cq->ncqe = ncqe;
	/* The two doorbell words: consumer index first, arm second */
	cq->mcq.set_ci_db = cq->db.db;
	cq->mcq.arm_db = cq->db.db + 1;
	cq->mcq.cqe_sz = cqe_size;
	err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
	if (err)
		goto err_db_free;

	init_cq_frag_buf(&cq->buf);
	/* One physical address entry per page of the CQE buffer */
	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
		cq->buf.frag_buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_buff;
	}

	/* Pick a completion vector derived from the current CPU number */
	vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev);
	err = mlx5_vector2eqn(mdev, vector, &eqn);
	if (err)
		goto err_vec;

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
	MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
	MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
		 MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
	cq->mcq.comp = mlx5vf_cq_complete;
	cq->mcq.event = mlx5vf_cq_event;
	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	if (err)
		goto err_vec;

	/* Arm the CQ at its current consumer index */
	mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
		    cq->mcq.cons_index);
	kvfree(in);
	return 0;

	/* Unwind in reverse order of allocation */
err_vec:
	kvfree(in);
err_buff:
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
err_db_free:
	mlx5_db_free(mdev, &cq->db);
	return err;
}
1021 
1022 static struct mlx5_vhca_qp *
1023 mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
1024 		    struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
1025 {
1026 	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
1027 	struct mlx5_vhca_qp *qp;
1028 	u8 log_rq_stride;
1029 	u8 log_rq_sz;
1030 	void *qpc;
1031 	int inlen;
1032 	void *in;
1033 	int err;
1034 
1035 	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
1036 	if (!qp)
1037 		return ERR_PTR(-ENOMEM);
1038 
1039 	qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
1040 	log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
1041 	log_rq_sz = ilog2(qp->rq.wqe_cnt);
1042 	err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
1043 	if (err)
1044 		goto err_free;
1045 
1046 	if (max_recv_wr) {
1047 		err = mlx5_frag_buf_alloc_node(mdev,
1048 			wq_get_byte_sz(log_rq_sz, log_rq_stride),
1049 			&qp->buf, mdev->priv.numa_node);
1050 		if (err)
1051 			goto err_db_free;
1052 		mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
1053 	}
1054 
1055 	qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
1056 	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
1057 		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
1058 		qp->buf.npages;
1059 	in = kvzalloc(inlen, GFP_KERNEL);
1060 	if (!in) {
1061 		err = -ENOMEM;
1062 		goto err_in;
1063 	}
1064 
1065 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
1066 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
1067 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
1068 	MLX5_SET(qpc, qpc, pd, tracker->pdn);
1069 	MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
1070 	MLX5_SET(qpc, qpc, log_page_size,
1071 		 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
1072 	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
1073 	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
1074 		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
1075 	MLX5_SET(qpc, qpc, no_sq, 1);
1076 	if (max_recv_wr) {
1077 		MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
1078 		MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
1079 		MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
1080 		MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
1081 		MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
1082 		mlx5_fill_page_frag_array(&qp->buf,
1083 					  (__be64 *)MLX5_ADDR_OF(create_qp_in,
1084 								 in, pas));
1085 	} else {
1086 		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
1087 	}
1088 
1089 	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
1090 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
1091 	kvfree(in);
1092 	if (err)
1093 		goto err_in;
1094 
1095 	qp->qpn = MLX5_GET(create_qp_out, out, qpn);
1096 	return qp;
1097 
1098 err_in:
1099 	if (max_recv_wr)
1100 		mlx5_frag_buf_free(mdev, &qp->buf);
1101 err_db_free:
1102 	mlx5_db_free(mdev, &qp->db);
1103 err_free:
1104 	kfree(qp);
1105 	return ERR_PTR(err);
1106 }
1107 
1108 static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
1109 {
1110 	struct mlx5_wqe_data_seg *data;
1111 	unsigned int ix;
1112 
1113 	WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
1114 	ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
1115 	data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
1116 	data->byte_count = cpu_to_be32(qp->max_msg_size);
1117 	data->lkey = cpu_to_be32(qp->recv_buf.mkey);
1118 	data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
1119 	qp->rq.pc++;
1120 	/* Make sure that descriptors are written before doorbell record. */
1121 	dma_wmb();
1122 	*qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
1123 }
1124 
1125 static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
1126 			      struct mlx5_vhca_qp *qp, u32 remote_qpn,
1127 			      bool host_qp)
1128 {
1129 	u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
1130 	u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
1131 	u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
1132 	void *qpc;
1133 	int ret;
1134 
1135 	/* Init */
1136 	qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
1137 	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1138 	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
1139 	MLX5_SET(qpc, qpc, rre, 1);
1140 	MLX5_SET(qpc, qpc, rwe, 1);
1141 	MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
1142 	MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
1143 	ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
1144 	if (ret)
1145 		return ret;
1146 
1147 	if (host_qp) {
1148 		struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1149 		int i;
1150 
1151 		for (i = 0; i < qp->rq.wqe_cnt; i++) {
1152 			mlx5vf_post_recv(qp);
1153 			recv_buf->next_rq_offset += qp->max_msg_size;
1154 		}
1155 	}
1156 
1157 	/* RTR */
1158 	qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
1159 	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1160 	MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
1161 	MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
1162 	MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
1163 	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1164 	MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
1165 	MLX5_SET(qpc, qpc, min_rnr_nak, 1);
1166 	MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
1167 	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1168 	ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
1169 	if (ret || host_qp)
1170 		return ret;
1171 
1172 	/* RTS */
1173 	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
1174 	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1175 	MLX5_SET(qpc, qpc, retry_count, 7);
1176 	MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
1177 	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
1178 	MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
1179 	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1180 
1181 	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
1182 }
1183 
1184 static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
1185 			      struct mlx5_vhca_qp *qp)
1186 {
1187 	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
1188 
1189 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
1190 	MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
1191 	mlx5_cmd_exec_in(mdev, destroy_qp, in);
1192 
1193 	mlx5_frag_buf_free(mdev, &qp->buf);
1194 	mlx5_db_free(mdev, &qp->db);
1195 	kfree(qp);
1196 }
1197 
1198 static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
1199 {
1200 	int i;
1201 
1202 	/* Undo alloc_pages_bulk_array() */
1203 	for (i = 0; i < recv_buf->npages; i++)
1204 		__free_page(recv_buf->page_list[i]);
1205 
1206 	kvfree(recv_buf->page_list);
1207 }
1208 
1209 static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
1210 			    unsigned int npages)
1211 {
1212 	unsigned int filled = 0, done = 0;
1213 	int i;
1214 
1215 	recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
1216 				       GFP_KERNEL);
1217 	if (!recv_buf->page_list)
1218 		return -ENOMEM;
1219 
1220 	for (;;) {
1221 		filled = alloc_pages_bulk_array(GFP_KERNEL, npages - done,
1222 						recv_buf->page_list + done);
1223 		if (!filled)
1224 			goto err;
1225 
1226 		done += filled;
1227 		if (done == npages)
1228 			break;
1229 	}
1230 
1231 	recv_buf->npages = npages;
1232 	return 0;
1233 
1234 err:
1235 	for (i = 0; i < npages; i++) {
1236 		if (recv_buf->page_list[i])
1237 			__free_page(recv_buf->page_list[i]);
1238 	}
1239 
1240 	kvfree(recv_buf->page_list);
1241 	return -ENOMEM;
1242 }
1243 
1244 static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
1245 				   struct mlx5_vhca_recv_buf *recv_buf)
1246 {
1247 	int i, j;
1248 
1249 	recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
1250 				       sizeof(*recv_buf->dma_addrs),
1251 				       GFP_KERNEL);
1252 	if (!recv_buf->dma_addrs)
1253 		return -ENOMEM;
1254 
1255 	for (i = 0; i < recv_buf->npages; i++) {
1256 		recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
1257 						      recv_buf->page_list[i],
1258 						      0, PAGE_SIZE,
1259 						      DMA_FROM_DEVICE);
1260 		if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
1261 			goto error;
1262 	}
1263 	return 0;
1264 
1265 error:
1266 	for (j = 0; j < i; j++)
1267 		dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
1268 				 PAGE_SIZE, DMA_FROM_DEVICE);
1269 
1270 	kvfree(recv_buf->dma_addrs);
1271 	return -ENOMEM;
1272 }
1273 
1274 static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
1275 				      struct mlx5_vhca_recv_buf *recv_buf)
1276 {
1277 	int i;
1278 
1279 	for (i = 0; i < recv_buf->npages; i++)
1280 		dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
1281 				 PAGE_SIZE, DMA_FROM_DEVICE);
1282 
1283 	kvfree(recv_buf->dma_addrs);
1284 }
1285 
1286 static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
1287 					  struct mlx5_vhca_qp *qp)
1288 {
1289 	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1290 
1291 	mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
1292 	unregister_dma_recv_pages(mdev, recv_buf);
1293 	free_recv_pages(&qp->recv_buf);
1294 }
1295 
1296 static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
1297 					  struct mlx5_vhca_qp *qp, u32 pdn,
1298 					  u64 rq_size)
1299 {
1300 	unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
1301 	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1302 	int err;
1303 
1304 	err = alloc_recv_pages(recv_buf, npages);
1305 	if (err < 0)
1306 		return err;
1307 
1308 	err = register_dma_recv_pages(mdev, recv_buf);
1309 	if (err)
1310 		goto end;
1311 
1312 	err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
1313 	if (err)
1314 		goto err_create_mkey;
1315 
1316 	return 0;
1317 
1318 err_create_mkey:
1319 	unregister_dma_recv_pages(mdev, recv_buf);
1320 end:
1321 	free_recv_pages(recv_buf);
1322 	return err;
1323 }
1324 
1325 static void
1326 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
1327 {
1328 	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1329 	struct mlx5_core_dev *mdev = mvdev->mdev;
1330 
1331 	lockdep_assert_held(&mvdev->state_mutex);
1332 
1333 	if (!mvdev->log_active)
1334 		return;
1335 
1336 	WARN_ON(mvdev->mdev_detach);
1337 
1338 	mlx5_eq_notifier_unregister(mdev, &tracker->nb);
1339 	mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
1340 	mlx5vf_destroy_qp(mdev, tracker->fw_qp);
1341 	mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
1342 	mlx5vf_destroy_qp(mdev, tracker->host_qp);
1343 	mlx5vf_destroy_cq(mdev, &tracker->cq);
1344 	mlx5_core_dealloc_pd(mdev, tracker->pdn);
1345 	mlx5_put_uars_page(mdev, tracker->uar);
1346 	mvdev->log_active = false;
1347 }
1348 
1349 int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
1350 {
1351 	struct mlx5vf_pci_core_device *mvdev = container_of(
1352 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1353 
1354 	mutex_lock(&mvdev->state_mutex);
1355 	if (!mvdev->log_active)
1356 		goto end;
1357 
1358 	_mlx5vf_free_page_tracker_resources(mvdev);
1359 	mvdev->log_active = false;
1360 end:
1361 	mlx5vf_state_mutex_unlock(mvdev);
1362 	return 0;
1363 }
1364 
1365 int mlx5vf_start_page_tracker(struct vfio_device *vdev,
1366 			      struct rb_root_cached *ranges, u32 nnodes,
1367 			      u64 *page_size)
1368 {
1369 	struct mlx5vf_pci_core_device *mvdev = container_of(
1370 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1371 	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1372 	u8 log_tracked_page = ilog2(*page_size);
1373 	struct mlx5_vhca_qp *host_qp;
1374 	struct mlx5_vhca_qp *fw_qp;
1375 	struct mlx5_core_dev *mdev;
1376 	u32 max_msg_size = PAGE_SIZE;
1377 	u64 rq_size = SZ_2M;
1378 	u32 max_recv_wr;
1379 	int err;
1380 
1381 	mutex_lock(&mvdev->state_mutex);
1382 	if (mvdev->mdev_detach) {
1383 		err = -ENOTCONN;
1384 		goto end;
1385 	}
1386 
1387 	if (mvdev->log_active) {
1388 		err = -EINVAL;
1389 		goto end;
1390 	}
1391 
1392 	mdev = mvdev->mdev;
1393 	memset(tracker, 0, sizeof(*tracker));
1394 	tracker->uar = mlx5_get_uars_page(mdev);
1395 	if (IS_ERR(tracker->uar)) {
1396 		err = PTR_ERR(tracker->uar);
1397 		goto end;
1398 	}
1399 
1400 	err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
1401 	if (err)
1402 		goto err_uar;
1403 
1404 	max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
1405 	err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
1406 	if (err)
1407 		goto err_dealloc_pd;
1408 
1409 	host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
1410 	if (IS_ERR(host_qp)) {
1411 		err = PTR_ERR(host_qp);
1412 		goto err_cq;
1413 	}
1414 
1415 	host_qp->max_msg_size = max_msg_size;
1416 	if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1417 				pg_track_log_min_page_size)) {
1418 		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1419 				pg_track_log_min_page_size);
1420 	} else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1421 				pg_track_log_max_page_size)) {
1422 		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1423 				pg_track_log_max_page_size);
1424 	}
1425 
1426 	host_qp->tracked_page_size = (1ULL << log_tracked_page);
1427 	err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
1428 					     rq_size);
1429 	if (err)
1430 		goto err_host_qp;
1431 
1432 	fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
1433 	if (IS_ERR(fw_qp)) {
1434 		err = PTR_ERR(fw_qp);
1435 		goto err_recv_resources;
1436 	}
1437 
1438 	err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
1439 	if (err)
1440 		goto err_activate;
1441 
1442 	err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
1443 	if (err)
1444 		goto err_activate;
1445 
1446 	tracker->host_qp = host_qp;
1447 	tracker->fw_qp = fw_qp;
1448 	err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
1449 	if (err)
1450 		goto err_activate;
1451 
1452 	MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
1453 	mlx5_eq_notifier_register(mdev, &tracker->nb);
1454 	*page_size = host_qp->tracked_page_size;
1455 	mvdev->log_active = true;
1456 	mlx5vf_state_mutex_unlock(mvdev);
1457 	return 0;
1458 
1459 err_activate:
1460 	mlx5vf_destroy_qp(mdev, fw_qp);
1461 err_recv_resources:
1462 	mlx5vf_free_qp_recv_resources(mdev, host_qp);
1463 err_host_qp:
1464 	mlx5vf_destroy_qp(mdev, host_qp);
1465 err_cq:
1466 	mlx5vf_destroy_cq(mdev, &tracker->cq);
1467 err_dealloc_pd:
1468 	mlx5_core_dealloc_pd(mdev, tracker->pdn);
1469 err_uar:
1470 	mlx5_put_uars_page(mdev, tracker->uar);
1471 end:
1472 	mlx5vf_state_mutex_unlock(mvdev);
1473 	return err;
1474 }
1475 
1476 static void
1477 set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
1478 		  struct iova_bitmap *dirty)
1479 {
1480 	u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
1481 	u32 nent = size / entry_size;
1482 	struct page *page;
1483 	u64 addr;
1484 	u64 *buf;
1485 	int i;
1486 
1487 	if (WARN_ON(index >= qp->recv_buf.npages ||
1488 		    (nent > qp->max_msg_size / entry_size)))
1489 		return;
1490 
1491 	page = qp->recv_buf.page_list[index];
1492 	buf = kmap_local_page(page);
1493 	for (i = 0; i < nent; i++) {
1494 		addr = MLX5_GET(page_track_report_entry, buf + i,
1495 				dirty_address_low);
1496 		addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
1497 				      dirty_address_high) << 32;
1498 		iova_bitmap_set(dirty, addr, qp->tracked_page_size);
1499 	}
1500 	kunmap_local(buf);
1501 }
1502 
1503 static void
1504 mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
1505 	      struct iova_bitmap *dirty, int *tracker_status)
1506 {
1507 	u32 size;
1508 	int ix;
1509 
1510 	qp->rq.cc++;
1511 	*tracker_status = be32_to_cpu(cqe->immediate) >> 28;
1512 	size = be32_to_cpu(cqe->byte_cnt);
1513 	ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);
1514 
1515 	/* zero length CQE, no data */
1516 	WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
1517 	if (size)
1518 		set_report_output(size, ix, qp, dirty);
1519 
1520 	qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
1521 	mlx5vf_post_recv(qp);
1522 }
1523 
1524 static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
1525 {
1526 	return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
1527 }
1528 
1529 static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
1530 {
1531 	void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
1532 	struct mlx5_cqe64 *cqe64;
1533 
1534 	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
1535 
1536 	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
1537 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
1538 		return cqe64;
1539 	} else {
1540 		return NULL;
1541 	}
1542 }
1543 
1544 static int
1545 mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
1546 		   struct iova_bitmap *dirty, int *tracker_status)
1547 {
1548 	struct mlx5_cqe64 *cqe;
1549 	u8 opcode;
1550 
1551 	cqe = get_sw_cqe(cq, cq->mcq.cons_index);
1552 	if (!cqe)
1553 		return CQ_EMPTY;
1554 
1555 	++cq->mcq.cons_index;
1556 	/*
1557 	 * Make sure we read CQ entry contents after we've checked the
1558 	 * ownership bit.
1559 	 */
1560 	rmb();
1561 	opcode = get_cqe_opcode(cqe);
1562 	switch (opcode) {
1563 	case MLX5_CQE_RESP_SEND_IMM:
1564 		mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
1565 		return CQ_OK;
1566 	default:
1567 		return CQ_POLL_ERR;
1568 	}
1569 }
1570 
1571 int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
1572 				  unsigned long length,
1573 				  struct iova_bitmap *dirty)
1574 {
1575 	struct mlx5vf_pci_core_device *mvdev = container_of(
1576 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1577 	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1578 	struct mlx5_vhca_cq *cq = &tracker->cq;
1579 	struct mlx5_core_dev *mdev;
1580 	int poll_err, err;
1581 
1582 	mutex_lock(&mvdev->state_mutex);
1583 	if (!mvdev->log_active) {
1584 		err = -EINVAL;
1585 		goto end;
1586 	}
1587 
1588 	if (mvdev->mdev_detach) {
1589 		err = -ENOTCONN;
1590 		goto end;
1591 	}
1592 
1593 	mdev = mvdev->mdev;
1594 	err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
1595 					MLX5_PAGE_TRACK_STATE_REPORTING);
1596 	if (err)
1597 		goto end;
1598 
1599 	tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
1600 	while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
1601 	       !tracker->is_err) {
1602 		poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
1603 					      &tracker->status);
1604 		if (poll_err == CQ_EMPTY) {
1605 			mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
1606 				    cq->mcq.cons_index);
1607 			poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
1608 						      dirty, &tracker->status);
1609 			if (poll_err == CQ_EMPTY) {
1610 				wait_for_completion(&mvdev->tracker_comp);
1611 				continue;
1612 			}
1613 		}
1614 		if (poll_err == CQ_POLL_ERR) {
1615 			err = -EIO;
1616 			goto end;
1617 		}
1618 		mlx5_cq_set_ci(&cq->mcq);
1619 	}
1620 
1621 	if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
1622 		tracker->is_err = true;
1623 
1624 	if (tracker->is_err)
1625 		err = -EIO;
1626 end:
1627 	mlx5vf_state_mutex_unlock(mvdev);
1628 	return err;
1629 }
1630