1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /*
3 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4 */
5
6 #include "cmd.h"
7
/*
 * CQ status codes. Judging by the names they describe the outcome of polling
 * a completion queue; their users are not part of this section of the file.
 */
enum {
	CQ_OK = 0,
	CQ_EMPTY = -1,
	CQ_POLL_ERR = -2,
};
9
/*
 * Query the GENERAL_2 capabilities of function @func_id through @mdev and
 * check its 'migratable' bit.
 *
 * Return: 0 when the bit is set, -EOPNOTSUPP when it is clear, -ENOMEM when
 * the query buffer cannot be allocated, or the error returned by the query.
 */
static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
{
	void *out_buf;
	int err;

	out_buf = kzalloc(MLX5_ST_SZ_BYTES(query_hca_cap_out), GFP_KERNEL);
	if (!out_buf)
		return -ENOMEM;

	err = mlx5_vport_get_other_func_cap(mdev, func_id, out_buf,
					    MLX5_CAP_GENERAL_2);
	if (!err) {
		void *caps = MLX5_ADDR_OF(query_hca_cap_out, out_buf,
					  capability);

		if (!MLX5_GET(cmd_hca_cap_2, caps, migratable))
			err = -EOPNOTSUPP;
	}

	kfree(out_buf);
	return err;
}
32
/*
 * Forward declarations for helpers that are called before their definitions:
 * mlx5vf_cmd_get_vhca_id() is defined further down in this file, and
 * _mlx5vf_free_page_tracker_resources() is used by
 * mlx5vf_cmd_close_migratable().
 */
static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
				  u16 *vhca_id);
static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
37
/*
 * Send SUSPEND_VHCA for this VF's vhca_id with the requested @op_mod.
 *
 * The caller must hold state_mutex.
 *
 * Return: -ENOTCONN when the core device is detached, the error of an
 * interrupted wait on the save completion, or the firmware command result.
 */
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	u32 cmd_in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
	u32 cmd_out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
	struct mlx5_vf_migration_file *saving = mvdev->saving_migf;
	int ret;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * With PRE_COPY the saving file exists while the device still runs,
	 * so a save command may be in flight. Take the save completion first:
	 * a concurrent save would fail when it tries to enable 'tracking' on
	 * an already suspended device.
	 */
	if (saving) {
		ret = wait_for_completion_interruptible(&saving->save_comp);
		if (ret)
			return ret;
	}

	MLX5_SET(suspend_vhca_in, cmd_in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
	MLX5_SET(suspend_vhca_in, cmd_in, vhca_id, mvdev->vhca_id);
	MLX5_SET(suspend_vhca_in, cmd_in, op_mod, op_mod);

	ret = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, cmd_in, cmd_out);

	/* Hand the completion back so the next save/query can proceed. */
	if (saving)
		complete(&saving->save_comp);

	return ret;
}
71
/*
 * Send RESUME_VHCA for this VF's vhca_id with the requested @op_mod.
 *
 * The caller must hold state_mutex.
 *
 * Return: -ENOTCONN when the core device is detached, otherwise the firmware
 * command result.
 */
int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	u32 cmd_in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};
	u32 cmd_out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};

	lockdep_assert_held(&mvdev->state_mutex);

	if (mvdev->mdev_detach)
		return -ENOTCONN;

	MLX5_SET(resume_vhca_in, cmd_in, opcode, MLX5_CMD_OP_RESUME_VHCA);
	MLX5_SET(resume_vhca_in, cmd_in, vhca_id, mvdev->vhca_id);
	MLX5_SET(resume_vhca_in, cmd_in, op_mod, op_mod);

	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, cmd_in, cmd_out);
}
87
/*
 * Ask firmware how much memory the VF's migration state requires.
 *
 * @state_size receives required_umem_size on success, or 0 when an
 * incremental, non-final query is issued after a PRE_COPY error.
 * @query_flags is a mask of MLX5VF_QUERY_* bits.
 *
 * The caller must hold state_mutex.
 *
 * Return: 0 on success, -ENOTCONN when the core device is detached, the error
 * of an interrupted wait on the save completion, or the firmware command
 * error.
 */
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
					  size_t *state_size, u8 query_flags)
{
	u32 cmd_in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
	u32 cmd_out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
	/* Latched up front: decides whether save_comp is taken and released. */
	const bool serialize = query_flags & MLX5VF_QUERY_INC;
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * With PRE_COPY the saving file exists while the device still runs.
	 * An incremental query must not overlap an active save command, or it
	 * may fail on a vhca that is not being tracked yet.
	 */
	if (serialize) {
		struct mlx5_vf_migration_file *migf = mvdev->saving_migf;

		err = wait_for_completion_interruptible(&migf->save_comp);
		if (err)
			return err;

		if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) {
			/*
			 * After a PRE_COPY error only the final image is
			 * queried, and then as a full (non-incremental) one.
			 */
			if (!(query_flags & MLX5VF_QUERY_FINAL)) {
				*state_size = 0;
				complete(&migf->save_comp);
				return 0;
			}
			query_flags &= ~MLX5VF_QUERY_INC;
		}
	}

	MLX5_SET(query_vhca_migration_state_in, cmd_in, opcode,
		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
	MLX5_SET(query_vhca_migration_state_in, cmd_in, vhca_id,
		 mvdev->vhca_id);
	MLX5_SET(query_vhca_migration_state_in, cmd_in, op_mod, 0);
	MLX5_SET(query_vhca_migration_state_in, cmd_in, incremental,
		 query_flags & MLX5VF_QUERY_INC);

	err = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state,
				  cmd_in, cmd_out);
	if (serialize)
		complete(&mvdev->saving_migf->save_comp);

	if (!err)
		*state_size = MLX5_GET(query_vhca_migration_state_out, cmd_out,
				       required_umem_size);
	return err;
}
144
set_tracker_error(struct mlx5vf_pci_core_device * mvdev)145 static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
146 {
147 /* Mark the tracker under an error and wake it up if it's running */
148 mvdev->tracker.is_err = true;
149 complete(&mvdev->tracker_comp);
150 }
151
mlx5fv_vf_event(struct notifier_block * nb,unsigned long event,void * data)152 static int mlx5fv_vf_event(struct notifier_block *nb,
153 unsigned long event, void *data)
154 {
155 struct mlx5vf_pci_core_device *mvdev =
156 container_of(nb, struct mlx5vf_pci_core_device, nb);
157
158 switch (event) {
159 case MLX5_PF_NOTIFY_ENABLE_VF:
160 mutex_lock(&mvdev->state_mutex);
161 mvdev->mdev_detach = false;
162 mlx5vf_state_mutex_unlock(mvdev);
163 break;
164 case MLX5_PF_NOTIFY_DISABLE_VF:
165 mlx5vf_cmd_close_migratable(mvdev);
166 mutex_lock(&mvdev->state_mutex);
167 mvdev->mdev_detach = true;
168 mlx5vf_state_mutex_unlock(mvdev);
169 break;
170 default:
171 break;
172 }
173
174 return 0;
175 }
176
mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device * mvdev)177 void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
178 {
179 if (!mvdev->migrate_cap)
180 return;
181
182 /* Must be done outside the lock to let it progress */
183 set_tracker_error(mvdev);
184 mutex_lock(&mvdev->state_mutex);
185 mlx5vf_disable_fds(mvdev);
186 _mlx5vf_free_page_tracker_resources(mvdev);
187 mlx5vf_state_mutex_unlock(mvdev);
188 }
189
mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device * mvdev)190 void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
191 {
192 if (!mvdev->migrate_cap)
193 return;
194
195 mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
196 &mvdev->nb);
197 destroy_workqueue(mvdev->cb_wq);
198 }
199
mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device * mvdev,const struct vfio_migration_ops * mig_ops,const struct vfio_log_ops * log_ops)200 void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
201 const struct vfio_migration_ops *mig_ops,
202 const struct vfio_log_ops *log_ops)
203 {
204 struct pci_dev *pdev = mvdev->core_device.pdev;
205 int ret;
206
207 if (!pdev->is_virtfn)
208 return;
209
210 mvdev->mdev = mlx5_vf_get_core_dev(pdev);
211 if (!mvdev->mdev)
212 return;
213
214 if (!MLX5_CAP_GEN(mvdev->mdev, migration))
215 goto end;
216
217 mvdev->vf_id = pci_iov_vf_id(pdev);
218 if (mvdev->vf_id < 0)
219 goto end;
220
221 ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1);
222 if (ret)
223 goto end;
224
225 if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
226 &mvdev->vhca_id))
227 goto end;
228
229 mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
230 if (!mvdev->cb_wq)
231 goto end;
232
233 mutex_init(&mvdev->state_mutex);
234 spin_lock_init(&mvdev->reset_lock);
235 mvdev->nb.notifier_call = mlx5fv_vf_event;
236 ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
237 &mvdev->nb);
238 if (ret) {
239 destroy_workqueue(mvdev->cb_wq);
240 goto end;
241 }
242
243 mvdev->migrate_cap = 1;
244 mvdev->core_device.vdev.migration_flags =
245 VFIO_MIGRATION_STOP_COPY |
246 VFIO_MIGRATION_P2P;
247 mvdev->core_device.vdev.mig_ops = mig_ops;
248 init_completion(&mvdev->tracker_comp);
249 if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
250 mvdev->core_device.vdev.log_ops = log_ops;
251
252 if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
253 MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
254 mvdev->core_device.vdev.migration_flags |=
255 VFIO_MIGRATION_PRE_COPY;
256
257 end:
258 mlx5_vf_put_core_dev(mvdev->mdev);
259 }
260
/*
 * Read the vhca_id of another function (@function_id) by querying its current
 * general device capabilities through @mdev.
 *
 * Return: 0 with *@vhca_id filled in, -ENOMEM when the output buffer cannot
 * be allocated, or the firmware command error (*@vhca_id is left untouched).
 */
static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
				  u16 *vhca_id)
{
	u32 cmd_in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
	void *cmd_out;
	int err;

	cmd_out = kzalloc(MLX5_ST_SZ_BYTES(query_hca_cap_out), GFP_KERNEL);
	if (!cmd_out)
		return -ENOMEM;

	MLX5_SET(query_hca_cap_in, cmd_in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
	MLX5_SET(query_hca_cap_in, cmd_in, other_function, 1);
	MLX5_SET(query_hca_cap_in, cmd_in, function_id, function_id);
	MLX5_SET(query_hca_cap_in, cmd_in, op_mod,
		 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
		 HCA_CAP_OPMOD_GET_CUR);

	err = mlx5_cmd_exec_inout(mdev, query_hca_cap, cmd_in, cmd_out);
	if (!err)
		*vhca_id = MLX5_GET(query_hca_cap_out, cmd_out,
				    capability.cmd_hca_cap.vhca_id);

	kfree(cmd_out);
	return err;
}
292
/*
 * Create an MTT-mode mkey in protection domain @pdn with local and remote
 * read/write access.
 *
 * Exactly one page source is used: when @buf is non-NULL its DMA-mapped
 * scatter table supplies the page addresses, otherwise @recv_buf->dma_addrs
 * does. The key length is the page count times PAGE_SIZE.
 *
 * Return: 0 with *@mkey set, -ENOMEM when the command buffer cannot be
 * allocated, or the error from mlx5_core_create_mkey().
 */
static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
			struct mlx5_vhca_data_buffer *buf,
			struct mlx5_vhca_recv_buf *recv_buf,
			u32 *mkey)
{
	size_t npages;
	__be64 *mtt_entry;
	void *mkc;
	u32 *cmd_in;
	int inlen;
	int ret;

	if (buf)
		npages = DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE);
	else
		npages = recv_buf->npages;

	/* The translation area is sized for an even number of entries. */
	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
		sizeof(*mtt_entry) * round_up(npages, 2);

	cmd_in = kvzalloc(inlen, GFP_KERNEL);
	if (!cmd_in)
		return -ENOMEM;

	MLX5_SET(create_mkey_in, cmd_in, translations_octword_actual_size,
		 DIV_ROUND_UP(npages, 2));
	mtt_entry = (__be64 *)MLX5_ADDR_OF(create_mkey_in, cmd_in,
					   klm_pas_mtt);

	if (!buf) {
		int idx;

		for (idx = 0; idx < npages; idx++)
			mtt_entry[idx] = cpu_to_be64(recv_buf->dma_addrs[idx]);
	} else {
		struct sg_dma_page_iter iter;

		for_each_sgtable_dma_page(&buf->table.sgt, &iter, 0) {
			*mtt_entry = cpu_to_be64(sg_page_iter_dma_address(&iter));
			mtt_entry++;
		}
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, cmd_in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, pd, pdn);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
	MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);

	ret = mlx5_core_create_mkey(mdev, mkey, cmd_in, inlen);

	kvfree(cmd_in);
	return ret;
}
344
mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer * buf)345 static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
346 {
347 struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
348 struct mlx5_core_dev *mdev = mvdev->mdev;
349 int ret;
350
351 lockdep_assert_held(&mvdev->state_mutex);
352 if (mvdev->mdev_detach)
353 return -ENOTCONN;
354
355 if (buf->dmaed || !buf->allocated_length)
356 return -EINVAL;
357
358 ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
359 if (ret)
360 return ret;
361
362 ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
363 if (ret)
364 goto err;
365
366 buf->dmaed = true;
367
368 return 0;
369 err:
370 dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
371 return ret;
372 }
373
mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer * buf)374 void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
375 {
376 struct mlx5_vf_migration_file *migf = buf->migf;
377 struct sg_page_iter sg_iter;
378
379 lockdep_assert_held(&migf->mvdev->state_mutex);
380 WARN_ON(migf->mvdev->mdev_detach);
381
382 if (buf->dmaed) {
383 mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
384 dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
385 buf->dma_dir, 0);
386 }
387
388 /* Undo alloc_pages_bulk_array() */
389 for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
390 __free_page(sg_page_iter_page(&sg_iter));
391 sg_free_append_table(&buf->table);
392 kfree(buf);
393 }
394
395 struct mlx5_vhca_data_buffer *
mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file * migf,size_t length,enum dma_data_direction dma_dir)396 mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
397 size_t length,
398 enum dma_data_direction dma_dir)
399 {
400 struct mlx5_vhca_data_buffer *buf;
401 int ret;
402
403 buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
404 if (!buf)
405 return ERR_PTR(-ENOMEM);
406
407 buf->dma_dir = dma_dir;
408 buf->migf = migf;
409 if (length) {
410 ret = mlx5vf_add_migration_pages(buf,
411 DIV_ROUND_UP_ULL(length, PAGE_SIZE));
412 if (ret)
413 goto end;
414
415 if (dma_dir != DMA_NONE) {
416 ret = mlx5vf_dma_data_buffer(buf);
417 if (ret)
418 goto end;
419 }
420 }
421
422 return buf;
423 end:
424 mlx5vf_free_data_buffer(buf);
425 return ERR_PTR(ret);
426 }
427
mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer * buf)428 void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
429 {
430 spin_lock_irq(&buf->migf->list_lock);
431 list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
432 spin_unlock_irq(&buf->migf->list_lock);
433 }
434
435 struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buffer(struct mlx5_vf_migration_file * migf,size_t length,enum dma_data_direction dma_dir)436 mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
437 size_t length, enum dma_data_direction dma_dir)
438 {
439 struct mlx5_vhca_data_buffer *buf, *temp_buf;
440 struct list_head free_list;
441
442 lockdep_assert_held(&migf->mvdev->state_mutex);
443 if (migf->mvdev->mdev_detach)
444 return ERR_PTR(-ENOTCONN);
445
446 INIT_LIST_HEAD(&free_list);
447
448 spin_lock_irq(&migf->list_lock);
449 list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
450 if (buf->dma_dir == dma_dir) {
451 list_del_init(&buf->buf_elm);
452 if (buf->allocated_length >= length) {
453 spin_unlock_irq(&migf->list_lock);
454 goto found;
455 }
456 /*
457 * Prevent holding redundant buffers. Put in a free
458 * list and call at the end not under the spin lock
459 * (&migf->list_lock) to mlx5vf_free_data_buffer which
460 * might sleep.
461 */
462 list_add(&buf->buf_elm, &free_list);
463 }
464 }
465 spin_unlock_irq(&migf->list_lock);
466 buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);
467
468 found:
469 while ((temp_buf = list_first_entry_or_null(&free_list,
470 struct mlx5_vhca_data_buffer, buf_elm))) {
471 list_del(&temp_buf->buf_elm);
472 mlx5vf_free_data_buffer(temp_buf);
473 }
474
475 return buf;
476 }
477
mlx5vf_mig_file_cleanup_cb(struct work_struct * _work)478 void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
479 {
480 struct mlx5vf_async_data *async_data = container_of(_work,
481 struct mlx5vf_async_data, work);
482 struct mlx5_vf_migration_file *migf = container_of(async_data,
483 struct mlx5_vf_migration_file, async_data);
484
485 mutex_lock(&migf->lock);
486 if (async_data->status) {
487 mlx5vf_put_data_buffer(async_data->buf);
488 if (async_data->header_buf)
489 mlx5vf_put_data_buffer(async_data->header_buf);
490 if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
491 migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
492 else
493 migf->state = MLX5_MIGF_STATE_ERROR;
494 wake_up_interruptible(&migf->poll_wait);
495 }
496 mutex_unlock(&migf->lock);
497 kvfree(async_data->out);
498 complete(&migf->save_comp);
499 fput(migf->filp);
500 }
501
/*
 * Write a migration record header describing an image of @image_size bytes
 * into the first page of @header_buf and append the buffer to the migration
 * file's buf_list, advancing max_pos by the header size.
 *
 * When @initial_pre_copy is set the header size is also added to
 * pre_copy_initial_bytes.
 *
 * Return: 0 on success, -EINVAL when the buffer has no page at offset 0.
 */
static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
			  size_t image_size, bool initial_pre_copy)
{
	struct mlx5_vf_migration_file *migf = header_buf->migf;
	struct mlx5_vf_migration_header hdr = {
		.record_size = cpu_to_le64(image_size),
		.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY),
		.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA),
	};
	unsigned long irq_flags;
	struct page *first_page;
	u8 *dst;

	first_page = mlx5vf_get_migration_page(header_buf, 0);
	if (!first_page)
		return -EINVAL;

	dst = kmap_local_page(first_page);
	memcpy(dst, &hdr, sizeof(hdr));
	kunmap_local(dst);

	header_buf->length = sizeof(hdr);
	header_buf->start_pos = migf->max_pos;
	migf->max_pos += header_buf->length;

	spin_lock_irqsave(&migf->list_lock, irq_flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, irq_flags);

	if (initial_pre_copy)
		migf->pre_copy_initial_bytes += sizeof(hdr);

	return 0;
}
530
/*
 * Completion callback for the asynchronous SAVE_VHCA_STATE command issued by
 * mlx5vf_cmd_save_vhca_state().
 *
 * @status:  command completion status; 0 means success.
 * @context: the cb_work member embedded in the migration file's async_data.
 *
 * On success the optional header buffer and then the data buffer are appended
 * to migf->buf_list, max_pos is advanced, the file state is updated and
 * pollers are woken. In all cases the final status is stored in async_data
 * and the cleanup work is queued on the device's cb_wq.
 */
static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5vf_async_data *async_data = container_of(context,
			struct mlx5vf_async_data, cb_work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
			struct mlx5_vf_migration_file, async_data);

	if (!status) {
		size_t image_size;
		unsigned long flags;
		/*
		 * True only for a non-final chunk saved while the file has
		 * not reached the PRE_COPY state yet.
		 */
		bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
				!async_data->last_chunk;

		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
				      actual_image_size);
		if (async_data->header_buf) {
			/* The header is queued ahead of the data it describes */
			status = add_buf_header(async_data->header_buf, image_size,
						initial_pre_copy);
			if (status)
				goto err;
		}
		async_data->buf->length = image_size;
		async_data->buf->start_pos = migf->max_pos;
		migf->max_pos += async_data->buf->length;
		spin_lock_irqsave(&migf->list_lock, flags);
		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
		spin_unlock_irqrestore(&migf->list_lock, flags);
		if (initial_pre_copy)
			migf->pre_copy_initial_bytes += image_size;
		migf->state = async_data->last_chunk ?
			MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY;
		wake_up_interruptible(&migf->poll_wait);
	}

err:
	/*
	 * The error and the cleanup flows can't run from an
	 * interrupt context
	 */
	/* For -EREMOTEIO, record the status field of the command output */
	if (status == -EREMOTEIO)
		status = MLX5_GET(save_vhca_state_out, async_data->out, status);
	async_data->status = status;
	queue_work(migf->mvdev->cb_wq, &async_data->work);
}
575
/*
 * Start an asynchronous SAVE_VHCA_STATE command that writes the device state
 * into @buf.
 *
 * @mvdev: device whose state is saved.
 * @migf:  migration file that owns @buf and the async context.
 * @buf:   destination buffer; its mkey and allocated_length go into the
 *         command.
 * @inc:   request an incremental image (forced to false after a PRE_COPY
 *         error).
 * @track: value for the command's set_track field; when false this save is
 *         treated as the last chunk.
 *
 * The caller must hold state_mutex. save_comp is taken here; it is given back
 * on the error paths below, and by mlx5vf_mig_file_cleanup_cb() once the
 * command's completion callback has queued the cleanup work.
 *
 * Return: 0 when the command was submitted, -ENOTCONN when the core device is
 * detached, the error of an interrupted wait, -ENOMEM, or the error from
 * obtaining a header buffer or from submitting the command.
 */
int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *buf, bool inc,
			       bool track)
{
	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5vf_async_data *async_data;
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/* Only one save command may be in flight per migration file */
	err = wait_for_completion_interruptible(&migf->save_comp);
	if (err)
		return err;

	if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
		/*
		 * In case we had a PRE_COPY error, SAVE is triggered only for
		 * the final image, read device full image.
		 */
		inc = false;

	MLX5_SET(save_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_SAVE_VHCA_STATE);
	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
	MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
	MLX5_SET(save_vhca_state_in, in, incremental, inc);
	MLX5_SET(save_vhca_state_in, in, set_track, track);

	async_data = &migf->async_data;
	async_data->buf = buf;
	async_data->last_chunk = !track;
	async_data->out = kvzalloc(out_size, GFP_KERNEL);
	if (!async_data->out) {
		err = -ENOMEM;
		goto err_out;
	}

	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
		/*
		 * A record header accompanies the image. For the last chunk
		 * the header buffer already held by the file is used when
		 * there is one; otherwise a buffer is obtained here.
		 */
		if (async_data->last_chunk && migf->buf_header) {
			header_buf = migf->buf_header;
			migf->buf_header = NULL;
		} else {
			header_buf = mlx5vf_get_data_buffer(migf,
				sizeof(struct mlx5_vf_migration_header), DMA_NONE);
			if (IS_ERR(header_buf)) {
				err = PTR_ERR(header_buf);
				goto err_free;
			}
		}
	}

	if (async_data->last_chunk)
		migf->state = MLX5_MIGF_STATE_SAVE_LAST;

	async_data->header_buf = header_buf;
	/* Reference held for the command; dropped with fput() when done */
	get_file(migf->filp);
	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
			       async_data->out,
			       out_size, mlx5vf_save_callback,
			       &async_data->cb_work);
	if (err)
		goto err_exec;

	return 0;

err_exec:
	if (header_buf)
		mlx5vf_put_data_buffer(header_buf);
	fput(migf->filp);
err_free:
	kvfree(async_data->out);
err_out:
	complete(&migf->save_comp);
	return err;
}
658
mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device * mvdev,struct mlx5_vf_migration_file * migf,struct mlx5_vhca_data_buffer * buf)659 int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
660 struct mlx5_vf_migration_file *migf,
661 struct mlx5_vhca_data_buffer *buf)
662 {
663 u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
664 u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
665 int err;
666
667 lockdep_assert_held(&mvdev->state_mutex);
668 if (mvdev->mdev_detach)
669 return -ENOTCONN;
670
671 if (!buf->dmaed) {
672 err = mlx5vf_dma_data_buffer(buf);
673 if (err)
674 return err;
675 }
676
677 MLX5_SET(load_vhca_state_in, in, opcode,
678 MLX5_CMD_OP_LOAD_VHCA_STATE);
679 MLX5_SET(load_vhca_state_in, in, op_mod, 0);
680 MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
681 MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
682 MLX5_SET(load_vhca_state_in, in, size, buf->length);
683 return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
684 }
685
mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file * migf)686 int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
687 {
688 int err;
689
690 lockdep_assert_held(&migf->mvdev->state_mutex);
691 if (migf->mvdev->mdev_detach)
692 return -ENOTCONN;
693
694 err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
695 return err;
696 }
697
mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file * migf)698 void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
699 {
700 lockdep_assert_held(&migf->mvdev->state_mutex);
701 if (migf->mvdev->mdev_detach)
702 return;
703
704 mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
705 }
706
mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file * migf)707 void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
708 {
709 struct mlx5_vhca_data_buffer *entry;
710
711 lockdep_assert_held(&migf->mvdev->state_mutex);
712 WARN_ON(migf->mvdev->mdev_detach);
713
714 if (migf->buf) {
715 mlx5vf_free_data_buffer(migf->buf);
716 migf->buf = NULL;
717 }
718
719 if (migf->buf_header) {
720 mlx5vf_free_data_buffer(migf->buf_header);
721 migf->buf_header = NULL;
722 }
723
724 list_splice(&migf->avail_list, &migf->buf_list);
725
726 while ((entry = list_first_entry_or_null(&migf->buf_list,
727 struct mlx5_vhca_data_buffer, buf_elm))) {
728 list_del(&entry->buf_elm);
729 mlx5vf_free_data_buffer(entry);
730 }
731
732 mlx5vf_cmd_dealloc_pd(migf);
733 }
734
/*
 * Create the firmware page-track object for the IOVA ranges in @ranges.
 *
 * @mdev:   core device the command is executed on.
 * @mvdev:  VF device; supplies the vhca_id and the tracker with its QPs.
 * @ranges: interval tree of the IOVA ranges to track.
 * @nnodes: number of nodes in @ranges.
 *
 * When @nnodes exceeds the device's pg_track_max_num_range the ranges are
 * first combined down to that limit.
 *
 * Return: 0 with tracker->id set to the new object id, -ENOMEM, -EOPNOTSUPP
 * when the log2 of the total tracked length is outside the limits reported by
 * the device, or the firmware command error.
 */
static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
				 struct mlx5vf_pci_core_device *mvdev,
				 struct rb_root_cached *ranges, u32 nnodes)
{
	int max_num_range =
		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	struct interval_tree_node *node = NULL;
	u64 total_ranges_len = 0;
	u32 num_ranges = nnodes;
	u8 log_addr_space_size;
	void *range_list_ptr;
	void *obj_context;
	void *cmd_hdr;
	int inlen;
	void *in;
	int err;
	int i;

	if (num_ranges > max_num_range) {
		vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
		num_ranges = max_num_range;
	}

	/* Fixed part of the command plus one record per tracked range */
	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
				 record_size * num_ranges;
	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
			       general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
		 MLX5_OBJ_TYPE_PAGE_TRACK);
	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
	MLX5_SET(page_track, obj_context, track_type, 1);
	MLX5_SET(page_track, obj_context, log_page_size,
		 ilog2(tracker->host_qp->tracked_page_size));
	MLX5_SET(page_track, obj_context, log_msg_size,
		 ilog2(tracker->host_qp->max_msg_size));
	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);

	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
	/*
	 * NOTE(review): node is dereferenced without a NULL check, so the
	 * tree is assumed to hold at least num_ranges nodes - confirm that
	 * callers guarantee this.
	 */
	for (i = 0; i < num_ranges; i++) {
		void *addr_range_i_base = range_list_ptr + record_size * i;
		/* 'last' is inclusive, hence the + 1 */
		unsigned long length = node->last - node->start + 1;

		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
			   node->start);
		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
		total_ranges_len += length;
		node = interval_tree_iter_next(node, 0, ULONG_MAX);
	}

	/* Nodes left over here mean the tree holds more than num_ranges */
	WARN_ON(node);
	/* log2 of the total tracked length, rounded up to a power of two */
	log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
	if (log_addr_space_size <
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
	    log_addr_space_size >
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
		err = -EOPNOTSUPP;
		goto out;
	}

	MLX5_SET(page_track, obj_context, log_addr_space_size,
		 log_addr_space_size);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	if (err)
		goto out;

	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
out:
	kfree(in);
	return err;
}
817
/*
 * Destroy the page-track general object identified by @tracker_id.
 *
 * Return: the result of the firmware command.
 */
static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
				      u32 tracker_id)
{
	u32 hdr_out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	u32 hdr_in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};

	MLX5_SET(general_obj_in_cmd_hdr, hdr_in, opcode,
		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, hdr_in, obj_type,
		 MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, hdr_in, obj_id, tracker_id);

	return mlx5_cmd_exec(mdev, hdr_in, sizeof(hdr_in),
			     hdr_out, sizeof(hdr_out));
}
830
/*
 * Modify the page-track object @tracker_id: set its range start address to
 * @iova, its length to @length and its state to @tracker_state, with
 * modify_field_select set to 0x3.
 *
 * Return: the result of the firmware command.
 */
static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
				     u32 tracker_id, unsigned long iova,
				     unsigned long length, u32 tracker_state)
{
	u32 cmd_in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
	u32 cmd_out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	void *hdr = MLX5_ADDR_OF(modify_page_track_obj_in, cmd_in,
				 general_obj_in_cmd_hdr);
	void *ctx = MLX5_ADDR_OF(modify_page_track_obj_in, cmd_in,
				 obj_context);

	MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode,
		 MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type,
		 MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_id, tracker_id);

	MLX5_SET64(page_track, ctx, modify_field_select, 0x3);
	MLX5_SET64(page_track, ctx, range_start_address, iova);
	MLX5_SET64(page_track, ctx, length, length);
	MLX5_SET(page_track, ctx, state, tracker_state);

	return mlx5_cmd_exec(mdev, cmd_in, sizeof(cmd_in),
			     cmd_out, sizeof(cmd_out));
}
853
alloc_cq_frag_buf(struct mlx5_core_dev * mdev,struct mlx5_vhca_cq_buf * buf,int nent,int cqe_size)854 static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
855 struct mlx5_vhca_cq_buf *buf, int nent,
856 int cqe_size)
857 {
858 struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
859 u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
860 u8 log_wq_sz = ilog2(cqe_size);
861 int err;
862
863 err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
864 mdev->priv.numa_node);
865 if (err)
866 return err;
867
868 mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
869 buf->cqe_size = cqe_size;
870 buf->nent = nent;
871 return 0;
872 }
873
init_cq_frag_buf(struct mlx5_vhca_cq_buf * buf)874 static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
875 {
876 struct mlx5_cqe64 *cqe64;
877 void *cqe;
878 int i;
879
880 for (i = 0; i < buf->nent; i++) {
881 cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
882 cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
883 cqe64->op_own = MLX5_CQE_INVALID << 4;
884 }
885 }
886
mlx5vf_destroy_cq(struct mlx5_core_dev * mdev,struct mlx5_vhca_cq * cq)887 static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
888 struct mlx5_vhca_cq *cq)
889 {
890 mlx5_core_destroy_cq(mdev, &cq->mcq);
891 mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
892 mlx5_db_free(mdev, &cq->db);
893 }
894
mlx5vf_cq_event(struct mlx5_core_cq * mcq,enum mlx5_event type)895 static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
896 {
897 if (type != MLX5_EVENT_TYPE_CQ_ERROR)
898 return;
899
900 set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
901 tracker.cq.mcq));
902 }
903
mlx5vf_event_notifier(struct notifier_block * nb,unsigned long type,void * data)904 static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
905 void *data)
906 {
907 struct mlx5_vhca_page_tracker *tracker =
908 mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
909 struct mlx5vf_pci_core_device *mvdev = container_of(
910 tracker, struct mlx5vf_pci_core_device, tracker);
911 struct mlx5_eqe *eqe = data;
912 u8 event_type = (u8)type;
913 u8 queue_type;
914 int qp_num;
915
916 switch (event_type) {
917 case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
918 case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
919 case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
920 queue_type = eqe->data.qp_srq.type;
921 if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
922 break;
923 qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
924 if (qp_num != tracker->host_qp->qpn &&
925 qp_num != tracker->fw_qp->qpn)
926 break;
927 set_tracker_error(mvdev);
928 break;
929 default:
930 break;
931 }
932
933 return NOTIFY_OK;
934 }
935
mlx5vf_cq_complete(struct mlx5_core_cq * mcq,struct mlx5_eqe * eqe)936 static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
937 struct mlx5_eqe *eqe)
938 {
939 struct mlx5vf_pci_core_device *mvdev =
940 container_of(mcq, struct mlx5vf_pci_core_device,
941 tracker.cq.mcq);
942
943 complete(&mvdev->tracker_comp);
944 }
945
/*
 * Create and arm the completion queue used by the page tracker.
 *
 * @mdev:    core device the CQ is created on.
 * @tracker: tracker whose embedded CQ is set up; its UAR is used for the CQ
 *           context and for arming.
 * @ncqe:    requested number of CQEs; rounded up to a power of two.
 *
 * The CQE size is 128 bytes when the cache line size is 128, otherwise 64.
 * The doorbell record and CQ buffer are allocated on the device's NUMA node.
 *
 * Return: 0 on success; otherwise the failing step's error (or -ENOMEM), with
 * everything allocated here released again.
 */
static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
			    struct mlx5_vhca_page_tracker *tracker,
			    size_t ncqe)
{
	int cqe_size = cache_line_size() == 128 ? 128 : 64;
	/* Zero-initialized like every other command output in this file */
	u32 out[MLX5_ST_SZ_DW(create_cq_out)] = {};
	struct mlx5_vhca_cq *cq;
	int inlen, err, eqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;

	cq = &tracker->cq;
	ncqe = roundup_pow_of_two(ncqe);
	err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
	if (err)
		return err;

	cq->ncqe = ncqe;
	/* Two doorbell words: consumer index first, arm second */
	cq->mcq.set_ci_db = cq->db.db;
	cq->mcq.arm_db = cq->db.db + 1;
	cq->mcq.cqe_sz = cqe_size;
	err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
	if (err)
		goto err_db_free;

	init_cq_frag_buf(&cq->buf);
	/* Fixed part of the command plus one address per buffer page */
	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
		cq->buf.frag_buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_buff;
	}

	/* Completion vector is chosen from the CPU this runs on */
	vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev);
	err = mlx5_comp_eqn_get(mdev, vector, &eqn);
	if (err)
		goto err_vec;

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
	MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
	MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
		 MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
	cq->mcq.comp = mlx5vf_cq_complete;
	cq->mcq.event = mlx5vf_cq_event;
	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	if (err)
		goto err_vec;

	mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
		    cq->mcq.cons_index);
	kvfree(in);
	return 0;

err_vec:
	kvfree(in);
err_buff:
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
err_db_free:
	mlx5_db_free(mdev, &cq->db);
	return err;
}
1015
1016 static struct mlx5_vhca_qp *
mlx5vf_create_rc_qp(struct mlx5_core_dev * mdev,struct mlx5_vhca_page_tracker * tracker,u32 max_recv_wr)1017 mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
1018 struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
1019 {
1020 u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
1021 struct mlx5_vhca_qp *qp;
1022 u8 log_rq_stride;
1023 u8 log_rq_sz;
1024 void *qpc;
1025 int inlen;
1026 void *in;
1027 int err;
1028
1029 qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT);
1030 if (!qp)
1031 return ERR_PTR(-ENOMEM);
1032
1033 err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
1034 if (err)
1035 goto err_free;
1036
1037 if (max_recv_wr) {
1038 qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
1039 log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
1040 log_rq_sz = ilog2(qp->rq.wqe_cnt);
1041 err = mlx5_frag_buf_alloc_node(mdev,
1042 wq_get_byte_sz(log_rq_sz, log_rq_stride),
1043 &qp->buf, mdev->priv.numa_node);
1044 if (err)
1045 goto err_db_free;
1046 mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
1047 }
1048
1049 qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
1050 inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
1051 MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
1052 qp->buf.npages;
1053 in = kvzalloc(inlen, GFP_KERNEL);
1054 if (!in) {
1055 err = -ENOMEM;
1056 goto err_in;
1057 }
1058
1059 qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
1060 MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
1061 MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
1062 MLX5_SET(qpc, qpc, pd, tracker->pdn);
1063 MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
1064 MLX5_SET(qpc, qpc, log_page_size,
1065 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
1066 MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
1067 if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
1068 MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
1069 MLX5_SET(qpc, qpc, no_sq, 1);
1070 if (max_recv_wr) {
1071 MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
1072 MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
1073 MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
1074 MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
1075 MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
1076 mlx5_fill_page_frag_array(&qp->buf,
1077 (__be64 *)MLX5_ADDR_OF(create_qp_in,
1078 in, pas));
1079 } else {
1080 MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
1081 }
1082
1083 MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
1084 err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
1085 kvfree(in);
1086 if (err)
1087 goto err_in;
1088
1089 qp->qpn = MLX5_GET(create_qp_out, out, qpn);
1090 return qp;
1091
1092 err_in:
1093 if (max_recv_wr)
1094 mlx5_frag_buf_free(mdev, &qp->buf);
1095 err_db_free:
1096 mlx5_db_free(mdev, &qp->db);
1097 err_free:
1098 kfree(qp);
1099 return ERR_PTR(err);
1100 }
1101
mlx5vf_post_recv(struct mlx5_vhca_qp * qp)1102 static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
1103 {
1104 struct mlx5_wqe_data_seg *data;
1105 unsigned int ix;
1106
1107 WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
1108 ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
1109 data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
1110 data->byte_count = cpu_to_be32(qp->max_msg_size);
1111 data->lkey = cpu_to_be32(qp->recv_buf.mkey);
1112 data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
1113 qp->rq.pc++;
1114 /* Make sure that descriptors are written before doorbell record. */
1115 dma_wmb();
1116 *qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
1117 }
1118
/*
 * Walk a freshly created QP through its state transitions.
 *
 * @mdev:       mlx5 core device.
 * @qp:         QP to activate.
 * @remote_qpn: number of the peer QP this one is connected to.
 * @host_qp:    true for the receiving (host) side: its receive ring is filled
 *              after the INIT transition and it stops at RTR.  false for the
 *              other side, which is additionally moved to RTS.
 *
 * Returns 0 on success or the error of the first failing command.
 */
static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
			      struct mlx5_vhca_qp *qp, u32 remote_qpn,
			      bool host_qp)
{
	u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
	u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
	u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
	void *qpc;
	int ret;
	int i;

	/* RESET -> INIT */
	MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
	MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
	qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
	MLX5_SET(qpc, qpc, rwe, 1);
	MLX5_SET(qpc, qpc, rre, 1);
	ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
	if (ret)
		return ret;

	if (host_qp) {
		/* Fill the whole ring, one max_msg_size slice per WQE. */
		for (i = 0; i < qp->rq.wqe_cnt; i++) {
			mlx5vf_post_recv(qp);
			qp->recv_buf.next_rq_offset += qp->max_msg_size;
		}
	}

	/* INIT -> RTR */
	MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
	qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
	MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
	MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
	MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
	MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
	MLX5_SET(qpc, qpc, min_rnr_nak, 1);
	ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
	if (ret)
		return ret;

	/* The host side only receives; it is done once it reaches RTR. */
	if (host_qp)
		return 0;

	/* RTR -> RTS */
	MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
	MLX5_SET(qpc, qpc, retry_count, 7);
	MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */

	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
}
1177
mlx5vf_destroy_qp(struct mlx5_core_dev * mdev,struct mlx5_vhca_qp * qp)1178 static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
1179 struct mlx5_vhca_qp *qp)
1180 {
1181 u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
1182
1183 MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
1184 MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
1185 mlx5_cmd_exec_in(mdev, destroy_qp, in);
1186
1187 mlx5_frag_buf_free(mdev, &qp->buf);
1188 mlx5_db_free(mdev, &qp->db);
1189 kfree(qp);
1190 }
1191
free_recv_pages(struct mlx5_vhca_recv_buf * recv_buf)1192 static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
1193 {
1194 int i;
1195
1196 /* Undo alloc_pages_bulk_array() */
1197 for (i = 0; i < recv_buf->npages; i++)
1198 __free_page(recv_buf->page_list[i]);
1199
1200 kvfree(recv_buf->page_list);
1201 }
1202
alloc_recv_pages(struct mlx5_vhca_recv_buf * recv_buf,unsigned int npages)1203 static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
1204 unsigned int npages)
1205 {
1206 unsigned int filled = 0, done = 0;
1207 int i;
1208
1209 recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
1210 GFP_KERNEL_ACCOUNT);
1211 if (!recv_buf->page_list)
1212 return -ENOMEM;
1213
1214 for (;;) {
1215 filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
1216 npages - done,
1217 recv_buf->page_list + done);
1218 if (!filled)
1219 goto err;
1220
1221 done += filled;
1222 if (done == npages)
1223 break;
1224 }
1225
1226 recv_buf->npages = npages;
1227 return 0;
1228
1229 err:
1230 for (i = 0; i < npages; i++) {
1231 if (recv_buf->page_list[i])
1232 __free_page(recv_buf->page_list[i]);
1233 }
1234
1235 kvfree(recv_buf->page_list);
1236 return -ENOMEM;
1237 }
1238
register_dma_recv_pages(struct mlx5_core_dev * mdev,struct mlx5_vhca_recv_buf * recv_buf)1239 static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
1240 struct mlx5_vhca_recv_buf *recv_buf)
1241 {
1242 int i, j;
1243
1244 recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
1245 sizeof(*recv_buf->dma_addrs),
1246 GFP_KERNEL_ACCOUNT);
1247 if (!recv_buf->dma_addrs)
1248 return -ENOMEM;
1249
1250 for (i = 0; i < recv_buf->npages; i++) {
1251 recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
1252 recv_buf->page_list[i],
1253 0, PAGE_SIZE,
1254 DMA_FROM_DEVICE);
1255 if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
1256 goto error;
1257 }
1258 return 0;
1259
1260 error:
1261 for (j = 0; j < i; j++)
1262 dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
1263 PAGE_SIZE, DMA_FROM_DEVICE);
1264
1265 kvfree(recv_buf->dma_addrs);
1266 return -ENOMEM;
1267 }
1268
unregister_dma_recv_pages(struct mlx5_core_dev * mdev,struct mlx5_vhca_recv_buf * recv_buf)1269 static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
1270 struct mlx5_vhca_recv_buf *recv_buf)
1271 {
1272 int i;
1273
1274 for (i = 0; i < recv_buf->npages; i++)
1275 dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
1276 PAGE_SIZE, DMA_FROM_DEVICE);
1277
1278 kvfree(recv_buf->dma_addrs);
1279 }
1280
mlx5vf_free_qp_recv_resources(struct mlx5_core_dev * mdev,struct mlx5_vhca_qp * qp)1281 static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
1282 struct mlx5_vhca_qp *qp)
1283 {
1284 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1285
1286 mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
1287 unregister_dma_recv_pages(mdev, recv_buf);
1288 free_recv_pages(&qp->recv_buf);
1289 }
1290
/*
 * Build the receive buffer of @qp: allocate enough pages to cover @rq_size
 * bytes, DMA-map them and create an mkey for them on protection domain @pdn.
 *
 * Returns 0 on success or the error of the failing step, with the earlier
 * steps rolled back.
 */
static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
					  struct mlx5_vhca_qp *qp, u32 pdn,
					  u64 rq_size)
{
	struct mlx5_vhca_recv_buf *rbuf = &qp->recv_buf;
	unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
	int ret;

	ret = alloc_recv_pages(rbuf, npages);
	if (ret < 0)
		return ret;

	ret = register_dma_recv_pages(mdev, rbuf);
	if (ret)
		goto free_pages;

	ret = _create_mkey(mdev, pdn, NULL, rbuf, &rbuf->mkey);
	if (!ret)
		return 0;

	unregister_dma_recv_pages(mdev, rbuf);
free_pages:
	free_recv_pages(rbuf);
	return ret;
}
1319
1320 static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device * mvdev)1321 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
1322 {
1323 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1324 struct mlx5_core_dev *mdev = mvdev->mdev;
1325
1326 lockdep_assert_held(&mvdev->state_mutex);
1327
1328 if (!mvdev->log_active)
1329 return;
1330
1331 WARN_ON(mvdev->mdev_detach);
1332
1333 mlx5_eq_notifier_unregister(mdev, &tracker->nb);
1334 mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
1335 mlx5vf_destroy_qp(mdev, tracker->fw_qp);
1336 mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
1337 mlx5vf_destroy_qp(mdev, tracker->host_qp);
1338 mlx5vf_destroy_cq(mdev, &tracker->cq);
1339 mlx5_core_dealloc_pd(mdev, tracker->pdn);
1340 mlx5_put_uars_page(mdev, tracker->uar);
1341 mvdev->log_active = false;
1342 }
1343
mlx5vf_stop_page_tracker(struct vfio_device * vdev)1344 int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
1345 {
1346 struct mlx5vf_pci_core_device *mvdev = container_of(
1347 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1348
1349 mutex_lock(&mvdev->state_mutex);
1350 if (!mvdev->log_active)
1351 goto end;
1352
1353 _mlx5vf_free_page_tracker_resources(mvdev);
1354 mvdev->log_active = false;
1355 end:
1356 mlx5vf_state_mutex_unlock(mvdev);
1357 return 0;
1358 }
1359
/*
 * Start dirty page tracking for @vdev.
 *
 * Under state_mutex this acquires, in order: a UAR page, a PD, the CQ, the
 * host (receiving) QP together with its receive buffer, and the firmware QP.
 * Both QPs are then activated against each other, the tracker object is
 * created for @ranges/@nnodes, and the EQ notifier is registered.
 *
 * @vdev:      VFIO device embedded in the mlx5vf core device.
 * @ranges:    address ranges handed to mlx5vf_create_tracker().
 * @nnodes:    number of nodes in @ranges.
 * @page_size: in: requested tracking granularity; out (on success): the
 *             granularity actually used after clamping its log2 to the
 *             device's pg_track_log_{min,max}_page_size capabilities.
 *
 * Returns 0 on success, -ENOTCONN if the device is detached, -EINVAL if
 * tracking is already active, or the error of the failing setup step, in
 * which case everything acquired so far is released.
 */
int mlx5vf_start_page_tracker(struct vfio_device *vdev,
			      struct rb_root_cached *ranges, u32 nnodes,
			      u64 *page_size)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	u8 log_tracked_page = ilog2(*page_size);
	struct mlx5_vhca_qp *host_qp;
	struct mlx5_vhca_qp *fw_qp;
	struct mlx5_core_dev *mdev;
	u32 log_max_msg_size;
	u32 max_msg_size;
	u64 rq_size = SZ_2M;
	u32 max_recv_wr;
	int err;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	if (mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	mdev = mvdev->mdev;
	log_max_msg_size = MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_msg_size);
	max_msg_size = (1ULL << log_max_msg_size);
	/*
	 * The RQ must hold at least 4 WQEs/messages for successful QP creation.
	 * The product is computed in 64 bits so that a large message size
	 * cannot wrap around in 32-bit arithmetic before it is compared with,
	 * and assigned to, the 64-bit rq_size.
	 */
	if (rq_size < 4ULL * max_msg_size)
		rq_size = 4ULL * max_msg_size;

	memset(tracker, 0, sizeof(*tracker));
	tracker->uar = mlx5_get_uars_page(mdev);
	if (IS_ERR(tracker->uar)) {
		err = PTR_ERR(tracker->uar);
		goto end;
	}

	err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
	if (err)
		goto err_uar;

	/* One CQE and one receive WQE per message that fits in the RQ. */
	max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
	err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
	if (err)
		goto err_dealloc_pd;

	host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
	if (IS_ERR(host_qp)) {
		err = PTR_ERR(host_qp);
		goto err_cq;
	}

	host_qp->max_msg_size = max_msg_size;
	/* Clamp the requested granularity to what the device supports. */
	if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size);
	} else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size);
	}

	host_qp->tracked_page_size = (1ULL << log_tracked_page);
	err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
					     rq_size);
	if (err)
		goto err_host_qp;

	/* The firmware-side QP is created with a zero length RQ. */
	fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
	if (IS_ERR(fw_qp)) {
		err = PTR_ERR(fw_qp);
		goto err_recv_resources;
	}

	err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
	if (err)
		goto err_activate;

	err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
	if (err)
		goto err_activate;

	tracker->host_qp = host_qp;
	tracker->fw_qp = fw_qp;
	err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
	if (err)
		goto err_activate;

	MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
	mlx5_eq_notifier_register(mdev, &tracker->nb);
	*page_size = host_qp->tracked_page_size;
	mvdev->log_active = true;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;

err_activate:
	mlx5vf_destroy_qp(mdev, fw_qp);
err_recv_resources:
	mlx5vf_free_qp_recv_resources(mdev, host_qp);
err_host_qp:
	mlx5vf_destroy_qp(mdev, host_qp);
err_cq:
	mlx5vf_destroy_cq(mdev, &tracker->cq);
err_dealloc_pd:
	mlx5_core_dealloc_pd(mdev, tracker->pdn);
err_uar:
	mlx5_put_uars_page(mdev, tracker->uar);
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}
1477
1478 static void
set_report_output(u32 size,int index,struct mlx5_vhca_qp * qp,struct iova_bitmap * dirty)1479 set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
1480 struct iova_bitmap *dirty)
1481 {
1482 u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
1483 u32 nent = size / entry_size;
1484 u32 nent_in_page;
1485 u32 nent_to_set;
1486 struct page *page;
1487 u32 page_offset;
1488 u32 page_index;
1489 u32 buf_offset;
1490 void *kaddr;
1491 u64 addr;
1492 u64 *buf;
1493 int i;
1494
1495 buf_offset = index * qp->max_msg_size;
1496 if (WARN_ON(buf_offset + size >= qp->recv_buf.npages * PAGE_SIZE ||
1497 (nent > qp->max_msg_size / entry_size)))
1498 return;
1499
1500 do {
1501 page_index = buf_offset / PAGE_SIZE;
1502 page_offset = buf_offset % PAGE_SIZE;
1503 nent_in_page = (PAGE_SIZE - page_offset) / entry_size;
1504 page = qp->recv_buf.page_list[page_index];
1505 kaddr = kmap_local_page(page);
1506 buf = kaddr + page_offset;
1507 nent_to_set = min(nent, nent_in_page);
1508 for (i = 0; i < nent_to_set; i++) {
1509 addr = MLX5_GET(page_track_report_entry, buf + i,
1510 dirty_address_low);
1511 addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
1512 dirty_address_high) << 32;
1513 iova_bitmap_set(dirty, addr, qp->tracked_page_size);
1514 }
1515 kunmap_local(kaddr);
1516 buf_offset += (nent_to_set * entry_size);
1517 nent -= nent_to_set;
1518 } while (nent);
1519 }
1520
1521 static void
mlx5vf_rq_cqe(struct mlx5_vhca_qp * qp,struct mlx5_cqe64 * cqe,struct iova_bitmap * dirty,int * tracker_status)1522 mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
1523 struct iova_bitmap *dirty, int *tracker_status)
1524 {
1525 u32 size;
1526 int ix;
1527
1528 qp->rq.cc++;
1529 *tracker_status = be32_to_cpu(cqe->immediate) >> 28;
1530 size = be32_to_cpu(cqe->byte_cnt);
1531 ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);
1532
1533 /* zero length CQE, no data */
1534 WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
1535 if (size)
1536 set_report_output(size, ix, qp, dirty);
1537
1538 qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
1539 mlx5vf_post_recv(qp);
1540 }
1541
get_cqe(struct mlx5_vhca_cq * cq,int n)1542 static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
1543 {
1544 return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
1545 }
1546
get_sw_cqe(struct mlx5_vhca_cq * cq,int n)1547 static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
1548 {
1549 void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
1550 struct mlx5_cqe64 *cqe64;
1551
1552 cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
1553
1554 if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
1555 !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
1556 return cqe64;
1557 } else {
1558 return NULL;
1559 }
1560 }
1561
1562 static int
mlx5vf_cq_poll_one(struct mlx5_vhca_cq * cq,struct mlx5_vhca_qp * qp,struct iova_bitmap * dirty,int * tracker_status)1563 mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
1564 struct iova_bitmap *dirty, int *tracker_status)
1565 {
1566 struct mlx5_cqe64 *cqe;
1567 u8 opcode;
1568
1569 cqe = get_sw_cqe(cq, cq->mcq.cons_index);
1570 if (!cqe)
1571 return CQ_EMPTY;
1572
1573 ++cq->mcq.cons_index;
1574 /*
1575 * Make sure we read CQ entry contents after we've checked the
1576 * ownership bit.
1577 */
1578 rmb();
1579 opcode = get_cqe_opcode(cqe);
1580 switch (opcode) {
1581 case MLX5_CQE_RESP_SEND_IMM:
1582 mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
1583 return CQ_OK;
1584 default:
1585 return CQ_POLL_ERR;
1586 }
1587 }
1588
mlx5vf_tracker_read_and_clear(struct vfio_device * vdev,unsigned long iova,unsigned long length,struct iova_bitmap * dirty)1589 int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
1590 unsigned long length,
1591 struct iova_bitmap *dirty)
1592 {
1593 struct mlx5vf_pci_core_device *mvdev = container_of(
1594 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1595 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1596 struct mlx5_vhca_cq *cq = &tracker->cq;
1597 struct mlx5_core_dev *mdev;
1598 int poll_err, err;
1599
1600 mutex_lock(&mvdev->state_mutex);
1601 if (!mvdev->log_active) {
1602 err = -EINVAL;
1603 goto end;
1604 }
1605
1606 if (mvdev->mdev_detach) {
1607 err = -ENOTCONN;
1608 goto end;
1609 }
1610
1611 mdev = mvdev->mdev;
1612 err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
1613 MLX5_PAGE_TRACK_STATE_REPORTING);
1614 if (err)
1615 goto end;
1616
1617 tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
1618 while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
1619 !tracker->is_err) {
1620 poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
1621 &tracker->status);
1622 if (poll_err == CQ_EMPTY) {
1623 mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
1624 cq->mcq.cons_index);
1625 poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
1626 dirty, &tracker->status);
1627 if (poll_err == CQ_EMPTY) {
1628 wait_for_completion(&mvdev->tracker_comp);
1629 continue;
1630 }
1631 }
1632 if (poll_err == CQ_POLL_ERR) {
1633 err = -EIO;
1634 goto end;
1635 }
1636 mlx5_cq_set_ci(&cq->mcq);
1637 }
1638
1639 if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
1640 tracker->is_err = true;
1641
1642 if (tracker->is_err)
1643 err = -EIO;
1644 end:
1645 mlx5vf_state_mutex_unlock(mvdev);
1646 return err;
1647 }
1648