xref: /openbmc/linux/drivers/vfio/pci/pds/dirty.c (revision 2984f26a)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright(c) 2023 Advanced Micro Devices, Inc. */
3 
4 #include <linux/interval_tree.h>
5 #include <linux/vfio.h>
6 
7 #include <linux/pds/pds_common.h>
8 #include <linux/pds/pds_core_if.h>
9 #include <linux/pds/pds_adminq.h>
10 
11 #include "vfio_dev.h"
12 #include "cmds.h"
13 #include "dirty.h"
14 
/* Values for the read_seq argument of pds_vfio_dirty_seq_ack() */
#define READ_SEQ	true
#define WRITE_ACK	false
17 
18 bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio)
19 {
20 	return pds_vfio->dirty.is_enabled;
21 }
22 
23 void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio)
24 {
25 	pds_vfio->dirty.is_enabled = true;
26 }
27 
28 void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio)
29 {
30 	pds_vfio->dirty.is_enabled = false;
31 }
32 
33 static void
34 pds_vfio_print_guest_region_info(struct pds_vfio_pci_device *pds_vfio,
35 				 u8 max_regions)
36 {
37 	int len = max_regions * sizeof(struct pds_lm_dirty_region_info);
38 	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
39 	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
40 	struct pds_lm_dirty_region_info *region_info;
41 	dma_addr_t regions_dma;
42 	u8 num_regions;
43 	int err;
44 
45 	region_info = kcalloc(max_regions,
46 			      sizeof(struct pds_lm_dirty_region_info),
47 			      GFP_KERNEL);
48 	if (!region_info)
49 		return;
50 
51 	regions_dma =
52 		dma_map_single(pdsc_dev, region_info, len, DMA_FROM_DEVICE);
53 	if (dma_mapping_error(pdsc_dev, regions_dma))
54 		goto out_free_region_info;
55 
56 	err = pds_vfio_dirty_status_cmd(pds_vfio, regions_dma, &max_regions,
57 					&num_regions);
58 	dma_unmap_single(pdsc_dev, regions_dma, len, DMA_FROM_DEVICE);
59 	if (err)
60 		goto out_free_region_info;
61 
62 	for (unsigned int i = 0; i < num_regions; i++)
63 		dev_dbg(&pdev->dev,
64 			"region_info[%d]: dma_base 0x%llx page_count %u page_size_log2 %u\n",
65 			i, le64_to_cpu(region_info[i].dma_base),
66 			le32_to_cpu(region_info[i].page_count),
67 			region_info[i].page_size_log2);
68 
69 out_free_region_info:
70 	kfree(region_info);
71 }
72 
73 static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_dirty *dirty,
74 					unsigned long bytes)
75 {
76 	unsigned long *host_seq_bmp, *host_ack_bmp;
77 
78 	host_seq_bmp = vzalloc(bytes);
79 	if (!host_seq_bmp)
80 		return -ENOMEM;
81 
82 	host_ack_bmp = vzalloc(bytes);
83 	if (!host_ack_bmp) {
84 		bitmap_free(host_seq_bmp);
85 		return -ENOMEM;
86 	}
87 
88 	dirty->host_seq.bmp = host_seq_bmp;
89 	dirty->host_ack.bmp = host_ack_bmp;
90 
91 	return 0;
92 }
93 
94 static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty)
95 {
96 	vfree(dirty->host_seq.bmp);
97 	vfree(dirty->host_ack.bmp);
98 	dirty->host_seq.bmp = NULL;
99 	dirty->host_ack.bmp = NULL;
100 }
101 
102 static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio,
103 				      struct pds_vfio_bmp_info *bmp_info)
104 {
105 	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
106 	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
107 
108 	dma_unmap_single(pdsc_dev, bmp_info->sgl_addr,
109 			 bmp_info->num_sge * sizeof(struct pds_lm_sg_elem),
110 			 DMA_BIDIRECTIONAL);
111 	kfree(bmp_info->sgl);
112 
113 	bmp_info->num_sge = 0;
114 	bmp_info->sgl = NULL;
115 	bmp_info->sgl_addr = 0;
116 }
117 
118 static void pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio)
119 {
120 	if (pds_vfio->dirty.host_seq.sgl)
121 		__pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_seq);
122 	if (pds_vfio->dirty.host_ack.sgl)
123 		__pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_ack);
124 }
125 
126 static int __pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
127 				      struct pds_vfio_bmp_info *bmp_info,
128 				      u32 page_count)
129 {
130 	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
131 	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
132 	struct pds_lm_sg_elem *sgl;
133 	dma_addr_t sgl_addr;
134 	size_t sgl_size;
135 	u32 max_sge;
136 
137 	max_sge = DIV_ROUND_UP(page_count, PAGE_SIZE * 8);
138 	sgl_size = max_sge * sizeof(struct pds_lm_sg_elem);
139 
140 	sgl = kzalloc(sgl_size, GFP_KERNEL);
141 	if (!sgl)
142 		return -ENOMEM;
143 
144 	sgl_addr = dma_map_single(pdsc_dev, sgl, sgl_size, DMA_BIDIRECTIONAL);
145 	if (dma_mapping_error(pdsc_dev, sgl_addr)) {
146 		kfree(sgl);
147 		return -EIO;
148 	}
149 
150 	bmp_info->sgl = sgl;
151 	bmp_info->num_sge = max_sge;
152 	bmp_info->sgl_addr = sgl_addr;
153 
154 	return 0;
155 }
156 
157 static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
158 				    u32 page_count)
159 {
160 	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
161 	int err;
162 
163 	err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_seq,
164 					 page_count);
165 	if (err)
166 		return err;
167 
168 	err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_ack,
169 					 page_count);
170 	if (err) {
171 		__pds_vfio_dirty_free_sgl(pds_vfio, &dirty->host_seq);
172 		return err;
173 	}
174 
175 	return 0;
176 }
177 
178 static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
179 				 struct rb_root_cached *ranges, u32 nnodes,
180 				 u64 *page_size)
181 {
182 	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
183 	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
184 	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
185 	u64 region_start, region_size, region_page_size;
186 	struct pds_lm_dirty_region_info *region_info;
187 	struct interval_tree_node *node = NULL;
188 	u8 max_regions = 0, num_regions;
189 	dma_addr_t regions_dma = 0;
190 	u32 num_ranges = nnodes;
191 	u32 page_count;
192 	u16 len;
193 	int err;
194 
195 	dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n",
196 		pds_vfio->vf_id);
197 
198 	if (pds_vfio_dirty_is_enabled(pds_vfio))
199 		return -EINVAL;
200 
201 	/* find if dirty tracking is disabled, i.e. num_regions == 0 */
202 	err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions,
203 					&num_regions);
204 	if (err < 0) {
205 		dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n",
206 			ERR_PTR(err));
207 		return err;
208 	} else if (num_regions) {
209 		dev_err(&pdev->dev,
210 			"Dirty tracking already enabled for %d regions\n",
211 			num_regions);
212 		return -EEXIST;
213 	} else if (!max_regions) {
214 		dev_err(&pdev->dev,
215 			"Device doesn't support dirty tracking, max_regions %d\n",
216 			max_regions);
217 		return -EOPNOTSUPP;
218 	}
219 
220 	/*
221 	 * Only support 1 region for now. If there are any large gaps in the
222 	 * VM's address regions, then this would be a waste of memory as we are
223 	 * generating 2 bitmaps (ack/seq) from the min address to the max
224 	 * address of the VM's address regions. In the future, if we support
225 	 * more than one region in the device/driver we can split the bitmaps
226 	 * on the largest address region gaps. We can do this split up to the
227 	 * max_regions times returned from the dirty_status command.
228 	 */
229 	max_regions = 1;
230 	if (num_ranges > max_regions) {
231 		vfio_combine_iova_ranges(ranges, nnodes, max_regions);
232 		num_ranges = max_regions;
233 	}
234 
235 	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
236 	if (!node)
237 		return -EINVAL;
238 
239 	region_size = node->last - node->start + 1;
240 	region_start = node->start;
241 	region_page_size = *page_size;
242 
243 	len = sizeof(*region_info);
244 	region_info = kzalloc(len, GFP_KERNEL);
245 	if (!region_info)
246 		return -ENOMEM;
247 
248 	page_count = DIV_ROUND_UP(region_size, region_page_size);
249 
250 	region_info->dma_base = cpu_to_le64(region_start);
251 	region_info->page_count = cpu_to_le32(page_count);
252 	region_info->page_size_log2 = ilog2(region_page_size);
253 
254 	regions_dma = dma_map_single(pdsc_dev, (void *)region_info, len,
255 				     DMA_BIDIRECTIONAL);
256 	if (dma_mapping_error(pdsc_dev, regions_dma)) {
257 		err = -ENOMEM;
258 		goto out_free_region_info;
259 	}
260 
261 	err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, max_regions);
262 	dma_unmap_single(pdsc_dev, regions_dma, len, DMA_BIDIRECTIONAL);
263 	if (err)
264 		goto out_free_region_info;
265 
266 	/*
267 	 * page_count might be adjusted by the device,
268 	 * update it before freeing region_info DMA
269 	 */
270 	page_count = le32_to_cpu(region_info->page_count);
271 
272 	dev_dbg(&pdev->dev,
273 		"region_info: regions_dma 0x%llx dma_base 0x%llx page_count %u page_size_log2 %u\n",
274 		regions_dma, region_start, page_count,
275 		(u8)ilog2(region_page_size));
276 
277 	err = pds_vfio_dirty_alloc_bitmaps(dirty, page_count / BITS_PER_BYTE);
278 	if (err) {
279 		dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n",
280 			ERR_PTR(err));
281 		goto out_free_region_info;
282 	}
283 
284 	err = pds_vfio_dirty_alloc_sgl(pds_vfio, page_count);
285 	if (err) {
286 		dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n",
287 			ERR_PTR(err));
288 		goto out_free_bitmaps;
289 	}
290 
291 	dirty->region_start = region_start;
292 	dirty->region_size = region_size;
293 	dirty->region_page_size = region_page_size;
294 	pds_vfio_dirty_set_enabled(pds_vfio);
295 
296 	pds_vfio_print_guest_region_info(pds_vfio, max_regions);
297 
298 	kfree(region_info);
299 
300 	return 0;
301 
302 out_free_bitmaps:
303 	pds_vfio_dirty_free_bitmaps(dirty);
304 out_free_region_info:
305 	kfree(region_info);
306 	return err;
307 }
308 
309 void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd)
310 {
311 	if (pds_vfio_dirty_is_enabled(pds_vfio)) {
312 		pds_vfio_dirty_set_disabled(pds_vfio);
313 		if (send_cmd)
314 			pds_vfio_dirty_disable_cmd(pds_vfio);
315 		pds_vfio_dirty_free_sgl(pds_vfio);
316 		pds_vfio_dirty_free_bitmaps(&pds_vfio->dirty);
317 	}
318 
319 	if (send_cmd)
320 		pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE);
321 }
322 
323 static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio,
324 				  struct pds_vfio_bmp_info *bmp_info,
325 				  u32 offset, u32 bmp_bytes, bool read_seq)
326 {
327 	const char *bmp_type_str = read_seq ? "read_seq" : "write_ack";
328 	u8 dma_dir = read_seq ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
329 	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
330 	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
331 	unsigned long long npages;
332 	struct sg_table sg_table;
333 	struct scatterlist *sg;
334 	struct page **pages;
335 	u32 page_offset;
336 	const void *bmp;
337 	size_t size;
338 	u16 num_sge;
339 	int err;
340 	int i;
341 
342 	bmp = (void *)((u64)bmp_info->bmp + offset);
343 	page_offset = offset_in_page(bmp);
344 	bmp -= page_offset;
345 
346 	/*
347 	 * Start and end of bitmap section to seq/ack might not be page
348 	 * aligned, so use the page_offset to account for that so there
349 	 * will be enough pages to represent the bmp_bytes
350 	 */
351 	npages = DIV_ROUND_UP_ULL(bmp_bytes + page_offset, PAGE_SIZE);
352 	pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
353 	if (!pages)
354 		return -ENOMEM;
355 
356 	for (unsigned long long i = 0; i < npages; i++) {
357 		struct page *page = vmalloc_to_page(bmp);
358 
359 		if (!page) {
360 			err = -EFAULT;
361 			goto out_free_pages;
362 		}
363 
364 		pages[i] = page;
365 		bmp += PAGE_SIZE;
366 	}
367 
368 	err = sg_alloc_table_from_pages(&sg_table, pages, npages, page_offset,
369 					bmp_bytes, GFP_KERNEL);
370 	if (err)
371 		goto out_free_pages;
372 
373 	err = dma_map_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
374 	if (err)
375 		goto out_free_sg_table;
376 
377 	for_each_sgtable_dma_sg(&sg_table, sg, i) {
378 		struct pds_lm_sg_elem *sg_elem = &bmp_info->sgl[i];
379 
380 		sg_elem->addr = cpu_to_le64(sg_dma_address(sg));
381 		sg_elem->len = cpu_to_le32(sg_dma_len(sg));
382 	}
383 
384 	num_sge = sg_table.nents;
385 	size = num_sge * sizeof(struct pds_lm_sg_elem);
386 	dma_sync_single_for_device(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);
387 	err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, bmp_info->sgl_addr, num_sge,
388 					 offset, bmp_bytes, read_seq);
389 	if (err)
390 		dev_err(&pdev->dev,
391 			"Dirty bitmap %s failed offset %u bmp_bytes %u num_sge %u DMA 0x%llx: %pe\n",
392 			bmp_type_str, offset, bmp_bytes,
393 			num_sge, bmp_info->sgl_addr, ERR_PTR(err));
394 	dma_sync_single_for_cpu(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);
395 
396 	dma_unmap_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
397 out_free_sg_table:
398 	sg_free_table(&sg_table);
399 out_free_pages:
400 	kfree(pages);
401 
402 	return err;
403 }
404 
405 static int pds_vfio_dirty_write_ack(struct pds_vfio_pci_device *pds_vfio,
406 				    u32 offset, u32 len)
407 {
408 	return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_ack,
409 				      offset, len, WRITE_ACK);
410 }
411 
412 static int pds_vfio_dirty_read_seq(struct pds_vfio_pci_device *pds_vfio,
413 				   u32 offset, u32 len)
414 {
415 	return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_seq,
416 				      offset, len, READ_SEQ);
417 }
418 
419 static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio,
420 					  struct iova_bitmap *dirty_bitmap,
421 					  u32 bmp_offset, u32 len_bytes)
422 {
423 	u64 page_size = pds_vfio->dirty.region_page_size;
424 	u64 region_start = pds_vfio->dirty.region_start;
425 	u32 bmp_offset_bit;
426 	__le64 *seq, *ack;
427 	int dword_count;
428 
429 	dword_count = len_bytes / sizeof(u64);
430 	seq = (__le64 *)((u64)pds_vfio->dirty.host_seq.bmp + bmp_offset);
431 	ack = (__le64 *)((u64)pds_vfio->dirty.host_ack.bmp + bmp_offset);
432 	bmp_offset_bit = bmp_offset * 8;
433 
434 	for (int i = 0; i < dword_count; i++) {
435 		u64 xor = le64_to_cpu(seq[i]) ^ le64_to_cpu(ack[i]);
436 
437 		/* prepare for next write_ack call */
438 		ack[i] = seq[i];
439 
440 		for (u8 bit_i = 0; bit_i < BITS_PER_TYPE(u64); ++bit_i) {
441 			if (xor & BIT(bit_i)) {
442 				u64 abs_bit_i = bmp_offset_bit +
443 						i * BITS_PER_TYPE(u64) + bit_i;
444 				u64 addr = abs_bit_i * page_size + region_start;
445 
446 				iova_bitmap_set(dirty_bitmap, addr, page_size);
447 			}
448 		}
449 	}
450 
451 	return 0;
452 }
453 
454 static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio,
455 			       struct iova_bitmap *dirty_bitmap,
456 			       unsigned long iova, unsigned long length)
457 {
458 	struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
459 	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
460 	u64 bmp_offset, bmp_bytes;
461 	u64 bitmap_size, pages;
462 	int err;
463 
464 	dev_dbg(dev, "vf%u: Get dirty page bitmap\n", pds_vfio->vf_id);
465 
466 	if (!pds_vfio_dirty_is_enabled(pds_vfio)) {
467 		dev_err(dev, "vf%u: Sync failed, dirty tracking is disabled\n",
468 			pds_vfio->vf_id);
469 		return -EINVAL;
470 	}
471 
472 	pages = DIV_ROUND_UP(length, pds_vfio->dirty.region_page_size);
473 	bitmap_size =
474 		round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE;
475 
476 	dev_dbg(dev,
477 		"vf%u: iova 0x%lx length %lu page_size %llu pages %llu bitmap_size %llu\n",
478 		pds_vfio->vf_id, iova, length, pds_vfio->dirty.region_page_size,
479 		pages, bitmap_size);
480 
481 	if (!length || ((iova - dirty->region_start + length) > dirty->region_size)) {
482 		dev_err(dev, "Invalid iova 0x%lx and/or length 0x%lx to sync\n",
483 			iova, length);
484 		return -EINVAL;
485 	}
486 
487 	/* bitmap is modified in 64 bit chunks */
488 	bmp_bytes = ALIGN(DIV_ROUND_UP(length / dirty->region_page_size,
489 				       sizeof(u64)),
490 			  sizeof(u64));
491 	if (bmp_bytes != bitmap_size) {
492 		dev_err(dev,
493 			"Calculated bitmap bytes %llu not equal to bitmap size %llu\n",
494 			bmp_bytes, bitmap_size);
495 		return -EINVAL;
496 	}
497 
498 	bmp_offset = DIV_ROUND_UP((iova - dirty->region_start) /
499 				  dirty->region_page_size, sizeof(u64));
500 
501 	dev_dbg(dev,
502 		"Syncing dirty bitmap, iova 0x%lx length 0x%lx, bmp_offset %llu bmp_bytes %llu\n",
503 		iova, length, bmp_offset, bmp_bytes);
504 
505 	err = pds_vfio_dirty_read_seq(pds_vfio, bmp_offset, bmp_bytes);
506 	if (err)
507 		return err;
508 
509 	err = pds_vfio_dirty_process_bitmaps(pds_vfio, dirty_bitmap, bmp_offset,
510 					     bmp_bytes);
511 	if (err)
512 		return err;
513 
514 	err = pds_vfio_dirty_write_ack(pds_vfio, bmp_offset, bmp_bytes);
515 	if (err)
516 		return err;
517 
518 	return 0;
519 }
520 
521 int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
522 				unsigned long length, struct iova_bitmap *dirty)
523 {
524 	struct pds_vfio_pci_device *pds_vfio =
525 		container_of(vdev, struct pds_vfio_pci_device,
526 			     vfio_coredev.vdev);
527 	int err;
528 
529 	mutex_lock(&pds_vfio->state_mutex);
530 	err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length);
531 	pds_vfio_state_mutex_unlock(pds_vfio);
532 
533 	return err;
534 }
535 
536 int pds_vfio_dma_logging_start(struct vfio_device *vdev,
537 			       struct rb_root_cached *ranges, u32 nnodes,
538 			       u64 *page_size)
539 {
540 	struct pds_vfio_pci_device *pds_vfio =
541 		container_of(vdev, struct pds_vfio_pci_device,
542 			     vfio_coredev.vdev);
543 	int err;
544 
545 	mutex_lock(&pds_vfio->state_mutex);
546 	pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS);
547 	err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size);
548 	pds_vfio_state_mutex_unlock(pds_vfio);
549 
550 	return err;
551 }
552 
553 int pds_vfio_dma_logging_stop(struct vfio_device *vdev)
554 {
555 	struct pds_vfio_pci_device *pds_vfio =
556 		container_of(vdev, struct pds_vfio_pci_device,
557 			     vfio_coredev.vdev);
558 
559 	mutex_lock(&pds_vfio->state_mutex);
560 	pds_vfio_dirty_disable(pds_vfio, true);
561 	pds_vfio_state_mutex_unlock(pds_vfio);
562 
563 	return 0;
564 }
565