// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2023 Advanced Micro Devices, Inc. */

#include <linux/interval_tree.h>
#include <linux/vfio.h>

#include <linux/pds/pds_common.h>
#include <linux/pds/pds_core_if.h>
#include <linux/pds/pds_adminq.h>

#include "vfio_dev.h"
#include "cmds.h"
#include "dirty.h"

#define READ_SEQ true
#define WRITE_ACK false

bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio)
{
	return pds_vfio->dirty.is_enabled;
}

void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio)
{
	pds_vfio->dirty.is_enabled = true;
}

void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio)
{
	pds_vfio->dirty.is_enabled = false;
}

static void
pds_vfio_print_guest_region_info(struct pds_vfio_pci_device *pds_vfio,
				 u8 max_regions)
{
	int len = max_regions * sizeof(struct pds_lm_dirty_region_info);
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
	struct pds_lm_dirty_region_info *region_info;
	dma_addr_t regions_dma;
	u8 num_regions;
	int err;

	region_info = kcalloc(max_regions,
			      sizeof(struct pds_lm_dirty_region_info),
			      GFP_KERNEL);
	if (!region_info)
		return;

	regions_dma =
		dma_map_single(pdsc_dev, region_info, len, DMA_FROM_DEVICE);
	if (dma_mapping_error(pdsc_dev, regions_dma))
		goto out_free_region_info;

	err = pds_vfio_dirty_status_cmd(pds_vfio, regions_dma, &max_regions,
					&num_regions);
	dma_unmap_single(pdsc_dev, regions_dma, len, DMA_FROM_DEVICE);
	if (err)
		goto out_free_region_info;

	for (unsigned int i = 0; i < num_regions; i++)
		dev_dbg(&pdev->dev,
			"region_info[%d]: dma_base 0x%llx page_count %u page_size_log2 %u\n",
			i, le64_to_cpu(region_info[i].dma_base),
			le32_to_cpu(region_info[i].page_count),
			region_info[i].page_size_log2);

out_free_region_info:
	kfree(region_info);
}

static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_dirty *dirty,
					unsigned long bytes)
{
	unsigned long *host_seq_bmp, *host_ack_bmp;

	host_seq_bmp = vzalloc(bytes);
	if (!host_seq_bmp)
		return -ENOMEM;

	host_ack_bmp = vzalloc(bytes);
	if (!host_ack_bmp) {
		vfree(host_seq_bmp);
		return -ENOMEM;
	}

	dirty->host_seq.bmp = host_seq_bmp;
	dirty->host_ack.bmp = host_ack_bmp;

	return 0;
}

static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty)
{
	vfree(dirty->host_seq.bmp);
	vfree(dirty->host_ack.bmp);
	dirty->host_seq.bmp = NULL;
	dirty->host_ack.bmp = NULL;
}

static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio,
				      struct pds_vfio_bmp_info *bmp_info)
{
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;

	dma_unmap_single(pdsc_dev, bmp_info->sgl_addr,
			 bmp_info->num_sge * sizeof(struct pds_lm_sg_elem),
			 DMA_BIDIRECTIONAL);
	kfree(bmp_info->sgl);

	bmp_info->num_sge = 0;
	bmp_info->sgl = NULL;
	bmp_info->sgl_addr = 0;
}

static void pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio)
{
	if (pds_vfio->dirty.host_seq.sgl)
		__pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_seq);
	if (pds_vfio->dirty.host_ack.sgl)
		__pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_ack);
}

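/*
 * The device-visible SG list has (at most) one element per page of the host
 * bitmap: one bitmap page (PAGE_SIZE bytes, i.e. PAGE_SIZE * 8 bits) covers
 * PAGE_SIZE * 8 tracked pages, hence max_sge below is
 * DIV_ROUND_UP(page_count, PAGE_SIZE * 8).
 */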
static int __pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
				      struct pds_vfio_bmp_info *bmp_info,
				      u32 page_count)
{
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
	struct pds_lm_sg_elem *sgl;
	dma_addr_t sgl_addr;
	size_t sgl_size;
	u32 max_sge;

	max_sge = DIV_ROUND_UP(page_count, PAGE_SIZE * 8);
	sgl_size = max_sge * sizeof(struct pds_lm_sg_elem);

	sgl = kzalloc(sgl_size, GFP_KERNEL);
	if (!sgl)
		return -ENOMEM;

	sgl_addr = dma_map_single(pdsc_dev, sgl, sgl_size, DMA_BIDIRECTIONAL);
	if (dma_mapping_error(pdsc_dev, sgl_addr)) {
		kfree(sgl);
		return -EIO;
	}

	bmp_info->sgl = sgl;
	bmp_info->num_sge = max_sge;
	bmp_info->sgl_addr = sgl_addr;

	return 0;
}

static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
				    u32 page_count)
{
	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
	int err;

	err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_seq,
					 page_count);
	if (err)
		return err;

	err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_ack,
					 page_count);
	if (err) {
		__pds_vfio_dirty_free_sgl(pds_vfio, &dirty->host_seq);
		return err;
	}

	return 0;
}

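/*
 * Enabling dirty tracking: check that the device reports tracking as
 * currently off (num_regions == 0) and supported (max_regions != 0),
 * collapse the requested IOVA ranges into a single region, program that
 * region into the device, then size the host seq/ack bitmaps and their SG
 * lists from the (possibly device-adjusted) page count.
 */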
static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
				 struct rb_root_cached *ranges, u32 nnodes,
				 u64 *page_size)
{
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
	u64 region_start, region_size, region_page_size;
	struct pds_lm_dirty_region_info *region_info;
	struct interval_tree_node *node = NULL;
	u8 max_regions = 0, num_regions;
	dma_addr_t regions_dma = 0;
	u32 num_ranges = nnodes;
	u32 page_count;
	u16 len;
	int err;

	dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n",
		pds_vfio->vf_id);

	if (pds_vfio_dirty_is_enabled(pds_vfio))
		return -EINVAL;

	/* find if dirty tracking is disabled, i.e. num_regions == 0 */
	err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions,
					&num_regions);
	if (err < 0) {
		dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n",
			ERR_PTR(err));
		return err;
	} else if (num_regions) {
		dev_err(&pdev->dev,
			"Dirty tracking already enabled for %d regions\n",
			num_regions);
		return -EEXIST;
	} else if (!max_regions) {
		dev_err(&pdev->dev,
			"Device doesn't support dirty tracking, max_regions %d\n",
			max_regions);
		return -EOPNOTSUPP;
	}

	/*
	 * Only support 1 region for now. If there are any large gaps in the
	 * VM's address regions, then this would be a waste of memory as we are
	 * generating 2 bitmaps (ack/seq) from the min address to the max
	 * address of the VM's address regions. In the future, if we support
	 * more than one region in the device/driver we can split the bitmaps
	 * on the largest address region gaps. We can do this split up to the
	 * max_regions times returned from the dirty_status command.
	 */
	max_regions = 1;
	if (num_ranges > max_regions) {
		vfio_combine_iova_ranges(ranges, nnodes, max_regions);
		num_ranges = max_regions;
	}

	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
	if (!node)
		return -EINVAL;

	region_size = node->last - node->start + 1;
	region_start = node->start;
	region_page_size = *page_size;

	len = sizeof(*region_info);
	region_info = kzalloc(len, GFP_KERNEL);
	if (!region_info)
		return -ENOMEM;

	page_count = DIV_ROUND_UP(region_size, region_page_size);

	region_info->dma_base = cpu_to_le64(region_start);
	region_info->page_count = cpu_to_le32(page_count);
	region_info->page_size_log2 = ilog2(region_page_size);

	regions_dma = dma_map_single(pdsc_dev, (void *)region_info, len,
				     DMA_BIDIRECTIONAL);
	if (dma_mapping_error(pdsc_dev, regions_dma)) {
		err = -ENOMEM;
		goto out_free_region_info;
	}

	err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, max_regions);
	dma_unmap_single(pdsc_dev, regions_dma, len, DMA_BIDIRECTIONAL);
	if (err)
		goto out_free_region_info;

	/*
	 * page_count might be adjusted by the device,
	 * update it before freeing region_info DMA
	 */
	page_count = le32_to_cpu(region_info->page_count);

	dev_dbg(&pdev->dev,
		"region_info: regions_dma 0x%llx dma_base 0x%llx page_count %u page_size_log2 %u\n",
		regions_dma, region_start, page_count,
		(u8)ilog2(region_page_size));

	err = pds_vfio_dirty_alloc_bitmaps(dirty, page_count / BITS_PER_BYTE);
	if (err) {
		dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n",
			ERR_PTR(err));
		goto out_free_region_info;
	}

	err = pds_vfio_dirty_alloc_sgl(pds_vfio, page_count);
	if (err) {
		dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n",
			ERR_PTR(err));
		goto out_free_bitmaps;
	}

	dirty->region_start = region_start;
	dirty->region_size = region_size;
	dirty->region_page_size = region_page_size;
	pds_vfio_dirty_set_enabled(pds_vfio);

	pds_vfio_print_guest_region_info(pds_vfio, max_regions);

	kfree(region_info);

	return 0;

out_free_bitmaps:
	pds_vfio_dirty_free_bitmaps(dirty);
out_free_region_info:
	kfree(region_info);
	return err;
}

void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd)
{
	if (pds_vfio_dirty_is_enabled(pds_vfio)) {
		pds_vfio_dirty_set_disabled(pds_vfio);
		if (send_cmd)
			pds_vfio_dirty_disable_cmd(pds_vfio);
		pds_vfio_dirty_free_sgl(pds_vfio);
		pds_vfio_dirty_free_bitmaps(&pds_vfio->dirty);
	}

	if (send_cmd)
		pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE);
}

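/*
 * Transfer one section of a host bitmap to/from the device: pin the
 * vmalloc'd bitmap pages backing [offset, offset + bmp_bytes), build a
 * scatter-gather table over them, copy the DMA addresses into the
 * pre-mapped device SGL, and issue the seq/ack command. read_seq pulls
 * the device's sequence bitmap; write_ack pushes the host's ack bitmap.
 */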
static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio,
				  struct pds_vfio_bmp_info *bmp_info,
				  u32 offset, u32 bmp_bytes, bool read_seq)
{
	const char *bmp_type_str = read_seq ? "read_seq" : "write_ack";
	u8 dma_dir = read_seq ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
	unsigned long long npages;
	struct sg_table sg_table;
	struct scatterlist *sg;
	struct page **pages;
	u32 page_offset;
	const void *bmp;
	size_t size;
	u16 num_sge;
	int err;
	int i;

	bmp = (void *)((u64)bmp_info->bmp + offset);
	page_offset = offset_in_page(bmp);
	bmp -= page_offset;

	/*
	 * Start and end of bitmap section to seq/ack might not be page
	 * aligned, so use the page_offset to account for that so there
	 * will be enough pages to represent the bmp_bytes
	 */
	npages = DIV_ROUND_UP_ULL(bmp_bytes + page_offset, PAGE_SIZE);
	pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	for (unsigned long long i = 0; i < npages; i++) {
		struct page *page = vmalloc_to_page(bmp);

		if (!page) {
			err = -EFAULT;
			goto out_free_pages;
		}

		pages[i] = page;
		bmp += PAGE_SIZE;
	}

	err = sg_alloc_table_from_pages(&sg_table, pages, npages, page_offset,
					bmp_bytes, GFP_KERNEL);
	if (err)
		goto out_free_pages;

	err = dma_map_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
	if (err)
		goto out_free_sg_table;

	for_each_sgtable_dma_sg(&sg_table, sg, i) {
		struct pds_lm_sg_elem *sg_elem = &bmp_info->sgl[i];

		sg_elem->addr = cpu_to_le64(sg_dma_address(sg));
		sg_elem->len = cpu_to_le32(sg_dma_len(sg));
	}

	num_sge = sg_table.nents;
	size = num_sge * sizeof(struct pds_lm_sg_elem);
	dma_sync_single_for_device(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);
	err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, bmp_info->sgl_addr, num_sge,
					 offset, bmp_bytes, read_seq);
	if (err)
		dev_err(&pdev->dev,
			"Dirty bitmap %s failed offset %u bmp_bytes %u num_sge %u DMA 0x%llx: %pe\n",
			bmp_type_str, offset, bmp_bytes,
			num_sge, bmp_info->sgl_addr, ERR_PTR(err));
	dma_sync_single_for_cpu(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);

	dma_unmap_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
out_free_sg_table:
	sg_free_table(&sg_table);
out_free_pages:
	kfree(pages);

	return err;
}

static int pds_vfio_dirty_write_ack(struct pds_vfio_pci_device *pds_vfio,
				    u32 offset, u32 len)
{
	return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_ack,
				      offset, len, WRITE_ACK);
}

static int pds_vfio_dirty_read_seq(struct pds_vfio_pci_device *pds_vfio,
				   u32 offset, u32 len)
{
	return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_seq,
				      offset, len, READ_SEQ);
}

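/*
 * Pages whose seq and ack bits differ have been written since the last
 * sync: XOR each 64-bit chunk of the two host bitmaps, report every set
 * bit to the iova_bitmap, and copy seq into ack so the next write_ack
 * acknowledges what was just reported.
 */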
static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio,
					  struct iova_bitmap *dirty_bitmap,
					  u32 bmp_offset, u32 len_bytes)
{
	u64 page_size = pds_vfio->dirty.region_page_size;
	u64 region_start = pds_vfio->dirty.region_start;
	u32 bmp_offset_bit;
	__le64 *seq, *ack;
	int dword_count;

	dword_count = len_bytes / sizeof(u64);
	seq = (__le64 *)((u64)pds_vfio->dirty.host_seq.bmp + bmp_offset);
	ack = (__le64 *)((u64)pds_vfio->dirty.host_ack.bmp + bmp_offset);
	bmp_offset_bit = bmp_offset * 8;

	for (int i = 0; i < dword_count; i++) {
		u64 xor = le64_to_cpu(seq[i]) ^ le64_to_cpu(ack[i]);

		/* prepare for next write_ack call */
		ack[i] = seq[i];

		for (u8 bit_i = 0; bit_i < BITS_PER_TYPE(u64); ++bit_i) {
			if (xor & BIT(bit_i)) {
				u64 abs_bit_i = bmp_offset_bit +
						i * BITS_PER_TYPE(u64) + bit_i;
				u64 addr = abs_bit_i * page_size + region_start;

				iova_bitmap_set(dirty_bitmap, addr, page_size);
			}
		}
	}

	return 0;
}

static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio,
			       struct iova_bitmap *dirty_bitmap,
			       unsigned long iova, unsigned long length)
{
	struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
	u64 bmp_offset, bmp_bytes;
	u64 bitmap_size, pages;
	int err;

	dev_dbg(dev, "vf%u: Get dirty page bitmap\n", pds_vfio->vf_id);

	if (!pds_vfio_dirty_is_enabled(pds_vfio)) {
		dev_err(dev, "vf%u: Sync failed, dirty tracking is disabled\n",
			pds_vfio->vf_id);
		return -EINVAL;
	}

	pages = DIV_ROUND_UP(length, pds_vfio->dirty.region_page_size);
	bitmap_size =
		round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE;

	dev_dbg(dev,
		"vf%u: iova 0x%lx length %lu page_size %llu pages %llu bitmap_size %llu\n",
		pds_vfio->vf_id, iova, length, pds_vfio->dirty.region_page_size,
		pages, bitmap_size);

	if (!length || ((dirty->region_start + iova + length) >
			(dirty->region_start + dirty->region_size))) {
		dev_err(dev, "Invalid iova 0x%lx and/or length 0x%lx to sync\n",
			iova, length);
		return -EINVAL;
	}

	/* bitmap is modified in 64 bit chunks */
	bmp_bytes = ALIGN(DIV_ROUND_UP(length / dirty->region_page_size,
				       sizeof(u64)),
			  sizeof(u64));
	if (bmp_bytes != bitmap_size) {
		dev_err(dev,
			"Calculated bitmap bytes %llu not equal to bitmap size %llu\n",
			bmp_bytes, bitmap_size);
		return -EINVAL;
	}

	bmp_offset = DIV_ROUND_UP(iova / dirty->region_page_size, sizeof(u64));

	dev_dbg(dev,
		"Syncing dirty bitmap, iova 0x%lx length 0x%lx, bmp_offset %llu bmp_bytes %llu\n",
		iova, length, bmp_offset, bmp_bytes);

	err = pds_vfio_dirty_read_seq(pds_vfio, bmp_offset, bmp_bytes);
	if (err)
		return err;

	err = pds_vfio_dirty_process_bitmaps(pds_vfio, dirty_bitmap, bmp_offset,
					     bmp_bytes);
	if (err)
		return err;

	err = pds_vfio_dirty_write_ack(pds_vfio, bmp_offset, bmp_bytes);
	if (err)
		return err;

	return 0;
}

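/*
 * VFIO DMA logging entry points (start/stop/report); each serializes on the
 * device state mutex and releases it via pds_vfio_state_mutex_unlock().
 */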
int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
				unsigned long length, struct iova_bitmap *dirty)
{
	struct pds_vfio_pci_device *pds_vfio =
		container_of(vdev, struct pds_vfio_pci_device,
			     vfio_coredev.vdev);
	int err;

	mutex_lock(&pds_vfio->state_mutex);
	err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length);
	pds_vfio_state_mutex_unlock(pds_vfio);

	return err;
}

int pds_vfio_dma_logging_start(struct vfio_device *vdev,
			       struct rb_root_cached *ranges, u32 nnodes,
			       u64 *page_size)
{
	struct pds_vfio_pci_device *pds_vfio =
		container_of(vdev, struct pds_vfio_pci_device,
			     vfio_coredev.vdev);
	int err;

	mutex_lock(&pds_vfio->state_mutex);
	pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS);
	err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size);
	pds_vfio_state_mutex_unlock(pds_vfio);

	return err;
}

int pds_vfio_dma_logging_stop(struct vfio_device *vdev)
{
	struct pds_vfio_pci_device *pds_vfio =
		container_of(vdev, struct pds_vfio_pci_device,
			     vfio_coredev.vdev);

	mutex_lock(&pds_vfio->state_mutex);
	pds_vfio_dirty_disable(pds_vfio, true);
	pds_vfio_state_mutex_unlock(pds_vfio);

	return 0;
}