/*
 * Virtio MEM device
 *
 * Copyright (C) 2020 Red Hat, Inc.
 *
 * Authors:
 *  David Hildenbrand <david@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "qemu/iov.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/units.h"
#include "sysemu/numa.h"
#include "sysemu/sysemu.h"
#include "sysemu/reset.h"
#include "hw/virtio/virtio.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-access.h"
#include "hw/virtio/virtio-mem.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include "exec/ram_addr.h"
#include "migration/misc.h"
#include "hw/boards.h"
#include "hw/qdev-properties.h"
#include CONFIG_DEVICES
#include "trace.h"

/*
 * Let's not allow blocks smaller than 1 MiB, for example, to keep the tracking
 * bitmap small.
 */
#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB))

#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || \
    defined(__powerpc64__)
#define VIRTIO_MEM_DEFAULT_THP_SIZE ((uint32_t)(2 * MiB))
#else
/* fallback to 1 MiB (e.g., the THP size on s390x) */
#define VIRTIO_MEM_DEFAULT_THP_SIZE VIRTIO_MEM_MIN_BLOCK_SIZE
#endif

/*
 * We want to have a reasonable default block size such that
 * 1. We avoid splitting THPs when unplugging memory, which degrades
 *    performance.
 * 2. We avoid placing THPs for plugged blocks that also cover unplugged
 *    blocks.
 *
 * The actual THP size might differ between Linux kernels, so we try to probe
 * it. In the future (if we ever run into issues regarding 2.), we might want
 * to disable THP in case we fail to properly probe the THP size, or if the
 * block size is configured smaller than the THP size.
 */
static uint32_t thp_size;

#define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
static uint32_t virtio_mem_thp_size(void)
{
    gchar *content = NULL;
    const char *endptr;
    uint64_t tmp;

    if (thp_size) {
        return thp_size;
    }

    /*
     * Try to probe the actual THP size, fallback to (sane but eventually
     * incorrect) default sizes.
     */
    if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
        !qemu_strtou64(content, &endptr, 0, &tmp) &&
        (!endptr || *endptr == '\n')) {
        /*
         * Sanity-check the value, if it's too big (e.g., aarch64 with 64k base
         * pages) or weird, fallback to something smaller.
         */
        if (!tmp || !is_power_of_2(tmp) || tmp > 16 * MiB) {
            warn_report("Read unsupported THP size: %" PRIx64, tmp);
        } else {
            thp_size = tmp;
        }
    }

    if (!thp_size) {
        thp_size = VIRTIO_MEM_DEFAULT_THP_SIZE;
        warn_report("Could not detect THP size, falling back to %" PRIx64
                    " MiB.", thp_size / MiB);
    }

    g_free(content);
    return thp_size;
}

static uint64_t virtio_mem_default_block_size(RAMBlock *rb)
{
    const uint64_t page_size = qemu_ram_pagesize(rb);

    /* We can have hugetlbfs with a page size smaller than the THP size. */
    if (page_size == qemu_real_host_page_size) {
        return MAX(page_size, virtio_mem_thp_size());
    }
    return MAX(page_size, VIRTIO_MEM_MIN_BLOCK_SIZE);
}
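
/*
 * Worked example for the default block size (illustrative values, assuming a
 * typical x86-64 host with 4 KiB base pages and a probed THP size of 2 MiB):
 * an ordinary memory-backend-ram yields MAX(4 KiB, 2 MiB) = 2 MiB, while a
 * 1 GiB hugetlbfs backend yields MAX(1 GiB, 1 MiB) = 1 GiB.
 */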

/*
 * Size the usable region bigger than the requested size if possible. Esp.
 * Linux guests will only add (aligned) memory blocks in case they fully
 * fit into the usable region, but plug+online only a subset of the pages.
 * The memory block size corresponds mostly to the section size.
 *
 * This allows e.g., to add 20MB with a section size of 128MB on x86_64, and
 * a section size of 1GB on arm64 (as long as the start address is properly
 * aligned, similar to ordinary DIMMs).
 *
 * We can change this at any time and maybe even make it configurable if
 * necessary (as the section size can change). But it's more likely that the
 * section size will rather get smaller and not bigger over time.
 */
#if defined(TARGET_X86_64) || defined(TARGET_I386)
#define VIRTIO_MEM_USABLE_EXTENT (2 * (128 * MiB))
#else
#error VIRTIO_MEM_USABLE_EXTENT not defined
#endif

static bool virtio_mem_is_busy(void)
{
    /*
     * Postcopy cannot handle concurrent discards and we don't want to migrate
     * pages on-demand with stale content when plugging new blocks.
     *
     * For precopy, we don't want unplugged blocks in our migration stream, and
     * when plugging new blocks, the page content might differ between source
     * and destination (observable by the guest when not initializing pages
     * after plugging them) until we're running on the destination (as we
     * didn't migrate these blocks when they were unplugged).
     */
    return migration_in_incoming_postcopy() || !migration_is_idle();
}
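
/*
 * The bitmap tracks the plugged state with one bit per device block: bit i
 * covers the GPA range [addr + i * block_size, addr + (i + 1) * block_size).
 * A worked example with illustrative values: with addr = 0x140000000 and
 * block_size = 2 MiB, a request for gpa = 0x140400000 and size = 4 MiB maps
 * to first_bit = 2 and last_bit = 3 below.
 */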

static bool virtio_mem_test_bitmap(VirtIOMEM *vmem, uint64_t start_gpa,
                                   uint64_t size, bool plugged)
{
    const unsigned long first_bit = (start_gpa - vmem->addr) / vmem->block_size;
    const unsigned long last_bit = first_bit + (size / vmem->block_size) - 1;
    unsigned long found_bit;

    /* We fake a shorter bitmap to avoid searching too far. */
    if (plugged) {
        found_bit = find_next_zero_bit(vmem->bitmap, last_bit + 1, first_bit);
    } else {
        found_bit = find_next_bit(vmem->bitmap, last_bit + 1, first_bit);
    }
    return found_bit > last_bit;
}

static void virtio_mem_set_bitmap(VirtIOMEM *vmem, uint64_t start_gpa,
                                  uint64_t size, bool plugged)
{
    const unsigned long bit = (start_gpa - vmem->addr) / vmem->block_size;
    const unsigned long nbits = size / vmem->block_size;

    if (plugged) {
        bitmap_set(vmem->bitmap, bit, nbits);
    } else {
        bitmap_clear(vmem->bitmap, bit, nbits);
    }
}

static void virtio_mem_send_response(VirtIOMEM *vmem, VirtQueueElement *elem,
                                     struct virtio_mem_resp *resp)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(vmem);
    VirtQueue *vq = vmem->vq;

    trace_virtio_mem_send_response(le16_to_cpu(resp->type));
    iov_from_buf(elem->in_sg, elem->in_num, 0, resp, sizeof(*resp));

    virtqueue_push(vq, elem, sizeof(*resp));
    virtio_notify(vdev, vq);
}

static void virtio_mem_send_response_simple(VirtIOMEM *vmem,
                                            VirtQueueElement *elem,
                                            uint16_t type)
{
    struct virtio_mem_resp resp = {
        .type = cpu_to_le16(type),
    };

    virtio_mem_send_response(vmem, elem, &resp);
}

static bool virtio_mem_valid_range(VirtIOMEM *vmem, uint64_t gpa, uint64_t size)
{
    /* address properly aligned? */
    if (!QEMU_IS_ALIGNED(gpa, vmem->block_size)) {
        return false;
    }
    /* reasonable size? (not empty, no overflow) */
    if (gpa + size < gpa || !size) {
        return false;
    }
    /* start address in usable range? */
    if (gpa < vmem->addr || gpa >= vmem->addr + vmem->usable_region_size) {
        return false;
    }
    /* end address in usable range? */
    if (gpa + size > vmem->addr + vmem->usable_region_size) {
        return false;
    }
    return true;
}

static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
                                      uint64_t size, bool plug)
{
    const uint64_t offset = start_gpa - vmem->addr;
    int ret;

    if (virtio_mem_is_busy()) {
        return -EBUSY;
    }

    if (!plug) {
        ret = ram_block_discard_range(vmem->memdev->mr.ram_block, offset,
                                      size);
        if (ret) {
            error_report("Unexpected error discarding RAM: %s",
                         strerror(-ret));
            return -EBUSY;
        }
    }
    virtio_mem_set_bitmap(vmem, start_gpa, size, plug);
    return 0;
}

static int virtio_mem_state_change_request(VirtIOMEM *vmem, uint64_t gpa,
                                           uint16_t nb_blocks, bool plug)
{
    const uint64_t size = nb_blocks * vmem->block_size;
    int ret;

    if (!virtio_mem_valid_range(vmem, gpa, size)) {
        return VIRTIO_MEM_RESP_ERROR;
    }

    if (plug && (vmem->size + size > vmem->requested_size)) {
        return VIRTIO_MEM_RESP_NACK;
    }

    /* test if really all blocks are in the opposite state */
    if (!virtio_mem_test_bitmap(vmem, gpa, size, !plug)) {
        return VIRTIO_MEM_RESP_ERROR;
    }

    ret = virtio_mem_set_block_state(vmem, gpa, size, plug);
    if (ret) {
        return VIRTIO_MEM_RESP_BUSY;
    }
    if (plug) {
        vmem->size += size;
    } else {
        vmem->size -= size;
    }
    notifier_list_notify(&vmem->size_change_notifiers, &vmem->size);
    return VIRTIO_MEM_RESP_ACK;
}
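
/*
 * An illustrative accounting example (made-up values): with requested_size =
 * 4 GiB and size (currently plugged) = 3 GiB, a plug request is NACKed unless
 * it fits into the remaining 1 GiB. Unplug requests are not bounded by
 * requested_size; any valid range of fully plugged blocks may be unplugged.
 */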

static void virtio_mem_plug_request(VirtIOMEM *vmem, VirtQueueElement *elem,
                                    struct virtio_mem_req *req)
{
    const uint64_t gpa = le64_to_cpu(req->u.plug.addr);
    const uint16_t nb_blocks = le16_to_cpu(req->u.plug.nb_blocks);
    uint16_t type;

    trace_virtio_mem_plug_request(gpa, nb_blocks);
    type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, true);
    virtio_mem_send_response_simple(vmem, elem, type);
}

static void virtio_mem_unplug_request(VirtIOMEM *vmem, VirtQueueElement *elem,
                                      struct virtio_mem_req *req)
{
    const uint64_t gpa = le64_to_cpu(req->u.unplug.addr);
    const uint16_t nb_blocks = le16_to_cpu(req->u.unplug.nb_blocks);
    uint16_t type;

    trace_virtio_mem_unplug_request(gpa, nb_blocks);
    type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, false);
    virtio_mem_send_response_simple(vmem, elem, type);
}

static void virtio_mem_resize_usable_region(VirtIOMEM *vmem,
                                            uint64_t requested_size,
                                            bool can_shrink)
{
    uint64_t newsize = MIN(memory_region_size(&vmem->memdev->mr),
                           requested_size + VIRTIO_MEM_USABLE_EXTENT);

    /* The usable region size always has to be multiples of the block size. */
    newsize = QEMU_ALIGN_UP(newsize, vmem->block_size);

    if (!requested_size) {
        newsize = 0;
    }

    if (newsize < vmem->usable_region_size && !can_shrink) {
        return;
    }

    trace_virtio_mem_resized_usable_region(vmem->usable_region_size, newsize);
    vmem->usable_region_size = newsize;
}

static int virtio_mem_unplug_all(VirtIOMEM *vmem)
{
    RAMBlock *rb = vmem->memdev->mr.ram_block;
    int ret;

    if (virtio_mem_is_busy()) {
        return -EBUSY;
    }

    ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb));
    if (ret) {
        error_report("Unexpected error discarding RAM: %s", strerror(-ret));
        return -EBUSY;
    }
    bitmap_clear(vmem->bitmap, 0, vmem->bitmap_size);
    if (vmem->size) {
        vmem->size = 0;
        notifier_list_notify(&vmem->size_change_notifiers, &vmem->size);
    }
    trace_virtio_mem_unplugged_all();
    virtio_mem_resize_usable_region(vmem, vmem->requested_size, true);
    return 0;
}

static void virtio_mem_unplug_all_request(VirtIOMEM *vmem,
                                          VirtQueueElement *elem)
{
    trace_virtio_mem_unplug_all_request();
    if (virtio_mem_unplug_all(vmem)) {
        virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_BUSY);
    } else {
        virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ACK);
    }
}

static void virtio_mem_state_request(VirtIOMEM *vmem, VirtQueueElement *elem,
                                     struct virtio_mem_req *req)
{
    const uint16_t nb_blocks = le16_to_cpu(req->u.state.nb_blocks);
    const uint64_t gpa = le64_to_cpu(req->u.state.addr);
    const uint64_t size = nb_blocks * vmem->block_size;
    struct virtio_mem_resp resp = {
        .type = cpu_to_le16(VIRTIO_MEM_RESP_ACK),
    };

    trace_virtio_mem_state_request(gpa, nb_blocks);
    if (!virtio_mem_valid_range(vmem, gpa, size)) {
        virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ERROR);
        return;
    }

    if (virtio_mem_test_bitmap(vmem, gpa, size, true)) {
        resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_PLUGGED);
    } else if (virtio_mem_test_bitmap(vmem, gpa, size, false)) {
        resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_UNPLUGGED);
    } else {
        resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_MIXED);
    }
    trace_virtio_mem_state_response(le16_to_cpu(resp.u.state.state));
    virtio_mem_send_response(vmem, elem, &resp);
}
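
/*
 * Each guest request is a fixed-size, little-endian struct virtio_mem_req
 * (a type plus a type-specific payload such as addr/nb_blocks) read from the
 * request's out-iov, and is answered with a fixed-size struct virtio_mem_resp
 * written to the in-iov; see the virtio-mem specification for the exact
 * wire layout.
 */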

static void virtio_mem_handle_request(VirtIODevice *vdev, VirtQueue *vq)
{
    const int len = sizeof(struct virtio_mem_req);
    VirtIOMEM *vmem = VIRTIO_MEM(vdev);
    VirtQueueElement *elem;
    struct virtio_mem_req req;
    uint16_t type;

    while (true) {
        elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
        if (!elem) {
            return;
        }

        if (iov_to_buf(elem->out_sg, elem->out_num, 0, &req, len) < len) {
            virtio_error(vdev, "virtio-mem protocol violation: invalid request"
                         " size: %d", len);
            virtqueue_detach_element(vq, elem, 0);
            g_free(elem);
            return;
        }

        if (iov_size(elem->in_sg, elem->in_num) <
            sizeof(struct virtio_mem_resp)) {
            virtio_error(vdev, "virtio-mem protocol violation: not enough space"
                         " for response: %zu",
                         iov_size(elem->in_sg, elem->in_num));
            virtqueue_detach_element(vq, elem, 0);
            g_free(elem);
            return;
        }

        type = le16_to_cpu(req.type);
        switch (type) {
        case VIRTIO_MEM_REQ_PLUG:
            virtio_mem_plug_request(vmem, elem, &req);
            break;
        case VIRTIO_MEM_REQ_UNPLUG:
            virtio_mem_unplug_request(vmem, elem, &req);
            break;
        case VIRTIO_MEM_REQ_UNPLUG_ALL:
            virtio_mem_unplug_all_request(vmem, elem);
            break;
        case VIRTIO_MEM_REQ_STATE:
            virtio_mem_state_request(vmem, elem, &req);
            break;
        default:
            virtio_error(vdev, "virtio-mem protocol violation: unknown request"
                         " type: %d", type);
            virtqueue_detach_element(vq, elem, 0);
            g_free(elem);
            return;
        }

        g_free(elem);
    }
}

static void virtio_mem_get_config(VirtIODevice *vdev, uint8_t *config_data)
{
    VirtIOMEM *vmem = VIRTIO_MEM(vdev);
    struct virtio_mem_config *config = (void *) config_data;

    config->block_size = cpu_to_le64(vmem->block_size);
    config->node_id = cpu_to_le16(vmem->node);
    config->requested_size = cpu_to_le64(vmem->requested_size);
    config->plugged_size = cpu_to_le64(vmem->size);
    config->addr = cpu_to_le64(vmem->addr);
    config->region_size = cpu_to_le64(memory_region_size(&vmem->memdev->mr));
    config->usable_region_size = cpu_to_le64(vmem->usable_region_size);
}
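
/*
 * Illustrative config snapshot (made-up values): with a 16 GiB memdev,
 * block_size = 2 MiB and requested_size = 1 GiB, the usable region is sized
 * to MIN(16 GiB, 1 GiB + VIRTIO_MEM_USABLE_EXTENT) = 1.25 GiB, so the guest
 * may (un)plug blocks anywhere within the first 1.25 GiB of the region.
 */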

static uint64_t virtio_mem_get_features(VirtIODevice *vdev, uint64_t features,
                                        Error **errp)
{
    MachineState *ms = MACHINE(qdev_get_machine());

    if (ms->numa_state) {
#if defined(CONFIG_ACPI)
        virtio_add_feature(&features, VIRTIO_MEM_F_ACPI_PXM);
#endif
    }
    return features;
}

static void virtio_mem_system_reset(void *opaque)
{
    VirtIOMEM *vmem = VIRTIO_MEM(opaque);

    /*
     * During usual resets, we will unplug all memory and shrink the usable
     * region size. This is, however, not possible in all scenarios. Then,
     * the guest has to deal with this manually (VIRTIO_MEM_REQ_UNPLUG_ALL).
     */
    virtio_mem_unplug_all(vmem);
}

static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    int nb_numa_nodes = ms->numa_state ? ms->numa_state->num_nodes : 0;
    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    VirtIOMEM *vmem = VIRTIO_MEM(dev);
    uint64_t page_size;
    RAMBlock *rb;
    int ret;

    if (!vmem->memdev) {
        error_setg(errp, "'%s' property is not set", VIRTIO_MEM_MEMDEV_PROP);
        return;
    } else if (host_memory_backend_is_mapped(vmem->memdev)) {
        error_setg(errp, "'%s' property specifies a busy memdev: %s",
                   VIRTIO_MEM_MEMDEV_PROP,
                   object_get_canonical_path_component(OBJECT(vmem->memdev)));
        return;
    } else if (!memory_region_is_ram(&vmem->memdev->mr) ||
               memory_region_is_rom(&vmem->memdev->mr) ||
               !vmem->memdev->mr.ram_block) {
        error_setg(errp, "'%s' property specifies an unsupported memdev",
                   VIRTIO_MEM_MEMDEV_PROP);
        return;
    }

    if ((nb_numa_nodes && vmem->node >= nb_numa_nodes) ||
        (!nb_numa_nodes && vmem->node)) {
        error_setg(errp, "'%s' property has value '%" PRIu32 "', which exceeds"
                   " the number of numa nodes: %d", VIRTIO_MEM_NODE_PROP,
                   vmem->node, nb_numa_nodes ? nb_numa_nodes : 1);
        return;
    }

    if (enable_mlock) {
        error_setg(errp, "Incompatible with mlock");
        return;
    }

    rb = vmem->memdev->mr.ram_block;
    page_size = qemu_ram_pagesize(rb);

    /*
     * If the block size wasn't configured by the user, use a sane default.
     * This allows using hugetlbfs backends of any page size without manual
     * intervention.
     */
    if (!vmem->block_size) {
        vmem->block_size = virtio_mem_default_block_size(rb);
    }

    if (vmem->block_size < page_size) {
        error_setg(errp, "'%s' property has to be at least the page size (0x%"
                   PRIx64 ")", VIRTIO_MEM_BLOCK_SIZE_PROP, page_size);
        return;
    } else if (vmem->block_size < virtio_mem_default_block_size(rb)) {
        warn_report("'%s' property is smaller than the default block size (%"
                    PRIx64 " MiB)", VIRTIO_MEM_BLOCK_SIZE_PROP,
                    virtio_mem_default_block_size(rb) / MiB);
    } else if (!QEMU_IS_ALIGNED(vmem->requested_size, vmem->block_size)) {
        error_setg(errp, "'%s' property has to be multiples of '%s' (0x%" PRIx64
                   ")", VIRTIO_MEM_REQUESTED_SIZE_PROP,
                   VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size);
        return;
    } else if (!QEMU_IS_ALIGNED(vmem->addr, vmem->block_size)) {
        error_setg(errp, "'%s' property has to be multiples of '%s' (0x%" PRIx64
                   ")", VIRTIO_MEM_ADDR_PROP, VIRTIO_MEM_BLOCK_SIZE_PROP,
                   vmem->block_size);
        return;
    } else if (!QEMU_IS_ALIGNED(memory_region_size(&vmem->memdev->mr),
                                vmem->block_size)) {
        error_setg(errp, "'%s' property memdev size has to be multiples of"
                   " '%s' (0x%" PRIx64 ")", VIRTIO_MEM_MEMDEV_PROP,
                   VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size);
        return;
    }

    if (ram_block_discard_require(true)) {
        error_setg(errp, "Discarding RAM is disabled");
        return;
    }

    ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb));
    if (ret) {
        error_setg_errno(errp, -ret, "Unexpected error discarding RAM");
        ram_block_discard_require(false);
        return;
    }

    virtio_mem_resize_usable_region(vmem, vmem->requested_size, true);

    vmem->bitmap_size = memory_region_size(&vmem->memdev->mr) /
                        vmem->block_size;
    vmem->bitmap = bitmap_new(vmem->bitmap_size);

    virtio_init(vdev, TYPE_VIRTIO_MEM, VIRTIO_ID_MEM,
                sizeof(struct virtio_mem_config));
    vmem->vq = virtio_add_queue(vdev, 128, virtio_mem_handle_request);

    host_memory_backend_set_mapped(vmem->memdev, true);
    vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem));
    qemu_register_reset(virtio_mem_system_reset, vmem);
    precopy_add_notifier(&vmem->precopy_notifier);
}
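
/*
 * A minimal usage sketch (illustrative command line, assuming the
 * virtio-mem-pci proxy device and a machine configured with maxmem):
 *
 *   qemu-system-x86_64 -m 4G,maxmem=20G \
 *       -object memory-backend-ram,id=vmem0,size=16G \
 *       -device virtio-mem-pci,id=vm0,memdev=vmem0,requested-size=1G
 */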

static void virtio_mem_device_unrealize(DeviceState *dev)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    VirtIOMEM *vmem = VIRTIO_MEM(dev);

    precopy_remove_notifier(&vmem->precopy_notifier);
    qemu_unregister_reset(virtio_mem_system_reset, vmem);
    vmstate_unregister_ram(&vmem->memdev->mr, DEVICE(vmem));
    host_memory_backend_set_mapped(vmem->memdev, false);
    virtio_del_queue(vdev, 0);
    virtio_cleanup(vdev);
    g_free(vmem->bitmap);
    ram_block_discard_require(false);
}

static int virtio_mem_restore_unplugged(VirtIOMEM *vmem)
{
    RAMBlock *rb = vmem->memdev->mr.ram_block;
    unsigned long first_zero_bit, last_zero_bit;
    uint64_t offset, length;
    int ret;

    /* Find consecutive unplugged blocks and discard the consecutive range. */
    first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size);
    while (first_zero_bit < vmem->bitmap_size) {
        offset = first_zero_bit * vmem->block_size;
        last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
                                      first_zero_bit + 1) - 1;
        length = (last_zero_bit - first_zero_bit + 1) * vmem->block_size;

        ret = ram_block_discard_range(rb, offset, length);
        if (ret) {
            error_report("Unexpected error discarding RAM: %s",
                         strerror(-ret));
            return -EINVAL;
        }
        first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
                                            last_zero_bit + 2);
    }
    return 0;
}

static int virtio_mem_post_load(void *opaque, int version_id)
{
    if (migration_in_incoming_postcopy()) {
        return 0;
    }

    return virtio_mem_restore_unplugged(VIRTIO_MEM(opaque));
}
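
/*
 * Migration sanity checks: key device properties are captured in a temporary
 * VMState section (see VMSTATE_WITH_TMP below) and compared on the
 * destination, so e.g. a mismatching block size fails migration cleanly
 * instead of silently misinterpreting the migrated bitmap.
 */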

typedef struct VirtIOMEMMigSanityChecks {
    VirtIOMEM *parent;
    uint64_t addr;
    uint64_t region_size;
    uint64_t block_size;
    uint32_t node;
} VirtIOMEMMigSanityChecks;

static int virtio_mem_mig_sanity_checks_pre_save(void *opaque)
{
    VirtIOMEMMigSanityChecks *tmp = opaque;
    VirtIOMEM *vmem = tmp->parent;

    tmp->addr = vmem->addr;
    tmp->region_size = memory_region_size(&vmem->memdev->mr);
    tmp->block_size = vmem->block_size;
    tmp->node = vmem->node;
    return 0;
}

static int virtio_mem_mig_sanity_checks_post_load(void *opaque, int version_id)
{
    VirtIOMEMMigSanityChecks *tmp = opaque;
    VirtIOMEM *vmem = tmp->parent;
    const uint64_t new_region_size = memory_region_size(&vmem->memdev->mr);

    if (tmp->addr != vmem->addr) {
        error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64,
                     VIRTIO_MEM_ADDR_PROP, tmp->addr, vmem->addr);
        return -EINVAL;
    }
    /*
     * Note: Preparation for resizeable memory regions. The maximum size
     * of the memory region must not change during migration.
     */
    if (tmp->region_size != new_region_size) {
        error_report("Property '%s' size changed from 0x%" PRIx64 " to 0x%"
                     PRIx64, VIRTIO_MEM_MEMDEV_PROP, tmp->region_size,
                     new_region_size);
        return -EINVAL;
    }
    if (tmp->block_size != vmem->block_size) {
        error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64,
                     VIRTIO_MEM_BLOCK_SIZE_PROP, tmp->block_size,
                     vmem->block_size);
        return -EINVAL;
    }
    if (tmp->node != vmem->node) {
        error_report("Property '%s' changed from %" PRIu32 " to %" PRIu32,
                     VIRTIO_MEM_NODE_PROP, tmp->node, vmem->node);
        return -EINVAL;
    }
    return 0;
}

static const VMStateDescription vmstate_virtio_mem_sanity_checks = {
    .name = "virtio-mem-device/sanity-checks",
    .pre_save = virtio_mem_mig_sanity_checks_pre_save,
    .post_load = virtio_mem_mig_sanity_checks_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_UINT64(addr, VirtIOMEMMigSanityChecks),
        VMSTATE_UINT64(region_size, VirtIOMEMMigSanityChecks),
        VMSTATE_UINT64(block_size, VirtIOMEMMigSanityChecks),
        VMSTATE_UINT32(node, VirtIOMEMMigSanityChecks),
        VMSTATE_END_OF_LIST(),
    },
};

static const VMStateDescription vmstate_virtio_mem_device = {
    .name = "virtio-mem-device",
    .minimum_version_id = 1,
    .version_id = 1,
    .post_load = virtio_mem_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_WITH_TMP(VirtIOMEM, VirtIOMEMMigSanityChecks,
                         vmstate_virtio_mem_sanity_checks),
        VMSTATE_UINT64(usable_region_size, VirtIOMEM),
        VMSTATE_UINT64(size, VirtIOMEM),
        VMSTATE_UINT64(requested_size, VirtIOMEM),
        VMSTATE_BITMAP(bitmap, VirtIOMEM, 0, bitmap_size),
        VMSTATE_END_OF_LIST()
    },
};

static const VMStateDescription vmstate_virtio_mem = {
    .name = "virtio-mem",
    .minimum_version_id = 1,
    .version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_VIRTIO_DEVICE,
        VMSTATE_END_OF_LIST()
    },
};

static void virtio_mem_fill_device_info(const VirtIOMEM *vmem,
                                        VirtioMEMDeviceInfo *vi)
{
    vi->memaddr = vmem->addr;
    vi->node = vmem->node;
    vi->requested_size = vmem->requested_size;
    vi->size = vmem->size;
    vi->max_size = memory_region_size(&vmem->memdev->mr);
    vi->block_size = vmem->block_size;
    vi->memdev = object_get_canonical_path(OBJECT(vmem->memdev));
}

static MemoryRegion *virtio_mem_get_memory_region(VirtIOMEM *vmem, Error **errp)
{
    if (!vmem->memdev) {
        error_setg(errp, "'%s' property must be set", VIRTIO_MEM_MEMDEV_PROP);
        return NULL;
    }

    return &vmem->memdev->mr;
}

static void virtio_mem_add_size_change_notifier(VirtIOMEM *vmem,
                                                Notifier *notifier)
{
    notifier_list_add(&vmem->size_change_notifiers, notifier);
}

static void virtio_mem_remove_size_change_notifier(VirtIOMEM *vmem,
                                                   Notifier *notifier)
{
    notifier_remove(notifier);
}

static void virtio_mem_get_size(Object *obj, Visitor *v, const char *name,
                                void *opaque, Error **errp)
{
    const VirtIOMEM *vmem = VIRTIO_MEM(obj);
    uint64_t value = vmem->size;

    visit_type_size(v, name, &value, errp);
}

static void virtio_mem_get_requested_size(Object *obj, Visitor *v,
                                          const char *name, void *opaque,
                                          Error **errp)
{
    const VirtIOMEM *vmem = VIRTIO_MEM(obj);
    uint64_t value = vmem->requested_size;

    visit_type_size(v, name, &value, errp);
}
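
/*
 * A hypothetical QMP example (assuming a device created with id "vm0") for
 * resizing at runtime; the guest is notified via a config update and
 * (un)plugs blocks until plugged_size matches the new requested size:
 *
 *   { "execute": "qom-set",
 *     "arguments": { "path": "/machine/peripheral/vm0",
 *                    "property": "requested-size",
 *                    "value": 2147483648 } }   <- 2 GiB, in bytes
 */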

static void virtio_mem_set_requested_size(Object *obj, Visitor *v,
                                          const char *name, void *opaque,
                                          Error **errp)
{
    VirtIOMEM *vmem = VIRTIO_MEM(obj);
    Error *err = NULL;
    uint64_t value;

    visit_type_size(v, name, &value, &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }

    /*
     * The block size and memory backend are not fixed until the device was
     * realized. realize() will verify these properties then.
     */
    if (DEVICE(obj)->realized) {
        if (!QEMU_IS_ALIGNED(value, vmem->block_size)) {
            error_setg(errp, "'%s' has to be multiples of '%s' (0x%" PRIx64
                       ")", name, VIRTIO_MEM_BLOCK_SIZE_PROP,
                       vmem->block_size);
            return;
        } else if (value > memory_region_size(&vmem->memdev->mr)) {
            error_setg(errp, "'%s' cannot exceed the memory backend size"
                       " (0x%" PRIx64 ")", name,
                       memory_region_size(&vmem->memdev->mr));
            return;
        }

        if (value != vmem->requested_size) {
            virtio_mem_resize_usable_region(vmem, value, false);
            vmem->requested_size = value;
        }
        /*
         * Trigger a config update so the guest gets notified. We trigger
         * even if the size didn't change (especially helpful for debugging).
         */
        virtio_notify_config(VIRTIO_DEVICE(vmem));
    } else {
        vmem->requested_size = value;
    }
}

static void virtio_mem_get_block_size(Object *obj, Visitor *v, const char *name,
                                      void *opaque, Error **errp)
{
    const VirtIOMEM *vmem = VIRTIO_MEM(obj);
    uint64_t value = vmem->block_size;

    /*
     * If not configured by the user (and we're not realized yet), use the
     * default block size we would use with the current memory backend.
     */
    if (!value) {
        if (vmem->memdev && memory_region_is_ram(&vmem->memdev->mr)) {
            value = virtio_mem_default_block_size(vmem->memdev->mr.ram_block);
        } else {
            value = virtio_mem_thp_size();
        }
    }

    visit_type_size(v, name, &value, errp);
}

static void virtio_mem_set_block_size(Object *obj, Visitor *v, const char *name,
                                      void *opaque, Error **errp)
{
    VirtIOMEM *vmem = VIRTIO_MEM(obj);
    Error *err = NULL;
    uint64_t value;

    if (DEVICE(obj)->realized) {
        error_setg(errp, "'%s' cannot be changed", name);
        return;
    }

    visit_type_size(v, name, &value, &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }

    if (value < VIRTIO_MEM_MIN_BLOCK_SIZE) {
        error_setg(errp, "'%s' property has to be at least 0x%" PRIx32, name,
                   VIRTIO_MEM_MIN_BLOCK_SIZE);
        return;
    } else if (!is_power_of_2(value)) {
        error_setg(errp, "'%s' property has to be a power of two", name);
        return;
    }
    vmem->block_size = value;
}
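
/*
 * Worked example for the zero-bit scan below (illustrative values): with
 * bitmap_size = 8 and only bits 0-1 set, first_zero_bit = 2 and
 * last_zero_bit = 7, so a single range covering blocks 2-7 (offset =
 * 2 * block_size, length = 6 * block_size) is excluded in one iteration.
 */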
885 */ 886 first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size); 887 while (first_zero_bit < vmem->bitmap_size) { 888 offset = first_zero_bit * vmem->block_size; 889 last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, 890 first_zero_bit + 1) - 1; 891 length = (last_zero_bit - first_zero_bit + 1) * vmem->block_size; 892 893 qemu_guest_free_page_hint(host + offset, length); 894 first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, 895 last_zero_bit + 2); 896 } 897 } 898 899 static int virtio_mem_precopy_notify(NotifierWithReturn *n, void *data) 900 { 901 VirtIOMEM *vmem = container_of(n, VirtIOMEM, precopy_notifier); 902 PrecopyNotifyData *pnd = data; 903 904 switch (pnd->reason) { 905 case PRECOPY_NOTIFY_SETUP: 906 precopy_enable_free_page_optimization(); 907 break; 908 case PRECOPY_NOTIFY_AFTER_BITMAP_SYNC: 909 virtio_mem_precopy_exclude_unplugged(vmem); 910 break; 911 default: 912 break; 913 } 914 915 return 0; 916 } 917 918 static void virtio_mem_instance_init(Object *obj) 919 { 920 VirtIOMEM *vmem = VIRTIO_MEM(obj); 921 922 notifier_list_init(&vmem->size_change_notifiers); 923 vmem->precopy_notifier.notify = virtio_mem_precopy_notify; 924 925 object_property_add(obj, VIRTIO_MEM_SIZE_PROP, "size", virtio_mem_get_size, 926 NULL, NULL, NULL); 927 object_property_add(obj, VIRTIO_MEM_REQUESTED_SIZE_PROP, "size", 928 virtio_mem_get_requested_size, 929 virtio_mem_set_requested_size, NULL, NULL); 930 object_property_add(obj, VIRTIO_MEM_BLOCK_SIZE_PROP, "size", 931 virtio_mem_get_block_size, virtio_mem_set_block_size, 932 NULL, NULL); 933 } 934 935 static Property virtio_mem_properties[] = { 936 DEFINE_PROP_UINT64(VIRTIO_MEM_ADDR_PROP, VirtIOMEM, addr, 0), 937 DEFINE_PROP_UINT32(VIRTIO_MEM_NODE_PROP, VirtIOMEM, node, 0), 938 DEFINE_PROP_LINK(VIRTIO_MEM_MEMDEV_PROP, VirtIOMEM, memdev, 939 TYPE_MEMORY_BACKEND, HostMemoryBackend *), 940 DEFINE_PROP_END_OF_LIST(), 941 }; 942 943 static void virtio_mem_class_init(ObjectClass *klass, void *data) 944 { 945 DeviceClass *dc = DEVICE_CLASS(klass); 946 VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); 947 VirtIOMEMClass *vmc = VIRTIO_MEM_CLASS(klass); 948 949 device_class_set_props(dc, virtio_mem_properties); 950 dc->vmsd = &vmstate_virtio_mem; 951 952 set_bit(DEVICE_CATEGORY_MISC, dc->categories); 953 vdc->realize = virtio_mem_device_realize; 954 vdc->unrealize = virtio_mem_device_unrealize; 955 vdc->get_config = virtio_mem_get_config; 956 vdc->get_features = virtio_mem_get_features; 957 vdc->vmsd = &vmstate_virtio_mem_device; 958 959 vmc->fill_device_info = virtio_mem_fill_device_info; 960 vmc->get_memory_region = virtio_mem_get_memory_region; 961 vmc->add_size_change_notifier = virtio_mem_add_size_change_notifier; 962 vmc->remove_size_change_notifier = virtio_mem_remove_size_change_notifier; 963 } 964 965 static const TypeInfo virtio_mem_info = { 966 .name = TYPE_VIRTIO_MEM, 967 .parent = TYPE_VIRTIO_DEVICE, 968 .instance_size = sizeof(VirtIOMEM), 969 .instance_init = virtio_mem_instance_init, 970 .class_init = virtio_mem_class_init, 971 .class_size = sizeof(VirtIOMEMClass), 972 }; 973 974 static void virtio_register_types(void) 975 { 976 type_register_static(&virtio_mem_info); 977 } 978 979 type_init(virtio_register_types) 980