1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Virtio-mem device driver. 4 * 5 * Copyright Red Hat, Inc. 2020 6 * 7 * Author(s): David Hildenbrand <david@redhat.com> 8 */ 9 10 #include <linux/virtio.h> 11 #include <linux/virtio_mem.h> 12 #include <linux/workqueue.h> 13 #include <linux/slab.h> 14 #include <linux/module.h> 15 #include <linux/mm.h> 16 #include <linux/memory_hotplug.h> 17 #include <linux/memory.h> 18 #include <linux/hrtimer.h> 19 #include <linux/crash_dump.h> 20 #include <linux/mutex.h> 21 #include <linux/bitmap.h> 22 #include <linux/lockdep.h> 23 24 #include <acpi/acpi_numa.h> 25 26 static bool unplug_online = true; 27 module_param(unplug_online, bool, 0644); 28 MODULE_PARM_DESC(unplug_online, "Try to unplug online memory"); 29 30 static bool force_bbm; 31 module_param(force_bbm, bool, 0444); 32 MODULE_PARM_DESC(force_bbm, 33 "Force Big Block Mode. Default is 0 (auto-selection)"); 34 35 static unsigned long bbm_block_size; 36 module_param(bbm_block_size, ulong, 0444); 37 MODULE_PARM_DESC(bbm_block_size, 38 "Big Block size in bytes. Default is 0 (auto-detection)."); 39 40 static bool bbm_safe_unplug = true; 41 module_param(bbm_safe_unplug, bool, 0444); 42 MODULE_PARM_DESC(bbm_safe_unplug, 43 "Use a safe unplug mechanism in BBM, avoiding long/endless loops"); 44 45 /* 46 * virtio-mem currently supports the following modes of operation: 47 * 48 * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The 49 * size of a Sub Block (SB) is determined based on the device block size, the 50 * pageblock size, and the maximum allocation granularity of the buddy. 51 * Subblocks within a Linux memory block might either be plugged or unplugged. 52 * Memory is added/removed to Linux MM in Linux memory block granularity. 53 * 54 * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks. 55 * Memory is added/removed to Linux MM in Big Block granularity. 56 * 57 * The mode is determined automatically based on the Linux memory block size 58 * and the device block size. 59 * 60 * User space / core MM (auto onlining) is responsible for onlining added 61 * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are 62 * always onlined separately, and all memory within a Linux memory block is 63 * onlined to the same zone - virtio-mem relies on this behavior. 64 */ 65 66 /* 67 * State of a Linux memory block in SBM. 68 */ 69 enum virtio_mem_sbm_mb_state { 70 /* Unplugged, not added to Linux. Can be reused later. */ 71 VIRTIO_MEM_SBM_MB_UNUSED = 0, 72 /* (Partially) plugged, not added to Linux. Error on add_memory(). */ 73 VIRTIO_MEM_SBM_MB_PLUGGED, 74 /* Fully plugged, fully added to Linux, offline. */ 75 VIRTIO_MEM_SBM_MB_OFFLINE, 76 /* Partially plugged, fully added to Linux, offline. */ 77 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 78 /* Fully plugged, fully added to Linux, onlined to a kernel zone. */ 79 VIRTIO_MEM_SBM_MB_KERNEL, 80 /* Partially plugged, fully added to Linux, online to a kernel zone */ 81 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, 82 /* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */ 83 VIRTIO_MEM_SBM_MB_MOVABLE, 84 /* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */ 85 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, 86 VIRTIO_MEM_SBM_MB_COUNT 87 }; 88 89 /* 90 * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks. 91 */ 92 enum virtio_mem_bbm_bb_state { 93 /* Unplugged, not added to Linux. Can be reused later. */ 94 VIRTIO_MEM_BBM_BB_UNUSED = 0, 95 /* Plugged, not added to Linux. Error on add_memory(). */ 96 VIRTIO_MEM_BBM_BB_PLUGGED, 97 /* Plugged and added to Linux. */ 98 VIRTIO_MEM_BBM_BB_ADDED, 99 /* All online parts are fake-offline, ready to remove. */ 100 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE, 101 VIRTIO_MEM_BBM_BB_COUNT 102 }; 103 104 struct virtio_mem { 105 struct virtio_device *vdev; 106 107 /* We might first have to unplug all memory when starting up. */ 108 bool unplug_all_required; 109 110 /* Workqueue that processes the plug/unplug requests. */ 111 struct work_struct wq; 112 atomic_t wq_active; 113 atomic_t config_changed; 114 115 /* Virtqueue for guest->host requests. */ 116 struct virtqueue *vq; 117 118 /* Wait for a host response to a guest request. */ 119 wait_queue_head_t host_resp; 120 121 /* Space for one guest request and the host response. */ 122 struct virtio_mem_req req; 123 struct virtio_mem_resp resp; 124 125 /* The current size of the device. */ 126 uint64_t plugged_size; 127 /* The requested size of the device. */ 128 uint64_t requested_size; 129 130 /* The device block size (for communicating with the device). */ 131 uint64_t device_block_size; 132 /* The determined node id for all memory of the device. */ 133 int nid; 134 /* Physical start address of the memory region. */ 135 uint64_t addr; 136 /* Maximum region size in bytes. */ 137 uint64_t region_size; 138 139 /* The parent resource for all memory added via this device. */ 140 struct resource *parent_resource; 141 /* 142 * Copy of "System RAM (virtio_mem)" to be used for 143 * add_memory_driver_managed(). 144 */ 145 const char *resource_name; 146 /* Memory group identification. */ 147 int mgid; 148 149 /* 150 * We don't want to add too much memory if it's not getting onlined, 151 * to avoid running OOM. Besides this threshold, we allow to have at 152 * least two offline blocks at a time (whatever is bigger). 153 */ 154 #define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD (1024 * 1024 * 1024) 155 atomic64_t offline_size; 156 uint64_t offline_threshold; 157 158 /* If set, the driver is in SBM, otherwise in BBM. */ 159 bool in_sbm; 160 161 union { 162 struct { 163 /* Id of the first memory block of this device. */ 164 unsigned long first_mb_id; 165 /* Id of the last usable memory block of this device. */ 166 unsigned long last_usable_mb_id; 167 /* Id of the next memory bock to prepare when needed. */ 168 unsigned long next_mb_id; 169 170 /* The subblock size. */ 171 uint64_t sb_size; 172 /* The number of subblocks per Linux memory block. */ 173 uint32_t sbs_per_mb; 174 175 /* Summary of all memory block states. */ 176 unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT]; 177 178 /* 179 * One byte state per memory block. Allocated via 180 * vmalloc(). Resized (alloc+copy+free) on demand. 181 * 182 * With 128 MiB memory blocks, we have states for 512 183 * GiB of memory in one 4 KiB page. 184 */ 185 uint8_t *mb_states; 186 187 /* 188 * Bitmap: one bit per subblock. Allocated similar to 189 * sbm.mb_states. 190 * 191 * A set bit means the corresponding subblock is 192 * plugged, otherwise it's unblocked. 193 * 194 * With 4 MiB subblocks, we manage 128 GiB of memory 195 * in one 4 KiB page. 196 */ 197 unsigned long *sb_states; 198 } sbm; 199 200 struct { 201 /* Id of the first big block of this device. */ 202 unsigned long first_bb_id; 203 /* Id of the last usable big block of this device. */ 204 unsigned long last_usable_bb_id; 205 /* Id of the next device bock to prepare when needed. */ 206 unsigned long next_bb_id; 207 208 /* Summary of all big block states. */ 209 unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT]; 210 211 /* One byte state per big block. See sbm.mb_states. */ 212 uint8_t *bb_states; 213 214 /* The block size used for plugging/adding/removing. */ 215 uint64_t bb_size; 216 } bbm; 217 }; 218 219 /* 220 * Mutex that protects the sbm.mb_count, sbm.mb_states, 221 * sbm.sb_states, bbm.bb_count, and bbm.bb_states 222 * 223 * When this lock is held the pointers can't change, ONLINE and 224 * OFFLINE blocks can't change the state and no subblocks will get 225 * plugged/unplugged. 226 * 227 * In kdump mode, used to serialize requests, last_block_addr and 228 * last_block_plugged. 229 */ 230 struct mutex hotplug_mutex; 231 bool hotplug_active; 232 233 /* An error occurred we cannot handle - stop processing requests. */ 234 bool broken; 235 236 /* Cached valued of is_kdump_kernel() when the device was probed. */ 237 bool in_kdump; 238 239 /* The driver is being removed. */ 240 spinlock_t removal_lock; 241 bool removing; 242 243 /* Timer for retrying to plug/unplug memory. */ 244 struct hrtimer retry_timer; 245 unsigned int retry_timer_ms; 246 #define VIRTIO_MEM_RETRY_TIMER_MIN_MS 50000 247 #define VIRTIO_MEM_RETRY_TIMER_MAX_MS 300000 248 249 /* Memory notifier (online/offline events). */ 250 struct notifier_block memory_notifier; 251 252 #ifdef CONFIG_PROC_VMCORE 253 /* vmcore callback for /proc/vmcore handling in kdump mode */ 254 struct vmcore_cb vmcore_cb; 255 uint64_t last_block_addr; 256 bool last_block_plugged; 257 #endif /* CONFIG_PROC_VMCORE */ 258 259 /* Next device in the list of virtio-mem devices. */ 260 struct list_head next; 261 }; 262 263 /* 264 * We have to share a single online_page callback among all virtio-mem 265 * devices. We use RCU to iterate the list in the callback. 266 */ 267 static DEFINE_MUTEX(virtio_mem_mutex); 268 static LIST_HEAD(virtio_mem_devices); 269 270 static void virtio_mem_online_page_cb(struct page *page, unsigned int order); 271 static void virtio_mem_fake_offline_going_offline(unsigned long pfn, 272 unsigned long nr_pages); 273 static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, 274 unsigned long nr_pages); 275 static void virtio_mem_retry(struct virtio_mem *vm); 276 static int virtio_mem_create_resource(struct virtio_mem *vm); 277 static void virtio_mem_delete_resource(struct virtio_mem *vm); 278 279 /* 280 * Register a virtio-mem device so it will be considered for the online_page 281 * callback. 282 */ 283 static int register_virtio_mem_device(struct virtio_mem *vm) 284 { 285 int rc = 0; 286 287 /* First device registers the callback. */ 288 mutex_lock(&virtio_mem_mutex); 289 if (list_empty(&virtio_mem_devices)) 290 rc = set_online_page_callback(&virtio_mem_online_page_cb); 291 if (!rc) 292 list_add_rcu(&vm->next, &virtio_mem_devices); 293 mutex_unlock(&virtio_mem_mutex); 294 295 return rc; 296 } 297 298 /* 299 * Unregister a virtio-mem device so it will no longer be considered for the 300 * online_page callback. 301 */ 302 static void unregister_virtio_mem_device(struct virtio_mem *vm) 303 { 304 /* Last device unregisters the callback. */ 305 mutex_lock(&virtio_mem_mutex); 306 list_del_rcu(&vm->next); 307 if (list_empty(&virtio_mem_devices)) 308 restore_online_page_callback(&virtio_mem_online_page_cb); 309 mutex_unlock(&virtio_mem_mutex); 310 311 synchronize_rcu(); 312 } 313 314 /* 315 * Calculate the memory block id of a given address. 316 */ 317 static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr) 318 { 319 return addr / memory_block_size_bytes(); 320 } 321 322 /* 323 * Calculate the physical start address of a given memory block id. 324 */ 325 static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id) 326 { 327 return mb_id * memory_block_size_bytes(); 328 } 329 330 /* 331 * Calculate the big block id of a given address. 332 */ 333 static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm, 334 uint64_t addr) 335 { 336 return addr / vm->bbm.bb_size; 337 } 338 339 /* 340 * Calculate the physical start address of a given big block id. 341 */ 342 static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm, 343 unsigned long bb_id) 344 { 345 return bb_id * vm->bbm.bb_size; 346 } 347 348 /* 349 * Calculate the subblock id of a given address. 350 */ 351 static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm, 352 unsigned long addr) 353 { 354 const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr); 355 const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id); 356 357 return (addr - mb_addr) / vm->sbm.sb_size; 358 } 359 360 /* 361 * Set the state of a big block, taking care of the state counter. 362 */ 363 static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm, 364 unsigned long bb_id, 365 enum virtio_mem_bbm_bb_state state) 366 { 367 const unsigned long idx = bb_id - vm->bbm.first_bb_id; 368 enum virtio_mem_bbm_bb_state old_state; 369 370 old_state = vm->bbm.bb_states[idx]; 371 vm->bbm.bb_states[idx] = state; 372 373 BUG_ON(vm->bbm.bb_count[old_state] == 0); 374 vm->bbm.bb_count[old_state]--; 375 vm->bbm.bb_count[state]++; 376 } 377 378 /* 379 * Get the state of a big block. 380 */ 381 static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm, 382 unsigned long bb_id) 383 { 384 return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id]; 385 } 386 387 /* 388 * Prepare the big block state array for the next big block. 389 */ 390 static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm) 391 { 392 unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id; 393 unsigned long new_bytes = old_bytes + 1; 394 int old_pages = PFN_UP(old_bytes); 395 int new_pages = PFN_UP(new_bytes); 396 uint8_t *new_array; 397 398 if (vm->bbm.bb_states && old_pages == new_pages) 399 return 0; 400 401 new_array = vzalloc(new_pages * PAGE_SIZE); 402 if (!new_array) 403 return -ENOMEM; 404 405 mutex_lock(&vm->hotplug_mutex); 406 if (vm->bbm.bb_states) 407 memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE); 408 vfree(vm->bbm.bb_states); 409 vm->bbm.bb_states = new_array; 410 mutex_unlock(&vm->hotplug_mutex); 411 412 return 0; 413 } 414 415 #define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \ 416 for (_bb_id = vm->bbm.first_bb_id; \ 417 _bb_id < vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \ 418 _bb_id++) \ 419 if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) 420 421 #define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \ 422 for (_bb_id = vm->bbm.next_bb_id - 1; \ 423 _bb_id >= vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \ 424 _bb_id--) \ 425 if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) 426 427 /* 428 * Set the state of a memory block, taking care of the state counter. 429 */ 430 static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm, 431 unsigned long mb_id, uint8_t state) 432 { 433 const unsigned long idx = mb_id - vm->sbm.first_mb_id; 434 uint8_t old_state; 435 436 old_state = vm->sbm.mb_states[idx]; 437 vm->sbm.mb_states[idx] = state; 438 439 BUG_ON(vm->sbm.mb_count[old_state] == 0); 440 vm->sbm.mb_count[old_state]--; 441 vm->sbm.mb_count[state]++; 442 } 443 444 /* 445 * Get the state of a memory block. 446 */ 447 static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm, 448 unsigned long mb_id) 449 { 450 const unsigned long idx = mb_id - vm->sbm.first_mb_id; 451 452 return vm->sbm.mb_states[idx]; 453 } 454 455 /* 456 * Prepare the state array for the next memory block. 457 */ 458 static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm) 459 { 460 int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id); 461 int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1); 462 uint8_t *new_array; 463 464 if (vm->sbm.mb_states && old_pages == new_pages) 465 return 0; 466 467 new_array = vzalloc(new_pages * PAGE_SIZE); 468 if (!new_array) 469 return -ENOMEM; 470 471 mutex_lock(&vm->hotplug_mutex); 472 if (vm->sbm.mb_states) 473 memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE); 474 vfree(vm->sbm.mb_states); 475 vm->sbm.mb_states = new_array; 476 mutex_unlock(&vm->hotplug_mutex); 477 478 return 0; 479 } 480 481 #define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \ 482 for (_mb_id = _vm->sbm.first_mb_id; \ 483 _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \ 484 _mb_id++) \ 485 if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) 486 487 #define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \ 488 for (_mb_id = _vm->sbm.next_mb_id - 1; \ 489 _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \ 490 _mb_id--) \ 491 if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) 492 493 /* 494 * Calculate the bit number in the subblock bitmap for the given subblock 495 * inside the given memory block. 496 */ 497 static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm, 498 unsigned long mb_id, int sb_id) 499 { 500 return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id; 501 } 502 503 /* 504 * Mark all selected subblocks plugged. 505 * 506 * Will not modify the state of the memory block. 507 */ 508 static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm, 509 unsigned long mb_id, int sb_id, 510 int count) 511 { 512 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 513 514 __bitmap_set(vm->sbm.sb_states, bit, count); 515 } 516 517 /* 518 * Mark all selected subblocks unplugged. 519 * 520 * Will not modify the state of the memory block. 521 */ 522 static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm, 523 unsigned long mb_id, int sb_id, 524 int count) 525 { 526 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 527 528 __bitmap_clear(vm->sbm.sb_states, bit, count); 529 } 530 531 /* 532 * Test if all selected subblocks are plugged. 533 */ 534 static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm, 535 unsigned long mb_id, int sb_id, 536 int count) 537 { 538 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 539 540 if (count == 1) 541 return test_bit(bit, vm->sbm.sb_states); 542 543 /* TODO: Helper similar to bitmap_set() */ 544 return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >= 545 bit + count; 546 } 547 548 /* 549 * Test if all selected subblocks are unplugged. 550 */ 551 static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm, 552 unsigned long mb_id, int sb_id, 553 int count) 554 { 555 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 556 557 /* TODO: Helper similar to bitmap_set() */ 558 return find_next_bit(vm->sbm.sb_states, bit + count, bit) >= 559 bit + count; 560 } 561 562 /* 563 * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is 564 * none. 565 */ 566 static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm, 567 unsigned long mb_id) 568 { 569 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0); 570 571 return find_next_zero_bit(vm->sbm.sb_states, 572 bit + vm->sbm.sbs_per_mb, bit) - bit; 573 } 574 575 /* 576 * Prepare the subblock bitmap for the next memory block. 577 */ 578 static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm) 579 { 580 const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id; 581 const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb; 582 const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb; 583 int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long)); 584 int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long)); 585 unsigned long *new_bitmap, *old_bitmap; 586 587 if (vm->sbm.sb_states && old_pages == new_pages) 588 return 0; 589 590 new_bitmap = vzalloc(new_pages * PAGE_SIZE); 591 if (!new_bitmap) 592 return -ENOMEM; 593 594 mutex_lock(&vm->hotplug_mutex); 595 if (new_bitmap) 596 memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE); 597 598 old_bitmap = vm->sbm.sb_states; 599 vm->sbm.sb_states = new_bitmap; 600 mutex_unlock(&vm->hotplug_mutex); 601 602 vfree(old_bitmap); 603 return 0; 604 } 605 606 /* 607 * Test if we could add memory without creating too much offline memory - 608 * to avoid running OOM if memory is getting onlined deferred. 609 */ 610 static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size) 611 { 612 if (WARN_ON_ONCE(size > vm->offline_threshold)) 613 return false; 614 615 return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold; 616 } 617 618 /* 619 * Try adding memory to Linux. Will usually only fail if out of memory. 620 * 621 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 622 * onlining code). 623 * 624 * Will not modify the state of memory blocks in virtio-mem. 625 */ 626 static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr, 627 uint64_t size) 628 { 629 int rc; 630 631 /* 632 * When force-unloading the driver and we still have memory added to 633 * Linux, the resource name has to stay. 634 */ 635 if (!vm->resource_name) { 636 vm->resource_name = kstrdup_const("System RAM (virtio_mem)", 637 GFP_KERNEL); 638 if (!vm->resource_name) 639 return -ENOMEM; 640 } 641 642 dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr, 643 addr + size - 1); 644 /* Memory might get onlined immediately. */ 645 atomic64_add(size, &vm->offline_size); 646 rc = add_memory_driver_managed(vm->mgid, addr, size, vm->resource_name, 647 MHP_MERGE_RESOURCE | MHP_NID_IS_MGID); 648 if (rc) { 649 atomic64_sub(size, &vm->offline_size); 650 dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc); 651 /* 652 * TODO: Linux MM does not properly clean up yet in all cases 653 * where adding of memory failed - especially on -ENOMEM. 654 */ 655 } 656 return rc; 657 } 658 659 /* 660 * See virtio_mem_add_memory(): Try adding a single Linux memory block. 661 */ 662 static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id) 663 { 664 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 665 const uint64_t size = memory_block_size_bytes(); 666 667 return virtio_mem_add_memory(vm, addr, size); 668 } 669 670 /* 671 * See virtio_mem_add_memory(): Try adding a big block. 672 */ 673 static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id) 674 { 675 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 676 const uint64_t size = vm->bbm.bb_size; 677 678 return virtio_mem_add_memory(vm, addr, size); 679 } 680 681 /* 682 * Try removing memory from Linux. Will only fail if memory blocks aren't 683 * offline. 684 * 685 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 686 * onlining code). 687 * 688 * Will not modify the state of memory blocks in virtio-mem. 689 */ 690 static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr, 691 uint64_t size) 692 { 693 int rc; 694 695 dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr, 696 addr + size - 1); 697 rc = remove_memory(addr, size); 698 if (!rc) { 699 atomic64_sub(size, &vm->offline_size); 700 /* 701 * We might have freed up memory we can now unplug, retry 702 * immediately instead of waiting. 703 */ 704 virtio_mem_retry(vm); 705 } else { 706 dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc); 707 } 708 return rc; 709 } 710 711 /* 712 * See virtio_mem_remove_memory(): Try removing a single Linux memory block. 713 */ 714 static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id) 715 { 716 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 717 const uint64_t size = memory_block_size_bytes(); 718 719 return virtio_mem_remove_memory(vm, addr, size); 720 } 721 722 /* 723 * Try offlining and removing memory from Linux. 724 * 725 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 726 * onlining code). 727 * 728 * Will not modify the state of memory blocks in virtio-mem. 729 */ 730 static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm, 731 uint64_t addr, 732 uint64_t size) 733 { 734 int rc; 735 736 dev_dbg(&vm->vdev->dev, 737 "offlining and removing memory: 0x%llx - 0x%llx\n", addr, 738 addr + size - 1); 739 740 rc = offline_and_remove_memory(addr, size); 741 if (!rc) { 742 atomic64_sub(size, &vm->offline_size); 743 /* 744 * We might have freed up memory we can now unplug, retry 745 * immediately instead of waiting. 746 */ 747 virtio_mem_retry(vm); 748 } else { 749 dev_dbg(&vm->vdev->dev, 750 "offlining and removing memory failed: %d\n", rc); 751 } 752 return rc; 753 } 754 755 /* 756 * See virtio_mem_offline_and_remove_memory(): Try offlining and removing 757 * a single Linux memory block. 758 */ 759 static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm, 760 unsigned long mb_id) 761 { 762 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 763 const uint64_t size = memory_block_size_bytes(); 764 765 return virtio_mem_offline_and_remove_memory(vm, addr, size); 766 } 767 768 /* 769 * See virtio_mem_offline_and_remove_memory(): Try to offline and remove a 770 * all Linux memory blocks covered by the big block. 771 */ 772 static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm, 773 unsigned long bb_id) 774 { 775 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 776 const uint64_t size = vm->bbm.bb_size; 777 778 return virtio_mem_offline_and_remove_memory(vm, addr, size); 779 } 780 781 /* 782 * Trigger the workqueue so the device can perform its magic. 783 */ 784 static void virtio_mem_retry(struct virtio_mem *vm) 785 { 786 unsigned long flags; 787 788 spin_lock_irqsave(&vm->removal_lock, flags); 789 if (!vm->removing) 790 queue_work(system_freezable_wq, &vm->wq); 791 spin_unlock_irqrestore(&vm->removal_lock, flags); 792 } 793 794 static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id) 795 { 796 int node = NUMA_NO_NODE; 797 798 #if defined(CONFIG_ACPI_NUMA) 799 if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM)) 800 node = pxm_to_node(node_id); 801 #endif 802 return node; 803 } 804 805 /* 806 * Test if a virtio-mem device overlaps with the given range. Can be called 807 * from (notifier) callbacks lockless. 808 */ 809 static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start, 810 uint64_t size) 811 { 812 return start < vm->addr + vm->region_size && vm->addr < start + size; 813 } 814 815 /* 816 * Test if a virtio-mem device contains a given range. Can be called from 817 * (notifier) callbacks lockless. 818 */ 819 static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start, 820 uint64_t size) 821 { 822 return start >= vm->addr && start + size <= vm->addr + vm->region_size; 823 } 824 825 static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm, 826 unsigned long mb_id) 827 { 828 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 829 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 830 case VIRTIO_MEM_SBM_MB_OFFLINE: 831 return NOTIFY_OK; 832 default: 833 break; 834 } 835 dev_warn_ratelimited(&vm->vdev->dev, 836 "memory block onlining denied\n"); 837 return NOTIFY_BAD; 838 } 839 840 static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm, 841 unsigned long mb_id) 842 { 843 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 844 case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: 845 case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: 846 virtio_mem_sbm_set_mb_state(vm, mb_id, 847 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 848 break; 849 case VIRTIO_MEM_SBM_MB_KERNEL: 850 case VIRTIO_MEM_SBM_MB_MOVABLE: 851 virtio_mem_sbm_set_mb_state(vm, mb_id, 852 VIRTIO_MEM_SBM_MB_OFFLINE); 853 break; 854 default: 855 BUG(); 856 break; 857 } 858 } 859 860 static void virtio_mem_sbm_notify_online(struct virtio_mem *vm, 861 unsigned long mb_id, 862 unsigned long start_pfn) 863 { 864 const bool is_movable = page_zonenum(pfn_to_page(start_pfn)) == 865 ZONE_MOVABLE; 866 int new_state; 867 868 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 869 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 870 new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL; 871 if (is_movable) 872 new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL; 873 break; 874 case VIRTIO_MEM_SBM_MB_OFFLINE: 875 new_state = VIRTIO_MEM_SBM_MB_KERNEL; 876 if (is_movable) 877 new_state = VIRTIO_MEM_SBM_MB_MOVABLE; 878 break; 879 default: 880 BUG(); 881 break; 882 } 883 virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); 884 } 885 886 static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm, 887 unsigned long mb_id) 888 { 889 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); 890 unsigned long pfn; 891 int sb_id; 892 893 for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { 894 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 895 continue; 896 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 897 sb_id * vm->sbm.sb_size); 898 virtio_mem_fake_offline_going_offline(pfn, nr_pages); 899 } 900 } 901 902 static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm, 903 unsigned long mb_id) 904 { 905 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); 906 unsigned long pfn; 907 int sb_id; 908 909 for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { 910 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 911 continue; 912 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 913 sb_id * vm->sbm.sb_size); 914 virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); 915 } 916 } 917 918 static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm, 919 unsigned long bb_id, 920 unsigned long pfn, 921 unsigned long nr_pages) 922 { 923 /* 924 * When marked as "fake-offline", all online memory of this device block 925 * is allocated by us. Otherwise, we don't have any memory allocated. 926 */ 927 if (virtio_mem_bbm_get_bb_state(vm, bb_id) != 928 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) 929 return; 930 virtio_mem_fake_offline_going_offline(pfn, nr_pages); 931 } 932 933 static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm, 934 unsigned long bb_id, 935 unsigned long pfn, 936 unsigned long nr_pages) 937 { 938 if (virtio_mem_bbm_get_bb_state(vm, bb_id) != 939 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) 940 return; 941 virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); 942 } 943 944 /* 945 * This callback will either be called synchronously from add_memory() or 946 * asynchronously (e.g., triggered via user space). We have to be careful 947 * with locking when calling add_memory(). 948 */ 949 static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, 950 unsigned long action, void *arg) 951 { 952 struct virtio_mem *vm = container_of(nb, struct virtio_mem, 953 memory_notifier); 954 struct memory_notify *mhp = arg; 955 const unsigned long start = PFN_PHYS(mhp->start_pfn); 956 const unsigned long size = PFN_PHYS(mhp->nr_pages); 957 int rc = NOTIFY_OK; 958 unsigned long id; 959 960 if (!virtio_mem_overlaps_range(vm, start, size)) 961 return NOTIFY_DONE; 962 963 if (vm->in_sbm) { 964 id = virtio_mem_phys_to_mb_id(start); 965 /* 966 * In SBM, we add memory in separate memory blocks - we expect 967 * it to be onlined/offlined in the same granularity. Bail out 968 * if this ever changes. 969 */ 970 if (WARN_ON_ONCE(size != memory_block_size_bytes() || 971 !IS_ALIGNED(start, memory_block_size_bytes()))) 972 return NOTIFY_BAD; 973 } else { 974 id = virtio_mem_phys_to_bb_id(vm, start); 975 /* 976 * In BBM, we only care about onlining/offlining happening 977 * within a single big block, we don't care about the 978 * actual granularity as we don't track individual Linux 979 * memory blocks. 980 */ 981 if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1))) 982 return NOTIFY_BAD; 983 } 984 985 /* 986 * Avoid circular locking lockdep warnings. We lock the mutex 987 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The 988 * blocking_notifier_call_chain() has it's own lock, which gets unlocked 989 * between both notifier calls and will bail out. False positive. 990 */ 991 lockdep_off(); 992 993 switch (action) { 994 case MEM_GOING_OFFLINE: 995 mutex_lock(&vm->hotplug_mutex); 996 if (vm->removing) { 997 rc = notifier_from_errno(-EBUSY); 998 mutex_unlock(&vm->hotplug_mutex); 999 break; 1000 } 1001 vm->hotplug_active = true; 1002 if (vm->in_sbm) 1003 virtio_mem_sbm_notify_going_offline(vm, id); 1004 else 1005 virtio_mem_bbm_notify_going_offline(vm, id, 1006 mhp->start_pfn, 1007 mhp->nr_pages); 1008 break; 1009 case MEM_GOING_ONLINE: 1010 mutex_lock(&vm->hotplug_mutex); 1011 if (vm->removing) { 1012 rc = notifier_from_errno(-EBUSY); 1013 mutex_unlock(&vm->hotplug_mutex); 1014 break; 1015 } 1016 vm->hotplug_active = true; 1017 if (vm->in_sbm) 1018 rc = virtio_mem_sbm_notify_going_online(vm, id); 1019 break; 1020 case MEM_OFFLINE: 1021 if (vm->in_sbm) 1022 virtio_mem_sbm_notify_offline(vm, id); 1023 1024 atomic64_add(size, &vm->offline_size); 1025 /* 1026 * Trigger the workqueue. Now that we have some offline memory, 1027 * maybe we can handle pending unplug requests. 1028 */ 1029 if (!unplug_online) 1030 virtio_mem_retry(vm); 1031 1032 vm->hotplug_active = false; 1033 mutex_unlock(&vm->hotplug_mutex); 1034 break; 1035 case MEM_ONLINE: 1036 if (vm->in_sbm) 1037 virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn); 1038 1039 atomic64_sub(size, &vm->offline_size); 1040 /* 1041 * Start adding more memory once we onlined half of our 1042 * threshold. Don't trigger if it's possibly due to our actipn 1043 * (e.g., us adding memory which gets onlined immediately from 1044 * the core). 1045 */ 1046 if (!atomic_read(&vm->wq_active) && 1047 virtio_mem_could_add_memory(vm, vm->offline_threshold / 2)) 1048 virtio_mem_retry(vm); 1049 1050 vm->hotplug_active = false; 1051 mutex_unlock(&vm->hotplug_mutex); 1052 break; 1053 case MEM_CANCEL_OFFLINE: 1054 if (!vm->hotplug_active) 1055 break; 1056 if (vm->in_sbm) 1057 virtio_mem_sbm_notify_cancel_offline(vm, id); 1058 else 1059 virtio_mem_bbm_notify_cancel_offline(vm, id, 1060 mhp->start_pfn, 1061 mhp->nr_pages); 1062 vm->hotplug_active = false; 1063 mutex_unlock(&vm->hotplug_mutex); 1064 break; 1065 case MEM_CANCEL_ONLINE: 1066 if (!vm->hotplug_active) 1067 break; 1068 vm->hotplug_active = false; 1069 mutex_unlock(&vm->hotplug_mutex); 1070 break; 1071 default: 1072 break; 1073 } 1074 1075 lockdep_on(); 1076 1077 return rc; 1078 } 1079 1080 /* 1081 * Set a range of pages PG_offline. Remember pages that were never onlined 1082 * (via generic_online_page()) using PageDirty(). 1083 */ 1084 static void virtio_mem_set_fake_offline(unsigned long pfn, 1085 unsigned long nr_pages, bool onlined) 1086 { 1087 page_offline_begin(); 1088 for (; nr_pages--; pfn++) { 1089 struct page *page = pfn_to_page(pfn); 1090 1091 __SetPageOffline(page); 1092 if (!onlined) { 1093 SetPageDirty(page); 1094 /* FIXME: remove after cleanups */ 1095 ClearPageReserved(page); 1096 } 1097 } 1098 page_offline_end(); 1099 } 1100 1101 /* 1102 * Clear PG_offline from a range of pages. If the pages were never onlined, 1103 * (via generic_online_page()), clear PageDirty(). 1104 */ 1105 static void virtio_mem_clear_fake_offline(unsigned long pfn, 1106 unsigned long nr_pages, bool onlined) 1107 { 1108 for (; nr_pages--; pfn++) { 1109 struct page *page = pfn_to_page(pfn); 1110 1111 __ClearPageOffline(page); 1112 if (!onlined) 1113 ClearPageDirty(page); 1114 } 1115 } 1116 1117 /* 1118 * Release a range of fake-offline pages to the buddy, effectively 1119 * fake-onlining them. 1120 */ 1121 static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) 1122 { 1123 const unsigned long max_nr_pages = MAX_ORDER_NR_PAGES; 1124 unsigned long i; 1125 1126 /* 1127 * We are always called at least with MAX_ORDER_NR_PAGES 1128 * granularity/alignment (e.g., the way subblocks work). All pages 1129 * inside such a block are alike. 1130 */ 1131 for (i = 0; i < nr_pages; i += max_nr_pages) { 1132 struct page *page = pfn_to_page(pfn + i); 1133 1134 /* 1135 * If the page is PageDirty(), it was kept fake-offline when 1136 * onlining the memory block. Otherwise, it was allocated 1137 * using alloc_contig_range(). All pages in a subblock are 1138 * alike. 1139 */ 1140 if (PageDirty(page)) { 1141 virtio_mem_clear_fake_offline(pfn + i, max_nr_pages, 1142 false); 1143 generic_online_page(page, MAX_ORDER - 1); 1144 } else { 1145 virtio_mem_clear_fake_offline(pfn + i, max_nr_pages, 1146 true); 1147 free_contig_range(pfn + i, max_nr_pages); 1148 adjust_managed_page_count(page, max_nr_pages); 1149 } 1150 } 1151 } 1152 1153 /* 1154 * Try to allocate a range, marking pages fake-offline, effectively 1155 * fake-offlining them. 1156 */ 1157 static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) 1158 { 1159 const bool is_movable = page_zonenum(pfn_to_page(pfn)) == 1160 ZONE_MOVABLE; 1161 int rc, retry_count; 1162 1163 /* 1164 * TODO: We want an alloc_contig_range() mode that tries to allocate 1165 * harder (e.g., dealing with temporarily pinned pages, PCP), especially 1166 * with ZONE_MOVABLE. So for now, retry a couple of times with 1167 * ZONE_MOVABLE before giving up - because that zone is supposed to give 1168 * some guarantees. 1169 */ 1170 for (retry_count = 0; retry_count < 5; retry_count++) { 1171 rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE, 1172 GFP_KERNEL); 1173 if (rc == -ENOMEM) 1174 /* whoops, out of memory */ 1175 return rc; 1176 else if (rc && !is_movable) 1177 break; 1178 else if (rc) 1179 continue; 1180 1181 virtio_mem_set_fake_offline(pfn, nr_pages, true); 1182 adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); 1183 return 0; 1184 } 1185 1186 return -EBUSY; 1187 } 1188 1189 /* 1190 * Handle fake-offline pages when memory is going offline - such that the 1191 * pages can be skipped by mm-core when offlining. 1192 */ 1193 static void virtio_mem_fake_offline_going_offline(unsigned long pfn, 1194 unsigned long nr_pages) 1195 { 1196 struct page *page; 1197 unsigned long i; 1198 1199 /* 1200 * Drop our reference to the pages so the memory can get offlined 1201 * and add the unplugged pages to the managed page counters (so 1202 * offlining code can correctly subtract them again). 1203 */ 1204 adjust_managed_page_count(pfn_to_page(pfn), nr_pages); 1205 /* Drop our reference to the pages so the memory can get offlined. */ 1206 for (i = 0; i < nr_pages; i++) { 1207 page = pfn_to_page(pfn + i); 1208 if (WARN_ON(!page_ref_dec_and_test(page))) 1209 dump_page(page, "fake-offline page referenced"); 1210 } 1211 } 1212 1213 /* 1214 * Handle fake-offline pages when memory offlining is canceled - to undo 1215 * what we did in virtio_mem_fake_offline_going_offline(). 1216 */ 1217 static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, 1218 unsigned long nr_pages) 1219 { 1220 unsigned long i; 1221 1222 /* 1223 * Get the reference we dropped when going offline and subtract the 1224 * unplugged pages from the managed page counters. 1225 */ 1226 adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); 1227 for (i = 0; i < nr_pages; i++) 1228 page_ref_inc(pfn_to_page(pfn + i)); 1229 } 1230 1231 static void virtio_mem_online_page_cb(struct page *page, unsigned int order) 1232 { 1233 const unsigned long addr = page_to_phys(page); 1234 unsigned long id, sb_id; 1235 struct virtio_mem *vm; 1236 bool do_online; 1237 1238 rcu_read_lock(); 1239 list_for_each_entry_rcu(vm, &virtio_mem_devices, next) { 1240 if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order))) 1241 continue; 1242 1243 if (vm->in_sbm) { 1244 /* 1245 * We exploit here that subblocks have at least 1246 * MAX_ORDER_NR_PAGES size/alignment - so we cannot 1247 * cross subblocks within one call. 1248 */ 1249 id = virtio_mem_phys_to_mb_id(addr); 1250 sb_id = virtio_mem_phys_to_sb_id(vm, addr); 1251 do_online = virtio_mem_sbm_test_sb_plugged(vm, id, 1252 sb_id, 1); 1253 } else { 1254 /* 1255 * If the whole block is marked fake offline, keep 1256 * everything that way. 1257 */ 1258 id = virtio_mem_phys_to_bb_id(vm, addr); 1259 do_online = virtio_mem_bbm_get_bb_state(vm, id) != 1260 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE; 1261 } 1262 1263 /* 1264 * virtio_mem_set_fake_offline() might sleep, we don't need 1265 * the device anymore. See virtio_mem_remove() how races 1266 * between memory onlining and device removal are handled. 1267 */ 1268 rcu_read_unlock(); 1269 1270 if (do_online) 1271 generic_online_page(page, order); 1272 else 1273 virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order, 1274 false); 1275 return; 1276 } 1277 rcu_read_unlock(); 1278 1279 /* not virtio-mem memory, but e.g., a DIMM. online it */ 1280 generic_online_page(page, order); 1281 } 1282 1283 static uint64_t virtio_mem_send_request(struct virtio_mem *vm, 1284 const struct virtio_mem_req *req) 1285 { 1286 struct scatterlist *sgs[2], sg_req, sg_resp; 1287 unsigned int len; 1288 int rc; 1289 1290 /* don't use the request residing on the stack (vaddr) */ 1291 vm->req = *req; 1292 1293 /* out: buffer for request */ 1294 sg_init_one(&sg_req, &vm->req, sizeof(vm->req)); 1295 sgs[0] = &sg_req; 1296 1297 /* in: buffer for response */ 1298 sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp)); 1299 sgs[1] = &sg_resp; 1300 1301 rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL); 1302 if (rc < 0) 1303 return rc; 1304 1305 virtqueue_kick(vm->vq); 1306 1307 /* wait for a response */ 1308 wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len)); 1309 1310 return virtio16_to_cpu(vm->vdev, vm->resp.type); 1311 } 1312 1313 static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr, 1314 uint64_t size) 1315 { 1316 const uint64_t nb_vm_blocks = size / vm->device_block_size; 1317 const struct virtio_mem_req req = { 1318 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG), 1319 .u.plug.addr = cpu_to_virtio64(vm->vdev, addr), 1320 .u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), 1321 }; 1322 int rc = -ENOMEM; 1323 1324 if (atomic_read(&vm->config_changed)) 1325 return -EAGAIN; 1326 1327 dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr, 1328 addr + size - 1); 1329 1330 switch (virtio_mem_send_request(vm, &req)) { 1331 case VIRTIO_MEM_RESP_ACK: 1332 vm->plugged_size += size; 1333 return 0; 1334 case VIRTIO_MEM_RESP_NACK: 1335 rc = -EAGAIN; 1336 break; 1337 case VIRTIO_MEM_RESP_BUSY: 1338 rc = -ETXTBSY; 1339 break; 1340 case VIRTIO_MEM_RESP_ERROR: 1341 rc = -EINVAL; 1342 break; 1343 default: 1344 break; 1345 } 1346 1347 dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc); 1348 return rc; 1349 } 1350 1351 static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr, 1352 uint64_t size) 1353 { 1354 const uint64_t nb_vm_blocks = size / vm->device_block_size; 1355 const struct virtio_mem_req req = { 1356 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG), 1357 .u.unplug.addr = cpu_to_virtio64(vm->vdev, addr), 1358 .u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), 1359 }; 1360 int rc = -ENOMEM; 1361 1362 if (atomic_read(&vm->config_changed)) 1363 return -EAGAIN; 1364 1365 dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr, 1366 addr + size - 1); 1367 1368 switch (virtio_mem_send_request(vm, &req)) { 1369 case VIRTIO_MEM_RESP_ACK: 1370 vm->plugged_size -= size; 1371 return 0; 1372 case VIRTIO_MEM_RESP_BUSY: 1373 rc = -ETXTBSY; 1374 break; 1375 case VIRTIO_MEM_RESP_ERROR: 1376 rc = -EINVAL; 1377 break; 1378 default: 1379 break; 1380 } 1381 1382 dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc); 1383 return rc; 1384 } 1385 1386 static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm) 1387 { 1388 const struct virtio_mem_req req = { 1389 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL), 1390 }; 1391 int rc = -ENOMEM; 1392 1393 dev_dbg(&vm->vdev->dev, "unplugging all memory"); 1394 1395 switch (virtio_mem_send_request(vm, &req)) { 1396 case VIRTIO_MEM_RESP_ACK: 1397 vm->unplug_all_required = false; 1398 vm->plugged_size = 0; 1399 /* usable region might have shrunk */ 1400 atomic_set(&vm->config_changed, 1); 1401 return 0; 1402 case VIRTIO_MEM_RESP_BUSY: 1403 rc = -ETXTBSY; 1404 break; 1405 default: 1406 break; 1407 } 1408 1409 dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc); 1410 return rc; 1411 } 1412 1413 /* 1414 * Plug selected subblocks. Updates the plugged state, but not the state 1415 * of the memory block. 1416 */ 1417 static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id, 1418 int sb_id, int count) 1419 { 1420 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + 1421 sb_id * vm->sbm.sb_size; 1422 const uint64_t size = count * vm->sbm.sb_size; 1423 int rc; 1424 1425 rc = virtio_mem_send_plug_request(vm, addr, size); 1426 if (!rc) 1427 virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count); 1428 return rc; 1429 } 1430 1431 /* 1432 * Unplug selected subblocks. Updates the plugged state, but not the state 1433 * of the memory block. 1434 */ 1435 static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, 1436 int sb_id, int count) 1437 { 1438 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + 1439 sb_id * vm->sbm.sb_size; 1440 const uint64_t size = count * vm->sbm.sb_size; 1441 int rc; 1442 1443 rc = virtio_mem_send_unplug_request(vm, addr, size); 1444 if (!rc) 1445 virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count); 1446 return rc; 1447 } 1448 1449 /* 1450 * Request to unplug a big block. 1451 * 1452 * Will not modify the state of the big block. 1453 */ 1454 static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id) 1455 { 1456 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 1457 const uint64_t size = vm->bbm.bb_size; 1458 1459 return virtio_mem_send_unplug_request(vm, addr, size); 1460 } 1461 1462 /* 1463 * Request to plug a big block. 1464 * 1465 * Will not modify the state of the big block. 1466 */ 1467 static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id) 1468 { 1469 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 1470 const uint64_t size = vm->bbm.bb_size; 1471 1472 return virtio_mem_send_plug_request(vm, addr, size); 1473 } 1474 1475 /* 1476 * Unplug the desired number of plugged subblocks of a offline or not-added 1477 * memory block. Will fail if any subblock cannot get unplugged (instead of 1478 * skipping it). 1479 * 1480 * Will not modify the state of the memory block. 1481 * 1482 * Note: can fail after some subblocks were unplugged. 1483 */ 1484 static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm, 1485 unsigned long mb_id, uint64_t *nb_sb) 1486 { 1487 int sb_id, count; 1488 int rc; 1489 1490 sb_id = vm->sbm.sbs_per_mb - 1; 1491 while (*nb_sb) { 1492 /* Find the next candidate subblock */ 1493 while (sb_id >= 0 && 1494 virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1)) 1495 sb_id--; 1496 if (sb_id < 0) 1497 break; 1498 /* Try to unplug multiple subblocks at a time */ 1499 count = 1; 1500 while (count < *nb_sb && sb_id > 0 && 1501 virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) { 1502 count++; 1503 sb_id--; 1504 } 1505 1506 rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); 1507 if (rc) 1508 return rc; 1509 *nb_sb -= count; 1510 sb_id--; 1511 } 1512 1513 return 0; 1514 } 1515 1516 /* 1517 * Unplug all plugged subblocks of an offline or not-added memory block. 1518 * 1519 * Will not modify the state of the memory block. 1520 * 1521 * Note: can fail after some subblocks were unplugged. 1522 */ 1523 static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id) 1524 { 1525 uint64_t nb_sb = vm->sbm.sbs_per_mb; 1526 1527 return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb); 1528 } 1529 1530 /* 1531 * Prepare tracking data for the next memory block. 1532 */ 1533 static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm, 1534 unsigned long *mb_id) 1535 { 1536 int rc; 1537 1538 if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id) 1539 return -ENOSPC; 1540 1541 /* Resize the state array if required. */ 1542 rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm); 1543 if (rc) 1544 return rc; 1545 1546 /* Resize the subblock bitmap if required. */ 1547 rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm); 1548 if (rc) 1549 return rc; 1550 1551 vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++; 1552 *mb_id = vm->sbm.next_mb_id++; 1553 return 0; 1554 } 1555 1556 /* 1557 * Try to plug the desired number of subblocks and add the memory block 1558 * to Linux. 1559 * 1560 * Will modify the state of the memory block. 1561 */ 1562 static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm, 1563 unsigned long mb_id, uint64_t *nb_sb) 1564 { 1565 const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb); 1566 int rc; 1567 1568 if (WARN_ON_ONCE(!count)) 1569 return -EINVAL; 1570 1571 /* 1572 * Plug the requested number of subblocks before adding it to linux, 1573 * so that onlining will directly online all plugged subblocks. 1574 */ 1575 rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count); 1576 if (rc) 1577 return rc; 1578 1579 /* 1580 * Mark the block properly offline before adding it to Linux, 1581 * so the memory notifiers will find the block in the right state. 1582 */ 1583 if (count == vm->sbm.sbs_per_mb) 1584 virtio_mem_sbm_set_mb_state(vm, mb_id, 1585 VIRTIO_MEM_SBM_MB_OFFLINE); 1586 else 1587 virtio_mem_sbm_set_mb_state(vm, mb_id, 1588 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 1589 1590 /* Add the memory block to linux - if that fails, try to unplug. */ 1591 rc = virtio_mem_sbm_add_mb(vm, mb_id); 1592 if (rc) { 1593 int new_state = VIRTIO_MEM_SBM_MB_UNUSED; 1594 1595 if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count)) 1596 new_state = VIRTIO_MEM_SBM_MB_PLUGGED; 1597 virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); 1598 return rc; 1599 } 1600 1601 *nb_sb -= count; 1602 return 0; 1603 } 1604 1605 /* 1606 * Try to plug the desired number of subblocks of a memory block that 1607 * is already added to Linux. 1608 * 1609 * Will modify the state of the memory block. 1610 * 1611 * Note: Can fail after some subblocks were successfully plugged. 1612 */ 1613 static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, 1614 unsigned long mb_id, uint64_t *nb_sb) 1615 { 1616 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 1617 unsigned long pfn, nr_pages; 1618 int sb_id, count; 1619 int rc; 1620 1621 if (WARN_ON_ONCE(!*nb_sb)) 1622 return -EINVAL; 1623 1624 while (*nb_sb) { 1625 sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id); 1626 if (sb_id >= vm->sbm.sbs_per_mb) 1627 break; 1628 count = 1; 1629 while (count < *nb_sb && 1630 sb_id + count < vm->sbm.sbs_per_mb && 1631 !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1)) 1632 count++; 1633 1634 rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count); 1635 if (rc) 1636 return rc; 1637 *nb_sb -= count; 1638 if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) 1639 continue; 1640 1641 /* fake-online the pages if the memory block is online */ 1642 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 1643 sb_id * vm->sbm.sb_size); 1644 nr_pages = PFN_DOWN(count * vm->sbm.sb_size); 1645 virtio_mem_fake_online(pfn, nr_pages); 1646 } 1647 1648 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) 1649 virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1); 1650 1651 return 0; 1652 } 1653 1654 static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) 1655 { 1656 const int mb_states[] = { 1657 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, 1658 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, 1659 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 1660 }; 1661 uint64_t nb_sb = diff / vm->sbm.sb_size; 1662 unsigned long mb_id; 1663 int rc, i; 1664 1665 if (!nb_sb) 1666 return 0; 1667 1668 /* Don't race with onlining/offlining */ 1669 mutex_lock(&vm->hotplug_mutex); 1670 1671 for (i = 0; i < ARRAY_SIZE(mb_states); i++) { 1672 virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) { 1673 rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb); 1674 if (rc || !nb_sb) 1675 goto out_unlock; 1676 cond_resched(); 1677 } 1678 } 1679 1680 /* 1681 * We won't be working on online/offline memory blocks from this point, 1682 * so we can't race with memory onlining/offlining. Drop the mutex. 1683 */ 1684 mutex_unlock(&vm->hotplug_mutex); 1685 1686 /* Try to plug and add unused blocks */ 1687 virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) { 1688 if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) 1689 return -ENOSPC; 1690 1691 rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); 1692 if (rc || !nb_sb) 1693 return rc; 1694 cond_resched(); 1695 } 1696 1697 /* Try to prepare, plug and add new blocks */ 1698 while (nb_sb) { 1699 if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) 1700 return -ENOSPC; 1701 1702 rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id); 1703 if (rc) 1704 return rc; 1705 rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); 1706 if (rc) 1707 return rc; 1708 cond_resched(); 1709 } 1710 1711 return 0; 1712 out_unlock: 1713 mutex_unlock(&vm->hotplug_mutex); 1714 return rc; 1715 } 1716 1717 /* 1718 * Plug a big block and add it to Linux. 1719 * 1720 * Will modify the state of the big block. 1721 */ 1722 static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm, 1723 unsigned long bb_id) 1724 { 1725 int rc; 1726 1727 if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != 1728 VIRTIO_MEM_BBM_BB_UNUSED)) 1729 return -EINVAL; 1730 1731 rc = virtio_mem_bbm_plug_bb(vm, bb_id); 1732 if (rc) 1733 return rc; 1734 virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); 1735 1736 rc = virtio_mem_bbm_add_bb(vm, bb_id); 1737 if (rc) { 1738 if (!virtio_mem_bbm_unplug_bb(vm, bb_id)) 1739 virtio_mem_bbm_set_bb_state(vm, bb_id, 1740 VIRTIO_MEM_BBM_BB_UNUSED); 1741 else 1742 /* Retry from the main loop. */ 1743 virtio_mem_bbm_set_bb_state(vm, bb_id, 1744 VIRTIO_MEM_BBM_BB_PLUGGED); 1745 return rc; 1746 } 1747 return 0; 1748 } 1749 1750 /* 1751 * Prepare tracking data for the next big block. 1752 */ 1753 static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm, 1754 unsigned long *bb_id) 1755 { 1756 int rc; 1757 1758 if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id) 1759 return -ENOSPC; 1760 1761 /* Resize the big block state array if required. */ 1762 rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm); 1763 if (rc) 1764 return rc; 1765 1766 vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++; 1767 *bb_id = vm->bbm.next_bb_id; 1768 vm->bbm.next_bb_id++; 1769 return 0; 1770 } 1771 1772 static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff) 1773 { 1774 uint64_t nb_bb = diff / vm->bbm.bb_size; 1775 unsigned long bb_id; 1776 int rc; 1777 1778 if (!nb_bb) 1779 return 0; 1780 1781 /* Try to plug and add unused big blocks */ 1782 virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) { 1783 if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) 1784 return -ENOSPC; 1785 1786 rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); 1787 if (!rc) 1788 nb_bb--; 1789 if (rc || !nb_bb) 1790 return rc; 1791 cond_resched(); 1792 } 1793 1794 /* Try to prepare, plug and add new big blocks */ 1795 while (nb_bb) { 1796 if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) 1797 return -ENOSPC; 1798 1799 rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id); 1800 if (rc) 1801 return rc; 1802 rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); 1803 if (!rc) 1804 nb_bb--; 1805 if (rc) 1806 return rc; 1807 cond_resched(); 1808 } 1809 1810 return 0; 1811 } 1812 1813 /* 1814 * Try to plug the requested amount of memory. 1815 */ 1816 static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) 1817 { 1818 if (vm->in_sbm) 1819 return virtio_mem_sbm_plug_request(vm, diff); 1820 return virtio_mem_bbm_plug_request(vm, diff); 1821 } 1822 1823 /* 1824 * Unplug the desired number of plugged subblocks of an offline memory block. 1825 * Will fail if any subblock cannot get unplugged (instead of skipping it). 1826 * 1827 * Will modify the state of the memory block. Might temporarily drop the 1828 * hotplug_mutex. 1829 * 1830 * Note: Can fail after some subblocks were successfully unplugged. 1831 */ 1832 static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm, 1833 unsigned long mb_id, 1834 uint64_t *nb_sb) 1835 { 1836 int rc; 1837 1838 rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb); 1839 1840 /* some subblocks might have been unplugged even on failure */ 1841 if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) 1842 virtio_mem_sbm_set_mb_state(vm, mb_id, 1843 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 1844 if (rc) 1845 return rc; 1846 1847 if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 1848 /* 1849 * Remove the block from Linux - this should never fail. 1850 * Hinder the block from getting onlined by marking it 1851 * unplugged. Temporarily drop the mutex, so 1852 * any pending GOING_ONLINE requests can be serviced/rejected. 1853 */ 1854 virtio_mem_sbm_set_mb_state(vm, mb_id, 1855 VIRTIO_MEM_SBM_MB_UNUSED); 1856 1857 mutex_unlock(&vm->hotplug_mutex); 1858 rc = virtio_mem_sbm_remove_mb(vm, mb_id); 1859 BUG_ON(rc); 1860 mutex_lock(&vm->hotplug_mutex); 1861 } 1862 return 0; 1863 } 1864 1865 /* 1866 * Unplug the given plugged subblocks of an online memory block. 1867 * 1868 * Will modify the state of the memory block. 1869 */ 1870 static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm, 1871 unsigned long mb_id, int sb_id, 1872 int count) 1873 { 1874 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count; 1875 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 1876 unsigned long start_pfn; 1877 int rc; 1878 1879 start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 1880 sb_id * vm->sbm.sb_size); 1881 1882 rc = virtio_mem_fake_offline(start_pfn, nr_pages); 1883 if (rc) 1884 return rc; 1885 1886 /* Try to unplug the allocated memory */ 1887 rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); 1888 if (rc) { 1889 /* Return the memory to the buddy. */ 1890 virtio_mem_fake_online(start_pfn, nr_pages); 1891 return rc; 1892 } 1893 1894 switch (old_state) { 1895 case VIRTIO_MEM_SBM_MB_KERNEL: 1896 virtio_mem_sbm_set_mb_state(vm, mb_id, 1897 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL); 1898 break; 1899 case VIRTIO_MEM_SBM_MB_MOVABLE: 1900 virtio_mem_sbm_set_mb_state(vm, mb_id, 1901 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL); 1902 break; 1903 } 1904 1905 return 0; 1906 } 1907 1908 /* 1909 * Unplug the desired number of plugged subblocks of an online memory block. 1910 * Will skip subblock that are busy. 1911 * 1912 * Will modify the state of the memory block. Might temporarily drop the 1913 * hotplug_mutex. 1914 * 1915 * Note: Can fail after some subblocks were successfully unplugged. Can 1916 * return 0 even if subblocks were busy and could not get unplugged. 1917 */ 1918 static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm, 1919 unsigned long mb_id, 1920 uint64_t *nb_sb) 1921 { 1922 int rc, sb_id; 1923 1924 /* If possible, try to unplug the complete block in one shot. */ 1925 if (*nb_sb >= vm->sbm.sbs_per_mb && 1926 virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 1927 rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0, 1928 vm->sbm.sbs_per_mb); 1929 if (!rc) { 1930 *nb_sb -= vm->sbm.sbs_per_mb; 1931 goto unplugged; 1932 } else if (rc != -EBUSY) 1933 return rc; 1934 } 1935 1936 /* Fallback to single subblocks. */ 1937 for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) { 1938 /* Find the next candidate subblock */ 1939 while (sb_id >= 0 && 1940 !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 1941 sb_id--; 1942 if (sb_id < 0) 1943 break; 1944 1945 rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1); 1946 if (rc == -EBUSY) 1947 continue; 1948 else if (rc) 1949 return rc; 1950 *nb_sb -= 1; 1951 } 1952 1953 unplugged: 1954 /* 1955 * Once all subblocks of a memory block were unplugged, offline and 1956 * remove it. This will usually not fail, as no memory is in use 1957 * anymore - however some other notifiers might NACK the request. 1958 */ 1959 if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 1960 mutex_unlock(&vm->hotplug_mutex); 1961 rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id); 1962 mutex_lock(&vm->hotplug_mutex); 1963 if (!rc) 1964 virtio_mem_sbm_set_mb_state(vm, mb_id, 1965 VIRTIO_MEM_SBM_MB_UNUSED); 1966 } 1967 1968 return 0; 1969 } 1970 1971 /* 1972 * Unplug the desired number of plugged subblocks of a memory block that is 1973 * already added to Linux. Will skip subblock of online memory blocks that are 1974 * busy (by the OS). Will fail if any subblock that's not busy cannot get 1975 * unplugged. 1976 * 1977 * Will modify the state of the memory block. Might temporarily drop the 1978 * hotplug_mutex. 1979 * 1980 * Note: Can fail after some subblocks were successfully unplugged. Can 1981 * return 0 even if subblocks were busy and could not get unplugged. 1982 */ 1983 static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm, 1984 unsigned long mb_id, 1985 uint64_t *nb_sb) 1986 { 1987 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 1988 1989 switch (old_state) { 1990 case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: 1991 case VIRTIO_MEM_SBM_MB_KERNEL: 1992 case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: 1993 case VIRTIO_MEM_SBM_MB_MOVABLE: 1994 return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb); 1995 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 1996 case VIRTIO_MEM_SBM_MB_OFFLINE: 1997 return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb); 1998 } 1999 return -EINVAL; 2000 } 2001 2002 static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff) 2003 { 2004 const int mb_states[] = { 2005 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 2006 VIRTIO_MEM_SBM_MB_OFFLINE, 2007 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, 2008 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, 2009 VIRTIO_MEM_SBM_MB_MOVABLE, 2010 VIRTIO_MEM_SBM_MB_KERNEL, 2011 }; 2012 uint64_t nb_sb = diff / vm->sbm.sb_size; 2013 unsigned long mb_id; 2014 int rc, i; 2015 2016 if (!nb_sb) 2017 return 0; 2018 2019 /* 2020 * We'll drop the mutex a couple of times when it is safe to do so. 2021 * This might result in some blocks switching the state (online/offline) 2022 * and we could miss them in this run - we will retry again later. 2023 */ 2024 mutex_lock(&vm->hotplug_mutex); 2025 2026 /* 2027 * We try unplug from partially plugged blocks first, to try removing 2028 * whole memory blocks along with metadata. We prioritize ZONE_MOVABLE 2029 * as it's more reliable to unplug memory and remove whole memory 2030 * blocks, and we don't want to trigger a zone imbalances by 2031 * accidentially removing too much kernel memory. 2032 */ 2033 for (i = 0; i < ARRAY_SIZE(mb_states); i++) { 2034 virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) { 2035 rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb); 2036 if (rc || !nb_sb) 2037 goto out_unlock; 2038 mutex_unlock(&vm->hotplug_mutex); 2039 cond_resched(); 2040 mutex_lock(&vm->hotplug_mutex); 2041 } 2042 if (!unplug_online && i == 1) { 2043 mutex_unlock(&vm->hotplug_mutex); 2044 return 0; 2045 } 2046 } 2047 2048 mutex_unlock(&vm->hotplug_mutex); 2049 return nb_sb ? -EBUSY : 0; 2050 out_unlock: 2051 mutex_unlock(&vm->hotplug_mutex); 2052 return rc; 2053 } 2054 2055 /* 2056 * Try to offline and remove a big block from Linux and unplug it. Will fail 2057 * with -EBUSY if some memory is busy and cannot get unplugged. 2058 * 2059 * Will modify the state of the memory block. Might temporarily drop the 2060 * hotplug_mutex. 2061 */ 2062 static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm, 2063 unsigned long bb_id) 2064 { 2065 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); 2066 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); 2067 unsigned long end_pfn = start_pfn + nr_pages; 2068 unsigned long pfn; 2069 struct page *page; 2070 int rc; 2071 2072 if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != 2073 VIRTIO_MEM_BBM_BB_ADDED)) 2074 return -EINVAL; 2075 2076 if (bbm_safe_unplug) { 2077 /* 2078 * Start by fake-offlining all memory. Once we marked the device 2079 * block as fake-offline, all newly onlined memory will 2080 * automatically be kept fake-offline. Protect from concurrent 2081 * onlining/offlining until we have a consistent state. 2082 */ 2083 mutex_lock(&vm->hotplug_mutex); 2084 virtio_mem_bbm_set_bb_state(vm, bb_id, 2085 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE); 2086 2087 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 2088 page = pfn_to_online_page(pfn); 2089 if (!page) 2090 continue; 2091 2092 rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION); 2093 if (rc) { 2094 end_pfn = pfn; 2095 goto rollback_safe_unplug; 2096 } 2097 } 2098 mutex_unlock(&vm->hotplug_mutex); 2099 } 2100 2101 rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id); 2102 if (rc) { 2103 if (bbm_safe_unplug) { 2104 mutex_lock(&vm->hotplug_mutex); 2105 goto rollback_safe_unplug; 2106 } 2107 return rc; 2108 } 2109 2110 rc = virtio_mem_bbm_unplug_bb(vm, bb_id); 2111 if (rc) 2112 virtio_mem_bbm_set_bb_state(vm, bb_id, 2113 VIRTIO_MEM_BBM_BB_PLUGGED); 2114 else 2115 virtio_mem_bbm_set_bb_state(vm, bb_id, 2116 VIRTIO_MEM_BBM_BB_UNUSED); 2117 return rc; 2118 2119 rollback_safe_unplug: 2120 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 2121 page = pfn_to_online_page(pfn); 2122 if (!page) 2123 continue; 2124 virtio_mem_fake_online(pfn, PAGES_PER_SECTION); 2125 } 2126 virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); 2127 mutex_unlock(&vm->hotplug_mutex); 2128 return rc; 2129 } 2130 2131 /* 2132 * Test if a big block is completely offline. 2133 */ 2134 static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm, 2135 unsigned long bb_id) 2136 { 2137 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); 2138 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); 2139 unsigned long pfn; 2140 2141 for (pfn = start_pfn; pfn < start_pfn + nr_pages; 2142 pfn += PAGES_PER_SECTION) { 2143 if (pfn_to_online_page(pfn)) 2144 return false; 2145 } 2146 2147 return true; 2148 } 2149 2150 /* 2151 * Test if a big block is completely onlined to ZONE_MOVABLE (or offline). 2152 */ 2153 static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm, 2154 unsigned long bb_id) 2155 { 2156 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); 2157 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); 2158 struct page *page; 2159 unsigned long pfn; 2160 2161 for (pfn = start_pfn; pfn < start_pfn + nr_pages; 2162 pfn += PAGES_PER_SECTION) { 2163 page = pfn_to_online_page(pfn); 2164 if (!page) 2165 continue; 2166 if (page_zonenum(page) != ZONE_MOVABLE) 2167 return false; 2168 } 2169 2170 return true; 2171 } 2172 2173 static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff) 2174 { 2175 uint64_t nb_bb = diff / vm->bbm.bb_size; 2176 uint64_t bb_id; 2177 int rc, i; 2178 2179 if (!nb_bb) 2180 return 0; 2181 2182 /* 2183 * Try to unplug big blocks. Similar to SBM, start with offline 2184 * big blocks. 2185 */ 2186 for (i = 0; i < 3; i++) { 2187 virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { 2188 cond_resched(); 2189 2190 /* 2191 * As we're holding no locks, these checks are racy, 2192 * but we don't care. 2193 */ 2194 if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id)) 2195 continue; 2196 if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id)) 2197 continue; 2198 rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id); 2199 if (rc == -EBUSY) 2200 continue; 2201 if (!rc) 2202 nb_bb--; 2203 if (rc || !nb_bb) 2204 return rc; 2205 } 2206 if (i == 0 && !unplug_online) 2207 return 0; 2208 } 2209 2210 return nb_bb ? -EBUSY : 0; 2211 } 2212 2213 /* 2214 * Try to unplug the requested amount of memory. 2215 */ 2216 static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) 2217 { 2218 if (vm->in_sbm) 2219 return virtio_mem_sbm_unplug_request(vm, diff); 2220 return virtio_mem_bbm_unplug_request(vm, diff); 2221 } 2222 2223 /* 2224 * Try to unplug all blocks that couldn't be unplugged before, for example, 2225 * because the hypervisor was busy. 2226 */ 2227 static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm) 2228 { 2229 unsigned long id; 2230 int rc; 2231 2232 if (!vm->in_sbm) { 2233 virtio_mem_bbm_for_each_bb(vm, id, 2234 VIRTIO_MEM_BBM_BB_PLUGGED) { 2235 rc = virtio_mem_bbm_unplug_bb(vm, id); 2236 if (rc) 2237 return rc; 2238 virtio_mem_bbm_set_bb_state(vm, id, 2239 VIRTIO_MEM_BBM_BB_UNUSED); 2240 } 2241 return 0; 2242 } 2243 2244 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) { 2245 rc = virtio_mem_sbm_unplug_mb(vm, id); 2246 if (rc) 2247 return rc; 2248 virtio_mem_sbm_set_mb_state(vm, id, 2249 VIRTIO_MEM_SBM_MB_UNUSED); 2250 } 2251 2252 return 0; 2253 } 2254 2255 /* 2256 * Update all parts of the config that could have changed. 2257 */ 2258 static void virtio_mem_refresh_config(struct virtio_mem *vm) 2259 { 2260 const struct range pluggable_range = mhp_get_pluggable_range(true); 2261 uint64_t new_plugged_size, usable_region_size, end_addr; 2262 2263 /* the plugged_size is just a reflection of what _we_ did previously */ 2264 virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size, 2265 &new_plugged_size); 2266 if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size)) 2267 vm->plugged_size = new_plugged_size; 2268 2269 /* calculate the last usable memory block id */ 2270 virtio_cread_le(vm->vdev, struct virtio_mem_config, 2271 usable_region_size, &usable_region_size); 2272 end_addr = min(vm->addr + usable_region_size - 1, 2273 pluggable_range.end); 2274 2275 if (vm->in_sbm) { 2276 vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr); 2277 if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes())) 2278 vm->sbm.last_usable_mb_id--; 2279 } else { 2280 vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm, 2281 end_addr); 2282 if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size)) 2283 vm->bbm.last_usable_bb_id--; 2284 } 2285 /* 2286 * If we cannot plug any of our device memory (e.g., nothing in the 2287 * usable region is addressable), the last usable memory block id will 2288 * be smaller than the first usable memory block id. We'll stop 2289 * attempting to add memory with -ENOSPC from our main loop. 2290 */ 2291 2292 /* see if there is a request to change the size */ 2293 virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size, 2294 &vm->requested_size); 2295 2296 dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size); 2297 dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size); 2298 } 2299 2300 /* 2301 * Workqueue function for handling plug/unplug requests and config updates. 2302 */ 2303 static void virtio_mem_run_wq(struct work_struct *work) 2304 { 2305 struct virtio_mem *vm = container_of(work, struct virtio_mem, wq); 2306 uint64_t diff; 2307 int rc; 2308 2309 if (unlikely(vm->in_kdump)) { 2310 dev_warn_once(&vm->vdev->dev, 2311 "unexpected workqueue run in kdump kernel\n"); 2312 return; 2313 } 2314 2315 hrtimer_cancel(&vm->retry_timer); 2316 2317 if (vm->broken) 2318 return; 2319 2320 atomic_set(&vm->wq_active, 1); 2321 retry: 2322 rc = 0; 2323 2324 /* Make sure we start with a clean state if there are leftovers. */ 2325 if (unlikely(vm->unplug_all_required)) 2326 rc = virtio_mem_send_unplug_all_request(vm); 2327 2328 if (atomic_read(&vm->config_changed)) { 2329 atomic_set(&vm->config_changed, 0); 2330 virtio_mem_refresh_config(vm); 2331 } 2332 2333 /* Unplug any leftovers from previous runs */ 2334 if (!rc) 2335 rc = virtio_mem_unplug_pending_mb(vm); 2336 2337 if (!rc && vm->requested_size != vm->plugged_size) { 2338 if (vm->requested_size > vm->plugged_size) { 2339 diff = vm->requested_size - vm->plugged_size; 2340 rc = virtio_mem_plug_request(vm, diff); 2341 } else { 2342 diff = vm->plugged_size - vm->requested_size; 2343 rc = virtio_mem_unplug_request(vm, diff); 2344 } 2345 } 2346 2347 switch (rc) { 2348 case 0: 2349 vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS; 2350 break; 2351 case -ENOSPC: 2352 /* 2353 * We cannot add any more memory (alignment, physical limit) 2354 * or we have too many offline memory blocks. 2355 */ 2356 break; 2357 case -ETXTBSY: 2358 /* 2359 * The hypervisor cannot process our request right now 2360 * (e.g., out of memory, migrating); 2361 */ 2362 case -EBUSY: 2363 /* 2364 * We cannot free up any memory to unplug it (all plugged memory 2365 * is busy). 2366 */ 2367 case -ENOMEM: 2368 /* Out of memory, try again later. */ 2369 hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms), 2370 HRTIMER_MODE_REL); 2371 break; 2372 case -EAGAIN: 2373 /* Retry immediately (e.g., the config changed). */ 2374 goto retry; 2375 default: 2376 /* Unknown error, mark as broken */ 2377 dev_err(&vm->vdev->dev, 2378 "unknown error, marking device broken: %d\n", rc); 2379 vm->broken = true; 2380 } 2381 2382 atomic_set(&vm->wq_active, 0); 2383 } 2384 2385 static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer) 2386 { 2387 struct virtio_mem *vm = container_of(timer, struct virtio_mem, 2388 retry_timer); 2389 2390 virtio_mem_retry(vm); 2391 vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2, 2392 VIRTIO_MEM_RETRY_TIMER_MAX_MS); 2393 return HRTIMER_NORESTART; 2394 } 2395 2396 static void virtio_mem_handle_response(struct virtqueue *vq) 2397 { 2398 struct virtio_mem *vm = vq->vdev->priv; 2399 2400 wake_up(&vm->host_resp); 2401 } 2402 2403 static int virtio_mem_init_vq(struct virtio_mem *vm) 2404 { 2405 struct virtqueue *vq; 2406 2407 vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response, 2408 "guest-request"); 2409 if (IS_ERR(vq)) 2410 return PTR_ERR(vq); 2411 vm->vq = vq; 2412 2413 return 0; 2414 } 2415 2416 static int virtio_mem_init_hotplug(struct virtio_mem *vm) 2417 { 2418 const struct range pluggable_range = mhp_get_pluggable_range(true); 2419 uint64_t unit_pages, sb_size, addr; 2420 int rc; 2421 2422 /* bad device setup - warn only */ 2423 if (!IS_ALIGNED(vm->addr, memory_block_size_bytes())) 2424 dev_warn(&vm->vdev->dev, 2425 "The alignment of the physical start address can make some memory unusable.\n"); 2426 if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes())) 2427 dev_warn(&vm->vdev->dev, 2428 "The alignment of the physical end address can make some memory unusable.\n"); 2429 if (vm->addr < pluggable_range.start || 2430 vm->addr + vm->region_size - 1 > pluggable_range.end) 2431 dev_warn(&vm->vdev->dev, 2432 "Some device memory is not addressable/pluggable. This can make some memory unusable.\n"); 2433 2434 /* Prepare the offline threshold - make sure we can add two blocks. */ 2435 vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(), 2436 VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD); 2437 2438 /* 2439 * We want subblocks to span at least MAX_ORDER_NR_PAGES and 2440 * pageblock_nr_pages pages. This: 2441 * - Simplifies our page onlining code (virtio_mem_online_page_cb) 2442 * and fake page onlining code (virtio_mem_fake_online). 2443 * - Is required for now for alloc_contig_range() to work reliably - 2444 * it doesn't properly handle smaller granularity on ZONE_NORMAL. 2445 */ 2446 sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES, 2447 pageblock_nr_pages) * PAGE_SIZE; 2448 sb_size = max_t(uint64_t, vm->device_block_size, sb_size); 2449 2450 if (sb_size < memory_block_size_bytes() && !force_bbm) { 2451 /* SBM: At least two subblocks per Linux memory block. */ 2452 vm->in_sbm = true; 2453 vm->sbm.sb_size = sb_size; 2454 vm->sbm.sbs_per_mb = memory_block_size_bytes() / 2455 vm->sbm.sb_size; 2456 2457 /* Round up to the next full memory block */ 2458 addr = max_t(uint64_t, vm->addr, pluggable_range.start) + 2459 memory_block_size_bytes() - 1; 2460 vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr); 2461 vm->sbm.next_mb_id = vm->sbm.first_mb_id; 2462 } else { 2463 /* BBM: At least one Linux memory block. */ 2464 vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size, 2465 memory_block_size_bytes()); 2466 2467 if (bbm_block_size) { 2468 if (!is_power_of_2(bbm_block_size)) { 2469 dev_warn(&vm->vdev->dev, 2470 "bbm_block_size is not a power of 2"); 2471 } else if (bbm_block_size < vm->bbm.bb_size) { 2472 dev_warn(&vm->vdev->dev, 2473 "bbm_block_size is too small"); 2474 } else { 2475 vm->bbm.bb_size = bbm_block_size; 2476 } 2477 } 2478 2479 /* Round up to the next aligned big block */ 2480 addr = max_t(uint64_t, vm->addr, pluggable_range.start) + 2481 vm->bbm.bb_size - 1; 2482 vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr); 2483 vm->bbm.next_bb_id = vm->bbm.first_bb_id; 2484 2485 /* Make sure we can add two big blocks. */ 2486 vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size, 2487 vm->offline_threshold); 2488 } 2489 2490 dev_info(&vm->vdev->dev, "memory block size: 0x%lx", 2491 memory_block_size_bytes()); 2492 if (vm->in_sbm) 2493 dev_info(&vm->vdev->dev, "subblock size: 0x%llx", 2494 (unsigned long long)vm->sbm.sb_size); 2495 else 2496 dev_info(&vm->vdev->dev, "big block size: 0x%llx", 2497 (unsigned long long)vm->bbm.bb_size); 2498 2499 /* create the parent resource for all memory */ 2500 rc = virtio_mem_create_resource(vm); 2501 if (rc) 2502 return rc; 2503 2504 /* use a single dynamic memory group to cover the whole memory device */ 2505 if (vm->in_sbm) 2506 unit_pages = PHYS_PFN(memory_block_size_bytes()); 2507 else 2508 unit_pages = PHYS_PFN(vm->bbm.bb_size); 2509 rc = memory_group_register_dynamic(vm->nid, unit_pages); 2510 if (rc < 0) 2511 goto out_del_resource; 2512 vm->mgid = rc; 2513 2514 /* 2515 * If we still have memory plugged, we have to unplug all memory first. 2516 * Registering our parent resource makes sure that this memory isn't 2517 * actually in use (e.g., trying to reload the driver). 2518 */ 2519 if (vm->plugged_size) { 2520 vm->unplug_all_required = true; 2521 dev_info(&vm->vdev->dev, "unplugging all memory is required\n"); 2522 } 2523 2524 /* register callbacks */ 2525 vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb; 2526 rc = register_memory_notifier(&vm->memory_notifier); 2527 if (rc) 2528 goto out_unreg_group; 2529 rc = register_virtio_mem_device(vm); 2530 if (rc) 2531 goto out_unreg_mem; 2532 2533 return 0; 2534 out_unreg_mem: 2535 unregister_memory_notifier(&vm->memory_notifier); 2536 out_unreg_group: 2537 memory_group_unregister(vm->mgid); 2538 out_del_resource: 2539 virtio_mem_delete_resource(vm); 2540 return rc; 2541 } 2542 2543 #ifdef CONFIG_PROC_VMCORE 2544 static int virtio_mem_send_state_request(struct virtio_mem *vm, uint64_t addr, 2545 uint64_t size) 2546 { 2547 const uint64_t nb_vm_blocks = size / vm->device_block_size; 2548 const struct virtio_mem_req req = { 2549 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_STATE), 2550 .u.state.addr = cpu_to_virtio64(vm->vdev, addr), 2551 .u.state.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), 2552 }; 2553 int rc = -ENOMEM; 2554 2555 dev_dbg(&vm->vdev->dev, "requesting state: 0x%llx - 0x%llx\n", addr, 2556 addr + size - 1); 2557 2558 switch (virtio_mem_send_request(vm, &req)) { 2559 case VIRTIO_MEM_RESP_ACK: 2560 return virtio16_to_cpu(vm->vdev, vm->resp.u.state.state); 2561 case VIRTIO_MEM_RESP_ERROR: 2562 rc = -EINVAL; 2563 break; 2564 default: 2565 break; 2566 } 2567 2568 dev_dbg(&vm->vdev->dev, "requesting state failed: %d\n", rc); 2569 return rc; 2570 } 2571 2572 static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb, 2573 unsigned long pfn) 2574 { 2575 struct virtio_mem *vm = container_of(cb, struct virtio_mem, 2576 vmcore_cb); 2577 uint64_t addr = PFN_PHYS(pfn); 2578 bool is_ram; 2579 int rc; 2580 2581 if (!virtio_mem_contains_range(vm, addr, PAGE_SIZE)) 2582 return true; 2583 if (!vm->plugged_size) 2584 return false; 2585 2586 /* 2587 * We have to serialize device requests and access to the information 2588 * about the block queried last. 2589 */ 2590 mutex_lock(&vm->hotplug_mutex); 2591 2592 addr = ALIGN_DOWN(addr, vm->device_block_size); 2593 if (addr != vm->last_block_addr) { 2594 rc = virtio_mem_send_state_request(vm, addr, 2595 vm->device_block_size); 2596 /* On any kind of error, we're going to signal !ram. */ 2597 if (rc == VIRTIO_MEM_STATE_PLUGGED) 2598 vm->last_block_plugged = true; 2599 else 2600 vm->last_block_plugged = false; 2601 vm->last_block_addr = addr; 2602 } 2603 2604 is_ram = vm->last_block_plugged; 2605 mutex_unlock(&vm->hotplug_mutex); 2606 return is_ram; 2607 } 2608 #endif /* CONFIG_PROC_VMCORE */ 2609 2610 static int virtio_mem_init_kdump(struct virtio_mem *vm) 2611 { 2612 #ifdef CONFIG_PROC_VMCORE 2613 dev_info(&vm->vdev->dev, "memory hot(un)plug disabled in kdump kernel\n"); 2614 vm->vmcore_cb.pfn_is_ram = virtio_mem_vmcore_pfn_is_ram; 2615 register_vmcore_cb(&vm->vmcore_cb); 2616 return 0; 2617 #else /* CONFIG_PROC_VMCORE */ 2618 dev_warn(&vm->vdev->dev, "disabled in kdump kernel without vmcore\n"); 2619 return -EBUSY; 2620 #endif /* CONFIG_PROC_VMCORE */ 2621 } 2622 2623 static int virtio_mem_init(struct virtio_mem *vm) 2624 { 2625 uint16_t node_id; 2626 2627 if (!vm->vdev->config->get) { 2628 dev_err(&vm->vdev->dev, "config access disabled\n"); 2629 return -EINVAL; 2630 } 2631 2632 /* Fetch all properties that can't change. */ 2633 virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size, 2634 &vm->plugged_size); 2635 virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size, 2636 &vm->device_block_size); 2637 virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id, 2638 &node_id); 2639 vm->nid = virtio_mem_translate_node_id(vm, node_id); 2640 virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr); 2641 virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size, 2642 &vm->region_size); 2643 2644 /* Determine the nid for the device based on the lowest address. */ 2645 if (vm->nid == NUMA_NO_NODE) 2646 vm->nid = memory_add_physaddr_to_nid(vm->addr); 2647 2648 dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr); 2649 dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size); 2650 dev_info(&vm->vdev->dev, "device block size: 0x%llx", 2651 (unsigned long long)vm->device_block_size); 2652 if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA)) 2653 dev_info(&vm->vdev->dev, "nid: %d", vm->nid); 2654 2655 /* 2656 * We don't want to (un)plug or reuse any memory when in kdump. The 2657 * memory is still accessible (but not exposed to Linux). 2658 */ 2659 if (vm->in_kdump) 2660 return virtio_mem_init_kdump(vm); 2661 return virtio_mem_init_hotplug(vm); 2662 } 2663 2664 static int virtio_mem_create_resource(struct virtio_mem *vm) 2665 { 2666 /* 2667 * When force-unloading the driver and removing the device, we 2668 * could have a garbage pointer. Duplicate the string. 2669 */ 2670 const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL); 2671 2672 if (!name) 2673 return -ENOMEM; 2674 2675 /* Disallow mapping device memory via /dev/mem completely. */ 2676 vm->parent_resource = __request_mem_region(vm->addr, vm->region_size, 2677 name, IORESOURCE_SYSTEM_RAM | 2678 IORESOURCE_EXCLUSIVE); 2679 if (!vm->parent_resource) { 2680 kfree(name); 2681 dev_warn(&vm->vdev->dev, "could not reserve device region\n"); 2682 dev_info(&vm->vdev->dev, 2683 "reloading the driver is not supported\n"); 2684 return -EBUSY; 2685 } 2686 2687 /* The memory is not actually busy - make add_memory() work. */ 2688 vm->parent_resource->flags &= ~IORESOURCE_BUSY; 2689 return 0; 2690 } 2691 2692 static void virtio_mem_delete_resource(struct virtio_mem *vm) 2693 { 2694 const char *name; 2695 2696 if (!vm->parent_resource) 2697 return; 2698 2699 name = vm->parent_resource->name; 2700 release_resource(vm->parent_resource); 2701 kfree(vm->parent_resource); 2702 kfree(name); 2703 vm->parent_resource = NULL; 2704 } 2705 2706 static int virtio_mem_range_has_system_ram(struct resource *res, void *arg) 2707 { 2708 return 1; 2709 } 2710 2711 static bool virtio_mem_has_memory_added(struct virtio_mem *vm) 2712 { 2713 const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; 2714 2715 return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr, 2716 vm->addr + vm->region_size, NULL, 2717 virtio_mem_range_has_system_ram) == 1; 2718 } 2719 2720 static int virtio_mem_probe(struct virtio_device *vdev) 2721 { 2722 struct virtio_mem *vm; 2723 int rc; 2724 2725 BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24); 2726 BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10); 2727 2728 vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL); 2729 if (!vm) 2730 return -ENOMEM; 2731 2732 init_waitqueue_head(&vm->host_resp); 2733 vm->vdev = vdev; 2734 INIT_WORK(&vm->wq, virtio_mem_run_wq); 2735 mutex_init(&vm->hotplug_mutex); 2736 INIT_LIST_HEAD(&vm->next); 2737 spin_lock_init(&vm->removal_lock); 2738 hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 2739 vm->retry_timer.function = virtio_mem_timer_expired; 2740 vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS; 2741 vm->in_kdump = is_kdump_kernel(); 2742 2743 /* register the virtqueue */ 2744 rc = virtio_mem_init_vq(vm); 2745 if (rc) 2746 goto out_free_vm; 2747 2748 /* initialize the device by querying the config */ 2749 rc = virtio_mem_init(vm); 2750 if (rc) 2751 goto out_del_vq; 2752 2753 virtio_device_ready(vdev); 2754 2755 /* trigger a config update to start processing the requested_size */ 2756 if (!vm->in_kdump) { 2757 atomic_set(&vm->config_changed, 1); 2758 queue_work(system_freezable_wq, &vm->wq); 2759 } 2760 2761 return 0; 2762 out_del_vq: 2763 vdev->config->del_vqs(vdev); 2764 out_free_vm: 2765 kfree(vm); 2766 vdev->priv = NULL; 2767 2768 return rc; 2769 } 2770 2771 static void virtio_mem_deinit_hotplug(struct virtio_mem *vm) 2772 { 2773 unsigned long mb_id; 2774 int rc; 2775 2776 /* 2777 * Make sure the workqueue won't be triggered anymore and no memory 2778 * blocks can be onlined/offlined until we're finished here. 2779 */ 2780 mutex_lock(&vm->hotplug_mutex); 2781 spin_lock_irq(&vm->removal_lock); 2782 vm->removing = true; 2783 spin_unlock_irq(&vm->removal_lock); 2784 mutex_unlock(&vm->hotplug_mutex); 2785 2786 /* wait until the workqueue stopped */ 2787 cancel_work_sync(&vm->wq); 2788 hrtimer_cancel(&vm->retry_timer); 2789 2790 if (vm->in_sbm) { 2791 /* 2792 * After we unregistered our callbacks, user space can online 2793 * partially plugged offline blocks. Make sure to remove them. 2794 */ 2795 virtio_mem_sbm_for_each_mb(vm, mb_id, 2796 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { 2797 rc = virtio_mem_sbm_remove_mb(vm, mb_id); 2798 BUG_ON(rc); 2799 virtio_mem_sbm_set_mb_state(vm, mb_id, 2800 VIRTIO_MEM_SBM_MB_UNUSED); 2801 } 2802 /* 2803 * After we unregistered our callbacks, user space can no longer 2804 * offline partially plugged online memory blocks. No need to 2805 * worry about them. 2806 */ 2807 } 2808 2809 /* unregister callbacks */ 2810 unregister_virtio_mem_device(vm); 2811 unregister_memory_notifier(&vm->memory_notifier); 2812 2813 /* 2814 * There is no way we could reliably remove all memory we have added to 2815 * the system. And there is no way to stop the driver/device from going 2816 * away. Warn at least. 2817 */ 2818 if (virtio_mem_has_memory_added(vm)) { 2819 dev_warn(&vm->vdev->dev, 2820 "device still has system memory added\n"); 2821 } else { 2822 virtio_mem_delete_resource(vm); 2823 kfree_const(vm->resource_name); 2824 memory_group_unregister(vm->mgid); 2825 } 2826 2827 /* remove all tracking data - no locking needed */ 2828 if (vm->in_sbm) { 2829 vfree(vm->sbm.mb_states); 2830 vfree(vm->sbm.sb_states); 2831 } else { 2832 vfree(vm->bbm.bb_states); 2833 } 2834 } 2835 2836 static void virtio_mem_deinit_kdump(struct virtio_mem *vm) 2837 { 2838 #ifdef CONFIG_PROC_VMCORE 2839 unregister_vmcore_cb(&vm->vmcore_cb); 2840 #endif /* CONFIG_PROC_VMCORE */ 2841 } 2842 2843 static void virtio_mem_remove(struct virtio_device *vdev) 2844 { 2845 struct virtio_mem *vm = vdev->priv; 2846 2847 if (vm->in_kdump) 2848 virtio_mem_deinit_kdump(vm); 2849 else 2850 virtio_mem_deinit_hotplug(vm); 2851 2852 /* reset the device and cleanup the queues */ 2853 vdev->config->reset(vdev); 2854 vdev->config->del_vqs(vdev); 2855 2856 kfree(vm); 2857 vdev->priv = NULL; 2858 } 2859 2860 static void virtio_mem_config_changed(struct virtio_device *vdev) 2861 { 2862 struct virtio_mem *vm = vdev->priv; 2863 2864 if (unlikely(vm->in_kdump)) 2865 return; 2866 2867 atomic_set(&vm->config_changed, 1); 2868 virtio_mem_retry(vm); 2869 } 2870 2871 #ifdef CONFIG_PM_SLEEP 2872 static int virtio_mem_freeze(struct virtio_device *vdev) 2873 { 2874 /* 2875 * When restarting the VM, all memory is usually unplugged. Don't 2876 * allow to suspend/hibernate. 2877 */ 2878 dev_err(&vdev->dev, "save/restore not supported.\n"); 2879 return -EPERM; 2880 } 2881 2882 static int virtio_mem_restore(struct virtio_device *vdev) 2883 { 2884 return -EPERM; 2885 } 2886 #endif 2887 2888 static unsigned int virtio_mem_features[] = { 2889 #if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA) 2890 VIRTIO_MEM_F_ACPI_PXM, 2891 #endif 2892 VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE, 2893 }; 2894 2895 static const struct virtio_device_id virtio_mem_id_table[] = { 2896 { VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID }, 2897 { 0 }, 2898 }; 2899 2900 static struct virtio_driver virtio_mem_driver = { 2901 .feature_table = virtio_mem_features, 2902 .feature_table_size = ARRAY_SIZE(virtio_mem_features), 2903 .driver.name = KBUILD_MODNAME, 2904 .driver.owner = THIS_MODULE, 2905 .id_table = virtio_mem_id_table, 2906 .probe = virtio_mem_probe, 2907 .remove = virtio_mem_remove, 2908 .config_changed = virtio_mem_config_changed, 2909 #ifdef CONFIG_PM_SLEEP 2910 .freeze = virtio_mem_freeze, 2911 .restore = virtio_mem_restore, 2912 #endif 2913 }; 2914 2915 module_virtio_driver(virtio_mem_driver); 2916 MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table); 2917 MODULE_AUTHOR("David Hildenbrand <david@redhat.com>"); 2918 MODULE_DESCRIPTION("Virtio-mem driver"); 2919 MODULE_LICENSE("GPL"); 2920