// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Virtio-mem device driver.
 *
 * Copyright Red Hat, Inc. 2020
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */

#include <linux/virtio.h>
#include <linux/virtio_mem.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
#include <linux/memory.h>
#include <linux/hrtimer.h>
#include <linux/crash_dump.h>
#include <linux/mutex.h>
#include <linux/bitmap.h>
#include <linux/lockdep.h>

#include <acpi/acpi_numa.h>

static bool unplug_online = true;
module_param(unplug_online, bool, 0644);
MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");

static bool force_bbm;
module_param(force_bbm, bool, 0444);
MODULE_PARM_DESC(force_bbm,
		 "Force Big Block Mode. Default is 0 (auto-selection)");

static unsigned long bbm_block_size;
module_param(bbm_block_size, ulong, 0444);
MODULE_PARM_DESC(bbm_block_size,
		 "Big Block size in bytes. Default is 0 (auto-detection).");

static bool bbm_safe_unplug = true;
module_param(bbm_safe_unplug, bool, 0444);
MODULE_PARM_DESC(bbm_safe_unplug,
	     "Use a safe unplug mechanism in BBM, avoiding long/endless loops");

/*
 * virtio-mem currently supports the following modes of operation:
 *
 * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
 *   size of a Sub Block (SB) is determined based on the device block size, the
 *   pageblock size, and the maximum allocation granularity of the buddy.
 *   Subblocks within a Linux memory block might either be plugged or unplugged.
 *   Memory is added/removed to Linux MM in Linux memory block granularity.
 *
 * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
 *   Memory is added/removed to Linux MM in Big Block granularity.
 *
 * The mode is determined automatically based on the Linux memory block size
 * and the device block size.
 *
 * User space / core MM (auto onlining) is responsible for onlining added
 * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are
 * always onlined separately, and all memory within a Linux memory block is
 * onlined to the same zone - virtio-mem relies on this behavior.
 */

/*
 * State of a Linux memory block in SBM.
 */
enum virtio_mem_sbm_mb_state {
	/* Unplugged, not added to Linux. Can be reused later. */
	VIRTIO_MEM_SBM_MB_UNUSED = 0,
	/* (Partially) plugged, not added to Linux. Error on add_memory(). */
	VIRTIO_MEM_SBM_MB_PLUGGED,
	/* Fully plugged, fully added to Linux, offline. */
	VIRTIO_MEM_SBM_MB_OFFLINE,
	/* Partially plugged, fully added to Linux, offline. */
	VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
	/* Fully plugged, fully added to Linux, onlined to a kernel zone. */
	VIRTIO_MEM_SBM_MB_KERNEL,
	/* Partially plugged, fully added to Linux, onlined to a kernel zone. */
	VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
	/* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
	VIRTIO_MEM_SBM_MB_MOVABLE,
	/* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
	VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
	VIRTIO_MEM_SBM_MB_COUNT
};
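
/*
 * A rough sketch of the typical SBM state flow, as driven by the plug logic
 * and the memory notifier callbacks further down:
 *
 *   UNUSED -> OFFLINE(_PARTIAL) -> KERNEL(_PARTIAL) / MOVABLE(_PARTIAL)
 *
 * Offlining/unplugging walks this chain in reverse; the _PARTIAL variants
 * indicate that only some subblocks of the memory block are plugged.
 */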

/*
 * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks.
 */
enum virtio_mem_bbm_bb_state {
	/* Unplugged, not added to Linux. Can be reused later. */
	VIRTIO_MEM_BBM_BB_UNUSED = 0,
	/* Plugged, not added to Linux. Error on add_memory(). */
	VIRTIO_MEM_BBM_BB_PLUGGED,
	/* Plugged and added to Linux. */
	VIRTIO_MEM_BBM_BB_ADDED,
	/* All online parts are fake-offline, ready to remove. */
	VIRTIO_MEM_BBM_BB_FAKE_OFFLINE,
	VIRTIO_MEM_BBM_BB_COUNT
};
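
/*
 * Example (illustrative numbers): with a 128 MiB Linux memory block size
 * and a 1 GiB big block size, each big block covers eight Linux memory
 * blocks and memory is plugged/added and unplugged/removed in 1 GiB
 * granularity.
 */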

struct virtio_mem {
	struct virtio_device *vdev;

	/* We might first have to unplug all memory when starting up. */
	bool unplug_all_required;

	/* Workqueue that processes the plug/unplug requests. */
	struct work_struct wq;
	atomic_t wq_active;
	atomic_t config_changed;

	/* Virtqueue for guest->host requests. */
	struct virtqueue *vq;

	/* Wait for a host response to a guest request. */
	wait_queue_head_t host_resp;

	/* Space for one guest request and the host response. */
	struct virtio_mem_req req;
	struct virtio_mem_resp resp;

	/* The current size of the device. */
	uint64_t plugged_size;
	/* The requested size of the device. */
	uint64_t requested_size;

	/* The device block size (for communicating with the device). */
	uint64_t device_block_size;
	/* The determined node id for all memory of the device. */
	int nid;
	/* Physical start address of the memory region. */
	uint64_t addr;
	/* Maximum region size in bytes. */
	uint64_t region_size;

	/* The parent resource for all memory added via this device. */
	struct resource *parent_resource;
	/*
	 * Copy of "System RAM (virtio_mem)" to be used for
	 * add_memory_driver_managed().
	 */
	const char *resource_name;

	/*
	 * We don't want to add too much memory if it's not getting onlined,
	 * to avoid running OOM. Besides this threshold, we allow to have at
	 * least two offline blocks at a time (whatever is bigger).
	 */
#define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD		(1024 * 1024 * 1024)
	atomic64_t offline_size;
	uint64_t offline_threshold;

	/* If set, the driver is in SBM, otherwise in BBM. */
	bool in_sbm;

	union {
		struct {
			/* Id of the first memory block of this device. */
			unsigned long first_mb_id;
			/* Id of the last usable memory block of this device. */
			unsigned long last_usable_mb_id;
			/* Id of the next memory block to prepare when needed. */
			unsigned long next_mb_id;

			/* The subblock size. */
			uint64_t sb_size;
			/* The number of subblocks per Linux memory block. */
			uint32_t sbs_per_mb;

			/* Summary of all memory block states. */
			unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];

			/*
			 * One byte state per memory block. Allocated via
			 * vmalloc(). Resized (alloc+copy+free) on demand.
			 *
			 * With 128 MiB memory blocks, we have states for 512
			 * GiB of memory in one 4 KiB page.
			 */
			uint8_t *mb_states;

			/*
			 * Bitmap: one bit per subblock. Allocated similar to
			 * sbm.mb_states.
			 *
			 * A set bit means the corresponding subblock is
			 * plugged, otherwise it's unplugged.
			 *
			 * With 4 MiB subblocks, we manage 128 GiB of memory
			 * in one 4 KiB page.
			 */
			unsigned long *sb_states;
		} sbm;

		struct {
			/* Id of the first big block of this device. */
			unsigned long first_bb_id;
			/* Id of the last usable big block of this device. */
			unsigned long last_usable_bb_id;
			/* Id of the next big block to prepare when needed. */
			unsigned long next_bb_id;

			/* Summary of all big block states. */
			unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];

			/* One byte state per big block. See sbm.mb_states. */
			uint8_t *bb_states;

			/* The block size used for plugging/adding/removing. */
			uint64_t bb_size;
		} bbm;
	};

	/*
	 * Mutex that protects the sbm.mb_count, sbm.mb_states,
	 * sbm.sb_states, bbm.bb_count, and bbm.bb_states.
	 *
	 * When this lock is held the pointers can't change, ONLINE and
	 * OFFLINE blocks can't change the state and no subblocks will get
	 * plugged/unplugged.
	 */
	struct mutex hotplug_mutex;
	bool hotplug_active;

	/* An error occurred we cannot handle - stop processing requests. */
	bool broken;

	/* The driver is being removed. */
	spinlock_t removal_lock;
	bool removing;

	/* Timer for retrying to plug/unplug memory. */
	struct hrtimer retry_timer;
	unsigned int retry_timer_ms;
#define VIRTIO_MEM_RETRY_TIMER_MIN_MS		50000
#define VIRTIO_MEM_RETRY_TIMER_MAX_MS		300000

	/* Memory notifier (online/offline events). */
	struct notifier_block memory_notifier;

	/* Next device in the list of virtio-mem devices. */
	struct list_head next;
};

/*
 * We have to share a single online_page callback among all virtio-mem
 * devices. We use RCU to iterate the list in the callback.
 */
static DEFINE_MUTEX(virtio_mem_mutex);
static LIST_HEAD(virtio_mem_devices);

static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
						  unsigned long nr_pages);
static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
						   unsigned long nr_pages);
static void virtio_mem_retry(struct virtio_mem *vm);

/*
 * Register a virtio-mem device so it will be considered for the online_page
 * callback.
 */
static int register_virtio_mem_device(struct virtio_mem *vm)
{
	int rc = 0;

	/* First device registers the callback. */
	mutex_lock(&virtio_mem_mutex);
	if (list_empty(&virtio_mem_devices))
		rc = set_online_page_callback(&virtio_mem_online_page_cb);
	if (!rc)
		list_add_rcu(&vm->next, &virtio_mem_devices);
	mutex_unlock(&virtio_mem_mutex);

	return rc;
}

/*
 * Unregister a virtio-mem device so it will no longer be considered for the
 * online_page callback.
 */
static void unregister_virtio_mem_device(struct virtio_mem *vm)
{
	/* Last device unregisters the callback. */
	mutex_lock(&virtio_mem_mutex);
	list_del_rcu(&vm->next);
	if (list_empty(&virtio_mem_devices))
		restore_online_page_callback(&virtio_mem_online_page_cb);
	mutex_unlock(&virtio_mem_mutex);

	synchronize_rcu();
}

/*
 * Calculate the memory block id of a given address.
 */
static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr)
{
	return addr / memory_block_size_bytes();
}

/*
 * Calculate the physical start address of a given memory block id.
 */
static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
{
	return mb_id * memory_block_size_bytes();
}

/*
 * Calculate the big block id of a given address.
 */
static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm,
					      uint64_t addr)
{
	return addr / vm->bbm.bb_size;
}

/*
 * Calculate the physical start address of a given big block id.
 */
static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm,
					 unsigned long bb_id)
{
	return bb_id * vm->bbm.bb_size;
}
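
/*
 * Illustrative sketch only - this helper is not part of the driver, which
 * open-codes the same calculation where needed: how a (mb_id, sb_id) pair
 * maps back to a physical address in SBM.
 */
static inline uint64_t virtio_mem_sb_id_to_phys(struct virtio_mem *vm,
						unsigned long mb_id, int sb_id)
{
	/* Subblocks are laid out consecutively within their memory block. */
	return virtio_mem_mb_id_to_phys(mb_id) + sb_id * vm->sbm.sb_size;
}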

/*
 * Calculate the subblock id of a given address.
 */
static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
					      unsigned long addr)
{
	const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
	const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id);

	return (addr - mb_addr) / vm->sbm.sb_size;
}

/*
 * Set the state of a big block, taking care of the state counter.
 */
static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm,
					unsigned long bb_id,
					enum virtio_mem_bbm_bb_state state)
{
	const unsigned long idx = bb_id - vm->bbm.first_bb_id;
	enum virtio_mem_bbm_bb_state old_state;

	old_state = vm->bbm.bb_states[idx];
	vm->bbm.bb_states[idx] = state;

	BUG_ON(vm->bbm.bb_count[old_state] == 0);
	vm->bbm.bb_count[old_state]--;
	vm->bbm.bb_count[state]++;
}

/*
 * Get the state of a big block.
 */
static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm,
								unsigned long bb_id)
{
	return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id];
}

/*
 * Prepare the big block state array for the next big block.
 */
static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm)
{
	unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id;
	unsigned long new_bytes = old_bytes + 1;
	int old_pages = PFN_UP(old_bytes);
	int new_pages = PFN_UP(new_bytes);
	uint8_t *new_array;

	if (vm->bbm.bb_states && old_pages == new_pages)
		return 0;

	new_array = vzalloc(new_pages * PAGE_SIZE);
	if (!new_array)
		return -ENOMEM;

	mutex_lock(&vm->hotplug_mutex);
	if (vm->bbm.bb_states)
		memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE);
	vfree(vm->bbm.bb_states);
	vm->bbm.bb_states = new_array;
	mutex_unlock(&vm->hotplug_mutex);

	return 0;
}

#define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \
	for (_bb_id = _vm->bbm.first_bb_id; \
	     _bb_id < _vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \
	     _bb_id++) \
		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)

#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \
	for (_bb_id = _vm->bbm.next_bb_id - 1; \
	     _bb_id >= _vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \
	     _bb_id--) \
		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)

/*
 * Set the state of a memory block, taking care of the state counter.
 */
static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm,
					unsigned long mb_id, uint8_t state)
{
	const unsigned long idx = mb_id - vm->sbm.first_mb_id;
	uint8_t old_state;

	old_state = vm->sbm.mb_states[idx];
	vm->sbm.mb_states[idx] = state;

	BUG_ON(vm->sbm.mb_count[old_state] == 0);
	vm->sbm.mb_count[old_state]--;
	vm->sbm.mb_count[state]++;
}

/*
 * Get the state of a memory block.
 */
static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm,
					   unsigned long mb_id)
{
	const unsigned long idx = mb_id - vm->sbm.first_mb_id;

	return vm->sbm.mb_states[idx];
}

/*
 * Prepare the state array for the next memory block.
 */
static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
{
	int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id);
	int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1);
	uint8_t *new_array;

	if (vm->sbm.mb_states && old_pages == new_pages)
		return 0;

	new_array = vzalloc(new_pages * PAGE_SIZE);
	if (!new_array)
		return -ENOMEM;

	mutex_lock(&vm->hotplug_mutex);
	if (vm->sbm.mb_states)
		memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE);
	vfree(vm->sbm.mb_states);
	vm->sbm.mb_states = new_array;
	mutex_unlock(&vm->hotplug_mutex);

	return 0;
}

#define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \
	for (_mb_id = _vm->sbm.first_mb_id; \
	     _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \
	     _mb_id++) \
		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)

#define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \
	for (_mb_id = _vm->sbm.next_mb_id - 1; \
	     _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \
	     _mb_id--) \
		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)

/*
 * Calculate the bit number in the subblock bitmap for the given subblock
 * inside the given memory block.
 */
static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm,
					  unsigned long mb_id, int sb_id)
{
	return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id;
}
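
/*
 * Example: with vm->sbm.sbs_per_mb == 32, the subblock sb_id == 4 of the
 * third tracked memory block (mb_id - first_mb_id == 2) maps to bit
 * 2 * 32 + 4 == 68 in the sb_states bitmap.
 */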

/*
 * Mark all selected subblocks plugged.
 *
 * Will not modify the state of the memory block.
 */
static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm,
					  unsigned long mb_id, int sb_id,
					  int count)
{
	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);

	__bitmap_set(vm->sbm.sb_states, bit, count);
}

/*
 * Mark all selected subblocks unplugged.
 *
 * Will not modify the state of the memory block.
 */
static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm,
					    unsigned long mb_id, int sb_id,
					    int count)
{
	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);

	__bitmap_clear(vm->sbm.sb_states, bit, count);
}

/*
 * Test if all selected subblocks are plugged.
 */
static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm,
					   unsigned long mb_id, int sb_id,
					   int count)
{
	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);

	if (count == 1)
		return test_bit(bit, vm->sbm.sb_states);

	/* TODO: Helper similar to bitmap_set() */
	return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >=
	       bit + count;
}

/*
 * Test if all selected subblocks are unplugged.
 */
static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm,
					     unsigned long mb_id, int sb_id,
					     int count)
{
	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);

	/* TODO: Helper similar to bitmap_set() */
	return find_next_bit(vm->sbm.sb_states, bit + count, bit) >=
	       bit + count;
}

/*
 * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there
 * is none.
 */
static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm,
					     unsigned long mb_id)
{
	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0);

	return find_next_zero_bit(vm->sbm.sb_states,
				  bit + vm->sbm.sbs_per_mb, bit) - bit;
}

/*
 * Prepare the subblock bitmap for the next memory block.
 */
static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm)
{
	const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id;
	const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb;
	const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb;
	int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long));
	int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long));
	unsigned long *new_bitmap, *old_bitmap;

	if (vm->sbm.sb_states && old_pages == new_pages)
		return 0;

	new_bitmap = vzalloc(new_pages * PAGE_SIZE);
	if (!new_bitmap)
		return -ENOMEM;

	mutex_lock(&vm->hotplug_mutex);
	if (vm->sbm.sb_states)
		memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE);

	old_bitmap = vm->sbm.sb_states;
	vm->sbm.sb_states = new_bitmap;
	mutex_unlock(&vm->hotplug_mutex);

	vfree(old_bitmap);
	return 0;
}

/*
 * Test if we could add memory without creating too much offline memory -
 * to avoid running OOM if memory is getting onlined deferred.
 */
static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size)
{
	if (WARN_ON_ONCE(size > vm->offline_threshold))
		return false;

	return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold;
}
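
/*
 * Example: with the default 1 GiB offline threshold, adding another 128 MiB
 * is allowed as long as at most 896 MiB of memory added by this device is
 * still offline; beyond that, we wait for onlining to catch up.
 */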

/*
 * Try adding memory to Linux. Will usually only fail if out of memory.
 *
 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 * onlining code).
 *
 * Will not modify the state of memory blocks in virtio-mem.
 */
static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
				 uint64_t size)
{
	int rc;

	/*
	 * When force-unloading the driver and we still have memory added to
	 * Linux, the resource name has to stay.
	 */
	if (!vm->resource_name) {
		vm->resource_name = kstrdup_const("System RAM (virtio_mem)",
						  GFP_KERNEL);
		if (!vm->resource_name)
			return -ENOMEM;
	}

	dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);
	/* Memory might get onlined immediately. */
	atomic64_add(size, &vm->offline_size);
	rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name,
				       MHP_MERGE_RESOURCE);
	if (rc) {
		atomic64_sub(size, &vm->offline_size);
		dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
		/*
		 * TODO: Linux MM does not properly clean up yet in all cases
		 * where adding of memory failed - especially on -ENOMEM.
		 */
	}
	return rc;
}

/*
 * See virtio_mem_add_memory(): Try adding a single Linux memory block.
 */
static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
	const uint64_t size = memory_block_size_bytes();

	return virtio_mem_add_memory(vm, addr, size);
}

/*
 * See virtio_mem_add_memory(): Try adding a big block.
 */
static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id)
{
	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
	const uint64_t size = vm->bbm.bb_size;

	return virtio_mem_add_memory(vm, addr, size);
}

/*
 * Try removing memory from Linux. Will only fail if memory blocks aren't
 * offline.
 *
 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 * onlining code).
 *
 * Will not modify the state of memory blocks in virtio-mem.
 */
static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
				    uint64_t size)
{
	int rc;

	dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);
	rc = remove_memory(vm->nid, addr, size);
	if (!rc) {
		atomic64_sub(size, &vm->offline_size);
		/*
		 * We might have freed up memory we can now unplug, retry
		 * immediately instead of waiting.
		 */
		virtio_mem_retry(vm);
	} else {
		dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc);
	}
	return rc;
}

/*
 * See virtio_mem_remove_memory(): Try removing a single Linux memory block.
 */
static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
	const uint64_t size = memory_block_size_bytes();

	return virtio_mem_remove_memory(vm, addr, size);
}

/*
 * Try offlining and removing memory from Linux.
 *
 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 * onlining code).
 *
 * Will not modify the state of memory blocks in virtio-mem.
 */
static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
						uint64_t addr,
						uint64_t size)
{
	int rc;

	dev_dbg(&vm->vdev->dev,
		"offlining and removing memory: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);

	rc = offline_and_remove_memory(vm->nid, addr, size);
	if (!rc) {
		atomic64_sub(size, &vm->offline_size);
		/*
		 * We might have freed up memory we can now unplug, retry
		 * immediately instead of waiting.
		 */
		virtio_mem_retry(vm);
	} else {
		dev_dbg(&vm->vdev->dev,
			"offlining and removing memory failed: %d\n", rc);
	}
	return rc;
}

/*
 * See virtio_mem_offline_and_remove_memory(): Try offlining and removing
 * a single Linux memory block.
 */
static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm,
						unsigned long mb_id)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
	const uint64_t size = memory_block_size_bytes();

	return virtio_mem_offline_and_remove_memory(vm, addr, size);
}

/*
 * See virtio_mem_offline_and_remove_memory(): Try to offline and remove all
 * Linux memory blocks covered by the big block.
 */
static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm,
						unsigned long bb_id)
{
	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
	const uint64_t size = vm->bbm.bb_size;

	return virtio_mem_offline_and_remove_memory(vm, addr, size);
}

/*
 * Trigger the workqueue so the device can perform its magic.
 */
static void virtio_mem_retry(struct virtio_mem *vm)
{
	unsigned long flags;

	spin_lock_irqsave(&vm->removal_lock, flags);
	if (!vm->removing)
		queue_work(system_freezable_wq, &vm->wq);
	spin_unlock_irqrestore(&vm->removal_lock, flags);
}

static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id)
{
	int node = NUMA_NO_NODE;

#if defined(CONFIG_ACPI_NUMA)
	if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM))
		node = pxm_to_node(node_id);
#endif
	return node;
}

/*
 * Test if a virtio-mem device overlaps with the given range. Can be called
 * from (notifier) callbacks lockless.
 */
static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start,
				      uint64_t size)
{
	return start < vm->addr + vm->region_size && vm->addr < start + size;
}

/*
 * Test if a virtio-mem device contains a given range. Can be called from
 * (notifier) callbacks lockless.
 */
static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start,
				      uint64_t size)
{
	return start >= vm->addr && start + size <= vm->addr + vm->region_size;
}

static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm,
					      unsigned long mb_id)
{
	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
	case VIRTIO_MEM_SBM_MB_OFFLINE:
		return NOTIFY_OK;
	default:
		break;
	}
	dev_warn_ratelimited(&vm->vdev->dev,
			     "memory block onlining denied\n");
	return NOTIFY_BAD;
}

static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm,
					  unsigned long mb_id)
{
	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
		break;
	case VIRTIO_MEM_SBM_MB_KERNEL:
	case VIRTIO_MEM_SBM_MB_MOVABLE:
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_OFFLINE);
		break;
	default:
		BUG();
		break;
	}
}

static void virtio_mem_sbm_notify_online(struct virtio_mem *vm,
					 unsigned long mb_id,
					 unsigned long start_pfn)
{
	const bool is_movable = page_zonenum(pfn_to_page(start_pfn)) ==
				ZONE_MOVABLE;
	int new_state;

	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
		new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL;
		if (is_movable)
			new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL;
		break;
	case VIRTIO_MEM_SBM_MB_OFFLINE:
		new_state = VIRTIO_MEM_SBM_MB_KERNEL;
		if (is_movable)
			new_state = VIRTIO_MEM_SBM_MB_MOVABLE;
		break;
	default:
		BUG();
		break;
	}
	virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
}

static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm,
						unsigned long mb_id)
{
	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
	unsigned long pfn;
	int sb_id;

	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
			continue;
		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
			       sb_id * vm->sbm.sb_size);
		virtio_mem_fake_offline_going_offline(pfn, nr_pages);
	}
}
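
/*
 * Undo virtio_mem_sbm_notify_going_offline(): re-grab the references to all
 * fake-offline (unplugged) parts when offlining is canceled.
 */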
static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm,
						 unsigned long mb_id)
{
	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
	unsigned long pfn;
	int sb_id;

	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
			continue;
		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
			       sb_id * vm->sbm.sb_size);
		virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
	}
}

static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm,
						unsigned long bb_id,
						unsigned long pfn,
						unsigned long nr_pages)
{
	/*
	 * When marked as "fake-offline", all online memory of this device
	 * block is allocated by us. Otherwise, we don't have any memory
	 * allocated.
	 */
	if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
		return;
	virtio_mem_fake_offline_going_offline(pfn, nr_pages);
}

static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm,
						 unsigned long bb_id,
						 unsigned long pfn,
						 unsigned long nr_pages)
{
	if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
		return;
	virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
}

/*
 * This callback will either be called synchronously from add_memory() or
 * asynchronously (e.g., triggered via user space). We have to be careful
 * with locking when calling add_memory().
 */
static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
					 unsigned long action, void *arg)
{
	struct virtio_mem *vm = container_of(nb, struct virtio_mem,
					     memory_notifier);
	struct memory_notify *mhp = arg;
	const unsigned long start = PFN_PHYS(mhp->start_pfn);
	const unsigned long size = PFN_PHYS(mhp->nr_pages);
	int rc = NOTIFY_OK;
	unsigned long id;

	if (!virtio_mem_overlaps_range(vm, start, size))
		return NOTIFY_DONE;

	if (vm->in_sbm) {
		id = virtio_mem_phys_to_mb_id(start);
		/*
		 * In SBM, we add memory in separate memory blocks - we expect
		 * it to be onlined/offlined in the same granularity. Bail out
		 * if this ever changes.
		 */
		if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
				 !IS_ALIGNED(start, memory_block_size_bytes())))
			return NOTIFY_BAD;
	} else {
		id = virtio_mem_phys_to_bb_id(vm, start);
		/*
		 * In BBM, we only care about onlining/offlining happening
		 * within a single big block, we don't care about the
		 * actual granularity as we don't track individual Linux
		 * memory blocks.
		 */
		if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1)))
			return NOTIFY_BAD;
	}

	/*
	 * Avoid circular locking lockdep warnings. We lock the mutex
	 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The
	 * blocking_notifier_call_chain() has its own lock, which gets unlocked
	 * between both notifier calls and will bail out. False positive.
	 */
	lockdep_off();

	switch (action) {
	case MEM_GOING_OFFLINE:
		mutex_lock(&vm->hotplug_mutex);
		if (vm->removing) {
			rc = notifier_from_errno(-EBUSY);
			mutex_unlock(&vm->hotplug_mutex);
			break;
		}
		vm->hotplug_active = true;
		if (vm->in_sbm)
			virtio_mem_sbm_notify_going_offline(vm, id);
		else
			virtio_mem_bbm_notify_going_offline(vm, id,
							    mhp->start_pfn,
							    mhp->nr_pages);
		break;
	case MEM_GOING_ONLINE:
		mutex_lock(&vm->hotplug_mutex);
		if (vm->removing) {
			rc = notifier_from_errno(-EBUSY);
			mutex_unlock(&vm->hotplug_mutex);
			break;
		}
		vm->hotplug_active = true;
		if (vm->in_sbm)
			rc = virtio_mem_sbm_notify_going_online(vm, id);
		break;
	case MEM_OFFLINE:
		if (vm->in_sbm)
			virtio_mem_sbm_notify_offline(vm, id);

		atomic64_add(size, &vm->offline_size);
		/*
		 * Trigger the workqueue. Now that we have some offline memory,
		 * maybe we can handle pending unplug requests.
		 */
		if (!unplug_online)
			virtio_mem_retry(vm);

		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	case MEM_ONLINE:
		if (vm->in_sbm)
			virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn);

		atomic64_sub(size, &vm->offline_size);
		/*
		 * Start adding more memory once we onlined half of our
		 * threshold. Don't trigger if it's possibly due to our action
		 * (e.g., us adding memory which gets onlined immediately from
		 * the core).
		 */
		if (!atomic_read(&vm->wq_active) &&
		    virtio_mem_could_add_memory(vm, vm->offline_threshold / 2))
			virtio_mem_retry(vm);

		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	case MEM_CANCEL_OFFLINE:
		if (!vm->hotplug_active)
			break;
		if (vm->in_sbm)
			virtio_mem_sbm_notify_cancel_offline(vm, id);
		else
			virtio_mem_bbm_notify_cancel_offline(vm, id,
							     mhp->start_pfn,
							     mhp->nr_pages);
		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	case MEM_CANCEL_ONLINE:
		if (!vm->hotplug_active)
			break;
		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	default:
		break;
	}

	lockdep_on();

	return rc;
}

/*
 * Set a range of pages PG_offline. Remember pages that were never onlined
 * (via generic_online_page()) using PageDirty().
 */
static void virtio_mem_set_fake_offline(unsigned long pfn,
					unsigned long nr_pages, bool onlined)
{
	page_offline_begin();
	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		__SetPageOffline(page);
		if (!onlined) {
			SetPageDirty(page);
			/* FIXME: remove after cleanups */
			ClearPageReserved(page);
		}
	}
	page_offline_end();
}

/*
 * Clear PG_offline from a range of pages. If the pages were never onlined,
 * (via generic_online_page()), clear PageDirty().
 */
static void virtio_mem_clear_fake_offline(unsigned long pfn,
					  unsigned long nr_pages, bool onlined)
{
	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		__ClearPageOffline(page);
		if (!onlined)
			ClearPageDirty(page);
	}
}
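
/*
 * The PageDirty() marker set in virtio_mem_set_fake_offline() is what later
 * lets virtio_mem_fake_online() distinguish pages that were never handed to
 * the buddy (to be freed via generic_online_page()) from pages obtained via
 * alloc_contig_range() (to be returned via free_contig_range()).
 */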

/*
 * Release a range of fake-offline pages to the buddy, effectively
 * fake-onlining them.
 */
static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
{
	const unsigned long max_nr_pages = MAX_ORDER_NR_PAGES;
	unsigned long i;

	/*
	 * We are always called at least with MAX_ORDER_NR_PAGES
	 * granularity/alignment (e.g., the way subblocks work). All pages
	 * inside such a block are alike.
	 */
	for (i = 0; i < nr_pages; i += max_nr_pages) {
		struct page *page = pfn_to_page(pfn + i);

		/*
		 * If the page is PageDirty(), it was kept fake-offline when
		 * onlining the memory block. Otherwise, it was allocated
		 * using alloc_contig_range(). All pages in a subblock are
		 * alike.
		 */
		if (PageDirty(page)) {
			virtio_mem_clear_fake_offline(pfn + i, max_nr_pages,
						      false);
			generic_online_page(page, MAX_ORDER - 1);
		} else {
			virtio_mem_clear_fake_offline(pfn + i, max_nr_pages,
						      true);
			free_contig_range(pfn + i, max_nr_pages);
			adjust_managed_page_count(page, max_nr_pages);
		}
	}
}

/*
 * Try to allocate a range, marking pages fake-offline, effectively
 * fake-offlining them.
 */
static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages)
{
	const bool is_movable = page_zonenum(pfn_to_page(pfn)) ==
				ZONE_MOVABLE;
	int rc, retry_count;

	/*
	 * TODO: We want an alloc_contig_range() mode that tries to allocate
	 * harder (e.g., dealing with temporarily pinned pages, PCP), especially
	 * with ZONE_MOVABLE. So for now, retry a couple of times with
	 * ZONE_MOVABLE before giving up - because that zone is supposed to give
	 * some guarantees.
	 */
	for (retry_count = 0; retry_count < 5; retry_count++) {
		rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE,
					GFP_KERNEL);
		if (rc == -ENOMEM)
			/* whoops, out of memory */
			return rc;
		else if (rc && !is_movable)
			break;
		else if (rc)
			continue;

		virtio_mem_set_fake_offline(pfn, nr_pages, true);
		adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
		return 0;
	}

	return -EBUSY;
}

/*
 * Handle fake-offline pages when memory is going offline - such that the
 * pages can be skipped by mm-core when offlining.
 */
static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
						  unsigned long nr_pages)
{
	struct page *page;
	unsigned long i;

	/*
	 * Drop our reference to the pages so the memory can get offlined
	 * and add the unplugged pages to the managed page counters (so
	 * offlining code can correctly subtract them again).
	 */
	adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
	/* Drop our reference to the pages so the memory can get offlined. */
	for (i = 0; i < nr_pages; i++) {
		page = pfn_to_page(pfn + i);
		if (WARN_ON(!page_ref_dec_and_test(page)))
			dump_page(page, "fake-offline page referenced");
	}
}

/*
 * Handle fake-offline pages when memory offlining is canceled - to undo
 * what we did in virtio_mem_fake_offline_going_offline().
 */
static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
						   unsigned long nr_pages)
{
	unsigned long i;

	/*
	 * Get the reference we dropped when going offline and subtract the
	 * unplugged pages from the managed page counters.
	 */
	adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
	for (i = 0; i < nr_pages; i++)
		page_ref_inc(pfn_to_page(pfn + i));
}

static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
{
	const unsigned long addr = page_to_phys(page);
	unsigned long id, sb_id;
	struct virtio_mem *vm;
	bool do_online;

	rcu_read_lock();
	list_for_each_entry_rcu(vm, &virtio_mem_devices, next) {
		if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order)))
			continue;

		if (vm->in_sbm) {
			/*
			 * We exploit here that subblocks have at least
			 * MAX_ORDER_NR_PAGES size/alignment - so we cannot
			 * cross subblocks within one call.
			 */
			id = virtio_mem_phys_to_mb_id(addr);
			sb_id = virtio_mem_phys_to_sb_id(vm, addr);
			do_online = virtio_mem_sbm_test_sb_plugged(vm, id,
								   sb_id, 1);
		} else {
			/*
			 * If the whole block is marked fake offline, keep
			 * everything that way.
			 */
			id = virtio_mem_phys_to_bb_id(vm, addr);
			do_online = virtio_mem_bbm_get_bb_state(vm, id) !=
				    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE;
		}
		if (do_online)
			generic_online_page(page, order);
		else
			virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order,
						    false);
		rcu_read_unlock();
		return;
	}
	rcu_read_unlock();

	/* not virtio-mem memory, but e.g., a DIMM. online it */
	generic_online_page(page, order);
}

static uint64_t virtio_mem_send_request(struct virtio_mem *vm,
					const struct virtio_mem_req *req)
{
	struct scatterlist *sgs[2], sg_req, sg_resp;
	unsigned int len;
	int rc;

	/* don't use the request residing on the stack (vaddr) */
	vm->req = *req;

	/* out: buffer for request */
	sg_init_one(&sg_req, &vm->req, sizeof(vm->req));
	sgs[0] = &sg_req;

	/* in: buffer for response */
	sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp));
	sgs[1] = &sg_resp;

	rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL);
	if (rc < 0)
		return rc;

	virtqueue_kick(vm->vq);

	/* wait for a response */
	wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len));

	return virtio16_to_cpu(vm->vdev, vm->resp.type);
}

static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
					uint64_t size)
{
	const uint64_t nb_vm_blocks = size / vm->device_block_size;
	const struct virtio_mem_req req = {
		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG),
		.u.plug.addr = cpu_to_virtio64(vm->vdev, addr),
		.u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
	};
	int rc = -ENOMEM;

	if (atomic_read(&vm->config_changed))
		return -EAGAIN;

	dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);

	switch (virtio_mem_send_request(vm, &req)) {
	case VIRTIO_MEM_RESP_ACK:
		vm->plugged_size += size;
		return 0;
	case VIRTIO_MEM_RESP_NACK:
		rc = -EAGAIN;
		break;
	case VIRTIO_MEM_RESP_BUSY:
		rc = -ETXTBSY;
		break;
	case VIRTIO_MEM_RESP_ERROR:
		rc = -EINVAL;
		break;
	default:
		break;
	}

	dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc);
	return rc;
}
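
/*
 * Example: plugging a 128 MiB range with a 2 MiB device_block_size results
 * in a single VIRTIO_MEM_REQ_PLUG request with nb_blocks == 64.
 */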

static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
					  uint64_t size)
{
	const uint64_t nb_vm_blocks = size / vm->device_block_size;
	const struct virtio_mem_req req = {
		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG),
		.u.unplug.addr = cpu_to_virtio64(vm->vdev, addr),
		.u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
	};
	int rc = -ENOMEM;

	if (atomic_read(&vm->config_changed))
		return -EAGAIN;

	dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);

	switch (virtio_mem_send_request(vm, &req)) {
	case VIRTIO_MEM_RESP_ACK:
		vm->plugged_size -= size;
		return 0;
	case VIRTIO_MEM_RESP_BUSY:
		rc = -ETXTBSY;
		break;
	case VIRTIO_MEM_RESP_ERROR:
		rc = -EINVAL;
		break;
	default:
		break;
	}

	dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc);
	return rc;
}

static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
{
	const struct virtio_mem_req req = {
		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL),
	};
	int rc = -ENOMEM;

	dev_dbg(&vm->vdev->dev, "unplugging all memory");

	switch (virtio_mem_send_request(vm, &req)) {
	case VIRTIO_MEM_RESP_ACK:
		vm->unplug_all_required = false;
		vm->plugged_size = 0;
		/* usable region might have shrunk */
		atomic_set(&vm->config_changed, 1);
		return 0;
	case VIRTIO_MEM_RESP_BUSY:
		rc = -ETXTBSY;
		break;
	default:
		break;
	}

	dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc);
	return rc;
}

/*
 * Plug selected subblocks. Updates the plugged state, but not the state
 * of the memory block.
 */
static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
				  int sb_id, int count)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
			      sb_id * vm->sbm.sb_size;
	const uint64_t size = count * vm->sbm.sb_size;
	int rc;

	rc = virtio_mem_send_plug_request(vm, addr, size);
	if (!rc)
		virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
	return rc;
}

/*
 * Unplug selected subblocks. Updates the plugged state, but not the state
 * of the memory block.
 */
static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
				    int sb_id, int count)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
			      sb_id * vm->sbm.sb_size;
	const uint64_t size = count * vm->sbm.sb_size;
	int rc;

	rc = virtio_mem_send_unplug_request(vm, addr, size);
	if (!rc)
		virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count);
	return rc;
}

/*
 * Request to unplug a big block.
 *
 * Will not modify the state of the big block.
 */
static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id)
{
	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
	const uint64_t size = vm->bbm.bb_size;

	return virtio_mem_send_unplug_request(vm, addr, size);
}

/*
 * Request to plug a big block.
 *
 * Will not modify the state of the big block.
 */
static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id)
{
	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
	const uint64_t size = vm->bbm.bb_size;

	return virtio_mem_send_plug_request(vm, addr, size);
}

/*
 * Unplug the desired number of plugged subblocks of an offline or not-added
 * memory block. Will fail if any subblock cannot get unplugged (instead of
 * skipping it).
 *
 * Will not modify the state of the memory block.
 *
 * Note: can fail after some subblocks were unplugged.
 */
static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm,
					    unsigned long mb_id, uint64_t *nb_sb)
{
	int sb_id, count;
	int rc;

	sb_id = vm->sbm.sbs_per_mb - 1;
	while (*nb_sb) {
		/* Find the next candidate subblock */
		while (sb_id >= 0 &&
		       virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1))
			sb_id--;
		if (sb_id < 0)
			break;
		/* Try to unplug multiple subblocks at a time */
		count = 1;
		while (count < *nb_sb && sb_id > 0 &&
		       virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
			count++;
			sb_id--;
		}

		rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
		if (rc)
			return rc;
		*nb_sb -= count;
		sb_id--;
	}

	return 0;
}

/*
 * Unplug all plugged subblocks of an offline or not-added memory block.
 *
 * Will not modify the state of the memory block.
 *
 * Note: can fail after some subblocks were unplugged.
 */
static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id)
{
	uint64_t nb_sb = vm->sbm.sbs_per_mb;

	return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb);
}
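
/*
 * Note the in/out convention used above: *nb_sb is decremented by the number
 * of subblocks actually unplugged, so a caller can spread one large request
 * across multiple memory blocks.
 */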

/*
 * Prepare tracking data for the next memory block.
 */
static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm,
					  unsigned long *mb_id)
{
	int rc;

	if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id)
		return -ENOSPC;

	/* Resize the state array if required. */
	rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm);
	if (rc)
		return rc;

	/* Resize the subblock bitmap if required. */
	rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm);
	if (rc)
		return rc;

	vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++;
	*mb_id = vm->sbm.next_mb_id++;
	return 0;
}

/*
 * Try to plug the desired number of subblocks and add the memory block
 * to Linux.
 *
 * Will modify the state of the memory block.
 */
static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
					  unsigned long mb_id, uint64_t *nb_sb)
{
	const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb);
	int rc;

	if (WARN_ON_ONCE(!count))
		return -EINVAL;

	/*
	 * Plug the requested number of subblocks before adding it to Linux,
	 * so that onlining will directly online all plugged subblocks.
	 */
	rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count);
	if (rc)
		return rc;

	/*
	 * Mark the block properly offline before adding it to Linux,
	 * so the memory notifiers will find the block in the right state.
	 */
	if (count == vm->sbm.sbs_per_mb)
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_OFFLINE);
	else
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);

	/* Add the memory block to Linux - if that fails, try to unplug. */
	rc = virtio_mem_sbm_add_mb(vm, mb_id);
	if (rc) {
		int new_state = VIRTIO_MEM_SBM_MB_UNUSED;

		if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count))
			new_state = VIRTIO_MEM_SBM_MB_PLUGGED;
		virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
		return rc;
	}

	*nb_sb -= count;
	return 0;
}

/*
 * Try to plug the desired number of subblocks of a memory block that
 * is already added to Linux.
 *
 * Will modify the state of the memory block.
 *
 * Note: Can fail after some subblocks were successfully plugged.
 */
static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
				      unsigned long mb_id, uint64_t *nb_sb)
{
	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
	unsigned long pfn, nr_pages;
	int sb_id, count;
	int rc;

	if (WARN_ON_ONCE(!*nb_sb))
		return -EINVAL;

	while (*nb_sb) {
		sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id);
		if (sb_id >= vm->sbm.sbs_per_mb)
			break;
		count = 1;
		while (count < *nb_sb &&
		       sb_id + count < vm->sbm.sbs_per_mb &&
		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1))
			count++;

		rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count);
		if (rc)
			return rc;
		*nb_sb -= count;
		if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL)
			continue;

		/* fake-online the pages if the memory block is online */
		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
			       sb_id * vm->sbm.sb_size);
		nr_pages = PFN_DOWN(count * vm->sbm.sb_size);
		virtio_mem_fake_online(pfn, nr_pages);
	}
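
	/*
	 * The "old_state - 1" below relies on the enum layout: each
	 * fully-plugged state directly precedes its _PARTIAL variant
	 * (e.g., VIRTIO_MEM_SBM_MB_KERNEL + 1 == ..._KERNEL_PARTIAL),
	 * so subtracting one yields the fully-plugged state once all
	 * subblocks are plugged.
	 */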
	if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
		virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1);

	return 0;
}

static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
{
	const int mb_states[] = {
		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
	};
	uint64_t nb_sb = diff / vm->sbm.sb_size;
	unsigned long mb_id;
	int rc, i;

	if (!nb_sb)
		return 0;

	/* Don't race with onlining/offlining */
	mutex_lock(&vm->hotplug_mutex);

	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
		virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) {
			rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb);
			if (rc || !nb_sb)
				goto out_unlock;
			cond_resched();
		}
	}

	/*
	 * We won't be working on online/offline memory blocks from this point,
	 * so we can't race with memory onlining/offlining. Drop the mutex.
	 */
	mutex_unlock(&vm->hotplug_mutex);

	/* Try to plug and add unused blocks */
	virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) {
		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
			return -ENOSPC;

		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
		if (rc || !nb_sb)
			return rc;
		cond_resched();
	}

	/* Try to prepare, plug and add new blocks */
	while (nb_sb) {
		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
			return -ENOSPC;

		rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id);
		if (rc)
			return rc;
		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
		if (rc)
			return rc;
		cond_resched();
	}

	return 0;
out_unlock:
	mutex_unlock(&vm->hotplug_mutex);
	return rc;
}

/*
 * Plug a big block and add it to Linux.
 *
 * Will modify the state of the big block.
 */
static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm,
					  unsigned long bb_id)
{
	int rc;

	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
			 VIRTIO_MEM_BBM_BB_UNUSED))
		return -EINVAL;

	rc = virtio_mem_bbm_plug_bb(vm, bb_id);
	if (rc)
		return rc;
	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);

	rc = virtio_mem_bbm_add_bb(vm, bb_id);
	if (rc) {
		if (!virtio_mem_bbm_unplug_bb(vm, bb_id))
			virtio_mem_bbm_set_bb_state(vm, bb_id,
						    VIRTIO_MEM_BBM_BB_UNUSED);
		else
			/* Retry from the main loop. */
			virtio_mem_bbm_set_bb_state(vm, bb_id,
						    VIRTIO_MEM_BBM_BB_PLUGGED);
		return rc;
	}
	return 0;
}

/*
 * Prepare tracking data for the next big block.
 */
static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm,
					  unsigned long *bb_id)
{
	int rc;

	if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id)
		return -ENOSPC;

	/* Resize the big block state array if required. */
	rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm);
	if (rc)
		return rc;

	vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++;
	*bb_id = vm->bbm.next_bb_id;
	vm->bbm.next_bb_id++;
	return 0;
}

static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff)
{
	uint64_t nb_bb = diff / vm->bbm.bb_size;
	unsigned long bb_id;
	int rc;

	if (!nb_bb)
		return 0;

	/* Try to plug and add unused big blocks */
	virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) {
		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
			return -ENOSPC;

		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
		if (!rc)
			nb_bb--;
		if (rc || !nb_bb)
			return rc;
		cond_resched();
	}

	/* Try to prepare, plug and add new big blocks */
	while (nb_bb) {
		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
			return -ENOSPC;

		rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id);
		if (rc)
			return rc;
		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
		if (!rc)
			nb_bb--;
		if (rc)
			return rc;
		cond_resched();
	}

	return 0;
}

/*
 * Try to plug the requested amount of memory.
 */
static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
{
	if (vm->in_sbm)
		return virtio_mem_sbm_plug_request(vm, diff);
	return virtio_mem_bbm_plug_request(vm, diff);
}

/*
 * Unplug the desired number of plugged subblocks of an offline memory block.
 * Will fail if any subblock cannot get unplugged (instead of skipping it).
 *
 * Will modify the state of the memory block. Might temporarily drop the
 * hotplug_mutex.
 *
 * Note: Can fail after some subblocks were successfully unplugged.
 */
static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm,
						unsigned long mb_id,
						uint64_t *nb_sb)
{
	int rc;

	rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb);

	/* some subblocks might have been unplugged even on failure */
	if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
	if (rc)
		return rc;

	if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
		/*
		 * Remove the block from Linux - this should never fail.
		 * Hinder the block from getting onlined by marking it
		 * unplugged. Temporarily drop the mutex, so
		 * any pending GOING_ONLINE requests can be serviced/rejected.
		 */
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_UNUSED);

		mutex_unlock(&vm->hotplug_mutex);
		rc = virtio_mem_sbm_remove_mb(vm, mb_id);
		BUG_ON(rc);
		mutex_lock(&vm->hotplug_mutex);
	}
	return 0;
}

/*
 * Unplug the given plugged subblocks of an online memory block.
 *
 * Will modify the state of the memory block.
 */
static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm,
					   unsigned long mb_id, int sb_id,
					   int count)
{
	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count;
	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
	unsigned long start_pfn;
	int rc;

	start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
			     sb_id * vm->sbm.sb_size);

	rc = virtio_mem_fake_offline(start_pfn, nr_pages);
	if (rc)
		return rc;

	/* Try to unplug the allocated memory */
	rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
	if (rc) {
		/* Return the memory to the buddy. */
		virtio_mem_fake_online(start_pfn, nr_pages);
		return rc;
	}

	switch (old_state) {
	case VIRTIO_MEM_SBM_MB_KERNEL:
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL);
		break;
	case VIRTIO_MEM_SBM_MB_MOVABLE:
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL);
		break;
	}

	return 0;
}

/*
 * Unplug the desired number of plugged subblocks of an online memory block.
 * Will skip subblocks that are busy.
 *
 * Will modify the state of the memory block. Might temporarily drop the
 * hotplug_mutex.
 *
 * Note: Can fail after some subblocks were successfully unplugged. Can
 * return 0 even if subblocks were busy and could not get unplugged.
 */
static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm,
					       unsigned long mb_id,
					       uint64_t *nb_sb)
{
	int rc, sb_id;

	/* If possible, try to unplug the complete block in one shot. */
	if (*nb_sb >= vm->sbm.sbs_per_mb &&
	    virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0,
						     vm->sbm.sbs_per_mb);
		if (!rc) {
			*nb_sb -= vm->sbm.sbs_per_mb;
			goto unplugged;
		} else if (rc != -EBUSY)
			return rc;
	}

	/* Fallback to single subblocks. */
	for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
		/* Find the next candidate subblock */
		while (sb_id >= 0 &&
		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
			sb_id--;
		if (sb_id < 0)
			break;

		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1);
		if (rc == -EBUSY)
			continue;
		else if (rc)
			return rc;
		*nb_sb -= 1;
	}

unplugged:
	/*
	 * Once all subblocks of a memory block were unplugged, offline and
	 * remove it. This will usually not fail, as no memory is in use
	 * anymore - however, some other notifiers might NACK the request.
	 */
	if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
		mutex_unlock(&vm->hotplug_mutex);
		rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id);
		mutex_lock(&vm->hotplug_mutex);
		if (!rc)
			virtio_mem_sbm_set_mb_state(vm, mb_id,
						    VIRTIO_MEM_SBM_MB_UNUSED);
	}

	return 0;
}

/*
 * Unplug the desired number of plugged subblocks of a memory block that is
 * already added to Linux. Will skip subblocks of online memory blocks that
 * are busy (by the OS). Will fail if any subblock that's not busy cannot get
 * unplugged.
 *
 * Will modify the state of the memory block. Might temporarily drop the
 * hotplug_mutex.
 *
 * Note: Can fail after some subblocks were successfully unplugged. Can
 * return 0 even if subblocks were busy and could not get unplugged.
 */
static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
					unsigned long mb_id,
					uint64_t *nb_sb)
{
	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);

	switch (old_state) {
	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
	case VIRTIO_MEM_SBM_MB_KERNEL:
	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
	case VIRTIO_MEM_SBM_MB_MOVABLE:
		return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb);
	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
	case VIRTIO_MEM_SBM_MB_OFFLINE:
		return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb);
	}
	return -EINVAL;
}
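
/*
 * Editorial note on the ordering used below: mb_states ranks offline
 * blocks before online ones, partially plugged blocks before fully
 * plugged ones and, among online blocks, ZONE_MOVABLE before kernel
 * zones - e.g., a half-plugged movable block is drained before a fully
 * plugged kernel block is even considered.
 */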

static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
	const int mb_states[] = {
		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
		VIRTIO_MEM_SBM_MB_OFFLINE,
		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
		VIRTIO_MEM_SBM_MB_MOVABLE,
		VIRTIO_MEM_SBM_MB_KERNEL,
	};
	uint64_t nb_sb = diff / vm->sbm.sb_size;
	unsigned long mb_id;
	int rc, i;

	if (!nb_sb)
		return 0;

	/*
	 * We'll drop the mutex a couple of times when it is safe to do so.
	 * This might result in some blocks switching the state (online/offline)
	 * and we could miss them in this run - we will retry again later.
	 */
	mutex_lock(&vm->hotplug_mutex);

	/*
	 * We try to unplug from partially plugged blocks first, to try removing
	 * whole memory blocks along with metadata. We prioritize ZONE_MOVABLE
	 * as it's more reliable to unplug memory and remove whole memory
	 * blocks, and we don't want to trigger zone imbalances by
	 * accidentally removing too much kernel memory.
	 */
	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
		virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) {
			rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb);
			if (rc || !nb_sb)
				goto out_unlock;
			mutex_unlock(&vm->hotplug_mutex);
			cond_resched();
			mutex_lock(&vm->hotplug_mutex);
		}
		if (!unplug_online && i == 1) {
			mutex_unlock(&vm->hotplug_mutex);
			return 0;
		}
	}

	mutex_unlock(&vm->hotplug_mutex);
	return nb_sb ? -EBUSY : 0;
out_unlock:
	mutex_unlock(&vm->hotplug_mutex);
	return rc;
}

/*
 * Try to offline and remove a big block from Linux and unplug it. Will fail
 * with -EBUSY if some memory is busy and cannot get unplugged.
 *
 * Will modify the state of the memory block. Might temporarily drop the
 * hotplug_mutex.
 */
static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
						       unsigned long bb_id)
{
	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
	unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn;
	struct page *page;
	int rc;

	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
			 VIRTIO_MEM_BBM_BB_ADDED))
		return -EINVAL;

	if (bbm_safe_unplug) {
		/*
		 * Start by fake-offlining all memory. Once we marked the device
		 * block as fake-offline, all newly onlined memory will
		 * automatically be kept fake-offline. Protect from concurrent
		 * onlining/offlining until we have a consistent state.
		 */
		mutex_lock(&vm->hotplug_mutex);
		virtio_mem_bbm_set_bb_state(vm, bb_id,
					    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);

		for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
			page = pfn_to_online_page(pfn);
			if (!page)
				continue;

			rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION);
			if (rc) {
				end_pfn = pfn;
				goto rollback_safe_unplug;
			}
		}
		mutex_unlock(&vm->hotplug_mutex);
	}

	rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
	if (rc) {
		if (bbm_safe_unplug) {
			mutex_lock(&vm->hotplug_mutex);
			goto rollback_safe_unplug;
		}
		return rc;
	}

	rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
	if (rc)
		virtio_mem_bbm_set_bb_state(vm, bb_id,
					    VIRTIO_MEM_BBM_BB_PLUGGED);
	else
		virtio_mem_bbm_set_bb_state(vm, bb_id,
					    VIRTIO_MEM_BBM_BB_UNUSED);
	return rc;

rollback_safe_unplug:
	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		page = pfn_to_online_page(pfn);
		if (!page)
			continue;
		virtio_mem_fake_online(pfn, PAGES_PER_SECTION);
	}
	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
	mutex_unlock(&vm->hotplug_mutex);
	return rc;
}
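
/*
 * Illustrative example (hypothetical sizes): with 128 MiB sections and a
 * 2 GiB big block, the safe-unplug loop above fake-offlines 16 sections;
 * if fake-offlining fails at the 6th section, end_pfn is truncated to
 * that section's first PFN, so the rollback path fake-onlines only the
 * 5 sections that were already fake-offlined.
 */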

/*
 * Test if a big block is completely offline.
 */
static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm,
					 unsigned long bb_id)
{
	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
	unsigned long pfn;

	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
	     pfn += PAGES_PER_SECTION) {
		if (pfn_to_online_page(pfn))
			return false;
	}

	return true;
}

/*
 * Test if a big block is completely onlined to ZONE_MOVABLE (or offline).
 */
static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm,
					 unsigned long bb_id)
{
	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
	struct page *page;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
	     pfn += PAGES_PER_SECTION) {
		page = pfn_to_online_page(pfn);
		if (!page)
			continue;
		if (page_zonenum(page) != ZONE_MOVABLE)
			return false;
	}

	return true;
}

static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
	uint64_t nb_bb = diff / vm->bbm.bb_size;
	uint64_t bb_id;
	int rc, i;

	if (!nb_bb)
		return 0;

	/*
	 * Try to unplug big blocks. Similar to SBM, start with offline
	 * big blocks.
	 */
	for (i = 0; i < 3; i++) {
		virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
			cond_resched();

			/*
			 * As we're holding no locks, these checks are racy,
			 * but we don't care.
			 */
			if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id))
				continue;
			if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id))
				continue;
			rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id);
			if (rc == -EBUSY)
				continue;
			if (!rc)
				nb_bb--;
			if (rc || !nb_bb)
				return rc;
		}
		if (i == 0 && !unplug_online)
			return 0;
	}

	return nb_bb ? -EBUSY : 0;
}

/*
 * Try to unplug the requested amount of memory.
 */
static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
	if (vm->in_sbm)
		return virtio_mem_sbm_unplug_request(vm, diff);
	return virtio_mem_bbm_unplug_request(vm, diff);
}

/*
 * Try to unplug all blocks that couldn't be unplugged before, for example,
 * because the hypervisor was busy.
 */
static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm)
{
	unsigned long id;
	int rc;

	if (!vm->in_sbm) {
		virtio_mem_bbm_for_each_bb(vm, id,
					   VIRTIO_MEM_BBM_BB_PLUGGED) {
			rc = virtio_mem_bbm_unplug_bb(vm, id);
			if (rc)
				return rc;
			virtio_mem_bbm_set_bb_state(vm, id,
						    VIRTIO_MEM_BBM_BB_UNUSED);
		}
		return 0;
	}

	virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) {
		rc = virtio_mem_sbm_unplug_mb(vm, id);
		if (rc)
			return rc;
		virtio_mem_sbm_set_mb_state(vm, id,
					    VIRTIO_MEM_SBM_MB_UNUSED);
	}

	return 0;
}
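
/*
 * Illustrative example (hypothetical numbers): with vm->addr = 4 GiB,
 * usable_region_size = 1 GiB and 128 MiB memory blocks - and assuming
 * the pluggable range does not cut the region short - end_addr in
 * virtio_mem_refresh_config() below becomes 5 GiB - 1; since 5 GiB is
 * aligned to the memory block size, the last usable memory block is the
 * one covering [5 GiB - 128 MiB, 5 GiB).
 */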

/*
 * Update all parts of the config that could have changed.
 */
static void virtio_mem_refresh_config(struct virtio_mem *vm)
{
	const struct range pluggable_range = mhp_get_pluggable_range(true);
	uint64_t new_plugged_size, usable_region_size, end_addr;

	/* the plugged_size is just a reflection of what _we_ did previously */
	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
			&new_plugged_size);
	if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size))
		vm->plugged_size = new_plugged_size;

	/* calculate the last usable memory block id */
	virtio_cread_le(vm->vdev, struct virtio_mem_config,
			usable_region_size, &usable_region_size);
	end_addr = min(vm->addr + usable_region_size - 1,
		       pluggable_range.end);

	if (vm->in_sbm) {
		vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr);
		if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes()))
			vm->sbm.last_usable_mb_id--;
	} else {
		vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm,
								     end_addr);
		if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size))
			vm->bbm.last_usable_bb_id--;
	}
	/*
	 * If we cannot plug any of our device memory (e.g., nothing in the
	 * usable region is addressable), the last usable memory block id will
	 * be smaller than the first usable memory block id. We'll stop
	 * attempting to add memory with -ENOSPC from our main loop.
	 */

	/* see if there is a request to change the size */
	virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size,
			&vm->requested_size);

	dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size);
	dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size);
}
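
/*
 * Illustrative note on the retry policy implemented below: a failing run
 * rearms retry_timer, and each expiry doubles retry_timer_ms up to
 * VIRTIO_MEM_RETRY_TIMER_MAX_MS. With hypothetical values of 50 s
 * (minimum) and 300 s (maximum), consecutive failing runs would be
 * retried after 50 s, 100 s, 200 s, 300 s, 300 s, ...
 */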

/*
 * Workqueue function for handling plug/unplug requests and config updates.
 */
static void virtio_mem_run_wq(struct work_struct *work)
{
	struct virtio_mem *vm = container_of(work, struct virtio_mem, wq);
	uint64_t diff;
	int rc;

	hrtimer_cancel(&vm->retry_timer);

	if (vm->broken)
		return;

	atomic_set(&vm->wq_active, 1);
retry:
	rc = 0;

	/* Make sure we start with a clean state if there are leftovers. */
	if (unlikely(vm->unplug_all_required))
		rc = virtio_mem_send_unplug_all_request(vm);

	if (atomic_read(&vm->config_changed)) {
		atomic_set(&vm->config_changed, 0);
		virtio_mem_refresh_config(vm);
	}

	/* Unplug any leftovers from previous runs */
	if (!rc)
		rc = virtio_mem_unplug_pending_mb(vm);

	if (!rc && vm->requested_size != vm->plugged_size) {
		if (vm->requested_size > vm->plugged_size) {
			diff = vm->requested_size - vm->plugged_size;
			rc = virtio_mem_plug_request(vm, diff);
		} else {
			diff = vm->plugged_size - vm->requested_size;
			rc = virtio_mem_unplug_request(vm, diff);
		}
	}

	switch (rc) {
	case 0:
		vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
		break;
	case -ENOSPC:
		/*
		 * We cannot add any more memory (alignment, physical limit)
		 * or we have too many offline memory blocks.
		 */
		break;
	case -ETXTBSY:
		/*
		 * The hypervisor cannot process our request right now
		 * (e.g., out of memory, migrating).
		 */
	case -EBUSY:
		/*
		 * We cannot free up any memory to unplug it (all plugged memory
		 * is busy).
		 */
	case -ENOMEM:
		/* Out of memory, try again later. */
		hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms),
			      HRTIMER_MODE_REL);
		break;
	case -EAGAIN:
		/* Retry immediately (e.g., the config changed). */
		goto retry;
	default:
		/* Unknown error, mark as broken */
		dev_err(&vm->vdev->dev,
			"unknown error, marking device broken: %d\n", rc);
		vm->broken = true;
	}

	atomic_set(&vm->wq_active, 0);
}

static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
{
	struct virtio_mem *vm = container_of(timer, struct virtio_mem,
					     retry_timer);

	virtio_mem_retry(vm);
	vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2,
				   VIRTIO_MEM_RETRY_TIMER_MAX_MS);
	return HRTIMER_NORESTART;
}

static void virtio_mem_handle_response(struct virtqueue *vq)
{
	struct virtio_mem *vm = vq->vdev->priv;

	wake_up(&vm->host_resp);
}

static int virtio_mem_init_vq(struct virtio_mem *vm)
{
	struct virtqueue *vq;

	vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response,
				   "guest-request");
	if (IS_ERR(vq))
		return PTR_ERR(vq);
	vm->vq = vq;

	return 0;
}
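
/*
 * Illustrative example (hypothetical sizes): on a configuration where
 * max(MAX_ORDER_NR_PAGES, pageblock_nr_pages) * PAGE_SIZE evaluates to
 * 4 MiB, a 2 MiB device block size and 128 MiB Linux memory blocks make
 * virtio_mem_init() below pick sb_size = 4 MiB; since 4 MiB < 128 MiB
 * (and force_bbm is not set), the device runs in SBM with
 * sbs_per_mb = 32.
 */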

static int virtio_mem_init(struct virtio_mem *vm)
{
	const struct range pluggable_range = mhp_get_pluggable_range(true);
	uint64_t sb_size, addr;
	uint16_t node_id;

	if (!vm->vdev->config->get) {
		dev_err(&vm->vdev->dev, "config access disabled\n");
		return -EINVAL;
	}

	/*
	 * We don't want to (un)plug or reuse any memory when in kdump. The
	 * memory is still accessible (but not mapped).
	 */
	if (is_kdump_kernel()) {
		dev_warn(&vm->vdev->dev, "disabled in kdump kernel\n");
		return -EBUSY;
	}

	/* Fetch all properties that can't change. */
	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
			&vm->plugged_size);
	virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
			&vm->device_block_size);
	virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
			&node_id);
	vm->nid = virtio_mem_translate_node_id(vm, node_id);
	virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
	virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
			&vm->region_size);

	/* Determine the nid for the device based on the lowest address. */
	if (vm->nid == NUMA_NO_NODE)
		vm->nid = memory_add_physaddr_to_nid(vm->addr);

	/* bad device setup - warn only */
	if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
		dev_warn(&vm->vdev->dev,
			 "The alignment of the physical start address can make some memory unusable.\n");
	if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes()))
		dev_warn(&vm->vdev->dev,
			 "The alignment of the physical end address can make some memory unusable.\n");
	if (vm->addr < pluggable_range.start ||
	    vm->addr + vm->region_size - 1 > pluggable_range.end)
		dev_warn(&vm->vdev->dev,
			 "Some device memory is not addressable/pluggable. This can make some memory unusable.\n");

	/* Prepare the offline threshold - make sure we can add two blocks. */
	vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(),
				      VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);

	/*
	 * We want subblocks to span at least MAX_ORDER_NR_PAGES and
	 * pageblock_nr_pages pages. This:
	 * - Simplifies our page onlining code (virtio_mem_online_page_cb)
	 *   and fake page onlining code (virtio_mem_fake_online).
	 * - Is required for now for alloc_contig_range() to work reliably -
	 *   it doesn't properly handle smaller granularity on ZONE_NORMAL.
	 */
	sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES,
			pageblock_nr_pages) * PAGE_SIZE;
	sb_size = max_t(uint64_t, vm->device_block_size, sb_size);

	if (sb_size < memory_block_size_bytes() && !force_bbm) {
		/* SBM: At least two subblocks per Linux memory block. */
		vm->in_sbm = true;
		vm->sbm.sb_size = sb_size;
		vm->sbm.sbs_per_mb = memory_block_size_bytes() /
				     vm->sbm.sb_size;

		/* Round up to the next full memory block */
		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
		       memory_block_size_bytes() - 1;
		vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr);
		vm->sbm.next_mb_id = vm->sbm.first_mb_id;
	} else {
		/* BBM: At least one Linux memory block. */
		vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size,
					memory_block_size_bytes());

		if (bbm_block_size) {
			if (!is_power_of_2(bbm_block_size)) {
				dev_warn(&vm->vdev->dev,
					 "bbm_block_size is not a power of 2");
			} else if (bbm_block_size < vm->bbm.bb_size) {
				dev_warn(&vm->vdev->dev,
					 "bbm_block_size is too small");
			} else {
				vm->bbm.bb_size = bbm_block_size;
			}
		}

		/* Round up to the next aligned big block */
		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
		       vm->bbm.bb_size - 1;
		vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr);
		vm->bbm.next_bb_id = vm->bbm.first_bb_id;

		/* Make sure we can add two big blocks. */
		vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size,
					      vm->offline_threshold);
	}

	dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
	dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
	dev_info(&vm->vdev->dev, "device block size: 0x%llx",
		 (unsigned long long)vm->device_block_size);
	dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
		 memory_block_size_bytes());
	if (vm->in_sbm)
		dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
			 (unsigned long long)vm->sbm.sb_size);
	else
		dev_info(&vm->vdev->dev, "big block size: 0x%llx",
			 (unsigned long long)vm->bbm.bb_size);
	if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
		dev_info(&vm->vdev->dev, "nid: %d", vm->nid);

	return 0;
}

static int virtio_mem_create_resource(struct virtio_mem *vm)
{
	/*
	 * When force-unloading the driver and removing the device, we
	 * could have a garbage pointer. Duplicate the string.
	 */
	const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL);

	if (!name)
		return -ENOMEM;

	vm->parent_resource = __request_mem_region(vm->addr, vm->region_size,
						   name, IORESOURCE_SYSTEM_RAM);
	if (!vm->parent_resource) {
		kfree(name);
		dev_warn(&vm->vdev->dev, "could not reserve device region\n");
		dev_info(&vm->vdev->dev,
			 "reloading the driver is not supported\n");
		return -EBUSY;
	}

	/* The memory is not actually busy - make add_memory() work. */
	vm->parent_resource->flags &= ~IORESOURCE_BUSY;
	return 0;
}
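
/*
 * Editorial note on ownership: the name duplicated via kstrdup() in
 * virtio_mem_create_resource() above is referenced by the resource;
 * virtio_mem_delete_resource() below releases the resource first and
 * only then frees the name and the resource itself.
 */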

static void virtio_mem_delete_resource(struct virtio_mem *vm)
{
	const char *name;

	if (!vm->parent_resource)
		return;

	name = vm->parent_resource->name;
	release_resource(vm->parent_resource);
	kfree(vm->parent_resource);
	kfree(name);
	vm->parent_resource = NULL;
}

static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
{
	return 1;
}

static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
{
	const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;

	return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr,
				   vm->addr + vm->region_size, NULL,
				   virtio_mem_range_has_system_ram) == 1;
}

static int virtio_mem_probe(struct virtio_device *vdev)
{
	struct virtio_mem *vm;
	int rc;

	BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
	BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10);

	vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL);
	if (!vm)
		return -ENOMEM;

	init_waitqueue_head(&vm->host_resp);
	vm->vdev = vdev;
	INIT_WORK(&vm->wq, virtio_mem_run_wq);
	mutex_init(&vm->hotplug_mutex);
	INIT_LIST_HEAD(&vm->next);
	spin_lock_init(&vm->removal_lock);
	hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	vm->retry_timer.function = virtio_mem_timer_expired;
	vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;

	/* register the virtqueue */
	rc = virtio_mem_init_vq(vm);
	if (rc)
		goto out_free_vm;

	/* initialize the device by querying the config */
	rc = virtio_mem_init(vm);
	if (rc)
		goto out_del_vq;

	/* create the parent resource for all memory */
	rc = virtio_mem_create_resource(vm);
	if (rc)
		goto out_del_vq;

	/*
	 * If we still have memory plugged, we have to unplug all memory first.
	 * Registering our parent resource makes sure that this memory isn't
	 * actually in use (e.g., trying to reload the driver).
	 */
	if (vm->plugged_size) {
		vm->unplug_all_required = true;
		dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
	}

	/* register callbacks */
	vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
	rc = register_memory_notifier(&vm->memory_notifier);
	if (rc)
		goto out_del_resource;
	rc = register_virtio_mem_device(vm);
	if (rc)
		goto out_unreg_mem;

	virtio_device_ready(vdev);

	/* trigger a config update to start processing the requested_size */
	atomic_set(&vm->config_changed, 1);
	queue_work(system_freezable_wq, &vm->wq);

	return 0;
out_unreg_mem:
	unregister_memory_notifier(&vm->memory_notifier);
out_del_resource:
	virtio_mem_delete_resource(vm);
out_del_vq:
	vdev->config->del_vqs(vdev);
out_free_vm:
	kfree(vm);
	vdev->priv = NULL;

	return rc;
}
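
/*
 * Editorial note: device removal below mirrors virtio_mem_probe() in
 * reverse - stop the workqueue and retry timer, drop partially plugged
 * offline blocks (SBM), unregister the callbacks, and only then tear
 * down tracking data and the virtqueue.
 */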

static void virtio_mem_remove(struct virtio_device *vdev)
{
	struct virtio_mem *vm = vdev->priv;
	unsigned long mb_id;
	int rc;

	/*
	 * Make sure the workqueue won't be triggered anymore and no memory
	 * blocks can be onlined/offlined until we're finished here.
	 */
	mutex_lock(&vm->hotplug_mutex);
	spin_lock_irq(&vm->removal_lock);
	vm->removing = true;
	spin_unlock_irq(&vm->removal_lock);
	mutex_unlock(&vm->hotplug_mutex);

	/* wait until the workqueue stopped */
	cancel_work_sync(&vm->wq);
	hrtimer_cancel(&vm->retry_timer);

	if (vm->in_sbm) {
		/*
		 * After we unregistered our callbacks, user space can online
		 * partially plugged offline blocks. Make sure to remove them.
		 */
		virtio_mem_sbm_for_each_mb(vm, mb_id,
					   VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
			rc = virtio_mem_sbm_remove_mb(vm, mb_id);
			BUG_ON(rc);
			virtio_mem_sbm_set_mb_state(vm, mb_id,
						    VIRTIO_MEM_SBM_MB_UNUSED);
		}
		/*
		 * After we unregistered our callbacks, user space can no longer
		 * offline partially plugged online memory blocks. No need to
		 * worry about them.
		 */
	}

	/* unregister callbacks */
	unregister_virtio_mem_device(vm);
	unregister_memory_notifier(&vm->memory_notifier);

	/*
	 * There is no way we could reliably remove all memory we have added to
	 * the system. And there is no way to stop the driver/device from going
	 * away. Warn at least.
	 */
	if (virtio_mem_has_memory_added(vm)) {
		dev_warn(&vdev->dev, "device still has system memory added\n");
	} else {
		virtio_mem_delete_resource(vm);
		kfree_const(vm->resource_name);
	}

	/* remove all tracking data - no locking needed */
	if (vm->in_sbm) {
		vfree(vm->sbm.mb_states);
		vfree(vm->sbm.sb_states);
	} else {
		vfree(vm->bbm.bb_states);
	}

	/* reset the device and cleanup the queues */
	vdev->config->reset(vdev);
	vdev->config->del_vqs(vdev);

	kfree(vm);
	vdev->priv = NULL;
}

static void virtio_mem_config_changed(struct virtio_device *vdev)
{
	struct virtio_mem *vm = vdev->priv;

	atomic_set(&vm->config_changed, 1);
	virtio_mem_retry(vm);
}

#ifdef CONFIG_PM_SLEEP
static int virtio_mem_freeze(struct virtio_device *vdev)
{
	/*
	 * When restarting the VM, all memory is usually unplugged. Don't
	 * allow suspend/hibernate.
	 */
	dev_err(&vdev->dev, "save/restore not supported.\n");
	return -EPERM;
}

static int virtio_mem_restore(struct virtio_device *vdev)
{
	return -EPERM;
}
#endif

static unsigned int virtio_mem_features[] = {
#if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA)
	VIRTIO_MEM_F_ACPI_PXM,
#endif
};

static const struct virtio_device_id virtio_mem_id_table[] = {
	{ VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

static struct virtio_driver virtio_mem_driver = {
	.feature_table = virtio_mem_features,
	.feature_table_size = ARRAY_SIZE(virtio_mem_features),
	.driver.name = KBUILD_MODNAME,
	.driver.owner = THIS_MODULE,
	.id_table = virtio_mem_id_table,
	.probe = virtio_mem_probe,
	.remove = virtio_mem_remove,
	.config_changed = virtio_mem_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze = virtio_mem_freeze,
	.restore = virtio_mem_restore,
#endif
};

module_virtio_driver(virtio_mem_driver);
MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table);
MODULE_AUTHOR("David Hildenbrand <david@redhat.com>");
MODULE_DESCRIPTION("Virtio-mem driver");
MODULE_LICENSE("GPL");