// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Virtio-mem device driver.
 *
 * Copyright Red Hat, Inc. 2020
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */

#include <linux/virtio.h>
#include <linux/virtio_mem.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
#include <linux/memory.h>
#include <linux/hrtimer.h>
#include <linux/crash_dump.h>
#include <linux/mutex.h>
#include <linux/bitmap.h>
#include <linux/lockdep.h>

#include <acpi/acpi_numa.h>

static bool unplug_online = true;
module_param(unplug_online, bool, 0644);
MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");

static bool force_bbm;
module_param(force_bbm, bool, 0444);
MODULE_PARM_DESC(force_bbm,
		 "Force Big Block Mode. Default is 0 (auto-selection)");

static unsigned long bbm_block_size;
module_param(bbm_block_size, ulong, 0444);
MODULE_PARM_DESC(bbm_block_size,
		 "Big Block size in bytes. Default is 0 (auto-detection).");

static bool bbm_safe_unplug = true;
module_param(bbm_safe_unplug, bool, 0444);
MODULE_PARM_DESC(bbm_safe_unplug,
	     "Use a safe unplug mechanism in BBM, avoiding long/endless loops");

/*
 * virtio-mem currently supports the following modes of operation:
 *
 * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
 *   size of a Sub Block (SB) is determined based on the device block size, the
 *   pageblock size, and the maximum allocation granularity of the buddy.
 *   Subblocks within a Linux memory block might either be plugged or unplugged.
 *   Memory is added/removed to Linux MM in Linux memory block granularity.
 *
 * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
 *   Memory is added/removed to Linux MM in Big Block granularity.
 *
 * The mode is determined automatically based on the Linux memory block size
 * and the device block size.
 *
 * User space / core MM (auto onlining) is responsible for onlining added
 * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are
 * always onlined separately, and all memory within a Linux memory block is
 * onlined to the same zone - virtio-mem relies on this behavior.
 */
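/*
 * Illustrative example (not part of the original file): with 128 MiB Linux
 * memory blocks, a device block size of, say, 2 MiB can be tracked within
 * individual memory blocks, suggesting SBM; a device block size of, say,
 * 1 GiB spans multiple Linux memory blocks and is handled in BBM.
 */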
/*
 * State of a Linux memory block in SBM.
 */
enum virtio_mem_sbm_mb_state {
	/* Unplugged, not added to Linux. Can be reused later. */
	VIRTIO_MEM_SBM_MB_UNUSED = 0,
	/* (Partially) plugged, not added to Linux. Error on add_memory(). */
	VIRTIO_MEM_SBM_MB_PLUGGED,
	/* Fully plugged, fully added to Linux, offline. */
	VIRTIO_MEM_SBM_MB_OFFLINE,
	/* Partially plugged, fully added to Linux, offline. */
	VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
	/* Fully plugged, fully added to Linux, online. */
	VIRTIO_MEM_SBM_MB_ONLINE,
	/* Partially plugged, fully added to Linux, online. */
	VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL,
	VIRTIO_MEM_SBM_MB_COUNT
};

/*
 * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks.
 */
enum virtio_mem_bbm_bb_state {
	/* Unplugged, not added to Linux. Can be reused later. */
	VIRTIO_MEM_BBM_BB_UNUSED = 0,
	/* Plugged, not added to Linux. Error on add_memory(). */
	VIRTIO_MEM_BBM_BB_PLUGGED,
	/* Plugged and added to Linux. */
	VIRTIO_MEM_BBM_BB_ADDED,
	/* All online parts are fake-offline, ready to remove. */
	VIRTIO_MEM_BBM_BB_FAKE_OFFLINE,
	VIRTIO_MEM_BBM_BB_COUNT
};

struct virtio_mem {
	struct virtio_device *vdev;

	/* We might first have to unplug all memory when starting up. */
	bool unplug_all_required;

	/* Workqueue that processes the plug/unplug requests. */
	struct work_struct wq;
	atomic_t wq_active;
	atomic_t config_changed;

	/* Virtqueue for guest->host requests. */
	struct virtqueue *vq;

	/* Wait for a host response to a guest request. */
	wait_queue_head_t host_resp;

	/* Space for one guest request and the host response. */
	struct virtio_mem_req req;
	struct virtio_mem_resp resp;

	/* The current size of the device. */
	uint64_t plugged_size;
	/* The requested size of the device. */
	uint64_t requested_size;

	/* The device block size (for communicating with the device). */
	uint64_t device_block_size;
	/* The determined node id for all memory of the device. */
	int nid;
	/* Physical start address of the memory region. */
	uint64_t addr;
	/* Maximum region size in bytes. */
	uint64_t region_size;

	/* The parent resource for all memory added via this device. */
	struct resource *parent_resource;
	/*
	 * Copy of "System RAM (virtio_mem)" to be used for
	 * add_memory_driver_managed().
	 */
	const char *resource_name;

	/*
	 * We don't want to add too much memory if it's not getting onlined,
	 * to avoid running OOM. Besides this threshold, we allow to have at
	 * least two offline blocks at a time (whatever is bigger).
	 */
#define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD	(1024 * 1024 * 1024)
	atomic64_t offline_size;
	uint64_t offline_threshold;

	/* If set, the driver is in SBM, otherwise in BBM. */
	bool in_sbm;

	union {
		struct {
			/* Id of the first memory block of this device. */
			unsigned long first_mb_id;
			/* Id of the last usable memory block of this device. */
			unsigned long last_usable_mb_id;
			/* Id of the next memory block to prepare when needed. */
			unsigned long next_mb_id;

			/* The subblock size. */
			uint64_t sb_size;
			/* The number of subblocks per Linux memory block. */
			uint32_t sbs_per_mb;

			/* Summary of all memory block states. */
			unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];

			/*
			 * One byte state per memory block. Allocated via
			 * vmalloc(). Resized (alloc+copy+free) on demand.
			 *
			 * With 128 MiB memory blocks, we have states for 512
			 * GiB of memory in one 4 KiB page.
			 */
			uint8_t *mb_states;

			/*
			 * Bitmap: one bit per subblock. Allocated similar to
			 * sbm.mb_states.
			 *
			 * A set bit means the corresponding subblock is
			 * plugged, otherwise it's unplugged.
			 *
			 * With 4 MiB subblocks, we manage 128 GiB of memory
			 * in one 4 KiB page.
			 */
			unsigned long *sb_states;
		} sbm;
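		/*
		 * Illustrative sizing (assuming a 2 GiB big block size): one
		 * byte of state per big block means a single 4 KiB page of
		 * bb_states covers 4096 big blocks, i.e., 8 TiB of memory.
		 */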
		struct {
			/* Id of the first big block of this device. */
			unsigned long first_bb_id;
			/* Id of the last usable big block of this device. */
			unsigned long last_usable_bb_id;
			/* Id of the next big block to prepare when needed. */
			unsigned long next_bb_id;

			/* Summary of all big block states. */
			unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];

			/* One byte state per big block. See sbm.mb_states. */
			uint8_t *bb_states;

			/* The block size used for plugging/adding/removing. */
			uint64_t bb_size;
		} bbm;
	};

	/*
	 * Mutex that protects the sbm.mb_count, sbm.mb_states,
	 * sbm.sb_states, bbm.bb_count, and bbm.bb_states
	 *
	 * When this lock is held the pointers can't change, ONLINE and
	 * OFFLINE blocks can't change the state and no subblocks will get
	 * plugged/unplugged.
	 */
	struct mutex hotplug_mutex;
	bool hotplug_active;

	/* An error occurred we cannot handle - stop processing requests. */
	bool broken;

	/* The driver is being removed. */
	spinlock_t removal_lock;
	bool removing;

	/* Timer for retrying to plug/unplug memory. */
	struct hrtimer retry_timer;
	unsigned int retry_timer_ms;
#define VIRTIO_MEM_RETRY_TIMER_MIN_MS		50000
#define VIRTIO_MEM_RETRY_TIMER_MAX_MS		300000

	/* Memory notifier (online/offline events). */
	struct notifier_block memory_notifier;

	/* Next device in the list of virtio-mem devices. */
	struct list_head next;
};

/*
 * We have to share a single online_page callback among all virtio-mem
 * devices. We use RCU to iterate the list in the callback.
 */
static DEFINE_MUTEX(virtio_mem_mutex);
static LIST_HEAD(virtio_mem_devices);

static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
						  unsigned long nr_pages);
static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
						   unsigned long nr_pages);
static void virtio_mem_retry(struct virtio_mem *vm);

/*
 * Register a virtio-mem device so it will be considered for the online_page
 * callback.
 */
static int register_virtio_mem_device(struct virtio_mem *vm)
{
	int rc = 0;

	/* First device registers the callback. */
	mutex_lock(&virtio_mem_mutex);
	if (list_empty(&virtio_mem_devices))
		rc = set_online_page_callback(&virtio_mem_online_page_cb);
	if (!rc)
		list_add_rcu(&vm->next, &virtio_mem_devices);
	mutex_unlock(&virtio_mem_mutex);

	return rc;
}

/*
 * Unregister a virtio-mem device so it will no longer be considered for the
 * online_page callback.
 */
static void unregister_virtio_mem_device(struct virtio_mem *vm)
{
	/* Last device unregisters the callback. */
	mutex_lock(&virtio_mem_mutex);
	list_del_rcu(&vm->next);
	if (list_empty(&virtio_mem_devices))
		restore_online_page_callback(&virtio_mem_online_page_cb);
	mutex_unlock(&virtio_mem_mutex);

	synchronize_rcu();
}

/*
 * Calculate the memory block id of a given address.
 */
static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr)
{
	return addr / memory_block_size_bytes();
}

/*
 * Calculate the physical start address of a given memory block id.
 */
static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
{
	return mb_id * memory_block_size_bytes();
}

/*
 * Calculate the big block id of a given address.
 */
static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm,
					      uint64_t addr)
{
	return addr / vm->bbm.bb_size;
}

/*
 * Calculate the physical start address of a given big block id.
 */
static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm,
					 unsigned long bb_id)
{
	return bb_id * vm->bbm.bb_size;
}
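/*
 * Worked example (assuming 128 MiB Linux memory blocks): the address
 * 0x10000000 (256 MiB) maps to mb_id 2, and virtio_mem_mb_id_to_phys(2)
 * yields 0x10000000 again. The bb variants work the same way, just with
 * vm->bbm.bb_size as the granularity.
 */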
/*
 * Calculate the subblock id of a given address.
 */
static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
					      unsigned long addr)
{
	const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
	const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id);

	return (addr - mb_addr) / vm->sbm.sb_size;
}

/*
 * Set the state of a big block, taking care of the state counter.
 */
static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm,
					unsigned long bb_id,
					enum virtio_mem_bbm_bb_state state)
{
	const unsigned long idx = bb_id - vm->bbm.first_bb_id;
	enum virtio_mem_bbm_bb_state old_state;

	old_state = vm->bbm.bb_states[idx];
	vm->bbm.bb_states[idx] = state;

	BUG_ON(vm->bbm.bb_count[old_state] == 0);
	vm->bbm.bb_count[old_state]--;
	vm->bbm.bb_count[state]++;
}

/*
 * Get the state of a big block.
 */
static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm,
								unsigned long bb_id)
{
	return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id];
}

/*
 * Prepare the big block state array for the next big block.
 */
static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm)
{
	unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id;
	unsigned long new_bytes = old_bytes + 1;
	int old_pages = PFN_UP(old_bytes);
	int new_pages = PFN_UP(new_bytes);
	uint8_t *new_array;

	if (vm->bbm.bb_states && old_pages == new_pages)
		return 0;

	new_array = vzalloc(new_pages * PAGE_SIZE);
	if (!new_array)
		return -ENOMEM;

	mutex_lock(&vm->hotplug_mutex);
	if (vm->bbm.bb_states)
		memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE);
	vfree(vm->bbm.bb_states);
	vm->bbm.bb_states = new_array;
	mutex_unlock(&vm->hotplug_mutex);

	return 0;
}

#define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \
	for (_bb_id = _vm->bbm.first_bb_id; \
	     _bb_id < _vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \
	     _bb_id++) \
		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)

#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \
	for (_bb_id = _vm->bbm.next_bb_id - 1; \
	     _bb_id >= _vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \
	     _bb_id--) \
		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
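/*
 * Usage sketch (illustrative, not from this file): walk all plugged big
 * blocks and act on each:
 *
 *	unsigned long bb_id;
 *
 *	virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_PLUGGED)
 *		do_something(vm, bb_id);
 *
 * The bb_count[] summary lets the loop terminate early once no big blocks
 * remain in the requested state.
 */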
/*
 * Set the state of a memory block, taking care of the state counter.
 */
static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm,
					unsigned long mb_id, uint8_t state)
{
	const unsigned long idx = mb_id - vm->sbm.first_mb_id;
	uint8_t old_state;

	old_state = vm->sbm.mb_states[idx];
	vm->sbm.mb_states[idx] = state;

	BUG_ON(vm->sbm.mb_count[old_state] == 0);
	vm->sbm.mb_count[old_state]--;
	vm->sbm.mb_count[state]++;
}

/*
 * Get the state of a memory block.
 */
static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm,
					   unsigned long mb_id)
{
	const unsigned long idx = mb_id - vm->sbm.first_mb_id;

	return vm->sbm.mb_states[idx];
}

/*
 * Prepare the state array for the next memory block.
 */
static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
{
	int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id);
	int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1);
	uint8_t *new_array;

	if (vm->sbm.mb_states && old_pages == new_pages)
		return 0;

	new_array = vzalloc(new_pages * PAGE_SIZE);
	if (!new_array)
		return -ENOMEM;

	mutex_lock(&vm->hotplug_mutex);
	if (vm->sbm.mb_states)
		memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE);
	vfree(vm->sbm.mb_states);
	vm->sbm.mb_states = new_array;
	mutex_unlock(&vm->hotplug_mutex);

	return 0;
}

#define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \
	for (_mb_id = _vm->sbm.first_mb_id; \
	     _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \
	     _mb_id++) \
		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)

#define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \
	for (_mb_id = _vm->sbm.next_mb_id - 1; \
	     _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \
	     _mb_id--) \
		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)

/*
 * Calculate the bit number in the subblock bitmap for the given subblock
 * inside the given memory block.
 */
static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm,
					  unsigned long mb_id, int sb_id)
{
	return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id;
}

/*
 * Mark all selected subblocks plugged.
 *
 * Will not modify the state of the memory block.
 */
static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm,
					  unsigned long mb_id, int sb_id,
					  int count)
{
	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);

	__bitmap_set(vm->sbm.sb_states, bit, count);
}

/*
 * Mark all selected subblocks unplugged.
 *
 * Will not modify the state of the memory block.
 */
static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm,
					    unsigned long mb_id, int sb_id,
					    int count)
{
	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);

	__bitmap_clear(vm->sbm.sb_states, bit, count);
}

/*
 * Test if all selected subblocks are plugged.
 */
static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm,
					   unsigned long mb_id, int sb_id,
					   int count)
{
	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);

	if (count == 1)
		return test_bit(bit, vm->sbm.sb_states);

	/* TODO: Helper similar to bitmap_set() */
	return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >=
	       bit + count;
}

/*
 * Test if all selected subblocks are unplugged.
 */
static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm,
					     unsigned long mb_id, int sb_id,
					     int count)
{
	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);

	/* TODO: Helper similar to bitmap_set() */
	return find_next_bit(vm->sbm.sb_states, bit + count, bit) >=
	       bit + count;
}
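/*
 * Worked example (assuming sbs_per_mb == 32 and first_mb_id == 100):
 * subblock 5 of memory block 102 maps to bit (102 - 100) * 32 + 5 = 69
 * in sb_states. The set/test helpers above all operate on such bit ranges.
 */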
/*
 * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is
 * none.
 */
static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm,
					     unsigned long mb_id)
{
	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0);

	return find_next_zero_bit(vm->sbm.sb_states,
				  bit + vm->sbm.sbs_per_mb, bit) - bit;
}

/*
 * Prepare the subblock bitmap for the next memory block.
 */
static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm)
{
	const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id;
	const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb;
	const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb;
	int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long));
	int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long));
	unsigned long *new_bitmap, *old_bitmap;

	if (vm->sbm.sb_states && old_pages == new_pages)
		return 0;

	new_bitmap = vzalloc(new_pages * PAGE_SIZE);
	if (!new_bitmap)
		return -ENOMEM;

	mutex_lock(&vm->hotplug_mutex);
	if (vm->sbm.sb_states)
		memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE);

	old_bitmap = vm->sbm.sb_states;
	vm->sbm.sb_states = new_bitmap;
	mutex_unlock(&vm->hotplug_mutex);

	vfree(old_bitmap);
	return 0;
}

/*
 * Test if we could add memory without creating too much offline memory -
 * to avoid running OOM if memory is getting onlined deferred.
 */
static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size)
{
	if (WARN_ON_ONCE(size > vm->offline_threshold))
		return false;

	return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold;
}
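/*
 * Worked example: with the default 1 GiB offline threshold and 128 MiB
 * memory blocks, at most eight not-yet-onlined memory blocks are added
 * before the driver backs off and waits for onlining to catch up.
 */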
/*
 * Try adding memory to Linux. Will usually only fail if out of memory.
 *
 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 * onlining code).
 *
 * Will not modify the state of memory blocks in virtio-mem.
 */
static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
				 uint64_t size)
{
	int rc;

	/*
	 * When force-unloading the driver and we still have memory added to
	 * Linux, the resource name has to stay.
	 */
	if (!vm->resource_name) {
		vm->resource_name = kstrdup_const("System RAM (virtio_mem)",
						  GFP_KERNEL);
		if (!vm->resource_name)
			return -ENOMEM;
	}

	dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);
	/* Memory might get onlined immediately. */
	atomic64_add(size, &vm->offline_size);
	rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name,
				       MHP_MERGE_RESOURCE);
	if (rc) {
		atomic64_sub(size, &vm->offline_size);
		dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
		/*
		 * TODO: Linux MM does not properly clean up yet in all cases
		 * where adding of memory failed - especially on -ENOMEM.
		 */
	}
	return rc;
}

/*
 * See virtio_mem_add_memory(): Try adding a single Linux memory block.
 */
static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
	const uint64_t size = memory_block_size_bytes();

	return virtio_mem_add_memory(vm, addr, size);
}

/*
 * See virtio_mem_add_memory(): Try adding a big block.
 */
static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id)
{
	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
	const uint64_t size = vm->bbm.bb_size;

	return virtio_mem_add_memory(vm, addr, size);
}

/*
 * Try removing memory from Linux. Will only fail if memory blocks aren't
 * offline.
 *
 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 * onlining code).
 *
 * Will not modify the state of memory blocks in virtio-mem.
 */
static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
				    uint64_t size)
{
	int rc;

	dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);
	rc = remove_memory(vm->nid, addr, size);
	if (!rc) {
		atomic64_sub(size, &vm->offline_size);
		/*
		 * We might have freed up memory we can now unplug, retry
		 * immediately instead of waiting.
		 */
		virtio_mem_retry(vm);
	} else {
		dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc);
	}
	return rc;
}

/*
 * See virtio_mem_remove_memory(): Try removing a single Linux memory block.
 */
static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
	const uint64_t size = memory_block_size_bytes();

	return virtio_mem_remove_memory(vm, addr, size);
}

/*
 * See virtio_mem_remove_memory(): Try to remove all Linux memory blocks covered
 * by the big block.
 */
static int virtio_mem_bbm_remove_bb(struct virtio_mem *vm, unsigned long bb_id)
{
	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
	const uint64_t size = vm->bbm.bb_size;

	return virtio_mem_remove_memory(vm, addr, size);
}

/*
 * Try offlining and removing memory from Linux.
 *
 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 * onlining code).
 *
 * Will not modify the state of memory blocks in virtio-mem.
 */
static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
						uint64_t addr,
						uint64_t size)
{
	int rc;

	dev_dbg(&vm->vdev->dev,
		"offlining and removing memory: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);

	rc = offline_and_remove_memory(vm->nid, addr, size);
	if (!rc) {
		atomic64_sub(size, &vm->offline_size);
		/*
		 * We might have freed up memory we can now unplug, retry
		 * immediately instead of waiting.
		 */
		virtio_mem_retry(vm);
	} else {
		dev_dbg(&vm->vdev->dev,
			"offlining and removing memory failed: %d\n", rc);
	}
	return rc;
}

/*
 * See virtio_mem_offline_and_remove_memory(): Try offlining and removing
 * a single Linux memory block.
 */
static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm,
						unsigned long mb_id)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
	const uint64_t size = memory_block_size_bytes();

	return virtio_mem_offline_and_remove_memory(vm, addr, size);
}

/*
 * See virtio_mem_offline_and_remove_memory(): Try to offline and remove all
 * Linux memory blocks covered by the big block.
 */
static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm,
						unsigned long bb_id)
{
	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
	const uint64_t size = vm->bbm.bb_size;

	return virtio_mem_offline_and_remove_memory(vm, addr, size);
}

/*
 * Trigger the workqueue so the device can perform its magic.
 */
static void virtio_mem_retry(struct virtio_mem *vm)
{
	unsigned long flags;

	spin_lock_irqsave(&vm->removal_lock, flags);
	if (!vm->removing)
		queue_work(system_freezable_wq, &vm->wq);
	spin_unlock_irqrestore(&vm->removal_lock, flags);
}

static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id)
{
	int node = NUMA_NO_NODE;

#if defined(CONFIG_ACPI_NUMA)
	if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM))
		node = pxm_to_node(node_id);
#endif
	return node;
}

/*
 * Test if a virtio-mem device overlaps with the given range. Can be called
 * from (notifier) callbacks lockless.
 */
static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start,
				      uint64_t size)
{
	return start < vm->addr + vm->region_size && vm->addr < start + size;
}

/*
 * Test if a virtio-mem device contains a given range. Can be called from
 * (notifier) callbacks lockless.
 */
static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start,
				      uint64_t size)
{
	return start >= vm->addr && start + size <= vm->addr + vm->region_size;
}

static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm,
					      unsigned long mb_id)
{
	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
	case VIRTIO_MEM_SBM_MB_OFFLINE:
		return NOTIFY_OK;
	default:
		break;
	}
	dev_warn_ratelimited(&vm->vdev->dev,
			     "memory block onlining denied\n");
	return NOTIFY_BAD;
}

static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm,
					  unsigned long mb_id)
{
	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
	case VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL:
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
		break;
	case VIRTIO_MEM_SBM_MB_ONLINE:
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_OFFLINE);
		break;
	default:
		BUG();
		break;
	}
}

static void virtio_mem_sbm_notify_online(struct virtio_mem *vm,
					 unsigned long mb_id)
{
	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL);
		break;
	case VIRTIO_MEM_SBM_MB_OFFLINE:
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_ONLINE);
		break;
	default:
		BUG();
		break;
	}
}

static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm,
						unsigned long mb_id)
{
	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
	unsigned long pfn;
	int sb_id;

	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
			continue;
		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
			       sb_id * vm->sbm.sb_size);
		virtio_mem_fake_offline_going_offline(pfn, nr_pages);
	}
}

static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm,
						 unsigned long mb_id)
{
	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
	unsigned long pfn;
	int sb_id;

	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
			continue;
		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
			       sb_id * vm->sbm.sb_size);
		virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
	}
}

static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm,
						unsigned long bb_id,
						unsigned long pfn,
						unsigned long nr_pages)
{
	/*
	 * When marked as "fake-offline", all online memory of this device block
	 * is allocated by us. Otherwise, we don't have any memory allocated.
	 */
	if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
		return;
	virtio_mem_fake_offline_going_offline(pfn, nr_pages);
}

static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm,
						 unsigned long bb_id,
						 unsigned long pfn,
						 unsigned long nr_pages)
{
	if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
		return;
	virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
}
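/*
 * Locking in the notifier below follows the memory hotplug state machine:
 * MEM_GOING_OFFLINE/MEM_GOING_ONLINE take the hotplug_mutex, and the
 * matching MEM_(CANCEL_)OFFLINE/MEM_(CANCEL_)ONLINE event releases it
 * again, so the block state cannot change in between.
 */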
/*
 * This callback will either be called synchronously from add_memory() or
 * asynchronously (e.g., triggered via user space). We have to be careful
 * with locking when calling add_memory().
 */
static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
					 unsigned long action, void *arg)
{
	struct virtio_mem *vm = container_of(nb, struct virtio_mem,
					     memory_notifier);
	struct memory_notify *mhp = arg;
	const unsigned long start = PFN_PHYS(mhp->start_pfn);
	const unsigned long size = PFN_PHYS(mhp->nr_pages);
	int rc = NOTIFY_OK;
	unsigned long id;

	if (!virtio_mem_overlaps_range(vm, start, size))
		return NOTIFY_DONE;

	if (vm->in_sbm) {
		id = virtio_mem_phys_to_mb_id(start);
		/*
		 * In SBM, we add memory in separate memory blocks - we expect
		 * it to be onlined/offlined in the same granularity. Bail out
		 * if this ever changes.
		 */
		if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
				 !IS_ALIGNED(start, memory_block_size_bytes())))
			return NOTIFY_BAD;
	} else {
		id = virtio_mem_phys_to_bb_id(vm, start);
		/*
		 * In BBM, we only care about onlining/offlining happening
		 * within a single big block, we don't care about the
		 * actual granularity as we don't track individual Linux
		 * memory blocks.
		 */
		if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1)))
			return NOTIFY_BAD;
	}

	/*
	 * Avoid circular locking lockdep warnings. We lock the mutex
	 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The
	 * blocking_notifier_call_chain() has its own lock, which gets unlocked
	 * between both notifier calls and will bail out. False positive.
	 */
	lockdep_off();

	switch (action) {
	case MEM_GOING_OFFLINE:
		mutex_lock(&vm->hotplug_mutex);
		if (vm->removing) {
			rc = notifier_from_errno(-EBUSY);
			mutex_unlock(&vm->hotplug_mutex);
			break;
		}
		vm->hotplug_active = true;
		if (vm->in_sbm)
			virtio_mem_sbm_notify_going_offline(vm, id);
		else
			virtio_mem_bbm_notify_going_offline(vm, id,
							    mhp->start_pfn,
							    mhp->nr_pages);
		break;
	case MEM_GOING_ONLINE:
		mutex_lock(&vm->hotplug_mutex);
		if (vm->removing) {
			rc = notifier_from_errno(-EBUSY);
			mutex_unlock(&vm->hotplug_mutex);
			break;
		}
		vm->hotplug_active = true;
		if (vm->in_sbm)
			rc = virtio_mem_sbm_notify_going_online(vm, id);
		break;
	case MEM_OFFLINE:
		if (vm->in_sbm)
			virtio_mem_sbm_notify_offline(vm, id);

		atomic64_add(size, &vm->offline_size);
		/*
		 * Trigger the workqueue. Now that we have some offline memory,
		 * maybe we can handle pending unplug requests.
		 */
		if (!unplug_online)
			virtio_mem_retry(vm);

		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	case MEM_ONLINE:
		if (vm->in_sbm)
			virtio_mem_sbm_notify_online(vm, id);

		atomic64_sub(size, &vm->offline_size);
		/*
		 * Start adding more memory once we onlined half of our
		 * threshold. Don't trigger if it's possibly due to our action
		 * (e.g., us adding memory which gets onlined immediately from
		 * the core).
		 */
		if (!atomic_read(&vm->wq_active) &&
		    virtio_mem_could_add_memory(vm, vm->offline_threshold / 2))
			virtio_mem_retry(vm);

		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	case MEM_CANCEL_OFFLINE:
		if (!vm->hotplug_active)
			break;
		if (vm->in_sbm)
			virtio_mem_sbm_notify_cancel_offline(vm, id);
		else
			virtio_mem_bbm_notify_cancel_offline(vm, id,
							     mhp->start_pfn,
							     mhp->nr_pages);
		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	case MEM_CANCEL_ONLINE:
		if (!vm->hotplug_active)
			break;
		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	default:
		break;
	}

	lockdep_on();

	return rc;
}

/*
 * Set a range of pages PG_offline. Remember pages that were never onlined
 * (via generic_online_page()) using PageDirty().
 */
static void virtio_mem_set_fake_offline(unsigned long pfn,
					unsigned long nr_pages, bool onlined)
{
	page_offline_begin();
	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		__SetPageOffline(page);
		if (!onlined) {
			SetPageDirty(page);
			/* FIXME: remove after cleanups */
			ClearPageReserved(page);
		}
	}
	page_offline_end();
}

/*
 * Clear PG_offline from a range of pages. If the pages were never onlined,
 * (via generic_online_page()), clear PageDirty().
 */
static void virtio_mem_clear_fake_offline(unsigned long pfn,
					  unsigned long nr_pages, bool onlined)
{
	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		__ClearPageOffline(page);
		if (!onlined)
			ClearPageDirty(page);
	}
}
/*
 * Release a range of fake-offline pages to the buddy, effectively
 * fake-onlining them.
 */
static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
{
	const unsigned long max_nr_pages = MAX_ORDER_NR_PAGES;
	unsigned long i;

	/*
	 * We are always called at least with MAX_ORDER_NR_PAGES
	 * granularity/alignment (e.g., the way subblocks work). All pages
	 * inside such a block are alike.
	 */
	for (i = 0; i < nr_pages; i += max_nr_pages) {
		struct page *page = pfn_to_page(pfn + i);

		/*
		 * If the page is PageDirty(), it was kept fake-offline when
		 * onlining the memory block. Otherwise, it was allocated
		 * using alloc_contig_range(). All pages in a subblock are
		 * alike.
		 */
		if (PageDirty(page)) {
			virtio_mem_clear_fake_offline(pfn + i, max_nr_pages,
						      false);
			generic_online_page(page, MAX_ORDER - 1);
		} else {
			virtio_mem_clear_fake_offline(pfn + i, max_nr_pages,
						      true);
			free_contig_range(pfn + i, max_nr_pages);
			adjust_managed_page_count(page, max_nr_pages);
		}
	}
}

/*
 * Try to allocate a range, marking pages fake-offline, effectively
 * fake-offlining them.
 */
static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages)
{
	const bool is_movable = zone_idx(page_zone(pfn_to_page(pfn))) ==
				ZONE_MOVABLE;
	int rc, retry_count;

	/*
	 * TODO: We want an alloc_contig_range() mode that tries to allocate
	 * harder (e.g., dealing with temporarily pinned pages, PCP), especially
	 * with ZONE_MOVABLE. So for now, retry a couple of times with
	 * ZONE_MOVABLE before giving up - because that zone is supposed to give
	 * some guarantees.
	 */
	for (retry_count = 0; retry_count < 5; retry_count++) {
		rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE,
					GFP_KERNEL);
		if (rc == -ENOMEM)
			/* whoops, out of memory */
			return rc;
		else if (rc && !is_movable)
			break;
		else if (rc)
			continue;

		virtio_mem_set_fake_offline(pfn, nr_pages, true);
		adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
		return 0;
	}

	return -EBUSY;
}

/*
 * Handle fake-offline pages when memory is going offline - such that the
 * pages can be skipped by mm-core when offlining.
 */
static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
						  unsigned long nr_pages)
{
	struct page *page;
	unsigned long i;

	/*
	 * Drop our reference to the pages so the memory can get offlined
	 * and add the unplugged pages to the managed page counters (so
	 * offlining code can correctly subtract them again).
	 */
	adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
	for (i = 0; i < nr_pages; i++) {
		page = pfn_to_page(pfn + i);
		if (WARN_ON(!page_ref_dec_and_test(page)))
			dump_page(page, "fake-offline page referenced");
	}
}
/*
 * Handle fake-offline pages when memory offlining is canceled - to undo
 * what we did in virtio_mem_fake_offline_going_offline().
 */
static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
						   unsigned long nr_pages)
{
	unsigned long i;

	/*
	 * Get the reference we dropped when going offline and subtract the
	 * unplugged pages from the managed page counters.
	 */
	adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
	for (i = 0; i < nr_pages; i++)
		page_ref_inc(pfn_to_page(pfn + i));
}

static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
{
	const unsigned long addr = page_to_phys(page);
	unsigned long id, sb_id;
	struct virtio_mem *vm;
	bool do_online;

	rcu_read_lock();
	list_for_each_entry_rcu(vm, &virtio_mem_devices, next) {
		if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order)))
			continue;

		if (vm->in_sbm) {
			/*
			 * We exploit here that subblocks have at least
			 * MAX_ORDER_NR_PAGES size/alignment - so we cannot
			 * cross subblocks within one call.
			 */
			id = virtio_mem_phys_to_mb_id(addr);
			sb_id = virtio_mem_phys_to_sb_id(vm, addr);
			do_online = virtio_mem_sbm_test_sb_plugged(vm, id,
								   sb_id, 1);
		} else {
			/*
			 * If the whole block is marked fake offline, keep
			 * everything that way.
			 */
			id = virtio_mem_phys_to_bb_id(vm, addr);
			do_online = virtio_mem_bbm_get_bb_state(vm, id) !=
				    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE;
		}
		if (do_online)
			generic_online_page(page, order);
		else
			virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order,
						    false);
		rcu_read_unlock();
		return;
	}
	rcu_read_unlock();

	/* not virtio-mem memory, but e.g., a DIMM. online it */
	generic_online_page(page, order);
}

static int virtio_mem_send_request(struct virtio_mem *vm,
				   const struct virtio_mem_req *req)
{
	struct scatterlist *sgs[2], sg_req, sg_resp;
	unsigned int len;
	int rc;

	/* don't use the request residing on the stack (vaddr) */
	vm->req = *req;

	/* out: buffer for request */
	sg_init_one(&sg_req, &vm->req, sizeof(vm->req));
	sgs[0] = &sg_req;

	/* in: buffer for response */
	sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp));
	sgs[1] = &sg_resp;

	rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL);
	if (rc < 0)
		return rc;

	virtqueue_kick(vm->vq);

	/* wait for a response */
	wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len));

	return virtio16_to_cpu(vm->vdev, vm->resp.type);
}
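/*
 * Note (editorial): there is a single request/response buffer per device
 * (vm->req, vm->resp), so the helpers below appear to rely on requests
 * being issued one at a time (e.g., from the workqueue) rather than on
 * additional locking.
 */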
static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
					uint64_t size)
{
	const uint64_t nb_vm_blocks = size / vm->device_block_size;
	const struct virtio_mem_req req = {
		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG),
		.u.plug.addr = cpu_to_virtio64(vm->vdev, addr),
		.u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
	};
	int rc = -ENOMEM;

	if (atomic_read(&vm->config_changed))
		return -EAGAIN;

	dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);

	switch (virtio_mem_send_request(vm, &req)) {
	case VIRTIO_MEM_RESP_ACK:
		vm->plugged_size += size;
		return 0;
	case VIRTIO_MEM_RESP_NACK:
		rc = -EAGAIN;
		break;
	case VIRTIO_MEM_RESP_BUSY:
		rc = -ETXTBSY;
		break;
	case VIRTIO_MEM_RESP_ERROR:
		rc = -EINVAL;
		break;
	default:
		break;
	}

	dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc);
	return rc;
}

static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
					  uint64_t size)
{
	const uint64_t nb_vm_blocks = size / vm->device_block_size;
	const struct virtio_mem_req req = {
		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG),
		.u.unplug.addr = cpu_to_virtio64(vm->vdev, addr),
		.u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
	};
	int rc = -ENOMEM;

	if (atomic_read(&vm->config_changed))
		return -EAGAIN;

	dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);

	switch (virtio_mem_send_request(vm, &req)) {
	case VIRTIO_MEM_RESP_ACK:
		vm->plugged_size -= size;
		return 0;
	case VIRTIO_MEM_RESP_BUSY:
		rc = -ETXTBSY;
		break;
	case VIRTIO_MEM_RESP_ERROR:
		rc = -EINVAL;
		break;
	default:
		break;
	}

	dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc);
	return rc;
}

static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
{
	const struct virtio_mem_req req = {
		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL),
	};
	int rc = -ENOMEM;

	dev_dbg(&vm->vdev->dev, "unplugging all memory");

	switch (virtio_mem_send_request(vm, &req)) {
	case VIRTIO_MEM_RESP_ACK:
		vm->unplug_all_required = false;
		vm->plugged_size = 0;
		/* usable region might have shrunk */
		atomic_set(&vm->config_changed, 1);
		return 0;
	case VIRTIO_MEM_RESP_BUSY:
		rc = -ETXTBSY;
		break;
	default:
		break;
	}

	dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc);
	return rc;
}

/*
 * Plug selected subblocks. Updates the plugged state, but not the state
 * of the memory block.
 */
static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
				  int sb_id, int count)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
			      sb_id * vm->sbm.sb_size;
	const uint64_t size = count * vm->sbm.sb_size;
	int rc;

	rc = virtio_mem_send_plug_request(vm, addr, size);
	if (!rc)
		virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
	return rc;
}

/*
 * Unplug selected subblocks. Updates the plugged state, but not the state
 * of the memory block.
 */
static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
				    int sb_id, int count)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
			      sb_id * vm->sbm.sb_size;
	const uint64_t size = count * vm->sbm.sb_size;
	int rc;

	rc = virtio_mem_send_unplug_request(vm, addr, size);
	if (!rc)
		virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count);
	return rc;
}

/*
 * Request to unplug a big block.
 *
 * Will not modify the state of the big block.
 */
static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id)
{
	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
	const uint64_t size = vm->bbm.bb_size;

	return virtio_mem_send_unplug_request(vm, addr, size);
}

/*
 * Request to plug a big block.
 *
 * Will not modify the state of the big block.
 */
static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id)
{
	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
	const uint64_t size = vm->bbm.bb_size;

	return virtio_mem_send_plug_request(vm, addr, size);
}

/*
 * Unplug the desired number of plugged subblocks of an offline or not-added
 * memory block. Will fail if any subblock cannot get unplugged (instead of
 * skipping it).
 *
 * Will not modify the state of the memory block.
 *
 * Note: can fail after some subblocks were unplugged.
 */
static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
					unsigned long mb_id, uint64_t *nb_sb)
{
	int sb_id, count;
	int rc;

	sb_id = vm->sbm.sbs_per_mb - 1;
	while (*nb_sb) {
		/* Find the next candidate subblock */
		while (sb_id >= 0 &&
		       virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1))
			sb_id--;
		if (sb_id < 0)
			break;
		/* Try to unplug multiple subblocks at a time */
		count = 1;
		while (count < *nb_sb && sb_id > 0 &&
		       virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
			count++;
			sb_id--;
		}

		rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
		if (rc)
			return rc;
		*nb_sb -= count;
		sb_id--;
	}

	return 0;
}

/*
 * Unplug all plugged subblocks of an offline or not-added memory block.
 *
 * Will not modify the state of the memory block.
 *
 * Note: can fail after some subblocks were unplugged.
 */
static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id)
{
	uint64_t nb_sb = vm->sbm.sbs_per_mb;

	return virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb);
}
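/*
 * Note: virtio_mem_sbm_unplug_any_sb() scans top-down and batches
 * consecutive plugged subblocks into a single unplug request, keeping the
 * number of guest->host round trips low.
 */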
/*
 * Prepare tracking data for the next memory block.
 */
static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm,
					  unsigned long *mb_id)
{
	int rc;

	if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id)
		return -ENOSPC;

	/* Resize the state array if required. */
	rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm);
	if (rc)
		return rc;

	/* Resize the subblock bitmap if required. */
	rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm);
	if (rc)
		return rc;

	vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++;
	*mb_id = vm->sbm.next_mb_id++;
	return 0;
}

/*
 * Try to plug the desired number of subblocks and add the memory block
 * to Linux.
 *
 * Will modify the state of the memory block.
 */
static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
					  unsigned long mb_id, uint64_t *nb_sb)
{
	const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb);
	int rc;

	if (WARN_ON_ONCE(!count))
		return -EINVAL;

	/*
	 * Plug the requested number of subblocks before adding it to linux,
	 * so that onlining will directly online all plugged subblocks.
	 */
	rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count);
	if (rc)
		return rc;

	/*
	 * Mark the block properly offline before adding it to Linux,
	 * so the memory notifiers will find the block in the right state.
	 */
	if (count == vm->sbm.sbs_per_mb)
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_OFFLINE);
	else
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);

	/* Add the memory block to linux - if that fails, try to unplug. */
	rc = virtio_mem_sbm_add_mb(vm, mb_id);
	if (rc) {
		int new_state = VIRTIO_MEM_SBM_MB_UNUSED;

		if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count))
			new_state = VIRTIO_MEM_SBM_MB_PLUGGED;
		virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
		return rc;
	}

	*nb_sb -= count;
	return 0;
}

/*
 * Try to plug the desired number of subblocks of a memory block that
 * is already added to Linux.
 *
 * Will modify the state of the memory block.
 *
 * Note: Can fail after some subblocks were successfully plugged.
 */
static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
				      unsigned long mb_id, uint64_t *nb_sb,
				      bool online)
{
	unsigned long pfn, nr_pages;
	int sb_id, count;
	int rc;

	if (WARN_ON_ONCE(!*nb_sb))
		return -EINVAL;

	while (*nb_sb) {
		sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id);
		if (sb_id >= vm->sbm.sbs_per_mb)
			break;
		count = 1;
		while (count < *nb_sb &&
		       sb_id + count < vm->sbm.sbs_per_mb &&
		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1))
			count++;

		rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count);
		if (rc)
			return rc;
		*nb_sb -= count;
		if (!online)
			continue;

		/* fake-online the pages if the memory block is online */
		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
			       sb_id * vm->sbm.sb_size);
		nr_pages = PFN_DOWN(count * vm->sbm.sb_size);
		virtio_mem_fake_online(pfn, nr_pages);
	}

	if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
		if (online)
			virtio_mem_sbm_set_mb_state(vm, mb_id,
						    VIRTIO_MEM_SBM_MB_ONLINE);
		else
			virtio_mem_sbm_set_mb_state(vm, mb_id,
						    VIRTIO_MEM_SBM_MB_OFFLINE);
	}

	return 0;
}
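/*
 * Plug requests below are served in order of increasing cost: first fill
 * partially plugged online blocks (no onlining needed), then partially
 * plugged offline blocks, and only then add unused or newly prepared
 * blocks to Linux.
 */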
static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
{
	uint64_t nb_sb = diff / vm->sbm.sb_size;
	unsigned long mb_id;
	int rc;

	if (!nb_sb)
		return 0;

	/* Don't race with onlining/offlining */
	mutex_lock(&vm->hotplug_mutex);

	/* Try to plug subblocks of partially plugged online blocks. */
	virtio_mem_sbm_for_each_mb(vm, mb_id,
				   VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) {
		rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, true);
		if (rc || !nb_sb)
			goto out_unlock;
		cond_resched();
	}

	/* Try to plug subblocks of partially plugged offline blocks. */
	virtio_mem_sbm_for_each_mb(vm, mb_id,
				   VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
		rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, false);
		if (rc || !nb_sb)
			goto out_unlock;
		cond_resched();
	}

	/*
	 * We won't be working on online/offline memory blocks from this point,
	 * so we can't race with memory onlining/offlining. Drop the mutex.
	 */
	mutex_unlock(&vm->hotplug_mutex);

	/* Try to plug and add unused blocks */
	virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) {
		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
			return -ENOSPC;

		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
		if (rc || !nb_sb)
			return rc;
		cond_resched();
	}

	/* Try to prepare, plug and add new blocks */
	while (nb_sb) {
		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
			return -ENOSPC;

		rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id);
		if (rc)
			return rc;
		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
		if (rc)
			return rc;
		cond_resched();
	}

	return 0;
out_unlock:
	mutex_unlock(&vm->hotplug_mutex);
	return rc;
}

/*
 * Plug a big block and add it to Linux.
 *
 * Will modify the state of the big block.
 */
static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm,
					  unsigned long bb_id)
{
	int rc;

	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
			 VIRTIO_MEM_BBM_BB_UNUSED))
		return -EINVAL;

	rc = virtio_mem_bbm_plug_bb(vm, bb_id);
	if (rc)
		return rc;
	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);

	rc = virtio_mem_bbm_add_bb(vm, bb_id);
	if (rc) {
		if (!virtio_mem_bbm_unplug_bb(vm, bb_id))
			virtio_mem_bbm_set_bb_state(vm, bb_id,
						    VIRTIO_MEM_BBM_BB_UNUSED);
		else
			/* Retry from the main loop. */
			virtio_mem_bbm_set_bb_state(vm, bb_id,
						    VIRTIO_MEM_BBM_BB_PLUGGED);
		return rc;
	}
	return 0;
}

/*
 * Prepare tracking data for the next big block.
 */
static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm,
					  unsigned long *bb_id)
{
	int rc;

	if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id)
		return -ENOSPC;

	/* Resize the big block state array if required. */
	rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm);
	if (rc)
		return rc;

	vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++;
	*bb_id = vm->bbm.next_bb_id;
	vm->bbm.next_bb_id++;
	return 0;
}

static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff)
{
	uint64_t nb_bb = diff / vm->bbm.bb_size;
	unsigned long bb_id;
	int rc;

	if (!nb_bb)
		return 0;

	/* Try to plug and add unused big blocks */
	virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) {
		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
			return -ENOSPC;

		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
		if (!rc)
			nb_bb--;
		if (rc || !nb_bb)
			return rc;
		cond_resched();
	}

	/* Try to prepare, plug and add new big blocks */
	while (nb_bb) {
		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
			return -ENOSPC;

		rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id);
		if (rc)
			return rc;
		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
		if (!rc)
			nb_bb--;
		if (rc)
			return rc;
		cond_resched();
	}

	return 0;
}
/*
 * Try to plug the requested amount of memory.
 */
static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
{
	if (vm->in_sbm)
		return virtio_mem_sbm_plug_request(vm, diff);
	return virtio_mem_bbm_plug_request(vm, diff);
}

/*
 * Unplug the desired number of plugged subblocks of an offline memory block.
 * Will fail if any subblock cannot get unplugged (instead of skipping it).
 *
 * Will modify the state of the memory block. Might temporarily drop the
 * hotplug_mutex.
 *
 * Note: Can fail after some subblocks were successfully unplugged.
 */
static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm,
						unsigned long mb_id,
						uint64_t *nb_sb)
{
	int rc;

	rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, nb_sb);

	/* some subblocks might have been unplugged even on failure */
	if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
	if (rc)
		return rc;

	if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
		/*
		 * Remove the block from Linux - this should never fail.
		 * Hinder the block from getting onlined by marking it
		 * unplugged. Temporarily drop the mutex, so
		 * any pending GOING_ONLINE requests can be serviced/rejected.
		 */
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_UNUSED);

		mutex_unlock(&vm->hotplug_mutex);
		rc = virtio_mem_sbm_remove_mb(vm, mb_id);
		BUG_ON(rc);
		mutex_lock(&vm->hotplug_mutex);
	}
	return 0;
}

/*
 * Unplug the given plugged subblocks of an online memory block.
 *
 * Will modify the state of the memory block.
 */
static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm,
					   unsigned long mb_id, int sb_id,
					   int count)
{
	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count;
	unsigned long start_pfn;
	int rc;

	start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
			     sb_id * vm->sbm.sb_size);

	rc = virtio_mem_fake_offline(start_pfn, nr_pages);
	if (rc)
		return rc;

	/* Try to unplug the allocated memory */
	rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
	if (rc) {
		/* Return the memory to the buddy. */
		virtio_mem_fake_online(start_pfn, nr_pages);
		return rc;
	}

	virtio_mem_sbm_set_mb_state(vm, mb_id,
				    VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL);
	return 0;
}
/*
 * Unplug the desired number of plugged subblocks of an online memory block.
 * Will skip subblocks that are busy.
 *
 * Will modify the state of the memory block. Might temporarily drop the
 * hotplug_mutex.
 *
 * Note: Can fail after some subblocks were successfully unplugged. Can
 * return 0 even if subblocks were busy and could not get unplugged.
 */
static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm,
					       unsigned long mb_id,
					       uint64_t *nb_sb)
{
	int rc, sb_id;

	/* If possible, try to unplug the complete block in one shot. */
	if (*nb_sb >= vm->sbm.sbs_per_mb &&
	    virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0,
						     vm->sbm.sbs_per_mb);
		if (!rc) {
			*nb_sb -= vm->sbm.sbs_per_mb;
			goto unplugged;
		} else if (rc != -EBUSY)
			return rc;
	}

	/* Fallback to single subblocks. */
	for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
		/* Find the next candidate subblock */
		while (sb_id >= 0 &&
		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
			sb_id--;
		if (sb_id < 0)
			break;

		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1);
		if (rc == -EBUSY)
			continue;
		else if (rc)
			return rc;
		*nb_sb -= 1;
	}

unplugged:
	/*
	 * Once all subblocks of a memory block were unplugged, offline and
	 * remove it. This will usually not fail, as no memory is in use
	 * anymore - however some other notifiers might NACK the request.
	 */
	if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
		mutex_unlock(&vm->hotplug_mutex);
		rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id);
		mutex_lock(&vm->hotplug_mutex);
		if (!rc)
			virtio_mem_sbm_set_mb_state(vm, mb_id,
						    VIRTIO_MEM_SBM_MB_UNUSED);
	}

	return 0;
}

static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
	uint64_t nb_sb = diff / vm->sbm.sb_size;
	unsigned long mb_id;
	int rc;

	if (!nb_sb)
		return 0;

	/*
	 * We'll drop the mutex a couple of times when it is safe to do so.
	 * This might result in some blocks switching the state (online/offline)
	 * and we could miss them in this run - we will retry again later.
	 */
	mutex_lock(&vm->hotplug_mutex);

	/* Try to unplug subblocks of partially plugged offline blocks. */
	virtio_mem_sbm_for_each_mb_rev(vm, mb_id,
				       VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
		rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb);
		if (rc || !nb_sb)
			goto out_unlock;
		cond_resched();
	}

	/* Try to unplug subblocks of plugged offline blocks. */
	virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE) {
		rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb);
		if (rc || !nb_sb)
			goto out_unlock;
		cond_resched();
	}

	if (!unplug_online) {
		mutex_unlock(&vm->hotplug_mutex);
		return 0;
	}

	/* Try to unplug subblocks of partially plugged online blocks. */
	virtio_mem_sbm_for_each_mb_rev(vm, mb_id,
				       VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) {
		rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb);
		if (rc || !nb_sb)
			goto out_unlock;
		mutex_unlock(&vm->hotplug_mutex);
		cond_resched();
		mutex_lock(&vm->hotplug_mutex);
	}

	/* Try to unplug subblocks of plugged online blocks. */
	virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_ONLINE) {
		rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb);
		if (rc || !nb_sb)
			goto out_unlock;
		mutex_unlock(&vm->hotplug_mutex);
		cond_resched();
		mutex_lock(&vm->hotplug_mutex);
	}

	mutex_unlock(&vm->hotplug_mutex);
	return nb_sb ? -EBUSY : 0;
out_unlock:
	mutex_unlock(&vm->hotplug_mutex);
	return rc;
}
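/*
 * The SBM unplug path above prefers the cheapest memory first: offline
 * blocks (nothing to migrate), starting with partially plugged ones,
 * before falling back to fake-offlining and unplugging online memory.
 */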
 * Will fail with -EBUSY if some memory is busy and cannot get unplugged.
 *
 * Will modify the state of the memory block. Might temporarily drop the
 * hotplug_mutex.
 */
static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
                                                       unsigned long bb_id)
{
        const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
        const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
        unsigned long end_pfn = start_pfn + nr_pages;
        unsigned long pfn;
        struct page *page;
        int rc;

        if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
                         VIRTIO_MEM_BBM_BB_ADDED))
                return -EINVAL;

        if (bbm_safe_unplug) {
                /*
                 * Start by fake-offlining all memory. Once we've marked the
                 * device block as fake-offline, all newly onlined memory will
                 * automatically be kept fake-offline. Protect from concurrent
                 * onlining/offlining until we have a consistent state.
                 */
                mutex_lock(&vm->hotplug_mutex);
                virtio_mem_bbm_set_bb_state(vm, bb_id,
                                            VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);

                for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
                        page = pfn_to_online_page(pfn);
                        if (!page)
                                continue;

                        rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION);
                        if (rc) {
                                end_pfn = pfn;
                                goto rollback_safe_unplug;
                        }
                }
                mutex_unlock(&vm->hotplug_mutex);
        }

        rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
        if (rc) {
                if (bbm_safe_unplug) {
                        mutex_lock(&vm->hotplug_mutex);
                        goto rollback_safe_unplug;
                }
                return rc;
        }

        rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
        if (rc)
                virtio_mem_bbm_set_bb_state(vm, bb_id,
                                            VIRTIO_MEM_BBM_BB_PLUGGED);
        else
                virtio_mem_bbm_set_bb_state(vm, bb_id,
                                            VIRTIO_MEM_BBM_BB_UNUSED);
        return rc;

rollback_safe_unplug:
        for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
                page = pfn_to_online_page(pfn);
                if (!page)
                        continue;
                virtio_mem_fake_online(pfn, PAGES_PER_SECTION);
        }
        virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
        mutex_unlock(&vm->hotplug_mutex);
        return rc;
}

/*
 * Try to remove a big block from Linux and unplug it. Will fail with
 * -EBUSY if some memory is online.
 *
 * Will modify the state of the memory block.
 */
static int virtio_mem_bbm_remove_and_unplug_bb(struct virtio_mem *vm,
                                               unsigned long bb_id)
{
        int rc;

        if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
                         VIRTIO_MEM_BBM_BB_ADDED))
                return -EINVAL;

        rc = virtio_mem_bbm_remove_bb(vm, bb_id);
        if (rc)
                return -EBUSY;

        rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
        if (rc)
                virtio_mem_bbm_set_bb_state(vm, bb_id,
                                            VIRTIO_MEM_BBM_BB_PLUGGED);
        else
                virtio_mem_bbm_set_bb_state(vm, bb_id,
                                            VIRTIO_MEM_BBM_BB_UNUSED);
        return rc;
}

/*
 * Test if a big block is completely offline.
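 * Checking a single pfn per memory section is sufficient, as memory is
 * onlined/offlined with (at least) memory section granularity.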
 */
static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm,
                                         unsigned long bb_id)
{
        const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
        const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
        unsigned long pfn;

        for (pfn = start_pfn; pfn < start_pfn + nr_pages;
             pfn += PAGES_PER_SECTION) {
                if (pfn_to_online_page(pfn))
                        return false;
        }

        return true;
}

static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
        uint64_t nb_bb = diff / vm->bbm.bb_size;
        uint64_t bb_id;
        int rc;

        if (!nb_bb)
                return 0;

        /* Try to unplug completely offline big blocks first. */
        virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
                cond_resched();
                /*
                 * As we're holding no locks, this check is racy as memory
                 * can get onlined in the meantime - but we'll fail gracefully.
                 */
                if (!virtio_mem_bbm_bb_is_offline(vm, bb_id))
                        continue;
                rc = virtio_mem_bbm_remove_and_unplug_bb(vm, bb_id);
                if (rc == -EBUSY)
                        continue;
                if (!rc)
                        nb_bb--;
                if (rc || !nb_bb)
                        return rc;
        }

        if (!unplug_online)
                return 0;

        /* Try to unplug any big blocks. */
        virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
                cond_resched();
                rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id);
                if (rc == -EBUSY)
                        continue;
                if (!rc)
                        nb_bb--;
                if (rc || !nb_bb)
                        return rc;
        }

        return nb_bb ? -EBUSY : 0;
}

/*
 * Try to unplug the requested amount of memory.
 */
static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
        if (vm->in_sbm)
                return virtio_mem_sbm_unplug_request(vm, diff);
        return virtio_mem_bbm_unplug_request(vm, diff);
}

/*
 * Try to unplug all blocks that couldn't be unplugged before, for example,
 * because the hypervisor was busy.
 */
static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm)
{
        unsigned long id;
        int rc;

        if (!vm->in_sbm) {
                virtio_mem_bbm_for_each_bb(vm, id,
                                           VIRTIO_MEM_BBM_BB_PLUGGED) {
                        rc = virtio_mem_bbm_unplug_bb(vm, id);
                        if (rc)
                                return rc;
                        virtio_mem_bbm_set_bb_state(vm, id,
                                                    VIRTIO_MEM_BBM_BB_UNUSED);
                }
                return 0;
        }

        virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) {
                rc = virtio_mem_sbm_unplug_mb(vm, id);
                if (rc)
                        return rc;
                virtio_mem_sbm_set_mb_state(vm, id,
                                            VIRTIO_MEM_SBM_MB_UNUSED);
        }

        return 0;
}

/*
 * Update all parts of the config that could have changed.
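 * This re-reads plugged_size, usable_region_size and requested_size from
 * the device config and recomputes the last usable block id; if the end of
 * the usable region (capped by the pluggable range) is not aligned to the
 * block size, the partially usable last block is excluded.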
 */
static void virtio_mem_refresh_config(struct virtio_mem *vm)
{
        const struct range pluggable_range = mhp_get_pluggable_range(true);
        uint64_t new_plugged_size, usable_region_size, end_addr;

        /* the plugged_size is just a reflection of what _we_ did previously */
        virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
                        &new_plugged_size);
        if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size))
                vm->plugged_size = new_plugged_size;

        /* calculate the last usable memory block id */
        virtio_cread_le(vm->vdev, struct virtio_mem_config,
                        usable_region_size, &usable_region_size);
        end_addr = min(vm->addr + usable_region_size - 1,
                       pluggable_range.end);

        if (vm->in_sbm) {
                vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr);
                if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes()))
                        vm->sbm.last_usable_mb_id--;
        } else {
                vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm,
                                                                     end_addr);
                if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size))
                        vm->bbm.last_usable_bb_id--;
        }
        /*
         * If we cannot plug any of our device memory (e.g., nothing in the
         * usable region is addressable), the last usable memory block id will
         * be smaller than the first usable memory block id. We'll stop
         * attempting to add memory with -ENOSPC from our main loop.
         */

        /* see if there is a request to change the size */
        virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size,
                        &vm->requested_size);

        dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size);
        dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size);
}

/*
 * Workqueue function for handling plug/unplug requests and config updates.
 */
static void virtio_mem_run_wq(struct work_struct *work)
{
        struct virtio_mem *vm = container_of(work, struct virtio_mem, wq);
        uint64_t diff;
        int rc;

        hrtimer_cancel(&vm->retry_timer);

        if (vm->broken)
                return;

        atomic_set(&vm->wq_active, 1);
retry:
        rc = 0;

        /* Make sure we start with a clean state if there are leftovers. */
        if (unlikely(vm->unplug_all_required))
                rc = virtio_mem_send_unplug_all_request(vm);

        if (atomic_read(&vm->config_changed)) {
                atomic_set(&vm->config_changed, 0);
                virtio_mem_refresh_config(vm);
        }

        /* Unplug any leftovers from previous runs */
        if (!rc)
                rc = virtio_mem_unplug_pending_mb(vm);

        if (!rc && vm->requested_size != vm->plugged_size) {
                if (vm->requested_size > vm->plugged_size) {
                        diff = vm->requested_size - vm->plugged_size;
                        rc = virtio_mem_plug_request(vm, diff);
                } else {
                        diff = vm->plugged_size - vm->requested_size;
                        rc = virtio_mem_unplug_request(vm, diff);
                }
        }

        switch (rc) {
        case 0:
                vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
                break;
        case -ENOSPC:
                /*
                 * We cannot add any more memory (alignment, physical limit)
                 * or we have too many offline memory blocks.
                 */
                break;
        case -ETXTBSY:
                /*
                 * The hypervisor cannot process our request right now
                 * (e.g., out of memory, migrating).
                 */
        case -EBUSY:
                /*
                 * We cannot free up any memory to unplug it (all plugged
                 * memory is busy).
                 */
        case -ENOMEM:
                /*
                 * Out of memory, try again later.
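                 * The retry timer implements exponential backoff:
                 * virtio_mem_timer_expired() below doubles retry_timer_ms on
                 * every expiration, up to VIRTIO_MEM_RETRY_TIMER_MAX_MS.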
                 */
                hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms),
                              HRTIMER_MODE_REL);
                break;
        case -EAGAIN:
                /* Retry immediately (e.g., the config changed). */
                goto retry;
        default:
                /* Unknown error, mark as broken */
                dev_err(&vm->vdev->dev,
                        "unknown error, marking device broken: %d\n", rc);
                vm->broken = true;
        }

        atomic_set(&vm->wq_active, 0);
}

static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
{
        struct virtio_mem *vm = container_of(timer, struct virtio_mem,
                                             retry_timer);

        virtio_mem_retry(vm);
        vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2,
                                   VIRTIO_MEM_RETRY_TIMER_MAX_MS);
        return HRTIMER_NORESTART;
}

static void virtio_mem_handle_response(struct virtqueue *vq)
{
        struct virtio_mem *vm = vq->vdev->priv;

        wake_up(&vm->host_resp);
}

static int virtio_mem_init_vq(struct virtio_mem *vm)
{
        struct virtqueue *vq;

        vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response,
                                   "guest-request");
        if (IS_ERR(vq))
                return PTR_ERR(vq);
        vm->vq = vq;

        return 0;
}

static int virtio_mem_init(struct virtio_mem *vm)
{
        const struct range pluggable_range = mhp_get_pluggable_range(true);
        uint64_t sb_size, addr;
        uint16_t node_id;

        if (!vm->vdev->config->get) {
                dev_err(&vm->vdev->dev, "config access disabled\n");
                return -EINVAL;
        }

        /*
         * We don't want to (un)plug or reuse any memory when in kdump. The
         * memory is still accessible (but not mapped).
         */
        if (is_kdump_kernel()) {
                dev_warn(&vm->vdev->dev, "disabled in kdump kernel\n");
                return -EBUSY;
        }

        /* Fetch all properties that can't change. */
        virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
                        &vm->plugged_size);
        virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
                        &vm->device_block_size);
        virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
                        &node_id);
        vm->nid = virtio_mem_translate_node_id(vm, node_id);
        virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
        virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
                        &vm->region_size);

        /* Determine the nid for the device based on the lowest address. */
        if (vm->nid == NUMA_NO_NODE)
                vm->nid = memory_add_physaddr_to_nid(vm->addr);

        /* bad device setup - warn only */
        if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
                dev_warn(&vm->vdev->dev,
                         "The alignment of the physical start address can make some memory unusable.\n");
        if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes()))
                dev_warn(&vm->vdev->dev,
                         "The alignment of the physical end address can make some memory unusable.\n");
        if (vm->addr < pluggable_range.start ||
            vm->addr + vm->region_size - 1 > pluggable_range.end)
                dev_warn(&vm->vdev->dev,
                         "Some device memory is not addressable/pluggable. This can make some memory unusable.\n");

        /*
         * We want subblocks to span at least MAX_ORDER_NR_PAGES and
         * pageblock_nr_pages pages. This:
         * - Simplifies our page onlining code (virtio_mem_online_page_cb)
         *   and fake page onlining code (virtio_mem_fake_online).
         * - Is required for now for alloc_contig_range() to work reliably -
         *   it doesn't properly handle smaller granularity on ZONE_NORMAL.
         */
        sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES,
                        pageblock_nr_pages) * PAGE_SIZE;
        sb_size = max_t(uint64_t, vm->device_block_size, sb_size);

        if (sb_size < memory_block_size_bytes() && !force_bbm) {
                /* SBM: At least two subblocks per Linux memory block. */
                vm->in_sbm = true;
                vm->sbm.sb_size = sb_size;
                vm->sbm.sbs_per_mb = memory_block_size_bytes() /
                                     vm->sbm.sb_size;

                /* Round up to the next full memory block */
                addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
                       memory_block_size_bytes() - 1;
                vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr);
                vm->sbm.next_mb_id = vm->sbm.first_mb_id;
        } else {
                /* BBM: At least one Linux memory block. */
                vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size,
                                        memory_block_size_bytes());

                if (bbm_block_size) {
                        if (!is_power_of_2(bbm_block_size)) {
                                dev_warn(&vm->vdev->dev,
                                         "bbm_block_size is not a power of 2");
                        } else if (bbm_block_size < vm->bbm.bb_size) {
                                dev_warn(&vm->vdev->dev,
                                         "bbm_block_size is too small");
                        } else {
                                vm->bbm.bb_size = bbm_block_size;
                        }
                }

                /* Round up to the next aligned big block */
                addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
                       vm->bbm.bb_size - 1;
                vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr);
                vm->bbm.next_bb_id = vm->bbm.first_bb_id;
        }

        /* Prepare the offline threshold - make sure we can add two blocks. */
        vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(),
                                      VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);
        /* In BBM, we also want at least two big blocks. */
        vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size,
                                      vm->offline_threshold);

        dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
        dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
        dev_info(&vm->vdev->dev, "device block size: 0x%llx",
                 (unsigned long long)vm->device_block_size);
        dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
                 memory_block_size_bytes());
        if (vm->in_sbm)
                dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
                         (unsigned long long)vm->sbm.sb_size);
        else
                dev_info(&vm->vdev->dev, "big block size: 0x%llx",
                         (unsigned long long)vm->bbm.bb_size);
        if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
                dev_info(&vm->vdev->dev, "nid: %d", vm->nid);

        return 0;
}

static int virtio_mem_create_resource(struct virtio_mem *vm)
{
        /*
         * When force-unloading the driver and removing the device, we
         * could have a garbage pointer. Duplicate the string.
         */
        const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL);

        if (!name)
                return -ENOMEM;

        vm->parent_resource = __request_mem_region(vm->addr, vm->region_size,
                                                   name, IORESOURCE_SYSTEM_RAM);
        if (!vm->parent_resource) {
                kfree(name);
                dev_warn(&vm->vdev->dev, "could not reserve device region\n");
                dev_info(&vm->vdev->dev,
                         "reloading the driver is not supported\n");
                return -EBUSY;
        }

        /*
         * The memory is not actually busy - make add_memory() work.
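         * Clearing IORESOURCE_BUSY on the parent resource below allows
         * child resources (i.e., the memory actually added via
         * add_memory_driver_managed()) to be created within this region.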
         */
        vm->parent_resource->flags &= ~IORESOURCE_BUSY;
        return 0;
}

static void virtio_mem_delete_resource(struct virtio_mem *vm)
{
        const char *name;

        if (!vm->parent_resource)
                return;

        name = vm->parent_resource->name;
        release_resource(vm->parent_resource);
        kfree(vm->parent_resource);
        kfree(name);
        vm->parent_resource = NULL;
}

static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
{
        return 1;
}

static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
{
        const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;

        return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr,
                                   vm->addr + vm->region_size, NULL,
                                   virtio_mem_range_has_system_ram) == 1;
}

static int virtio_mem_probe(struct virtio_device *vdev)
{
        struct virtio_mem *vm;
        int rc;

        BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
        BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10);

        vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL);
        if (!vm)
                return -ENOMEM;

        init_waitqueue_head(&vm->host_resp);
        vm->vdev = vdev;
        INIT_WORK(&vm->wq, virtio_mem_run_wq);
        mutex_init(&vm->hotplug_mutex);
        INIT_LIST_HEAD(&vm->next);
        spin_lock_init(&vm->removal_lock);
        hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        vm->retry_timer.function = virtio_mem_timer_expired;
        vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;

        /* register the virtqueue */
        rc = virtio_mem_init_vq(vm);
        if (rc)
                goto out_free_vm;

        /* initialize the device by querying the config */
        rc = virtio_mem_init(vm);
        if (rc)
                goto out_del_vq;

        /* create the parent resource for all memory */
        rc = virtio_mem_create_resource(vm);
        if (rc)
                goto out_del_vq;

        /*
         * If we still have memory plugged, we have to unplug all memory first.
         * Registering our parent resource makes sure that this memory isn't
         * actually in use (e.g., trying to reload the driver).
         */
        if (vm->plugged_size) {
                vm->unplug_all_required = true;
                dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
        }

        /* register callbacks */
        vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
        rc = register_memory_notifier(&vm->memory_notifier);
        if (rc)
                goto out_del_resource;
        rc = register_virtio_mem_device(vm);
        if (rc)
                goto out_unreg_mem;

        virtio_device_ready(vdev);

        /* trigger a config update to start processing the requested_size */
        atomic_set(&vm->config_changed, 1);
        queue_work(system_freezable_wq, &vm->wq);

        return 0;
out_unreg_mem:
        unregister_memory_notifier(&vm->memory_notifier);
out_del_resource:
        virtio_mem_delete_resource(vm);
out_del_vq:
        vdev->config->del_vqs(vdev);
out_free_vm:
        kfree(vm);
        vdev->priv = NULL;

        return rc;
}

static void virtio_mem_remove(struct virtio_device *vdev)
{
        struct virtio_mem *vm = vdev->priv;
        unsigned long mb_id;
        int rc;

        /*
         * Make sure the workqueue won't be triggered anymore and no memory
         * blocks can be onlined/offlined until we're finished here.
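         * Setting vm->removing under the removal_lock prevents new work
         * from being queued; cancel_work_sync() below then waits for a
         * possibly running workqueue function to finish.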
2637 */ 2638 mutex_lock(&vm->hotplug_mutex); 2639 spin_lock_irq(&vm->removal_lock); 2640 vm->removing = true; 2641 spin_unlock_irq(&vm->removal_lock); 2642 mutex_unlock(&vm->hotplug_mutex); 2643 2644 /* wait until the workqueue stopped */ 2645 cancel_work_sync(&vm->wq); 2646 hrtimer_cancel(&vm->retry_timer); 2647 2648 if (vm->in_sbm) { 2649 /* 2650 * After we unregistered our callbacks, user space can online 2651 * partially plugged offline blocks. Make sure to remove them. 2652 */ 2653 virtio_mem_sbm_for_each_mb(vm, mb_id, 2654 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { 2655 rc = virtio_mem_sbm_remove_mb(vm, mb_id); 2656 BUG_ON(rc); 2657 virtio_mem_sbm_set_mb_state(vm, mb_id, 2658 VIRTIO_MEM_SBM_MB_UNUSED); 2659 } 2660 /* 2661 * After we unregistered our callbacks, user space can no longer 2662 * offline partially plugged online memory blocks. No need to 2663 * worry about them. 2664 */ 2665 } 2666 2667 /* unregister callbacks */ 2668 unregister_virtio_mem_device(vm); 2669 unregister_memory_notifier(&vm->memory_notifier); 2670 2671 /* 2672 * There is no way we could reliably remove all memory we have added to 2673 * the system. And there is no way to stop the driver/device from going 2674 * away. Warn at least. 2675 */ 2676 if (virtio_mem_has_memory_added(vm)) { 2677 dev_warn(&vdev->dev, "device still has system memory added\n"); 2678 } else { 2679 virtio_mem_delete_resource(vm); 2680 kfree_const(vm->resource_name); 2681 } 2682 2683 /* remove all tracking data - no locking needed */ 2684 if (vm->in_sbm) { 2685 vfree(vm->sbm.mb_states); 2686 vfree(vm->sbm.sb_states); 2687 } else { 2688 vfree(vm->bbm.bb_states); 2689 } 2690 2691 /* reset the device and cleanup the queues */ 2692 vdev->config->reset(vdev); 2693 vdev->config->del_vqs(vdev); 2694 2695 kfree(vm); 2696 vdev->priv = NULL; 2697 } 2698 2699 static void virtio_mem_config_changed(struct virtio_device *vdev) 2700 { 2701 struct virtio_mem *vm = vdev->priv; 2702 2703 atomic_set(&vm->config_changed, 1); 2704 virtio_mem_retry(vm); 2705 } 2706 2707 #ifdef CONFIG_PM_SLEEP 2708 static int virtio_mem_freeze(struct virtio_device *vdev) 2709 { 2710 /* 2711 * When restarting the VM, all memory is usually unplugged. Don't 2712 * allow to suspend/hibernate. 2713 */ 2714 dev_err(&vdev->dev, "save/restore not supported.\n"); 2715 return -EPERM; 2716 } 2717 2718 static int virtio_mem_restore(struct virtio_device *vdev) 2719 { 2720 return -EPERM; 2721 } 2722 #endif 2723 2724 static unsigned int virtio_mem_features[] = { 2725 #if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA) 2726 VIRTIO_MEM_F_ACPI_PXM, 2727 #endif 2728 }; 2729 2730 static const struct virtio_device_id virtio_mem_id_table[] = { 2731 { VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID }, 2732 { 0 }, 2733 }; 2734 2735 static struct virtio_driver virtio_mem_driver = { 2736 .feature_table = virtio_mem_features, 2737 .feature_table_size = ARRAY_SIZE(virtio_mem_features), 2738 .driver.name = KBUILD_MODNAME, 2739 .driver.owner = THIS_MODULE, 2740 .id_table = virtio_mem_id_table, 2741 .probe = virtio_mem_probe, 2742 .remove = virtio_mem_remove, 2743 .config_changed = virtio_mem_config_changed, 2744 #ifdef CONFIG_PM_SLEEP 2745 .freeze = virtio_mem_freeze, 2746 .restore = virtio_mem_restore, 2747 #endif 2748 }; 2749 2750 module_virtio_driver(virtio_mem_driver); 2751 MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table); 2752 MODULE_AUTHOR("David Hildenbrand <david@redhat.com>"); 2753 MODULE_DESCRIPTION("Virtio-mem driver"); 2754 MODULE_LICENSE("GPL"); 2755