1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Virtio-mem device driver. 4 * 5 * Copyright Red Hat, Inc. 2020 6 * 7 * Author(s): David Hildenbrand <david@redhat.com> 8 */ 9 10 #include <linux/virtio.h> 11 #include <linux/virtio_mem.h> 12 #include <linux/workqueue.h> 13 #include <linux/slab.h> 14 #include <linux/module.h> 15 #include <linux/mm.h> 16 #include <linux/memory_hotplug.h> 17 #include <linux/memory.h> 18 #include <linux/hrtimer.h> 19 #include <linux/crash_dump.h> 20 #include <linux/mutex.h> 21 #include <linux/bitmap.h> 22 #include <linux/lockdep.h> 23 24 #include <acpi/acpi_numa.h> 25 26 static bool unplug_online = true; 27 module_param(unplug_online, bool, 0644); 28 MODULE_PARM_DESC(unplug_online, "Try to unplug online memory"); 29 30 static bool force_bbm; 31 module_param(force_bbm, bool, 0444); 32 MODULE_PARM_DESC(force_bbm, 33 "Force Big Block Mode. Default is 0 (auto-selection)"); 34 35 static unsigned long bbm_block_size; 36 module_param(bbm_block_size, ulong, 0444); 37 MODULE_PARM_DESC(bbm_block_size, 38 "Big Block size in bytes. Default is 0 (auto-detection)."); 39 40 static bool bbm_safe_unplug = true; 41 module_param(bbm_safe_unplug, bool, 0444); 42 MODULE_PARM_DESC(bbm_safe_unplug, 43 "Use a safe unplug mechanism in BBM, avoiding long/endless loops"); 44 45 /* 46 * virtio-mem currently supports the following modes of operation: 47 * 48 * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The 49 * size of a Sub Block (SB) is determined based on the device block size, the 50 * pageblock size, and the maximum allocation granularity of the buddy. 51 * Subblocks within a Linux memory block might either be plugged or unplugged. 52 * Memory is added/removed to Linux MM in Linux memory block granularity. 53 * 54 * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks. 55 * Memory is added/removed to Linux MM in Big Block granularity. 56 * 57 * The mode is determined automatically based on the Linux memory block size 58 * and the device block size. 59 * 60 * User space / core MM (auto onlining) is responsible for onlining added 61 * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are 62 * always onlined separately, and all memory within a Linux memory block is 63 * onlined to the same zone - virtio-mem relies on this behavior. 64 */ 65 66 /* 67 * State of a Linux memory block in SBM. 68 */ 69 enum virtio_mem_sbm_mb_state { 70 /* Unplugged, not added to Linux. Can be reused later. */ 71 VIRTIO_MEM_SBM_MB_UNUSED = 0, 72 /* (Partially) plugged, not added to Linux. Error on add_memory(). */ 73 VIRTIO_MEM_SBM_MB_PLUGGED, 74 /* Fully plugged, fully added to Linux, offline. */ 75 VIRTIO_MEM_SBM_MB_OFFLINE, 76 /* Partially plugged, fully added to Linux, offline. */ 77 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 78 /* Fully plugged, fully added to Linux, onlined to a kernel zone. */ 79 VIRTIO_MEM_SBM_MB_KERNEL, 80 /* Partially plugged, fully added to Linux, online to a kernel zone */ 81 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, 82 /* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */ 83 VIRTIO_MEM_SBM_MB_MOVABLE, 84 /* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */ 85 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, 86 VIRTIO_MEM_SBM_MB_COUNT 87 }; 88 89 /* 90 * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks. 91 */ 92 enum virtio_mem_bbm_bb_state { 93 /* Unplugged, not added to Linux. Can be reused later. */ 94 VIRTIO_MEM_BBM_BB_UNUSED = 0, 95 /* Plugged, not added to Linux. Error on add_memory(). */ 96 VIRTIO_MEM_BBM_BB_PLUGGED, 97 /* Plugged and added to Linux. */ 98 VIRTIO_MEM_BBM_BB_ADDED, 99 /* All online parts are fake-offline, ready to remove. */ 100 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE, 101 VIRTIO_MEM_BBM_BB_COUNT 102 }; 103 104 struct virtio_mem { 105 struct virtio_device *vdev; 106 107 /* We might first have to unplug all memory when starting up. */ 108 bool unplug_all_required; 109 110 /* Workqueue that processes the plug/unplug requests. */ 111 struct work_struct wq; 112 atomic_t wq_active; 113 atomic_t config_changed; 114 115 /* Virtqueue for guest->host requests. */ 116 struct virtqueue *vq; 117 118 /* Wait for a host response to a guest request. */ 119 wait_queue_head_t host_resp; 120 121 /* Space for one guest request and the host response. */ 122 struct virtio_mem_req req; 123 struct virtio_mem_resp resp; 124 125 /* The current size of the device. */ 126 uint64_t plugged_size; 127 /* The requested size of the device. */ 128 uint64_t requested_size; 129 130 /* The device block size (for communicating with the device). */ 131 uint64_t device_block_size; 132 /* The determined node id for all memory of the device. */ 133 int nid; 134 /* Physical start address of the memory region. */ 135 uint64_t addr; 136 /* Maximum region size in bytes. */ 137 uint64_t region_size; 138 139 /* The parent resource for all memory added via this device. */ 140 struct resource *parent_resource; 141 /* 142 * Copy of "System RAM (virtio_mem)" to be used for 143 * add_memory_driver_managed(). 144 */ 145 const char *resource_name; 146 147 /* 148 * We don't want to add too much memory if it's not getting onlined, 149 * to avoid running OOM. Besides this threshold, we allow to have at 150 * least two offline blocks at a time (whatever is bigger). 151 */ 152 #define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD (1024 * 1024 * 1024) 153 atomic64_t offline_size; 154 uint64_t offline_threshold; 155 156 /* If set, the driver is in SBM, otherwise in BBM. */ 157 bool in_sbm; 158 159 union { 160 struct { 161 /* Id of the first memory block of this device. */ 162 unsigned long first_mb_id; 163 /* Id of the last usable memory block of this device. */ 164 unsigned long last_usable_mb_id; 165 /* Id of the next memory bock to prepare when needed. */ 166 unsigned long next_mb_id; 167 168 /* The subblock size. */ 169 uint64_t sb_size; 170 /* The number of subblocks per Linux memory block. */ 171 uint32_t sbs_per_mb; 172 173 /* Summary of all memory block states. */ 174 unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT]; 175 176 /* 177 * One byte state per memory block. Allocated via 178 * vmalloc(). Resized (alloc+copy+free) on demand. 179 * 180 * With 128 MiB memory blocks, we have states for 512 181 * GiB of memory in one 4 KiB page. 182 */ 183 uint8_t *mb_states; 184 185 /* 186 * Bitmap: one bit per subblock. Allocated similar to 187 * sbm.mb_states. 188 * 189 * A set bit means the corresponding subblock is 190 * plugged, otherwise it's unblocked. 191 * 192 * With 4 MiB subblocks, we manage 128 GiB of memory 193 * in one 4 KiB page. 194 */ 195 unsigned long *sb_states; 196 } sbm; 197 198 struct { 199 /* Id of the first big block of this device. */ 200 unsigned long first_bb_id; 201 /* Id of the last usable big block of this device. */ 202 unsigned long last_usable_bb_id; 203 /* Id of the next device bock to prepare when needed. */ 204 unsigned long next_bb_id; 205 206 /* Summary of all big block states. */ 207 unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT]; 208 209 /* One byte state per big block. See sbm.mb_states. */ 210 uint8_t *bb_states; 211 212 /* The block size used for plugging/adding/removing. */ 213 uint64_t bb_size; 214 } bbm; 215 }; 216 217 /* 218 * Mutex that protects the sbm.mb_count, sbm.mb_states, 219 * sbm.sb_states, bbm.bb_count, and bbm.bb_states 220 * 221 * When this lock is held the pointers can't change, ONLINE and 222 * OFFLINE blocks can't change the state and no subblocks will get 223 * plugged/unplugged. 224 */ 225 struct mutex hotplug_mutex; 226 bool hotplug_active; 227 228 /* An error occurred we cannot handle - stop processing requests. */ 229 bool broken; 230 231 /* The driver is being removed. */ 232 spinlock_t removal_lock; 233 bool removing; 234 235 /* Timer for retrying to plug/unplug memory. */ 236 struct hrtimer retry_timer; 237 unsigned int retry_timer_ms; 238 #define VIRTIO_MEM_RETRY_TIMER_MIN_MS 50000 239 #define VIRTIO_MEM_RETRY_TIMER_MAX_MS 300000 240 241 /* Memory notifier (online/offline events). */ 242 struct notifier_block memory_notifier; 243 244 /* Next device in the list of virtio-mem devices. */ 245 struct list_head next; 246 }; 247 248 /* 249 * We have to share a single online_page callback among all virtio-mem 250 * devices. We use RCU to iterate the list in the callback. 251 */ 252 static DEFINE_MUTEX(virtio_mem_mutex); 253 static LIST_HEAD(virtio_mem_devices); 254 255 static void virtio_mem_online_page_cb(struct page *page, unsigned int order); 256 static void virtio_mem_fake_offline_going_offline(unsigned long pfn, 257 unsigned long nr_pages); 258 static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, 259 unsigned long nr_pages); 260 static void virtio_mem_retry(struct virtio_mem *vm); 261 262 /* 263 * Register a virtio-mem device so it will be considered for the online_page 264 * callback. 265 */ 266 static int register_virtio_mem_device(struct virtio_mem *vm) 267 { 268 int rc = 0; 269 270 /* First device registers the callback. */ 271 mutex_lock(&virtio_mem_mutex); 272 if (list_empty(&virtio_mem_devices)) 273 rc = set_online_page_callback(&virtio_mem_online_page_cb); 274 if (!rc) 275 list_add_rcu(&vm->next, &virtio_mem_devices); 276 mutex_unlock(&virtio_mem_mutex); 277 278 return rc; 279 } 280 281 /* 282 * Unregister a virtio-mem device so it will no longer be considered for the 283 * online_page callback. 284 */ 285 static void unregister_virtio_mem_device(struct virtio_mem *vm) 286 { 287 /* Last device unregisters the callback. */ 288 mutex_lock(&virtio_mem_mutex); 289 list_del_rcu(&vm->next); 290 if (list_empty(&virtio_mem_devices)) 291 restore_online_page_callback(&virtio_mem_online_page_cb); 292 mutex_unlock(&virtio_mem_mutex); 293 294 synchronize_rcu(); 295 } 296 297 /* 298 * Calculate the memory block id of a given address. 299 */ 300 static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr) 301 { 302 return addr / memory_block_size_bytes(); 303 } 304 305 /* 306 * Calculate the physical start address of a given memory block id. 307 */ 308 static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id) 309 { 310 return mb_id * memory_block_size_bytes(); 311 } 312 313 /* 314 * Calculate the big block id of a given address. 315 */ 316 static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm, 317 uint64_t addr) 318 { 319 return addr / vm->bbm.bb_size; 320 } 321 322 /* 323 * Calculate the physical start address of a given big block id. 324 */ 325 static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm, 326 unsigned long bb_id) 327 { 328 return bb_id * vm->bbm.bb_size; 329 } 330 331 /* 332 * Calculate the subblock id of a given address. 333 */ 334 static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm, 335 unsigned long addr) 336 { 337 const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr); 338 const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id); 339 340 return (addr - mb_addr) / vm->sbm.sb_size; 341 } 342 343 /* 344 * Set the state of a big block, taking care of the state counter. 345 */ 346 static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm, 347 unsigned long bb_id, 348 enum virtio_mem_bbm_bb_state state) 349 { 350 const unsigned long idx = bb_id - vm->bbm.first_bb_id; 351 enum virtio_mem_bbm_bb_state old_state; 352 353 old_state = vm->bbm.bb_states[idx]; 354 vm->bbm.bb_states[idx] = state; 355 356 BUG_ON(vm->bbm.bb_count[old_state] == 0); 357 vm->bbm.bb_count[old_state]--; 358 vm->bbm.bb_count[state]++; 359 } 360 361 /* 362 * Get the state of a big block. 363 */ 364 static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm, 365 unsigned long bb_id) 366 { 367 return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id]; 368 } 369 370 /* 371 * Prepare the big block state array for the next big block. 372 */ 373 static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm) 374 { 375 unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id; 376 unsigned long new_bytes = old_bytes + 1; 377 int old_pages = PFN_UP(old_bytes); 378 int new_pages = PFN_UP(new_bytes); 379 uint8_t *new_array; 380 381 if (vm->bbm.bb_states && old_pages == new_pages) 382 return 0; 383 384 new_array = vzalloc(new_pages * PAGE_SIZE); 385 if (!new_array) 386 return -ENOMEM; 387 388 mutex_lock(&vm->hotplug_mutex); 389 if (vm->bbm.bb_states) 390 memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE); 391 vfree(vm->bbm.bb_states); 392 vm->bbm.bb_states = new_array; 393 mutex_unlock(&vm->hotplug_mutex); 394 395 return 0; 396 } 397 398 #define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \ 399 for (_bb_id = vm->bbm.first_bb_id; \ 400 _bb_id < vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \ 401 _bb_id++) \ 402 if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) 403 404 #define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \ 405 for (_bb_id = vm->bbm.next_bb_id - 1; \ 406 _bb_id >= vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \ 407 _bb_id--) \ 408 if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) 409 410 /* 411 * Set the state of a memory block, taking care of the state counter. 412 */ 413 static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm, 414 unsigned long mb_id, uint8_t state) 415 { 416 const unsigned long idx = mb_id - vm->sbm.first_mb_id; 417 uint8_t old_state; 418 419 old_state = vm->sbm.mb_states[idx]; 420 vm->sbm.mb_states[idx] = state; 421 422 BUG_ON(vm->sbm.mb_count[old_state] == 0); 423 vm->sbm.mb_count[old_state]--; 424 vm->sbm.mb_count[state]++; 425 } 426 427 /* 428 * Get the state of a memory block. 429 */ 430 static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm, 431 unsigned long mb_id) 432 { 433 const unsigned long idx = mb_id - vm->sbm.first_mb_id; 434 435 return vm->sbm.mb_states[idx]; 436 } 437 438 /* 439 * Prepare the state array for the next memory block. 440 */ 441 static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm) 442 { 443 int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id); 444 int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1); 445 uint8_t *new_array; 446 447 if (vm->sbm.mb_states && old_pages == new_pages) 448 return 0; 449 450 new_array = vzalloc(new_pages * PAGE_SIZE); 451 if (!new_array) 452 return -ENOMEM; 453 454 mutex_lock(&vm->hotplug_mutex); 455 if (vm->sbm.mb_states) 456 memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE); 457 vfree(vm->sbm.mb_states); 458 vm->sbm.mb_states = new_array; 459 mutex_unlock(&vm->hotplug_mutex); 460 461 return 0; 462 } 463 464 #define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \ 465 for (_mb_id = _vm->sbm.first_mb_id; \ 466 _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \ 467 _mb_id++) \ 468 if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) 469 470 #define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \ 471 for (_mb_id = _vm->sbm.next_mb_id - 1; \ 472 _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \ 473 _mb_id--) \ 474 if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) 475 476 /* 477 * Calculate the bit number in the subblock bitmap for the given subblock 478 * inside the given memory block. 479 */ 480 static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm, 481 unsigned long mb_id, int sb_id) 482 { 483 return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id; 484 } 485 486 /* 487 * Mark all selected subblocks plugged. 488 * 489 * Will not modify the state of the memory block. 490 */ 491 static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm, 492 unsigned long mb_id, int sb_id, 493 int count) 494 { 495 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 496 497 __bitmap_set(vm->sbm.sb_states, bit, count); 498 } 499 500 /* 501 * Mark all selected subblocks unplugged. 502 * 503 * Will not modify the state of the memory block. 504 */ 505 static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm, 506 unsigned long mb_id, int sb_id, 507 int count) 508 { 509 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 510 511 __bitmap_clear(vm->sbm.sb_states, bit, count); 512 } 513 514 /* 515 * Test if all selected subblocks are plugged. 516 */ 517 static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm, 518 unsigned long mb_id, int sb_id, 519 int count) 520 { 521 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 522 523 if (count == 1) 524 return test_bit(bit, vm->sbm.sb_states); 525 526 /* TODO: Helper similar to bitmap_set() */ 527 return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >= 528 bit + count; 529 } 530 531 /* 532 * Test if all selected subblocks are unplugged. 533 */ 534 static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm, 535 unsigned long mb_id, int sb_id, 536 int count) 537 { 538 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 539 540 /* TODO: Helper similar to bitmap_set() */ 541 return find_next_bit(vm->sbm.sb_states, bit + count, bit) >= 542 bit + count; 543 } 544 545 /* 546 * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is 547 * none. 548 */ 549 static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm, 550 unsigned long mb_id) 551 { 552 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0); 553 554 return find_next_zero_bit(vm->sbm.sb_states, 555 bit + vm->sbm.sbs_per_mb, bit) - bit; 556 } 557 558 /* 559 * Prepare the subblock bitmap for the next memory block. 560 */ 561 static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm) 562 { 563 const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id; 564 const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb; 565 const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb; 566 int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long)); 567 int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long)); 568 unsigned long *new_bitmap, *old_bitmap; 569 570 if (vm->sbm.sb_states && old_pages == new_pages) 571 return 0; 572 573 new_bitmap = vzalloc(new_pages * PAGE_SIZE); 574 if (!new_bitmap) 575 return -ENOMEM; 576 577 mutex_lock(&vm->hotplug_mutex); 578 if (new_bitmap) 579 memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE); 580 581 old_bitmap = vm->sbm.sb_states; 582 vm->sbm.sb_states = new_bitmap; 583 mutex_unlock(&vm->hotplug_mutex); 584 585 vfree(old_bitmap); 586 return 0; 587 } 588 589 /* 590 * Test if we could add memory without creating too much offline memory - 591 * to avoid running OOM if memory is getting onlined deferred. 592 */ 593 static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size) 594 { 595 if (WARN_ON_ONCE(size > vm->offline_threshold)) 596 return false; 597 598 return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold; 599 } 600 601 /* 602 * Try adding memory to Linux. Will usually only fail if out of memory. 603 * 604 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 605 * onlining code). 606 * 607 * Will not modify the state of memory blocks in virtio-mem. 608 */ 609 static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr, 610 uint64_t size) 611 { 612 int rc; 613 614 /* 615 * When force-unloading the driver and we still have memory added to 616 * Linux, the resource name has to stay. 617 */ 618 if (!vm->resource_name) { 619 vm->resource_name = kstrdup_const("System RAM (virtio_mem)", 620 GFP_KERNEL); 621 if (!vm->resource_name) 622 return -ENOMEM; 623 } 624 625 dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr, 626 addr + size - 1); 627 /* Memory might get onlined immediately. */ 628 atomic64_add(size, &vm->offline_size); 629 rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name, 630 MHP_MERGE_RESOURCE); 631 if (rc) { 632 atomic64_sub(size, &vm->offline_size); 633 dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc); 634 /* 635 * TODO: Linux MM does not properly clean up yet in all cases 636 * where adding of memory failed - especially on -ENOMEM. 637 */ 638 } 639 return rc; 640 } 641 642 /* 643 * See virtio_mem_add_memory(): Try adding a single Linux memory block. 644 */ 645 static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id) 646 { 647 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 648 const uint64_t size = memory_block_size_bytes(); 649 650 return virtio_mem_add_memory(vm, addr, size); 651 } 652 653 /* 654 * See virtio_mem_add_memory(): Try adding a big block. 655 */ 656 static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id) 657 { 658 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 659 const uint64_t size = vm->bbm.bb_size; 660 661 return virtio_mem_add_memory(vm, addr, size); 662 } 663 664 /* 665 * Try removing memory from Linux. Will only fail if memory blocks aren't 666 * offline. 667 * 668 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 669 * onlining code). 670 * 671 * Will not modify the state of memory blocks in virtio-mem. 672 */ 673 static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr, 674 uint64_t size) 675 { 676 int rc; 677 678 dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr, 679 addr + size - 1); 680 rc = remove_memory(vm->nid, addr, size); 681 if (!rc) { 682 atomic64_sub(size, &vm->offline_size); 683 /* 684 * We might have freed up memory we can now unplug, retry 685 * immediately instead of waiting. 686 */ 687 virtio_mem_retry(vm); 688 } else { 689 dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc); 690 } 691 return rc; 692 } 693 694 /* 695 * See virtio_mem_remove_memory(): Try removing a single Linux memory block. 696 */ 697 static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id) 698 { 699 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 700 const uint64_t size = memory_block_size_bytes(); 701 702 return virtio_mem_remove_memory(vm, addr, size); 703 } 704 705 /* 706 * Try offlining and removing memory from Linux. 707 * 708 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 709 * onlining code). 710 * 711 * Will not modify the state of memory blocks in virtio-mem. 712 */ 713 static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm, 714 uint64_t addr, 715 uint64_t size) 716 { 717 int rc; 718 719 dev_dbg(&vm->vdev->dev, 720 "offlining and removing memory: 0x%llx - 0x%llx\n", addr, 721 addr + size - 1); 722 723 rc = offline_and_remove_memory(vm->nid, addr, size); 724 if (!rc) { 725 atomic64_sub(size, &vm->offline_size); 726 /* 727 * We might have freed up memory we can now unplug, retry 728 * immediately instead of waiting. 729 */ 730 virtio_mem_retry(vm); 731 } else { 732 dev_dbg(&vm->vdev->dev, 733 "offlining and removing memory failed: %d\n", rc); 734 } 735 return rc; 736 } 737 738 /* 739 * See virtio_mem_offline_and_remove_memory(): Try offlining and removing 740 * a single Linux memory block. 741 */ 742 static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm, 743 unsigned long mb_id) 744 { 745 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 746 const uint64_t size = memory_block_size_bytes(); 747 748 return virtio_mem_offline_and_remove_memory(vm, addr, size); 749 } 750 751 /* 752 * See virtio_mem_offline_and_remove_memory(): Try to offline and remove a 753 * all Linux memory blocks covered by the big block. 754 */ 755 static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm, 756 unsigned long bb_id) 757 { 758 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 759 const uint64_t size = vm->bbm.bb_size; 760 761 return virtio_mem_offline_and_remove_memory(vm, addr, size); 762 } 763 764 /* 765 * Trigger the workqueue so the device can perform its magic. 766 */ 767 static void virtio_mem_retry(struct virtio_mem *vm) 768 { 769 unsigned long flags; 770 771 spin_lock_irqsave(&vm->removal_lock, flags); 772 if (!vm->removing) 773 queue_work(system_freezable_wq, &vm->wq); 774 spin_unlock_irqrestore(&vm->removal_lock, flags); 775 } 776 777 static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id) 778 { 779 int node = NUMA_NO_NODE; 780 781 #if defined(CONFIG_ACPI_NUMA) 782 if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM)) 783 node = pxm_to_node(node_id); 784 #endif 785 return node; 786 } 787 788 /* 789 * Test if a virtio-mem device overlaps with the given range. Can be called 790 * from (notifier) callbacks lockless. 791 */ 792 static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start, 793 uint64_t size) 794 { 795 return start < vm->addr + vm->region_size && vm->addr < start + size; 796 } 797 798 /* 799 * Test if a virtio-mem device contains a given range. Can be called from 800 * (notifier) callbacks lockless. 801 */ 802 static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start, 803 uint64_t size) 804 { 805 return start >= vm->addr && start + size <= vm->addr + vm->region_size; 806 } 807 808 static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm, 809 unsigned long mb_id) 810 { 811 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 812 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 813 case VIRTIO_MEM_SBM_MB_OFFLINE: 814 return NOTIFY_OK; 815 default: 816 break; 817 } 818 dev_warn_ratelimited(&vm->vdev->dev, 819 "memory block onlining denied\n"); 820 return NOTIFY_BAD; 821 } 822 823 static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm, 824 unsigned long mb_id) 825 { 826 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 827 case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: 828 case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: 829 virtio_mem_sbm_set_mb_state(vm, mb_id, 830 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 831 break; 832 case VIRTIO_MEM_SBM_MB_KERNEL: 833 case VIRTIO_MEM_SBM_MB_MOVABLE: 834 virtio_mem_sbm_set_mb_state(vm, mb_id, 835 VIRTIO_MEM_SBM_MB_OFFLINE); 836 break; 837 default: 838 BUG(); 839 break; 840 } 841 } 842 843 static void virtio_mem_sbm_notify_online(struct virtio_mem *vm, 844 unsigned long mb_id, 845 unsigned long start_pfn) 846 { 847 const bool is_movable = page_zonenum(pfn_to_page(start_pfn)) == 848 ZONE_MOVABLE; 849 int new_state; 850 851 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 852 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 853 new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL; 854 if (is_movable) 855 new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL; 856 break; 857 case VIRTIO_MEM_SBM_MB_OFFLINE: 858 new_state = VIRTIO_MEM_SBM_MB_KERNEL; 859 if (is_movable) 860 new_state = VIRTIO_MEM_SBM_MB_MOVABLE; 861 break; 862 default: 863 BUG(); 864 break; 865 } 866 virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); 867 } 868 869 static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm, 870 unsigned long mb_id) 871 { 872 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); 873 unsigned long pfn; 874 int sb_id; 875 876 for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { 877 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 878 continue; 879 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 880 sb_id * vm->sbm.sb_size); 881 virtio_mem_fake_offline_going_offline(pfn, nr_pages); 882 } 883 } 884 885 static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm, 886 unsigned long mb_id) 887 { 888 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); 889 unsigned long pfn; 890 int sb_id; 891 892 for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { 893 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 894 continue; 895 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 896 sb_id * vm->sbm.sb_size); 897 virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); 898 } 899 } 900 901 static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm, 902 unsigned long bb_id, 903 unsigned long pfn, 904 unsigned long nr_pages) 905 { 906 /* 907 * When marked as "fake-offline", all online memory of this device block 908 * is allocated by us. Otherwise, we don't have any memory allocated. 909 */ 910 if (virtio_mem_bbm_get_bb_state(vm, bb_id) != 911 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) 912 return; 913 virtio_mem_fake_offline_going_offline(pfn, nr_pages); 914 } 915 916 static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm, 917 unsigned long bb_id, 918 unsigned long pfn, 919 unsigned long nr_pages) 920 { 921 if (virtio_mem_bbm_get_bb_state(vm, bb_id) != 922 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) 923 return; 924 virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); 925 } 926 927 /* 928 * This callback will either be called synchronously from add_memory() or 929 * asynchronously (e.g., triggered via user space). We have to be careful 930 * with locking when calling add_memory(). 931 */ 932 static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, 933 unsigned long action, void *arg) 934 { 935 struct virtio_mem *vm = container_of(nb, struct virtio_mem, 936 memory_notifier); 937 struct memory_notify *mhp = arg; 938 const unsigned long start = PFN_PHYS(mhp->start_pfn); 939 const unsigned long size = PFN_PHYS(mhp->nr_pages); 940 int rc = NOTIFY_OK; 941 unsigned long id; 942 943 if (!virtio_mem_overlaps_range(vm, start, size)) 944 return NOTIFY_DONE; 945 946 if (vm->in_sbm) { 947 id = virtio_mem_phys_to_mb_id(start); 948 /* 949 * In SBM, we add memory in separate memory blocks - we expect 950 * it to be onlined/offlined in the same granularity. Bail out 951 * if this ever changes. 952 */ 953 if (WARN_ON_ONCE(size != memory_block_size_bytes() || 954 !IS_ALIGNED(start, memory_block_size_bytes()))) 955 return NOTIFY_BAD; 956 } else { 957 id = virtio_mem_phys_to_bb_id(vm, start); 958 /* 959 * In BBM, we only care about onlining/offlining happening 960 * within a single big block, we don't care about the 961 * actual granularity as we don't track individual Linux 962 * memory blocks. 963 */ 964 if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1))) 965 return NOTIFY_BAD; 966 } 967 968 /* 969 * Avoid circular locking lockdep warnings. We lock the mutex 970 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The 971 * blocking_notifier_call_chain() has it's own lock, which gets unlocked 972 * between both notifier calls and will bail out. False positive. 973 */ 974 lockdep_off(); 975 976 switch (action) { 977 case MEM_GOING_OFFLINE: 978 mutex_lock(&vm->hotplug_mutex); 979 if (vm->removing) { 980 rc = notifier_from_errno(-EBUSY); 981 mutex_unlock(&vm->hotplug_mutex); 982 break; 983 } 984 vm->hotplug_active = true; 985 if (vm->in_sbm) 986 virtio_mem_sbm_notify_going_offline(vm, id); 987 else 988 virtio_mem_bbm_notify_going_offline(vm, id, 989 mhp->start_pfn, 990 mhp->nr_pages); 991 break; 992 case MEM_GOING_ONLINE: 993 mutex_lock(&vm->hotplug_mutex); 994 if (vm->removing) { 995 rc = notifier_from_errno(-EBUSY); 996 mutex_unlock(&vm->hotplug_mutex); 997 break; 998 } 999 vm->hotplug_active = true; 1000 if (vm->in_sbm) 1001 rc = virtio_mem_sbm_notify_going_online(vm, id); 1002 break; 1003 case MEM_OFFLINE: 1004 if (vm->in_sbm) 1005 virtio_mem_sbm_notify_offline(vm, id); 1006 1007 atomic64_add(size, &vm->offline_size); 1008 /* 1009 * Trigger the workqueue. Now that we have some offline memory, 1010 * maybe we can handle pending unplug requests. 1011 */ 1012 if (!unplug_online) 1013 virtio_mem_retry(vm); 1014 1015 vm->hotplug_active = false; 1016 mutex_unlock(&vm->hotplug_mutex); 1017 break; 1018 case MEM_ONLINE: 1019 if (vm->in_sbm) 1020 virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn); 1021 1022 atomic64_sub(size, &vm->offline_size); 1023 /* 1024 * Start adding more memory once we onlined half of our 1025 * threshold. Don't trigger if it's possibly due to our actipn 1026 * (e.g., us adding memory which gets onlined immediately from 1027 * the core). 1028 */ 1029 if (!atomic_read(&vm->wq_active) && 1030 virtio_mem_could_add_memory(vm, vm->offline_threshold / 2)) 1031 virtio_mem_retry(vm); 1032 1033 vm->hotplug_active = false; 1034 mutex_unlock(&vm->hotplug_mutex); 1035 break; 1036 case MEM_CANCEL_OFFLINE: 1037 if (!vm->hotplug_active) 1038 break; 1039 if (vm->in_sbm) 1040 virtio_mem_sbm_notify_cancel_offline(vm, id); 1041 else 1042 virtio_mem_bbm_notify_cancel_offline(vm, id, 1043 mhp->start_pfn, 1044 mhp->nr_pages); 1045 vm->hotplug_active = false; 1046 mutex_unlock(&vm->hotplug_mutex); 1047 break; 1048 case MEM_CANCEL_ONLINE: 1049 if (!vm->hotplug_active) 1050 break; 1051 vm->hotplug_active = false; 1052 mutex_unlock(&vm->hotplug_mutex); 1053 break; 1054 default: 1055 break; 1056 } 1057 1058 lockdep_on(); 1059 1060 return rc; 1061 } 1062 1063 /* 1064 * Set a range of pages PG_offline. Remember pages that were never onlined 1065 * (via generic_online_page()) using PageDirty(). 1066 */ 1067 static void virtio_mem_set_fake_offline(unsigned long pfn, 1068 unsigned long nr_pages, bool onlined) 1069 { 1070 page_offline_begin(); 1071 for (; nr_pages--; pfn++) { 1072 struct page *page = pfn_to_page(pfn); 1073 1074 __SetPageOffline(page); 1075 if (!onlined) { 1076 SetPageDirty(page); 1077 /* FIXME: remove after cleanups */ 1078 ClearPageReserved(page); 1079 } 1080 } 1081 page_offline_end(); 1082 } 1083 1084 /* 1085 * Clear PG_offline from a range of pages. If the pages were never onlined, 1086 * (via generic_online_page()), clear PageDirty(). 1087 */ 1088 static void virtio_mem_clear_fake_offline(unsigned long pfn, 1089 unsigned long nr_pages, bool onlined) 1090 { 1091 for (; nr_pages--; pfn++) { 1092 struct page *page = pfn_to_page(pfn); 1093 1094 __ClearPageOffline(page); 1095 if (!onlined) 1096 ClearPageDirty(page); 1097 } 1098 } 1099 1100 /* 1101 * Release a range of fake-offline pages to the buddy, effectively 1102 * fake-onlining them. 1103 */ 1104 static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) 1105 { 1106 const unsigned long max_nr_pages = MAX_ORDER_NR_PAGES; 1107 unsigned long i; 1108 1109 /* 1110 * We are always called at least with MAX_ORDER_NR_PAGES 1111 * granularity/alignment (e.g., the way subblocks work). All pages 1112 * inside such a block are alike. 1113 */ 1114 for (i = 0; i < nr_pages; i += max_nr_pages) { 1115 struct page *page = pfn_to_page(pfn + i); 1116 1117 /* 1118 * If the page is PageDirty(), it was kept fake-offline when 1119 * onlining the memory block. Otherwise, it was allocated 1120 * using alloc_contig_range(). All pages in a subblock are 1121 * alike. 1122 */ 1123 if (PageDirty(page)) { 1124 virtio_mem_clear_fake_offline(pfn + i, max_nr_pages, 1125 false); 1126 generic_online_page(page, MAX_ORDER - 1); 1127 } else { 1128 virtio_mem_clear_fake_offline(pfn + i, max_nr_pages, 1129 true); 1130 free_contig_range(pfn + i, max_nr_pages); 1131 adjust_managed_page_count(page, max_nr_pages); 1132 } 1133 } 1134 } 1135 1136 /* 1137 * Try to allocate a range, marking pages fake-offline, effectively 1138 * fake-offlining them. 1139 */ 1140 static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) 1141 { 1142 const bool is_movable = page_zonenum(pfn_to_page(pfn)) == 1143 ZONE_MOVABLE; 1144 int rc, retry_count; 1145 1146 /* 1147 * TODO: We want an alloc_contig_range() mode that tries to allocate 1148 * harder (e.g., dealing with temporarily pinned pages, PCP), especially 1149 * with ZONE_MOVABLE. So for now, retry a couple of times with 1150 * ZONE_MOVABLE before giving up - because that zone is supposed to give 1151 * some guarantees. 1152 */ 1153 for (retry_count = 0; retry_count < 5; retry_count++) { 1154 rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE, 1155 GFP_KERNEL); 1156 if (rc == -ENOMEM) 1157 /* whoops, out of memory */ 1158 return rc; 1159 else if (rc && !is_movable) 1160 break; 1161 else if (rc) 1162 continue; 1163 1164 virtio_mem_set_fake_offline(pfn, nr_pages, true); 1165 adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); 1166 return 0; 1167 } 1168 1169 return -EBUSY; 1170 } 1171 1172 /* 1173 * Handle fake-offline pages when memory is going offline - such that the 1174 * pages can be skipped by mm-core when offlining. 1175 */ 1176 static void virtio_mem_fake_offline_going_offline(unsigned long pfn, 1177 unsigned long nr_pages) 1178 { 1179 struct page *page; 1180 unsigned long i; 1181 1182 /* 1183 * Drop our reference to the pages so the memory can get offlined 1184 * and add the unplugged pages to the managed page counters (so 1185 * offlining code can correctly subtract them again). 1186 */ 1187 adjust_managed_page_count(pfn_to_page(pfn), nr_pages); 1188 /* Drop our reference to the pages so the memory can get offlined. */ 1189 for (i = 0; i < nr_pages; i++) { 1190 page = pfn_to_page(pfn + i); 1191 if (WARN_ON(!page_ref_dec_and_test(page))) 1192 dump_page(page, "fake-offline page referenced"); 1193 } 1194 } 1195 1196 /* 1197 * Handle fake-offline pages when memory offlining is canceled - to undo 1198 * what we did in virtio_mem_fake_offline_going_offline(). 1199 */ 1200 static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, 1201 unsigned long nr_pages) 1202 { 1203 unsigned long i; 1204 1205 /* 1206 * Get the reference we dropped when going offline and subtract the 1207 * unplugged pages from the managed page counters. 1208 */ 1209 adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); 1210 for (i = 0; i < nr_pages; i++) 1211 page_ref_inc(pfn_to_page(pfn + i)); 1212 } 1213 1214 static void virtio_mem_online_page_cb(struct page *page, unsigned int order) 1215 { 1216 const unsigned long addr = page_to_phys(page); 1217 unsigned long id, sb_id; 1218 struct virtio_mem *vm; 1219 bool do_online; 1220 1221 rcu_read_lock(); 1222 list_for_each_entry_rcu(vm, &virtio_mem_devices, next) { 1223 if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order))) 1224 continue; 1225 1226 if (vm->in_sbm) { 1227 /* 1228 * We exploit here that subblocks have at least 1229 * MAX_ORDER_NR_PAGES size/alignment - so we cannot 1230 * cross subblocks within one call. 1231 */ 1232 id = virtio_mem_phys_to_mb_id(addr); 1233 sb_id = virtio_mem_phys_to_sb_id(vm, addr); 1234 do_online = virtio_mem_sbm_test_sb_plugged(vm, id, 1235 sb_id, 1); 1236 } else { 1237 /* 1238 * If the whole block is marked fake offline, keep 1239 * everything that way. 1240 */ 1241 id = virtio_mem_phys_to_bb_id(vm, addr); 1242 do_online = virtio_mem_bbm_get_bb_state(vm, id) != 1243 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE; 1244 } 1245 1246 /* 1247 * virtio_mem_set_fake_offline() might sleep, we don't need 1248 * the device anymore. See virtio_mem_remove() how races 1249 * between memory onlining and device removal are handled. 1250 */ 1251 rcu_read_unlock(); 1252 1253 if (do_online) 1254 generic_online_page(page, order); 1255 else 1256 virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order, 1257 false); 1258 return; 1259 } 1260 rcu_read_unlock(); 1261 1262 /* not virtio-mem memory, but e.g., a DIMM. online it */ 1263 generic_online_page(page, order); 1264 } 1265 1266 static uint64_t virtio_mem_send_request(struct virtio_mem *vm, 1267 const struct virtio_mem_req *req) 1268 { 1269 struct scatterlist *sgs[2], sg_req, sg_resp; 1270 unsigned int len; 1271 int rc; 1272 1273 /* don't use the request residing on the stack (vaddr) */ 1274 vm->req = *req; 1275 1276 /* out: buffer for request */ 1277 sg_init_one(&sg_req, &vm->req, sizeof(vm->req)); 1278 sgs[0] = &sg_req; 1279 1280 /* in: buffer for response */ 1281 sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp)); 1282 sgs[1] = &sg_resp; 1283 1284 rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL); 1285 if (rc < 0) 1286 return rc; 1287 1288 virtqueue_kick(vm->vq); 1289 1290 /* wait for a response */ 1291 wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len)); 1292 1293 return virtio16_to_cpu(vm->vdev, vm->resp.type); 1294 } 1295 1296 static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr, 1297 uint64_t size) 1298 { 1299 const uint64_t nb_vm_blocks = size / vm->device_block_size; 1300 const struct virtio_mem_req req = { 1301 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG), 1302 .u.plug.addr = cpu_to_virtio64(vm->vdev, addr), 1303 .u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), 1304 }; 1305 int rc = -ENOMEM; 1306 1307 if (atomic_read(&vm->config_changed)) 1308 return -EAGAIN; 1309 1310 dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr, 1311 addr + size - 1); 1312 1313 switch (virtio_mem_send_request(vm, &req)) { 1314 case VIRTIO_MEM_RESP_ACK: 1315 vm->plugged_size += size; 1316 return 0; 1317 case VIRTIO_MEM_RESP_NACK: 1318 rc = -EAGAIN; 1319 break; 1320 case VIRTIO_MEM_RESP_BUSY: 1321 rc = -ETXTBSY; 1322 break; 1323 case VIRTIO_MEM_RESP_ERROR: 1324 rc = -EINVAL; 1325 break; 1326 default: 1327 break; 1328 } 1329 1330 dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc); 1331 return rc; 1332 } 1333 1334 static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr, 1335 uint64_t size) 1336 { 1337 const uint64_t nb_vm_blocks = size / vm->device_block_size; 1338 const struct virtio_mem_req req = { 1339 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG), 1340 .u.unplug.addr = cpu_to_virtio64(vm->vdev, addr), 1341 .u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), 1342 }; 1343 int rc = -ENOMEM; 1344 1345 if (atomic_read(&vm->config_changed)) 1346 return -EAGAIN; 1347 1348 dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr, 1349 addr + size - 1); 1350 1351 switch (virtio_mem_send_request(vm, &req)) { 1352 case VIRTIO_MEM_RESP_ACK: 1353 vm->plugged_size -= size; 1354 return 0; 1355 case VIRTIO_MEM_RESP_BUSY: 1356 rc = -ETXTBSY; 1357 break; 1358 case VIRTIO_MEM_RESP_ERROR: 1359 rc = -EINVAL; 1360 break; 1361 default: 1362 break; 1363 } 1364 1365 dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc); 1366 return rc; 1367 } 1368 1369 static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm) 1370 { 1371 const struct virtio_mem_req req = { 1372 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL), 1373 }; 1374 int rc = -ENOMEM; 1375 1376 dev_dbg(&vm->vdev->dev, "unplugging all memory"); 1377 1378 switch (virtio_mem_send_request(vm, &req)) { 1379 case VIRTIO_MEM_RESP_ACK: 1380 vm->unplug_all_required = false; 1381 vm->plugged_size = 0; 1382 /* usable region might have shrunk */ 1383 atomic_set(&vm->config_changed, 1); 1384 return 0; 1385 case VIRTIO_MEM_RESP_BUSY: 1386 rc = -ETXTBSY; 1387 break; 1388 default: 1389 break; 1390 } 1391 1392 dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc); 1393 return rc; 1394 } 1395 1396 /* 1397 * Plug selected subblocks. Updates the plugged state, but not the state 1398 * of the memory block. 1399 */ 1400 static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id, 1401 int sb_id, int count) 1402 { 1403 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + 1404 sb_id * vm->sbm.sb_size; 1405 const uint64_t size = count * vm->sbm.sb_size; 1406 int rc; 1407 1408 rc = virtio_mem_send_plug_request(vm, addr, size); 1409 if (!rc) 1410 virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count); 1411 return rc; 1412 } 1413 1414 /* 1415 * Unplug selected subblocks. Updates the plugged state, but not the state 1416 * of the memory block. 1417 */ 1418 static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, 1419 int sb_id, int count) 1420 { 1421 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + 1422 sb_id * vm->sbm.sb_size; 1423 const uint64_t size = count * vm->sbm.sb_size; 1424 int rc; 1425 1426 rc = virtio_mem_send_unplug_request(vm, addr, size); 1427 if (!rc) 1428 virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count); 1429 return rc; 1430 } 1431 1432 /* 1433 * Request to unplug a big block. 1434 * 1435 * Will not modify the state of the big block. 1436 */ 1437 static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id) 1438 { 1439 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 1440 const uint64_t size = vm->bbm.bb_size; 1441 1442 return virtio_mem_send_unplug_request(vm, addr, size); 1443 } 1444 1445 /* 1446 * Request to plug a big block. 1447 * 1448 * Will not modify the state of the big block. 1449 */ 1450 static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id) 1451 { 1452 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 1453 const uint64_t size = vm->bbm.bb_size; 1454 1455 return virtio_mem_send_plug_request(vm, addr, size); 1456 } 1457 1458 /* 1459 * Unplug the desired number of plugged subblocks of a offline or not-added 1460 * memory block. Will fail if any subblock cannot get unplugged (instead of 1461 * skipping it). 1462 * 1463 * Will not modify the state of the memory block. 1464 * 1465 * Note: can fail after some subblocks were unplugged. 1466 */ 1467 static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm, 1468 unsigned long mb_id, uint64_t *nb_sb) 1469 { 1470 int sb_id, count; 1471 int rc; 1472 1473 sb_id = vm->sbm.sbs_per_mb - 1; 1474 while (*nb_sb) { 1475 /* Find the next candidate subblock */ 1476 while (sb_id >= 0 && 1477 virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1)) 1478 sb_id--; 1479 if (sb_id < 0) 1480 break; 1481 /* Try to unplug multiple subblocks at a time */ 1482 count = 1; 1483 while (count < *nb_sb && sb_id > 0 && 1484 virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) { 1485 count++; 1486 sb_id--; 1487 } 1488 1489 rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); 1490 if (rc) 1491 return rc; 1492 *nb_sb -= count; 1493 sb_id--; 1494 } 1495 1496 return 0; 1497 } 1498 1499 /* 1500 * Unplug all plugged subblocks of an offline or not-added memory block. 1501 * 1502 * Will not modify the state of the memory block. 1503 * 1504 * Note: can fail after some subblocks were unplugged. 1505 */ 1506 static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id) 1507 { 1508 uint64_t nb_sb = vm->sbm.sbs_per_mb; 1509 1510 return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb); 1511 } 1512 1513 /* 1514 * Prepare tracking data for the next memory block. 1515 */ 1516 static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm, 1517 unsigned long *mb_id) 1518 { 1519 int rc; 1520 1521 if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id) 1522 return -ENOSPC; 1523 1524 /* Resize the state array if required. */ 1525 rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm); 1526 if (rc) 1527 return rc; 1528 1529 /* Resize the subblock bitmap if required. */ 1530 rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm); 1531 if (rc) 1532 return rc; 1533 1534 vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++; 1535 *mb_id = vm->sbm.next_mb_id++; 1536 return 0; 1537 } 1538 1539 /* 1540 * Try to plug the desired number of subblocks and add the memory block 1541 * to Linux. 1542 * 1543 * Will modify the state of the memory block. 1544 */ 1545 static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm, 1546 unsigned long mb_id, uint64_t *nb_sb) 1547 { 1548 const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb); 1549 int rc; 1550 1551 if (WARN_ON_ONCE(!count)) 1552 return -EINVAL; 1553 1554 /* 1555 * Plug the requested number of subblocks before adding it to linux, 1556 * so that onlining will directly online all plugged subblocks. 1557 */ 1558 rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count); 1559 if (rc) 1560 return rc; 1561 1562 /* 1563 * Mark the block properly offline before adding it to Linux, 1564 * so the memory notifiers will find the block in the right state. 1565 */ 1566 if (count == vm->sbm.sbs_per_mb) 1567 virtio_mem_sbm_set_mb_state(vm, mb_id, 1568 VIRTIO_MEM_SBM_MB_OFFLINE); 1569 else 1570 virtio_mem_sbm_set_mb_state(vm, mb_id, 1571 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 1572 1573 /* Add the memory block to linux - if that fails, try to unplug. */ 1574 rc = virtio_mem_sbm_add_mb(vm, mb_id); 1575 if (rc) { 1576 int new_state = VIRTIO_MEM_SBM_MB_UNUSED; 1577 1578 if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count)) 1579 new_state = VIRTIO_MEM_SBM_MB_PLUGGED; 1580 virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); 1581 return rc; 1582 } 1583 1584 *nb_sb -= count; 1585 return 0; 1586 } 1587 1588 /* 1589 * Try to plug the desired number of subblocks of a memory block that 1590 * is already added to Linux. 1591 * 1592 * Will modify the state of the memory block. 1593 * 1594 * Note: Can fail after some subblocks were successfully plugged. 1595 */ 1596 static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, 1597 unsigned long mb_id, uint64_t *nb_sb) 1598 { 1599 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 1600 unsigned long pfn, nr_pages; 1601 int sb_id, count; 1602 int rc; 1603 1604 if (WARN_ON_ONCE(!*nb_sb)) 1605 return -EINVAL; 1606 1607 while (*nb_sb) { 1608 sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id); 1609 if (sb_id >= vm->sbm.sbs_per_mb) 1610 break; 1611 count = 1; 1612 while (count < *nb_sb && 1613 sb_id + count < vm->sbm.sbs_per_mb && 1614 !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1)) 1615 count++; 1616 1617 rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count); 1618 if (rc) 1619 return rc; 1620 *nb_sb -= count; 1621 if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) 1622 continue; 1623 1624 /* fake-online the pages if the memory block is online */ 1625 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 1626 sb_id * vm->sbm.sb_size); 1627 nr_pages = PFN_DOWN(count * vm->sbm.sb_size); 1628 virtio_mem_fake_online(pfn, nr_pages); 1629 } 1630 1631 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) 1632 virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1); 1633 1634 return 0; 1635 } 1636 1637 static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) 1638 { 1639 const int mb_states[] = { 1640 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, 1641 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, 1642 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 1643 }; 1644 uint64_t nb_sb = diff / vm->sbm.sb_size; 1645 unsigned long mb_id; 1646 int rc, i; 1647 1648 if (!nb_sb) 1649 return 0; 1650 1651 /* Don't race with onlining/offlining */ 1652 mutex_lock(&vm->hotplug_mutex); 1653 1654 for (i = 0; i < ARRAY_SIZE(mb_states); i++) { 1655 virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) { 1656 rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb); 1657 if (rc || !nb_sb) 1658 goto out_unlock; 1659 cond_resched(); 1660 } 1661 } 1662 1663 /* 1664 * We won't be working on online/offline memory blocks from this point, 1665 * so we can't race with memory onlining/offlining. Drop the mutex. 1666 */ 1667 mutex_unlock(&vm->hotplug_mutex); 1668 1669 /* Try to plug and add unused blocks */ 1670 virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) { 1671 if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) 1672 return -ENOSPC; 1673 1674 rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); 1675 if (rc || !nb_sb) 1676 return rc; 1677 cond_resched(); 1678 } 1679 1680 /* Try to prepare, plug and add new blocks */ 1681 while (nb_sb) { 1682 if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) 1683 return -ENOSPC; 1684 1685 rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id); 1686 if (rc) 1687 return rc; 1688 rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); 1689 if (rc) 1690 return rc; 1691 cond_resched(); 1692 } 1693 1694 return 0; 1695 out_unlock: 1696 mutex_unlock(&vm->hotplug_mutex); 1697 return rc; 1698 } 1699 1700 /* 1701 * Plug a big block and add it to Linux. 1702 * 1703 * Will modify the state of the big block. 1704 */ 1705 static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm, 1706 unsigned long bb_id) 1707 { 1708 int rc; 1709 1710 if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != 1711 VIRTIO_MEM_BBM_BB_UNUSED)) 1712 return -EINVAL; 1713 1714 rc = virtio_mem_bbm_plug_bb(vm, bb_id); 1715 if (rc) 1716 return rc; 1717 virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); 1718 1719 rc = virtio_mem_bbm_add_bb(vm, bb_id); 1720 if (rc) { 1721 if (!virtio_mem_bbm_unplug_bb(vm, bb_id)) 1722 virtio_mem_bbm_set_bb_state(vm, bb_id, 1723 VIRTIO_MEM_BBM_BB_UNUSED); 1724 else 1725 /* Retry from the main loop. */ 1726 virtio_mem_bbm_set_bb_state(vm, bb_id, 1727 VIRTIO_MEM_BBM_BB_PLUGGED); 1728 return rc; 1729 } 1730 return 0; 1731 } 1732 1733 /* 1734 * Prepare tracking data for the next big block. 1735 */ 1736 static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm, 1737 unsigned long *bb_id) 1738 { 1739 int rc; 1740 1741 if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id) 1742 return -ENOSPC; 1743 1744 /* Resize the big block state array if required. */ 1745 rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm); 1746 if (rc) 1747 return rc; 1748 1749 vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++; 1750 *bb_id = vm->bbm.next_bb_id; 1751 vm->bbm.next_bb_id++; 1752 return 0; 1753 } 1754 1755 static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff) 1756 { 1757 uint64_t nb_bb = diff / vm->bbm.bb_size; 1758 unsigned long bb_id; 1759 int rc; 1760 1761 if (!nb_bb) 1762 return 0; 1763 1764 /* Try to plug and add unused big blocks */ 1765 virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) { 1766 if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) 1767 return -ENOSPC; 1768 1769 rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); 1770 if (!rc) 1771 nb_bb--; 1772 if (rc || !nb_bb) 1773 return rc; 1774 cond_resched(); 1775 } 1776 1777 /* Try to prepare, plug and add new big blocks */ 1778 while (nb_bb) { 1779 if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) 1780 return -ENOSPC; 1781 1782 rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id); 1783 if (rc) 1784 return rc; 1785 rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); 1786 if (!rc) 1787 nb_bb--; 1788 if (rc) 1789 return rc; 1790 cond_resched(); 1791 } 1792 1793 return 0; 1794 } 1795 1796 /* 1797 * Try to plug the requested amount of memory. 1798 */ 1799 static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) 1800 { 1801 if (vm->in_sbm) 1802 return virtio_mem_sbm_plug_request(vm, diff); 1803 return virtio_mem_bbm_plug_request(vm, diff); 1804 } 1805 1806 /* 1807 * Unplug the desired number of plugged subblocks of an offline memory block. 1808 * Will fail if any subblock cannot get unplugged (instead of skipping it). 1809 * 1810 * Will modify the state of the memory block. Might temporarily drop the 1811 * hotplug_mutex. 1812 * 1813 * Note: Can fail after some subblocks were successfully unplugged. 1814 */ 1815 static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm, 1816 unsigned long mb_id, 1817 uint64_t *nb_sb) 1818 { 1819 int rc; 1820 1821 rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb); 1822 1823 /* some subblocks might have been unplugged even on failure */ 1824 if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) 1825 virtio_mem_sbm_set_mb_state(vm, mb_id, 1826 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 1827 if (rc) 1828 return rc; 1829 1830 if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 1831 /* 1832 * Remove the block from Linux - this should never fail. 1833 * Hinder the block from getting onlined by marking it 1834 * unplugged. Temporarily drop the mutex, so 1835 * any pending GOING_ONLINE requests can be serviced/rejected. 1836 */ 1837 virtio_mem_sbm_set_mb_state(vm, mb_id, 1838 VIRTIO_MEM_SBM_MB_UNUSED); 1839 1840 mutex_unlock(&vm->hotplug_mutex); 1841 rc = virtio_mem_sbm_remove_mb(vm, mb_id); 1842 BUG_ON(rc); 1843 mutex_lock(&vm->hotplug_mutex); 1844 } 1845 return 0; 1846 } 1847 1848 /* 1849 * Unplug the given plugged subblocks of an online memory block. 1850 * 1851 * Will modify the state of the memory block. 1852 */ 1853 static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm, 1854 unsigned long mb_id, int sb_id, 1855 int count) 1856 { 1857 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count; 1858 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 1859 unsigned long start_pfn; 1860 int rc; 1861 1862 start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 1863 sb_id * vm->sbm.sb_size); 1864 1865 rc = virtio_mem_fake_offline(start_pfn, nr_pages); 1866 if (rc) 1867 return rc; 1868 1869 /* Try to unplug the allocated memory */ 1870 rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); 1871 if (rc) { 1872 /* Return the memory to the buddy. */ 1873 virtio_mem_fake_online(start_pfn, nr_pages); 1874 return rc; 1875 } 1876 1877 switch (old_state) { 1878 case VIRTIO_MEM_SBM_MB_KERNEL: 1879 virtio_mem_sbm_set_mb_state(vm, mb_id, 1880 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL); 1881 break; 1882 case VIRTIO_MEM_SBM_MB_MOVABLE: 1883 virtio_mem_sbm_set_mb_state(vm, mb_id, 1884 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL); 1885 break; 1886 } 1887 1888 return 0; 1889 } 1890 1891 /* 1892 * Unplug the desired number of plugged subblocks of an online memory block. 1893 * Will skip subblock that are busy. 1894 * 1895 * Will modify the state of the memory block. Might temporarily drop the 1896 * hotplug_mutex. 1897 * 1898 * Note: Can fail after some subblocks were successfully unplugged. Can 1899 * return 0 even if subblocks were busy and could not get unplugged. 1900 */ 1901 static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm, 1902 unsigned long mb_id, 1903 uint64_t *nb_sb) 1904 { 1905 int rc, sb_id; 1906 1907 /* If possible, try to unplug the complete block in one shot. */ 1908 if (*nb_sb >= vm->sbm.sbs_per_mb && 1909 virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 1910 rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0, 1911 vm->sbm.sbs_per_mb); 1912 if (!rc) { 1913 *nb_sb -= vm->sbm.sbs_per_mb; 1914 goto unplugged; 1915 } else if (rc != -EBUSY) 1916 return rc; 1917 } 1918 1919 /* Fallback to single subblocks. */ 1920 for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) { 1921 /* Find the next candidate subblock */ 1922 while (sb_id >= 0 && 1923 !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 1924 sb_id--; 1925 if (sb_id < 0) 1926 break; 1927 1928 rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1); 1929 if (rc == -EBUSY) 1930 continue; 1931 else if (rc) 1932 return rc; 1933 *nb_sb -= 1; 1934 } 1935 1936 unplugged: 1937 /* 1938 * Once all subblocks of a memory block were unplugged, offline and 1939 * remove it. This will usually not fail, as no memory is in use 1940 * anymore - however some other notifiers might NACK the request. 1941 */ 1942 if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 1943 mutex_unlock(&vm->hotplug_mutex); 1944 rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id); 1945 mutex_lock(&vm->hotplug_mutex); 1946 if (!rc) 1947 virtio_mem_sbm_set_mb_state(vm, mb_id, 1948 VIRTIO_MEM_SBM_MB_UNUSED); 1949 } 1950 1951 return 0; 1952 } 1953 1954 /* 1955 * Unplug the desired number of plugged subblocks of a memory block that is 1956 * already added to Linux. Will skip subblock of online memory blocks that are 1957 * busy (by the OS). Will fail if any subblock that's not busy cannot get 1958 * unplugged. 1959 * 1960 * Will modify the state of the memory block. Might temporarily drop the 1961 * hotplug_mutex. 1962 * 1963 * Note: Can fail after some subblocks were successfully unplugged. Can 1964 * return 0 even if subblocks were busy and could not get unplugged. 1965 */ 1966 static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm, 1967 unsigned long mb_id, 1968 uint64_t *nb_sb) 1969 { 1970 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 1971 1972 switch (old_state) { 1973 case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: 1974 case VIRTIO_MEM_SBM_MB_KERNEL: 1975 case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: 1976 case VIRTIO_MEM_SBM_MB_MOVABLE: 1977 return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb); 1978 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 1979 case VIRTIO_MEM_SBM_MB_OFFLINE: 1980 return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb); 1981 } 1982 return -EINVAL; 1983 } 1984 1985 static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff) 1986 { 1987 const int mb_states[] = { 1988 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 1989 VIRTIO_MEM_SBM_MB_OFFLINE, 1990 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, 1991 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, 1992 VIRTIO_MEM_SBM_MB_MOVABLE, 1993 VIRTIO_MEM_SBM_MB_KERNEL, 1994 }; 1995 uint64_t nb_sb = diff / vm->sbm.sb_size; 1996 unsigned long mb_id; 1997 int rc, i; 1998 1999 if (!nb_sb) 2000 return 0; 2001 2002 /* 2003 * We'll drop the mutex a couple of times when it is safe to do so. 2004 * This might result in some blocks switching the state (online/offline) 2005 * and we could miss them in this run - we will retry again later. 2006 */ 2007 mutex_lock(&vm->hotplug_mutex); 2008 2009 /* 2010 * We try unplug from partially plugged blocks first, to try removing 2011 * whole memory blocks along with metadata. We prioritize ZONE_MOVABLE 2012 * as it's more reliable to unplug memory and remove whole memory 2013 * blocks, and we don't want to trigger a zone imbalances by 2014 * accidentially removing too much kernel memory. 2015 */ 2016 for (i = 0; i < ARRAY_SIZE(mb_states); i++) { 2017 virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) { 2018 rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb); 2019 if (rc || !nb_sb) 2020 goto out_unlock; 2021 mutex_unlock(&vm->hotplug_mutex); 2022 cond_resched(); 2023 mutex_lock(&vm->hotplug_mutex); 2024 } 2025 if (!unplug_online && i == 1) { 2026 mutex_unlock(&vm->hotplug_mutex); 2027 return 0; 2028 } 2029 } 2030 2031 mutex_unlock(&vm->hotplug_mutex); 2032 return nb_sb ? -EBUSY : 0; 2033 out_unlock: 2034 mutex_unlock(&vm->hotplug_mutex); 2035 return rc; 2036 } 2037 2038 /* 2039 * Try to offline and remove a big block from Linux and unplug it. Will fail 2040 * with -EBUSY if some memory is busy and cannot get unplugged. 2041 * 2042 * Will modify the state of the memory block. Might temporarily drop the 2043 * hotplug_mutex. 2044 */ 2045 static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm, 2046 unsigned long bb_id) 2047 { 2048 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); 2049 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); 2050 unsigned long end_pfn = start_pfn + nr_pages; 2051 unsigned long pfn; 2052 struct page *page; 2053 int rc; 2054 2055 if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != 2056 VIRTIO_MEM_BBM_BB_ADDED)) 2057 return -EINVAL; 2058 2059 if (bbm_safe_unplug) { 2060 /* 2061 * Start by fake-offlining all memory. Once we marked the device 2062 * block as fake-offline, all newly onlined memory will 2063 * automatically be kept fake-offline. Protect from concurrent 2064 * onlining/offlining until we have a consistent state. 2065 */ 2066 mutex_lock(&vm->hotplug_mutex); 2067 virtio_mem_bbm_set_bb_state(vm, bb_id, 2068 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE); 2069 2070 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 2071 page = pfn_to_online_page(pfn); 2072 if (!page) 2073 continue; 2074 2075 rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION); 2076 if (rc) { 2077 end_pfn = pfn; 2078 goto rollback_safe_unplug; 2079 } 2080 } 2081 mutex_unlock(&vm->hotplug_mutex); 2082 } 2083 2084 rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id); 2085 if (rc) { 2086 if (bbm_safe_unplug) { 2087 mutex_lock(&vm->hotplug_mutex); 2088 goto rollback_safe_unplug; 2089 } 2090 return rc; 2091 } 2092 2093 rc = virtio_mem_bbm_unplug_bb(vm, bb_id); 2094 if (rc) 2095 virtio_mem_bbm_set_bb_state(vm, bb_id, 2096 VIRTIO_MEM_BBM_BB_PLUGGED); 2097 else 2098 virtio_mem_bbm_set_bb_state(vm, bb_id, 2099 VIRTIO_MEM_BBM_BB_UNUSED); 2100 return rc; 2101 2102 rollback_safe_unplug: 2103 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 2104 page = pfn_to_online_page(pfn); 2105 if (!page) 2106 continue; 2107 virtio_mem_fake_online(pfn, PAGES_PER_SECTION); 2108 } 2109 virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); 2110 mutex_unlock(&vm->hotplug_mutex); 2111 return rc; 2112 } 2113 2114 /* 2115 * Test if a big block is completely offline. 2116 */ 2117 static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm, 2118 unsigned long bb_id) 2119 { 2120 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); 2121 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); 2122 unsigned long pfn; 2123 2124 for (pfn = start_pfn; pfn < start_pfn + nr_pages; 2125 pfn += PAGES_PER_SECTION) { 2126 if (pfn_to_online_page(pfn)) 2127 return false; 2128 } 2129 2130 return true; 2131 } 2132 2133 /* 2134 * Test if a big block is completely onlined to ZONE_MOVABLE (or offline). 2135 */ 2136 static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm, 2137 unsigned long bb_id) 2138 { 2139 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); 2140 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); 2141 struct page *page; 2142 unsigned long pfn; 2143 2144 for (pfn = start_pfn; pfn < start_pfn + nr_pages; 2145 pfn += PAGES_PER_SECTION) { 2146 page = pfn_to_online_page(pfn); 2147 if (!page) 2148 continue; 2149 if (page_zonenum(page) != ZONE_MOVABLE) 2150 return false; 2151 } 2152 2153 return true; 2154 } 2155 2156 static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff) 2157 { 2158 uint64_t nb_bb = diff / vm->bbm.bb_size; 2159 uint64_t bb_id; 2160 int rc, i; 2161 2162 if (!nb_bb) 2163 return 0; 2164 2165 /* 2166 * Try to unplug big blocks. Similar to SBM, start with offline 2167 * big blocks. 2168 */ 2169 for (i = 0; i < 3; i++) { 2170 virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { 2171 cond_resched(); 2172 2173 /* 2174 * As we're holding no locks, these checks are racy, 2175 * but we don't care. 2176 */ 2177 if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id)) 2178 continue; 2179 if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id)) 2180 continue; 2181 rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id); 2182 if (rc == -EBUSY) 2183 continue; 2184 if (!rc) 2185 nb_bb--; 2186 if (rc || !nb_bb) 2187 return rc; 2188 } 2189 if (i == 0 && !unplug_online) 2190 return 0; 2191 } 2192 2193 return nb_bb ? -EBUSY : 0; 2194 } 2195 2196 /* 2197 * Try to unplug the requested amount of memory. 2198 */ 2199 static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) 2200 { 2201 if (vm->in_sbm) 2202 return virtio_mem_sbm_unplug_request(vm, diff); 2203 return virtio_mem_bbm_unplug_request(vm, diff); 2204 } 2205 2206 /* 2207 * Try to unplug all blocks that couldn't be unplugged before, for example, 2208 * because the hypervisor was busy. 2209 */ 2210 static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm) 2211 { 2212 unsigned long id; 2213 int rc; 2214 2215 if (!vm->in_sbm) { 2216 virtio_mem_bbm_for_each_bb(vm, id, 2217 VIRTIO_MEM_BBM_BB_PLUGGED) { 2218 rc = virtio_mem_bbm_unplug_bb(vm, id); 2219 if (rc) 2220 return rc; 2221 virtio_mem_bbm_set_bb_state(vm, id, 2222 VIRTIO_MEM_BBM_BB_UNUSED); 2223 } 2224 return 0; 2225 } 2226 2227 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) { 2228 rc = virtio_mem_sbm_unplug_mb(vm, id); 2229 if (rc) 2230 return rc; 2231 virtio_mem_sbm_set_mb_state(vm, id, 2232 VIRTIO_MEM_SBM_MB_UNUSED); 2233 } 2234 2235 return 0; 2236 } 2237 2238 /* 2239 * Update all parts of the config that could have changed. 2240 */ 2241 static void virtio_mem_refresh_config(struct virtio_mem *vm) 2242 { 2243 const struct range pluggable_range = mhp_get_pluggable_range(true); 2244 uint64_t new_plugged_size, usable_region_size, end_addr; 2245 2246 /* the plugged_size is just a reflection of what _we_ did previously */ 2247 virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size, 2248 &new_plugged_size); 2249 if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size)) 2250 vm->plugged_size = new_plugged_size; 2251 2252 /* calculate the last usable memory block id */ 2253 virtio_cread_le(vm->vdev, struct virtio_mem_config, 2254 usable_region_size, &usable_region_size); 2255 end_addr = min(vm->addr + usable_region_size - 1, 2256 pluggable_range.end); 2257 2258 if (vm->in_sbm) { 2259 vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr); 2260 if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes())) 2261 vm->sbm.last_usable_mb_id--; 2262 } else { 2263 vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm, 2264 end_addr); 2265 if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size)) 2266 vm->bbm.last_usable_bb_id--; 2267 } 2268 /* 2269 * If we cannot plug any of our device memory (e.g., nothing in the 2270 * usable region is addressable), the last usable memory block id will 2271 * be smaller than the first usable memory block id. We'll stop 2272 * attempting to add memory with -ENOSPC from our main loop. 2273 */ 2274 2275 /* see if there is a request to change the size */ 2276 virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size, 2277 &vm->requested_size); 2278 2279 dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size); 2280 dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size); 2281 } 2282 2283 /* 2284 * Workqueue function for handling plug/unplug requests and config updates. 2285 */ 2286 static void virtio_mem_run_wq(struct work_struct *work) 2287 { 2288 struct virtio_mem *vm = container_of(work, struct virtio_mem, wq); 2289 uint64_t diff; 2290 int rc; 2291 2292 hrtimer_cancel(&vm->retry_timer); 2293 2294 if (vm->broken) 2295 return; 2296 2297 atomic_set(&vm->wq_active, 1); 2298 retry: 2299 rc = 0; 2300 2301 /* Make sure we start with a clean state if there are leftovers. */ 2302 if (unlikely(vm->unplug_all_required)) 2303 rc = virtio_mem_send_unplug_all_request(vm); 2304 2305 if (atomic_read(&vm->config_changed)) { 2306 atomic_set(&vm->config_changed, 0); 2307 virtio_mem_refresh_config(vm); 2308 } 2309 2310 /* Unplug any leftovers from previous runs */ 2311 if (!rc) 2312 rc = virtio_mem_unplug_pending_mb(vm); 2313 2314 if (!rc && vm->requested_size != vm->plugged_size) { 2315 if (vm->requested_size > vm->plugged_size) { 2316 diff = vm->requested_size - vm->plugged_size; 2317 rc = virtio_mem_plug_request(vm, diff); 2318 } else { 2319 diff = vm->plugged_size - vm->requested_size; 2320 rc = virtio_mem_unplug_request(vm, diff); 2321 } 2322 } 2323 2324 switch (rc) { 2325 case 0: 2326 vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS; 2327 break; 2328 case -ENOSPC: 2329 /* 2330 * We cannot add any more memory (alignment, physical limit) 2331 * or we have too many offline memory blocks. 2332 */ 2333 break; 2334 case -ETXTBSY: 2335 /* 2336 * The hypervisor cannot process our request right now 2337 * (e.g., out of memory, migrating); 2338 */ 2339 case -EBUSY: 2340 /* 2341 * We cannot free up any memory to unplug it (all plugged memory 2342 * is busy). 2343 */ 2344 case -ENOMEM: 2345 /* Out of memory, try again later. */ 2346 hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms), 2347 HRTIMER_MODE_REL); 2348 break; 2349 case -EAGAIN: 2350 /* Retry immediately (e.g., the config changed). */ 2351 goto retry; 2352 default: 2353 /* Unknown error, mark as broken */ 2354 dev_err(&vm->vdev->dev, 2355 "unknown error, marking device broken: %d\n", rc); 2356 vm->broken = true; 2357 } 2358 2359 atomic_set(&vm->wq_active, 0); 2360 } 2361 2362 static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer) 2363 { 2364 struct virtio_mem *vm = container_of(timer, struct virtio_mem, 2365 retry_timer); 2366 2367 virtio_mem_retry(vm); 2368 vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2, 2369 VIRTIO_MEM_RETRY_TIMER_MAX_MS); 2370 return HRTIMER_NORESTART; 2371 } 2372 2373 static void virtio_mem_handle_response(struct virtqueue *vq) 2374 { 2375 struct virtio_mem *vm = vq->vdev->priv; 2376 2377 wake_up(&vm->host_resp); 2378 } 2379 2380 static int virtio_mem_init_vq(struct virtio_mem *vm) 2381 { 2382 struct virtqueue *vq; 2383 2384 vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response, 2385 "guest-request"); 2386 if (IS_ERR(vq)) 2387 return PTR_ERR(vq); 2388 vm->vq = vq; 2389 2390 return 0; 2391 } 2392 2393 static int virtio_mem_init(struct virtio_mem *vm) 2394 { 2395 const struct range pluggable_range = mhp_get_pluggable_range(true); 2396 uint64_t sb_size, addr; 2397 uint16_t node_id; 2398 2399 if (!vm->vdev->config->get) { 2400 dev_err(&vm->vdev->dev, "config access disabled\n"); 2401 return -EINVAL; 2402 } 2403 2404 /* 2405 * We don't want to (un)plug or reuse any memory when in kdump. The 2406 * memory is still accessible (but not mapped). 2407 */ 2408 if (is_kdump_kernel()) { 2409 dev_warn(&vm->vdev->dev, "disabled in kdump kernel\n"); 2410 return -EBUSY; 2411 } 2412 2413 /* Fetch all properties that can't change. */ 2414 virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size, 2415 &vm->plugged_size); 2416 virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size, 2417 &vm->device_block_size); 2418 virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id, 2419 &node_id); 2420 vm->nid = virtio_mem_translate_node_id(vm, node_id); 2421 virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr); 2422 virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size, 2423 &vm->region_size); 2424 2425 /* Determine the nid for the device based on the lowest address. */ 2426 if (vm->nid == NUMA_NO_NODE) 2427 vm->nid = memory_add_physaddr_to_nid(vm->addr); 2428 2429 /* bad device setup - warn only */ 2430 if (!IS_ALIGNED(vm->addr, memory_block_size_bytes())) 2431 dev_warn(&vm->vdev->dev, 2432 "The alignment of the physical start address can make some memory unusable.\n"); 2433 if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes())) 2434 dev_warn(&vm->vdev->dev, 2435 "The alignment of the physical end address can make some memory unusable.\n"); 2436 if (vm->addr < pluggable_range.start || 2437 vm->addr + vm->region_size - 1 > pluggable_range.end) 2438 dev_warn(&vm->vdev->dev, 2439 "Some device memory is not addressable/pluggable. This can make some memory unusable.\n"); 2440 2441 /* Prepare the offline threshold - make sure we can add two blocks. */ 2442 vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(), 2443 VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD); 2444 2445 /* 2446 * We want subblocks to span at least MAX_ORDER_NR_PAGES and 2447 * pageblock_nr_pages pages. This: 2448 * - Simplifies our page onlining code (virtio_mem_online_page_cb) 2449 * and fake page onlining code (virtio_mem_fake_online). 2450 * - Is required for now for alloc_contig_range() to work reliably - 2451 * it doesn't properly handle smaller granularity on ZONE_NORMAL. 2452 */ 2453 sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES, 2454 pageblock_nr_pages) * PAGE_SIZE; 2455 sb_size = max_t(uint64_t, vm->device_block_size, sb_size); 2456 2457 if (sb_size < memory_block_size_bytes() && !force_bbm) { 2458 /* SBM: At least two subblocks per Linux memory block. */ 2459 vm->in_sbm = true; 2460 vm->sbm.sb_size = sb_size; 2461 vm->sbm.sbs_per_mb = memory_block_size_bytes() / 2462 vm->sbm.sb_size; 2463 2464 /* Round up to the next full memory block */ 2465 addr = max_t(uint64_t, vm->addr, pluggable_range.start) + 2466 memory_block_size_bytes() - 1; 2467 vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr); 2468 vm->sbm.next_mb_id = vm->sbm.first_mb_id; 2469 } else { 2470 /* BBM: At least one Linux memory block. */ 2471 vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size, 2472 memory_block_size_bytes()); 2473 2474 if (bbm_block_size) { 2475 if (!is_power_of_2(bbm_block_size)) { 2476 dev_warn(&vm->vdev->dev, 2477 "bbm_block_size is not a power of 2"); 2478 } else if (bbm_block_size < vm->bbm.bb_size) { 2479 dev_warn(&vm->vdev->dev, 2480 "bbm_block_size is too small"); 2481 } else { 2482 vm->bbm.bb_size = bbm_block_size; 2483 } 2484 } 2485 2486 /* Round up to the next aligned big block */ 2487 addr = max_t(uint64_t, vm->addr, pluggable_range.start) + 2488 vm->bbm.bb_size - 1; 2489 vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr); 2490 vm->bbm.next_bb_id = vm->bbm.first_bb_id; 2491 2492 /* Make sure we can add two big blocks. */ 2493 vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size, 2494 vm->offline_threshold); 2495 } 2496 2497 dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr); 2498 dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size); 2499 dev_info(&vm->vdev->dev, "device block size: 0x%llx", 2500 (unsigned long long)vm->device_block_size); 2501 dev_info(&vm->vdev->dev, "memory block size: 0x%lx", 2502 memory_block_size_bytes()); 2503 if (vm->in_sbm) 2504 dev_info(&vm->vdev->dev, "subblock size: 0x%llx", 2505 (unsigned long long)vm->sbm.sb_size); 2506 else 2507 dev_info(&vm->vdev->dev, "big block size: 0x%llx", 2508 (unsigned long long)vm->bbm.bb_size); 2509 if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA)) 2510 dev_info(&vm->vdev->dev, "nid: %d", vm->nid); 2511 2512 return 0; 2513 } 2514 2515 static int virtio_mem_create_resource(struct virtio_mem *vm) 2516 { 2517 /* 2518 * When force-unloading the driver and removing the device, we 2519 * could have a garbage pointer. Duplicate the string. 2520 */ 2521 const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL); 2522 2523 if (!name) 2524 return -ENOMEM; 2525 2526 vm->parent_resource = __request_mem_region(vm->addr, vm->region_size, 2527 name, IORESOURCE_SYSTEM_RAM); 2528 if (!vm->parent_resource) { 2529 kfree(name); 2530 dev_warn(&vm->vdev->dev, "could not reserve device region\n"); 2531 dev_info(&vm->vdev->dev, 2532 "reloading the driver is not supported\n"); 2533 return -EBUSY; 2534 } 2535 2536 /* The memory is not actually busy - make add_memory() work. */ 2537 vm->parent_resource->flags &= ~IORESOURCE_BUSY; 2538 return 0; 2539 } 2540 2541 static void virtio_mem_delete_resource(struct virtio_mem *vm) 2542 { 2543 const char *name; 2544 2545 if (!vm->parent_resource) 2546 return; 2547 2548 name = vm->parent_resource->name; 2549 release_resource(vm->parent_resource); 2550 kfree(vm->parent_resource); 2551 kfree(name); 2552 vm->parent_resource = NULL; 2553 } 2554 2555 static int virtio_mem_range_has_system_ram(struct resource *res, void *arg) 2556 { 2557 return 1; 2558 } 2559 2560 static bool virtio_mem_has_memory_added(struct virtio_mem *vm) 2561 { 2562 const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; 2563 2564 return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr, 2565 vm->addr + vm->region_size, NULL, 2566 virtio_mem_range_has_system_ram) == 1; 2567 } 2568 2569 static int virtio_mem_probe(struct virtio_device *vdev) 2570 { 2571 struct virtio_mem *vm; 2572 int rc; 2573 2574 BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24); 2575 BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10); 2576 2577 vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL); 2578 if (!vm) 2579 return -ENOMEM; 2580 2581 init_waitqueue_head(&vm->host_resp); 2582 vm->vdev = vdev; 2583 INIT_WORK(&vm->wq, virtio_mem_run_wq); 2584 mutex_init(&vm->hotplug_mutex); 2585 INIT_LIST_HEAD(&vm->next); 2586 spin_lock_init(&vm->removal_lock); 2587 hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 2588 vm->retry_timer.function = virtio_mem_timer_expired; 2589 vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS; 2590 2591 /* register the virtqueue */ 2592 rc = virtio_mem_init_vq(vm); 2593 if (rc) 2594 goto out_free_vm; 2595 2596 /* initialize the device by querying the config */ 2597 rc = virtio_mem_init(vm); 2598 if (rc) 2599 goto out_del_vq; 2600 2601 /* create the parent resource for all memory */ 2602 rc = virtio_mem_create_resource(vm); 2603 if (rc) 2604 goto out_del_vq; 2605 2606 /* 2607 * If we still have memory plugged, we have to unplug all memory first. 2608 * Registering our parent resource makes sure that this memory isn't 2609 * actually in use (e.g., trying to reload the driver). 2610 */ 2611 if (vm->plugged_size) { 2612 vm->unplug_all_required = true; 2613 dev_info(&vm->vdev->dev, "unplugging all memory is required\n"); 2614 } 2615 2616 /* register callbacks */ 2617 vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb; 2618 rc = register_memory_notifier(&vm->memory_notifier); 2619 if (rc) 2620 goto out_del_resource; 2621 rc = register_virtio_mem_device(vm); 2622 if (rc) 2623 goto out_unreg_mem; 2624 2625 virtio_device_ready(vdev); 2626 2627 /* trigger a config update to start processing the requested_size */ 2628 atomic_set(&vm->config_changed, 1); 2629 queue_work(system_freezable_wq, &vm->wq); 2630 2631 return 0; 2632 out_unreg_mem: 2633 unregister_memory_notifier(&vm->memory_notifier); 2634 out_del_resource: 2635 virtio_mem_delete_resource(vm); 2636 out_del_vq: 2637 vdev->config->del_vqs(vdev); 2638 out_free_vm: 2639 kfree(vm); 2640 vdev->priv = NULL; 2641 2642 return rc; 2643 } 2644 2645 static void virtio_mem_remove(struct virtio_device *vdev) 2646 { 2647 struct virtio_mem *vm = vdev->priv; 2648 unsigned long mb_id; 2649 int rc; 2650 2651 /* 2652 * Make sure the workqueue won't be triggered anymore and no memory 2653 * blocks can be onlined/offlined until we're finished here. 2654 */ 2655 mutex_lock(&vm->hotplug_mutex); 2656 spin_lock_irq(&vm->removal_lock); 2657 vm->removing = true; 2658 spin_unlock_irq(&vm->removal_lock); 2659 mutex_unlock(&vm->hotplug_mutex); 2660 2661 /* wait until the workqueue stopped */ 2662 cancel_work_sync(&vm->wq); 2663 hrtimer_cancel(&vm->retry_timer); 2664 2665 if (vm->in_sbm) { 2666 /* 2667 * After we unregistered our callbacks, user space can online 2668 * partially plugged offline blocks. Make sure to remove them. 2669 */ 2670 virtio_mem_sbm_for_each_mb(vm, mb_id, 2671 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { 2672 rc = virtio_mem_sbm_remove_mb(vm, mb_id); 2673 BUG_ON(rc); 2674 virtio_mem_sbm_set_mb_state(vm, mb_id, 2675 VIRTIO_MEM_SBM_MB_UNUSED); 2676 } 2677 /* 2678 * After we unregistered our callbacks, user space can no longer 2679 * offline partially plugged online memory blocks. No need to 2680 * worry about them. 2681 */ 2682 } 2683 2684 /* unregister callbacks */ 2685 unregister_virtio_mem_device(vm); 2686 unregister_memory_notifier(&vm->memory_notifier); 2687 2688 /* 2689 * There is no way we could reliably remove all memory we have added to 2690 * the system. And there is no way to stop the driver/device from going 2691 * away. Warn at least. 2692 */ 2693 if (virtio_mem_has_memory_added(vm)) { 2694 dev_warn(&vdev->dev, "device still has system memory added\n"); 2695 } else { 2696 virtio_mem_delete_resource(vm); 2697 kfree_const(vm->resource_name); 2698 } 2699 2700 /* remove all tracking data - no locking needed */ 2701 if (vm->in_sbm) { 2702 vfree(vm->sbm.mb_states); 2703 vfree(vm->sbm.sb_states); 2704 } else { 2705 vfree(vm->bbm.bb_states); 2706 } 2707 2708 /* reset the device and cleanup the queues */ 2709 vdev->config->reset(vdev); 2710 vdev->config->del_vqs(vdev); 2711 2712 kfree(vm); 2713 vdev->priv = NULL; 2714 } 2715 2716 static void virtio_mem_config_changed(struct virtio_device *vdev) 2717 { 2718 struct virtio_mem *vm = vdev->priv; 2719 2720 atomic_set(&vm->config_changed, 1); 2721 virtio_mem_retry(vm); 2722 } 2723 2724 #ifdef CONFIG_PM_SLEEP 2725 static int virtio_mem_freeze(struct virtio_device *vdev) 2726 { 2727 /* 2728 * When restarting the VM, all memory is usually unplugged. Don't 2729 * allow to suspend/hibernate. 2730 */ 2731 dev_err(&vdev->dev, "save/restore not supported.\n"); 2732 return -EPERM; 2733 } 2734 2735 static int virtio_mem_restore(struct virtio_device *vdev) 2736 { 2737 return -EPERM; 2738 } 2739 #endif 2740 2741 static unsigned int virtio_mem_features[] = { 2742 #if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA) 2743 VIRTIO_MEM_F_ACPI_PXM, 2744 #endif 2745 }; 2746 2747 static const struct virtio_device_id virtio_mem_id_table[] = { 2748 { VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID }, 2749 { 0 }, 2750 }; 2751 2752 static struct virtio_driver virtio_mem_driver = { 2753 .feature_table = virtio_mem_features, 2754 .feature_table_size = ARRAY_SIZE(virtio_mem_features), 2755 .driver.name = KBUILD_MODNAME, 2756 .driver.owner = THIS_MODULE, 2757 .id_table = virtio_mem_id_table, 2758 .probe = virtio_mem_probe, 2759 .remove = virtio_mem_remove, 2760 .config_changed = virtio_mem_config_changed, 2761 #ifdef CONFIG_PM_SLEEP 2762 .freeze = virtio_mem_freeze, 2763 .restore = virtio_mem_restore, 2764 #endif 2765 }; 2766 2767 module_virtio_driver(virtio_mem_driver); 2768 MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table); 2769 MODULE_AUTHOR("David Hildenbrand <david@redhat.com>"); 2770 MODULE_DESCRIPTION("Virtio-mem driver"); 2771 MODULE_LICENSE("GPL"); 2772