xref: /openbmc/linux/drivers/virtio/virtio_mem.c (revision 25b892b5)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Virtio-mem device driver.
4  *
5  * Copyright Red Hat, Inc. 2020
6  *
7  * Author(s): David Hildenbrand <david@redhat.com>
8  */
9 
10 #include <linux/virtio.h>
11 #include <linux/virtio_mem.h>
12 #include <linux/workqueue.h>
13 #include <linux/slab.h>
14 #include <linux/module.h>
15 #include <linux/mm.h>
16 #include <linux/memory_hotplug.h>
17 #include <linux/memory.h>
18 #include <linux/hrtimer.h>
19 #include <linux/crash_dump.h>
20 #include <linux/mutex.h>
21 #include <linux/bitmap.h>
22 #include <linux/lockdep.h>
23 
24 #include <acpi/acpi_numa.h>
25 
26 static bool unplug_online = true;
27 module_param(unplug_online, bool, 0644);
28 MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");
29 
30 static bool force_bbm;
31 module_param(force_bbm, bool, 0444);
32 MODULE_PARM_DESC(force_bbm,
33 		"Force Big Block Mode. Default is 0 (auto-selection)");
34 
35 static unsigned long bbm_block_size;
36 module_param(bbm_block_size, ulong, 0444);
37 MODULE_PARM_DESC(bbm_block_size,
38 		 "Big Block size in bytes. Default is 0 (auto-detection).");
39 
40 static bool bbm_safe_unplug = true;
41 module_param(bbm_safe_unplug, bool, 0444);
42 MODULE_PARM_DESC(bbm_safe_unplug,
43 	     "Use a safe unplug mechanism in BBM, avoiding long/endless loops");
44 
45 /*
46  * virtio-mem currently supports the following modes of operation:
47  *
48  * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
49  *   size of a Sub Block (SB) is determined based on the device block size, the
50  *   pageblock size, and the maximum allocation granularity of the buddy.
51  *   Subblocks within a Linux memory block might either be plugged or unplugged.
52  *   Memory is added/removed to Linux MM in Linux memory block granularity.
53  *
54  * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
55  *   Memory is added/removed to Linux MM in Big Block granularity.
56  *
57  * The mode is determined automatically based on the Linux memory block size
58  * and the device block size.
59  *
60  * User space / core MM (auto onlining) is responsible for onlining added
61  * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are
62  * always onlined separately, and all memory within a Linux memory block is
63  * onlined to the same zone - virtio-mem relies on this behavior.
64  */
65 
/*
 * Sub Block Mode: per-Linux-memory-block state. The values index
 * sbm.mb_count[] and are stored one byte each in sbm.mb_states[].
 */
enum virtio_mem_sbm_mb_state {
	/* Nothing plugged, not added to Linux; free for later reuse. */
	VIRTIO_MEM_SBM_MB_UNUSED = 0,
	/* (Partially) plugged but not added to Linux: add_memory() failed. */
	VIRTIO_MEM_SBM_MB_PLUGGED = 1,
	/* Added to Linux, offline, all subblocks plugged. */
	VIRTIO_MEM_SBM_MB_OFFLINE = 2,
	/* Added to Linux, offline, only some subblocks plugged. */
	VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL = 3,
	/* Added to Linux, online in a kernel zone, all subblocks plugged. */
	VIRTIO_MEM_SBM_MB_KERNEL = 4,
	/* Added to Linux, online in a kernel zone, some subblocks plugged. */
	VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL = 5,
	/* Added to Linux, online in ZONE_MOVABLE, all subblocks plugged. */
	VIRTIO_MEM_SBM_MB_MOVABLE = 6,
	/* Added to Linux, online in ZONE_MOVABLE, some subblocks plugged. */
	VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL = 7,
	/* Number of states; not a state itself. */
	VIRTIO_MEM_SBM_MB_COUNT = 8
};
88 
/*
 * Big Block Mode: per-big-block state (a big block covers 1..X Linux memory
 * blocks). The values index bbm.bb_count[] and are stored one byte each in
 * bbm.bb_states[].
 */
enum virtio_mem_bbm_bb_state {
	/* Nothing plugged, not added to Linux; free for later reuse. */
	VIRTIO_MEM_BBM_BB_UNUSED = 0,
	/* Plugged but not added to Linux: add_memory() failed. */
	VIRTIO_MEM_BBM_BB_PLUGGED = 1,
	/* Plugged and added to Linux. */
	VIRTIO_MEM_BBM_BB_ADDED = 2,
	/* Every online part is fake-offline; the block is ready for removal. */
	VIRTIO_MEM_BBM_BB_FAKE_OFFLINE = 3,
	/* Number of states; not a state itself. */
	VIRTIO_MEM_BBM_BB_COUNT = 4
};
103 
/*
 * Per-device driver state of one virtio-mem device.
 *
 * Depending on in_sbm, either the "sbm" (Sub Block Mode) or the "bbm"
 * (Big Block Mode) member of the anonymous union below is in use.
 */
104 struct virtio_mem {
105 	struct virtio_device *vdev;
106 
107 	/* We might first have to unplug all memory when starting up. */
108 	bool unplug_all_required;
109 
110 	/* Workqueue that processes the plug/unplug requests. */
111 	struct work_struct wq;
112 	atomic_t wq_active;
113 	atomic_t config_changed;
114 
115 	/* Virtqueue for guest->host requests. */
116 	struct virtqueue *vq;
117 
118 	/* Wait for a host response to a guest request. */
119 	wait_queue_head_t host_resp;
120 
121 	/* Space for one guest request and the host response. */
122 	struct virtio_mem_req req;
123 	struct virtio_mem_resp resp;
124 
125 	/* The current size of the device. */
126 	uint64_t plugged_size;
127 	/* The requested size of the device. */
128 	uint64_t requested_size;
129 
130 	/* The device block size (for communicating with the device). */
131 	uint64_t device_block_size;
132 	/* The determined node id for all memory of the device. */
133 	int nid;
134 	/* Physical start address of the memory region. */
135 	uint64_t addr;
136 	/* Maximum region size in bytes. */
137 	uint64_t region_size;
138 
139 	/* The parent resource for all memory added via this device. */
140 	struct resource *parent_resource;
141 	/*
142 	 * Copy of "System RAM (virtio_mem)" to be used for
143 	 * add_memory_driver_managed().
144 	 */
145 	const char *resource_name;
146 	/* Memory group identification. */
147 	int mgid;
148 
149 	/*
150 	 * We don't want to add too much memory if it's not getting onlined,
151 	 * to avoid running OOM. Besides this threshold, we allow to have at
152 	 * least two offline blocks at a time (whatever is bigger).
	 *
	 * The default threshold below evaluates to 1 GiB.
153 	 */
154 #define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD		(1024 * 1024 * 1024)
155 	atomic64_t offline_size;
156 	uint64_t offline_threshold;
157 
158 	/* If set, the driver is in SBM, otherwise in BBM. */
159 	bool in_sbm;
160 
161 	union {
162 		struct {
163 			/* Id of the first memory block of this device. */
164 			unsigned long first_mb_id;
165 			/* Id of the last usable memory block of this device. */
166 			unsigned long last_usable_mb_id;
167 			/* Id of the next memory block to prepare when needed. */
168 			unsigned long next_mb_id;
169 
170 			/* The subblock size. */
171 			uint64_t sb_size;
172 			/* The number of subblocks per Linux memory block. */
173 			uint32_t sbs_per_mb;
174 
175 			/* Summary of all memory block states. */
176 			unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];
177 
178 			/*
179 			 * One byte state per memory block. Allocated via
180 			 * vmalloc(). Resized (alloc+copy+free) on demand.
181 			 *
182 			 * With 128 MiB memory blocks, we have states for 512
183 			 * GiB of memory in one 4 KiB page.
184 			 */
185 			uint8_t *mb_states;
186 
187 			/*
188 			 * Bitmap: one bit per subblock. Allocated similar to
189 			 * sbm.mb_states.
190 			 *
191 			 * A set bit means the corresponding subblock is
192 			 * plugged, otherwise it's unplugged.
193 			 *
194 			 * With 4 MiB subblocks, we manage 128 GiB of memory
195 			 * in one 4 KiB page.
196 			 */
197 			unsigned long *sb_states;
198 		} sbm;
199 
200 		struct {
201 			/* Id of the first big block of this device. */
202 			unsigned long first_bb_id;
203 			/* Id of the last usable big block of this device. */
204 			unsigned long last_usable_bb_id;
205 			/* Id of the next big block to prepare when needed. */
206 			unsigned long next_bb_id;
207 
208 			/* Summary of all big block states. */
209 			unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];
210 
211 			/* One byte state per big block. See sbm.mb_states. */
212 			uint8_t *bb_states;
213 
214 			/* The block size used for plugging/adding/removing. */
215 			uint64_t bb_size;
216 		} bbm;
217 	};
218 
219 	/*
220 	 * Mutex that protects the sbm.mb_count, sbm.mb_states,
221 	 * sbm.sb_states, bbm.bb_count, and bbm.bb_states
222 	 *
223 	 * When this lock is held the pointers can't change, ONLINE and
224 	 * OFFLINE blocks can't change the state and no subblocks will get
225 	 * plugged/unplugged.
226 	 */
227 	struct mutex hotplug_mutex;
	/*
	 * Set by the memory notifier on MEM_GOING_ONLINE/MEM_GOING_OFFLINE
	 * after taking hotplug_mutex, cleared again on MEM_ONLINE/MEM_OFFLINE
	 * right before the mutex is released; consulted on MEM_CANCEL_OFFLINE.
	 */
228 	bool hotplug_active;
229 
230 	/* An error occurred we cannot handle - stop processing requests. */
231 	bool broken;
232 
233 	/* The driver is being removed. */
234 	spinlock_t removal_lock;
235 	bool removing;
236 
237 	/* Timer for retrying to plug/unplug memory. */
238 	struct hrtimer retry_timer;
239 	unsigned int retry_timer_ms;
	/* NOTE(review): presumably the bounds for retry_timer_ms - confirm. */
240 #define VIRTIO_MEM_RETRY_TIMER_MIN_MS		50000
241 #define VIRTIO_MEM_RETRY_TIMER_MAX_MS		300000
242 
243 	/* Memory notifier (online/offline events). */
244 	struct notifier_block memory_notifier;
245 
246 	/* Next device in the list of virtio-mem devices. */
247 	struct list_head next;
248 };
249 
/*
 * There is only one online_page callback for all virtio-mem devices, so the
 * devices are kept on a global list. The callback walks the list using RCU;
 * virtio_mem_mutex is taken around list additions and removals.
 */
static LIST_HEAD(virtio_mem_devices);
static DEFINE_MUTEX(virtio_mem_mutex);

/* Forward declarations for functions used before their definition. */
static void virtio_mem_retry(struct virtio_mem *vm);
static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
						  unsigned long nr_pages);
static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
						   unsigned long nr_pages);
263 
264 /*
265  * Register a virtio-mem device so it will be considered for the online_page
266  * callback.
267  */
268 static int register_virtio_mem_device(struct virtio_mem *vm)
269 {
270 	int rc = 0;
271 
272 	/* First device registers the callback. */
273 	mutex_lock(&virtio_mem_mutex);
274 	if (list_empty(&virtio_mem_devices))
275 		rc = set_online_page_callback(&virtio_mem_online_page_cb);
276 	if (!rc)
277 		list_add_rcu(&vm->next, &virtio_mem_devices);
278 	mutex_unlock(&virtio_mem_mutex);
279 
280 	return rc;
281 }
282 
283 /*
284  * Unregister a virtio-mem device so it will no longer be considered for the
285  * online_page callback.
286  */
287 static void unregister_virtio_mem_device(struct virtio_mem *vm)
288 {
289 	/* Last device unregisters the callback. */
290 	mutex_lock(&virtio_mem_mutex);
291 	list_del_rcu(&vm->next);
292 	if (list_empty(&virtio_mem_devices))
293 		restore_online_page_callback(&virtio_mem_online_page_cb);
294 	mutex_unlock(&virtio_mem_mutex);
295 
296 	synchronize_rcu();
297 }
298 
/*
 * Map a physical address to the id of the Linux memory block containing it
 * (address divided by the memory block size, rounded down).
 */
static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr)
{
	const unsigned long mb_size = memory_block_size_bytes();

	return addr / mb_size;
}
306 
/*
 * Map a Linux memory block id to the physical address at which that memory
 * block starts (id multiplied by the memory block size).
 */
static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
{
	const unsigned long mb_size = memory_block_size_bytes();

	return mb_id * mb_size;
}
314 
315 /*
316  * Calculate the big block id of a given address.
317  */
318 static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm,
319 					      uint64_t addr)
320 {
321 	return addr / vm->bbm.bb_size;
322 }
323 
324 /*
325  * Calculate the physical start address of a given big block id.
326  */
327 static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm,
328 					 unsigned long bb_id)
329 {
330 	return bb_id * vm->bbm.bb_size;
331 }
332 
333 /*
334  * Calculate the subblock id of a given address.
335  */
336 static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
337 					      unsigned long addr)
338 {
339 	const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
340 	const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id);
341 
342 	return (addr - mb_addr) / vm->sbm.sb_size;
343 }
344 
345 /*
346  * Set the state of a big block, taking care of the state counter.
347  */
348 static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm,
349 					unsigned long bb_id,
350 					enum virtio_mem_bbm_bb_state state)
351 {
352 	const unsigned long idx = bb_id - vm->bbm.first_bb_id;
353 	enum virtio_mem_bbm_bb_state old_state;
354 
355 	old_state = vm->bbm.bb_states[idx];
356 	vm->bbm.bb_states[idx] = state;
357 
358 	BUG_ON(vm->bbm.bb_count[old_state] == 0);
359 	vm->bbm.bb_count[old_state]--;
360 	vm->bbm.bb_count[state]++;
361 }
362 
363 /*
364  * Get the state of a big block.
365  */
366 static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm,
367 								unsigned long bb_id)
368 {
369 	return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id];
370 }
371 
372 /*
373  * Prepare the big block state array for the next big block.
374  */
375 static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm)
376 {
377 	unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id;
378 	unsigned long new_bytes = old_bytes + 1;
379 	int old_pages = PFN_UP(old_bytes);
380 	int new_pages = PFN_UP(new_bytes);
381 	uint8_t *new_array;
382 
383 	if (vm->bbm.bb_states && old_pages == new_pages)
384 		return 0;
385 
386 	new_array = vzalloc(new_pages * PAGE_SIZE);
387 	if (!new_array)
388 		return -ENOMEM;
389 
390 	mutex_lock(&vm->hotplug_mutex);
391 	if (vm->bbm.bb_states)
392 		memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE);
393 	vfree(vm->bbm.bb_states);
394 	vm->bbm.bb_states = new_array;
395 	mutex_unlock(&vm->hotplug_mutex);
396 
397 	return 0;
398 }
399 
/*
 * Iterate over all big blocks of @_vm that are in state @_state, from
 * bbm.first_bb_id upwards. @_bb_id is the loop cursor (an lvalue). The walk
 * stops early once bbm.bb_count[@_state] is zero. The macro ends in an "if",
 * so it must be followed by exactly one statement or block.
 *
 * Only the macro arguments are referenced; no variable named "vm" has to
 * exist at the call site.
 */
#define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \
	for ((_bb_id) = (_vm)->bbm.first_bb_id; \
	     (_bb_id) < (_vm)->bbm.next_bb_id && (_vm)->bbm.bb_count[_state]; \
	     (_bb_id)++) \
		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == (_state))
405 
/*
 * Iterate over all big blocks of @_vm that are in state @_state, from
 * bbm.next_bb_id - 1 downwards to bbm.first_bb_id. @_bb_id is the loop
 * cursor (an lvalue). The walk stops early once bbm.bb_count[@_state] is
 * zero. The macro ends in an "if", so it must be followed by exactly one
 * statement or block.
 *
 * Only the macro arguments are referenced; no variable named "vm" has to
 * exist at the call site.
 */
#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \
	for ((_bb_id) = (_vm)->bbm.next_bb_id - 1; \
	     (_bb_id) >= (_vm)->bbm.first_bb_id && (_vm)->bbm.bb_count[_state]; \
	     (_bb_id)--) \
		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == (_state))
411 
412 /*
413  * Set the state of a memory block, taking care of the state counter.
414  */
415 static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm,
416 					unsigned long mb_id, uint8_t state)
417 {
418 	const unsigned long idx = mb_id - vm->sbm.first_mb_id;
419 	uint8_t old_state;
420 
421 	old_state = vm->sbm.mb_states[idx];
422 	vm->sbm.mb_states[idx] = state;
423 
424 	BUG_ON(vm->sbm.mb_count[old_state] == 0);
425 	vm->sbm.mb_count[old_state]--;
426 	vm->sbm.mb_count[state]++;
427 }
428 
429 /*
430  * Get the state of a memory block.
431  */
432 static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm,
433 					   unsigned long mb_id)
434 {
435 	const unsigned long idx = mb_id - vm->sbm.first_mb_id;
436 
437 	return vm->sbm.mb_states[idx];
438 }
439 
440 /*
441  * Prepare the state array for the next memory block.
442  */
443 static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
444 {
445 	int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id);
446 	int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1);
447 	uint8_t *new_array;
448 
449 	if (vm->sbm.mb_states && old_pages == new_pages)
450 		return 0;
451 
452 	new_array = vzalloc(new_pages * PAGE_SIZE);
453 	if (!new_array)
454 		return -ENOMEM;
455 
456 	mutex_lock(&vm->hotplug_mutex);
457 	if (vm->sbm.mb_states)
458 		memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE);
459 	vfree(vm->sbm.mb_states);
460 	vm->sbm.mb_states = new_array;
461 	mutex_unlock(&vm->hotplug_mutex);
462 
463 	return 0;
464 }
465 
/*
 * Iterate over all Linux memory blocks of @_vm that are in state @_state,
 * from sbm.first_mb_id upwards. @_mb_id is the loop cursor (an lvalue). The
 * walk stops early once sbm.mb_count[@_state] is zero. The macro ends in an
 * "if", so it must be followed by exactly one statement or block.
 *
 * Arguments are parenthesized so that expressions such as "&dev" expand
 * correctly.
 */
#define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \
	for ((_mb_id) = (_vm)->sbm.first_mb_id; \
	     (_mb_id) < (_vm)->sbm.next_mb_id && (_vm)->sbm.mb_count[_state]; \
	     (_mb_id)++) \
		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == (_state))
471 
/*
 * Iterate over all Linux memory blocks of @_vm that are in state @_state,
 * from sbm.next_mb_id - 1 downwards to sbm.first_mb_id. @_mb_id is the loop
 * cursor (an lvalue). The walk stops early once sbm.mb_count[@_state] is
 * zero. The macro ends in an "if", so it must be followed by exactly one
 * statement or block.
 *
 * Arguments are parenthesized so that expressions such as "&dev" expand
 * correctly.
 */
#define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \
	for ((_mb_id) = (_vm)->sbm.next_mb_id - 1; \
	     (_mb_id) >= (_vm)->sbm.first_mb_id && (_vm)->sbm.mb_count[_state]; \
	     (_mb_id)--) \
		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == (_state))
477 
478 /*
479  * Calculate the bit number in the subblock bitmap for the given subblock
480  * inside the given memory block.
481  */
482 static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm,
483 					  unsigned long mb_id, int sb_id)
484 {
485 	return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id;
486 }
487 
488 /*
489  * Mark all selected subblocks plugged.
490  *
491  * Will not modify the state of the memory block.
492  */
493 static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm,
494 					  unsigned long mb_id, int sb_id,
495 					  int count)
496 {
497 	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
498 
499 	__bitmap_set(vm->sbm.sb_states, bit, count);
500 }
501 
502 /*
503  * Mark all selected subblocks unplugged.
504  *
505  * Will not modify the state of the memory block.
506  */
507 static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm,
508 					    unsigned long mb_id, int sb_id,
509 					    int count)
510 {
511 	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
512 
513 	__bitmap_clear(vm->sbm.sb_states, bit, count);
514 }
515 
516 /*
517  * Test if all selected subblocks are plugged.
518  */
519 static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm,
520 					   unsigned long mb_id, int sb_id,
521 					   int count)
522 {
523 	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
524 
525 	if (count == 1)
526 		return test_bit(bit, vm->sbm.sb_states);
527 
528 	/* TODO: Helper similar to bitmap_set() */
529 	return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >=
530 	       bit + count;
531 }
532 
533 /*
534  * Test if all selected subblocks are unplugged.
535  */
536 static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm,
537 					     unsigned long mb_id, int sb_id,
538 					     int count)
539 {
540 	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
541 
542 	/* TODO: Helper similar to bitmap_set() */
543 	return find_next_bit(vm->sbm.sb_states, bit + count, bit) >=
544 	       bit + count;
545 }
546 
547 /*
548  * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is
549  * none.
550  */
551 static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm,
552 					    unsigned long mb_id)
553 {
554 	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0);
555 
556 	return find_next_zero_bit(vm->sbm.sb_states,
557 				  bit + vm->sbm.sbs_per_mb, bit) - bit;
558 }
559 
560 /*
561  * Prepare the subblock bitmap for the next memory block.
562  */
563 static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm)
564 {
565 	const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id;
566 	const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb;
567 	const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb;
568 	int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long));
569 	int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long));
570 	unsigned long *new_bitmap, *old_bitmap;
571 
572 	if (vm->sbm.sb_states && old_pages == new_pages)
573 		return 0;
574 
575 	new_bitmap = vzalloc(new_pages * PAGE_SIZE);
576 	if (!new_bitmap)
577 		return -ENOMEM;
578 
579 	mutex_lock(&vm->hotplug_mutex);
580 	if (new_bitmap)
581 		memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE);
582 
583 	old_bitmap = vm->sbm.sb_states;
584 	vm->sbm.sb_states = new_bitmap;
585 	mutex_unlock(&vm->hotplug_mutex);
586 
587 	vfree(old_bitmap);
588 	return 0;
589 }
590 
591 /*
592  * Test if we could add memory without creating too much offline memory -
593  * to avoid running OOM if memory is getting onlined deferred.
594  */
595 static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size)
596 {
597 	if (WARN_ON_ONCE(size > vm->offline_threshold))
598 		return false;
599 
600 	return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold;
601 }
602 
603 /*
604  * Try adding memory to Linux. Will usually only fail if out of memory.
605  *
606  * Must not be called with the vm->hotplug_mutex held (possible deadlock with
607  * onlining code).
608  *
609  * Will not modify the state of memory blocks in virtio-mem.
610  */
611 static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
612 				 uint64_t size)
613 {
614 	int rc;
615 
616 	/*
617 	 * When force-unloading the driver and we still have memory added to
618 	 * Linux, the resource name has to stay.
619 	 */
620 	if (!vm->resource_name) {
621 		vm->resource_name = kstrdup_const("System RAM (virtio_mem)",
622 						  GFP_KERNEL);
623 		if (!vm->resource_name)
624 			return -ENOMEM;
625 	}
626 
627 	dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr,
628 		addr + size - 1);
629 	/* Memory might get onlined immediately. */
630 	atomic64_add(size, &vm->offline_size);
631 	rc = add_memory_driver_managed(vm->mgid, addr, size, vm->resource_name,
632 				       MHP_MERGE_RESOURCE | MHP_NID_IS_MGID);
633 	if (rc) {
634 		atomic64_sub(size, &vm->offline_size);
635 		dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
636 		/*
637 		 * TODO: Linux MM does not properly clean up yet in all cases
638 		 * where adding of memory failed - especially on -ENOMEM.
639 		 */
640 	}
641 	return rc;
642 }
643 
644 /*
645  * See virtio_mem_add_memory(): Try adding a single Linux memory block.
646  */
647 static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id)
648 {
649 	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
650 	const uint64_t size = memory_block_size_bytes();
651 
652 	return virtio_mem_add_memory(vm, addr, size);
653 }
654 
655 /*
656  * See virtio_mem_add_memory(): Try adding a big block.
657  */
658 static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id)
659 {
660 	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
661 	const uint64_t size = vm->bbm.bb_size;
662 
663 	return virtio_mem_add_memory(vm, addr, size);
664 }
665 
666 /*
667  * Try removing memory from Linux. Will only fail if memory blocks aren't
668  * offline.
669  *
670  * Must not be called with the vm->hotplug_mutex held (possible deadlock with
671  * onlining code).
672  *
673  * Will not modify the state of memory blocks in virtio-mem.
674  */
675 static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
676 				    uint64_t size)
677 {
678 	int rc;
679 
680 	dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
681 		addr + size - 1);
682 	rc = remove_memory(addr, size);
683 	if (!rc) {
684 		atomic64_sub(size, &vm->offline_size);
685 		/*
686 		 * We might have freed up memory we can now unplug, retry
687 		 * immediately instead of waiting.
688 		 */
689 		virtio_mem_retry(vm);
690 	} else {
691 		dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc);
692 	}
693 	return rc;
694 }
695 
696 /*
697  * See virtio_mem_remove_memory(): Try removing a single Linux memory block.
698  */
699 static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id)
700 {
701 	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
702 	const uint64_t size = memory_block_size_bytes();
703 
704 	return virtio_mem_remove_memory(vm, addr, size);
705 }
706 
707 /*
708  * Try offlining and removing memory from Linux.
709  *
710  * Must not be called with the vm->hotplug_mutex held (possible deadlock with
711  * onlining code).
712  *
713  * Will not modify the state of memory blocks in virtio-mem.
714  */
715 static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
716 						uint64_t addr,
717 						uint64_t size)
718 {
719 	int rc;
720 
721 	dev_dbg(&vm->vdev->dev,
722 		"offlining and removing memory: 0x%llx - 0x%llx\n", addr,
723 		addr + size - 1);
724 
725 	rc = offline_and_remove_memory(addr, size);
726 	if (!rc) {
727 		atomic64_sub(size, &vm->offline_size);
728 		/*
729 		 * We might have freed up memory we can now unplug, retry
730 		 * immediately instead of waiting.
731 		 */
732 		virtio_mem_retry(vm);
733 	} else {
734 		dev_dbg(&vm->vdev->dev,
735 			"offlining and removing memory failed: %d\n", rc);
736 	}
737 	return rc;
738 }
739 
740 /*
741  * See virtio_mem_offline_and_remove_memory(): Try offlining and removing
742  * a single Linux memory block.
743  */
744 static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm,
745 						unsigned long mb_id)
746 {
747 	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
748 	const uint64_t size = memory_block_size_bytes();
749 
750 	return virtio_mem_offline_and_remove_memory(vm, addr, size);
751 }
752 
753 /*
754  * See virtio_mem_offline_and_remove_memory(): Try to offline and remove a
755  * all Linux memory blocks covered by the big block.
756  */
757 static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm,
758 						unsigned long bb_id)
759 {
760 	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
761 	const uint64_t size = vm->bbm.bb_size;
762 
763 	return virtio_mem_offline_and_remove_memory(vm, addr, size);
764 }
765 
766 /*
767  * Trigger the workqueue so the device can perform its magic.
768  */
769 static void virtio_mem_retry(struct virtio_mem *vm)
770 {
771 	unsigned long flags;
772 
773 	spin_lock_irqsave(&vm->removal_lock, flags);
774 	if (!vm->removing)
775 		queue_work(system_freezable_wq, &vm->wq);
776 	spin_unlock_irqrestore(&vm->removal_lock, flags);
777 }
778 
779 static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id)
780 {
781 	int node = NUMA_NO_NODE;
782 
783 #if defined(CONFIG_ACPI_NUMA)
784 	if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM))
785 		node = pxm_to_node(node_id);
786 #endif
787 	return node;
788 }
789 
790 /*
791  * Test if a virtio-mem device overlaps with the given range. Can be called
792  * from (notifier) callbacks lockless.
793  */
794 static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start,
795 				      uint64_t size)
796 {
797 	return start < vm->addr + vm->region_size && vm->addr < start + size;
798 }
799 
800 /*
801  * Test if a virtio-mem device contains a given range. Can be called from
802  * (notifier) callbacks lockless.
803  */
804 static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start,
805 				      uint64_t size)
806 {
807 	return start >= vm->addr && start + size <= vm->addr + vm->region_size;
808 }
809 
810 static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm,
811 					      unsigned long mb_id)
812 {
813 	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
814 	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
815 	case VIRTIO_MEM_SBM_MB_OFFLINE:
816 		return NOTIFY_OK;
817 	default:
818 		break;
819 	}
820 	dev_warn_ratelimited(&vm->vdev->dev,
821 			     "memory block onlining denied\n");
822 	return NOTIFY_BAD;
823 }
824 
825 static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm,
826 					  unsigned long mb_id)
827 {
828 	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
829 	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
830 	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
831 		virtio_mem_sbm_set_mb_state(vm, mb_id,
832 					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
833 		break;
834 	case VIRTIO_MEM_SBM_MB_KERNEL:
835 	case VIRTIO_MEM_SBM_MB_MOVABLE:
836 		virtio_mem_sbm_set_mb_state(vm, mb_id,
837 					    VIRTIO_MEM_SBM_MB_OFFLINE);
838 		break;
839 	default:
840 		BUG();
841 		break;
842 	}
843 }
844 
845 static void virtio_mem_sbm_notify_online(struct virtio_mem *vm,
846 					 unsigned long mb_id,
847 					 unsigned long start_pfn)
848 {
849 	const bool is_movable = page_zonenum(pfn_to_page(start_pfn)) ==
850 				ZONE_MOVABLE;
851 	int new_state;
852 
853 	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
854 	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
855 		new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL;
856 		if (is_movable)
857 			new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL;
858 		break;
859 	case VIRTIO_MEM_SBM_MB_OFFLINE:
860 		new_state = VIRTIO_MEM_SBM_MB_KERNEL;
861 		if (is_movable)
862 			new_state = VIRTIO_MEM_SBM_MB_MOVABLE;
863 		break;
864 	default:
865 		BUG();
866 		break;
867 	}
868 	virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
869 }
870 
871 static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm,
872 						unsigned long mb_id)
873 {
874 	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
875 	unsigned long pfn;
876 	int sb_id;
877 
878 	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
879 		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
880 			continue;
881 		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
882 			       sb_id * vm->sbm.sb_size);
883 		virtio_mem_fake_offline_going_offline(pfn, nr_pages);
884 	}
885 }
886 
887 static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm,
888 						 unsigned long mb_id)
889 {
890 	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
891 	unsigned long pfn;
892 	int sb_id;
893 
894 	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
895 		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
896 			continue;
897 		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
898 			       sb_id * vm->sbm.sb_size);
899 		virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
900 	}
901 }
902 
903 static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm,
904 						unsigned long bb_id,
905 						unsigned long pfn,
906 						unsigned long nr_pages)
907 {
908 	/*
909 	 * When marked as "fake-offline", all online memory of this device block
910 	 * is allocated by us. Otherwise, we don't have any memory allocated.
911 	 */
912 	if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
913 	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
914 		return;
915 	virtio_mem_fake_offline_going_offline(pfn, nr_pages);
916 }
917 
918 static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm,
919 						 unsigned long bb_id,
920 						 unsigned long pfn,
921 						 unsigned long nr_pages)
922 {
923 	if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
924 	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
925 		return;
926 	virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
927 }
928 
929 /*
930  * This callback will either be called synchronously from add_memory() or
931  * asynchronously (e.g., triggered via user space). We have to be careful
932  * with locking when calling add_memory().
933  */
934 static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
935 					 unsigned long action, void *arg)
936 {
937 	struct virtio_mem *vm = container_of(nb, struct virtio_mem,
938 					     memory_notifier);
939 	struct memory_notify *mhp = arg;
940 	const unsigned long start = PFN_PHYS(mhp->start_pfn);
941 	const unsigned long size = PFN_PHYS(mhp->nr_pages);
942 	int rc = NOTIFY_OK;
943 	unsigned long id;
944 
945 	if (!virtio_mem_overlaps_range(vm, start, size))
946 		return NOTIFY_DONE;
947 
948 	if (vm->in_sbm) {
949 		id = virtio_mem_phys_to_mb_id(start);
950 		/*
951 		 * In SBM, we add memory in separate memory blocks - we expect
952 		 * it to be onlined/offlined in the same granularity. Bail out
953 		 * if this ever changes.
954 		 */
955 		if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
956 				 !IS_ALIGNED(start, memory_block_size_bytes())))
957 			return NOTIFY_BAD;
958 	} else {
959 		id = virtio_mem_phys_to_bb_id(vm, start);
960 		/*
961 		 * In BBM, we only care about onlining/offlining happening
962 		 * within a single big block, we don't care about the
963 		 * actual granularity as we don't track individual Linux
964 		 * memory blocks.
965 		 */
966 		if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1)))
967 			return NOTIFY_BAD;
968 	}
969 
970 	/*
971 	 * Avoid circular locking lockdep warnings. We lock the mutex
972 	 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The
973 	 * blocking_notifier_call_chain() has its own lock, which gets unlocked
974 	 * between both notifier calls and will bail out. False positive.
975 	 */
976 	lockdep_off();
977 
978 	switch (action) {
979 	case MEM_GOING_OFFLINE:
980 		mutex_lock(&vm->hotplug_mutex);
981 		if (vm->removing) {
982 			rc = notifier_from_errno(-EBUSY);
983 			mutex_unlock(&vm->hotplug_mutex);
984 			break;
985 		}
986 		vm->hotplug_active = true;
987 		if (vm->in_sbm)
988 			virtio_mem_sbm_notify_going_offline(vm, id);
989 		else
990 			virtio_mem_bbm_notify_going_offline(vm, id,
991 							    mhp->start_pfn,
992 							    mhp->nr_pages);
993 		break;
994 	case MEM_GOING_ONLINE:
995 		mutex_lock(&vm->hotplug_mutex);
996 		if (vm->removing) {
997 			rc = notifier_from_errno(-EBUSY);
998 			mutex_unlock(&vm->hotplug_mutex);
999 			break;
1000 		}
1001 		vm->hotplug_active = true;
1002 		if (vm->in_sbm)
1003 			rc = virtio_mem_sbm_notify_going_online(vm, id);
1004 		break;
1005 	case MEM_OFFLINE:
1006 		if (vm->in_sbm)
1007 			virtio_mem_sbm_notify_offline(vm, id);
1008 
1009 		atomic64_add(size, &vm->offline_size);
1010 		/*
1011 		 * Trigger the workqueue. Now that we have some offline memory,
1012 		 * maybe we can handle pending unplug requests.
1013 		 */
1014 		if (!unplug_online)
1015 			virtio_mem_retry(vm);
1016 
1017 		vm->hotplug_active = false;
1018 		mutex_unlock(&vm->hotplug_mutex);
1019 		break;
1020 	case MEM_ONLINE:
1021 		if (vm->in_sbm)
1022 			virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn);
1023 
1024 		atomic64_sub(size, &vm->offline_size);
1025 		/*
1026 		 * Start adding more memory once we onlined half of our
1027 		 * threshold. Don't trigger if it's possibly due to our actipn
1028 		 * (e.g., us adding memory which gets onlined immediately from
1029 		 * the core).
1030 		 */
1031 		if (!atomic_read(&vm->wq_active) &&
1032 		    virtio_mem_could_add_memory(vm, vm->offline_threshold / 2))
1033 			virtio_mem_retry(vm);
1034 
1035 		vm->hotplug_active = false;
1036 		mutex_unlock(&vm->hotplug_mutex);
1037 		break;
1038 	case MEM_CANCEL_OFFLINE:
1039 		if (!vm->hotplug_active)
1040 			break;
1041 		if (vm->in_sbm)
1042 			virtio_mem_sbm_notify_cancel_offline(vm, id);
1043 		else
1044 			virtio_mem_bbm_notify_cancel_offline(vm, id,
1045 							     mhp->start_pfn,
1046 							     mhp->nr_pages);
1047 		vm->hotplug_active = false;
1048 		mutex_unlock(&vm->hotplug_mutex);
1049 		break;
1050 	case MEM_CANCEL_ONLINE:
1051 		if (!vm->hotplug_active)
1052 			break;
1053 		vm->hotplug_active = false;
1054 		mutex_unlock(&vm->hotplug_mutex);
1055 		break;
1056 	default:
1057 		break;
1058 	}
1059 
1060 	lockdep_on();
1061 
1062 	return rc;
1063 }
1064 
1065 /*
1066  * Set a range of pages PG_offline. Remember pages that were never onlined
1067  * (via generic_online_page()) using PageDirty().
1068  */
1069 static void virtio_mem_set_fake_offline(unsigned long pfn,
1070 					unsigned long nr_pages, bool onlined)
1071 {
1072 	page_offline_begin();
1073 	for (; nr_pages--; pfn++) {
1074 		struct page *page = pfn_to_page(pfn);
1075 
1076 		__SetPageOffline(page);
1077 		if (!onlined) {
1078 			SetPageDirty(page);
1079 			/* FIXME: remove after cleanups */
1080 			ClearPageReserved(page);
1081 		}
1082 	}
1083 	page_offline_end();
1084 }
1085 
1086 /*
1087  * Clear PG_offline from a range of pages. If the pages were never onlined,
1088  * (via generic_online_page()), clear PageDirty().
1089  */
1090 static void virtio_mem_clear_fake_offline(unsigned long pfn,
1091 					  unsigned long nr_pages, bool onlined)
1092 {
1093 	for (; nr_pages--; pfn++) {
1094 		struct page *page = pfn_to_page(pfn);
1095 
1096 		__ClearPageOffline(page);
1097 		if (!onlined)
1098 			ClearPageDirty(page);
1099 	}
1100 }
1101 
1102 /*
1103  * Release a range of fake-offline pages to the buddy, effectively
1104  * fake-onlining them.
1105  */
1106 static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
1107 {
1108 	const unsigned long max_nr_pages = MAX_ORDER_NR_PAGES;
1109 	unsigned long i;
1110 
1111 	/*
1112 	 * We are always called at least with MAX_ORDER_NR_PAGES
1113 	 * granularity/alignment (e.g., the way subblocks work). All pages
1114 	 * inside such a block are alike.
1115 	 */
1116 	for (i = 0; i < nr_pages; i += max_nr_pages) {
1117 		struct page *page = pfn_to_page(pfn + i);
1118 
1119 		/*
1120 		 * If the page is PageDirty(), it was kept fake-offline when
1121 		 * onlining the memory block. Otherwise, it was allocated
1122 		 * using alloc_contig_range(). All pages in a subblock are
1123 		 * alike.
1124 		 */
1125 		if (PageDirty(page)) {
1126 			virtio_mem_clear_fake_offline(pfn + i, max_nr_pages,
1127 						      false);
1128 			generic_online_page(page, MAX_ORDER - 1);
1129 		} else {
1130 			virtio_mem_clear_fake_offline(pfn + i, max_nr_pages,
1131 						      true);
1132 			free_contig_range(pfn + i, max_nr_pages);
1133 			adjust_managed_page_count(page, max_nr_pages);
1134 		}
1135 	}
1136 }
1137 
1138 /*
1139  * Try to allocate a range, marking pages fake-offline, effectively
1140  * fake-offlining them.
1141  */
1142 static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages)
1143 {
1144 	const bool is_movable = page_zonenum(pfn_to_page(pfn)) ==
1145 				ZONE_MOVABLE;
1146 	int rc, retry_count;
1147 
1148 	/*
1149 	 * TODO: We want an alloc_contig_range() mode that tries to allocate
1150 	 * harder (e.g., dealing with temporarily pinned pages, PCP), especially
1151 	 * with ZONE_MOVABLE. So for now, retry a couple of times with
1152 	 * ZONE_MOVABLE before giving up - because that zone is supposed to give
1153 	 * some guarantees.
1154 	 */
1155 	for (retry_count = 0; retry_count < 5; retry_count++) {
1156 		rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE,
1157 					GFP_KERNEL);
1158 		if (rc == -ENOMEM)
1159 			/* whoops, out of memory */
1160 			return rc;
1161 		else if (rc && !is_movable)
1162 			break;
1163 		else if (rc)
1164 			continue;
1165 
1166 		virtio_mem_set_fake_offline(pfn, nr_pages, true);
1167 		adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
1168 		return 0;
1169 	}
1170 
1171 	return -EBUSY;
1172 }
1173 
/*
 * Prepare fake-offline pages for memory offlining, such that mm-core can
 * skip them when offlining.
 */
static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
						  unsigned long nr_pages)
{
	const unsigned long end_pfn = pfn + nr_pages;

	/*
	 * Add the unplugged pages to the managed page counters, so offlining
	 * code can correctly subtract them again.
	 */
	adjust_managed_page_count(pfn_to_page(pfn), nr_pages);

	/*
	 * Drop our reference to the pages so the memory can get offlined.
	 * Nobody else is expected to hold a reference; complain if one does.
	 */
	for (; pfn != end_pfn; pfn++) {
		struct page *page = pfn_to_page(pfn);

		if (WARN_ON(!page_ref_dec_and_test(page)))
			dump_page(page, "fake-offline page referenced");
	}
}
1197 
/*
 * Memory offlining was canceled: revert what
 * virtio_mem_fake_offline_going_offline() did to the fake-offline pages.
 */
static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
						   unsigned long nr_pages)
{
	const unsigned long end_pfn = pfn + nr_pages;
	unsigned long cur;

	/* Subtract the unplugged pages from the managed page counters again. */
	adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);

	/* Re-take the reference we dropped when going offline. */
	for (cur = pfn; cur != end_pfn; cur++)
		page_ref_inc(pfn_to_page(cur));
}
1215 
1216 static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
1217 {
1218 	const unsigned long addr = page_to_phys(page);
1219 	unsigned long id, sb_id;
1220 	struct virtio_mem *vm;
1221 	bool do_online;
1222 
1223 	rcu_read_lock();
1224 	list_for_each_entry_rcu(vm, &virtio_mem_devices, next) {
1225 		if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order)))
1226 			continue;
1227 
1228 		if (vm->in_sbm) {
1229 			/*
1230 			 * We exploit here that subblocks have at least
1231 			 * MAX_ORDER_NR_PAGES size/alignment - so we cannot
1232 			 * cross subblocks within one call.
1233 			 */
1234 			id = virtio_mem_phys_to_mb_id(addr);
1235 			sb_id = virtio_mem_phys_to_sb_id(vm, addr);
1236 			do_online = virtio_mem_sbm_test_sb_plugged(vm, id,
1237 								   sb_id, 1);
1238 		} else {
1239 			/*
1240 			 * If the whole block is marked fake offline, keep
1241 			 * everything that way.
1242 			 */
1243 			id = virtio_mem_phys_to_bb_id(vm, addr);
1244 			do_online = virtio_mem_bbm_get_bb_state(vm, id) !=
1245 				    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE;
1246 		}
1247 
1248 		/*
1249 		 * virtio_mem_set_fake_offline() might sleep, we don't need
1250 		 * the device anymore. See virtio_mem_remove() how races
1251 		 * between memory onlining and device removal are handled.
1252 		 */
1253 		rcu_read_unlock();
1254 
1255 		if (do_online)
1256 			generic_online_page(page, order);
1257 		else
1258 			virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order,
1259 						    false);
1260 		return;
1261 	}
1262 	rcu_read_unlock();
1263 
1264 	/* not virtio-mem memory, but e.g., a DIMM. online it */
1265 	generic_online_page(page, order);
1266 }
1267 
1268 static uint64_t virtio_mem_send_request(struct virtio_mem *vm,
1269 					const struct virtio_mem_req *req)
1270 {
1271 	struct scatterlist *sgs[2], sg_req, sg_resp;
1272 	unsigned int len;
1273 	int rc;
1274 
1275 	/* don't use the request residing on the stack (vaddr) */
1276 	vm->req = *req;
1277 
1278 	/* out: buffer for request */
1279 	sg_init_one(&sg_req, &vm->req, sizeof(vm->req));
1280 	sgs[0] = &sg_req;
1281 
1282 	/* in: buffer for response */
1283 	sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp));
1284 	sgs[1] = &sg_resp;
1285 
1286 	rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL);
1287 	if (rc < 0)
1288 		return rc;
1289 
1290 	virtqueue_kick(vm->vq);
1291 
1292 	/* wait for a response */
1293 	wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len));
1294 
1295 	return virtio16_to_cpu(vm->vdev, vm->resp.type);
1296 }
1297 
1298 static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
1299 					uint64_t size)
1300 {
1301 	const uint64_t nb_vm_blocks = size / vm->device_block_size;
1302 	const struct virtio_mem_req req = {
1303 		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG),
1304 		.u.plug.addr = cpu_to_virtio64(vm->vdev, addr),
1305 		.u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
1306 	};
1307 	int rc = -ENOMEM;
1308 
1309 	if (atomic_read(&vm->config_changed))
1310 		return -EAGAIN;
1311 
1312 	dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr,
1313 		addr + size - 1);
1314 
1315 	switch (virtio_mem_send_request(vm, &req)) {
1316 	case VIRTIO_MEM_RESP_ACK:
1317 		vm->plugged_size += size;
1318 		return 0;
1319 	case VIRTIO_MEM_RESP_NACK:
1320 		rc = -EAGAIN;
1321 		break;
1322 	case VIRTIO_MEM_RESP_BUSY:
1323 		rc = -ETXTBSY;
1324 		break;
1325 	case VIRTIO_MEM_RESP_ERROR:
1326 		rc = -EINVAL;
1327 		break;
1328 	default:
1329 		break;
1330 	}
1331 
1332 	dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc);
1333 	return rc;
1334 }
1335 
1336 static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
1337 					  uint64_t size)
1338 {
1339 	const uint64_t nb_vm_blocks = size / vm->device_block_size;
1340 	const struct virtio_mem_req req = {
1341 		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG),
1342 		.u.unplug.addr = cpu_to_virtio64(vm->vdev, addr),
1343 		.u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
1344 	};
1345 	int rc = -ENOMEM;
1346 
1347 	if (atomic_read(&vm->config_changed))
1348 		return -EAGAIN;
1349 
1350 	dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr,
1351 		addr + size - 1);
1352 
1353 	switch (virtio_mem_send_request(vm, &req)) {
1354 	case VIRTIO_MEM_RESP_ACK:
1355 		vm->plugged_size -= size;
1356 		return 0;
1357 	case VIRTIO_MEM_RESP_BUSY:
1358 		rc = -ETXTBSY;
1359 		break;
1360 	case VIRTIO_MEM_RESP_ERROR:
1361 		rc = -EINVAL;
1362 		break;
1363 	default:
1364 		break;
1365 	}
1366 
1367 	dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc);
1368 	return rc;
1369 }
1370 
1371 static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
1372 {
1373 	const struct virtio_mem_req req = {
1374 		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL),
1375 	};
1376 	int rc = -ENOMEM;
1377 
1378 	dev_dbg(&vm->vdev->dev, "unplugging all memory");
1379 
1380 	switch (virtio_mem_send_request(vm, &req)) {
1381 	case VIRTIO_MEM_RESP_ACK:
1382 		vm->unplug_all_required = false;
1383 		vm->plugged_size = 0;
1384 		/* usable region might have shrunk */
1385 		atomic_set(&vm->config_changed, 1);
1386 		return 0;
1387 	case VIRTIO_MEM_RESP_BUSY:
1388 		rc = -ETXTBSY;
1389 		break;
1390 	default:
1391 		break;
1392 	}
1393 
1394 	dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc);
1395 	return rc;
1396 }
1397 
1398 /*
1399  * Plug selected subblocks. Updates the plugged state, but not the state
1400  * of the memory block.
1401  */
1402 static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
1403 				  int sb_id, int count)
1404 {
1405 	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
1406 			      sb_id * vm->sbm.sb_size;
1407 	const uint64_t size = count * vm->sbm.sb_size;
1408 	int rc;
1409 
1410 	rc = virtio_mem_send_plug_request(vm, addr, size);
1411 	if (!rc)
1412 		virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
1413 	return rc;
1414 }
1415 
1416 /*
1417  * Unplug selected subblocks. Updates the plugged state, but not the state
1418  * of the memory block.
1419  */
1420 static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
1421 				    int sb_id, int count)
1422 {
1423 	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
1424 			      sb_id * vm->sbm.sb_size;
1425 	const uint64_t size = count * vm->sbm.sb_size;
1426 	int rc;
1427 
1428 	rc = virtio_mem_send_unplug_request(vm, addr, size);
1429 	if (!rc)
1430 		virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count);
1431 	return rc;
1432 }
1433 
1434 /*
1435  * Request to unplug a big block.
1436  *
1437  * Will not modify the state of the big block.
1438  */
1439 static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id)
1440 {
1441 	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
1442 	const uint64_t size = vm->bbm.bb_size;
1443 
1444 	return virtio_mem_send_unplug_request(vm, addr, size);
1445 }
1446 
1447 /*
1448  * Request to plug a big block.
1449  *
1450  * Will not modify the state of the big block.
1451  */
1452 static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id)
1453 {
1454 	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
1455 	const uint64_t size = vm->bbm.bb_size;
1456 
1457 	return virtio_mem_send_plug_request(vm, addr, size);
1458 }
1459 
1460 /*
1461  * Unplug the desired number of plugged subblocks of a offline or not-added
1462  * memory block. Will fail if any subblock cannot get unplugged (instead of
1463  * skipping it).
1464  *
1465  * Will not modify the state of the memory block.
1466  *
1467  * Note: can fail after some subblocks were unplugged.
1468  */
1469 static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm,
1470 					    unsigned long mb_id, uint64_t *nb_sb)
1471 {
1472 	int sb_id, count;
1473 	int rc;
1474 
1475 	sb_id = vm->sbm.sbs_per_mb - 1;
1476 	while (*nb_sb) {
1477 		/* Find the next candidate subblock */
1478 		while (sb_id >= 0 &&
1479 		       virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1))
1480 			sb_id--;
1481 		if (sb_id < 0)
1482 			break;
1483 		/* Try to unplug multiple subblocks at a time */
1484 		count = 1;
1485 		while (count < *nb_sb && sb_id > 0 &&
1486 		       virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
1487 			count++;
1488 			sb_id--;
1489 		}
1490 
1491 		rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
1492 		if (rc)
1493 			return rc;
1494 		*nb_sb -= count;
1495 		sb_id--;
1496 	}
1497 
1498 	return 0;
1499 }
1500 
1501 /*
1502  * Unplug all plugged subblocks of an offline or not-added memory block.
1503  *
1504  * Will not modify the state of the memory block.
1505  *
1506  * Note: can fail after some subblocks were unplugged.
1507  */
1508 static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id)
1509 {
1510 	uint64_t nb_sb = vm->sbm.sbs_per_mb;
1511 
1512 	return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb);
1513 }
1514 
1515 /*
1516  * Prepare tracking data for the next memory block.
1517  */
1518 static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm,
1519 					  unsigned long *mb_id)
1520 {
1521 	int rc;
1522 
1523 	if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id)
1524 		return -ENOSPC;
1525 
1526 	/* Resize the state array if required. */
1527 	rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm);
1528 	if (rc)
1529 		return rc;
1530 
1531 	/* Resize the subblock bitmap if required. */
1532 	rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm);
1533 	if (rc)
1534 		return rc;
1535 
1536 	vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++;
1537 	*mb_id = vm->sbm.next_mb_id++;
1538 	return 0;
1539 }
1540 
1541 /*
1542  * Try to plug the desired number of subblocks and add the memory block
1543  * to Linux.
1544  *
1545  * Will modify the state of the memory block.
1546  */
1547 static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
1548 					  unsigned long mb_id, uint64_t *nb_sb)
1549 {
1550 	const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb);
1551 	int rc;
1552 
1553 	if (WARN_ON_ONCE(!count))
1554 		return -EINVAL;
1555 
1556 	/*
1557 	 * Plug the requested number of subblocks before adding it to linux,
1558 	 * so that onlining will directly online all plugged subblocks.
1559 	 */
1560 	rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count);
1561 	if (rc)
1562 		return rc;
1563 
1564 	/*
1565 	 * Mark the block properly offline before adding it to Linux,
1566 	 * so the memory notifiers will find the block in the right state.
1567 	 */
1568 	if (count == vm->sbm.sbs_per_mb)
1569 		virtio_mem_sbm_set_mb_state(vm, mb_id,
1570 					    VIRTIO_MEM_SBM_MB_OFFLINE);
1571 	else
1572 		virtio_mem_sbm_set_mb_state(vm, mb_id,
1573 					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
1574 
1575 	/* Add the memory block to linux - if that fails, try to unplug. */
1576 	rc = virtio_mem_sbm_add_mb(vm, mb_id);
1577 	if (rc) {
1578 		int new_state = VIRTIO_MEM_SBM_MB_UNUSED;
1579 
1580 		if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count))
1581 			new_state = VIRTIO_MEM_SBM_MB_PLUGGED;
1582 		virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
1583 		return rc;
1584 	}
1585 
1586 	*nb_sb -= count;
1587 	return 0;
1588 }
1589 
1590 /*
1591  * Try to plug the desired number of subblocks of a memory block that
1592  * is already added to Linux.
1593  *
1594  * Will modify the state of the memory block.
1595  *
1596  * Note: Can fail after some subblocks were successfully plugged.
1597  */
1598 static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
1599 				      unsigned long mb_id, uint64_t *nb_sb)
1600 {
1601 	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
1602 	unsigned long pfn, nr_pages;
1603 	int sb_id, count;
1604 	int rc;
1605 
1606 	if (WARN_ON_ONCE(!*nb_sb))
1607 		return -EINVAL;
1608 
1609 	while (*nb_sb) {
1610 		sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id);
1611 		if (sb_id >= vm->sbm.sbs_per_mb)
1612 			break;
1613 		count = 1;
1614 		while (count < *nb_sb &&
1615 		       sb_id + count < vm->sbm.sbs_per_mb &&
1616 		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1))
1617 			count++;
1618 
1619 		rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count);
1620 		if (rc)
1621 			return rc;
1622 		*nb_sb -= count;
1623 		if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL)
1624 			continue;
1625 
1626 		/* fake-online the pages if the memory block is online */
1627 		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
1628 			       sb_id * vm->sbm.sb_size);
1629 		nr_pages = PFN_DOWN(count * vm->sbm.sb_size);
1630 		virtio_mem_fake_online(pfn, nr_pages);
1631 	}
1632 
1633 	if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
1634 		virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1);
1635 
1636 	return 0;
1637 }
1638 
1639 static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
1640 {
1641 	const int mb_states[] = {
1642 		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
1643 		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
1644 		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
1645 	};
1646 	uint64_t nb_sb = diff / vm->sbm.sb_size;
1647 	unsigned long mb_id;
1648 	int rc, i;
1649 
1650 	if (!nb_sb)
1651 		return 0;
1652 
1653 	/* Don't race with onlining/offlining */
1654 	mutex_lock(&vm->hotplug_mutex);
1655 
1656 	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
1657 		virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) {
1658 			rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb);
1659 			if (rc || !nb_sb)
1660 				goto out_unlock;
1661 			cond_resched();
1662 		}
1663 	}
1664 
1665 	/*
1666 	 * We won't be working on online/offline memory blocks from this point,
1667 	 * so we can't race with memory onlining/offlining. Drop the mutex.
1668 	 */
1669 	mutex_unlock(&vm->hotplug_mutex);
1670 
1671 	/* Try to plug and add unused blocks */
1672 	virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) {
1673 		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
1674 			return -ENOSPC;
1675 
1676 		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
1677 		if (rc || !nb_sb)
1678 			return rc;
1679 		cond_resched();
1680 	}
1681 
1682 	/* Try to prepare, plug and add new blocks */
1683 	while (nb_sb) {
1684 		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
1685 			return -ENOSPC;
1686 
1687 		rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id);
1688 		if (rc)
1689 			return rc;
1690 		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
1691 		if (rc)
1692 			return rc;
1693 		cond_resched();
1694 	}
1695 
1696 	return 0;
1697 out_unlock:
1698 	mutex_unlock(&vm->hotplug_mutex);
1699 	return rc;
1700 }
1701 
1702 /*
1703  * Plug a big block and add it to Linux.
1704  *
1705  * Will modify the state of the big block.
1706  */
1707 static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm,
1708 					  unsigned long bb_id)
1709 {
1710 	int rc;
1711 
1712 	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
1713 			 VIRTIO_MEM_BBM_BB_UNUSED))
1714 		return -EINVAL;
1715 
1716 	rc = virtio_mem_bbm_plug_bb(vm, bb_id);
1717 	if (rc)
1718 		return rc;
1719 	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
1720 
1721 	rc = virtio_mem_bbm_add_bb(vm, bb_id);
1722 	if (rc) {
1723 		if (!virtio_mem_bbm_unplug_bb(vm, bb_id))
1724 			virtio_mem_bbm_set_bb_state(vm, bb_id,
1725 						    VIRTIO_MEM_BBM_BB_UNUSED);
1726 		else
1727 			/* Retry from the main loop. */
1728 			virtio_mem_bbm_set_bb_state(vm, bb_id,
1729 						    VIRTIO_MEM_BBM_BB_PLUGGED);
1730 		return rc;
1731 	}
1732 	return 0;
1733 }
1734 
1735 /*
1736  * Prepare tracking data for the next big block.
1737  */
1738 static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm,
1739 					  unsigned long *bb_id)
1740 {
1741 	int rc;
1742 
1743 	if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id)
1744 		return -ENOSPC;
1745 
1746 	/* Resize the big block state array if required. */
1747 	rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm);
1748 	if (rc)
1749 		return rc;
1750 
1751 	vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++;
1752 	*bb_id = vm->bbm.next_bb_id;
1753 	vm->bbm.next_bb_id++;
1754 	return 0;
1755 }
1756 
1757 static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff)
1758 {
1759 	uint64_t nb_bb = diff / vm->bbm.bb_size;
1760 	unsigned long bb_id;
1761 	int rc;
1762 
1763 	if (!nb_bb)
1764 		return 0;
1765 
1766 	/* Try to plug and add unused big blocks */
1767 	virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) {
1768 		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
1769 			return -ENOSPC;
1770 
1771 		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
1772 		if (!rc)
1773 			nb_bb--;
1774 		if (rc || !nb_bb)
1775 			return rc;
1776 		cond_resched();
1777 	}
1778 
1779 	/* Try to prepare, plug and add new big blocks */
1780 	while (nb_bb) {
1781 		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
1782 			return -ENOSPC;
1783 
1784 		rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id);
1785 		if (rc)
1786 			return rc;
1787 		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
1788 		if (!rc)
1789 			nb_bb--;
1790 		if (rc)
1791 			return rc;
1792 		cond_resched();
1793 	}
1794 
1795 	return 0;
1796 }
1797 
1798 /*
1799  * Try to plug the requested amount of memory.
1800  */
1801 static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
1802 {
1803 	if (vm->in_sbm)
1804 		return virtio_mem_sbm_plug_request(vm, diff);
1805 	return virtio_mem_bbm_plug_request(vm, diff);
1806 }
1807 
1808 /*
1809  * Unplug the desired number of plugged subblocks of an offline memory block.
1810  * Will fail if any subblock cannot get unplugged (instead of skipping it).
1811  *
1812  * Will modify the state of the memory block. Might temporarily drop the
1813  * hotplug_mutex.
1814  *
1815  * Note: Can fail after some subblocks were successfully unplugged.
1816  */
1817 static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm,
1818 						unsigned long mb_id,
1819 						uint64_t *nb_sb)
1820 {
1821 	int rc;
1822 
1823 	rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb);
1824 
1825 	/* some subblocks might have been unplugged even on failure */
1826 	if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
1827 		virtio_mem_sbm_set_mb_state(vm, mb_id,
1828 					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
1829 	if (rc)
1830 		return rc;
1831 
1832 	if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
1833 		/*
1834 		 * Remove the block from Linux - this should never fail.
1835 		 * Hinder the block from getting onlined by marking it
1836 		 * unplugged. Temporarily drop the mutex, so
1837 		 * any pending GOING_ONLINE requests can be serviced/rejected.
1838 		 */
1839 		virtio_mem_sbm_set_mb_state(vm, mb_id,
1840 					    VIRTIO_MEM_SBM_MB_UNUSED);
1841 
1842 		mutex_unlock(&vm->hotplug_mutex);
1843 		rc = virtio_mem_sbm_remove_mb(vm, mb_id);
1844 		BUG_ON(rc);
1845 		mutex_lock(&vm->hotplug_mutex);
1846 	}
1847 	return 0;
1848 }
1849 
1850 /*
1851  * Unplug the given plugged subblocks of an online memory block.
1852  *
1853  * Will modify the state of the memory block.
1854  */
1855 static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm,
1856 					   unsigned long mb_id, int sb_id,
1857 					   int count)
1858 {
1859 	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count;
1860 	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
1861 	unsigned long start_pfn;
1862 	int rc;
1863 
1864 	start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
1865 			     sb_id * vm->sbm.sb_size);
1866 
1867 	rc = virtio_mem_fake_offline(start_pfn, nr_pages);
1868 	if (rc)
1869 		return rc;
1870 
1871 	/* Try to unplug the allocated memory */
1872 	rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
1873 	if (rc) {
1874 		/* Return the memory to the buddy. */
1875 		virtio_mem_fake_online(start_pfn, nr_pages);
1876 		return rc;
1877 	}
1878 
1879 	switch (old_state) {
1880 	case VIRTIO_MEM_SBM_MB_KERNEL:
1881 		virtio_mem_sbm_set_mb_state(vm, mb_id,
1882 					    VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL);
1883 		break;
1884 	case VIRTIO_MEM_SBM_MB_MOVABLE:
1885 		virtio_mem_sbm_set_mb_state(vm, mb_id,
1886 					    VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL);
1887 		break;
1888 	}
1889 
1890 	return 0;
1891 }
1892 
/*
 * Unplug the desired number of plugged subblocks of an online memory block.
 * Will skip subblocks that are busy.
 *
 * *nb_sb is decremented by the number of subblocks that were unplugged.
 *
 * Will modify the state of the memory block. Might temporarily drop the
 * hotplug_mutex.
 *
 * Note: Can fail after some subblocks were successfully unplugged. Can
 *       return 0 even if subblocks were busy and could not get unplugged.
 */
static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm,
					       unsigned long mb_id,
					       uint64_t *nb_sb)
{
	int rc, sb_id;

	/* If possible, try to unplug the complete block in one shot. */
	if (*nb_sb >= vm->sbm.sbs_per_mb &&
	    virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0,
						     vm->sbm.sbs_per_mb);
		if (!rc) {
			*nb_sb -= vm->sbm.sbs_per_mb;
			goto unplugged;
		} else if (rc != -EBUSY)
			return rc;
		/* -EBUSY: fall through and retry subblock by subblock. */
	}

	/* Fallback to single subblocks. */
	for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
		/* Find the next candidate subblock */
		while (sb_id >= 0 &&
		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
			sb_id--;
		if (sb_id < 0)
			break;

		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1);
		/* Busy subblock: skip it, the loop moves on to the next one. */
		if (rc == -EBUSY)
			continue;
		else if (rc)
			return rc;
		*nb_sb -= 1;
	}

unplugged:
	/*
	 * Once all subblocks of a memory block were unplugged, offline and
	 * remove it. This will usually not fail, as no memory is in use
	 * anymore - however some other notifiers might NACK the request.
	 */
	if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
		/* Drop the mutex around offlining/removal, then retake it. */
		mutex_unlock(&vm->hotplug_mutex);
		rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id);
		mutex_lock(&vm->hotplug_mutex);
		/*
		 * A failure here is not reported to the caller: the state is
		 * left unchanged and 0 is returned.
		 */
		if (!rc)
			virtio_mem_sbm_set_mb_state(vm, mb_id,
						    VIRTIO_MEM_SBM_MB_UNUSED);
	}

	return 0;
}
1955 
1956 /*
1957  * Unplug the desired number of plugged subblocks of a memory block that is
1958  * already added to Linux. Will skip subblock of online memory blocks that are
1959  * busy (by the OS). Will fail if any subblock that's not busy cannot get
1960  * unplugged.
1961  *
1962  * Will modify the state of the memory block. Might temporarily drop the
1963  * hotplug_mutex.
1964  *
1965  * Note: Can fail after some subblocks were successfully unplugged. Can
1966  *       return 0 even if subblocks were busy and could not get unplugged.
1967  */
1968 static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
1969 					unsigned long mb_id,
1970 					uint64_t *nb_sb)
1971 {
1972 	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
1973 
1974 	switch (old_state) {
1975 	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
1976 	case VIRTIO_MEM_SBM_MB_KERNEL:
1977 	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
1978 	case VIRTIO_MEM_SBM_MB_MOVABLE:
1979 		return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb);
1980 	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
1981 	case VIRTIO_MEM_SBM_MB_OFFLINE:
1982 		return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb);
1983 	}
1984 	return -EINVAL;
1985 }
1986 
/*
 * Try to unplug "diff" bytes in Sub Block Mode, rounded down to whole
 * subblocks.
 *
 * Returns 0 if everything requested was unplugged, or if only offline memory
 * blocks were considered because unplug_online is disabled. Returns -EBUSY if
 * subblocks remain after all states were processed, or the error of a
 * failing unplug attempt.
 */
static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
	/* Order in which memory block states are processed. */
	const int mb_states[] = {
		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
		VIRTIO_MEM_SBM_MB_OFFLINE,
		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
		VIRTIO_MEM_SBM_MB_MOVABLE,
		VIRTIO_MEM_SBM_MB_KERNEL,
	};
	uint64_t nb_sb = diff / vm->sbm.sb_size;
	unsigned long mb_id;
	int rc, i;

	if (!nb_sb)
		return 0;

	/*
	 * We'll drop the mutex a couple of times when it is safe to do so.
	 * This might result in some blocks switching the state (online/offline)
	 * and we could miss them in this run - we will retry again later.
	 */
	mutex_lock(&vm->hotplug_mutex);

	/*
	 * We try to unplug from partially plugged blocks first, to try
	 * removing whole memory blocks along with metadata. We prioritize
	 * ZONE_MOVABLE as it's more reliable to unplug memory and remove whole
	 * memory blocks, and we don't want to trigger zone imbalances by
	 * accidentally removing too much kernel memory.
	 */
	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
		virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) {
			rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb);
			if (rc || !nb_sb)
				goto out_unlock;
			/* Give others a chance to run and to take the mutex. */
			mutex_unlock(&vm->hotplug_mutex);
			cond_resched();
			mutex_lock(&vm->hotplug_mutex);
		}
		/*
		 * Index 1 is the last offline state in mb_states[]. If
		 * unplugging online memory is disabled, stop here and report
		 * success, even if nb_sb subblocks remain.
		 */
		if (!unplug_online && i == 1) {
			mutex_unlock(&vm->hotplug_mutex);
			return 0;
		}
	}

	mutex_unlock(&vm->hotplug_mutex);
	return nb_sb ? -EBUSY : 0;
out_unlock:
	mutex_unlock(&vm->hotplug_mutex);
	return rc;
}
2039 
2040 /*
2041  * Try to offline and remove a big block from Linux and unplug it. Will fail
2042  * with -EBUSY if some memory is busy and cannot get unplugged.
2043  *
2044  * Will modify the state of the memory block. Might temporarily drop the
2045  * hotplug_mutex.
2046  */
2047 static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
2048 						       unsigned long bb_id)
2049 {
2050 	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2051 	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2052 	unsigned long end_pfn = start_pfn + nr_pages;
2053 	unsigned long pfn;
2054 	struct page *page;
2055 	int rc;
2056 
2057 	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
2058 			 VIRTIO_MEM_BBM_BB_ADDED))
2059 		return -EINVAL;
2060 
2061 	if (bbm_safe_unplug) {
2062 		/*
2063 		 * Start by fake-offlining all memory. Once we marked the device
2064 		 * block as fake-offline, all newly onlined memory will
2065 		 * automatically be kept fake-offline. Protect from concurrent
2066 		 * onlining/offlining until we have a consistent state.
2067 		 */
2068 		mutex_lock(&vm->hotplug_mutex);
2069 		virtio_mem_bbm_set_bb_state(vm, bb_id,
2070 					    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);
2071 
2072 		for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
2073 			page = pfn_to_online_page(pfn);
2074 			if (!page)
2075 				continue;
2076 
2077 			rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION);
2078 			if (rc) {
2079 				end_pfn = pfn;
2080 				goto rollback_safe_unplug;
2081 			}
2082 		}
2083 		mutex_unlock(&vm->hotplug_mutex);
2084 	}
2085 
2086 	rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
2087 	if (rc) {
2088 		if (bbm_safe_unplug) {
2089 			mutex_lock(&vm->hotplug_mutex);
2090 			goto rollback_safe_unplug;
2091 		}
2092 		return rc;
2093 	}
2094 
2095 	rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
2096 	if (rc)
2097 		virtio_mem_bbm_set_bb_state(vm, bb_id,
2098 					    VIRTIO_MEM_BBM_BB_PLUGGED);
2099 	else
2100 		virtio_mem_bbm_set_bb_state(vm, bb_id,
2101 					    VIRTIO_MEM_BBM_BB_UNUSED);
2102 	return rc;
2103 
2104 rollback_safe_unplug:
2105 	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
2106 		page = pfn_to_online_page(pfn);
2107 		if (!page)
2108 			continue;
2109 		virtio_mem_fake_online(pfn, PAGES_PER_SECTION);
2110 	}
2111 	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
2112 	mutex_unlock(&vm->hotplug_mutex);
2113 	return rc;
2114 }
2115 
2116 /*
2117  * Test if a big block is completely offline.
2118  */
2119 static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm,
2120 					 unsigned long bb_id)
2121 {
2122 	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2123 	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2124 	unsigned long pfn;
2125 
2126 	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
2127 	     pfn += PAGES_PER_SECTION) {
2128 		if (pfn_to_online_page(pfn))
2129 			return false;
2130 	}
2131 
2132 	return true;
2133 }
2134 
2135 /*
2136  * Test if a big block is completely onlined to ZONE_MOVABLE (or offline).
2137  */
2138 static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm,
2139 					 unsigned long bb_id)
2140 {
2141 	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2142 	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2143 	struct page *page;
2144 	unsigned long pfn;
2145 
2146 	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
2147 	     pfn += PAGES_PER_SECTION) {
2148 		page = pfn_to_online_page(pfn);
2149 		if (!page)
2150 			continue;
2151 		if (page_zonenum(page) != ZONE_MOVABLE)
2152 			return false;
2153 	}
2154 
2155 	return true;
2156 }
2157 
2158 static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
2159 {
2160 	uint64_t nb_bb = diff / vm->bbm.bb_size;
2161 	uint64_t bb_id;
2162 	int rc, i;
2163 
2164 	if (!nb_bb)
2165 		return 0;
2166 
2167 	/*
2168 	 * Try to unplug big blocks. Similar to SBM, start with offline
2169 	 * big blocks.
2170 	 */
2171 	for (i = 0; i < 3; i++) {
2172 		virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
2173 			cond_resched();
2174 
2175 			/*
2176 			 * As we're holding no locks, these checks are racy,
2177 			 * but we don't care.
2178 			 */
2179 			if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id))
2180 				continue;
2181 			if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id))
2182 				continue;
2183 			rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id);
2184 			if (rc == -EBUSY)
2185 				continue;
2186 			if (!rc)
2187 				nb_bb--;
2188 			if (rc || !nb_bb)
2189 				return rc;
2190 		}
2191 		if (i == 0 && !unplug_online)
2192 			return 0;
2193 	}
2194 
2195 	return nb_bb ? -EBUSY : 0;
2196 }
2197 
2198 /*
2199  * Try to unplug the requested amount of memory.
2200  */
2201 static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
2202 {
2203 	if (vm->in_sbm)
2204 		return virtio_mem_sbm_unplug_request(vm, diff);
2205 	return virtio_mem_bbm_unplug_request(vm, diff);
2206 }
2207 
2208 /*
2209  * Try to unplug all blocks that couldn't be unplugged before, for example,
2210  * because the hypervisor was busy.
2211  */
2212 static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm)
2213 {
2214 	unsigned long id;
2215 	int rc;
2216 
2217 	if (!vm->in_sbm) {
2218 		virtio_mem_bbm_for_each_bb(vm, id,
2219 					   VIRTIO_MEM_BBM_BB_PLUGGED) {
2220 			rc = virtio_mem_bbm_unplug_bb(vm, id);
2221 			if (rc)
2222 				return rc;
2223 			virtio_mem_bbm_set_bb_state(vm, id,
2224 						    VIRTIO_MEM_BBM_BB_UNUSED);
2225 		}
2226 		return 0;
2227 	}
2228 
2229 	virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) {
2230 		rc = virtio_mem_sbm_unplug_mb(vm, id);
2231 		if (rc)
2232 			return rc;
2233 		virtio_mem_sbm_set_mb_state(vm, id,
2234 					    VIRTIO_MEM_SBM_MB_UNUSED);
2235 	}
2236 
2237 	return 0;
2238 }
2239 
2240 /*
2241  * Update all parts of the config that could have changed.
2242  */
2243 static void virtio_mem_refresh_config(struct virtio_mem *vm)
2244 {
2245 	const struct range pluggable_range = mhp_get_pluggable_range(true);
2246 	uint64_t new_plugged_size, usable_region_size, end_addr;
2247 
2248 	/* the plugged_size is just a reflection of what _we_ did previously */
2249 	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
2250 			&new_plugged_size);
2251 	if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size))
2252 		vm->plugged_size = new_plugged_size;
2253 
2254 	/* calculate the last usable memory block id */
2255 	virtio_cread_le(vm->vdev, struct virtio_mem_config,
2256 			usable_region_size, &usable_region_size);
2257 	end_addr = min(vm->addr + usable_region_size - 1,
2258 		       pluggable_range.end);
2259 
2260 	if (vm->in_sbm) {
2261 		vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr);
2262 		if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes()))
2263 			vm->sbm.last_usable_mb_id--;
2264 	} else {
2265 		vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm,
2266 								     end_addr);
2267 		if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size))
2268 			vm->bbm.last_usable_bb_id--;
2269 	}
2270 	/*
2271 	 * If we cannot plug any of our device memory (e.g., nothing in the
2272 	 * usable region is addressable), the last usable memory block id will
2273 	 * be smaller than the first usable memory block id. We'll stop
2274 	 * attempting to add memory with -ENOSPC from our main loop.
2275 	 */
2276 
2277 	/* see if there is a request to change the size */
2278 	virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size,
2279 			&vm->requested_size);
2280 
2281 	dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size);
2282 	dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size);
2283 }
2284 
2285 /*
2286  * Workqueue function for handling plug/unplug requests and config updates.
2287  */
2288 static void virtio_mem_run_wq(struct work_struct *work)
2289 {
2290 	struct virtio_mem *vm = container_of(work, struct virtio_mem, wq);
2291 	uint64_t diff;
2292 	int rc;
2293 
2294 	hrtimer_cancel(&vm->retry_timer);
2295 
2296 	if (vm->broken)
2297 		return;
2298 
2299 	atomic_set(&vm->wq_active, 1);
2300 retry:
2301 	rc = 0;
2302 
2303 	/* Make sure we start with a clean state if there are leftovers. */
2304 	if (unlikely(vm->unplug_all_required))
2305 		rc = virtio_mem_send_unplug_all_request(vm);
2306 
2307 	if (atomic_read(&vm->config_changed)) {
2308 		atomic_set(&vm->config_changed, 0);
2309 		virtio_mem_refresh_config(vm);
2310 	}
2311 
2312 	/* Unplug any leftovers from previous runs */
2313 	if (!rc)
2314 		rc = virtio_mem_unplug_pending_mb(vm);
2315 
2316 	if (!rc && vm->requested_size != vm->plugged_size) {
2317 		if (vm->requested_size > vm->plugged_size) {
2318 			diff = vm->requested_size - vm->plugged_size;
2319 			rc = virtio_mem_plug_request(vm, diff);
2320 		} else {
2321 			diff = vm->plugged_size - vm->requested_size;
2322 			rc = virtio_mem_unplug_request(vm, diff);
2323 		}
2324 	}
2325 
2326 	switch (rc) {
2327 	case 0:
2328 		vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
2329 		break;
2330 	case -ENOSPC:
2331 		/*
2332 		 * We cannot add any more memory (alignment, physical limit)
2333 		 * or we have too many offline memory blocks.
2334 		 */
2335 		break;
2336 	case -ETXTBSY:
2337 		/*
2338 		 * The hypervisor cannot process our request right now
2339 		 * (e.g., out of memory, migrating);
2340 		 */
2341 	case -EBUSY:
2342 		/*
2343 		 * We cannot free up any memory to unplug it (all plugged memory
2344 		 * is busy).
2345 		 */
2346 	case -ENOMEM:
2347 		/* Out of memory, try again later. */
2348 		hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms),
2349 			      HRTIMER_MODE_REL);
2350 		break;
2351 	case -EAGAIN:
2352 		/* Retry immediately (e.g., the config changed). */
2353 		goto retry;
2354 	default:
2355 		/* Unknown error, mark as broken */
2356 		dev_err(&vm->vdev->dev,
2357 			"unknown error, marking device broken: %d\n", rc);
2358 		vm->broken = true;
2359 	}
2360 
2361 	atomic_set(&vm->wq_active, 0);
2362 }
2363 
2364 static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
2365 {
2366 	struct virtio_mem *vm = container_of(timer, struct virtio_mem,
2367 					     retry_timer);
2368 
2369 	virtio_mem_retry(vm);
2370 	vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2,
2371 				   VIRTIO_MEM_RETRY_TIMER_MAX_MS);
2372 	return HRTIMER_NORESTART;
2373 }
2374 
2375 static void virtio_mem_handle_response(struct virtqueue *vq)
2376 {
2377 	struct virtio_mem *vm = vq->vdev->priv;
2378 
2379 	wake_up(&vm->host_resp);
2380 }
2381 
2382 static int virtio_mem_init_vq(struct virtio_mem *vm)
2383 {
2384 	struct virtqueue *vq;
2385 
2386 	vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response,
2387 				   "guest-request");
2388 	if (IS_ERR(vq))
2389 		return PTR_ERR(vq);
2390 	vm->vq = vq;
2391 
2392 	return 0;
2393 }
2394 
2395 static int virtio_mem_init(struct virtio_mem *vm)
2396 {
2397 	const struct range pluggable_range = mhp_get_pluggable_range(true);
2398 	uint64_t sb_size, addr;
2399 	uint16_t node_id;
2400 
2401 	if (!vm->vdev->config->get) {
2402 		dev_err(&vm->vdev->dev, "config access disabled\n");
2403 		return -EINVAL;
2404 	}
2405 
2406 	/*
2407 	 * We don't want to (un)plug or reuse any memory when in kdump. The
2408 	 * memory is still accessible (but not mapped).
2409 	 */
2410 	if (is_kdump_kernel()) {
2411 		dev_warn(&vm->vdev->dev, "disabled in kdump kernel\n");
2412 		return -EBUSY;
2413 	}
2414 
2415 	/* Fetch all properties that can't change. */
2416 	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
2417 			&vm->plugged_size);
2418 	virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
2419 			&vm->device_block_size);
2420 	virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
2421 			&node_id);
2422 	vm->nid = virtio_mem_translate_node_id(vm, node_id);
2423 	virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
2424 	virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
2425 			&vm->region_size);
2426 
2427 	/* Determine the nid for the device based on the lowest address. */
2428 	if (vm->nid == NUMA_NO_NODE)
2429 		vm->nid = memory_add_physaddr_to_nid(vm->addr);
2430 
2431 	/* bad device setup - warn only */
2432 	if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
2433 		dev_warn(&vm->vdev->dev,
2434 			 "The alignment of the physical start address can make some memory unusable.\n");
2435 	if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes()))
2436 		dev_warn(&vm->vdev->dev,
2437 			 "The alignment of the physical end address can make some memory unusable.\n");
2438 	if (vm->addr < pluggable_range.start ||
2439 	    vm->addr + vm->region_size - 1 > pluggable_range.end)
2440 		dev_warn(&vm->vdev->dev,
2441 			 "Some device memory is not addressable/pluggable. This can make some memory unusable.\n");
2442 
2443 	/* Prepare the offline threshold - make sure we can add two blocks. */
2444 	vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(),
2445 				      VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);
2446 
2447 	/*
2448 	 * We want subblocks to span at least MAX_ORDER_NR_PAGES and
2449 	 * pageblock_nr_pages pages. This:
2450 	 * - Simplifies our page onlining code (virtio_mem_online_page_cb)
2451 	 *   and fake page onlining code (virtio_mem_fake_online).
2452 	 * - Is required for now for alloc_contig_range() to work reliably -
2453 	 *   it doesn't properly handle smaller granularity on ZONE_NORMAL.
2454 	 */
2455 	sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES,
2456 			pageblock_nr_pages) * PAGE_SIZE;
2457 	sb_size = max_t(uint64_t, vm->device_block_size, sb_size);
2458 
2459 	if (sb_size < memory_block_size_bytes() && !force_bbm) {
2460 		/* SBM: At least two subblocks per Linux memory block. */
2461 		vm->in_sbm = true;
2462 		vm->sbm.sb_size = sb_size;
2463 		vm->sbm.sbs_per_mb = memory_block_size_bytes() /
2464 				     vm->sbm.sb_size;
2465 
2466 		/* Round up to the next full memory block */
2467 		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
2468 		       memory_block_size_bytes() - 1;
2469 		vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr);
2470 		vm->sbm.next_mb_id = vm->sbm.first_mb_id;
2471 	} else {
2472 		/* BBM: At least one Linux memory block. */
2473 		vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size,
2474 					memory_block_size_bytes());
2475 
2476 		if (bbm_block_size) {
2477 			if (!is_power_of_2(bbm_block_size)) {
2478 				dev_warn(&vm->vdev->dev,
2479 					 "bbm_block_size is not a power of 2");
2480 			} else if (bbm_block_size < vm->bbm.bb_size) {
2481 				dev_warn(&vm->vdev->dev,
2482 					 "bbm_block_size is too small");
2483 			} else {
2484 				vm->bbm.bb_size = bbm_block_size;
2485 			}
2486 		}
2487 
2488 		/* Round up to the next aligned big block */
2489 		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
2490 		       vm->bbm.bb_size - 1;
2491 		vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr);
2492 		vm->bbm.next_bb_id = vm->bbm.first_bb_id;
2493 
2494 		/* Make sure we can add two big blocks. */
2495 		vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size,
2496 					      vm->offline_threshold);
2497 	}
2498 
2499 	dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
2500 	dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
2501 	dev_info(&vm->vdev->dev, "device block size: 0x%llx",
2502 		 (unsigned long long)vm->device_block_size);
2503 	dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
2504 		 memory_block_size_bytes());
2505 	if (vm->in_sbm)
2506 		dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
2507 			 (unsigned long long)vm->sbm.sb_size);
2508 	else
2509 		dev_info(&vm->vdev->dev, "big block size: 0x%llx",
2510 			 (unsigned long long)vm->bbm.bb_size);
2511 	if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
2512 		dev_info(&vm->vdev->dev, "nid: %d", vm->nid);
2513 
2514 	return 0;
2515 }
2516 
2517 static int virtio_mem_create_resource(struct virtio_mem *vm)
2518 {
2519 	/*
2520 	 * When force-unloading the driver and removing the device, we
2521 	 * could have a garbage pointer. Duplicate the string.
2522 	 */
2523 	const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL);
2524 
2525 	if (!name)
2526 		return -ENOMEM;
2527 
2528 	vm->parent_resource = __request_mem_region(vm->addr, vm->region_size,
2529 						   name, IORESOURCE_SYSTEM_RAM);
2530 	if (!vm->parent_resource) {
2531 		kfree(name);
2532 		dev_warn(&vm->vdev->dev, "could not reserve device region\n");
2533 		dev_info(&vm->vdev->dev,
2534 			 "reloading the driver is not supported\n");
2535 		return -EBUSY;
2536 	}
2537 
2538 	/* The memory is not actually busy - make add_memory() work. */
2539 	vm->parent_resource->flags &= ~IORESOURCE_BUSY;
2540 	return 0;
2541 }
2542 
2543 static void virtio_mem_delete_resource(struct virtio_mem *vm)
2544 {
2545 	const char *name;
2546 
2547 	if (!vm->parent_resource)
2548 		return;
2549 
2550 	name = vm->parent_resource->name;
2551 	release_resource(vm->parent_resource);
2552 	kfree(vm->parent_resource);
2553 	kfree(name);
2554 	vm->parent_resource = NULL;
2555 }
2556 
/*
 * Resource walk callback: unconditionally returns 1 for any resource it is
 * called with. Both arguments are unused.
 */
static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
{
	return 1;
}
2561 
2562 static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
2563 {
2564 	const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
2565 
2566 	return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr,
2567 				   vm->addr + vm->region_size, NULL,
2568 				   virtio_mem_range_has_system_ram) == 1;
2569 }
2570 
2571 static int virtio_mem_probe(struct virtio_device *vdev)
2572 {
2573 	struct virtio_mem *vm;
2574 	uint64_t unit_pages;
2575 	int rc;
2576 
2577 	BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
2578 	BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10);
2579 
2580 	vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL);
2581 	if (!vm)
2582 		return -ENOMEM;
2583 
2584 	init_waitqueue_head(&vm->host_resp);
2585 	vm->vdev = vdev;
2586 	INIT_WORK(&vm->wq, virtio_mem_run_wq);
2587 	mutex_init(&vm->hotplug_mutex);
2588 	INIT_LIST_HEAD(&vm->next);
2589 	spin_lock_init(&vm->removal_lock);
2590 	hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2591 	vm->retry_timer.function = virtio_mem_timer_expired;
2592 	vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
2593 
2594 	/* register the virtqueue */
2595 	rc = virtio_mem_init_vq(vm);
2596 	if (rc)
2597 		goto out_free_vm;
2598 
2599 	/* initialize the device by querying the config */
2600 	rc = virtio_mem_init(vm);
2601 	if (rc)
2602 		goto out_del_vq;
2603 
2604 	/* create the parent resource for all memory */
2605 	rc = virtio_mem_create_resource(vm);
2606 	if (rc)
2607 		goto out_del_vq;
2608 
2609 	/* use a single dynamic memory group to cover the whole memory device */
2610 	if (vm->in_sbm)
2611 		unit_pages = PHYS_PFN(memory_block_size_bytes());
2612 	else
2613 		unit_pages = PHYS_PFN(vm->bbm.bb_size);
2614 	rc = memory_group_register_dynamic(vm->nid, unit_pages);
2615 	if (rc < 0)
2616 		goto out_del_resource;
2617 	vm->mgid = rc;
2618 
2619 	/*
2620 	 * If we still have memory plugged, we have to unplug all memory first.
2621 	 * Registering our parent resource makes sure that this memory isn't
2622 	 * actually in use (e.g., trying to reload the driver).
2623 	 */
2624 	if (vm->plugged_size) {
2625 		vm->unplug_all_required = true;
2626 		dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
2627 	}
2628 
2629 	/* register callbacks */
2630 	vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
2631 	rc = register_memory_notifier(&vm->memory_notifier);
2632 	if (rc)
2633 		goto out_unreg_group;
2634 	rc = register_virtio_mem_device(vm);
2635 	if (rc)
2636 		goto out_unreg_mem;
2637 
2638 	virtio_device_ready(vdev);
2639 
2640 	/* trigger a config update to start processing the requested_size */
2641 	atomic_set(&vm->config_changed, 1);
2642 	queue_work(system_freezable_wq, &vm->wq);
2643 
2644 	return 0;
2645 out_unreg_mem:
2646 	unregister_memory_notifier(&vm->memory_notifier);
2647 out_unreg_group:
2648 	memory_group_unregister(vm->mgid);
2649 out_del_resource:
2650 	virtio_mem_delete_resource(vm);
2651 out_del_vq:
2652 	vdev->config->del_vqs(vdev);
2653 out_free_vm:
2654 	kfree(vm);
2655 	vdev->priv = NULL;
2656 
2657 	return rc;
2658 }
2659 
2660 static void virtio_mem_remove(struct virtio_device *vdev)
2661 {
2662 	struct virtio_mem *vm = vdev->priv;
2663 	unsigned long mb_id;
2664 	int rc;
2665 
2666 	/*
2667 	 * Make sure the workqueue won't be triggered anymore and no memory
2668 	 * blocks can be onlined/offlined until we're finished here.
2669 	 */
2670 	mutex_lock(&vm->hotplug_mutex);
2671 	spin_lock_irq(&vm->removal_lock);
2672 	vm->removing = true;
2673 	spin_unlock_irq(&vm->removal_lock);
2674 	mutex_unlock(&vm->hotplug_mutex);
2675 
2676 	/* wait until the workqueue stopped */
2677 	cancel_work_sync(&vm->wq);
2678 	hrtimer_cancel(&vm->retry_timer);
2679 
2680 	if (vm->in_sbm) {
2681 		/*
2682 		 * After we unregistered our callbacks, user space can online
2683 		 * partially plugged offline blocks. Make sure to remove them.
2684 		 */
2685 		virtio_mem_sbm_for_each_mb(vm, mb_id,
2686 					   VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
2687 			rc = virtio_mem_sbm_remove_mb(vm, mb_id);
2688 			BUG_ON(rc);
2689 			virtio_mem_sbm_set_mb_state(vm, mb_id,
2690 						    VIRTIO_MEM_SBM_MB_UNUSED);
2691 		}
2692 		/*
2693 		 * After we unregistered our callbacks, user space can no longer
2694 		 * offline partially plugged online memory blocks. No need to
2695 		 * worry about them.
2696 		 */
2697 	}
2698 
2699 	/* unregister callbacks */
2700 	unregister_virtio_mem_device(vm);
2701 	unregister_memory_notifier(&vm->memory_notifier);
2702 
2703 	/*
2704 	 * There is no way we could reliably remove all memory we have added to
2705 	 * the system. And there is no way to stop the driver/device from going
2706 	 * away. Warn at least.
2707 	 */
2708 	if (virtio_mem_has_memory_added(vm)) {
2709 		dev_warn(&vdev->dev, "device still has system memory added\n");
2710 	} else {
2711 		virtio_mem_delete_resource(vm);
2712 		kfree_const(vm->resource_name);
2713 		memory_group_unregister(vm->mgid);
2714 	}
2715 
2716 	/* remove all tracking data - no locking needed */
2717 	if (vm->in_sbm) {
2718 		vfree(vm->sbm.mb_states);
2719 		vfree(vm->sbm.sb_states);
2720 	} else {
2721 		vfree(vm->bbm.bb_states);
2722 	}
2723 
2724 	/* reset the device and cleanup the queues */
2725 	vdev->config->reset(vdev);
2726 	vdev->config->del_vqs(vdev);
2727 
2728 	kfree(vm);
2729 	vdev->priv = NULL;
2730 }
2731 
2732 static void virtio_mem_config_changed(struct virtio_device *vdev)
2733 {
2734 	struct virtio_mem *vm = vdev->priv;
2735 
2736 	atomic_set(&vm->config_changed, 1);
2737 	virtio_mem_retry(vm);
2738 }
2739 
#ifdef CONFIG_PM_SLEEP
/*
 * Freeze callback: always refuses with -EPERM.
 *
 * Suspend/hibernate is not allowed, because all memory is usually unplugged
 * when the VM is restarted.
 */
static int virtio_mem_freeze(struct virtio_device *vdev)
{
	dev_err(&vdev->dev, "save/restore not supported.\n");
	return -EPERM;
}

/*
 * Restore callback: always refuses with -EPERM, matching the freeze
 * callback.
 */
static int virtio_mem_restore(struct virtio_device *vdev)
{
	return -EPERM;
}
#endif /* CONFIG_PM_SLEEP */
2756 
/*
 * Feature bits offered by the driver. VIRTIO_MEM_F_ACPI_PXM is only offered
 * when both CONFIG_NUMA and CONFIG_ACPI_NUMA are enabled; otherwise the
 * table is empty.
 */
static unsigned int virtio_mem_features[] = {
#ifdef CONFIG_NUMA
#ifdef CONFIG_ACPI_NUMA
	VIRTIO_MEM_F_ACPI_PXM,
#endif /* CONFIG_ACPI_NUMA */
#endif /* CONFIG_NUMA */
};
2762 
2763 static const struct virtio_device_id virtio_mem_id_table[] = {
2764 	{ VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID },
2765 	{ 0 },
2766 };
2767 
2768 static struct virtio_driver virtio_mem_driver = {
2769 	.feature_table = virtio_mem_features,
2770 	.feature_table_size = ARRAY_SIZE(virtio_mem_features),
2771 	.driver.name = KBUILD_MODNAME,
2772 	.driver.owner = THIS_MODULE,
2773 	.id_table = virtio_mem_id_table,
2774 	.probe = virtio_mem_probe,
2775 	.remove = virtio_mem_remove,
2776 	.config_changed = virtio_mem_config_changed,
2777 #ifdef CONFIG_PM_SLEEP
2778 	.freeze	=	virtio_mem_freeze,
2779 	.restore =	virtio_mem_restore,
2780 #endif
2781 };
2782 
2783 module_virtio_driver(virtio_mem_driver);
2784 MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table);
2785 MODULE_AUTHOR("David Hildenbrand <david@redhat.com>");
2786 MODULE_DESCRIPTION("Virtio-mem driver");
2787 MODULE_LICENSE("GPL");
2788