xref: /openbmc/linux/drivers/virtio/virtio_mem.c (revision ada066b2)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Virtio-mem device driver.
4  *
5  * Copyright Red Hat, Inc. 2020
6  *
7  * Author(s): David Hildenbrand <david@redhat.com>
8  */
9 
10 #include <linux/virtio.h>
11 #include <linux/virtio_mem.h>
12 #include <linux/workqueue.h>
13 #include <linux/slab.h>
14 #include <linux/module.h>
15 #include <linux/mm.h>
16 #include <linux/memory_hotplug.h>
17 #include <linux/memory.h>
18 #include <linux/hrtimer.h>
19 #include <linux/crash_dump.h>
20 #include <linux/mutex.h>
21 #include <linux/bitmap.h>
22 #include <linux/lockdep.h>
23 
24 #include <acpi/acpi_numa.h>
25 
26 static bool unplug_online = true;
27 module_param(unplug_online, bool, 0644);
28 MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");
29 
30 static bool force_bbm;
31 module_param(force_bbm, bool, 0444);
32 MODULE_PARM_DESC(force_bbm,
33 		"Force Big Block Mode. Default is 0 (auto-selection)");
34 
35 static unsigned long bbm_block_size;
36 module_param(bbm_block_size, ulong, 0444);
37 MODULE_PARM_DESC(bbm_block_size,
38 		 "Big Block size in bytes. Default is 0 (auto-detection).");
39 
40 static bool bbm_safe_unplug = true;
41 module_param(bbm_safe_unplug, bool, 0444);
42 MODULE_PARM_DESC(bbm_safe_unplug,
43 	     "Use a safe unplug mechanism in BBM, avoiding long/endless loops");
44 
45 /*
46  * virtio-mem currently supports the following modes of operation:
47  *
48  * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
49  *   size of a Sub Block (SB) is determined based on the device block size, the
50  *   pageblock size, and the maximum allocation granularity of the buddy.
51  *   Subblocks within a Linux memory block might either be plugged or unplugged.
52  *   Memory is added/removed to Linux MM in Linux memory block granularity.
53  *
54  * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
55  *   Memory is added/removed to Linux MM in Big Block granularity.
56  *
57  * The mode is determined automatically based on the Linux memory block size
58  * and the device block size.
59  *
60  * User space / core MM (auto onlining) is responsible for onlining added
61  * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are
62  * always onlined separately, and all memory within a Linux memory block is
63  * onlined to the same zone - virtio-mem relies on this behavior.
64  */
65 
66 /*
67  * State of a Linux memory block in SBM.
68  */
69 enum virtio_mem_sbm_mb_state {
70 	/* Unplugged, not added to Linux. Can be reused later. */
71 	VIRTIO_MEM_SBM_MB_UNUSED = 0,
72 	/* (Partially) plugged, not added to Linux. Error on add_memory(). */
73 	VIRTIO_MEM_SBM_MB_PLUGGED,
74 	/* Fully plugged, fully added to Linux, offline. */
75 	VIRTIO_MEM_SBM_MB_OFFLINE,
76 	/* Partially plugged, fully added to Linux, offline. */
77 	VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
78 	/* Fully plugged, fully added to Linux, onlined to a kernel zone. */
79 	VIRTIO_MEM_SBM_MB_KERNEL,
80 	/* Partially plugged, fully added to Linux, online to a kernel zone */
81 	VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
82 	/* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
83 	VIRTIO_MEM_SBM_MB_MOVABLE,
84 	/* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
85 	VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
86 	VIRTIO_MEM_SBM_MB_COUNT
87 };
88 
89 /*
90  * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks.
91  */
92 enum virtio_mem_bbm_bb_state {
93 	/* Unplugged, not added to Linux. Can be reused later. */
94 	VIRTIO_MEM_BBM_BB_UNUSED = 0,
95 	/* Plugged, not added to Linux. Error on add_memory(). */
96 	VIRTIO_MEM_BBM_BB_PLUGGED,
97 	/* Plugged and added to Linux. */
98 	VIRTIO_MEM_BBM_BB_ADDED,
99 	/* All online parts are fake-offline, ready to remove. */
100 	VIRTIO_MEM_BBM_BB_FAKE_OFFLINE,
101 	VIRTIO_MEM_BBM_BB_COUNT
102 };
103 
104 struct virtio_mem {
105 	struct virtio_device *vdev;
106 
107 	/* We might first have to unplug all memory when starting up. */
108 	bool unplug_all_required;
109 
110 	/* Workqueue that processes the plug/unplug requests. */
111 	struct work_struct wq;
112 	atomic_t wq_active;
113 	atomic_t config_changed;
114 
115 	/* Virtqueue for guest->host requests. */
116 	struct virtqueue *vq;
117 
118 	/* Wait for a host response to a guest request. */
119 	wait_queue_head_t host_resp;
120 
121 	/* Space for one guest request and the host response. */
122 	struct virtio_mem_req req;
123 	struct virtio_mem_resp resp;
124 
125 	/* The current size of the device. */
126 	uint64_t plugged_size;
127 	/* The requested size of the device. */
128 	uint64_t requested_size;
129 
130 	/* The device block size (for communicating with the device). */
131 	uint64_t device_block_size;
132 	/* The determined node id for all memory of the device. */
133 	int nid;
134 	/* Physical start address of the memory region. */
135 	uint64_t addr;
136 	/* Maximum region size in bytes. */
137 	uint64_t region_size;
138 
139 	/* The parent resource for all memory added via this device. */
140 	struct resource *parent_resource;
141 	/*
142 	 * Copy of "System RAM (virtio_mem)" to be used for
143 	 * add_memory_driver_managed().
144 	 */
145 	const char *resource_name;
146 	/* Memory group identification. */
147 	int mgid;
148 
149 	/*
150 	 * We don't want to add too much memory if it's not getting onlined,
151 	 * to avoid running OOM. Besides this threshold, we allow to have at
152 	 * least two offline blocks at a time (whatever is bigger).
153 	 */
154 #define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD		(1024 * 1024 * 1024)
155 	atomic64_t offline_size;
156 	uint64_t offline_threshold;
157 
158 	/* If set, the driver is in SBM, otherwise in BBM. */
159 	bool in_sbm;
160 
161 	union {
162 		struct {
163 			/* Id of the first memory block of this device. */
164 			unsigned long first_mb_id;
165 			/* Id of the last usable memory block of this device. */
166 			unsigned long last_usable_mb_id;
167 			/* Id of the next memory bock to prepare when needed. */
168 			unsigned long next_mb_id;
169 
170 			/* The subblock size. */
171 			uint64_t sb_size;
172 			/* The number of subblocks per Linux memory block. */
173 			uint32_t sbs_per_mb;
174 
175 			/* Summary of all memory block states. */
176 			unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];
177 
178 			/*
179 			 * One byte state per memory block. Allocated via
180 			 * vmalloc(). Resized (alloc+copy+free) on demand.
181 			 *
182 			 * With 128 MiB memory blocks, we have states for 512
183 			 * GiB of memory in one 4 KiB page.
184 			 */
185 			uint8_t *mb_states;
186 
187 			/*
188 			 * Bitmap: one bit per subblock. Allocated similar to
189 			 * sbm.mb_states.
190 			 *
191 			 * A set bit means the corresponding subblock is
192 			 * plugged, otherwise it's unblocked.
193 			 *
194 			 * With 4 MiB subblocks, we manage 128 GiB of memory
195 			 * in one 4 KiB page.
196 			 */
197 			unsigned long *sb_states;
198 		} sbm;
199 
200 		struct {
201 			/* Id of the first big block of this device. */
202 			unsigned long first_bb_id;
203 			/* Id of the last usable big block of this device. */
204 			unsigned long last_usable_bb_id;
205 			/* Id of the next device bock to prepare when needed. */
206 			unsigned long next_bb_id;
207 
208 			/* Summary of all big block states. */
209 			unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];
210 
211 			/* One byte state per big block. See sbm.mb_states. */
212 			uint8_t *bb_states;
213 
214 			/* The block size used for plugging/adding/removing. */
215 			uint64_t bb_size;
216 		} bbm;
217 	};
218 
219 	/*
220 	 * Mutex that protects the sbm.mb_count, sbm.mb_states,
221 	 * sbm.sb_states, bbm.bb_count, and bbm.bb_states
222 	 *
223 	 * When this lock is held the pointers can't change, ONLINE and
224 	 * OFFLINE blocks can't change the state and no subblocks will get
225 	 * plugged/unplugged.
226 	 *
227 	 * In kdump mode, used to serialize requests, last_block_addr and
228 	 * last_block_plugged.
229 	 */
230 	struct mutex hotplug_mutex;
231 	bool hotplug_active;
232 
233 	/* An error occurred we cannot handle - stop processing requests. */
234 	bool broken;
235 
236 	/* Cached valued of is_kdump_kernel() when the device was probed. */
237 	bool in_kdump;
238 
239 	/* The driver is being removed. */
240 	spinlock_t removal_lock;
241 	bool removing;
242 
243 	/* Timer for retrying to plug/unplug memory. */
244 	struct hrtimer retry_timer;
245 	unsigned int retry_timer_ms;
246 #define VIRTIO_MEM_RETRY_TIMER_MIN_MS		50000
247 #define VIRTIO_MEM_RETRY_TIMER_MAX_MS		300000
248 
249 	/* Memory notifier (online/offline events). */
250 	struct notifier_block memory_notifier;
251 
252 #ifdef CONFIG_PROC_VMCORE
253 	/* vmcore callback for /proc/vmcore handling in kdump mode */
254 	struct vmcore_cb vmcore_cb;
255 	uint64_t last_block_addr;
256 	bool last_block_plugged;
257 #endif /* CONFIG_PROC_VMCORE */
258 
259 	/* Next device in the list of virtio-mem devices. */
260 	struct list_head next;
261 };
262 
263 /*
264  * We have to share a single online_page callback among all virtio-mem
265  * devices. We use RCU to iterate the list in the callback.
266  */
267 static DEFINE_MUTEX(virtio_mem_mutex);
268 static LIST_HEAD(virtio_mem_devices);
269 
270 static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
271 static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
272 						  unsigned long nr_pages);
273 static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
274 						   unsigned long nr_pages);
275 static void virtio_mem_retry(struct virtio_mem *vm);
276 static int virtio_mem_create_resource(struct virtio_mem *vm);
277 static void virtio_mem_delete_resource(struct virtio_mem *vm);
278 
279 /*
280  * Register a virtio-mem device so it will be considered for the online_page
281  * callback.
282  */
283 static int register_virtio_mem_device(struct virtio_mem *vm)
284 {
285 	int rc = 0;
286 
287 	/* First device registers the callback. */
288 	mutex_lock(&virtio_mem_mutex);
289 	if (list_empty(&virtio_mem_devices))
290 		rc = set_online_page_callback(&virtio_mem_online_page_cb);
291 	if (!rc)
292 		list_add_rcu(&vm->next, &virtio_mem_devices);
293 	mutex_unlock(&virtio_mem_mutex);
294 
295 	return rc;
296 }
297 
298 /*
299  * Unregister a virtio-mem device so it will no longer be considered for the
300  * online_page callback.
301  */
302 static void unregister_virtio_mem_device(struct virtio_mem *vm)
303 {
304 	/* Last device unregisters the callback. */
305 	mutex_lock(&virtio_mem_mutex);
306 	list_del_rcu(&vm->next);
307 	if (list_empty(&virtio_mem_devices))
308 		restore_online_page_callback(&virtio_mem_online_page_cb);
309 	mutex_unlock(&virtio_mem_mutex);
310 
311 	synchronize_rcu();
312 }
313 
314 /*
315  * Calculate the memory block id of a given address.
316  */
317 static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr)
318 {
319 	return addr / memory_block_size_bytes();
320 }
321 
322 /*
323  * Calculate the physical start address of a given memory block id.
324  */
325 static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
326 {
327 	return mb_id * memory_block_size_bytes();
328 }
329 
330 /*
331  * Calculate the big block id of a given address.
332  */
333 static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm,
334 					      uint64_t addr)
335 {
336 	return addr / vm->bbm.bb_size;
337 }
338 
339 /*
340  * Calculate the physical start address of a given big block id.
341  */
342 static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm,
343 					 unsigned long bb_id)
344 {
345 	return bb_id * vm->bbm.bb_size;
346 }
347 
348 /*
349  * Calculate the subblock id of a given address.
350  */
351 static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
352 					      unsigned long addr)
353 {
354 	const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
355 	const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id);
356 
357 	return (addr - mb_addr) / vm->sbm.sb_size;
358 }
359 
360 /*
361  * Set the state of a big block, taking care of the state counter.
362  */
363 static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm,
364 					unsigned long bb_id,
365 					enum virtio_mem_bbm_bb_state state)
366 {
367 	const unsigned long idx = bb_id - vm->bbm.first_bb_id;
368 	enum virtio_mem_bbm_bb_state old_state;
369 
370 	old_state = vm->bbm.bb_states[idx];
371 	vm->bbm.bb_states[idx] = state;
372 
373 	BUG_ON(vm->bbm.bb_count[old_state] == 0);
374 	vm->bbm.bb_count[old_state]--;
375 	vm->bbm.bb_count[state]++;
376 }
377 
378 /*
379  * Get the state of a big block.
380  */
381 static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm,
382 								unsigned long bb_id)
383 {
384 	return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id];
385 }
386 
387 /*
388  * Prepare the big block state array for the next big block.
389  */
390 static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm)
391 {
392 	unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id;
393 	unsigned long new_bytes = old_bytes + 1;
394 	int old_pages = PFN_UP(old_bytes);
395 	int new_pages = PFN_UP(new_bytes);
396 	uint8_t *new_array;
397 
398 	if (vm->bbm.bb_states && old_pages == new_pages)
399 		return 0;
400 
401 	new_array = vzalloc(new_pages * PAGE_SIZE);
402 	if (!new_array)
403 		return -ENOMEM;
404 
405 	mutex_lock(&vm->hotplug_mutex);
406 	if (vm->bbm.bb_states)
407 		memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE);
408 	vfree(vm->bbm.bb_states);
409 	vm->bbm.bb_states = new_array;
410 	mutex_unlock(&vm->hotplug_mutex);
411 
412 	return 0;
413 }
414 
415 #define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \
416 	for (_bb_id = vm->bbm.first_bb_id; \
417 	     _bb_id < vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \
418 	     _bb_id++) \
419 		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
420 
421 #define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \
422 	for (_bb_id = vm->bbm.next_bb_id - 1; \
423 	     _bb_id >= vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \
424 	     _bb_id--) \
425 		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
426 
427 /*
428  * Set the state of a memory block, taking care of the state counter.
429  */
430 static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm,
431 					unsigned long mb_id, uint8_t state)
432 {
433 	const unsigned long idx = mb_id - vm->sbm.first_mb_id;
434 	uint8_t old_state;
435 
436 	old_state = vm->sbm.mb_states[idx];
437 	vm->sbm.mb_states[idx] = state;
438 
439 	BUG_ON(vm->sbm.mb_count[old_state] == 0);
440 	vm->sbm.mb_count[old_state]--;
441 	vm->sbm.mb_count[state]++;
442 }
443 
444 /*
445  * Get the state of a memory block.
446  */
447 static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm,
448 					   unsigned long mb_id)
449 {
450 	const unsigned long idx = mb_id - vm->sbm.first_mb_id;
451 
452 	return vm->sbm.mb_states[idx];
453 }
454 
455 /*
456  * Prepare the state array for the next memory block.
457  */
458 static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
459 {
460 	int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id);
461 	int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1);
462 	uint8_t *new_array;
463 
464 	if (vm->sbm.mb_states && old_pages == new_pages)
465 		return 0;
466 
467 	new_array = vzalloc(new_pages * PAGE_SIZE);
468 	if (!new_array)
469 		return -ENOMEM;
470 
471 	mutex_lock(&vm->hotplug_mutex);
472 	if (vm->sbm.mb_states)
473 		memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE);
474 	vfree(vm->sbm.mb_states);
475 	vm->sbm.mb_states = new_array;
476 	mutex_unlock(&vm->hotplug_mutex);
477 
478 	return 0;
479 }
480 
481 #define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \
482 	for (_mb_id = _vm->sbm.first_mb_id; \
483 	     _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \
484 	     _mb_id++) \
485 		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
486 
487 #define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \
488 	for (_mb_id = _vm->sbm.next_mb_id - 1; \
489 	     _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \
490 	     _mb_id--) \
491 		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
492 
493 /*
494  * Calculate the bit number in the subblock bitmap for the given subblock
495  * inside the given memory block.
496  */
497 static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm,
498 					  unsigned long mb_id, int sb_id)
499 {
500 	return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id;
501 }
502 
503 /*
504  * Mark all selected subblocks plugged.
505  *
506  * Will not modify the state of the memory block.
507  */
508 static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm,
509 					  unsigned long mb_id, int sb_id,
510 					  int count)
511 {
512 	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
513 
514 	__bitmap_set(vm->sbm.sb_states, bit, count);
515 }
516 
517 /*
518  * Mark all selected subblocks unplugged.
519  *
520  * Will not modify the state of the memory block.
521  */
522 static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm,
523 					    unsigned long mb_id, int sb_id,
524 					    int count)
525 {
526 	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
527 
528 	__bitmap_clear(vm->sbm.sb_states, bit, count);
529 }
530 
531 /*
532  * Test if all selected subblocks are plugged.
533  */
534 static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm,
535 					   unsigned long mb_id, int sb_id,
536 					   int count)
537 {
538 	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
539 
540 	if (count == 1)
541 		return test_bit(bit, vm->sbm.sb_states);
542 
543 	/* TODO: Helper similar to bitmap_set() */
544 	return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >=
545 	       bit + count;
546 }
547 
548 /*
549  * Test if all selected subblocks are unplugged.
550  */
551 static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm,
552 					     unsigned long mb_id, int sb_id,
553 					     int count)
554 {
555 	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
556 
557 	/* TODO: Helper similar to bitmap_set() */
558 	return find_next_bit(vm->sbm.sb_states, bit + count, bit) >=
559 	       bit + count;
560 }
561 
562 /*
563  * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is
564  * none.
565  */
566 static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm,
567 					    unsigned long mb_id)
568 {
569 	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0);
570 
571 	return find_next_zero_bit(vm->sbm.sb_states,
572 				  bit + vm->sbm.sbs_per_mb, bit) - bit;
573 }
574 
575 /*
576  * Prepare the subblock bitmap for the next memory block.
577  */
578 static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm)
579 {
580 	const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id;
581 	const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb;
582 	const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb;
583 	int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long));
584 	int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long));
585 	unsigned long *new_bitmap, *old_bitmap;
586 
587 	if (vm->sbm.sb_states && old_pages == new_pages)
588 		return 0;
589 
590 	new_bitmap = vzalloc(new_pages * PAGE_SIZE);
591 	if (!new_bitmap)
592 		return -ENOMEM;
593 
594 	mutex_lock(&vm->hotplug_mutex);
595 	if (new_bitmap)
596 		memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE);
597 
598 	old_bitmap = vm->sbm.sb_states;
599 	vm->sbm.sb_states = new_bitmap;
600 	mutex_unlock(&vm->hotplug_mutex);
601 
602 	vfree(old_bitmap);
603 	return 0;
604 }
605 
606 /*
607  * Test if we could add memory without creating too much offline memory -
608  * to avoid running OOM if memory is getting onlined deferred.
609  */
610 static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size)
611 {
612 	if (WARN_ON_ONCE(size > vm->offline_threshold))
613 		return false;
614 
615 	return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold;
616 }
617 
618 /*
619  * Try adding memory to Linux. Will usually only fail if out of memory.
620  *
621  * Must not be called with the vm->hotplug_mutex held (possible deadlock with
622  * onlining code).
623  *
624  * Will not modify the state of memory blocks in virtio-mem.
625  */
626 static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
627 				 uint64_t size)
628 {
629 	int rc;
630 
631 	/*
632 	 * When force-unloading the driver and we still have memory added to
633 	 * Linux, the resource name has to stay.
634 	 */
635 	if (!vm->resource_name) {
636 		vm->resource_name = kstrdup_const("System RAM (virtio_mem)",
637 						  GFP_KERNEL);
638 		if (!vm->resource_name)
639 			return -ENOMEM;
640 	}
641 
642 	dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr,
643 		addr + size - 1);
644 	/* Memory might get onlined immediately. */
645 	atomic64_add(size, &vm->offline_size);
646 	rc = add_memory_driver_managed(vm->mgid, addr, size, vm->resource_name,
647 				       MHP_MERGE_RESOURCE | MHP_NID_IS_MGID);
648 	if (rc) {
649 		atomic64_sub(size, &vm->offline_size);
650 		dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
651 		/*
652 		 * TODO: Linux MM does not properly clean up yet in all cases
653 		 * where adding of memory failed - especially on -ENOMEM.
654 		 */
655 	}
656 	return rc;
657 }
658 
659 /*
660  * See virtio_mem_add_memory(): Try adding a single Linux memory block.
661  */
662 static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id)
663 {
664 	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
665 	const uint64_t size = memory_block_size_bytes();
666 
667 	return virtio_mem_add_memory(vm, addr, size);
668 }
669 
670 /*
671  * See virtio_mem_add_memory(): Try adding a big block.
672  */
673 static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id)
674 {
675 	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
676 	const uint64_t size = vm->bbm.bb_size;
677 
678 	return virtio_mem_add_memory(vm, addr, size);
679 }
680 
681 /*
682  * Try removing memory from Linux. Will only fail if memory blocks aren't
683  * offline.
684  *
685  * Must not be called with the vm->hotplug_mutex held (possible deadlock with
686  * onlining code).
687  *
688  * Will not modify the state of memory blocks in virtio-mem.
689  */
690 static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
691 				    uint64_t size)
692 {
693 	int rc;
694 
695 	dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
696 		addr + size - 1);
697 	rc = remove_memory(addr, size);
698 	if (!rc) {
699 		atomic64_sub(size, &vm->offline_size);
700 		/*
701 		 * We might have freed up memory we can now unplug, retry
702 		 * immediately instead of waiting.
703 		 */
704 		virtio_mem_retry(vm);
705 	} else {
706 		dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc);
707 	}
708 	return rc;
709 }
710 
711 /*
712  * See virtio_mem_remove_memory(): Try removing a single Linux memory block.
713  */
714 static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id)
715 {
716 	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
717 	const uint64_t size = memory_block_size_bytes();
718 
719 	return virtio_mem_remove_memory(vm, addr, size);
720 }
721 
722 /*
723  * Try offlining and removing memory from Linux.
724  *
725  * Must not be called with the vm->hotplug_mutex held (possible deadlock with
726  * onlining code).
727  *
728  * Will not modify the state of memory blocks in virtio-mem.
729  */
730 static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
731 						uint64_t addr,
732 						uint64_t size)
733 {
734 	int rc;
735 
736 	dev_dbg(&vm->vdev->dev,
737 		"offlining and removing memory: 0x%llx - 0x%llx\n", addr,
738 		addr + size - 1);
739 
740 	rc = offline_and_remove_memory(addr, size);
741 	if (!rc) {
742 		atomic64_sub(size, &vm->offline_size);
743 		/*
744 		 * We might have freed up memory we can now unplug, retry
745 		 * immediately instead of waiting.
746 		 */
747 		virtio_mem_retry(vm);
748 	} else {
749 		dev_dbg(&vm->vdev->dev,
750 			"offlining and removing memory failed: %d\n", rc);
751 	}
752 	return rc;
753 }
754 
755 /*
756  * See virtio_mem_offline_and_remove_memory(): Try offlining and removing
757  * a single Linux memory block.
758  */
759 static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm,
760 						unsigned long mb_id)
761 {
762 	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
763 	const uint64_t size = memory_block_size_bytes();
764 
765 	return virtio_mem_offline_and_remove_memory(vm, addr, size);
766 }
767 
768 /*
769  * See virtio_mem_offline_and_remove_memory(): Try to offline and remove a
770  * all Linux memory blocks covered by the big block.
771  */
772 static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm,
773 						unsigned long bb_id)
774 {
775 	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
776 	const uint64_t size = vm->bbm.bb_size;
777 
778 	return virtio_mem_offline_and_remove_memory(vm, addr, size);
779 }
780 
781 /*
782  * Trigger the workqueue so the device can perform its magic.
783  */
784 static void virtio_mem_retry(struct virtio_mem *vm)
785 {
786 	unsigned long flags;
787 
788 	spin_lock_irqsave(&vm->removal_lock, flags);
789 	if (!vm->removing)
790 		queue_work(system_freezable_wq, &vm->wq);
791 	spin_unlock_irqrestore(&vm->removal_lock, flags);
792 }
793 
794 static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id)
795 {
796 	int node = NUMA_NO_NODE;
797 
798 #if defined(CONFIG_ACPI_NUMA)
799 	if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM))
800 		node = pxm_to_node(node_id);
801 #endif
802 	return node;
803 }
804 
805 /*
806  * Test if a virtio-mem device overlaps with the given range. Can be called
807  * from (notifier) callbacks lockless.
808  */
809 static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start,
810 				      uint64_t size)
811 {
812 	return start < vm->addr + vm->region_size && vm->addr < start + size;
813 }
814 
815 /*
816  * Test if a virtio-mem device contains a given range. Can be called from
817  * (notifier) callbacks lockless.
818  */
819 static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start,
820 				      uint64_t size)
821 {
822 	return start >= vm->addr && start + size <= vm->addr + vm->region_size;
823 }
824 
825 static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm,
826 					      unsigned long mb_id)
827 {
828 	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
829 	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
830 	case VIRTIO_MEM_SBM_MB_OFFLINE:
831 		return NOTIFY_OK;
832 	default:
833 		break;
834 	}
835 	dev_warn_ratelimited(&vm->vdev->dev,
836 			     "memory block onlining denied\n");
837 	return NOTIFY_BAD;
838 }
839 
840 static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm,
841 					  unsigned long mb_id)
842 {
843 	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
844 	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
845 	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
846 		virtio_mem_sbm_set_mb_state(vm, mb_id,
847 					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
848 		break;
849 	case VIRTIO_MEM_SBM_MB_KERNEL:
850 	case VIRTIO_MEM_SBM_MB_MOVABLE:
851 		virtio_mem_sbm_set_mb_state(vm, mb_id,
852 					    VIRTIO_MEM_SBM_MB_OFFLINE);
853 		break;
854 	default:
855 		BUG();
856 		break;
857 	}
858 }
859 
860 static void virtio_mem_sbm_notify_online(struct virtio_mem *vm,
861 					 unsigned long mb_id,
862 					 unsigned long start_pfn)
863 {
864 	const bool is_movable = page_zonenum(pfn_to_page(start_pfn)) ==
865 				ZONE_MOVABLE;
866 	int new_state;
867 
868 	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
869 	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
870 		new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL;
871 		if (is_movable)
872 			new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL;
873 		break;
874 	case VIRTIO_MEM_SBM_MB_OFFLINE:
875 		new_state = VIRTIO_MEM_SBM_MB_KERNEL;
876 		if (is_movable)
877 			new_state = VIRTIO_MEM_SBM_MB_MOVABLE;
878 		break;
879 	default:
880 		BUG();
881 		break;
882 	}
883 	virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
884 }
885 
886 static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm,
887 						unsigned long mb_id)
888 {
889 	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
890 	unsigned long pfn;
891 	int sb_id;
892 
893 	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
894 		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
895 			continue;
896 		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
897 			       sb_id * vm->sbm.sb_size);
898 		virtio_mem_fake_offline_going_offline(pfn, nr_pages);
899 	}
900 }
901 
902 static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm,
903 						 unsigned long mb_id)
904 {
905 	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
906 	unsigned long pfn;
907 	int sb_id;
908 
909 	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
910 		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
911 			continue;
912 		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
913 			       sb_id * vm->sbm.sb_size);
914 		virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
915 	}
916 }
917 
918 static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm,
919 						unsigned long bb_id,
920 						unsigned long pfn,
921 						unsigned long nr_pages)
922 {
923 	/*
924 	 * When marked as "fake-offline", all online memory of this device block
925 	 * is allocated by us. Otherwise, we don't have any memory allocated.
926 	 */
927 	if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
928 	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
929 		return;
930 	virtio_mem_fake_offline_going_offline(pfn, nr_pages);
931 }
932 
933 static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm,
934 						 unsigned long bb_id,
935 						 unsigned long pfn,
936 						 unsigned long nr_pages)
937 {
938 	if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
939 	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
940 		return;
941 	virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
942 }
943 
944 /*
945  * This callback will either be called synchronously from add_memory() or
946  * asynchronously (e.g., triggered via user space). We have to be careful
947  * with locking when calling add_memory().
948  */
949 static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
950 					 unsigned long action, void *arg)
951 {
952 	struct virtio_mem *vm = container_of(nb, struct virtio_mem,
953 					     memory_notifier);
954 	struct memory_notify *mhp = arg;
955 	const unsigned long start = PFN_PHYS(mhp->start_pfn);
956 	const unsigned long size = PFN_PHYS(mhp->nr_pages);
957 	int rc = NOTIFY_OK;
958 	unsigned long id;
959 
960 	if (!virtio_mem_overlaps_range(vm, start, size))
961 		return NOTIFY_DONE;
962 
963 	if (vm->in_sbm) {
964 		id = virtio_mem_phys_to_mb_id(start);
965 		/*
966 		 * In SBM, we add memory in separate memory blocks - we expect
967 		 * it to be onlined/offlined in the same granularity. Bail out
968 		 * if this ever changes.
969 		 */
970 		if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
971 				 !IS_ALIGNED(start, memory_block_size_bytes())))
972 			return NOTIFY_BAD;
973 	} else {
974 		id = virtio_mem_phys_to_bb_id(vm, start);
975 		/*
976 		 * In BBM, we only care about onlining/offlining happening
977 		 * within a single big block, we don't care about the
978 		 * actual granularity as we don't track individual Linux
979 		 * memory blocks.
980 		 */
981 		if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1)))
982 			return NOTIFY_BAD;
983 	}
984 
985 	/*
986 	 * Avoid circular locking lockdep warnings. We lock the mutex
987 	 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The
988 	 * blocking_notifier_call_chain() has it's own lock, which gets unlocked
989 	 * between both notifier calls and will bail out. False positive.
990 	 */
991 	lockdep_off();
992 
993 	switch (action) {
994 	case MEM_GOING_OFFLINE:
995 		mutex_lock(&vm->hotplug_mutex);
996 		if (vm->removing) {
997 			rc = notifier_from_errno(-EBUSY);
998 			mutex_unlock(&vm->hotplug_mutex);
999 			break;
1000 		}
1001 		vm->hotplug_active = true;
1002 		if (vm->in_sbm)
1003 			virtio_mem_sbm_notify_going_offline(vm, id);
1004 		else
1005 			virtio_mem_bbm_notify_going_offline(vm, id,
1006 							    mhp->start_pfn,
1007 							    mhp->nr_pages);
1008 		break;
1009 	case MEM_GOING_ONLINE:
1010 		mutex_lock(&vm->hotplug_mutex);
1011 		if (vm->removing) {
1012 			rc = notifier_from_errno(-EBUSY);
1013 			mutex_unlock(&vm->hotplug_mutex);
1014 			break;
1015 		}
1016 		vm->hotplug_active = true;
1017 		if (vm->in_sbm)
1018 			rc = virtio_mem_sbm_notify_going_online(vm, id);
1019 		break;
1020 	case MEM_OFFLINE:
1021 		if (vm->in_sbm)
1022 			virtio_mem_sbm_notify_offline(vm, id);
1023 
1024 		atomic64_add(size, &vm->offline_size);
1025 		/*
1026 		 * Trigger the workqueue. Now that we have some offline memory,
1027 		 * maybe we can handle pending unplug requests.
1028 		 */
1029 		if (!unplug_online)
1030 			virtio_mem_retry(vm);
1031 
1032 		vm->hotplug_active = false;
1033 		mutex_unlock(&vm->hotplug_mutex);
1034 		break;
1035 	case MEM_ONLINE:
1036 		if (vm->in_sbm)
1037 			virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn);
1038 
1039 		atomic64_sub(size, &vm->offline_size);
1040 		/*
1041 		 * Start adding more memory once we onlined half of our
1042 		 * threshold. Don't trigger if it's possibly due to our actipn
1043 		 * (e.g., us adding memory which gets onlined immediately from
1044 		 * the core).
1045 		 */
1046 		if (!atomic_read(&vm->wq_active) &&
1047 		    virtio_mem_could_add_memory(vm, vm->offline_threshold / 2))
1048 			virtio_mem_retry(vm);
1049 
1050 		vm->hotplug_active = false;
1051 		mutex_unlock(&vm->hotplug_mutex);
1052 		break;
1053 	case MEM_CANCEL_OFFLINE:
1054 		if (!vm->hotplug_active)
1055 			break;
1056 		if (vm->in_sbm)
1057 			virtio_mem_sbm_notify_cancel_offline(vm, id);
1058 		else
1059 			virtio_mem_bbm_notify_cancel_offline(vm, id,
1060 							     mhp->start_pfn,
1061 							     mhp->nr_pages);
1062 		vm->hotplug_active = false;
1063 		mutex_unlock(&vm->hotplug_mutex);
1064 		break;
1065 	case MEM_CANCEL_ONLINE:
1066 		if (!vm->hotplug_active)
1067 			break;
1068 		vm->hotplug_active = false;
1069 		mutex_unlock(&vm->hotplug_mutex);
1070 		break;
1071 	default:
1072 		break;
1073 	}
1074 
1075 	lockdep_on();
1076 
1077 	return rc;
1078 }
1079 
1080 /*
1081  * Set a range of pages PG_offline. Remember pages that were never onlined
1082  * (via generic_online_page()) using PageDirty().
1083  */
1084 static void virtio_mem_set_fake_offline(unsigned long pfn,
1085 					unsigned long nr_pages, bool onlined)
1086 {
1087 	page_offline_begin();
1088 	for (; nr_pages--; pfn++) {
1089 		struct page *page = pfn_to_page(pfn);
1090 
1091 		__SetPageOffline(page);
1092 		if (!onlined) {
1093 			SetPageDirty(page);
1094 			/* FIXME: remove after cleanups */
1095 			ClearPageReserved(page);
1096 		}
1097 	}
1098 	page_offline_end();
1099 }
1100 
1101 /*
1102  * Clear PG_offline from a range of pages. If the pages were never onlined,
1103  * (via generic_online_page()), clear PageDirty().
1104  */
1105 static void virtio_mem_clear_fake_offline(unsigned long pfn,
1106 					  unsigned long nr_pages, bool onlined)
1107 {
1108 	for (; nr_pages--; pfn++) {
1109 		struct page *page = pfn_to_page(pfn);
1110 
1111 		__ClearPageOffline(page);
1112 		if (!onlined)
1113 			ClearPageDirty(page);
1114 	}
1115 }
1116 
1117 /*
1118  * Release a range of fake-offline pages to the buddy, effectively
1119  * fake-onlining them.
1120  */
1121 static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
1122 {
1123 	const unsigned long max_nr_pages = MAX_ORDER_NR_PAGES;
1124 	unsigned long i;
1125 
1126 	/*
1127 	 * We are always called at least with MAX_ORDER_NR_PAGES
1128 	 * granularity/alignment (e.g., the way subblocks work). All pages
1129 	 * inside such a block are alike.
1130 	 */
1131 	for (i = 0; i < nr_pages; i += max_nr_pages) {
1132 		struct page *page = pfn_to_page(pfn + i);
1133 
1134 		/*
1135 		 * If the page is PageDirty(), it was kept fake-offline when
1136 		 * onlining the memory block. Otherwise, it was allocated
1137 		 * using alloc_contig_range(). All pages in a subblock are
1138 		 * alike.
1139 		 */
1140 		if (PageDirty(page)) {
1141 			virtio_mem_clear_fake_offline(pfn + i, max_nr_pages,
1142 						      false);
1143 			generic_online_page(page, MAX_ORDER - 1);
1144 		} else {
1145 			virtio_mem_clear_fake_offline(pfn + i, max_nr_pages,
1146 						      true);
1147 			free_contig_range(pfn + i, max_nr_pages);
1148 			adjust_managed_page_count(page, max_nr_pages);
1149 		}
1150 	}
1151 }
1152 
1153 /*
1154  * Try to allocate a range, marking pages fake-offline, effectively
1155  * fake-offlining them.
1156  */
1157 static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages)
1158 {
1159 	const bool is_movable = page_zonenum(pfn_to_page(pfn)) ==
1160 				ZONE_MOVABLE;
1161 	int rc, retry_count;
1162 
1163 	/*
1164 	 * TODO: We want an alloc_contig_range() mode that tries to allocate
1165 	 * harder (e.g., dealing with temporarily pinned pages, PCP), especially
1166 	 * with ZONE_MOVABLE. So for now, retry a couple of times with
1167 	 * ZONE_MOVABLE before giving up - because that zone is supposed to give
1168 	 * some guarantees.
1169 	 */
1170 	for (retry_count = 0; retry_count < 5; retry_count++) {
1171 		rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE,
1172 					GFP_KERNEL);
1173 		if (rc == -ENOMEM)
1174 			/* whoops, out of memory */
1175 			return rc;
1176 		else if (rc && !is_movable)
1177 			break;
1178 		else if (rc)
1179 			continue;
1180 
1181 		virtio_mem_set_fake_offline(pfn, nr_pages, true);
1182 		adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
1183 		return 0;
1184 	}
1185 
1186 	return -EBUSY;
1187 }
1188 
1189 /*
1190  * Handle fake-offline pages when memory is going offline - such that the
1191  * pages can be skipped by mm-core when offlining.
1192  */
1193 static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
1194 						  unsigned long nr_pages)
1195 {
1196 	struct page *page;
1197 	unsigned long i;
1198 
1199 	/*
1200 	 * Drop our reference to the pages so the memory can get offlined
1201 	 * and add the unplugged pages to the managed page counters (so
1202 	 * offlining code can correctly subtract them again).
1203 	 */
1204 	adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
1205 	/* Drop our reference to the pages so the memory can get offlined. */
1206 	for (i = 0; i < nr_pages; i++) {
1207 		page = pfn_to_page(pfn + i);
1208 		if (WARN_ON(!page_ref_dec_and_test(page)))
1209 			dump_page(page, "fake-offline page referenced");
1210 	}
1211 }
1212 
1213 /*
1214  * Handle fake-offline pages when memory offlining is canceled - to undo
1215  * what we did in virtio_mem_fake_offline_going_offline().
1216  */
1217 static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
1218 						   unsigned long nr_pages)
1219 {
1220 	unsigned long i;
1221 
1222 	/*
1223 	 * Get the reference we dropped when going offline and subtract the
1224 	 * unplugged pages from the managed page counters.
1225 	 */
1226 	adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
1227 	for (i = 0; i < nr_pages; i++)
1228 		page_ref_inc(pfn_to_page(pfn + i));
1229 }
1230 
1231 static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
1232 {
1233 	const unsigned long addr = page_to_phys(page);
1234 	unsigned long id, sb_id;
1235 	struct virtio_mem *vm;
1236 	bool do_online;
1237 
1238 	rcu_read_lock();
1239 	list_for_each_entry_rcu(vm, &virtio_mem_devices, next) {
1240 		if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order)))
1241 			continue;
1242 
1243 		if (vm->in_sbm) {
1244 			/*
1245 			 * We exploit here that subblocks have at least
1246 			 * MAX_ORDER_NR_PAGES size/alignment - so we cannot
1247 			 * cross subblocks within one call.
1248 			 */
1249 			id = virtio_mem_phys_to_mb_id(addr);
1250 			sb_id = virtio_mem_phys_to_sb_id(vm, addr);
1251 			do_online = virtio_mem_sbm_test_sb_plugged(vm, id,
1252 								   sb_id, 1);
1253 		} else {
1254 			/*
1255 			 * If the whole block is marked fake offline, keep
1256 			 * everything that way.
1257 			 */
1258 			id = virtio_mem_phys_to_bb_id(vm, addr);
1259 			do_online = virtio_mem_bbm_get_bb_state(vm, id) !=
1260 				    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE;
1261 		}
1262 
1263 		/*
1264 		 * virtio_mem_set_fake_offline() might sleep, we don't need
1265 		 * the device anymore. See virtio_mem_remove() how races
1266 		 * between memory onlining and device removal are handled.
1267 		 */
1268 		rcu_read_unlock();
1269 
1270 		if (do_online)
1271 			generic_online_page(page, order);
1272 		else
1273 			virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order,
1274 						    false);
1275 		return;
1276 	}
1277 	rcu_read_unlock();
1278 
1279 	/* not virtio-mem memory, but e.g., a DIMM. online it */
1280 	generic_online_page(page, order);
1281 }
1282 
1283 static uint64_t virtio_mem_send_request(struct virtio_mem *vm,
1284 					const struct virtio_mem_req *req)
1285 {
1286 	struct scatterlist *sgs[2], sg_req, sg_resp;
1287 	unsigned int len;
1288 	int rc;
1289 
1290 	/* don't use the request residing on the stack (vaddr) */
1291 	vm->req = *req;
1292 
1293 	/* out: buffer for request */
1294 	sg_init_one(&sg_req, &vm->req, sizeof(vm->req));
1295 	sgs[0] = &sg_req;
1296 
1297 	/* in: buffer for response */
1298 	sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp));
1299 	sgs[1] = &sg_resp;
1300 
1301 	rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL);
1302 	if (rc < 0)
1303 		return rc;
1304 
1305 	virtqueue_kick(vm->vq);
1306 
1307 	/* wait for a response */
1308 	wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len));
1309 
1310 	return virtio16_to_cpu(vm->vdev, vm->resp.type);
1311 }
1312 
1313 static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
1314 					uint64_t size)
1315 {
1316 	const uint64_t nb_vm_blocks = size / vm->device_block_size;
1317 	const struct virtio_mem_req req = {
1318 		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG),
1319 		.u.plug.addr = cpu_to_virtio64(vm->vdev, addr),
1320 		.u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
1321 	};
1322 	int rc = -ENOMEM;
1323 
1324 	if (atomic_read(&vm->config_changed))
1325 		return -EAGAIN;
1326 
1327 	dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr,
1328 		addr + size - 1);
1329 
1330 	switch (virtio_mem_send_request(vm, &req)) {
1331 	case VIRTIO_MEM_RESP_ACK:
1332 		vm->plugged_size += size;
1333 		return 0;
1334 	case VIRTIO_MEM_RESP_NACK:
1335 		rc = -EAGAIN;
1336 		break;
1337 	case VIRTIO_MEM_RESP_BUSY:
1338 		rc = -ETXTBSY;
1339 		break;
1340 	case VIRTIO_MEM_RESP_ERROR:
1341 		rc = -EINVAL;
1342 		break;
1343 	default:
1344 		break;
1345 	}
1346 
1347 	dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc);
1348 	return rc;
1349 }
1350 
1351 static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
1352 					  uint64_t size)
1353 {
1354 	const uint64_t nb_vm_blocks = size / vm->device_block_size;
1355 	const struct virtio_mem_req req = {
1356 		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG),
1357 		.u.unplug.addr = cpu_to_virtio64(vm->vdev, addr),
1358 		.u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
1359 	};
1360 	int rc = -ENOMEM;
1361 
1362 	if (atomic_read(&vm->config_changed))
1363 		return -EAGAIN;
1364 
1365 	dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr,
1366 		addr + size - 1);
1367 
1368 	switch (virtio_mem_send_request(vm, &req)) {
1369 	case VIRTIO_MEM_RESP_ACK:
1370 		vm->plugged_size -= size;
1371 		return 0;
1372 	case VIRTIO_MEM_RESP_BUSY:
1373 		rc = -ETXTBSY;
1374 		break;
1375 	case VIRTIO_MEM_RESP_ERROR:
1376 		rc = -EINVAL;
1377 		break;
1378 	default:
1379 		break;
1380 	}
1381 
1382 	dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc);
1383 	return rc;
1384 }
1385 
1386 static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
1387 {
1388 	const struct virtio_mem_req req = {
1389 		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL),
1390 	};
1391 	int rc = -ENOMEM;
1392 
1393 	dev_dbg(&vm->vdev->dev, "unplugging all memory");
1394 
1395 	switch (virtio_mem_send_request(vm, &req)) {
1396 	case VIRTIO_MEM_RESP_ACK:
1397 		vm->unplug_all_required = false;
1398 		vm->plugged_size = 0;
1399 		/* usable region might have shrunk */
1400 		atomic_set(&vm->config_changed, 1);
1401 		return 0;
1402 	case VIRTIO_MEM_RESP_BUSY:
1403 		rc = -ETXTBSY;
1404 		break;
1405 	default:
1406 		break;
1407 	}
1408 
1409 	dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc);
1410 	return rc;
1411 }
1412 
1413 /*
1414  * Plug selected subblocks. Updates the plugged state, but not the state
1415  * of the memory block.
1416  */
1417 static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
1418 				  int sb_id, int count)
1419 {
1420 	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
1421 			      sb_id * vm->sbm.sb_size;
1422 	const uint64_t size = count * vm->sbm.sb_size;
1423 	int rc;
1424 
1425 	rc = virtio_mem_send_plug_request(vm, addr, size);
1426 	if (!rc)
1427 		virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
1428 	return rc;
1429 }
1430 
1431 /*
1432  * Unplug selected subblocks. Updates the plugged state, but not the state
1433  * of the memory block.
1434  */
1435 static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
1436 				    int sb_id, int count)
1437 {
1438 	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
1439 			      sb_id * vm->sbm.sb_size;
1440 	const uint64_t size = count * vm->sbm.sb_size;
1441 	int rc;
1442 
1443 	rc = virtio_mem_send_unplug_request(vm, addr, size);
1444 	if (!rc)
1445 		virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count);
1446 	return rc;
1447 }
1448 
1449 /*
1450  * Request to unplug a big block.
1451  *
1452  * Will not modify the state of the big block.
1453  */
1454 static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id)
1455 {
1456 	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
1457 	const uint64_t size = vm->bbm.bb_size;
1458 
1459 	return virtio_mem_send_unplug_request(vm, addr, size);
1460 }
1461 
1462 /*
1463  * Request to plug a big block.
1464  *
1465  * Will not modify the state of the big block.
1466  */
1467 static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id)
1468 {
1469 	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
1470 	const uint64_t size = vm->bbm.bb_size;
1471 
1472 	return virtio_mem_send_plug_request(vm, addr, size);
1473 }
1474 
1475 /*
1476  * Unplug the desired number of plugged subblocks of a offline or not-added
1477  * memory block. Will fail if any subblock cannot get unplugged (instead of
1478  * skipping it).
1479  *
1480  * Will not modify the state of the memory block.
1481  *
1482  * Note: can fail after some subblocks were unplugged.
1483  */
1484 static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm,
1485 					    unsigned long mb_id, uint64_t *nb_sb)
1486 {
1487 	int sb_id, count;
1488 	int rc;
1489 
1490 	sb_id = vm->sbm.sbs_per_mb - 1;
1491 	while (*nb_sb) {
1492 		/* Find the next candidate subblock */
1493 		while (sb_id >= 0 &&
1494 		       virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1))
1495 			sb_id--;
1496 		if (sb_id < 0)
1497 			break;
1498 		/* Try to unplug multiple subblocks at a time */
1499 		count = 1;
1500 		while (count < *nb_sb && sb_id > 0 &&
1501 		       virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
1502 			count++;
1503 			sb_id--;
1504 		}
1505 
1506 		rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
1507 		if (rc)
1508 			return rc;
1509 		*nb_sb -= count;
1510 		sb_id--;
1511 	}
1512 
1513 	return 0;
1514 }
1515 
1516 /*
1517  * Unplug all plugged subblocks of an offline or not-added memory block.
1518  *
1519  * Will not modify the state of the memory block.
1520  *
1521  * Note: can fail after some subblocks were unplugged.
1522  */
1523 static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id)
1524 {
1525 	uint64_t nb_sb = vm->sbm.sbs_per_mb;
1526 
1527 	return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb);
1528 }
1529 
1530 /*
1531  * Prepare tracking data for the next memory block.
1532  */
1533 static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm,
1534 					  unsigned long *mb_id)
1535 {
1536 	int rc;
1537 
1538 	if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id)
1539 		return -ENOSPC;
1540 
1541 	/* Resize the state array if required. */
1542 	rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm);
1543 	if (rc)
1544 		return rc;
1545 
1546 	/* Resize the subblock bitmap if required. */
1547 	rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm);
1548 	if (rc)
1549 		return rc;
1550 
1551 	vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++;
1552 	*mb_id = vm->sbm.next_mb_id++;
1553 	return 0;
1554 }
1555 
1556 /*
1557  * Try to plug the desired number of subblocks and add the memory block
1558  * to Linux.
1559  *
1560  * Will modify the state of the memory block.
1561  */
1562 static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
1563 					  unsigned long mb_id, uint64_t *nb_sb)
1564 {
1565 	const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb);
1566 	int rc;
1567 
1568 	if (WARN_ON_ONCE(!count))
1569 		return -EINVAL;
1570 
1571 	/*
1572 	 * Plug the requested number of subblocks before adding it to linux,
1573 	 * so that onlining will directly online all plugged subblocks.
1574 	 */
1575 	rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count);
1576 	if (rc)
1577 		return rc;
1578 
1579 	/*
1580 	 * Mark the block properly offline before adding it to Linux,
1581 	 * so the memory notifiers will find the block in the right state.
1582 	 */
1583 	if (count == vm->sbm.sbs_per_mb)
1584 		virtio_mem_sbm_set_mb_state(vm, mb_id,
1585 					    VIRTIO_MEM_SBM_MB_OFFLINE);
1586 	else
1587 		virtio_mem_sbm_set_mb_state(vm, mb_id,
1588 					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
1589 
1590 	/* Add the memory block to linux - if that fails, try to unplug. */
1591 	rc = virtio_mem_sbm_add_mb(vm, mb_id);
1592 	if (rc) {
1593 		int new_state = VIRTIO_MEM_SBM_MB_UNUSED;
1594 
1595 		if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count))
1596 			new_state = VIRTIO_MEM_SBM_MB_PLUGGED;
1597 		virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
1598 		return rc;
1599 	}
1600 
1601 	*nb_sb -= count;
1602 	return 0;
1603 }
1604 
1605 /*
1606  * Try to plug the desired number of subblocks of a memory block that
1607  * is already added to Linux.
1608  *
1609  * Will modify the state of the memory block.
1610  *
1611  * Note: Can fail after some subblocks were successfully plugged.
1612  */
1613 static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
1614 				      unsigned long mb_id, uint64_t *nb_sb)
1615 {
1616 	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
1617 	unsigned long pfn, nr_pages;
1618 	int sb_id, count;
1619 	int rc;
1620 
1621 	if (WARN_ON_ONCE(!*nb_sb))
1622 		return -EINVAL;
1623 
1624 	while (*nb_sb) {
1625 		sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id);
1626 		if (sb_id >= vm->sbm.sbs_per_mb)
1627 			break;
1628 		count = 1;
1629 		while (count < *nb_sb &&
1630 		       sb_id + count < vm->sbm.sbs_per_mb &&
1631 		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1))
1632 			count++;
1633 
1634 		rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count);
1635 		if (rc)
1636 			return rc;
1637 		*nb_sb -= count;
1638 		if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL)
1639 			continue;
1640 
1641 		/* fake-online the pages if the memory block is online */
1642 		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
1643 			       sb_id * vm->sbm.sb_size);
1644 		nr_pages = PFN_DOWN(count * vm->sbm.sb_size);
1645 		virtio_mem_fake_online(pfn, nr_pages);
1646 	}
1647 
1648 	if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
1649 		virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1);
1650 
1651 	return 0;
1652 }
1653 
1654 static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
1655 {
1656 	const int mb_states[] = {
1657 		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
1658 		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
1659 		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
1660 	};
1661 	uint64_t nb_sb = diff / vm->sbm.sb_size;
1662 	unsigned long mb_id;
1663 	int rc, i;
1664 
1665 	if (!nb_sb)
1666 		return 0;
1667 
1668 	/* Don't race with onlining/offlining */
1669 	mutex_lock(&vm->hotplug_mutex);
1670 
1671 	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
1672 		virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) {
1673 			rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb);
1674 			if (rc || !nb_sb)
1675 				goto out_unlock;
1676 			cond_resched();
1677 		}
1678 	}
1679 
1680 	/*
1681 	 * We won't be working on online/offline memory blocks from this point,
1682 	 * so we can't race with memory onlining/offlining. Drop the mutex.
1683 	 */
1684 	mutex_unlock(&vm->hotplug_mutex);
1685 
1686 	/* Try to plug and add unused blocks */
1687 	virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) {
1688 		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
1689 			return -ENOSPC;
1690 
1691 		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
1692 		if (rc || !nb_sb)
1693 			return rc;
1694 		cond_resched();
1695 	}
1696 
1697 	/* Try to prepare, plug and add new blocks */
1698 	while (nb_sb) {
1699 		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
1700 			return -ENOSPC;
1701 
1702 		rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id);
1703 		if (rc)
1704 			return rc;
1705 		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
1706 		if (rc)
1707 			return rc;
1708 		cond_resched();
1709 	}
1710 
1711 	return 0;
1712 out_unlock:
1713 	mutex_unlock(&vm->hotplug_mutex);
1714 	return rc;
1715 }
1716 
1717 /*
1718  * Plug a big block and add it to Linux.
1719  *
1720  * Will modify the state of the big block.
1721  */
1722 static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm,
1723 					  unsigned long bb_id)
1724 {
1725 	int rc;
1726 
1727 	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
1728 			 VIRTIO_MEM_BBM_BB_UNUSED))
1729 		return -EINVAL;
1730 
1731 	rc = virtio_mem_bbm_plug_bb(vm, bb_id);
1732 	if (rc)
1733 		return rc;
1734 	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
1735 
1736 	rc = virtio_mem_bbm_add_bb(vm, bb_id);
1737 	if (rc) {
1738 		if (!virtio_mem_bbm_unplug_bb(vm, bb_id))
1739 			virtio_mem_bbm_set_bb_state(vm, bb_id,
1740 						    VIRTIO_MEM_BBM_BB_UNUSED);
1741 		else
1742 			/* Retry from the main loop. */
1743 			virtio_mem_bbm_set_bb_state(vm, bb_id,
1744 						    VIRTIO_MEM_BBM_BB_PLUGGED);
1745 		return rc;
1746 	}
1747 	return 0;
1748 }
1749 
1750 /*
1751  * Prepare tracking data for the next big block.
1752  */
1753 static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm,
1754 					  unsigned long *bb_id)
1755 {
1756 	int rc;
1757 
1758 	if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id)
1759 		return -ENOSPC;
1760 
1761 	/* Resize the big block state array if required. */
1762 	rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm);
1763 	if (rc)
1764 		return rc;
1765 
1766 	vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++;
1767 	*bb_id = vm->bbm.next_bb_id;
1768 	vm->bbm.next_bb_id++;
1769 	return 0;
1770 }
1771 
1772 static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff)
1773 {
1774 	uint64_t nb_bb = diff / vm->bbm.bb_size;
1775 	unsigned long bb_id;
1776 	int rc;
1777 
1778 	if (!nb_bb)
1779 		return 0;
1780 
1781 	/* Try to plug and add unused big blocks */
1782 	virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) {
1783 		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
1784 			return -ENOSPC;
1785 
1786 		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
1787 		if (!rc)
1788 			nb_bb--;
1789 		if (rc || !nb_bb)
1790 			return rc;
1791 		cond_resched();
1792 	}
1793 
1794 	/* Try to prepare, plug and add new big blocks */
1795 	while (nb_bb) {
1796 		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
1797 			return -ENOSPC;
1798 
1799 		rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id);
1800 		if (rc)
1801 			return rc;
1802 		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
1803 		if (!rc)
1804 			nb_bb--;
1805 		if (rc)
1806 			return rc;
1807 		cond_resched();
1808 	}
1809 
1810 	return 0;
1811 }
1812 
1813 /*
1814  * Try to plug the requested amount of memory.
1815  */
1816 static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
1817 {
1818 	if (vm->in_sbm)
1819 		return virtio_mem_sbm_plug_request(vm, diff);
1820 	return virtio_mem_bbm_plug_request(vm, diff);
1821 }
1822 
1823 /*
1824  * Unplug the desired number of plugged subblocks of an offline memory block.
1825  * Will fail if any subblock cannot get unplugged (instead of skipping it).
1826  *
1827  * Will modify the state of the memory block. Might temporarily drop the
1828  * hotplug_mutex.
1829  *
1830  * Note: Can fail after some subblocks were successfully unplugged.
1831  */
1832 static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm,
1833 						unsigned long mb_id,
1834 						uint64_t *nb_sb)
1835 {
1836 	int rc;
1837 
1838 	rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb);
1839 
1840 	/* some subblocks might have been unplugged even on failure */
1841 	if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
1842 		virtio_mem_sbm_set_mb_state(vm, mb_id,
1843 					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
1844 	if (rc)
1845 		return rc;
1846 
1847 	if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
1848 		/*
1849 		 * Remove the block from Linux - this should never fail.
1850 		 * Hinder the block from getting onlined by marking it
1851 		 * unplugged. Temporarily drop the mutex, so
1852 		 * any pending GOING_ONLINE requests can be serviced/rejected.
1853 		 */
1854 		virtio_mem_sbm_set_mb_state(vm, mb_id,
1855 					    VIRTIO_MEM_SBM_MB_UNUSED);
1856 
1857 		mutex_unlock(&vm->hotplug_mutex);
1858 		rc = virtio_mem_sbm_remove_mb(vm, mb_id);
1859 		BUG_ON(rc);
1860 		mutex_lock(&vm->hotplug_mutex);
1861 	}
1862 	return 0;
1863 }
1864 
1865 /*
1866  * Unplug the given plugged subblocks of an online memory block.
1867  *
1868  * Will modify the state of the memory block.
1869  */
1870 static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm,
1871 					   unsigned long mb_id, int sb_id,
1872 					   int count)
1873 {
1874 	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count;
1875 	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
1876 	unsigned long start_pfn;
1877 	int rc;
1878 
1879 	start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
1880 			     sb_id * vm->sbm.sb_size);
1881 
1882 	rc = virtio_mem_fake_offline(start_pfn, nr_pages);
1883 	if (rc)
1884 		return rc;
1885 
1886 	/* Try to unplug the allocated memory */
1887 	rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
1888 	if (rc) {
1889 		/* Return the memory to the buddy. */
1890 		virtio_mem_fake_online(start_pfn, nr_pages);
1891 		return rc;
1892 	}
1893 
1894 	switch (old_state) {
1895 	case VIRTIO_MEM_SBM_MB_KERNEL:
1896 		virtio_mem_sbm_set_mb_state(vm, mb_id,
1897 					    VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL);
1898 		break;
1899 	case VIRTIO_MEM_SBM_MB_MOVABLE:
1900 		virtio_mem_sbm_set_mb_state(vm, mb_id,
1901 					    VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL);
1902 		break;
1903 	}
1904 
1905 	return 0;
1906 }
1907 
1908 /*
1909  * Unplug the desired number of plugged subblocks of an online memory block.
1910  * Will skip subblock that are busy.
1911  *
1912  * Will modify the state of the memory block. Might temporarily drop the
1913  * hotplug_mutex.
1914  *
1915  * Note: Can fail after some subblocks were successfully unplugged. Can
1916  *       return 0 even if subblocks were busy and could not get unplugged.
1917  */
1918 static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm,
1919 					       unsigned long mb_id,
1920 					       uint64_t *nb_sb)
1921 {
1922 	int rc, sb_id;
1923 
1924 	/* If possible, try to unplug the complete block in one shot. */
1925 	if (*nb_sb >= vm->sbm.sbs_per_mb &&
1926 	    virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
1927 		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0,
1928 						     vm->sbm.sbs_per_mb);
1929 		if (!rc) {
1930 			*nb_sb -= vm->sbm.sbs_per_mb;
1931 			goto unplugged;
1932 		} else if (rc != -EBUSY)
1933 			return rc;
1934 	}
1935 
1936 	/* Fallback to single subblocks. */
1937 	for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
1938 		/* Find the next candidate subblock */
1939 		while (sb_id >= 0 &&
1940 		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
1941 			sb_id--;
1942 		if (sb_id < 0)
1943 			break;
1944 
1945 		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1);
1946 		if (rc == -EBUSY)
1947 			continue;
1948 		else if (rc)
1949 			return rc;
1950 		*nb_sb -= 1;
1951 	}
1952 
1953 unplugged:
1954 	/*
1955 	 * Once all subblocks of a memory block were unplugged, offline and
1956 	 * remove it. This will usually not fail, as no memory is in use
1957 	 * anymore - however some other notifiers might NACK the request.
1958 	 */
1959 	if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
1960 		mutex_unlock(&vm->hotplug_mutex);
1961 		rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id);
1962 		mutex_lock(&vm->hotplug_mutex);
1963 		if (!rc)
1964 			virtio_mem_sbm_set_mb_state(vm, mb_id,
1965 						    VIRTIO_MEM_SBM_MB_UNUSED);
1966 	}
1967 
1968 	return 0;
1969 }
1970 
1971 /*
1972  * Unplug the desired number of plugged subblocks of a memory block that is
1973  * already added to Linux. Will skip subblock of online memory blocks that are
1974  * busy (by the OS). Will fail if any subblock that's not busy cannot get
1975  * unplugged.
1976  *
1977  * Will modify the state of the memory block. Might temporarily drop the
1978  * hotplug_mutex.
1979  *
1980  * Note: Can fail after some subblocks were successfully unplugged. Can
1981  *       return 0 even if subblocks were busy and could not get unplugged.
1982  */
1983 static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
1984 					unsigned long mb_id,
1985 					uint64_t *nb_sb)
1986 {
1987 	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
1988 
1989 	switch (old_state) {
1990 	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
1991 	case VIRTIO_MEM_SBM_MB_KERNEL:
1992 	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
1993 	case VIRTIO_MEM_SBM_MB_MOVABLE:
1994 		return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb);
1995 	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
1996 	case VIRTIO_MEM_SBM_MB_OFFLINE:
1997 		return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb);
1998 	}
1999 	return -EINVAL;
2000 }
2001 
2002 static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
2003 {
2004 	const int mb_states[] = {
2005 		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
2006 		VIRTIO_MEM_SBM_MB_OFFLINE,
2007 		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
2008 		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
2009 		VIRTIO_MEM_SBM_MB_MOVABLE,
2010 		VIRTIO_MEM_SBM_MB_KERNEL,
2011 	};
2012 	uint64_t nb_sb = diff / vm->sbm.sb_size;
2013 	unsigned long mb_id;
2014 	int rc, i;
2015 
2016 	if (!nb_sb)
2017 		return 0;
2018 
2019 	/*
2020 	 * We'll drop the mutex a couple of times when it is safe to do so.
2021 	 * This might result in some blocks switching the state (online/offline)
2022 	 * and we could miss them in this run - we will retry again later.
2023 	 */
2024 	mutex_lock(&vm->hotplug_mutex);
2025 
2026 	/*
2027 	 * We try unplug from partially plugged blocks first, to try removing
2028 	 * whole memory blocks along with metadata. We prioritize ZONE_MOVABLE
2029 	 * as it's more reliable to unplug memory and remove whole memory
2030 	 * blocks, and we don't want to trigger a zone imbalances by
2031 	 * accidentially removing too much kernel memory.
2032 	 */
2033 	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
2034 		virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) {
2035 			rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb);
2036 			if (rc || !nb_sb)
2037 				goto out_unlock;
2038 			mutex_unlock(&vm->hotplug_mutex);
2039 			cond_resched();
2040 			mutex_lock(&vm->hotplug_mutex);
2041 		}
2042 		if (!unplug_online && i == 1) {
2043 			mutex_unlock(&vm->hotplug_mutex);
2044 			return 0;
2045 		}
2046 	}
2047 
2048 	mutex_unlock(&vm->hotplug_mutex);
2049 	return nb_sb ? -EBUSY : 0;
2050 out_unlock:
2051 	mutex_unlock(&vm->hotplug_mutex);
2052 	return rc;
2053 }
2054 
2055 /*
2056  * Try to offline and remove a big block from Linux and unplug it. Will fail
2057  * with -EBUSY if some memory is busy and cannot get unplugged.
2058  *
2059  * Will modify the state of the memory block. Might temporarily drop the
2060  * hotplug_mutex.
2061  */
2062 static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
2063 						       unsigned long bb_id)
2064 {
2065 	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2066 	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2067 	unsigned long end_pfn = start_pfn + nr_pages;
2068 	unsigned long pfn;
2069 	struct page *page;
2070 	int rc;
2071 
2072 	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
2073 			 VIRTIO_MEM_BBM_BB_ADDED))
2074 		return -EINVAL;
2075 
2076 	if (bbm_safe_unplug) {
2077 		/*
2078 		 * Start by fake-offlining all memory. Once we marked the device
2079 		 * block as fake-offline, all newly onlined memory will
2080 		 * automatically be kept fake-offline. Protect from concurrent
2081 		 * onlining/offlining until we have a consistent state.
2082 		 */
2083 		mutex_lock(&vm->hotplug_mutex);
2084 		virtio_mem_bbm_set_bb_state(vm, bb_id,
2085 					    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);
2086 
2087 		for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
2088 			page = pfn_to_online_page(pfn);
2089 			if (!page)
2090 				continue;
2091 
2092 			rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION);
2093 			if (rc) {
2094 				end_pfn = pfn;
2095 				goto rollback_safe_unplug;
2096 			}
2097 		}
2098 		mutex_unlock(&vm->hotplug_mutex);
2099 	}
2100 
2101 	rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
2102 	if (rc) {
2103 		if (bbm_safe_unplug) {
2104 			mutex_lock(&vm->hotplug_mutex);
2105 			goto rollback_safe_unplug;
2106 		}
2107 		return rc;
2108 	}
2109 
2110 	rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
2111 	if (rc)
2112 		virtio_mem_bbm_set_bb_state(vm, bb_id,
2113 					    VIRTIO_MEM_BBM_BB_PLUGGED);
2114 	else
2115 		virtio_mem_bbm_set_bb_state(vm, bb_id,
2116 					    VIRTIO_MEM_BBM_BB_UNUSED);
2117 	return rc;
2118 
2119 rollback_safe_unplug:
2120 	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
2121 		page = pfn_to_online_page(pfn);
2122 		if (!page)
2123 			continue;
2124 		virtio_mem_fake_online(pfn, PAGES_PER_SECTION);
2125 	}
2126 	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
2127 	mutex_unlock(&vm->hotplug_mutex);
2128 	return rc;
2129 }
2130 
2131 /*
2132  * Test if a big block is completely offline.
2133  */
2134 static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm,
2135 					 unsigned long bb_id)
2136 {
2137 	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2138 	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2139 	unsigned long pfn;
2140 
2141 	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
2142 	     pfn += PAGES_PER_SECTION) {
2143 		if (pfn_to_online_page(pfn))
2144 			return false;
2145 	}
2146 
2147 	return true;
2148 }
2149 
2150 /*
2151  * Test if a big block is completely onlined to ZONE_MOVABLE (or offline).
2152  */
2153 static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm,
2154 					 unsigned long bb_id)
2155 {
2156 	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2157 	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2158 	struct page *page;
2159 	unsigned long pfn;
2160 
2161 	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
2162 	     pfn += PAGES_PER_SECTION) {
2163 		page = pfn_to_online_page(pfn);
2164 		if (!page)
2165 			continue;
2166 		if (page_zonenum(page) != ZONE_MOVABLE)
2167 			return false;
2168 	}
2169 
2170 	return true;
2171 }
2172 
2173 static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
2174 {
2175 	uint64_t nb_bb = diff / vm->bbm.bb_size;
2176 	uint64_t bb_id;
2177 	int rc, i;
2178 
2179 	if (!nb_bb)
2180 		return 0;
2181 
2182 	/*
2183 	 * Try to unplug big blocks. Similar to SBM, start with offline
2184 	 * big blocks.
2185 	 */
2186 	for (i = 0; i < 3; i++) {
2187 		virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
2188 			cond_resched();
2189 
2190 			/*
2191 			 * As we're holding no locks, these checks are racy,
2192 			 * but we don't care.
2193 			 */
2194 			if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id))
2195 				continue;
2196 			if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id))
2197 				continue;
2198 			rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id);
2199 			if (rc == -EBUSY)
2200 				continue;
2201 			if (!rc)
2202 				nb_bb--;
2203 			if (rc || !nb_bb)
2204 				return rc;
2205 		}
2206 		if (i == 0 && !unplug_online)
2207 			return 0;
2208 	}
2209 
2210 	return nb_bb ? -EBUSY : 0;
2211 }
2212 
2213 /*
2214  * Try to unplug the requested amount of memory.
2215  */
2216 static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
2217 {
2218 	if (vm->in_sbm)
2219 		return virtio_mem_sbm_unplug_request(vm, diff);
2220 	return virtio_mem_bbm_unplug_request(vm, diff);
2221 }
2222 
2223 /*
2224  * Try to unplug all blocks that couldn't be unplugged before, for example,
2225  * because the hypervisor was busy.
2226  */
2227 static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm)
2228 {
2229 	unsigned long id;
2230 	int rc;
2231 
2232 	if (!vm->in_sbm) {
2233 		virtio_mem_bbm_for_each_bb(vm, id,
2234 					   VIRTIO_MEM_BBM_BB_PLUGGED) {
2235 			rc = virtio_mem_bbm_unplug_bb(vm, id);
2236 			if (rc)
2237 				return rc;
2238 			virtio_mem_bbm_set_bb_state(vm, id,
2239 						    VIRTIO_MEM_BBM_BB_UNUSED);
2240 		}
2241 		return 0;
2242 	}
2243 
2244 	virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) {
2245 		rc = virtio_mem_sbm_unplug_mb(vm, id);
2246 		if (rc)
2247 			return rc;
2248 		virtio_mem_sbm_set_mb_state(vm, id,
2249 					    VIRTIO_MEM_SBM_MB_UNUSED);
2250 	}
2251 
2252 	return 0;
2253 }
2254 
2255 /*
2256  * Update all parts of the config that could have changed.
2257  */
2258 static void virtio_mem_refresh_config(struct virtio_mem *vm)
2259 {
2260 	const struct range pluggable_range = mhp_get_pluggable_range(true);
2261 	uint64_t new_plugged_size, usable_region_size, end_addr;
2262 
2263 	/* the plugged_size is just a reflection of what _we_ did previously */
2264 	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
2265 			&new_plugged_size);
2266 	if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size))
2267 		vm->plugged_size = new_plugged_size;
2268 
2269 	/* calculate the last usable memory block id */
2270 	virtio_cread_le(vm->vdev, struct virtio_mem_config,
2271 			usable_region_size, &usable_region_size);
2272 	end_addr = min(vm->addr + usable_region_size - 1,
2273 		       pluggable_range.end);
2274 
2275 	if (vm->in_sbm) {
2276 		vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr);
2277 		if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes()))
2278 			vm->sbm.last_usable_mb_id--;
2279 	} else {
2280 		vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm,
2281 								     end_addr);
2282 		if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size))
2283 			vm->bbm.last_usable_bb_id--;
2284 	}
2285 	/*
2286 	 * If we cannot plug any of our device memory (e.g., nothing in the
2287 	 * usable region is addressable), the last usable memory block id will
2288 	 * be smaller than the first usable memory block id. We'll stop
2289 	 * attempting to add memory with -ENOSPC from our main loop.
2290 	 */
2291 
2292 	/* see if there is a request to change the size */
2293 	virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size,
2294 			&vm->requested_size);
2295 
2296 	dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size);
2297 	dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size);
2298 }
2299 
2300 /*
2301  * Workqueue function for handling plug/unplug requests and config updates.
2302  */
2303 static void virtio_mem_run_wq(struct work_struct *work)
2304 {
2305 	struct virtio_mem *vm = container_of(work, struct virtio_mem, wq);
2306 	uint64_t diff;
2307 	int rc;
2308 
2309 	if (unlikely(vm->in_kdump)) {
2310 		dev_warn_once(&vm->vdev->dev,
2311 			     "unexpected workqueue run in kdump kernel\n");
2312 		return;
2313 	}
2314 
2315 	hrtimer_cancel(&vm->retry_timer);
2316 
2317 	if (vm->broken)
2318 		return;
2319 
2320 	atomic_set(&vm->wq_active, 1);
2321 retry:
2322 	rc = 0;
2323 
2324 	/* Make sure we start with a clean state if there are leftovers. */
2325 	if (unlikely(vm->unplug_all_required))
2326 		rc = virtio_mem_send_unplug_all_request(vm);
2327 
2328 	if (atomic_read(&vm->config_changed)) {
2329 		atomic_set(&vm->config_changed, 0);
2330 		virtio_mem_refresh_config(vm);
2331 	}
2332 
2333 	/* Unplug any leftovers from previous runs */
2334 	if (!rc)
2335 		rc = virtio_mem_unplug_pending_mb(vm);
2336 
2337 	if (!rc && vm->requested_size != vm->plugged_size) {
2338 		if (vm->requested_size > vm->plugged_size) {
2339 			diff = vm->requested_size - vm->plugged_size;
2340 			rc = virtio_mem_plug_request(vm, diff);
2341 		} else {
2342 			diff = vm->plugged_size - vm->requested_size;
2343 			rc = virtio_mem_unplug_request(vm, diff);
2344 		}
2345 	}
2346 
2347 	switch (rc) {
2348 	case 0:
2349 		vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
2350 		break;
2351 	case -ENOSPC:
2352 		/*
2353 		 * We cannot add any more memory (alignment, physical limit)
2354 		 * or we have too many offline memory blocks.
2355 		 */
2356 		break;
2357 	case -ETXTBSY:
2358 		/*
2359 		 * The hypervisor cannot process our request right now
2360 		 * (e.g., out of memory, migrating);
2361 		 */
2362 	case -EBUSY:
2363 		/*
2364 		 * We cannot free up any memory to unplug it (all plugged memory
2365 		 * is busy).
2366 		 */
2367 	case -ENOMEM:
2368 		/* Out of memory, try again later. */
2369 		hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms),
2370 			      HRTIMER_MODE_REL);
2371 		break;
2372 	case -EAGAIN:
2373 		/* Retry immediately (e.g., the config changed). */
2374 		goto retry;
2375 	default:
2376 		/* Unknown error, mark as broken */
2377 		dev_err(&vm->vdev->dev,
2378 			"unknown error, marking device broken: %d\n", rc);
2379 		vm->broken = true;
2380 	}
2381 
2382 	atomic_set(&vm->wq_active, 0);
2383 }
2384 
2385 static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
2386 {
2387 	struct virtio_mem *vm = container_of(timer, struct virtio_mem,
2388 					     retry_timer);
2389 
2390 	virtio_mem_retry(vm);
2391 	vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2,
2392 				   VIRTIO_MEM_RETRY_TIMER_MAX_MS);
2393 	return HRTIMER_NORESTART;
2394 }
2395 
2396 static void virtio_mem_handle_response(struct virtqueue *vq)
2397 {
2398 	struct virtio_mem *vm = vq->vdev->priv;
2399 
2400 	wake_up(&vm->host_resp);
2401 }
2402 
2403 static int virtio_mem_init_vq(struct virtio_mem *vm)
2404 {
2405 	struct virtqueue *vq;
2406 
2407 	vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response,
2408 				   "guest-request");
2409 	if (IS_ERR(vq))
2410 		return PTR_ERR(vq);
2411 	vm->vq = vq;
2412 
2413 	return 0;
2414 }
2415 
2416 static int virtio_mem_init_hotplug(struct virtio_mem *vm)
2417 {
2418 	const struct range pluggable_range = mhp_get_pluggable_range(true);
2419 	uint64_t unit_pages, sb_size, addr;
2420 	int rc;
2421 
2422 	/* bad device setup - warn only */
2423 	if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
2424 		dev_warn(&vm->vdev->dev,
2425 			 "The alignment of the physical start address can make some memory unusable.\n");
2426 	if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes()))
2427 		dev_warn(&vm->vdev->dev,
2428 			 "The alignment of the physical end address can make some memory unusable.\n");
2429 	if (vm->addr < pluggable_range.start ||
2430 	    vm->addr + vm->region_size - 1 > pluggable_range.end)
2431 		dev_warn(&vm->vdev->dev,
2432 			 "Some device memory is not addressable/pluggable. This can make some memory unusable.\n");
2433 
2434 	/* Prepare the offline threshold - make sure we can add two blocks. */
2435 	vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(),
2436 				      VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);
2437 
2438 	/*
2439 	 * We want subblocks to span at least MAX_ORDER_NR_PAGES and
2440 	 * pageblock_nr_pages pages. This:
2441 	 * - Simplifies our page onlining code (virtio_mem_online_page_cb)
2442 	 *   and fake page onlining code (virtio_mem_fake_online).
2443 	 * - Is required for now for alloc_contig_range() to work reliably -
2444 	 *   it doesn't properly handle smaller granularity on ZONE_NORMAL.
2445 	 */
2446 	sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES,
2447 			pageblock_nr_pages) * PAGE_SIZE;
2448 	sb_size = max_t(uint64_t, vm->device_block_size, sb_size);
2449 
2450 	if (sb_size < memory_block_size_bytes() && !force_bbm) {
2451 		/* SBM: At least two subblocks per Linux memory block. */
2452 		vm->in_sbm = true;
2453 		vm->sbm.sb_size = sb_size;
2454 		vm->sbm.sbs_per_mb = memory_block_size_bytes() /
2455 				     vm->sbm.sb_size;
2456 
2457 		/* Round up to the next full memory block */
2458 		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
2459 		       memory_block_size_bytes() - 1;
2460 		vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr);
2461 		vm->sbm.next_mb_id = vm->sbm.first_mb_id;
2462 	} else {
2463 		/* BBM: At least one Linux memory block. */
2464 		vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size,
2465 					memory_block_size_bytes());
2466 
2467 		if (bbm_block_size) {
2468 			if (!is_power_of_2(bbm_block_size)) {
2469 				dev_warn(&vm->vdev->dev,
2470 					 "bbm_block_size is not a power of 2");
2471 			} else if (bbm_block_size < vm->bbm.bb_size) {
2472 				dev_warn(&vm->vdev->dev,
2473 					 "bbm_block_size is too small");
2474 			} else {
2475 				vm->bbm.bb_size = bbm_block_size;
2476 			}
2477 		}
2478 
2479 		/* Round up to the next aligned big block */
2480 		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
2481 		       vm->bbm.bb_size - 1;
2482 		vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr);
2483 		vm->bbm.next_bb_id = vm->bbm.first_bb_id;
2484 
2485 		/* Make sure we can add two big blocks. */
2486 		vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size,
2487 					      vm->offline_threshold);
2488 	}
2489 
2490 	dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
2491 		 memory_block_size_bytes());
2492 	if (vm->in_sbm)
2493 		dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
2494 			 (unsigned long long)vm->sbm.sb_size);
2495 	else
2496 		dev_info(&vm->vdev->dev, "big block size: 0x%llx",
2497 			 (unsigned long long)vm->bbm.bb_size);
2498 
2499 	/* create the parent resource for all memory */
2500 	rc = virtio_mem_create_resource(vm);
2501 	if (rc)
2502 		return rc;
2503 
2504 	/* use a single dynamic memory group to cover the whole memory device */
2505 	if (vm->in_sbm)
2506 		unit_pages = PHYS_PFN(memory_block_size_bytes());
2507 	else
2508 		unit_pages = PHYS_PFN(vm->bbm.bb_size);
2509 	rc = memory_group_register_dynamic(vm->nid, unit_pages);
2510 	if (rc < 0)
2511 		goto out_del_resource;
2512 	vm->mgid = rc;
2513 
2514 	/*
2515 	 * If we still have memory plugged, we have to unplug all memory first.
2516 	 * Registering our parent resource makes sure that this memory isn't
2517 	 * actually in use (e.g., trying to reload the driver).
2518 	 */
2519 	if (vm->plugged_size) {
2520 		vm->unplug_all_required = true;
2521 		dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
2522 	}
2523 
2524 	/* register callbacks */
2525 	vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
2526 	rc = register_memory_notifier(&vm->memory_notifier);
2527 	if (rc)
2528 		goto out_unreg_group;
2529 	rc = register_virtio_mem_device(vm);
2530 	if (rc)
2531 		goto out_unreg_mem;
2532 
2533 	return 0;
2534 out_unreg_mem:
2535 	unregister_memory_notifier(&vm->memory_notifier);
2536 out_unreg_group:
2537 	memory_group_unregister(vm->mgid);
2538 out_del_resource:
2539 	virtio_mem_delete_resource(vm);
2540 	return rc;
2541 }
2542 
2543 #ifdef CONFIG_PROC_VMCORE
2544 static int virtio_mem_send_state_request(struct virtio_mem *vm, uint64_t addr,
2545 					 uint64_t size)
2546 {
2547 	const uint64_t nb_vm_blocks = size / vm->device_block_size;
2548 	const struct virtio_mem_req req = {
2549 		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_STATE),
2550 		.u.state.addr = cpu_to_virtio64(vm->vdev, addr),
2551 		.u.state.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
2552 	};
2553 	int rc = -ENOMEM;
2554 
2555 	dev_dbg(&vm->vdev->dev, "requesting state: 0x%llx - 0x%llx\n", addr,
2556 		addr + size - 1);
2557 
2558 	switch (virtio_mem_send_request(vm, &req)) {
2559 	case VIRTIO_MEM_RESP_ACK:
2560 		return virtio16_to_cpu(vm->vdev, vm->resp.u.state.state);
2561 	case VIRTIO_MEM_RESP_ERROR:
2562 		rc = -EINVAL;
2563 		break;
2564 	default:
2565 		break;
2566 	}
2567 
2568 	dev_dbg(&vm->vdev->dev, "requesting state failed: %d\n", rc);
2569 	return rc;
2570 }
2571 
2572 static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb,
2573 					 unsigned long pfn)
2574 {
2575 	struct virtio_mem *vm = container_of(cb, struct virtio_mem,
2576 					     vmcore_cb);
2577 	uint64_t addr = PFN_PHYS(pfn);
2578 	bool is_ram;
2579 	int rc;
2580 
2581 	if (!virtio_mem_contains_range(vm, addr, PAGE_SIZE))
2582 		return true;
2583 	if (!vm->plugged_size)
2584 		return false;
2585 
2586 	/*
2587 	 * We have to serialize device requests and access to the information
2588 	 * about the block queried last.
2589 	 */
2590 	mutex_lock(&vm->hotplug_mutex);
2591 
2592 	addr = ALIGN_DOWN(addr, vm->device_block_size);
2593 	if (addr != vm->last_block_addr) {
2594 		rc = virtio_mem_send_state_request(vm, addr,
2595 						   vm->device_block_size);
2596 		/* On any kind of error, we're going to signal !ram. */
2597 		if (rc == VIRTIO_MEM_STATE_PLUGGED)
2598 			vm->last_block_plugged = true;
2599 		else
2600 			vm->last_block_plugged = false;
2601 		vm->last_block_addr = addr;
2602 	}
2603 
2604 	is_ram = vm->last_block_plugged;
2605 	mutex_unlock(&vm->hotplug_mutex);
2606 	return is_ram;
2607 }
2608 #endif /* CONFIG_PROC_VMCORE */
2609 
2610 static int virtio_mem_init_kdump(struct virtio_mem *vm)
2611 {
2612 #ifdef CONFIG_PROC_VMCORE
2613 	dev_info(&vm->vdev->dev, "memory hot(un)plug disabled in kdump kernel\n");
2614 	vm->vmcore_cb.pfn_is_ram = virtio_mem_vmcore_pfn_is_ram;
2615 	register_vmcore_cb(&vm->vmcore_cb);
2616 	return 0;
2617 #else /* CONFIG_PROC_VMCORE */
2618 	dev_warn(&vm->vdev->dev, "disabled in kdump kernel without vmcore\n");
2619 	return -EBUSY;
2620 #endif /* CONFIG_PROC_VMCORE */
2621 }
2622 
2623 static int virtio_mem_init(struct virtio_mem *vm)
2624 {
2625 	uint16_t node_id;
2626 
2627 	if (!vm->vdev->config->get) {
2628 		dev_err(&vm->vdev->dev, "config access disabled\n");
2629 		return -EINVAL;
2630 	}
2631 
2632 	/* Fetch all properties that can't change. */
2633 	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
2634 			&vm->plugged_size);
2635 	virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
2636 			&vm->device_block_size);
2637 	virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
2638 			&node_id);
2639 	vm->nid = virtio_mem_translate_node_id(vm, node_id);
2640 	virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
2641 	virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
2642 			&vm->region_size);
2643 
2644 	/* Determine the nid for the device based on the lowest address. */
2645 	if (vm->nid == NUMA_NO_NODE)
2646 		vm->nid = memory_add_physaddr_to_nid(vm->addr);
2647 
2648 	dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
2649 	dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
2650 	dev_info(&vm->vdev->dev, "device block size: 0x%llx",
2651 		 (unsigned long long)vm->device_block_size);
2652 	if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
2653 		dev_info(&vm->vdev->dev, "nid: %d", vm->nid);
2654 
2655 	/*
2656 	 * We don't want to (un)plug or reuse any memory when in kdump. The
2657 	 * memory is still accessible (but not exposed to Linux).
2658 	 */
2659 	if (vm->in_kdump)
2660 		return virtio_mem_init_kdump(vm);
2661 	return virtio_mem_init_hotplug(vm);
2662 }
2663 
2664 static int virtio_mem_create_resource(struct virtio_mem *vm)
2665 {
2666 	/*
2667 	 * When force-unloading the driver and removing the device, we
2668 	 * could have a garbage pointer. Duplicate the string.
2669 	 */
2670 	const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL);
2671 
2672 	if (!name)
2673 		return -ENOMEM;
2674 
2675 	/* Disallow mapping device memory via /dev/mem completely. */
2676 	vm->parent_resource = __request_mem_region(vm->addr, vm->region_size,
2677 						   name, IORESOURCE_SYSTEM_RAM |
2678 						   IORESOURCE_EXCLUSIVE);
2679 	if (!vm->parent_resource) {
2680 		kfree(name);
2681 		dev_warn(&vm->vdev->dev, "could not reserve device region\n");
2682 		dev_info(&vm->vdev->dev,
2683 			 "reloading the driver is not supported\n");
2684 		return -EBUSY;
2685 	}
2686 
2687 	/* The memory is not actually busy - make add_memory() work. */
2688 	vm->parent_resource->flags &= ~IORESOURCE_BUSY;
2689 	return 0;
2690 }
2691 
2692 static void virtio_mem_delete_resource(struct virtio_mem *vm)
2693 {
2694 	const char *name;
2695 
2696 	if (!vm->parent_resource)
2697 		return;
2698 
2699 	name = vm->parent_resource->name;
2700 	release_resource(vm->parent_resource);
2701 	kfree(vm->parent_resource);
2702 	kfree(name);
2703 	vm->parent_resource = NULL;
2704 }
2705 
2706 static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
2707 {
2708 	return 1;
2709 }
2710 
2711 static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
2712 {
2713 	const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
2714 
2715 	return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr,
2716 				   vm->addr + vm->region_size, NULL,
2717 				   virtio_mem_range_has_system_ram) == 1;
2718 }
2719 
2720 static int virtio_mem_probe(struct virtio_device *vdev)
2721 {
2722 	struct virtio_mem *vm;
2723 	int rc;
2724 
2725 	BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
2726 	BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10);
2727 
2728 	vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL);
2729 	if (!vm)
2730 		return -ENOMEM;
2731 
2732 	init_waitqueue_head(&vm->host_resp);
2733 	vm->vdev = vdev;
2734 	INIT_WORK(&vm->wq, virtio_mem_run_wq);
2735 	mutex_init(&vm->hotplug_mutex);
2736 	INIT_LIST_HEAD(&vm->next);
2737 	spin_lock_init(&vm->removal_lock);
2738 	hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2739 	vm->retry_timer.function = virtio_mem_timer_expired;
2740 	vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
2741 	vm->in_kdump = is_kdump_kernel();
2742 
2743 	/* register the virtqueue */
2744 	rc = virtio_mem_init_vq(vm);
2745 	if (rc)
2746 		goto out_free_vm;
2747 
2748 	/* initialize the device by querying the config */
2749 	rc = virtio_mem_init(vm);
2750 	if (rc)
2751 		goto out_del_vq;
2752 
2753 	virtio_device_ready(vdev);
2754 
2755 	/* trigger a config update to start processing the requested_size */
2756 	if (!vm->in_kdump) {
2757 		atomic_set(&vm->config_changed, 1);
2758 		queue_work(system_freezable_wq, &vm->wq);
2759 	}
2760 
2761 	return 0;
2762 out_del_vq:
2763 	vdev->config->del_vqs(vdev);
2764 out_free_vm:
2765 	kfree(vm);
2766 	vdev->priv = NULL;
2767 
2768 	return rc;
2769 }
2770 
2771 static void virtio_mem_deinit_hotplug(struct virtio_mem *vm)
2772 {
2773 	unsigned long mb_id;
2774 	int rc;
2775 
2776 	/*
2777 	 * Make sure the workqueue won't be triggered anymore and no memory
2778 	 * blocks can be onlined/offlined until we're finished here.
2779 	 */
2780 	mutex_lock(&vm->hotplug_mutex);
2781 	spin_lock_irq(&vm->removal_lock);
2782 	vm->removing = true;
2783 	spin_unlock_irq(&vm->removal_lock);
2784 	mutex_unlock(&vm->hotplug_mutex);
2785 
2786 	/* wait until the workqueue stopped */
2787 	cancel_work_sync(&vm->wq);
2788 	hrtimer_cancel(&vm->retry_timer);
2789 
2790 	if (vm->in_sbm) {
2791 		/*
2792 		 * After we unregistered our callbacks, user space can online
2793 		 * partially plugged offline blocks. Make sure to remove them.
2794 		 */
2795 		virtio_mem_sbm_for_each_mb(vm, mb_id,
2796 					   VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
2797 			rc = virtio_mem_sbm_remove_mb(vm, mb_id);
2798 			BUG_ON(rc);
2799 			virtio_mem_sbm_set_mb_state(vm, mb_id,
2800 						    VIRTIO_MEM_SBM_MB_UNUSED);
2801 		}
2802 		/*
2803 		 * After we unregistered our callbacks, user space can no longer
2804 		 * offline partially plugged online memory blocks. No need to
2805 		 * worry about them.
2806 		 */
2807 	}
2808 
2809 	/* unregister callbacks */
2810 	unregister_virtio_mem_device(vm);
2811 	unregister_memory_notifier(&vm->memory_notifier);
2812 
2813 	/*
2814 	 * There is no way we could reliably remove all memory we have added to
2815 	 * the system. And there is no way to stop the driver/device from going
2816 	 * away. Warn at least.
2817 	 */
2818 	if (virtio_mem_has_memory_added(vm)) {
2819 		dev_warn(&vm->vdev->dev,
2820 			 "device still has system memory added\n");
2821 	} else {
2822 		virtio_mem_delete_resource(vm);
2823 		kfree_const(vm->resource_name);
2824 		memory_group_unregister(vm->mgid);
2825 	}
2826 
2827 	/* remove all tracking data - no locking needed */
2828 	if (vm->in_sbm) {
2829 		vfree(vm->sbm.mb_states);
2830 		vfree(vm->sbm.sb_states);
2831 	} else {
2832 		vfree(vm->bbm.bb_states);
2833 	}
2834 }
2835 
2836 static void virtio_mem_deinit_kdump(struct virtio_mem *vm)
2837 {
2838 #ifdef CONFIG_PROC_VMCORE
2839 	unregister_vmcore_cb(&vm->vmcore_cb);
2840 #endif /* CONFIG_PROC_VMCORE */
2841 }
2842 
2843 static void virtio_mem_remove(struct virtio_device *vdev)
2844 {
2845 	struct virtio_mem *vm = vdev->priv;
2846 
2847 	if (vm->in_kdump)
2848 		virtio_mem_deinit_kdump(vm);
2849 	else
2850 		virtio_mem_deinit_hotplug(vm);
2851 
2852 	/* reset the device and cleanup the queues */
2853 	vdev->config->reset(vdev);
2854 	vdev->config->del_vqs(vdev);
2855 
2856 	kfree(vm);
2857 	vdev->priv = NULL;
2858 }
2859 
2860 static void virtio_mem_config_changed(struct virtio_device *vdev)
2861 {
2862 	struct virtio_mem *vm = vdev->priv;
2863 
2864 	if (unlikely(vm->in_kdump))
2865 		return;
2866 
2867 	atomic_set(&vm->config_changed, 1);
2868 	virtio_mem_retry(vm);
2869 }
2870 
2871 #ifdef CONFIG_PM_SLEEP
2872 static int virtio_mem_freeze(struct virtio_device *vdev)
2873 {
2874 	/*
2875 	 * When restarting the VM, all memory is usually unplugged. Don't
2876 	 * allow to suspend/hibernate.
2877 	 */
2878 	dev_err(&vdev->dev, "save/restore not supported.\n");
2879 	return -EPERM;
2880 }
2881 
2882 static int virtio_mem_restore(struct virtio_device *vdev)
2883 {
2884 	return -EPERM;
2885 }
2886 #endif
2887 
2888 static unsigned int virtio_mem_features[] = {
2889 #if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA)
2890 	VIRTIO_MEM_F_ACPI_PXM,
2891 #endif
2892 	VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE,
2893 };
2894 
2895 static const struct virtio_device_id virtio_mem_id_table[] = {
2896 	{ VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID },
2897 	{ 0 },
2898 };
2899 
2900 static struct virtio_driver virtio_mem_driver = {
2901 	.feature_table = virtio_mem_features,
2902 	.feature_table_size = ARRAY_SIZE(virtio_mem_features),
2903 	.driver.name = KBUILD_MODNAME,
2904 	.driver.owner = THIS_MODULE,
2905 	.id_table = virtio_mem_id_table,
2906 	.probe = virtio_mem_probe,
2907 	.remove = virtio_mem_remove,
2908 	.config_changed = virtio_mem_config_changed,
2909 #ifdef CONFIG_PM_SLEEP
2910 	.freeze	=	virtio_mem_freeze,
2911 	.restore =	virtio_mem_restore,
2912 #endif
2913 };
2914 
2915 module_virtio_driver(virtio_mem_driver);
2916 MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table);
2917 MODULE_AUTHOR("David Hildenbrand <david@redhat.com>");
2918 MODULE_DESCRIPTION("Virtio-mem driver");
2919 MODULE_LICENSE("GPL");
2920