// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Virtio-mem device driver.
 *
 * Copyright Red Hat, Inc. 2020
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */

#include <linux/virtio.h>
#include <linux/virtio_mem.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
#include <linux/memory.h>
#include <linux/hrtimer.h>
#include <linux/crash_dump.h>
#include <linux/mutex.h>
#include <linux/bitmap.h>
#include <linux/lockdep.h>

#include <acpi/acpi_numa.h>

static bool unplug_online = true;
module_param(unplug_online, bool, 0644);
MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");

enum virtio_mem_mb_state {
	/* Unplugged, not added to Linux. Can be reused later. */
	VIRTIO_MEM_MB_STATE_UNUSED = 0,
	/* (Partially) plugged, not added to Linux. Error on add_memory(). */
	VIRTIO_MEM_MB_STATE_PLUGGED,
	/* Fully plugged, fully added to Linux, offline. */
	VIRTIO_MEM_MB_STATE_OFFLINE,
	/* Partially plugged, fully added to Linux, offline. */
	VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL,
	/* Fully plugged, fully added to Linux, online (!ZONE_MOVABLE). */
	VIRTIO_MEM_MB_STATE_ONLINE,
	/* Partially plugged, fully added to Linux, online (!ZONE_MOVABLE). */
	VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL,
	/*
	 * Fully plugged, fully added to Linux, online (ZONE_MOVABLE).
	 * We are not allowed to allocate (unplug) parts of this block that
	 * are not movable (similar to gigantic pages). We will never allow
	 * onlining OFFLINE_PARTIAL blocks to ZONE_MOVABLE (as they would
	 * contain unmovable parts).
	 */
	VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE,
	VIRTIO_MEM_MB_STATE_COUNT
};
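
/*
 * Illustrative (not exhaustive) sketch of the state transitions:
 *
 *	UNUSED -> OFFLINE / OFFLINE_PARTIAL	(plugged and added to Linux)
 *	UNUSED -> PLUGGED			(add_memory() failed)
 *	PLUGGED -> UNUSED			(leftovers unplugged again)
 *	OFFLINE* <-> ONLINE*			(memory onlined/offlined)
 *	OFFLINE / OFFLINE_PARTIAL -> UNUSED	(fully unplugged and removed)
 */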

struct virtio_mem {
	struct virtio_device *vdev;

	/* We might first have to unplug all memory when starting up. */
	bool unplug_all_required;

	/* Workqueue that processes the plug/unplug requests. */
	struct work_struct wq;
	atomic_t config_changed;

	/* Virtqueue for guest->host requests. */
	struct virtqueue *vq;

	/* Wait for a host response to a guest request. */
	wait_queue_head_t host_resp;

	/* Space for one guest request and the host response. */
	struct virtio_mem_req req;
	struct virtio_mem_resp resp;

	/* The current size of the device. */
	uint64_t plugged_size;
	/* The requested size of the device. */
	uint64_t requested_size;

	/* The device block size (for communicating with the device). */
	uint64_t device_block_size;
	/* The translated node id. NUMA_NO_NODE in case not specified. */
	int nid;
	/* Physical start address of the memory region. */
	uint64_t addr;
	/* Maximum region size in bytes. */
	uint64_t region_size;

	/* The subblock size. */
	uint64_t subblock_size;
	/* The number of subblocks per memory block. */
	uint32_t nb_sb_per_mb;

	/* Id of the first memory block of this device. */
	unsigned long first_mb_id;
	/* Id of the last memory block of this device. */
	unsigned long last_mb_id;
	/* Id of the last usable memory block of this device. */
	unsigned long last_usable_mb_id;
	/* Id of the next memory block to prepare when needed. */
	unsigned long next_mb_id;

	/* The parent resource for all memory added via this device. */
	struct resource *parent_resource;

	/* Summary of all memory block states. */
	unsigned long nb_mb_state[VIRTIO_MEM_MB_STATE_COUNT];
#define VIRTIO_MEM_NB_OFFLINE_THRESHOLD		10

	/*
	 * One byte state per memory block.
	 *
	 * Allocated via vmalloc(). Resized (alloc+copy+free) when preparing
	 * the next memory block would cross a page boundary.
	 *
	 * With 128MB memory blocks, we have states for 512GB of memory in one
	 * page.
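	 * (Assuming 4 KiB pages: one page holds 4096 one-byte states, and
	 * 4096 * 128MB = 512GB.)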
	 */
	uint8_t *mb_state;

	/*
	 * One bit per subblock, i.e., $nb_sb_per_mb bits per memory block.
	 * Handled similarly to mb_state.
	 *
	 * With 4MB subblocks, we manage 128GB of memory in one page.
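	 * (Assuming 4 KiB pages: one page holds 32768 bits, and
	 * 32768 * 4MB = 128GB.)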
	 */
	unsigned long *sb_bitmap;

	/*
	 * Mutex that protects the nb_mb_state, mb_state, and sb_bitmap.
	 *
	 * When this lock is held the pointers can't change, ONLINE and
	 * OFFLINE blocks can't change the state and no subblocks will get
	 * plugged/unplugged.
	 */
	struct mutex hotplug_mutex;
	bool hotplug_active;

	/* An error occurred that we cannot handle - stop processing requests. */
	bool broken;

	/* The driver is being removed. */
	spinlock_t removal_lock;
	bool removing;

	/* Timer for retrying to plug/unplug memory. */
	struct hrtimer retry_timer;
	unsigned int retry_timer_ms;
#define VIRTIO_MEM_RETRY_TIMER_MIN_MS		50000
#define VIRTIO_MEM_RETRY_TIMER_MAX_MS		300000

	/* Memory notifier (online/offline events). */
	struct notifier_block memory_notifier;

	/* Next device in the list of virtio-mem devices. */
	struct list_head next;
};

/*
 * We have to share a single online_page callback among all virtio-mem
 * devices. We use RCU to iterate the list in the callback.
 */
static DEFINE_MUTEX(virtio_mem_mutex);
static LIST_HEAD(virtio_mem_devices);

static void virtio_mem_online_page_cb(struct page *page, unsigned int order);

/*
 * Register a virtio-mem device so it will be considered for the online_page
 * callback.
 */
static int register_virtio_mem_device(struct virtio_mem *vm)
{
	int rc = 0;

	/* First device registers the callback. */
	mutex_lock(&virtio_mem_mutex);
	if (list_empty(&virtio_mem_devices))
		rc = set_online_page_callback(&virtio_mem_online_page_cb);
	if (!rc)
		list_add_rcu(&vm->next, &virtio_mem_devices);
	mutex_unlock(&virtio_mem_mutex);

	return rc;
}

/*
 * Unregister a virtio-mem device so it will no longer be considered for the
 * online_page callback.
 */
static void unregister_virtio_mem_device(struct virtio_mem *vm)
{
	/* Last device unregisters the callback. */
	mutex_lock(&virtio_mem_mutex);
	list_del_rcu(&vm->next);
	if (list_empty(&virtio_mem_devices))
		restore_online_page_callback(&virtio_mem_online_page_cb);
	mutex_unlock(&virtio_mem_mutex);

	synchronize_rcu();
}

/*
 * Calculate the memory block id of a given address.
 */
static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr)
{
	return addr / memory_block_size_bytes();
}

/*
 * Calculate the physical start address of a given memory block id.
 */
static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
{
	return mb_id * memory_block_size_bytes();
}

/*
 * Calculate the subblock id of a given address.
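 *
 * Example (illustrative, assuming 4MB subblocks): an address 12MB into its
 * memory block yields sb_id = 12MB / 4MB = 3.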
 */
static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
					      unsigned long addr)
{
	const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
	const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id);

	return (addr - mb_addr) / vm->subblock_size;
}

/*
 * Set the state of a memory block, taking care of the state counter.
 */
static void virtio_mem_mb_set_state(struct virtio_mem *vm, unsigned long mb_id,
				    enum virtio_mem_mb_state state)
{
	const unsigned long idx = mb_id - vm->first_mb_id;
	enum virtio_mem_mb_state old_state;

	old_state = vm->mb_state[idx];
	vm->mb_state[idx] = state;

	BUG_ON(vm->nb_mb_state[old_state] == 0);
	vm->nb_mb_state[old_state]--;
	vm->nb_mb_state[state]++;
}

/*
 * Get the state of a memory block.
 */
static enum virtio_mem_mb_state virtio_mem_mb_get_state(struct virtio_mem *vm,
							unsigned long mb_id)
{
	const unsigned long idx = mb_id - vm->first_mb_id;

	return vm->mb_state[idx];
}

/*
 * Prepare the state array for the next memory block.
 */
static int virtio_mem_mb_state_prepare_next_mb(struct virtio_mem *vm)
{
	unsigned long old_bytes = vm->next_mb_id - vm->first_mb_id + 1;
	unsigned long new_bytes = vm->next_mb_id - vm->first_mb_id + 2;
	int old_pages = PFN_UP(old_bytes);
	int new_pages = PFN_UP(new_bytes);
	uint8_t *new_mb_state;

	if (vm->mb_state && old_pages == new_pages)
		return 0;

	new_mb_state = vzalloc(new_pages * PAGE_SIZE);
	if (!new_mb_state)
		return -ENOMEM;

	mutex_lock(&vm->hotplug_mutex);
	if (vm->mb_state)
		memcpy(new_mb_state, vm->mb_state, old_pages * PAGE_SIZE);
	vfree(vm->mb_state);
	vm->mb_state = new_mb_state;
	mutex_unlock(&vm->hotplug_mutex);

	return 0;
}

#define virtio_mem_for_each_mb_state(_vm, _mb_id, _state) \
	for (_mb_id = _vm->first_mb_id; \
	     _mb_id < _vm->next_mb_id && _vm->nb_mb_state[_state]; \
	     _mb_id++) \
		if (virtio_mem_mb_get_state(_vm, _mb_id) == _state)

#define virtio_mem_for_each_mb_state_rev(_vm, _mb_id, _state) \
	for (_mb_id = _vm->next_mb_id - 1; \
	     _mb_id >= _vm->first_mb_id && _vm->nb_mb_state[_state]; \
	     _mb_id--) \
		if (virtio_mem_mb_get_state(_vm, _mb_id) == _state)
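
/*
 * Note: both iterators expand to a for loop whose body is an if statement,
 * so the usual macro caveats apply; the _vm->nb_mb_state[_state] check
 * terminates the walk early once no more blocks in the requested state
 * exist. Illustrative usage:
 *
 *	virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED) {
 *		...
 *	}
 */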

/*
 * Mark all selected subblocks plugged.
 *
 * Will not modify the state of the memory block.
 */
static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm,
					 unsigned long mb_id, int sb_id,
					 int count)
{
	const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;

	__bitmap_set(vm->sb_bitmap, bit, count);
}

/*
 * Mark all selected subblocks unplugged.
 *
 * Will not modify the state of the memory block.
 */
static void virtio_mem_mb_set_sb_unplugged(struct virtio_mem *vm,
					   unsigned long mb_id, int sb_id,
					   int count)
{
	const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;

	__bitmap_clear(vm->sb_bitmap, bit, count);
}

/*
 * Test if all selected subblocks are plugged.
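 *
 * Example (illustrative): for a range starting at bit 8 with count 4, all
 * subblocks are plugged iff there is no zero bit in [8, 12), i.e., iff
 * find_next_zero_bit() returns an index >= 12.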
 */
static bool virtio_mem_mb_test_sb_plugged(struct virtio_mem *vm,
					  unsigned long mb_id, int sb_id,
					  int count)
{
	const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;

	if (count == 1)
		return test_bit(bit, vm->sb_bitmap);

	/* TODO: Helper similar to bitmap_set() */
	return find_next_zero_bit(vm->sb_bitmap, bit + count, bit) >=
	       bit + count;
}

/*
 * Test if all selected subblocks are unplugged.
 */
static bool virtio_mem_mb_test_sb_unplugged(struct virtio_mem *vm,
					    unsigned long mb_id, int sb_id,
					    int count)
{
	const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;

	/* TODO: Helper similar to bitmap_set() */
	return find_next_bit(vm->sb_bitmap, bit + count, bit) >= bit + count;
}

/*
 * Find the first unplugged subblock. Returns vm->nb_sb_per_mb in case there is
 * none.
 */
static int virtio_mem_mb_first_unplugged_sb(struct virtio_mem *vm,
					    unsigned long mb_id)
{
	const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb;

	return find_next_zero_bit(vm->sb_bitmap, bit + vm->nb_sb_per_mb, bit) -
	       bit;
}

/*
 * Prepare the subblock bitmap for the next memory block.
 */
static int virtio_mem_sb_bitmap_prepare_next_mb(struct virtio_mem *vm)
{
	const unsigned long old_nb_mb = vm->next_mb_id - vm->first_mb_id;
	const unsigned long old_nb_bits = old_nb_mb * vm->nb_sb_per_mb;
	const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->nb_sb_per_mb;
	int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long));
	int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long));
	unsigned long *new_sb_bitmap, *old_sb_bitmap;

	if (vm->sb_bitmap && old_pages == new_pages)
		return 0;

	new_sb_bitmap = vzalloc(new_pages * PAGE_SIZE);
	if (!new_sb_bitmap)
		return -ENOMEM;

	mutex_lock(&vm->hotplug_mutex);
	if (vm->sb_bitmap)
		memcpy(new_sb_bitmap, vm->sb_bitmap, old_pages * PAGE_SIZE);

	old_sb_bitmap = vm->sb_bitmap;
	vm->sb_bitmap = new_sb_bitmap;
	mutex_unlock(&vm->hotplug_mutex);

	vfree(old_sb_bitmap);
	return 0;
}

/*
 * Try to add a memory block to Linux. This will usually only fail
 * if out of memory.
 *
 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 * onlining code).
 *
 * Will not modify the state of the memory block.
 */
static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
	int nid = vm->nid;

	if (nid == NUMA_NO_NODE)
		nid = memory_add_physaddr_to_nid(addr);

	dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id);
	return add_memory(nid, addr, memory_block_size_bytes());
}

/*
 * Try to remove a memory block from Linux. Will only fail if the memory block
 * is not offline.
 *
 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 * onlining code).
 *
 * Will not modify the state of the memory block.
 */
static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
	int nid = vm->nid;

	if (nid == NUMA_NO_NODE)
		nid = memory_add_physaddr_to_nid(addr);

	dev_dbg(&vm->vdev->dev, "removing memory block: %lu\n", mb_id);
	return remove_memory(nid, addr, memory_block_size_bytes());
}

/*
 * Try to offline and remove a memory block from Linux.
 *
 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 * onlining code).
 *
 * Will not modify the state of the memory block.
 */
static int virtio_mem_mb_offline_and_remove(struct virtio_mem *vm,
					    unsigned long mb_id)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
	int nid = vm->nid;

	if (nid == NUMA_NO_NODE)
		nid = memory_add_physaddr_to_nid(addr);

	dev_dbg(&vm->vdev->dev, "offlining and removing memory block: %lu\n",
		mb_id);
	return offline_and_remove_memory(nid, addr, memory_block_size_bytes());
}

/*
 * Trigger the workqueue so the device can perform its magic.
 */
static void virtio_mem_retry(struct virtio_mem *vm)
{
	unsigned long flags;

	spin_lock_irqsave(&vm->removal_lock, flags);
	if (!vm->removing)
		queue_work(system_freezable_wq, &vm->wq);
	spin_unlock_irqrestore(&vm->removal_lock, flags);
}

static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id)
{
	int node = NUMA_NO_NODE;

#if defined(CONFIG_ACPI_NUMA)
	if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM))
		node = pxm_to_node(node_id);
#endif
	return node;
}

/*
 * Test if a virtio-mem device overlaps with the given range. Can be called
 * from (notifier) callbacks lockless.
 */
static bool virtio_mem_overlaps_range(struct virtio_mem *vm,
				      unsigned long start, unsigned long size)
{
	unsigned long dev_start = virtio_mem_mb_id_to_phys(vm->first_mb_id);
	unsigned long dev_end = virtio_mem_mb_id_to_phys(vm->last_mb_id) +
				memory_block_size_bytes();

	return start < dev_end && dev_start < start + size;
}

/*
 * Test if a virtio-mem device owns a memory block. Can be called from
 * (notifier) callbacks lockless.
 */
static bool virtio_mem_owned_mb(struct virtio_mem *vm, unsigned long mb_id)
{
	return mb_id >= vm->first_mb_id && mb_id <= vm->last_mb_id;
}

static int virtio_mem_notify_going_online(struct virtio_mem *vm,
					  unsigned long mb_id,
					  enum zone_type zone)
{
	switch (virtio_mem_mb_get_state(vm, mb_id)) {
	case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL:
		/*
		 * We won't allow to online a partially plugged memory block
		 * to the MOVABLE zone - it would contain unmovable parts.
		 */
		if (zone == ZONE_MOVABLE) {
			dev_warn_ratelimited(&vm->vdev->dev,
					     "memory block has holes, MOVABLE not supported\n");
			return NOTIFY_BAD;
		}
		return NOTIFY_OK;
	case VIRTIO_MEM_MB_STATE_OFFLINE:
		return NOTIFY_OK;
	default:
		break;
	}
	dev_warn_ratelimited(&vm->vdev->dev,
			     "memory block onlining denied\n");
	return NOTIFY_BAD;
}

static void virtio_mem_notify_offline(struct virtio_mem *vm,
				      unsigned long mb_id)
{
	switch (virtio_mem_mb_get_state(vm, mb_id)) {
	case VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL:
		virtio_mem_mb_set_state(vm, mb_id,
					VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL);
		break;
	case VIRTIO_MEM_MB_STATE_ONLINE:
	case VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE:
		virtio_mem_mb_set_state(vm, mb_id,
					VIRTIO_MEM_MB_STATE_OFFLINE);
		break;
	default:
		BUG();
		break;
	}

	/*
	 * Trigger the workqueue, maybe we can now unplug memory. Also,
	 * when we offline and remove a memory block, this will re-trigger
	 * us immediately - which is often nice because the removal of
	 * the memory block (e.g., memmap) might have freed up memory
	 * on other memory blocks we manage.
	 */
	virtio_mem_retry(vm);
}

static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id,
				     enum zone_type zone)
{
	unsigned long nb_offline;

	switch (virtio_mem_mb_get_state(vm, mb_id)) {
	case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL:
		BUG_ON(zone == ZONE_MOVABLE);
		virtio_mem_mb_set_state(vm, mb_id,
					VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL);
		break;
	case VIRTIO_MEM_MB_STATE_OFFLINE:
		if (zone == ZONE_MOVABLE)
			virtio_mem_mb_set_state(vm, mb_id,
					    VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE);
		else
			virtio_mem_mb_set_state(vm, mb_id,
						VIRTIO_MEM_MB_STATE_ONLINE);
		break;
	default:
		BUG();
		break;
	}
	nb_offline = vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] +
		     vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL];

	/* see if we can add new blocks now that we onlined one block */
	if (nb_offline == VIRTIO_MEM_NB_OFFLINE_THRESHOLD - 1)
		virtio_mem_retry(vm);
}

static void virtio_mem_notify_going_offline(struct virtio_mem *vm,
					    unsigned long mb_id)
{
	const unsigned long nr_pages = PFN_DOWN(vm->subblock_size);
	struct page *page;
	unsigned long pfn;
	int sb_id, i;

	for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) {
		if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
			continue;
		/*
		 * Drop our reference to the pages so the memory can get
		 * offlined and add the unplugged pages to the managed
		 * page counters (so offlining code can correctly subtract
		 * them again).
		 */
		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
			       sb_id * vm->subblock_size);
		adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
		for (i = 0; i < nr_pages; i++) {
			page = pfn_to_page(pfn + i);
			if (WARN_ON(!page_ref_dec_and_test(page)))
				dump_page(page, "unplugged page referenced");
		}
	}
}

static void virtio_mem_notify_cancel_offline(struct virtio_mem *vm,
					     unsigned long mb_id)
{
	const unsigned long nr_pages = PFN_DOWN(vm->subblock_size);
	unsigned long pfn;
	int sb_id, i;

	for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) {
		if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
			continue;
		/*
		 * Get the reference we dropped when going offline and
		 * subtract the unplugged pages from the managed page
		 * counters.
		 */
		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
			       sb_id * vm->subblock_size);
		adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
		for (i = 0; i < nr_pages; i++)
			page_ref_inc(pfn_to_page(pfn + i));
	}
}

/*
 * This callback will either be called synchronously from add_memory() or
 * asynchronously (e.g., triggered via user space). We have to be careful
 * with locking when calling add_memory().
 */
static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
					 unsigned long action, void *arg)
{
	struct virtio_mem *vm = container_of(nb, struct virtio_mem,
					     memory_notifier);
	struct memory_notify *mhp = arg;
	const unsigned long start = PFN_PHYS(mhp->start_pfn);
	const unsigned long size = PFN_PHYS(mhp->nr_pages);
	const unsigned long mb_id = virtio_mem_phys_to_mb_id(start);
	enum zone_type zone;
	int rc = NOTIFY_OK;

	if (!virtio_mem_overlaps_range(vm, start, size))
		return NOTIFY_DONE;

	/*
	 * Memory is onlined/offlined in memory block granularity. We cannot
	 * cross virtio-mem device boundaries and memory block boundaries. Bail
	 * out if this ever changes.
	 */
	if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
			 !IS_ALIGNED(start, memory_block_size_bytes())))
		return NOTIFY_BAD;

	/*
	 * Avoid circular locking lockdep warnings. We lock the mutex
	 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The
	 * blocking_notifier_call_chain() has its own lock, which gets unlocked
	 * between both notifier calls and will bail out. False positive.
	 */
	lockdep_off();

	switch (action) {
	case MEM_GOING_OFFLINE:
		mutex_lock(&vm->hotplug_mutex);
		if (vm->removing) {
			rc = notifier_from_errno(-EBUSY);
			mutex_unlock(&vm->hotplug_mutex);
			break;
		}
		vm->hotplug_active = true;
		virtio_mem_notify_going_offline(vm, mb_id);
		break;
	case MEM_GOING_ONLINE:
		mutex_lock(&vm->hotplug_mutex);
		if (vm->removing) {
			rc = notifier_from_errno(-EBUSY);
			mutex_unlock(&vm->hotplug_mutex);
			break;
		}
		vm->hotplug_active = true;
		zone = page_zonenum(pfn_to_page(mhp->start_pfn));
		rc = virtio_mem_notify_going_online(vm, mb_id, zone);
		break;
	case MEM_OFFLINE:
		virtio_mem_notify_offline(vm, mb_id);
		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	case MEM_ONLINE:
		zone = page_zonenum(pfn_to_page(mhp->start_pfn));
		virtio_mem_notify_online(vm, mb_id, zone);
		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	case MEM_CANCEL_OFFLINE:
		if (!vm->hotplug_active)
			break;
		virtio_mem_notify_cancel_offline(vm, mb_id);
		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	case MEM_CANCEL_ONLINE:
		if (!vm->hotplug_active)
			break;
		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	default:
		break;
	}

	lockdep_on();

	return rc;
}

/*
 * Set a range of pages PG_offline. Remember pages that were never onlined
 * (via generic_online_page()) using PageDirty().
 */
static void virtio_mem_set_fake_offline(unsigned long pfn,
					unsigned int nr_pages, bool onlined)
{
	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		__SetPageOffline(page);
		if (!onlined) {
			SetPageDirty(page);
			/* FIXME: remove after cleanups */
			ClearPageReserved(page);
		}
	}
}

/*
 * Clear PG_offline from a range of pages. If the pages were never onlined
 * (via generic_online_page()), clear PageDirty().
 */
static void virtio_mem_clear_fake_offline(unsigned long pfn,
					  unsigned int nr_pages, bool onlined)
{
	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		__ClearPageOffline(page);
		if (!onlined)
			ClearPageDirty(page);
	}
}

/*
 * Release a range of fake-offline pages to the buddy, effectively
 * fake-onlining them.
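 *
 * Example (illustrative, x86-64 defaults): with MAX_ORDER = 11, pages are
 * released in chunks of 1 << 10 = 1024 pages (4MB), which matches the
 * minimum subblock size/alignment established in virtio_mem_init().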
 */
static void virtio_mem_fake_online(unsigned long pfn, unsigned int nr_pages)
{
	const int order = MAX_ORDER - 1;
	int i;

	/*
	 * We are always called with subblock granularity, which is at least
	 * aligned to MAX_ORDER - 1.
	 */
	for (i = 0; i < nr_pages; i += 1 << order) {
		struct page *page = pfn_to_page(pfn + i);

		/*
		 * If the page is PageDirty(), it was kept fake-offline when
		 * onlining the memory block. Otherwise, it was allocated
		 * using alloc_contig_range(). All pages in a subblock are
		 * alike.
		 */
		if (PageDirty(page)) {
			virtio_mem_clear_fake_offline(pfn + i, 1 << order,
						      false);
			generic_online_page(page, order);
		} else {
			virtio_mem_clear_fake_offline(pfn + i, 1 << order,
						      true);
			free_contig_range(pfn + i, 1 << order);
			adjust_managed_page_count(page, 1 << order);
		}
	}
}

static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
{
	const unsigned long addr = page_to_phys(page);
	const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
	struct virtio_mem *vm;
	int sb_id;

	/*
	 * We exploit here that subblocks have at least MAX_ORDER - 1
	 * size/alignment and that this callback is called with such a
	 * size/alignment. So we cannot cross subblocks and therefore
	 * also not memory blocks.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(vm, &virtio_mem_devices, next) {
		if (!virtio_mem_owned_mb(vm, mb_id))
			continue;

		sb_id = virtio_mem_phys_to_sb_id(vm, addr);
		/*
		 * If plugged, online the pages, otherwise, set them fake
		 * offline (PageOffline).
		 */
		if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
			generic_online_page(page, order);
		else
			virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order,
						    false);
		rcu_read_unlock();
		return;
	}
	rcu_read_unlock();

	/* not virtio-mem memory, but e.g., a DIMM. online it */
	generic_online_page(page, order);
}

static uint64_t virtio_mem_send_request(struct virtio_mem *vm,
					const struct virtio_mem_req *req)
{
	struct scatterlist *sgs[2], sg_req, sg_resp;
	unsigned int len;
	int rc;

	/* don't use the request residing on the stack (vaddr) */
	vm->req = *req;

	/* out: buffer for request */
	sg_init_one(&sg_req, &vm->req, sizeof(vm->req));
	sgs[0] = &sg_req;

	/* in: buffer for response */
	sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp));
	sgs[1] = &sg_resp;

	rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL);
	if (rc < 0)
		return rc;

	virtqueue_kick(vm->vq);

	/* wait for a response */
	wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len));

	return virtio16_to_cpu(vm->vdev, vm->resp.type);
}

static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
					uint64_t size)
{
	const uint64_t nb_vm_blocks = size / vm->device_block_size;
	const struct virtio_mem_req req = {
		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG),
		.u.plug.addr = cpu_to_virtio64(vm->vdev, addr),
		.u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
	};

	if (atomic_read(&vm->config_changed))
		return -EAGAIN;

	switch (virtio_mem_send_request(vm, &req)) {
	case VIRTIO_MEM_RESP_ACK:
		vm->plugged_size += size;
		return 0;
	case VIRTIO_MEM_RESP_NACK:
		return -EAGAIN;
	case VIRTIO_MEM_RESP_BUSY:
		return -ETXTBSY;
	case VIRTIO_MEM_RESP_ERROR:
		return -EINVAL;
	default:
		return -ENOMEM;
	}
}

static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
					  uint64_t size)
{
	const uint64_t nb_vm_blocks = size / vm->device_block_size;
	const struct virtio_mem_req req = {
		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG),
		.u.unplug.addr = cpu_to_virtio64(vm->vdev, addr),
		.u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
	};

	if (atomic_read(&vm->config_changed))
		return -EAGAIN;

	switch (virtio_mem_send_request(vm, &req)) {
	case VIRTIO_MEM_RESP_ACK:
		vm->plugged_size -= size;
		return 0;
	case VIRTIO_MEM_RESP_BUSY:
		return -ETXTBSY;
	case VIRTIO_MEM_RESP_ERROR:
		return -EINVAL;
	default:
		return -ENOMEM;
	}
}

static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
{
	const struct virtio_mem_req req = {
		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL),
	};

	switch (virtio_mem_send_request(vm, &req)) {
	case VIRTIO_MEM_RESP_ACK:
		vm->unplug_all_required = false;
		vm->plugged_size = 0;
		/* usable region might have shrunk */
		atomic_set(&vm->config_changed, 1);
		return 0;
	case VIRTIO_MEM_RESP_BUSY:
		return -ETXTBSY;
	default:
		return -ENOMEM;
	}
}

/*
 * Plug selected subblocks. Updates the plugged state, but not the state
 * of the memory block.
 */
static int virtio_mem_mb_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
				 int sb_id, int count)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
			      sb_id * vm->subblock_size;
	const uint64_t size = count * vm->subblock_size;
	int rc;

	dev_dbg(&vm->vdev->dev, "plugging memory block: %lu : %i - %i\n", mb_id,
		sb_id, sb_id + count - 1);

	rc = virtio_mem_send_plug_request(vm, addr, size);
	if (!rc)
		virtio_mem_mb_set_sb_plugged(vm, mb_id, sb_id, count);
	return rc;
}

/*
 * Unplug selected subblocks. Updates the plugged state, but not the state
 * of the memory block.
 */
static int virtio_mem_mb_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
				   int sb_id, int count)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
			      sb_id * vm->subblock_size;
	const uint64_t size = count * vm->subblock_size;
	int rc;

	dev_dbg(&vm->vdev->dev, "unplugging memory block: %lu : %i - %i\n",
		mb_id, sb_id, sb_id + count - 1);

	rc = virtio_mem_send_unplug_request(vm, addr, size);
	if (!rc)
		virtio_mem_mb_set_sb_unplugged(vm, mb_id, sb_id, count);
	return rc;
}

/*
 * Unplug the desired number of plugged subblocks of an offline or not-added
 * memory block. Will fail if any subblock cannot get unplugged (instead of
 * skipping it).
 *
 * Will not modify the state of the memory block.
 *
 * Note: can fail after some subblocks were unplugged.
 */
static int virtio_mem_mb_unplug_any_sb(struct virtio_mem *vm,
				       unsigned long mb_id, uint64_t *nb_sb)
{
	int sb_id, count;
	int rc;

	sb_id = vm->nb_sb_per_mb - 1;
	while (*nb_sb) {
		/* Find the next candidate subblock */
		while (sb_id >= 0 &&
		       virtio_mem_mb_test_sb_unplugged(vm, mb_id, sb_id, 1))
			sb_id--;
		if (sb_id < 0)
			break;
		/* Try to unplug multiple subblocks at a time */
		count = 1;
		while (count < *nb_sb && sb_id > 0 &&
		       virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
			count++;
			sb_id--;
		}

		rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count);
		if (rc)
			return rc;
		*nb_sb -= count;
		sb_id--;
	}

	return 0;
}

/*
 * Unplug all plugged subblocks of an offline or not-added memory block.
 *
 * Will not modify the state of the memory block.
 *
 * Note: can fail after some subblocks were unplugged.
 */
static int virtio_mem_mb_unplug(struct virtio_mem *vm, unsigned long mb_id)
{
	uint64_t nb_sb = vm->nb_sb_per_mb;

	return virtio_mem_mb_unplug_any_sb(vm, mb_id, &nb_sb);
}

/*
 * Prepare tracking data for the next memory block.
 */
static int virtio_mem_prepare_next_mb(struct virtio_mem *vm,
				      unsigned long *mb_id)
{
	int rc;

	if (vm->next_mb_id > vm->last_usable_mb_id)
		return -ENOSPC;

	/* Resize the state array if required. */
	rc = virtio_mem_mb_state_prepare_next_mb(vm);
	if (rc)
		return rc;

	/* Resize the subblock bitmap if required. */
	rc = virtio_mem_sb_bitmap_prepare_next_mb(vm);
	if (rc)
		return rc;

	vm->nb_mb_state[VIRTIO_MEM_MB_STATE_UNUSED]++;
	*mb_id = vm->next_mb_id++;
	return 0;
}

/*
 * Don't add too many blocks that are not onlined yet to avoid running OOM.
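 *
 * Example (illustrative, assuming 128MB memory blocks): with a threshold of
 * 10 blocks, at most 10 * 128MB = 1280MB of added but not-yet-onlined
 * memory is tolerated before plugging more memory is deferred.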
 */
static bool virtio_mem_too_many_mb_offline(struct virtio_mem *vm)
{
	unsigned long nb_offline;

	nb_offline = vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] +
		     vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL];
	return nb_offline >= VIRTIO_MEM_NB_OFFLINE_THRESHOLD;
}

/*
 * Try to plug the desired number of subblocks and add the memory block
 * to Linux.
 *
 * Will modify the state of the memory block.
 */
static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm,
				      unsigned long mb_id,
				      uint64_t *nb_sb)
{
	const int count = min_t(int, *nb_sb, vm->nb_sb_per_mb);
	int rc, rc2;

	if (WARN_ON_ONCE(!count))
		return -EINVAL;

	/*
	 * Plug the requested number of subblocks before adding it to Linux,
	 * so that onlining will directly online all plugged subblocks.
	 */
	rc = virtio_mem_mb_plug_sb(vm, mb_id, 0, count);
	if (rc)
		return rc;

	/*
	 * Mark the block properly offline before adding it to Linux,
	 * so the memory notifiers will find the block in the right state.
	 */
	if (count == vm->nb_sb_per_mb)
		virtio_mem_mb_set_state(vm, mb_id,
					VIRTIO_MEM_MB_STATE_OFFLINE);
	else
		virtio_mem_mb_set_state(vm, mb_id,
					VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL);

	/* Add the memory block to Linux - if that fails, try to unplug. */
	rc = virtio_mem_mb_add(vm, mb_id);
	if (rc) {
		enum virtio_mem_mb_state new_state = VIRTIO_MEM_MB_STATE_UNUSED;

		dev_err(&vm->vdev->dev,
			"adding memory block %lu failed with %d\n", mb_id, rc);
		rc2 = virtio_mem_mb_unplug_sb(vm, mb_id, 0, count);

		/*
		 * TODO: Linux MM does not properly clean up yet in all cases
		 * where adding of memory failed - especially on -ENOMEM.
		 */
		if (rc2)
			new_state = VIRTIO_MEM_MB_STATE_PLUGGED;
		virtio_mem_mb_set_state(vm, mb_id, new_state);
		return rc;
	}

	*nb_sb -= count;
	return 0;
}

/*
 * Try to plug the desired number of subblocks of a memory block that
 * is already added to Linux.
 *
 * Will modify the state of the memory block.
 *
 * Note: Can fail after some subblocks were successfully plugged.
 */
static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id,
				     uint64_t *nb_sb, bool online)
{
	unsigned long pfn, nr_pages;
	int sb_id, count;
	int rc = 0;

	if (WARN_ON_ONCE(!*nb_sb))
		return -EINVAL;

	while (*nb_sb) {
		sb_id = virtio_mem_mb_first_unplugged_sb(vm, mb_id);
		if (sb_id >= vm->nb_sb_per_mb)
			break;
		count = 1;
		while (count < *nb_sb &&
		       sb_id + count < vm->nb_sb_per_mb &&
		       !virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id + count,
						      1))
			count++;

		rc = virtio_mem_mb_plug_sb(vm, mb_id, sb_id, count);
		if (rc)
			return rc;
		*nb_sb -= count;
		if (!online)
			continue;

		/* fake-online the pages if the memory block is online */
		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
			       sb_id * vm->subblock_size);
		nr_pages = PFN_DOWN(count * vm->subblock_size);
		virtio_mem_fake_online(pfn, nr_pages);
	}

	if (virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
		if (online)
			virtio_mem_mb_set_state(vm, mb_id,
						VIRTIO_MEM_MB_STATE_ONLINE);
		else
			virtio_mem_mb_set_state(vm, mb_id,
						VIRTIO_MEM_MB_STATE_OFFLINE);
	}

	return rc;
}

/*
 * Try to plug the requested amount of memory.
 */
static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
{
	uint64_t nb_sb = diff / vm->subblock_size;
	unsigned long mb_id;
	int rc;

	if (!nb_sb)
		return 0;

	/* Don't race with onlining/offlining */
	mutex_lock(&vm->hotplug_mutex);

	/* Try to plug subblocks of partially plugged online blocks. */
	virtio_mem_for_each_mb_state(vm, mb_id,
				     VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL) {
		rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, true);
		if (rc || !nb_sb)
			goto out_unlock;
		cond_resched();
	}

	/* Try to plug subblocks of partially plugged offline blocks. */
	virtio_mem_for_each_mb_state(vm, mb_id,
				     VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) {
		rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, false);
		if (rc || !nb_sb)
			goto out_unlock;
		cond_resched();
	}

	/*
	 * We won't be working on online/offline memory blocks from this point,
	 * so we can't race with memory onlining/offlining. Drop the mutex.
	 */
	mutex_unlock(&vm->hotplug_mutex);

	/* Try to plug and add unused blocks */
	virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED) {
		if (virtio_mem_too_many_mb_offline(vm))
			return -ENOSPC;

		rc = virtio_mem_mb_plug_and_add(vm, mb_id, &nb_sb);
		if (rc || !nb_sb)
			return rc;
		cond_resched();
	}

	/* Try to prepare, plug and add new blocks */
	while (nb_sb) {
		if (virtio_mem_too_many_mb_offline(vm))
			return -ENOSPC;

		rc = virtio_mem_prepare_next_mb(vm, &mb_id);
		if (rc)
			return rc;
		rc = virtio_mem_mb_plug_and_add(vm, mb_id, &nb_sb);
		if (rc)
			return rc;
		cond_resched();
	}

	return 0;
out_unlock:
	mutex_unlock(&vm->hotplug_mutex);
	return rc;
}

/*
 * Unplug the desired number of plugged subblocks of an offline memory block.
 * Will fail if any subblock cannot get unplugged (instead of skipping it).
 *
 * Will modify the state of the memory block. Might temporarily drop the
 * hotplug_mutex.
 *
 * Note: Can fail after some subblocks were successfully unplugged.
 */
static int virtio_mem_mb_unplug_any_sb_offline(struct virtio_mem *vm,
					       unsigned long mb_id,
					       uint64_t *nb_sb)
{
	int rc;

	rc = virtio_mem_mb_unplug_any_sb(vm, mb_id, nb_sb);

	/* some subblocks might have been unplugged even on failure */
	if (!virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb))
		virtio_mem_mb_set_state(vm, mb_id,
					VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL);
	if (rc)
		return rc;

	if (virtio_mem_mb_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
		/*
		 * Remove the block from Linux - this should never fail.
		 * Hinder the block from getting onlined by marking it
		 * unplugged. Temporarily drop the mutex, so
		 * any pending GOING_ONLINE requests can be serviced/rejected.
		 */
		virtio_mem_mb_set_state(vm, mb_id,
					VIRTIO_MEM_MB_STATE_UNUSED);

		mutex_unlock(&vm->hotplug_mutex);
		rc = virtio_mem_mb_remove(vm, mb_id);
		BUG_ON(rc);
		mutex_lock(&vm->hotplug_mutex);
	}
	return 0;
}

/*
 * Unplug the given plugged subblocks of an online memory block.
 *
 * Will modify the state of the memory block.
 */
static int virtio_mem_mb_unplug_sb_online(struct virtio_mem *vm,
					  unsigned long mb_id, int sb_id,
					  int count)
{
	const unsigned long nr_pages = PFN_DOWN(vm->subblock_size) * count;
	unsigned long start_pfn;
	int rc;

	start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
			     sb_id * vm->subblock_size);
	rc = alloc_contig_range(start_pfn, start_pfn + nr_pages,
				MIGRATE_MOVABLE, GFP_KERNEL);
	if (rc == -ENOMEM)
		/* whoops, out of memory */
		return rc;
	if (rc)
		return -EBUSY;

	/* Mark it as fake-offline before unplugging it */
	virtio_mem_set_fake_offline(start_pfn, nr_pages, true);
	adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);

	/* Try to unplug the allocated memory */
	rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count);
	if (rc) {
		/* Return the memory to the buddy. */
		virtio_mem_fake_online(start_pfn, nr_pages);
		return rc;
	}

	virtio_mem_mb_set_state(vm, mb_id,
				VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL);
	return 0;
}

/*
 * Unplug the desired number of plugged subblocks of an online memory block.
 * Will skip subblocks that are busy.
 *
 * Will modify the state of the memory block. Might temporarily drop the
 * hotplug_mutex.
 *
 * Note: Can fail after some subblocks were successfully unplugged. Can
 *       return 0 even if subblocks were busy and could not get unplugged.
 */
static int virtio_mem_mb_unplug_any_sb_online(struct virtio_mem *vm,
					      unsigned long mb_id,
					      uint64_t *nb_sb)
{
	int rc, sb_id;

	/* If possible, try to unplug the complete block in one shot. */
	if (*nb_sb >= vm->nb_sb_per_mb &&
	    virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
		rc = virtio_mem_mb_unplug_sb_online(vm, mb_id, 0,
						    vm->nb_sb_per_mb);
		if (!rc) {
			*nb_sb -= vm->nb_sb_per_mb;
			goto unplugged;
		} else if (rc != -EBUSY)
			return rc;
	}

	/* Fallback to single subblocks. */
	for (sb_id = vm->nb_sb_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
		/* Find the next candidate subblock */
		while (sb_id >= 0 &&
		       !virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
			sb_id--;
		if (sb_id < 0)
			break;

		rc = virtio_mem_mb_unplug_sb_online(vm, mb_id, sb_id, 1);
		if (rc == -EBUSY)
			continue;
		else if (rc)
			return rc;
		*nb_sb -= 1;
	}

unplugged:
	/*
	 * Once all subblocks of a memory block were unplugged, offline and
	 * remove it. This will usually not fail, as no memory is in use
	 * anymore - however some other notifiers might NACK the request.
	 */
	if (virtio_mem_mb_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
		mutex_unlock(&vm->hotplug_mutex);
		rc = virtio_mem_mb_offline_and_remove(vm, mb_id);
		mutex_lock(&vm->hotplug_mutex);
		if (!rc)
			virtio_mem_mb_set_state(vm, mb_id,
						VIRTIO_MEM_MB_STATE_UNUSED);
	}

	return 0;
}

/*
 * Try to unplug the requested amount of memory.
 */
static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
	uint64_t nb_sb = diff / vm->subblock_size;
	unsigned long mb_id;
	int rc;

	if (!nb_sb)
		return 0;

	/*
	 * We'll drop the mutex a couple of times when it is safe to do so.
	 * This might result in some blocks switching the state (online/offline)
	 * and we could miss them in this run - we will retry again later.
	 */
	mutex_lock(&vm->hotplug_mutex);

	/* Try to unplug subblocks of partially plugged offline blocks. */
	virtio_mem_for_each_mb_state_rev(vm, mb_id,
					 VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) {
		rc = virtio_mem_mb_unplug_any_sb_offline(vm, mb_id,
							 &nb_sb);
		if (rc || !nb_sb)
			goto out_unlock;
		cond_resched();
	}

	/* Try to unplug subblocks of plugged offline blocks. */
	virtio_mem_for_each_mb_state_rev(vm, mb_id,
					 VIRTIO_MEM_MB_STATE_OFFLINE) {
		rc = virtio_mem_mb_unplug_any_sb_offline(vm, mb_id,
							 &nb_sb);
		if (rc || !nb_sb)
			goto out_unlock;
		cond_resched();
	}

	if (!unplug_online) {
		mutex_unlock(&vm->hotplug_mutex);
		return 0;
	}

	/* Try to unplug subblocks of partially plugged online blocks. */
	virtio_mem_for_each_mb_state_rev(vm, mb_id,
					 VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL) {
		rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id,
							&nb_sb);
		if (rc || !nb_sb)
			goto out_unlock;
		mutex_unlock(&vm->hotplug_mutex);
		cond_resched();
		mutex_lock(&vm->hotplug_mutex);
	}

	/* Try to unplug subblocks of plugged online blocks. */
	virtio_mem_for_each_mb_state_rev(vm, mb_id,
					 VIRTIO_MEM_MB_STATE_ONLINE) {
		rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id,
							&nb_sb);
		if (rc || !nb_sb)
			goto out_unlock;
		mutex_unlock(&vm->hotplug_mutex);
		cond_resched();
		mutex_lock(&vm->hotplug_mutex);
	}

	mutex_unlock(&vm->hotplug_mutex);
	return nb_sb ? -EBUSY : 0;
out_unlock:
	mutex_unlock(&vm->hotplug_mutex);
	return rc;
}

/*
 * Try to unplug all blocks that couldn't be unplugged before, for example,
 * because the hypervisor was busy.
 */
static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm)
{
	unsigned long mb_id;
	int rc;

	virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_PLUGGED) {
		rc = virtio_mem_mb_unplug(vm, mb_id);
		if (rc)
			return rc;
		virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED);
	}

	return 0;
}

/*
 * Update all parts of the config that could have changed.
 */
static void virtio_mem_refresh_config(struct virtio_mem *vm)
{
	const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS;
	uint64_t new_plugged_size, usable_region_size, end_addr;

	/* the plugged_size is just a reflection of what _we_ did previously */
	virtio_cread(vm->vdev, struct virtio_mem_config, plugged_size,
		     &new_plugged_size);
	if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size))
		vm->plugged_size = new_plugged_size;

	/* calculate the last usable memory block id */
	virtio_cread(vm->vdev, struct virtio_mem_config,
		     usable_region_size, &usable_region_size);
	end_addr = vm->addr + usable_region_size;
	end_addr = min(end_addr, phys_limit);
	vm->last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr) - 1;

	/* see if there is a request to change the size */
	virtio_cread(vm->vdev, struct virtio_mem_config, requested_size,
		     &vm->requested_size);

	dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size);
	dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size);
}

/*
 * Workqueue function for handling plug/unplug requests and config updates.
 */
static void virtio_mem_run_wq(struct work_struct *work)
{
	struct virtio_mem *vm = container_of(work, struct virtio_mem, wq);
	uint64_t diff;
	int rc;

	hrtimer_cancel(&vm->retry_timer);

	if (vm->broken)
		return;

retry:
	rc = 0;

	/* Make sure we start with a clean state if there are leftovers. */
	if (unlikely(vm->unplug_all_required))
		rc = virtio_mem_send_unplug_all_request(vm);

	if (atomic_read(&vm->config_changed)) {
		atomic_set(&vm->config_changed, 0);
		virtio_mem_refresh_config(vm);
	}

	/* Unplug any leftovers from previous runs */
	if (!rc)
		rc = virtio_mem_unplug_pending_mb(vm);

	if (!rc && vm->requested_size != vm->plugged_size) {
		if (vm->requested_size > vm->plugged_size) {
			diff = vm->requested_size - vm->plugged_size;
			rc = virtio_mem_plug_request(vm, diff);
		} else {
			diff = vm->plugged_size - vm->requested_size;
			rc = virtio_mem_unplug_request(vm, diff);
		}
	}

	switch (rc) {
	case 0:
		vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
		break;
	case -ENOSPC:
		/*
		 * We cannot add any more memory (alignment, physical limit)
		 * or we have too many offline memory blocks.
		 */
		break;
	case -ETXTBSY:
		/*
		 * The hypervisor cannot process our request right now
		 * (e.g., out of memory, migrating).
		 */
	case -EBUSY:
		/*
		 * We cannot free up any memory to unplug it (all plugged memory
		 * is busy).
		 */
	case -ENOMEM:
		/* Out of memory, try again later. */
		hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms),
			      HRTIMER_MODE_REL);
		break;
	case -EAGAIN:
		/* Retry immediately (e.g., the config changed). */
		goto retry;
	default:
		/* Unknown error, mark as broken */
		dev_err(&vm->vdev->dev,
			"unknown error, marking device broken: %d\n", rc);
		vm->broken = true;
	}
}

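/*
 * Expiry re-triggers the workqueue and doubles the retry interval, bounded
 * by VIRTIO_MEM_RETRY_TIMER_MIN_MS and VIRTIO_MEM_RETRY_TIMER_MAX_MS (with
 * the defaults: 50s -> 100s -> 200s -> 300s, capped). A successful run
 * resets the interval to the minimum.
 */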
static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
{
	struct virtio_mem *vm = container_of(timer, struct virtio_mem,
					     retry_timer);

	virtio_mem_retry(vm);
	vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2,
				   VIRTIO_MEM_RETRY_TIMER_MAX_MS);
	return HRTIMER_NORESTART;
}

static void virtio_mem_handle_response(struct virtqueue *vq)
{
	struct virtio_mem *vm = vq->vdev->priv;

	wake_up(&vm->host_resp);
}

static int virtio_mem_init_vq(struct virtio_mem *vm)
{
	struct virtqueue *vq;

	vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response,
				   "guest-request");
	if (IS_ERR(vq))
		return PTR_ERR(vq);
	vm->vq = vq;

	return 0;
}

static int virtio_mem_init(struct virtio_mem *vm)
{
	const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS;
	uint16_t node_id;

	if (!vm->vdev->config->get) {
		dev_err(&vm->vdev->dev, "config access disabled\n");
		return -EINVAL;
	}

	/*
	 * We don't want to (un)plug or reuse any memory when in kdump. The
	 * memory is still accessible (but not mapped).
	 */
	if (is_kdump_kernel()) {
		dev_warn(&vm->vdev->dev, "disabled in kdump kernel\n");
		return -EBUSY;
	}

	/* Fetch all properties that can't change. */
	virtio_cread(vm->vdev, struct virtio_mem_config, plugged_size,
		     &vm->plugged_size);
	virtio_cread(vm->vdev, struct virtio_mem_config, block_size,
		     &vm->device_block_size);
	virtio_cread(vm->vdev, struct virtio_mem_config, node_id,
		     &node_id);
	vm->nid = virtio_mem_translate_node_id(vm, node_id);
	virtio_cread(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
	virtio_cread(vm->vdev, struct virtio_mem_config, region_size,
		     &vm->region_size);

	/*
	 * We always hotplug memory in memory block granularity. This way,
	 * we have to wait for exactly one memory block to online.
	 */
	if (vm->device_block_size > memory_block_size_bytes()) {
		dev_err(&vm->vdev->dev,
			"The block size is not supported (too big).\n");
		return -EINVAL;
	}

	/* bad device setup - warn only */
	if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
		dev_warn(&vm->vdev->dev,
			 "The alignment of the physical start address can make some memory unusable.\n");
	if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes()))
		dev_warn(&vm->vdev->dev,
			 "The alignment of the physical end address can make some memory unusable.\n");
	if (vm->addr + vm->region_size > phys_limit)
		dev_warn(&vm->vdev->dev,
			 "Some memory is not addressable. This can make some memory unusable.\n");

	/*
	 * Calculate the subblock size:
	 * - At least MAX_ORDER - 1 / pageblock_order.
	 * - At least the device block size.
	 * In the worst case, a single subblock per memory block.
	 */
	vm->subblock_size = PAGE_SIZE * 1ul << max_t(uint32_t, MAX_ORDER - 1,
						     pageblock_order);
	vm->subblock_size = max_t(uint64_t, vm->device_block_size,
				  vm->subblock_size);
	vm->nb_sb_per_mb = memory_block_size_bytes() / vm->subblock_size;
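	/*
	 * Example (illustrative, x86-64 defaults): PAGE_SIZE = 4 KiB and
	 * MAX_ORDER = 11 result in 4 KiB << 10 = 4MB subblocks; with 128MB
	 * memory blocks, that is 32 subblocks per memory block.
	 */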

	/* Round up to the next full memory block */
	vm->first_mb_id = virtio_mem_phys_to_mb_id(vm->addr - 1 +
						   memory_block_size_bytes());
	vm->next_mb_id = vm->first_mb_id;
	vm->last_mb_id = virtio_mem_phys_to_mb_id(vm->addr +
			 vm->region_size) - 1;

	dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
	dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
	dev_info(&vm->vdev->dev, "device block size: 0x%llx",
		 (unsigned long long)vm->device_block_size);
	dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
		 memory_block_size_bytes());
	dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
		 (unsigned long long)vm->subblock_size);
	if (vm->nid != NUMA_NO_NODE)
		dev_info(&vm->vdev->dev, "nid: %d", vm->nid);

	return 0;
}

static int virtio_mem_create_resource(struct virtio_mem *vm)
{
	/*
	 * When force-unloading the driver and removing the device, we
	 * could have a garbage pointer. Duplicate the string.
	 */
	const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL);

	if (!name)
		return -ENOMEM;

	vm->parent_resource = __request_mem_region(vm->addr, vm->region_size,
						   name, IORESOURCE_SYSTEM_RAM);
	if (!vm->parent_resource) {
		kfree(name);
		dev_warn(&vm->vdev->dev, "could not reserve device region\n");
		dev_info(&vm->vdev->dev,
			 "reloading the driver is not supported\n");
		return -EBUSY;
	}

	/* The memory is not actually busy - make add_memory() work. */
	vm->parent_resource->flags &= ~IORESOURCE_BUSY;
	return 0;
}

static void virtio_mem_delete_resource(struct virtio_mem *vm)
{
	const char *name;

	if (!vm->parent_resource)
		return;

	name = vm->parent_resource->name;
	release_resource(vm->parent_resource);
	kfree(vm->parent_resource);
	kfree(name);
	vm->parent_resource = NULL;
}

static int virtio_mem_probe(struct virtio_device *vdev)
{
	struct virtio_mem *vm;
	int rc;

	BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
	BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10);

	vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL);
	if (!vm)
		return -ENOMEM;

	init_waitqueue_head(&vm->host_resp);
	vm->vdev = vdev;
	INIT_WORK(&vm->wq, virtio_mem_run_wq);
	mutex_init(&vm->hotplug_mutex);
	INIT_LIST_HEAD(&vm->next);
	spin_lock_init(&vm->removal_lock);
	hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	vm->retry_timer.function = virtio_mem_timer_expired;
	vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;

	/* register the virtqueue */
	rc = virtio_mem_init_vq(vm);
	if (rc)
		goto out_free_vm;

	/* initialize the device by querying the config */
	rc = virtio_mem_init(vm);
	if (rc)
		goto out_del_vq;

	/* create the parent resource for all memory */
	rc = virtio_mem_create_resource(vm);
	if (rc)
		goto out_del_vq;

	/*
	 * If we still have memory plugged, we have to unplug all memory first.
	 * Registering our parent resource makes sure that this memory isn't
	 * actually in use (e.g., trying to reload the driver).
	 */
	if (vm->plugged_size) {
		vm->unplug_all_required = true;
		dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
	}

	/* register callbacks */
	vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
	rc = register_memory_notifier(&vm->memory_notifier);
	if (rc)
		goto out_del_resource;
	rc = register_virtio_mem_device(vm);
	if (rc)
		goto out_unreg_mem;

	virtio_device_ready(vdev);

	/* trigger a config update to start processing the requested_size */
	atomic_set(&vm->config_changed, 1);
	queue_work(system_freezable_wq, &vm->wq);

	return 0;
out_unreg_mem:
	unregister_memory_notifier(&vm->memory_notifier);
out_del_resource:
	virtio_mem_delete_resource(vm);
out_del_vq:
	vdev->config->del_vqs(vdev);
out_free_vm:
	kfree(vm);
	vdev->priv = NULL;

	return rc;
}

static void virtio_mem_remove(struct virtio_device *vdev)
{
	struct virtio_mem *vm = vdev->priv;
	unsigned long mb_id;
	int rc;

	/*
	 * Make sure the workqueue won't be triggered anymore and no memory
	 * blocks can be onlined/offlined until we're finished here.
	 */
	mutex_lock(&vm->hotplug_mutex);
	spin_lock_irq(&vm->removal_lock);
	vm->removing = true;
	spin_unlock_irq(&vm->removal_lock);
	mutex_unlock(&vm->hotplug_mutex);

	/* wait until the workqueue stopped */
	cancel_work_sync(&vm->wq);
	hrtimer_cancel(&vm->retry_timer);

	/* unregister callbacks */
	unregister_virtio_mem_device(vm);
	unregister_memory_notifier(&vm->memory_notifier);

	/*
	 * After we unregistered our callbacks, user space can online partially
	 * plugged offline blocks. Make sure to remove them.
	 */
	virtio_mem_for_each_mb_state(vm, mb_id,
				     VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) {
		rc = virtio_mem_mb_remove(vm, mb_id);
		BUG_ON(rc);
		virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED);
	}
	/*
	 * After we unregistered our callbacks, user space can no longer
	 * offline partially plugged online memory blocks. No need to worry
	 * about them.
	 */

	/*
	 * There is no way we could reliably remove all memory we have added to
	 * the system. And there is no way to stop the driver/device from going
	 * away. Warn at least.
	 */
	if (vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] ||
	    vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL] ||
	    vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE] ||
	    vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL] ||
	    vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE])
		dev_warn(&vdev->dev, "device still has system memory added\n");
	else
		virtio_mem_delete_resource(vm);

	/* remove all tracking data - no locking needed */
	vfree(vm->mb_state);
	vfree(vm->sb_bitmap);

	/* reset the device and cleanup the queues */
	vdev->config->reset(vdev);
	vdev->config->del_vqs(vdev);

	kfree(vm);
	vdev->priv = NULL;
}

static void virtio_mem_config_changed(struct virtio_device *vdev)
{
	struct virtio_mem *vm = vdev->priv;

	atomic_set(&vm->config_changed, 1);
	virtio_mem_retry(vm);
}

#ifdef CONFIG_PM_SLEEP
static int virtio_mem_freeze(struct virtio_device *vdev)
{
	/*
	 * When restarting the VM, all memory is usually unplugged. Don't
	 * allow suspend/hibernate.
	 */
	dev_err(&vdev->dev, "save/restore not supported.\n");
	return -EPERM;
}

static int virtio_mem_restore(struct virtio_device *vdev)
{
	return -EPERM;
}
#endif

static unsigned int virtio_mem_features[] = {
#if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA)
	VIRTIO_MEM_F_ACPI_PXM,
#endif
};

static struct virtio_device_id virtio_mem_id_table[] = {
	{ VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

static struct virtio_driver virtio_mem_driver = {
	.feature_table = virtio_mem_features,
	.feature_table_size = ARRAY_SIZE(virtio_mem_features),
	.driver.name = KBUILD_MODNAME,
	.driver.owner = THIS_MODULE,
	.id_table = virtio_mem_id_table,
	.probe = virtio_mem_probe,
	.remove = virtio_mem_remove,
	.config_changed = virtio_mem_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze	=	virtio_mem_freeze,
	.restore =	virtio_mem_restore,
#endif
};

module_virtio_driver(virtio_mem_driver);
MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table);
MODULE_AUTHOR("David Hildenbrand <david@redhat.com>");
MODULE_DESCRIPTION("Virtio-mem driver");
MODULE_LICENSE("GPL");