xref: /openbmc/linux/drivers/xen/balloon.c (revision 40095de1f9082f058970b985a96d2fbef43f94f4)
/******************************************************************************
 * balloon.c
 *
 * Xen balloon driver - enables returning/claiming memory to/from Xen.
 *
 * Copyright (c) 2003, B Dragovic
 * Copyright (c) 2003-2004, M Williamson, K Fraser
 * Copyright (c) 2005 Dan M. Smith, IBM Corporation
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/sysdev.h>
#include <linux/gfp.h>

#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/e820.h>

#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/interface/memory.h>
#include <xen/xenbus.h>
#include <xen/features.h>
#include <xen/page.h>

#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))

#define BALLOON_CLASS_NAME "xen_memory"

/*
 * balloon_process() state:
 *
 * BP_DONE: done or nothing to do,
 * BP_EAGAIN: error, go to sleep,
 * BP_ECANCELED: error, balloon operation canceled.
 */

enum bp_state {
	BP_DONE,
	BP_EAGAIN,
	BP_ECANCELED
};

#define RETRY_UNLIMITED	0

struct balloon_stats {
	/* We aim for 'current allocation' == 'target allocation'. */
	unsigned long current_pages;
	unsigned long target_pages;
	/* Number of pages in high- and low-memory balloons. */
	unsigned long balloon_low;
	unsigned long balloon_high;
	unsigned long schedule_delay;
	unsigned long max_schedule_delay;
	unsigned long retry_count;
	unsigned long max_retry_count;
};

static DEFINE_MUTEX(balloon_mutex);

static struct sys_device balloon_sysdev;

static int register_balloon(struct sys_device *sysdev);

static struct balloon_stats balloon_stats;

/* We increase/decrease in batches which fit in a page */
static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];

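/* Keep the kernel's totalhigh_pages count in sync as highmem pages enter or
   leave the balloon; these are no-ops without CONFIG_HIGHMEM. */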
#ifdef CONFIG_HIGHMEM
#define inc_totalhigh_pages() (totalhigh_pages++)
#define dec_totalhigh_pages() (totalhigh_pages--)
#else
#define inc_totalhigh_pages() do {} while (0)
#define dec_totalhigh_pages() do {} while (0)
#endif

/* List of ballooned pages, threaded through the mem_map array. */
static LIST_HEAD(ballooned_pages);

/* Main work function, always executed in process context. */
static void balloon_process(struct work_struct *work);
static DECLARE_DELAYED_WORK(balloon_worker, balloon_process);

/* When ballooning out (allocating memory to return to Xen) we don't really
   want the kernel to try too hard since that can trigger the oom killer. */
#define GFP_BALLOON \
	(GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC)

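/* Clear a page that is about to be handed back to Xen so that its previous
   contents cannot leak to other domains; compiled out unless
   CONFIG_XEN_SCRUB_PAGES is enabled. */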
static void scrub_page(struct page *page)
{
#ifdef CONFIG_XEN_SCRUB_PAGES
	clear_highpage(page);
#endif
}

/* balloon_append: add the given page to the balloon. */
static void __balloon_append(struct page *page)
{
	/* Lowmem is re-populated first, so highmem pages go at list tail. */
	if (PageHighMem(page)) {
		list_add_tail(&page->lru, &ballooned_pages);
		balloon_stats.balloon_high++;
		dec_totalhigh_pages();
	} else {
		list_add(&page->lru, &ballooned_pages);
		balloon_stats.balloon_low++;
	}
}

static void balloon_append(struct page *page)
{
	__balloon_append(page);
	totalram_pages--;
}

/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
static struct page *balloon_retrieve(void)
{
	struct page *page;

	if (list_empty(&ballooned_pages))
		return NULL;

	page = list_entry(ballooned_pages.next, struct page, lru);
	list_del(&page->lru);

	if (PageHighMem(page)) {
		balloon_stats.balloon_high--;
		inc_totalhigh_pages();
	} else
		balloon_stats.balloon_low--;

	totalram_pages++;

	return page;
}

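/* Peek at the first page in the balloon without removing it from the list. */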
static struct page *balloon_first_page(void)
{
	if (list_empty(&ballooned_pages))
		return NULL;
	return list_entry(ballooned_pages.next, struct page, lru);
}

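/* Return the balloon list entry that follows @page, or NULL at the tail. */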
static struct page *balloon_next_page(struct page *page)
{
	struct list_head *next = page->lru.next;
	if (next == &ballooned_pages)
		return NULL;
	return list_entry(next, struct page, lru);
}

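/*
 * Track the retry state of the current balloon operation: success resets the
 * backoff, while a failure doubles the worker's delay (capped at
 * max_schedule_delay) and gives up with BP_ECANCELED once max_retry_count is
 * exceeded.
 */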
static enum bp_state update_schedule(enum bp_state state)
{
	if (state == BP_DONE) {
		balloon_stats.schedule_delay = 1;
		balloon_stats.retry_count = 1;
		return BP_DONE;
	}

	++balloon_stats.retry_count;

	if (balloon_stats.max_retry_count != RETRY_UNLIMITED &&
			balloon_stats.retry_count > balloon_stats.max_retry_count) {
		balloon_stats.schedule_delay = 1;
		balloon_stats.retry_count = 1;
		return BP_ECANCELED;
	}

	balloon_stats.schedule_delay <<= 1;

	if (balloon_stats.schedule_delay > balloon_stats.max_schedule_delay)
		balloon_stats.schedule_delay = balloon_stats.max_schedule_delay;

	return BP_EAGAIN;
}

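/* Clamp the target to what is reachable without memory hotplug: the pages we
   currently own plus everything sitting in the balloon. */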
static unsigned long current_target(void)
{
	unsigned long target = balloon_stats.target_pages;

	target = min(target,
		     balloon_stats.current_pages +
		     balloon_stats.balloon_low +
		     balloon_stats.balloon_high);

	return target;
}

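/*
 * Grow the guest: ask Xen to populate machine frames for up to @nr_pages
 * ballooned PFNs, wire them back into the P2M (and the lowmem page tables)
 * and release the pages to the kernel allocator.
 */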
static enum bp_state increase_reservation(unsigned long nr_pages)
{
	int rc;
	unsigned long  pfn, i;
	struct page   *page;
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};

	if (nr_pages > ARRAY_SIZE(frame_list))
		nr_pages = ARRAY_SIZE(frame_list);

	page = balloon_first_page();
	for (i = 0; i < nr_pages; i++) {
		if (!page) {
			nr_pages = i;
			break;
		}
		frame_list[i] = page_to_pfn(page);
		page = balloon_next_page(page);
	}

	set_xen_guest_handle(reservation.extent_start, frame_list);
	reservation.nr_extents = nr_pages;
	rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
	if (rc <= 0)
		return BP_EAGAIN;

	for (i = 0; i < rc; i++) {
		page = balloon_retrieve();
		BUG_ON(page == NULL);

		pfn = page_to_pfn(page);
		BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
		       phys_to_machine_mapping_valid(pfn));

		set_phys_to_machine(pfn, frame_list[i]);

		/* Link back into the page tables if not highmem. */
		if (pfn < max_low_pfn) {
			int ret;
			ret = HYPERVISOR_update_va_mapping(
				(unsigned long)__va(pfn << PAGE_SHIFT),
				mfn_pte(frame_list[i], PAGE_KERNEL),
				0);
			BUG_ON(ret);
		}

		/* Relinquish the page back to the allocator. */
		ClearPageReserved(page);
		init_page_count(page);
		__free_page(page);
	}

	balloon_stats.current_pages += rc;

	return BP_DONE;
}

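/*
 * Shrink the guest: allocate up to @nr_pages pages, scrub them, unmap them
 * from the guest's address space and P2M, and hand the backing frames back
 * to Xen.
 */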
static enum bp_state decrease_reservation(unsigned long nr_pages)
{
	enum bp_state state = BP_DONE;
	unsigned long  pfn, i;
	struct page   *page;
	int ret;
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};

	if (nr_pages > ARRAY_SIZE(frame_list))
		nr_pages = ARRAY_SIZE(frame_list);

	for (i = 0; i < nr_pages; i++) {
		page = alloc_page(GFP_BALLOON);
		if (page == NULL) {
			nr_pages = i;
			state = BP_EAGAIN;
			break;
		}

		pfn = page_to_pfn(page);
		frame_list[i] = pfn_to_mfn(pfn);

		scrub_page(page);

		if (!PageHighMem(page)) {
			ret = HYPERVISOR_update_va_mapping(
				(unsigned long)__va(pfn << PAGE_SHIFT),
				__pte_ma(0), 0);
			BUG_ON(ret);
		}
	}

	/* Ensure that ballooned highmem pages don't have kmaps. */
	kmap_flush_unused();
	flush_tlb_all();

	/* No more mappings: invalidate P2M and add to balloon. */
	for (i = 0; i < nr_pages; i++) {
		pfn = mfn_to_pfn(frame_list[i]);
		set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
		balloon_append(pfn_to_page(pfn));
	}

	set_xen_guest_handle(reservation.extent_start, frame_list);
	reservation.nr_extents   = nr_pages;
	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
	BUG_ON(ret != nr_pages);

	balloon_stats.current_pages -= nr_pages;

	return state;
}

/*
 * We avoid multiple worker instances conflicting via the balloon mutex.
 * We may of course race with updates of the target count (which are not
 * protected by a lock; see balloon_set_new_target()), but we will recover
 * from these in time.
 */
static void balloon_process(struct work_struct *work)
{
	enum bp_state state = BP_DONE;
	long credit;

	mutex_lock(&balloon_mutex);

	do {
		credit = current_target() - balloon_stats.current_pages;

		if (credit > 0)
			state = increase_reservation(credit);

		if (credit < 0)
			state = decrease_reservation(-credit);

		state = update_schedule(state);

#ifndef CONFIG_PREEMPT
		if (need_resched())
			schedule();
#endif
	} while (credit && state == BP_DONE);

	/* Schedule more work if there is some still to be done. */
	if (state == BP_EAGAIN)
		schedule_delayed_work(&balloon_worker, balloon_stats.schedule_delay * HZ);

	mutex_unlock(&balloon_mutex);
}

/* Set a new balloon target and kick off processing. */
static void balloon_set_new_target(unsigned long target)
{
	/* No need for a lock: this is not a read-modify-write update. */
	balloon_stats.target_pages = target;
	schedule_delayed_work(&balloon_worker, 0);
}

static struct xenbus_watch target_watch = {
	.node = "memory/target"
};

/* React to a change in the target key */
static void watch_target(struct xenbus_watch *watch,
			 const char **vec, unsigned int len)
{
	unsigned long long new_target;
	int err;

	err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
	if (err != 1) {
		/* This is ok (for domain0 at least) - so just return */
		return;
	}

	/* The given memory/target value is in KiB, so it needs converting to
	 * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
	 */
	balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
}

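/* Once xenstore comes up, install the watch on the memory/target key. */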
static int balloon_init_watcher(struct notifier_block *notifier,
				unsigned long event,
				void *data)
{
	int err;

	err = register_xenbus_watch(&target_watch);
	if (err)
		printk(KERN_ERR "Failed to set balloon watcher\n");

	return NOTIFY_DONE;
}

static struct notifier_block xenstore_notifier;

static int __init balloon_init(void)
{
	unsigned long pfn, extra_pfn_end;
	struct page *page;

	if (!xen_pv_domain())
		return -ENODEV;

	pr_info("xen_balloon: Initialising balloon driver.\n");

	balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn);
	balloon_stats.target_pages  = balloon_stats.current_pages;
	balloon_stats.balloon_low   = 0;
	balloon_stats.balloon_high  = 0;

	balloon_stats.schedule_delay = 1;
	balloon_stats.max_schedule_delay = 32;
	balloon_stats.retry_count = 1;
	balloon_stats.max_retry_count = RETRY_UNLIMITED;

	register_balloon(&balloon_sysdev);

	/*
	 * Initialise the balloon with excess memory space.  We need
	 * to make sure we don't add memory which doesn't physically
	 * or logically exist.  The E820 map can be trimmed to be
	 * smaller than the amount of physical memory due to the mem=
	 * command line parameter.  And if this is a 32-bit non-HIGHMEM
	 * kernel on a system with memory which requires highmem to
	 * access, don't try to use it.
	 */
	extra_pfn_end = min(min(max_pfn, e820_end_of_ram_pfn()),
			    (unsigned long)PFN_DOWN(xen_extra_mem_start + xen_extra_mem_size));
	for (pfn = PFN_UP(xen_extra_mem_start);
	     pfn < extra_pfn_end;
	     pfn++) {
		page = pfn_to_page(pfn);
		/* totalram_pages doesn't include the boot-time
		   balloon extension, so don't subtract from it. */
		__balloon_append(page);
	}

	target_watch.callback = watch_target;
	xenstore_notifier.notifier_call = balloon_init_watcher;

	register_xenstore_notifier(&xenstore_notifier);

	return 0;
}

subsys_initcall(balloon_init);

static void balloon_exit(void)
{
	/* XXX - release balloon here */
}

module_exit(balloon_exit);

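/* Helper to define a read-only sysdev attribute that formats one statistic. */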
#define BALLOON_SHOW(name, format, args...)				\
	static ssize_t show_##name(struct sys_device *dev,		\
				   struct sysdev_attribute *attr,	\
				   char *buf)				\
	{								\
		return sprintf(buf, format, ##args);			\
	}								\
	static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL)

BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages));
BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low));
BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high));

static SYSDEV_ULONG_ATTR(schedule_delay, 0444, balloon_stats.schedule_delay);
static SYSDEV_ULONG_ATTR(max_schedule_delay, 0644, balloon_stats.max_schedule_delay);
static SYSDEV_ULONG_ATTR(retry_count, 0444, balloon_stats.retry_count);
static SYSDEV_ULONG_ATTR(max_retry_count, 0644, balloon_stats.max_retry_count);

static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr,
			      char *buf)
{
	return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages));
}

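/* Writes to target_kb are interpreted as a size in kibibytes. */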
static ssize_t store_target_kb(struct sys_device *dev,
			       struct sysdev_attribute *attr,
			       const char *buf,
			       size_t count)
{
	char *endchar;
	unsigned long long target_bytes;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	target_bytes = simple_strtoull(buf, &endchar, 0) * 1024;

	balloon_set_new_target(target_bytes >> PAGE_SHIFT);

	return count;
}

static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR,
		   show_target_kb, store_target_kb);


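/* The plain 'target' attribute works in bytes and accepts the usual
   memparse() suffixes (k, M, G). */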
static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr,
			      char *buf)
{
	return sprintf(buf, "%llu\n",
		       (unsigned long long)balloon_stats.target_pages
		       << PAGE_SHIFT);
}

static ssize_t store_target(struct sys_device *dev,
			    struct sysdev_attribute *attr,
			    const char *buf,
			    size_t count)
{
	char *endchar;
	unsigned long long target_bytes;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	target_bytes = memparse(buf, &endchar);

	balloon_set_new_target(target_bytes >> PAGE_SHIFT);

	return count;
}

static SYSDEV_ATTR(target, S_IRUGO | S_IWUSR,
		   show_target, store_target);


static struct sysdev_attribute *balloon_attrs[] = {
	&attr_target_kb,
	&attr_target,
	&attr_schedule_delay.attr,
	&attr_max_schedule_delay.attr,
	&attr_retry_count.attr,
	&attr_max_retry_count.attr
};

static struct attribute *balloon_info_attrs[] = {
	&attr_current_kb.attr,
	&attr_low_kb.attr,
	&attr_high_kb.attr,
	NULL
};

static struct attribute_group balloon_info_group = {
	.name = "info",
	.attrs = balloon_info_attrs,
};

static struct sysdev_class balloon_sysdev_class = {
	.name = BALLOON_CLASS_NAME,
};

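/*
 * Register the balloon sysdev and create its attribute files; on any failure
 * the files and registrations created so far are unwound again.
 */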
static int register_balloon(struct sys_device *sysdev)
{
	int i, error;

	error = sysdev_class_register(&balloon_sysdev_class);
	if (error)
		return error;

	sysdev->id = 0;
	sysdev->cls = &balloon_sysdev_class;

	error = sysdev_register(sysdev);
	if (error) {
		sysdev_class_unregister(&balloon_sysdev_class);
		return error;
	}

	for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) {
		error = sysdev_create_file(sysdev, balloon_attrs[i]);
		if (error)
			goto fail;
	}

	error = sysfs_create_group(&sysdev->kobj, &balloon_info_group);
	if (error)
		goto fail;

	return 0;

 fail:
	while (--i >= 0)
		sysdev_remove_file(sysdev, balloon_attrs[i]);
	sysdev_unregister(sysdev);
	sysdev_class_unregister(&balloon_sysdev_class);
	return error;
}

MODULE_LICENSE("GPL");