1 /****************************************************************************** 2 * Xen balloon driver - enables returning/claiming memory to/from Xen. 3 * 4 * Copyright (c) 2003, B Dragovic 5 * Copyright (c) 2003-2004, M Williamson, K Fraser 6 * Copyright (c) 2005 Dan M. Smith, IBM Corporation 7 * Copyright (c) 2010 Daniel Kiper 8 * 9 * Memory hotplug support was written by Daniel Kiper. Work on 10 * it was sponsored by Google under Google Summer of Code 2010 11 * program. Jeremy Fitzhardinge from Citrix was the mentor for 12 * this project. 13 * 14 * This program is free software; you can redistribute it and/or 15 * modify it under the terms of the GNU General Public License version 2 16 * as published by the Free Software Foundation; or, when distributed 17 * separately from the Linux kernel or incorporated into other 18 * software packages, subject to the following license: 19 * 20 * Permission is hereby granted, free of charge, to any person obtaining a copy 21 * of this source file (the "Software"), to deal in the Software without 22 * restriction, including without limitation the rights to use, copy, modify, 23 * merge, publish, distribute, sublicense, and/or sell copies of the Software, 24 * and to permit persons to whom the Software is furnished to do so, subject to 25 * the following conditions: 26 * 27 * The above copyright notice and this permission notice shall be included in 28 * all copies or substantial portions of the Software. 29 * 30 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 31 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 32 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 33 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 34 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 35 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 36 * IN THE SOFTWARE. 37 */ 38 39 #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt 40 41 #include <linux/cpu.h> 42 #include <linux/kernel.h> 43 #include <linux/sched.h> 44 #include <linux/errno.h> 45 #include <linux/module.h> 46 #include <linux/mm.h> 47 #include <linux/bootmem.h> 48 #include <linux/pagemap.h> 49 #include <linux/highmem.h> 50 #include <linux/mutex.h> 51 #include <linux/list.h> 52 #include <linux/gfp.h> 53 #include <linux/notifier.h> 54 #include <linux/memory.h> 55 #include <linux/memory_hotplug.h> 56 #include <linux/percpu-defs.h> 57 58 #include <asm/page.h> 59 #include <asm/pgalloc.h> 60 #include <asm/pgtable.h> 61 #include <asm/tlb.h> 62 63 #include <asm/xen/hypervisor.h> 64 #include <asm/xen/hypercall.h> 65 66 #include <xen/xen.h> 67 #include <xen/interface/xen.h> 68 #include <xen/interface/memory.h> 69 #include <xen/balloon.h> 70 #include <xen/features.h> 71 #include <xen/page.h> 72 73 /* 74 * balloon_process() state: 75 * 76 * BP_DONE: done or nothing to do, 77 * BP_EAGAIN: error, go to sleep, 78 * BP_ECANCELED: error, balloon operation canceled. 79 */ 80 81 enum bp_state { 82 BP_DONE, 83 BP_EAGAIN, 84 BP_ECANCELED 85 }; 86 87 88 static DEFINE_MUTEX(balloon_mutex); 89 90 struct balloon_stats balloon_stats; 91 EXPORT_SYMBOL_GPL(balloon_stats); 92 93 /* We increase/decrease in batches which fit in a page */ 94 static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; 95 96 97 /* List of ballooned pages, threaded through the mem_map array. */ 98 static LIST_HEAD(ballooned_pages); 99 100 /* Main work function, always executed in process context. */ 101 static void balloon_process(struct work_struct *work); 102 static DECLARE_DELAYED_WORK(balloon_worker, balloon_process); 103 104 /* When ballooning out (allocating memory to return to Xen) we don't really 105 want the kernel to try too hard since that can trigger the oom killer. */ 106 #define GFP_BALLOON \ 107 (GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC) 108 109 static void scrub_page(struct page *page) 110 { 111 #ifdef CONFIG_XEN_SCRUB_PAGES 112 clear_highpage(page); 113 #endif 114 } 115 116 /* balloon_append: add the given page to the balloon. */ 117 static void __balloon_append(struct page *page) 118 { 119 /* Lowmem is re-populated first, so highmem pages go at list tail. */ 120 if (PageHighMem(page)) { 121 list_add_tail(&page->lru, &ballooned_pages); 122 balloon_stats.balloon_high++; 123 } else { 124 list_add(&page->lru, &ballooned_pages); 125 balloon_stats.balloon_low++; 126 } 127 } 128 129 static void balloon_append(struct page *page) 130 { 131 __balloon_append(page); 132 adjust_managed_page_count(page, -1); 133 } 134 135 /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ 136 static struct page *balloon_retrieve(bool prefer_highmem) 137 { 138 struct page *page; 139 140 if (list_empty(&ballooned_pages)) 141 return NULL; 142 143 if (prefer_highmem) 144 page = list_entry(ballooned_pages.prev, struct page, lru); 145 else 146 page = list_entry(ballooned_pages.next, struct page, lru); 147 list_del(&page->lru); 148 149 if (PageHighMem(page)) 150 balloon_stats.balloon_high--; 151 else 152 balloon_stats.balloon_low--; 153 154 adjust_managed_page_count(page, 1); 155 156 return page; 157 } 158 159 static struct page *balloon_next_page(struct page *page) 160 { 161 struct list_head *next = page->lru.next; 162 if (next == &ballooned_pages) 163 return NULL; 164 return list_entry(next, struct page, lru); 165 } 166 167 static enum bp_state update_schedule(enum bp_state state) 168 { 169 if (state == BP_ECANCELED) 170 return BP_ECANCELED; 171 172 if (state == BP_DONE) { 173 balloon_stats.schedule_delay = 1; 174 balloon_stats.retry_count = 1; 175 return BP_DONE; 176 } 177 178 ++balloon_stats.retry_count; 179 180 if (balloon_stats.max_retry_count != RETRY_UNLIMITED && 181 balloon_stats.retry_count > balloon_stats.max_retry_count) { 182 balloon_stats.schedule_delay = 1; 183 balloon_stats.retry_count = 1; 184 return BP_ECANCELED; 185 } 186 187 balloon_stats.schedule_delay <<= 1; 188 189 if (balloon_stats.schedule_delay > balloon_stats.max_schedule_delay) 190 balloon_stats.schedule_delay = balloon_stats.max_schedule_delay; 191 192 return BP_EAGAIN; 193 } 194 195 #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG 196 static long current_credit(void) 197 { 198 return balloon_stats.target_pages - balloon_stats.current_pages - 199 balloon_stats.hotplug_pages; 200 } 201 202 static bool balloon_is_inflated(void) 203 { 204 if (balloon_stats.balloon_low || balloon_stats.balloon_high || 205 balloon_stats.balloon_hotplug) 206 return true; 207 else 208 return false; 209 } 210 211 /* 212 * reserve_additional_memory() adds memory region of size >= credit above 213 * max_pfn. New region is section aligned and size is modified to be multiple 214 * of section size. Those features allow optimal use of address space and 215 * establish proper alignment when this function is called first time after 216 * boot (last section not fully populated at boot time contains unused memory 217 * pages with PG_reserved bit not set; online_pages_range() does not allow page 218 * onlining in whole range if first onlined page does not have PG_reserved 219 * bit set). Real size of added memory is established at page onlining stage. 220 */ 221 222 static enum bp_state reserve_additional_memory(long credit) 223 { 224 int nid, rc; 225 u64 hotplug_start_paddr; 226 unsigned long balloon_hotplug = credit; 227 228 hotplug_start_paddr = PFN_PHYS(SECTION_ALIGN_UP(max_pfn)); 229 balloon_hotplug = round_up(balloon_hotplug, PAGES_PER_SECTION); 230 nid = memory_add_physaddr_to_nid(hotplug_start_paddr); 231 232 rc = add_memory(nid, hotplug_start_paddr, balloon_hotplug << PAGE_SHIFT); 233 234 if (rc) { 235 pr_warn("Cannot add additional memory (%i)\n", rc); 236 return BP_ECANCELED; 237 } 238 239 balloon_hotplug -= credit; 240 241 balloon_stats.hotplug_pages += credit; 242 balloon_stats.balloon_hotplug = balloon_hotplug; 243 244 return BP_DONE; 245 } 246 247 static void xen_online_page(struct page *page) 248 { 249 __online_page_set_limits(page); 250 251 mutex_lock(&balloon_mutex); 252 253 __balloon_append(page); 254 255 if (balloon_stats.hotplug_pages) 256 --balloon_stats.hotplug_pages; 257 else 258 --balloon_stats.balloon_hotplug; 259 260 mutex_unlock(&balloon_mutex); 261 } 262 263 static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v) 264 { 265 if (val == MEM_ONLINE) 266 schedule_delayed_work(&balloon_worker, 0); 267 268 return NOTIFY_OK; 269 } 270 271 static struct notifier_block xen_memory_nb = { 272 .notifier_call = xen_memory_notifier, 273 .priority = 0 274 }; 275 #else 276 static long current_credit(void) 277 { 278 unsigned long target = balloon_stats.target_pages; 279 280 target = min(target, 281 balloon_stats.current_pages + 282 balloon_stats.balloon_low + 283 balloon_stats.balloon_high); 284 285 return target - balloon_stats.current_pages; 286 } 287 288 static bool balloon_is_inflated(void) 289 { 290 if (balloon_stats.balloon_low || balloon_stats.balloon_high) 291 return true; 292 else 293 return false; 294 } 295 296 static enum bp_state reserve_additional_memory(long credit) 297 { 298 balloon_stats.target_pages = balloon_stats.current_pages; 299 return BP_DONE; 300 } 301 #endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ 302 303 static enum bp_state increase_reservation(unsigned long nr_pages) 304 { 305 int rc; 306 unsigned long pfn, i; 307 struct page *page; 308 struct xen_memory_reservation reservation = { 309 .address_bits = 0, 310 .extent_order = 0, 311 .domid = DOMID_SELF 312 }; 313 314 #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG 315 if (!balloon_stats.balloon_low && !balloon_stats.balloon_high) { 316 nr_pages = min(nr_pages, balloon_stats.balloon_hotplug); 317 balloon_stats.hotplug_pages += nr_pages; 318 balloon_stats.balloon_hotplug -= nr_pages; 319 return BP_DONE; 320 } 321 #endif 322 323 if (nr_pages > ARRAY_SIZE(frame_list)) 324 nr_pages = ARRAY_SIZE(frame_list); 325 326 page = list_first_entry_or_null(&ballooned_pages, struct page, lru); 327 for (i = 0; i < nr_pages; i++) { 328 if (!page) { 329 nr_pages = i; 330 break; 331 } 332 frame_list[i] = page_to_pfn(page); 333 page = balloon_next_page(page); 334 } 335 336 set_xen_guest_handle(reservation.extent_start, frame_list); 337 reservation.nr_extents = nr_pages; 338 rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); 339 if (rc <= 0) 340 return BP_EAGAIN; 341 342 for (i = 0; i < rc; i++) { 343 page = balloon_retrieve(false); 344 BUG_ON(page == NULL); 345 346 pfn = page_to_pfn(page); 347 348 #ifdef CONFIG_XEN_HAVE_PVMMU 349 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 350 set_phys_to_machine(pfn, frame_list[i]); 351 352 /* Link back into the page tables if not highmem. */ 353 if (!PageHighMem(page)) { 354 int ret; 355 ret = HYPERVISOR_update_va_mapping( 356 (unsigned long)__va(pfn << PAGE_SHIFT), 357 mfn_pte(frame_list[i], PAGE_KERNEL), 358 0); 359 BUG_ON(ret); 360 } 361 } 362 #endif 363 364 /* Relinquish the page back to the allocator. */ 365 __free_reserved_page(page); 366 } 367 368 balloon_stats.current_pages += rc; 369 370 return BP_DONE; 371 } 372 373 static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) 374 { 375 enum bp_state state = BP_DONE; 376 unsigned long pfn, i; 377 struct page *page; 378 int ret; 379 struct xen_memory_reservation reservation = { 380 .address_bits = 0, 381 .extent_order = 0, 382 .domid = DOMID_SELF 383 }; 384 385 #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG 386 if (balloon_stats.hotplug_pages) { 387 nr_pages = min(nr_pages, balloon_stats.hotplug_pages); 388 balloon_stats.hotplug_pages -= nr_pages; 389 balloon_stats.balloon_hotplug += nr_pages; 390 return BP_DONE; 391 } 392 #endif 393 394 if (nr_pages > ARRAY_SIZE(frame_list)) 395 nr_pages = ARRAY_SIZE(frame_list); 396 397 for (i = 0; i < nr_pages; i++) { 398 page = alloc_page(gfp); 399 if (page == NULL) { 400 nr_pages = i; 401 state = BP_EAGAIN; 402 break; 403 } 404 scrub_page(page); 405 406 frame_list[i] = page_to_pfn(page); 407 } 408 409 /* 410 * Ensure that ballooned highmem pages don't have kmaps. 411 * 412 * Do this before changing the p2m as kmap_flush_unused() 413 * reads PTEs to obtain pages (and hence needs the original 414 * p2m entry). 415 */ 416 kmap_flush_unused(); 417 418 /* Update direct mapping, invalidate P2M, and add to balloon. */ 419 for (i = 0; i < nr_pages; i++) { 420 pfn = frame_list[i]; 421 frame_list[i] = pfn_to_mfn(pfn); 422 page = pfn_to_page(pfn); 423 424 #ifdef CONFIG_XEN_HAVE_PVMMU 425 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 426 if (!PageHighMem(page)) { 427 ret = HYPERVISOR_update_va_mapping( 428 (unsigned long)__va(pfn << PAGE_SHIFT), 429 __pte_ma(0), 0); 430 BUG_ON(ret); 431 } 432 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); 433 } 434 #endif 435 436 balloon_append(page); 437 } 438 439 flush_tlb_all(); 440 441 set_xen_guest_handle(reservation.extent_start, frame_list); 442 reservation.nr_extents = nr_pages; 443 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); 444 BUG_ON(ret != nr_pages); 445 446 balloon_stats.current_pages -= nr_pages; 447 448 return state; 449 } 450 451 /* 452 * We avoid multiple worker processes conflicting via the balloon mutex. 453 * We may of course race updates of the target counts (which are protected 454 * by the balloon lock), or with changes to the Xen hard limit, but we will 455 * recover from these in time. 456 */ 457 static void balloon_process(struct work_struct *work) 458 { 459 enum bp_state state = BP_DONE; 460 long credit; 461 462 mutex_lock(&balloon_mutex); 463 464 do { 465 credit = current_credit(); 466 467 if (credit > 0) { 468 if (balloon_is_inflated()) 469 state = increase_reservation(credit); 470 else 471 state = reserve_additional_memory(credit); 472 } 473 474 if (credit < 0) 475 state = decrease_reservation(-credit, GFP_BALLOON); 476 477 state = update_schedule(state); 478 479 #ifndef CONFIG_PREEMPT 480 if (need_resched()) 481 schedule(); 482 #endif 483 } while (credit && state == BP_DONE); 484 485 /* Schedule more work if there is some still to be done. */ 486 if (state == BP_EAGAIN) 487 schedule_delayed_work(&balloon_worker, balloon_stats.schedule_delay * HZ); 488 489 mutex_unlock(&balloon_mutex); 490 } 491 492 /* Resets the Xen limit, sets new target, and kicks off processing. */ 493 void balloon_set_new_target(unsigned long target) 494 { 495 /* No need for lock. Not read-modify-write updates. */ 496 balloon_stats.target_pages = target; 497 schedule_delayed_work(&balloon_worker, 0); 498 } 499 EXPORT_SYMBOL_GPL(balloon_set_new_target); 500 501 /** 502 * alloc_xenballooned_pages - get pages that have been ballooned out 503 * @nr_pages: Number of pages to get 504 * @pages: pages returned 505 * @highmem: allow highmem pages 506 * @return 0 on success, error otherwise 507 */ 508 int alloc_xenballooned_pages(int nr_pages, struct page **pages, bool highmem) 509 { 510 int pgno = 0; 511 struct page *page; 512 mutex_lock(&balloon_mutex); 513 while (pgno < nr_pages) { 514 page = balloon_retrieve(highmem); 515 if (page && (highmem || !PageHighMem(page))) { 516 pages[pgno++] = page; 517 } else { 518 enum bp_state st; 519 if (page) 520 balloon_append(page); 521 st = decrease_reservation(nr_pages - pgno, 522 highmem ? GFP_HIGHUSER : GFP_USER); 523 if (st != BP_DONE) 524 goto out_undo; 525 } 526 } 527 mutex_unlock(&balloon_mutex); 528 return 0; 529 out_undo: 530 while (pgno) 531 balloon_append(pages[--pgno]); 532 /* Free the memory back to the kernel soon */ 533 schedule_delayed_work(&balloon_worker, 0); 534 mutex_unlock(&balloon_mutex); 535 return -ENOMEM; 536 } 537 EXPORT_SYMBOL(alloc_xenballooned_pages); 538 539 /** 540 * free_xenballooned_pages - return pages retrieved with get_ballooned_pages 541 * @nr_pages: Number of pages 542 * @pages: pages to return 543 */ 544 void free_xenballooned_pages(int nr_pages, struct page **pages) 545 { 546 int i; 547 548 mutex_lock(&balloon_mutex); 549 550 for (i = 0; i < nr_pages; i++) { 551 if (pages[i]) 552 balloon_append(pages[i]); 553 } 554 555 /* The balloon may be too large now. Shrink it if needed. */ 556 if (current_credit()) 557 schedule_delayed_work(&balloon_worker, 0); 558 559 mutex_unlock(&balloon_mutex); 560 } 561 EXPORT_SYMBOL(free_xenballooned_pages); 562 563 static void __init balloon_add_region(unsigned long start_pfn, 564 unsigned long pages) 565 { 566 unsigned long pfn, extra_pfn_end; 567 struct page *page; 568 569 /* 570 * If the amount of usable memory has been limited (e.g., with 571 * the 'mem' command line parameter), don't add pages beyond 572 * this limit. 573 */ 574 extra_pfn_end = min(max_pfn, start_pfn + pages); 575 576 for (pfn = start_pfn; pfn < extra_pfn_end; pfn++) { 577 page = pfn_to_page(pfn); 578 /* totalram_pages and totalhigh_pages do not 579 include the boot-time balloon extension, so 580 don't subtract from it. */ 581 __balloon_append(page); 582 } 583 } 584 585 static int __init balloon_init(void) 586 { 587 int i; 588 589 if (!xen_domain()) 590 return -ENODEV; 591 592 pr_info("Initialising balloon driver\n"); 593 594 balloon_stats.current_pages = xen_pv_domain() 595 ? min(xen_start_info->nr_pages - xen_released_pages, max_pfn) 596 : get_num_physpages(); 597 balloon_stats.target_pages = balloon_stats.current_pages; 598 balloon_stats.balloon_low = 0; 599 balloon_stats.balloon_high = 0; 600 601 balloon_stats.schedule_delay = 1; 602 balloon_stats.max_schedule_delay = 32; 603 balloon_stats.retry_count = 1; 604 balloon_stats.max_retry_count = RETRY_UNLIMITED; 605 606 #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG 607 balloon_stats.hotplug_pages = 0; 608 balloon_stats.balloon_hotplug = 0; 609 610 set_online_page_callback(&xen_online_page); 611 register_memory_notifier(&xen_memory_nb); 612 #endif 613 614 /* 615 * Initialize the balloon with pages from the extra memory 616 * regions (see arch/x86/xen/setup.c). 617 */ 618 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) 619 if (xen_extra_mem[i].size) 620 balloon_add_region(PFN_UP(xen_extra_mem[i].start), 621 PFN_DOWN(xen_extra_mem[i].size)); 622 623 return 0; 624 } 625 626 subsys_initcall(balloon_init); 627 628 MODULE_LICENSE("GPL"); 629