/*
 * sparse memory mappings.
 */
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include "internal.h"
#include <asm/dma.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>

/*
 * Permanent SPARSEMEM data:
 *
 * 1) mem_section	- memory sections, mem_map's for valid memory
 */
#ifdef CONFIG_SPARSEMEM_EXTREME
struct mem_section *mem_section[NR_SECTION_ROOTS]
	____cacheline_internodealigned_in_smp;
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
	____cacheline_internodealigned_in_smp;
#endif
EXPORT_SYMBOL(mem_section);

#ifdef NODE_NOT_IN_PAGE_FLAGS
/*
 * If we did not store the node number in the page then we have to
 * do a lookup in the section_to_node_table in order to find which
 * node the page belongs to.
 */
#if MAX_NUMNODES <= 256
static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#else
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif

int page_to_nid(struct page *page)
{
	return section_to_node_table[page_to_section(page)];
}
EXPORT_SYMBOL(page_to_nid);

static void set_section_nid(unsigned long section_nr, int nid)
{
	section_to_node_table[section_nr] = nid;
}
#else /* !NODE_NOT_IN_PAGE_FLAGS */
static inline void set_section_nid(unsigned long section_nr, int nid)
{
}
#endif

#ifdef CONFIG_SPARSEMEM_EXTREME
static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
{
	struct mem_section *section = NULL;
	unsigned long array_size = SECTIONS_PER_ROOT *
				   sizeof(struct mem_section);

	if (slab_is_available())
		section = kmalloc_node(array_size, GFP_KERNEL, nid);
	else
		section = alloc_bootmem_node(NODE_DATA(nid), array_size);

	if (section)
		memset(section, 0, array_size);

	return section;
}

static int __meminit sparse_index_init(unsigned long section_nr, int nid)
{
	static DEFINE_SPINLOCK(index_init_lock);
	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
	struct mem_section *section;
	int ret = 0;

	if (mem_section[root])
		return -EEXIST;

	section = sparse_index_alloc(nid);
	if (!section)
		return -ENOMEM;
	/*
	 * This lock keeps two different sections from
	 * reallocating for the same index
	 */
	spin_lock(&index_init_lock);

	if (mem_section[root]) {
		ret = -EEXIST;
		goto out;
	}

	mem_section[root] = section;
out:
	spin_unlock(&index_init_lock);
	return ret;
}
#else /* !SPARSEMEM_EXTREME */
static inline int sparse_index_init(unsigned long section_nr, int nid)
{
	return 0;
}
#endif
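
/*
 * For reference, a sketch (not part of this file) of how the index built
 * above is consumed on the lookup side; the real helper lives in
 * include/linux/mmzone.h and looks roughly like:
 *
 *	static inline struct mem_section *__nr_to_section(unsigned long nr)
 *	{
 *		if (!mem_section[SECTION_NR_TO_ROOT(nr)])
 *			return NULL;
 *		return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
 *	}
 *
 * With SPARSEMEM_EXTREME each root is a separately allocated, page-sized
 * array of SECTIONS_PER_ROOT entries; without it SECTIONS_PER_ROOT is 1
 * and the two-level indexing degenerates into the flat array above.
 */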

/*
 * Although written for the SPARSEMEM_EXTREME case, this happens
 * to also work for the flat array case because
 * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
 */
int __section_nr(struct mem_section* ms)
{
	unsigned long root_nr;
	struct mem_section* root;

	for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
		root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
		if (!root)
			continue;

		if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
			break;
	}

	return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
}

/*
 * During early boot, before section_mem_map is used for an actual
 * mem_map, we use section_mem_map to store the section's NUMA
 * node.  This keeps us from having to use another data structure.  The
 * node information is cleared just before we store the real mem_map.
 */
static inline unsigned long sparse_encode_early_nid(int nid)
{
	return (nid << SECTION_NID_SHIFT);
}

static inline int sparse_early_nid(struct mem_section *section)
{
	return (section->section_mem_map >> SECTION_NID_SHIFT);
}

/* Validate the physical addressing limitations of the model */
void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
						unsigned long *end_pfn)
{
	unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);

	/*
	 * Sanity checks - do not allow an architecture to pass
	 * in larger pfns than the maximum scope of sparsemem:
	 */
	if (*start_pfn > max_sparsemem_pfn) {
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			"Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			*start_pfn, *end_pfn, max_sparsemem_pfn);
		WARN_ON_ONCE(1);
		*start_pfn = max_sparsemem_pfn;
		*end_pfn = max_sparsemem_pfn;
	} else if (*end_pfn > max_sparsemem_pfn) {
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			"End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			*start_pfn, *end_pfn, max_sparsemem_pfn);
		WARN_ON_ONCE(1);
		*end_pfn = max_sparsemem_pfn;
	}
}

/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
	unsigned long pfn;

	start &= PAGE_SECTION_MASK;
	mminit_validate_memmodel_limits(&start, &end);
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
		unsigned long section = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		sparse_index_init(section, nid);
		set_section_nid(section, nid);

		ms = __nr_to_section(section);
		if (!ms->section_mem_map)
			ms->section_mem_map = sparse_encode_early_nid(nid) |
							SECTION_MARKED_PRESENT;
	}
}

/*
 * Only used by the i386 NUMA architectures, but relatively
 * generic code.
 */
unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
						     unsigned long end_pfn)
{
	unsigned long pfn;
	unsigned long nr_pages = 0;

	mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		if (nid != early_pfn_to_nid(pfn))
			continue;

		if (pfn_present(pfn))
			nr_pages += PAGES_PER_SECTION;
	}

	return nr_pages * sizeof(struct page);
}
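
/*
 * Worked example (illustrative, not from the original source): with 4K
 * pages and SECTION_SIZE_BITS == 27 -- the x86_64 value -- a section
 * covers 128MB, so PAGES_PER_SECTION == 1 << (27 - 12) == 32768.
 * Assuming a 64-byte struct page, each present section then contributes
 * 32768 * 64 bytes == 2MB of mem_map, which is the figure the allocation
 * comment in sparse_init() below relies on.
 */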

/*
 * Subtle: we encode the real pfn into the mem_map pointer such that the
 * identity page - section_mem_map yields the actual physical page frame
 * number.
 */
static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
{
	return (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
}

/*
 * Decode mem_map from the coded memmap
 */
struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
{
	/* mask off the extra low bits of information */
	coded_mem_map &= SECTION_MAP_MASK;
	return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}
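
/*
 * Sketch (not part of this file) of why the encoding above is convenient:
 * because the stored value is mem_map minus the section's first pfn, the
 * sparsemem pfn_to_page()/page_to_pfn() helpers in
 * include/asm-generic/memory_model.h can work with the absolute pfn,
 * roughly:
 *
 *	struct mem_section *ms = __pfn_to_section(pfn);
 *	struct page *page = __section_mem_map_addr(ms) + pfn;    (pfn_to_page)
 *	unsigned long pfn = page - __section_mem_map_addr(ms);   (page_to_pfn)
 *
 * with no per-section base pfn to add or subtract.
 */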

static int __meminit sparse_init_one_section(struct mem_section *ms,
		unsigned long pnum, struct page *mem_map,
		unsigned long *pageblock_bitmap)
{
	if (!present_section(ms))
		return -EINVAL;

	ms->section_mem_map &= ~SECTION_MAP_MASK;
	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
							SECTION_HAS_MEM_MAP;
	ms->pageblock_flags = pageblock_bitmap;

	return 1;
}

unsigned long usemap_size(void)
{
	unsigned long size_bytes;
	size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8;
	size_bytes = roundup(size_bytes, sizeof(unsigned long));
	return size_bytes;
}

#ifdef CONFIG_MEMORY_HOTPLUG
static unsigned long *__kmalloc_section_usemap(void)
{
	return kmalloc(usemap_size(), GFP_KERNEL);
}
#endif /* CONFIG_MEMORY_HOTPLUG */

#ifdef CONFIG_MEMORY_HOTREMOVE
static unsigned long * __init
sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
{
	unsigned long section_nr;

	/*
	 * A page may contain usemaps for other sections, preventing the
	 * page from being freed and making a section unremovable while
	 * other sections referencing the usemap remain active.  Similarly,
	 * a pgdat can prevent a section being removed.  If section A
	 * contains a pgdat and section B contains the usemap, both
	 * sections become inter-dependent.  This allocates usemaps
	 * from the same section as the pgdat where possible to avoid
	 * this problem.
	 */
	section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
	return alloc_bootmem_section(usemap_size(), section_nr);
}
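
/*
 * A concrete (hypothetical) illustration of the dependency described in
 * sparse_early_usemap_alloc_pgdat_section() above: suppose node 1's pgdat
 * sits in section 100.  If section 100's usemap were allocated from memory
 * in section 200, section 100 could not be hot-removed before section 200
 * (its usemap lives there), while section 200 could not be removed before
 * section 100 (the node's pgdat lives there) -- a cycle that makes both
 * sections unremovable.  Allocating usemaps from the pgdat's own section
 * leaves only the one-way dependency that the pgdat already imposes, which
 * check_usemap_section_nr() below verifies and reports on.
 */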

static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
{
	unsigned long usemap_snr, pgdat_snr;
	static unsigned long old_usemap_snr = NR_MEM_SECTIONS;
	static unsigned long old_pgdat_snr = NR_MEM_SECTIONS;
	struct pglist_data *pgdat = NODE_DATA(nid);
	int usemap_nid;

	usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
	pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
	if (usemap_snr == pgdat_snr)
		return;

	if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
		/* skip redundant message */
		return;

	old_usemap_snr = usemap_snr;
	old_pgdat_snr = pgdat_snr;

	usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
	if (usemap_nid != nid) {
		printk(KERN_INFO
		       "node %d must be removed before remove section %ld\n",
		       nid, usemap_snr);
		return;
	}
	/*
	 * There is a circular dependency.
	 * Some platforms allow un-removable sections because they will just
	 * gather other removable sections for dynamic partitioning.
	 * Just report the un-removable sections' numbers here.
	 */
	printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr,
	       pgdat_snr, nid);
	printk(KERN_CONT
	       " have a circular dependency on usemap and pgdat allocations\n");
}
#else
static unsigned long * __init
sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
{
	return NULL;
}

static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
{
	unsigned long *usemap;
	struct mem_section *ms = __nr_to_section(pnum);
	int nid = sparse_early_nid(ms);

	usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
	if (usemap)
		return usemap;

	usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
	if (usemap) {
		check_usemap_section_nr(nid, usemap);
		return usemap;
	}

	/* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */
	nid = 0;

	printk(KERN_WARNING "%s: allocation failed\n", __func__);
	return NULL;
}

#ifndef CONFIG_SPARSEMEM_VMEMMAP
struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
{
	struct page *map;

	map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
	if (map)
		return map;

	map = alloc_bootmem_pages_node(NODE_DATA(nid),
		       PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION));
	return map;
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
{
	struct page *map;
	struct mem_section *ms = __nr_to_section(pnum);
	int nid = sparse_early_nid(ms);

	map = sparse_mem_map_populate(pnum, nid);
	if (map)
		return map;

	printk(KERN_ERR "%s: sparsemem memory map backing failed "
			"some memory will not be available.\n", __func__);
	ms->section_mem_map = 0;
	return NULL;
}

void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
{
}

/*
 * Allocate the accumulated non-linear sections, allocate a mem_map
 * for each and record the physical to section mapping.
 */
void __init sparse_init(void)
{
	unsigned long pnum;
	struct page *map;
	unsigned long *usemap;
	unsigned long **usemap_map;
	int size;

	/*
	 * The mem_map is allocated as a big page (2M on 64-bit x86) while
	 * the usemap is much smaller than a page (around 24 bytes).  If the
	 * two allocations are interleaved, each small usemap pushes the next
	 * 2M-aligned mem_map allocation to the following 2M boundary, so big
	 * systems end up with a lot of holes.  Allocate all of the usemaps
	 * first and then the mem_maps, keeping the 2M allocations contiguous.
	 *
	 * powerpc also needs to call sparse_init_one_section() right after
	 * each sparse_early_mem_map_alloc(), so usemap_map has to be
	 * allocated first anyway.
	 */
	size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
	usemap_map = alloc_bootmem(size);
	if (!usemap_map)
		panic("can not allocate usemap_map\n");

	for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
		if (!present_section_nr(pnum))
			continue;
		usemap_map[pnum] = sparse_early_usemap_alloc(pnum);
	}

	for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
		if (!present_section_nr(pnum))
			continue;

		usemap = usemap_map[pnum];
		if (!usemap)
			continue;

		map = sparse_early_mem_map_alloc(pnum);
		if (!map)
			continue;

		sparse_init_one_section(__nr_to_section(pnum), pnum, map,
								usemap);
	}

	vmemmap_populate_print_last();

	free_bootmem(__pa(usemap_map), size);
}
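
/*
 * Boot-time flow, sketched for orientation (hedged; the exact call sites
 * live in architecture code and may differ between arches in this tree):
 *
 *	paging_init()                                    [arch code]
 *	    sparse_memory_present_with_active_regions()  [mm/page_alloc.c]
 *	        memory_present(nid, start_pfn, end_pfn)  marks sections present
 *	    sparse_init()                                allocates usemap and
 *	                                                 mem_map for each
 *	                                                 present section
 */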

#ifdef CONFIG_MEMORY_HOTPLUG
#ifdef CONFIG_SPARSEMEM_VMEMMAP
static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
						  unsigned long nr_pages)
{
	/* This will make the necessary allocations eventually. */
	return sparse_mem_map_populate(pnum, nid);
}
static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
{
	return; /* XXX: Not implemented yet */
}
static void free_map_bootmem(struct page *page, unsigned long nr_pages)
{
}
#else
static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
{
	struct page *page, *ret;
	unsigned long memmap_size = sizeof(struct page) * nr_pages;

	page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
	if (page)
		goto got_map_page;

	ret = vmalloc(memmap_size);
	if (ret)
		goto got_map_ptr;

	return NULL;
got_map_page:
	ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
got_map_ptr:
	memset(ret, 0, memmap_size);

	return ret;
}

static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
						  unsigned long nr_pages)
{
	return __kmalloc_section_memmap(nr_pages);
}

static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
{
	if (is_vmalloc_addr(memmap))
		vfree(memmap);
	else
		free_pages((unsigned long)memmap,
			   get_order(sizeof(struct page) * nr_pages));
}

static void free_map_bootmem(struct page *page, unsigned long nr_pages)
{
	unsigned long maps_section_nr, removing_section_nr, i;
	int magic;

	for (i = 0; i < nr_pages; i++, page++) {
		magic = atomic_read(&page->_mapcount);

		BUG_ON(magic == NODE_INFO);

		maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
		removing_section_nr = page->private;

		/*
		 * When this function is called, the section being removed
		 * has already been logically offlined, so all of its pages
		 * are isolated from the page allocator.  If the memmap of
		 * the section being removed lives in that same section, it
		 * must not be freed here: the page allocator could hand it
		 * out again even though the memory is about to be removed
		 * physically.
		 */
		if (maps_section_nr != removing_section_nr)
			put_page_bootmem(page);
	}
}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

static void free_section_usemap(struct page *memmap, unsigned long *usemap)
{
	struct page *usemap_page;
	unsigned long nr_pages;

	if (!usemap)
		return;

	usemap_page = virt_to_page(usemap);
	/*
	 * Check to see if the allocation came from hot-plug-add
	 */
	if (PageSlab(usemap_page)) {
		kfree(usemap);
		if (memmap)
			__kfree_section_memmap(memmap, PAGES_PER_SECTION);
		return;
	}

	/*
	 * The usemap came from bootmem.  It is packed with other usemaps
	 * in the section that holds the pgdat, so just leave it in place.
	 */

	if (memmap) {
		struct page *memmap_page;
		memmap_page = virt_to_page(memmap);

		nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
			>> PAGE_SHIFT;

		free_map_bootmem(memmap_page, nr_pages);
	}
}

/*
 * Returns the number of sections whose mem_maps were properly set.
 * If this is <= 0, the passed-in map was not consumed and must be
 * freed.
 */
int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
				     int nr_pages)
{
	unsigned long section_nr = pfn_to_section_nr(start_pfn);
	struct pglist_data *pgdat = zone->zone_pgdat;
	struct mem_section *ms;
	struct page *memmap;
	unsigned long *usemap;
	unsigned long flags;
	int ret;

	/*
	 * No locking for this call: sparse_index_init() does its own
	 * locking, and it does a kmalloc.
	 */
	ret = sparse_index_init(section_nr, pgdat->node_id);
	if (ret < 0 && ret != -EEXIST)
		return ret;
	memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages);
	if (!memmap)
		return -ENOMEM;
	usemap = __kmalloc_section_usemap();
	if (!usemap) {
		__kfree_section_memmap(memmap, nr_pages);
		return -ENOMEM;
	}

	pgdat_resize_lock(pgdat, &flags);

	ms = __pfn_to_section(start_pfn);
	if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
		ret = -EEXIST;
		goto out;
	}

	ms->section_mem_map |= SECTION_MARKED_PRESENT;

	ret = sparse_init_one_section(ms, section_nr, memmap, usemap);

out:
	pgdat_resize_unlock(pgdat, &flags);
	if (ret <= 0) {
		kfree(usemap);
		__kfree_section_memmap(memmap, nr_pages);
	}
	return ret;
}

void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
{
	struct page *memmap = NULL;
	unsigned long *usemap = NULL;

	if (ms->section_mem_map) {
		usemap = ms->pageblock_flags;
		memmap = sparse_decode_mem_map(ms->section_mem_map,
						__section_nr(ms));
		ms->section_mem_map = 0;
		ms->pageblock_flags = NULL;
	}

	free_section_usemap(memmap, usemap);
}
#endif
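
/*
 * Usage sketch for the hotplug entry points above (hedged; the real caller
 * is in mm/memory_hotplug.c and may differ slightly in this tree).  Adding
 * memory proceeds one section at a time, roughly:
 *
 *	if (pfn_valid(phys_start_pfn))
 *		return -EEXIST;
 *	ret = sparse_add_one_section(zone, phys_start_pfn, PAGES_PER_SECTION);
 *
 * A positive return means the section's mem_map and usemap are in place;
 * the pages themselves are onlined later through the memory-block
 * interface.
 */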