/*
 * mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009		SUSE Linux Products GmbH
 * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2.
 *
 * This is the percpu allocator which can handle both static and
 * dynamic areas.  Percpu areas are allocated in chunks.  Each chunk
 * consists of a boot-time determined number of units and the first
 * chunk is used for static percpu variables in the kernel image
 * (special boot time alloc/init handling is necessary as these areas
 * need to be brought up before allocation services are running).
 * Units grow as necessary and all units grow or shrink in unison.
 * When a chunk is filled up, another chunk is allocated.
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done in offset-size areas of a single unit space.  Ie,
 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
 * c1:u1, c1:u2 and c1:u3.  On UMA, units correspond directly to
 * cpus.  On NUMA, the mapping can be non-linear and even sparse.
 * Percpu access can be done by configuring percpu base registers
 * according to the cpu to unit mapping and pcpu_unit_size.
 *
 * There are usually many small percpu allocations, many of them being
 * as small as 4 bytes.  The allocator organizes chunks into lists
 * according to free size and tries to allocate from the fullest one.
 * Each chunk keeps a maximum contiguous area size hint which is
 * guaranteed to be equal to or larger than the maximum contiguous
 * area in the chunk.  This helps the allocator not to iterate the
 * chunk maps unnecessarily.
 *
 * Allocation state in each chunk is kept using an array of integers
 * on chunk->map.  Each map entry records the byte offset at which an
 * area starts; bit 0 of the entry is set if that area is allocated
 * and clear if it is free.  Allocation inside a chunk is done by
 * scanning this map sequentially and serving the first matching
 * entry.  This is mostly copied from the percpu_modalloc() allocator.
 * Chunks can be determined from the address using the index field
 * in the page struct.  The index field contains a pointer to the chunk.
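 *
 * For example (illustrative values only), a unit whose first 16 bytes
 * are allocated and whose remaining space up to pcpu_unit_size is free
 * would be described by:
 *
 *	map[] = { 0 | 1, 16, pcpu_unit_size | 1 };	map_used = 2
 *
 * i.e. an allocated area at offset 0, a free area at offset 16 and a
 * terminating in-use marker at the unit end.  pcpu_alloc_chunk() below
 * starts a fresh map as { 0, pcpu_unit_size | 1 } with map_used == 1,
 * a single free area covering the whole unit.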
45 * 46 * To use this allocator, arch code should do the following: 47 * 48 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate 49 * regular address to percpu pointer and back if they need to be 50 * different from the default 51 * 52 * - use pcpu_setup_first_chunk() during percpu area initialization to 53 * setup the first chunk containing the kernel static percpu area 54 */ 55 56 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 57 58 #include <linux/bitmap.h> 59 #include <linux/bootmem.h> 60 #include <linux/err.h> 61 #include <linux/list.h> 62 #include <linux/log2.h> 63 #include <linux/mm.h> 64 #include <linux/module.h> 65 #include <linux/mutex.h> 66 #include <linux/percpu.h> 67 #include <linux/pfn.h> 68 #include <linux/slab.h> 69 #include <linux/spinlock.h> 70 #include <linux/vmalloc.h> 71 #include <linux/workqueue.h> 72 #include <linux/kmemleak.h> 73 74 #include <asm/cacheflush.h> 75 #include <asm/sections.h> 76 #include <asm/tlbflush.h> 77 #include <asm/io.h> 78 79 #define CREATE_TRACE_POINTS 80 #include <trace/events/percpu.h> 81 82 #include "percpu-internal.h" 83 84 #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ 85 #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ 86 #define PCPU_ATOMIC_MAP_MARGIN_LOW 32 87 #define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 88 #define PCPU_EMPTY_POP_PAGES_LOW 2 89 #define PCPU_EMPTY_POP_PAGES_HIGH 4 90 91 #ifdef CONFIG_SMP 92 /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ 93 #ifndef __addr_to_pcpu_ptr 94 #define __addr_to_pcpu_ptr(addr) \ 95 (void __percpu *)((unsigned long)(addr) - \ 96 (unsigned long)pcpu_base_addr + \ 97 (unsigned long)__per_cpu_start) 98 #endif 99 #ifndef __pcpu_ptr_to_addr 100 #define __pcpu_ptr_to_addr(ptr) \ 101 (void __force *)((unsigned long)(ptr) + \ 102 (unsigned long)pcpu_base_addr - \ 103 (unsigned long)__per_cpu_start) 104 #endif 105 #else /* CONFIG_SMP */ 106 /* on UP, it's always identity mapped */ 107 #define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr) 108 #define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr) 109 #endif /* CONFIG_SMP */ 110 111 static int pcpu_unit_pages __ro_after_init; 112 static int pcpu_unit_size __ro_after_init; 113 static int pcpu_nr_units __ro_after_init; 114 static int pcpu_atom_size __ro_after_init; 115 int pcpu_nr_slots __ro_after_init; 116 static size_t pcpu_chunk_struct_size __ro_after_init; 117 118 /* cpus with the lowest and highest unit addresses */ 119 static unsigned int pcpu_low_unit_cpu __ro_after_init; 120 static unsigned int pcpu_high_unit_cpu __ro_after_init; 121 122 /* the address of the first chunk which starts with the kernel static area */ 123 void *pcpu_base_addr __ro_after_init; 124 EXPORT_SYMBOL_GPL(pcpu_base_addr); 125 126 static const int *pcpu_unit_map __ro_after_init; /* cpu -> unit */ 127 const unsigned long *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */ 128 129 /* group information, used for vm allocation */ 130 static int pcpu_nr_groups __ro_after_init; 131 static const unsigned long *pcpu_group_offsets __ro_after_init; 132 static const size_t *pcpu_group_sizes __ro_after_init; 133 134 /* 135 * The first chunk which always exists. Note that unlike other 136 * chunks, this one can be allocated and mapped in several different 137 * ways and thus often doesn't live in the vmalloc area. 138 */ 139 struct pcpu_chunk *pcpu_first_chunk __ro_after_init; 140 141 /* 142 * Optional reserved chunk. This chunk reserves part of the first 143 * chunk and serves it for reserved allocations. 
The amount of 144 * reserved offset is in pcpu_reserved_chunk_limit. When reserved 145 * area doesn't exist, the following variables contain NULL and 0 146 * respectively. 147 */ 148 struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init; 149 static int pcpu_reserved_chunk_limit __ro_after_init; 150 151 DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ 152 static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ 153 154 struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */ 155 156 /* chunks which need their map areas extended, protected by pcpu_lock */ 157 static LIST_HEAD(pcpu_map_extend_chunks); 158 159 /* 160 * The number of empty populated pages, protected by pcpu_lock. The 161 * reserved chunk doesn't contribute to the count. 162 */ 163 static int pcpu_nr_empty_pop_pages; 164 165 /* 166 * Balance work is used to populate or destroy chunks asynchronously. We 167 * try to keep the number of populated free pages between 168 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one 169 * empty chunk. 170 */ 171 static void pcpu_balance_workfn(struct work_struct *work); 172 static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); 173 static bool pcpu_async_enabled __read_mostly; 174 static bool pcpu_atomic_alloc_failed; 175 176 static void pcpu_schedule_balance_work(void) 177 { 178 if (pcpu_async_enabled) 179 schedule_work(&pcpu_balance_work); 180 } 181 182 static bool pcpu_addr_in_first_chunk(void *addr) 183 { 184 void *first_start = pcpu_first_chunk->base_addr; 185 186 return addr >= first_start && addr < first_start + pcpu_unit_size; 187 } 188 189 static bool pcpu_addr_in_reserved_chunk(void *addr) 190 { 191 void *first_start = pcpu_first_chunk->base_addr; 192 193 return addr >= first_start && 194 addr < first_start + pcpu_reserved_chunk_limit; 195 } 196 197 static int __pcpu_size_to_slot(int size) 198 { 199 int highbit = fls(size); /* size is in bytes */ 200 return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); 201 } 202 203 static int pcpu_size_to_slot(int size) 204 { 205 if (size == pcpu_unit_size) 206 return pcpu_nr_slots - 1; 207 return __pcpu_size_to_slot(size); 208 } 209 210 static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) 211 { 212 if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) 213 return 0; 214 215 return pcpu_size_to_slot(chunk->free_size); 216 } 217 218 /* set the pointer to a chunk in a page struct */ 219 static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) 220 { 221 page->index = (unsigned long)pcpu; 222 } 223 224 /* obtain pointer to a chunk from a page struct */ 225 static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) 226 { 227 return (struct pcpu_chunk *)page->index; 228 } 229 230 static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx) 231 { 232 return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; 233 } 234 235 static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, 236 unsigned int cpu, int page_idx) 237 { 238 return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] + 239 (page_idx << PAGE_SHIFT); 240 } 241 242 static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk, 243 int *rs, int *re, int end) 244 { 245 *rs = find_next_zero_bit(chunk->populated, end, *rs); 246 *re = find_next_bit(chunk->populated, end, *rs + 1); 247 } 248 249 static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, 250 int *rs, int *re, int end) 251 { 252 *rs = find_next_bit(chunk->populated, end, *rs); 
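        /*
         * Counterpart of pcpu_next_unpop() above: together with the
         * find_next_zero_bit() lookup below, this leaves [*rs, *re)
         * spanning the next run of populated pages at or after the
         * incoming *rs (*rs == *re == @end when no such run exists),
         * which is what pcpu_for_each_pop_region() below iterates over.
         */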
253 *re = find_next_zero_bit(chunk->populated, end, *rs + 1); 254 } 255 256 /* 257 * (Un)populated page region iterators. Iterate over (un)populated 258 * page regions between @start and @end in @chunk. @rs and @re should 259 * be integer variables and will be set to start and end page index of 260 * the current region. 261 */ 262 #define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \ 263 for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \ 264 (rs) < (re); \ 265 (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end))) 266 267 #define pcpu_for_each_pop_region(chunk, rs, re, start, end) \ 268 for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \ 269 (rs) < (re); \ 270 (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end))) 271 272 /** 273 * pcpu_mem_zalloc - allocate memory 274 * @size: bytes to allocate 275 * 276 * Allocate @size bytes. If @size is smaller than PAGE_SIZE, 277 * kzalloc() is used; otherwise, vzalloc() is used. The returned 278 * memory is always zeroed. 279 * 280 * CONTEXT: 281 * Does GFP_KERNEL allocation. 282 * 283 * RETURNS: 284 * Pointer to the allocated area on success, NULL on failure. 285 */ 286 static void *pcpu_mem_zalloc(size_t size) 287 { 288 if (WARN_ON_ONCE(!slab_is_available())) 289 return NULL; 290 291 if (size <= PAGE_SIZE) 292 return kzalloc(size, GFP_KERNEL); 293 else 294 return vzalloc(size); 295 } 296 297 /** 298 * pcpu_mem_free - free memory 299 * @ptr: memory to free 300 * 301 * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc(). 302 */ 303 static void pcpu_mem_free(void *ptr) 304 { 305 kvfree(ptr); 306 } 307 308 /** 309 * pcpu_count_occupied_pages - count the number of pages an area occupies 310 * @chunk: chunk of interest 311 * @i: index of the area in question 312 * 313 * Count the number of pages chunk's @i'th area occupies. When the area's 314 * start and/or end address isn't aligned to page boundary, the straddled 315 * page is included in the count iff the rest of the page is free. 316 */ 317 static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i) 318 { 319 int off = chunk->map[i] & ~1; 320 int end = chunk->map[i + 1] & ~1; 321 322 if (!PAGE_ALIGNED(off) && i > 0) { 323 int prev = chunk->map[i - 1]; 324 325 if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE)) 326 off = round_down(off, PAGE_SIZE); 327 } 328 329 if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) { 330 int next = chunk->map[i + 1]; 331 int nend = chunk->map[i + 2] & ~1; 332 333 if (!(next & 1) && nend >= round_up(end, PAGE_SIZE)) 334 end = round_up(end, PAGE_SIZE); 335 } 336 337 return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0); 338 } 339 340 /** 341 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot 342 * @chunk: chunk of interest 343 * @oslot: the previous slot it was on 344 * 345 * This function is called after an allocation or free changed @chunk. 346 * New slot according to the changed state is determined and @chunk is 347 * moved to the slot. Note that the reserved chunk is never put on 348 * chunk slots. 349 * 350 * CONTEXT: 351 * pcpu_lock. 
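 *
 * Illustrative example (assuming PCPU_SLOT_BASE_SHIFT == 5 and enough
 * slots): a chunk whose free_size drops from 1024 to 96 bytes moves
 * from slot __pcpu_size_to_slot(1024) == 8 down to
 * __pcpu_size_to_slot(96) == 4, so it is found via a smaller size
 * class on later allocations.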
 */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
        int nslot = pcpu_chunk_slot(chunk);

        if (chunk != pcpu_reserved_chunk && oslot != nslot) {
                if (oslot < nslot)
                        list_move(&chunk->list, &pcpu_slot[nslot]);
                else
                        list_move_tail(&chunk->list, &pcpu_slot[nslot]);
        }
}

/**
 * pcpu_need_to_extend - determine whether chunk area map needs to be extended
 * @chunk: chunk of interest
 * @is_atomic: the allocation context
 *
 * Determine whether area map of @chunk needs to be extended.  If
 * @is_atomic, only the amount necessary for a new allocation is
 * considered; however, async extension is scheduled if the remaining
 * room is low.  If !@is_atomic, it aims for more empty space.  Combined,
 * this ensures that the map is likely to have enough available space to
 * accommodate atomic allocations which can't extend maps directly.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * New target map allocation length if extension is necessary, 0
 * otherwise.
 */
static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
{
        int margin, new_alloc;

        lockdep_assert_held(&pcpu_lock);

        if (is_atomic) {
                margin = 3;

                if (chunk->map_alloc <
                    chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
                        if (list_empty(&chunk->map_extend_list)) {
                                list_add_tail(&chunk->map_extend_list,
                                              &pcpu_map_extend_chunks);
                                pcpu_schedule_balance_work();
                        }
                }
        } else {
                margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
        }

        if (chunk->map_alloc >= chunk->map_used + margin)
                return 0;

        new_alloc = PCPU_DFL_MAP_ALLOC;
        while (new_alloc < chunk->map_used + margin)
                new_alloc *= 2;

        return new_alloc;
}

/**
 * pcpu_extend_area_map - extend area map of a chunk
 * @chunk: chunk of interest
 * @new_alloc: new target allocation length of the area map
 *
 * Extend area map of @chunk to have @new_alloc entries.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.  Grabs and releases pcpu_lock.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
{
        int *old = NULL, *new = NULL;
        size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
        unsigned long flags;

        lockdep_assert_held(&pcpu_alloc_mutex);

        new = pcpu_mem_zalloc(new_size);
        if (!new)
                return -ENOMEM;

        /* acquire pcpu_lock and switch to new area map */
        spin_lock_irqsave(&pcpu_lock, flags);

        if (new_alloc <= chunk->map_alloc)
                goto out_unlock;

        old_size = chunk->map_alloc * sizeof(chunk->map[0]);
        old = chunk->map;

        memcpy(new, old, old_size);

        chunk->map_alloc = new_alloc;
        chunk->map = new;
        new = NULL;

out_unlock:
        spin_unlock_irqrestore(&pcpu_lock, flags);

        /*
         * pcpu_mem_free() might end up calling vfree() which uses
         * IRQ-unsafe lock and thus can't be called under pcpu_lock.
461 */ 462 pcpu_mem_free(old); 463 pcpu_mem_free(new); 464 465 return 0; 466 } 467 468 /** 469 * pcpu_fit_in_area - try to fit the requested allocation in a candidate area 470 * @chunk: chunk the candidate area belongs to 471 * @off: the offset to the start of the candidate area 472 * @this_size: the size of the candidate area 473 * @size: the size of the target allocation 474 * @align: the alignment of the target allocation 475 * @pop_only: only allocate from already populated region 476 * 477 * We're trying to allocate @size bytes aligned at @align. @chunk's area 478 * at @off sized @this_size is a candidate. This function determines 479 * whether the target allocation fits in the candidate area and returns the 480 * number of bytes to pad after @off. If the target area doesn't fit, -1 481 * is returned. 482 * 483 * If @pop_only is %true, this function only considers the already 484 * populated part of the candidate area. 485 */ 486 static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, 487 int size, int align, bool pop_only) 488 { 489 int cand_off = off; 490 491 while (true) { 492 int head = ALIGN(cand_off, align) - off; 493 int page_start, page_end, rs, re; 494 495 if (this_size < head + size) 496 return -1; 497 498 if (!pop_only) 499 return head; 500 501 /* 502 * If the first unpopulated page is beyond the end of the 503 * allocation, the whole allocation is populated; 504 * otherwise, retry from the end of the unpopulated area. 505 */ 506 page_start = PFN_DOWN(head + off); 507 page_end = PFN_UP(head + off + size); 508 509 rs = page_start; 510 pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size)); 511 if (rs >= page_end) 512 return head; 513 cand_off = re * PAGE_SIZE; 514 } 515 } 516 517 /** 518 * pcpu_alloc_area - allocate area from a pcpu_chunk 519 * @chunk: chunk of interest 520 * @size: wanted size in bytes 521 * @align: wanted align 522 * @pop_only: allocate only from the populated area 523 * @occ_pages_p: out param for the number of pages the area occupies 524 * 525 * Try to allocate @size bytes area aligned at @align from @chunk. 526 * Note that this function only allocates the offset. It doesn't 527 * populate or map the area. 528 * 529 * @chunk->map must have at least two free slots. 530 * 531 * CONTEXT: 532 * pcpu_lock. 533 * 534 * RETURNS: 535 * Allocated offset in @chunk on success, -1 if no matching area is 536 * found. 537 */ 538 static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, 539 bool pop_only, int *occ_pages_p) 540 { 541 int oslot = pcpu_chunk_slot(chunk); 542 int max_contig = 0; 543 int i, off; 544 bool seen_free = false; 545 int *p; 546 547 for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) { 548 int head, tail; 549 int this_size; 550 551 off = *p; 552 if (off & 1) 553 continue; 554 555 this_size = (p[1] & ~1) - off; 556 557 head = pcpu_fit_in_area(chunk, off, this_size, size, align, 558 pop_only); 559 if (head < 0) { 560 if (!seen_free) { 561 chunk->first_free = i; 562 seen_free = true; 563 } 564 max_contig = max(this_size, max_contig); 565 continue; 566 } 567 568 /* 569 * If head is small or the previous block is free, 570 * merge'em. Note that 'small' is defined as smaller 571 * than sizeof(int), which is very small but isn't too 572 * uncommon for percpu allocations. 
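                 * For example (illustrative numbers), a 16-byte allocation
                 * with 8-byte alignment carved from a free area starting at
                 * offset 6 leaves head = 2; rather than recording a useless
                 * 2-byte free block, those two bytes are handed to the
                 * previous area (shrinking free_size when that neighbour is
                 * allocated).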
573 */ 574 if (head && (head < sizeof(int) || !(p[-1] & 1))) { 575 *p = off += head; 576 if (p[-1] & 1) 577 chunk->free_size -= head; 578 else 579 max_contig = max(*p - p[-1], max_contig); 580 this_size -= head; 581 head = 0; 582 } 583 584 /* if tail is small, just keep it around */ 585 tail = this_size - head - size; 586 if (tail < sizeof(int)) { 587 tail = 0; 588 size = this_size - head; 589 } 590 591 /* split if warranted */ 592 if (head || tail) { 593 int nr_extra = !!head + !!tail; 594 595 /* insert new subblocks */ 596 memmove(p + nr_extra + 1, p + 1, 597 sizeof(chunk->map[0]) * (chunk->map_used - i)); 598 chunk->map_used += nr_extra; 599 600 if (head) { 601 if (!seen_free) { 602 chunk->first_free = i; 603 seen_free = true; 604 } 605 *++p = off += head; 606 ++i; 607 max_contig = max(head, max_contig); 608 } 609 if (tail) { 610 p[1] = off + size; 611 max_contig = max(tail, max_contig); 612 } 613 } 614 615 if (!seen_free) 616 chunk->first_free = i + 1; 617 618 /* update hint and mark allocated */ 619 if (i + 1 == chunk->map_used) 620 chunk->contig_hint = max_contig; /* fully scanned */ 621 else 622 chunk->contig_hint = max(chunk->contig_hint, 623 max_contig); 624 625 chunk->free_size -= size; 626 *p |= 1; 627 628 *occ_pages_p = pcpu_count_occupied_pages(chunk, i); 629 pcpu_chunk_relocate(chunk, oslot); 630 return off; 631 } 632 633 chunk->contig_hint = max_contig; /* fully scanned */ 634 pcpu_chunk_relocate(chunk, oslot); 635 636 /* tell the upper layer that this chunk has no matching area */ 637 return -1; 638 } 639 640 /** 641 * pcpu_free_area - free area to a pcpu_chunk 642 * @chunk: chunk of interest 643 * @freeme: offset of area to free 644 * @occ_pages_p: out param for the number of pages the area occupies 645 * 646 * Free area starting from @freeme to @chunk. Note that this function 647 * only modifies the allocation map. It doesn't depopulate or unmap 648 * the area. 649 * 650 * CONTEXT: 651 * pcpu_lock. 652 */ 653 static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, 654 int *occ_pages_p) 655 { 656 int oslot = pcpu_chunk_slot(chunk); 657 int off = 0; 658 unsigned i, j; 659 int to_free = 0; 660 int *p; 661 662 lockdep_assert_held(&pcpu_lock); 663 pcpu_stats_area_dealloc(chunk); 664 665 freeme |= 1; /* we are searching for <given offset, in use> pair */ 666 667 i = 0; 668 j = chunk->map_used; 669 while (i != j) { 670 unsigned k = (i + j) / 2; 671 off = chunk->map[k]; 672 if (off < freeme) 673 i = k + 1; 674 else if (off > freeme) 675 j = k; 676 else 677 i = j = k; 678 } 679 BUG_ON(off != freeme); 680 681 if (i < chunk->first_free) 682 chunk->first_free = i; 683 684 p = chunk->map + i; 685 *p = off &= ~1; 686 chunk->free_size += (p[1] & ~1) - off; 687 688 *occ_pages_p = pcpu_count_occupied_pages(chunk, i); 689 690 /* merge with next? */ 691 if (!(p[1] & 1)) 692 to_free++; 693 /* merge with previous? 
*/ 694 if (i > 0 && !(p[-1] & 1)) { 695 to_free++; 696 i--; 697 p--; 698 } 699 if (to_free) { 700 chunk->map_used -= to_free; 701 memmove(p + 1, p + 1 + to_free, 702 (chunk->map_used - i) * sizeof(chunk->map[0])); 703 } 704 705 chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint); 706 pcpu_chunk_relocate(chunk, oslot); 707 } 708 709 static struct pcpu_chunk *pcpu_alloc_chunk(void) 710 { 711 struct pcpu_chunk *chunk; 712 713 chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size); 714 if (!chunk) 715 return NULL; 716 717 chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * 718 sizeof(chunk->map[0])); 719 if (!chunk->map) { 720 pcpu_mem_free(chunk); 721 return NULL; 722 } 723 724 chunk->map_alloc = PCPU_DFL_MAP_ALLOC; 725 chunk->map[0] = 0; 726 chunk->map[1] = pcpu_unit_size | 1; 727 chunk->map_used = 1; 728 chunk->has_reserved = false; 729 730 INIT_LIST_HEAD(&chunk->list); 731 INIT_LIST_HEAD(&chunk->map_extend_list); 732 chunk->free_size = pcpu_unit_size; 733 chunk->contig_hint = pcpu_unit_size; 734 735 return chunk; 736 } 737 738 static void pcpu_free_chunk(struct pcpu_chunk *chunk) 739 { 740 if (!chunk) 741 return; 742 pcpu_mem_free(chunk->map); 743 pcpu_mem_free(chunk); 744 } 745 746 /** 747 * pcpu_chunk_populated - post-population bookkeeping 748 * @chunk: pcpu_chunk which got populated 749 * @page_start: the start page 750 * @page_end: the end page 751 * 752 * Pages in [@page_start,@page_end) have been populated to @chunk. Update 753 * the bookkeeping information accordingly. Must be called after each 754 * successful population. 755 */ 756 static void pcpu_chunk_populated(struct pcpu_chunk *chunk, 757 int page_start, int page_end) 758 { 759 int nr = page_end - page_start; 760 761 lockdep_assert_held(&pcpu_lock); 762 763 bitmap_set(chunk->populated, page_start, nr); 764 chunk->nr_populated += nr; 765 pcpu_nr_empty_pop_pages += nr; 766 } 767 768 /** 769 * pcpu_chunk_depopulated - post-depopulation bookkeeping 770 * @chunk: pcpu_chunk which got depopulated 771 * @page_start: the start page 772 * @page_end: the end page 773 * 774 * Pages in [@page_start,@page_end) have been depopulated from @chunk. 775 * Update the bookkeeping information accordingly. Must be called after 776 * each successful depopulation. 777 */ 778 static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, 779 int page_start, int page_end) 780 { 781 int nr = page_end - page_start; 782 783 lockdep_assert_held(&pcpu_lock); 784 785 bitmap_clear(chunk->populated, page_start, nr); 786 chunk->nr_populated -= nr; 787 pcpu_nr_empty_pop_pages -= nr; 788 } 789 790 /* 791 * Chunk management implementation. 792 * 793 * To allow different implementations, chunk alloc/free and 794 * [de]population are implemented in a separate file which is pulled 795 * into this file and compiled together. The following functions 796 * should be implemented. 
797 * 798 * pcpu_populate_chunk - populate the specified range of a chunk 799 * pcpu_depopulate_chunk - depopulate the specified range of a chunk 800 * pcpu_create_chunk - create a new chunk 801 * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop 802 * pcpu_addr_to_page - translate address to physical address 803 * pcpu_verify_alloc_info - check alloc_info is acceptable during init 804 */ 805 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size); 806 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size); 807 static struct pcpu_chunk *pcpu_create_chunk(void); 808 static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); 809 static struct page *pcpu_addr_to_page(void *addr); 810 static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); 811 812 #ifdef CONFIG_NEED_PER_CPU_KM 813 #include "percpu-km.c" 814 #else 815 #include "percpu-vm.c" 816 #endif 817 818 /** 819 * pcpu_chunk_addr_search - determine chunk containing specified address 820 * @addr: address for which the chunk needs to be determined. 821 * 822 * RETURNS: 823 * The address of the found chunk. 824 */ 825 static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) 826 { 827 /* is it in the first chunk? */ 828 if (pcpu_addr_in_first_chunk(addr)) { 829 /* is it in the reserved area? */ 830 if (pcpu_addr_in_reserved_chunk(addr)) 831 return pcpu_reserved_chunk; 832 return pcpu_first_chunk; 833 } 834 835 /* 836 * The address is relative to unit0 which might be unused and 837 * thus unmapped. Offset the address to the unit space of the 838 * current processor before looking it up in the vmalloc 839 * space. Note that any possible cpu id can be used here, so 840 * there's no need to worry about preemption or cpu hotplug. 841 */ 842 addr += pcpu_unit_offsets[raw_smp_processor_id()]; 843 return pcpu_get_page_chunk(pcpu_addr_to_page(addr)); 844 } 845 846 /** 847 * pcpu_alloc - the percpu allocator 848 * @size: size of area to allocate in bytes 849 * @align: alignment of area (max PAGE_SIZE) 850 * @reserved: allocate from the reserved chunk if available 851 * @gfp: allocation flags 852 * 853 * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't 854 * contain %GFP_KERNEL, the allocation is atomic. 855 * 856 * RETURNS: 857 * Percpu pointer to the allocated area on success, NULL on failure. 858 */ 859 static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, 860 gfp_t gfp) 861 { 862 static int warn_limit = 10; 863 struct pcpu_chunk *chunk; 864 const char *err; 865 bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; 866 int occ_pages = 0; 867 int slot, off, new_alloc, cpu, ret; 868 unsigned long flags; 869 void __percpu *ptr; 870 871 /* 872 * We want the lowest bit of offset available for in-use/free 873 * indicator, so force >= 16bit alignment and make size even. 
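         * (Illustrative: a request for 7 bytes with align 1 is adjusted
         * below to 8 bytes with align 2, so every offset recorded in the
         * area map keeps bit 0 free for the in-use flag.)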
874 */ 875 if (unlikely(align < 2)) 876 align = 2; 877 878 size = ALIGN(size, 2); 879 880 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE || 881 !is_power_of_2(align))) { 882 WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n", 883 size, align); 884 return NULL; 885 } 886 887 if (!is_atomic) 888 mutex_lock(&pcpu_alloc_mutex); 889 890 spin_lock_irqsave(&pcpu_lock, flags); 891 892 /* serve reserved allocations from the reserved chunk if available */ 893 if (reserved && pcpu_reserved_chunk) { 894 chunk = pcpu_reserved_chunk; 895 896 if (size > chunk->contig_hint) { 897 err = "alloc from reserved chunk failed"; 898 goto fail_unlock; 899 } 900 901 while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) { 902 spin_unlock_irqrestore(&pcpu_lock, flags); 903 if (is_atomic || 904 pcpu_extend_area_map(chunk, new_alloc) < 0) { 905 err = "failed to extend area map of reserved chunk"; 906 goto fail; 907 } 908 spin_lock_irqsave(&pcpu_lock, flags); 909 } 910 911 off = pcpu_alloc_area(chunk, size, align, is_atomic, 912 &occ_pages); 913 if (off >= 0) 914 goto area_found; 915 916 err = "alloc from reserved chunk failed"; 917 goto fail_unlock; 918 } 919 920 restart: 921 /* search through normal chunks */ 922 for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { 923 list_for_each_entry(chunk, &pcpu_slot[slot], list) { 924 if (size > chunk->contig_hint) 925 continue; 926 927 new_alloc = pcpu_need_to_extend(chunk, is_atomic); 928 if (new_alloc) { 929 if (is_atomic) 930 continue; 931 spin_unlock_irqrestore(&pcpu_lock, flags); 932 if (pcpu_extend_area_map(chunk, 933 new_alloc) < 0) { 934 err = "failed to extend area map"; 935 goto fail; 936 } 937 spin_lock_irqsave(&pcpu_lock, flags); 938 /* 939 * pcpu_lock has been dropped, need to 940 * restart cpu_slot list walking. 941 */ 942 goto restart; 943 } 944 945 off = pcpu_alloc_area(chunk, size, align, is_atomic, 946 &occ_pages); 947 if (off >= 0) 948 goto area_found; 949 } 950 } 951 952 spin_unlock_irqrestore(&pcpu_lock, flags); 953 954 /* 955 * No space left. Create a new chunk. We don't want multiple 956 * tasks to create chunks simultaneously. Serialize and create iff 957 * there's still no empty chunk after grabbing the mutex. 
         */
        if (is_atomic) {
                err = "atomic alloc failed, no space left";
                goto fail;
        }

        if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
                chunk = pcpu_create_chunk();
                if (!chunk) {
                        err = "failed to allocate new chunk";
                        goto fail;
                }

                spin_lock_irqsave(&pcpu_lock, flags);
                pcpu_chunk_relocate(chunk, -1);
        } else {
                spin_lock_irqsave(&pcpu_lock, flags);
        }

        goto restart;

area_found:
        pcpu_stats_area_alloc(chunk, size);
        spin_unlock_irqrestore(&pcpu_lock, flags);

        /* populate if not all pages are already there */
        if (!is_atomic) {
                int page_start, page_end, rs, re;

                page_start = PFN_DOWN(off);
                page_end = PFN_UP(off + size);

                pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
                        WARN_ON(chunk->immutable);

                        ret = pcpu_populate_chunk(chunk, rs, re);

                        spin_lock_irqsave(&pcpu_lock, flags);
                        if (ret) {
                                pcpu_free_area(chunk, off, &occ_pages);
                                err = "failed to populate";
                                goto fail_unlock;
                        }
                        pcpu_chunk_populated(chunk, rs, re);
                        spin_unlock_irqrestore(&pcpu_lock, flags);
                }

                mutex_unlock(&pcpu_alloc_mutex);
        }

        if (chunk != pcpu_reserved_chunk) {
                spin_lock_irqsave(&pcpu_lock, flags);
                pcpu_nr_empty_pop_pages -= occ_pages;
                spin_unlock_irqrestore(&pcpu_lock, flags);
        }

        if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
                pcpu_schedule_balance_work();

        /* clear the areas and return address relative to base address */
        for_each_possible_cpu(cpu)
                memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);

        ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
        kmemleak_alloc_percpu(ptr, size, gfp);

        trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
                                  chunk->base_addr, off, ptr);

        return ptr;

fail_unlock:
        spin_unlock_irqrestore(&pcpu_lock, flags);
fail:
        trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);

        if (!is_atomic && warn_limit) {
                pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
                        size, align, is_atomic, err);
                dump_stack();
                if (!--warn_limit)
                        pr_info("limit reached, disable warning\n");
        }
        if (is_atomic) {
                /* see the flag handling in pcpu_balance_workfn() */
                pcpu_atomic_alloc_failed = true;
                pcpu_schedule_balance_work();
        } else {
                mutex_unlock(&pcpu_alloc_mutex);
        }
        return NULL;
}

/**
 * __alloc_percpu_gfp - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @gfp: allocation flags
 *
 * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
 * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
 * be called from any context but is a lot more likely to fail.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
{
        return pcpu_alloc(size, align, false, gfp);
}
EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);

/**
 * __alloc_percpu - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
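 *
 * Typical usage (a minimal sketch, not code from this file):
 *
 *	int __percpu *cnt = alloc_percpu(int);
 *
 *	if (cnt) {
 *		this_cpu_inc(*cnt);
 *		...
 *		free_percpu(cnt);
 *	}
 *
 * where alloc_percpu() from <linux/percpu.h> wraps __alloc_percpu()
 * with the type's size and alignment.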
1076 */ 1077 void __percpu *__alloc_percpu(size_t size, size_t align) 1078 { 1079 return pcpu_alloc(size, align, false, GFP_KERNEL); 1080 } 1081 EXPORT_SYMBOL_GPL(__alloc_percpu); 1082 1083 /** 1084 * __alloc_reserved_percpu - allocate reserved percpu area 1085 * @size: size of area to allocate in bytes 1086 * @align: alignment of area (max PAGE_SIZE) 1087 * 1088 * Allocate zero-filled percpu area of @size bytes aligned at @align 1089 * from reserved percpu area if arch has set it up; otherwise, 1090 * allocation is served from the same dynamic area. Might sleep. 1091 * Might trigger writeouts. 1092 * 1093 * CONTEXT: 1094 * Does GFP_KERNEL allocation. 1095 * 1096 * RETURNS: 1097 * Percpu pointer to the allocated area on success, NULL on failure. 1098 */ 1099 void __percpu *__alloc_reserved_percpu(size_t size, size_t align) 1100 { 1101 return pcpu_alloc(size, align, true, GFP_KERNEL); 1102 } 1103 1104 /** 1105 * pcpu_balance_workfn - manage the amount of free chunks and populated pages 1106 * @work: unused 1107 * 1108 * Reclaim all fully free chunks except for the first one. 1109 */ 1110 static void pcpu_balance_workfn(struct work_struct *work) 1111 { 1112 LIST_HEAD(to_free); 1113 struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; 1114 struct pcpu_chunk *chunk, *next; 1115 int slot, nr_to_pop, ret; 1116 1117 /* 1118 * There's no reason to keep around multiple unused chunks and VM 1119 * areas can be scarce. Destroy all free chunks except for one. 1120 */ 1121 mutex_lock(&pcpu_alloc_mutex); 1122 spin_lock_irq(&pcpu_lock); 1123 1124 list_for_each_entry_safe(chunk, next, free_head, list) { 1125 WARN_ON(chunk->immutable); 1126 1127 /* spare the first one */ 1128 if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) 1129 continue; 1130 1131 list_del_init(&chunk->map_extend_list); 1132 list_move(&chunk->list, &to_free); 1133 } 1134 1135 spin_unlock_irq(&pcpu_lock); 1136 1137 list_for_each_entry_safe(chunk, next, &to_free, list) { 1138 int rs, re; 1139 1140 pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { 1141 pcpu_depopulate_chunk(chunk, rs, re); 1142 spin_lock_irq(&pcpu_lock); 1143 pcpu_chunk_depopulated(chunk, rs, re); 1144 spin_unlock_irq(&pcpu_lock); 1145 } 1146 pcpu_destroy_chunk(chunk); 1147 } 1148 1149 /* service chunks which requested async area map extension */ 1150 do { 1151 int new_alloc = 0; 1152 1153 spin_lock_irq(&pcpu_lock); 1154 1155 chunk = list_first_entry_or_null(&pcpu_map_extend_chunks, 1156 struct pcpu_chunk, map_extend_list); 1157 if (chunk) { 1158 list_del_init(&chunk->map_extend_list); 1159 new_alloc = pcpu_need_to_extend(chunk, false); 1160 } 1161 1162 spin_unlock_irq(&pcpu_lock); 1163 1164 if (new_alloc) 1165 pcpu_extend_area_map(chunk, new_alloc); 1166 } while (chunk); 1167 1168 /* 1169 * Ensure there are certain number of free populated pages for 1170 * atomic allocs. Fill up from the most packed so that atomic 1171 * allocs don't increase fragmentation. If atomic allocation 1172 * failed previously, always populate the maximum amount. This 1173 * should prevent atomic allocs larger than PAGE_SIZE from keeping 1174 * failing indefinitely; however, large atomic allocs are not 1175 * something we support properly and can be highly unreliable and 1176 * inefficient. 
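         *
         * (Illustrative: with PCPU_EMPTY_POP_PAGES_HIGH == 4 and one empty
         * populated page currently available, the code below tries to
         * populate 3 more pages; if an atomic allocation failed since the
         * last run, it goes for the full 4 regardless.)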
1177 */ 1178 retry_pop: 1179 if (pcpu_atomic_alloc_failed) { 1180 nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; 1181 /* best effort anyway, don't worry about synchronization */ 1182 pcpu_atomic_alloc_failed = false; 1183 } else { 1184 nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - 1185 pcpu_nr_empty_pop_pages, 1186 0, PCPU_EMPTY_POP_PAGES_HIGH); 1187 } 1188 1189 for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { 1190 int nr_unpop = 0, rs, re; 1191 1192 if (!nr_to_pop) 1193 break; 1194 1195 spin_lock_irq(&pcpu_lock); 1196 list_for_each_entry(chunk, &pcpu_slot[slot], list) { 1197 nr_unpop = pcpu_unit_pages - chunk->nr_populated; 1198 if (nr_unpop) 1199 break; 1200 } 1201 spin_unlock_irq(&pcpu_lock); 1202 1203 if (!nr_unpop) 1204 continue; 1205 1206 /* @chunk can't go away while pcpu_alloc_mutex is held */ 1207 pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) { 1208 int nr = min(re - rs, nr_to_pop); 1209 1210 ret = pcpu_populate_chunk(chunk, rs, rs + nr); 1211 if (!ret) { 1212 nr_to_pop -= nr; 1213 spin_lock_irq(&pcpu_lock); 1214 pcpu_chunk_populated(chunk, rs, rs + nr); 1215 spin_unlock_irq(&pcpu_lock); 1216 } else { 1217 nr_to_pop = 0; 1218 } 1219 1220 if (!nr_to_pop) 1221 break; 1222 } 1223 } 1224 1225 if (nr_to_pop) { 1226 /* ran out of chunks to populate, create a new one and retry */ 1227 chunk = pcpu_create_chunk(); 1228 if (chunk) { 1229 spin_lock_irq(&pcpu_lock); 1230 pcpu_chunk_relocate(chunk, -1); 1231 spin_unlock_irq(&pcpu_lock); 1232 goto retry_pop; 1233 } 1234 } 1235 1236 mutex_unlock(&pcpu_alloc_mutex); 1237 } 1238 1239 /** 1240 * free_percpu - free percpu area 1241 * @ptr: pointer to area to free 1242 * 1243 * Free percpu area @ptr. 1244 * 1245 * CONTEXT: 1246 * Can be called from atomic context. 1247 */ 1248 void free_percpu(void __percpu *ptr) 1249 { 1250 void *addr; 1251 struct pcpu_chunk *chunk; 1252 unsigned long flags; 1253 int off, occ_pages; 1254 1255 if (!ptr) 1256 return; 1257 1258 kmemleak_free_percpu(ptr); 1259 1260 addr = __pcpu_ptr_to_addr(ptr); 1261 1262 spin_lock_irqsave(&pcpu_lock, flags); 1263 1264 chunk = pcpu_chunk_addr_search(addr); 1265 off = addr - chunk->base_addr; 1266 1267 pcpu_free_area(chunk, off, &occ_pages); 1268 1269 if (chunk != pcpu_reserved_chunk) 1270 pcpu_nr_empty_pop_pages += occ_pages; 1271 1272 /* if there are more than one fully free chunks, wake up grim reaper */ 1273 if (chunk->free_size == pcpu_unit_size) { 1274 struct pcpu_chunk *pos; 1275 1276 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) 1277 if (pos != chunk) { 1278 pcpu_schedule_balance_work(); 1279 break; 1280 } 1281 } 1282 1283 trace_percpu_free_percpu(chunk->base_addr, off, ptr); 1284 1285 spin_unlock_irqrestore(&pcpu_lock, flags); 1286 } 1287 EXPORT_SYMBOL_GPL(free_percpu); 1288 1289 bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr) 1290 { 1291 #ifdef CONFIG_SMP 1292 const size_t static_size = __per_cpu_end - __per_cpu_start; 1293 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); 1294 unsigned int cpu; 1295 1296 for_each_possible_cpu(cpu) { 1297 void *start = per_cpu_ptr(base, cpu); 1298 void *va = (void *)addr; 1299 1300 if (va >= start && va < start + static_size) { 1301 if (can_addr) { 1302 *can_addr = (unsigned long) (va - start); 1303 *can_addr += (unsigned long) 1304 per_cpu_ptr(base, get_boot_cpu_id()); 1305 } 1306 return true; 1307 } 1308 } 1309 #endif 1310 /* on UP, can't distinguish from other static vars, always false */ 1311 return false; 1312 } 1313 1314 /** 1315 * 
is_kernel_percpu_address - test whether address is from static percpu area 1316 * @addr: address to test 1317 * 1318 * Test whether @addr belongs to in-kernel static percpu area. Module 1319 * static percpu areas are not considered. For those, use 1320 * is_module_percpu_address(). 1321 * 1322 * RETURNS: 1323 * %true if @addr is from in-kernel static percpu area, %false otherwise. 1324 */ 1325 bool is_kernel_percpu_address(unsigned long addr) 1326 { 1327 return __is_kernel_percpu_address(addr, NULL); 1328 } 1329 1330 /** 1331 * per_cpu_ptr_to_phys - convert translated percpu address to physical address 1332 * @addr: the address to be converted to physical address 1333 * 1334 * Given @addr which is dereferenceable address obtained via one of 1335 * percpu access macros, this function translates it into its physical 1336 * address. The caller is responsible for ensuring @addr stays valid 1337 * until this function finishes. 1338 * 1339 * percpu allocator has special setup for the first chunk, which currently 1340 * supports either embedding in linear address space or vmalloc mapping, 1341 * and, from the second one, the backing allocator (currently either vm or 1342 * km) provides translation. 1343 * 1344 * The addr can be translated simply without checking if it falls into the 1345 * first chunk. But the current code reflects better how percpu allocator 1346 * actually works, and the verification can discover both bugs in percpu 1347 * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current 1348 * code. 1349 * 1350 * RETURNS: 1351 * The physical address for @addr. 1352 */ 1353 phys_addr_t per_cpu_ptr_to_phys(void *addr) 1354 { 1355 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); 1356 bool in_first_chunk = false; 1357 unsigned long first_low, first_high; 1358 unsigned int cpu; 1359 1360 /* 1361 * The following test on unit_low/high isn't strictly 1362 * necessary but will speed up lookups of addresses which 1363 * aren't in the first chunk. 1364 */ 1365 first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0); 1366 first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu, 1367 pcpu_unit_pages); 1368 if ((unsigned long)addr >= first_low && 1369 (unsigned long)addr < first_high) { 1370 for_each_possible_cpu(cpu) { 1371 void *start = per_cpu_ptr(base, cpu); 1372 1373 if (addr >= start && addr < start + pcpu_unit_size) { 1374 in_first_chunk = true; 1375 break; 1376 } 1377 } 1378 } 1379 1380 if (in_first_chunk) { 1381 if (!is_vmalloc_addr(addr)) 1382 return __pa(addr); 1383 else 1384 return page_to_phys(vmalloc_to_page(addr)) + 1385 offset_in_page(addr); 1386 } else 1387 return page_to_phys(pcpu_addr_to_page(addr)) + 1388 offset_in_page(addr); 1389 } 1390 1391 /** 1392 * pcpu_alloc_alloc_info - allocate percpu allocation info 1393 * @nr_groups: the number of groups 1394 * @nr_units: the number of units 1395 * 1396 * Allocate ai which is large enough for @nr_groups groups containing 1397 * @nr_units units. The returned ai's groups[0].cpu_map points to the 1398 * cpu_map array which is long enough for @nr_units and filled with 1399 * NR_CPUS. It's the caller's responsibility to initialize cpu_map 1400 * pointer of other groups. 1401 * 1402 * RETURNS: 1403 * Pointer to the allocated pcpu_alloc_info on success, NULL on 1404 * failure. 
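 *
 * Illustrative layout of the single block returned for nr_groups == 2
 * and nr_units == 4:
 *
 *	[ pcpu_alloc_info | groups[0] | groups[1] | cpu_map[0..3] ]
 *
 * with groups[0].cpu_map pointing at the trailing array and every
 * entry preset to NR_CPUS.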
 */
struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
                                                      int nr_units)
{
        struct pcpu_alloc_info *ai;
        size_t base_size, ai_size;
        void *ptr;
        int unit;

        base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
                          __alignof__(ai->groups[0].cpu_map[0]));
        ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);

        ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0);
        if (!ptr)
                return NULL;
        ai = ptr;
        ptr += base_size;

        ai->groups[0].cpu_map = ptr;

        for (unit = 0; unit < nr_units; unit++)
                ai->groups[0].cpu_map[unit] = NR_CPUS;

        ai->nr_groups = nr_groups;
        ai->__ai_size = PFN_ALIGN(ai_size);

        return ai;
}

/**
 * pcpu_free_alloc_info - free percpu allocation info
 * @ai: pcpu_alloc_info to free
 *
 * Free @ai which was allocated by pcpu_alloc_alloc_info().
 */
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
        memblock_free_early(__pa(ai), ai->__ai_size);
}

/**
 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
 * @lvl: loglevel
 * @ai: allocation info to dump
 *
 * Print out information about @ai using loglevel @lvl.
 */
static void pcpu_dump_alloc_info(const char *lvl,
                                 const struct pcpu_alloc_info *ai)
{
        int group_width = 1, cpu_width = 1, width;
        char empty_str[] = "--------";
        int alloc = 0, alloc_end = 0;
        int group, v;
        int upa, apl;   /* units per alloc, allocs per line */

        v = ai->nr_groups;
        while (v /= 10)
                group_width++;

        v = num_possible_cpus();
        while (v /= 10)
                cpu_width++;
        empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';

        upa = ai->alloc_size / ai->unit_size;
        width = upa * (cpu_width + 1) + group_width + 3;
        apl = rounddown_pow_of_two(max(60 / width, 1));

        printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
               lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
               ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);

        for (group = 0; group < ai->nr_groups; group++) {
                const struct pcpu_group_info *gi = &ai->groups[group];
                int unit = 0, unit_end = 0;

                BUG_ON(gi->nr_units % upa);
                for (alloc_end += gi->nr_units / upa;
                     alloc < alloc_end; alloc++) {
                        if (!(alloc % apl)) {
                                pr_cont("\n");
                                printk("%spcpu-alloc: ", lvl);
                        }
                        pr_cont("[%0*d] ", group_width, group);

                        for (unit_end += upa; unit < unit_end; unit++)
                                if (gi->cpu_map[unit] != NR_CPUS)
                                        pr_cont("%0*d ",
                                                cpu_width, gi->cpu_map[unit]);
                                else
                                        pr_cont("%s ", empty_str);
                }
        }
        pr_cont("\n");
}

/**
 * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @ai: pcpu_alloc_info describing how the percpu area is shaped
 * @base_addr: mapped address
 *
 * Initialize the first percpu chunk which contains the kernel static
 * percpu area.  This function is to be called from the arch percpu area
 * setup path.
 *
 * @ai contains all information necessary to initialize the first
 * chunk and prime the dynamic percpu allocator.
 *
 * @ai->static_size is the size of the static percpu area.
 *
 * @ai->reserved_size, if non-zero, specifies the number of bytes to
 * reserve after the static area in the first chunk.
This reserves 1519 * the first chunk such that it's available only through reserved 1520 * percpu allocation. This is primarily used to serve module percpu 1521 * static areas on architectures where the addressing model has 1522 * limited offset range for symbol relocations to guarantee module 1523 * percpu symbols fall inside the relocatable range. 1524 * 1525 * @ai->dyn_size determines the number of bytes available for dynamic 1526 * allocation in the first chunk. The area between @ai->static_size + 1527 * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused. 1528 * 1529 * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE 1530 * and equal to or larger than @ai->static_size + @ai->reserved_size + 1531 * @ai->dyn_size. 1532 * 1533 * @ai->atom_size is the allocation atom size and used as alignment 1534 * for vm areas. 1535 * 1536 * @ai->alloc_size is the allocation size and always multiple of 1537 * @ai->atom_size. This is larger than @ai->atom_size if 1538 * @ai->unit_size is larger than @ai->atom_size. 1539 * 1540 * @ai->nr_groups and @ai->groups describe virtual memory layout of 1541 * percpu areas. Units which should be colocated are put into the 1542 * same group. Dynamic VM areas will be allocated according to these 1543 * groupings. If @ai->nr_groups is zero, a single group containing 1544 * all units is assumed. 1545 * 1546 * The caller should have mapped the first chunk at @base_addr and 1547 * copied static data to each unit. 1548 * 1549 * If the first chunk ends up with both reserved and dynamic areas, it 1550 * is served by two chunks - one to serve the core static and reserved 1551 * areas and the other for the dynamic area. They share the same vm 1552 * and page map but uses different area allocation map to stay away 1553 * from each other. The latter chunk is circulated in the chunk slots 1554 * and available for dynamic allocation like any other chunks. 1555 * 1556 * RETURNS: 1557 * 0 on success, -errno on failure. 
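 *
 * Illustrative first chunk layout (sizes made up for the example) with
 * static_size 64k, reserved_size 8k, dyn_size 24k and unit_size 128k:
 *
 *	| static 64k | reserved 8k | dynamic 24k | unused 32k |
 *
 * Here schunk becomes the reserved chunk serving the static + reserved
 * space and dchunk is linked in as pcpu_first_chunk to serve the 24k
 * dynamic area.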
1558 */ 1559 int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, 1560 void *base_addr) 1561 { 1562 static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; 1563 static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; 1564 size_t dyn_size = ai->dyn_size; 1565 size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; 1566 struct pcpu_chunk *schunk, *dchunk = NULL; 1567 unsigned long *group_offsets; 1568 size_t *group_sizes; 1569 unsigned long *unit_off; 1570 unsigned int cpu; 1571 int *unit_map; 1572 int group, unit, i; 1573 1574 #define PCPU_SETUP_BUG_ON(cond) do { \ 1575 if (unlikely(cond)) { \ 1576 pr_emerg("failed to initialize, %s\n", #cond); \ 1577 pr_emerg("cpu_possible_mask=%*pb\n", \ 1578 cpumask_pr_args(cpu_possible_mask)); \ 1579 pcpu_dump_alloc_info(KERN_EMERG, ai); \ 1580 BUG(); \ 1581 } \ 1582 } while (0) 1583 1584 /* sanity checks */ 1585 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); 1586 #ifdef CONFIG_SMP 1587 PCPU_SETUP_BUG_ON(!ai->static_size); 1588 PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start)); 1589 #endif 1590 PCPU_SETUP_BUG_ON(!base_addr); 1591 PCPU_SETUP_BUG_ON(offset_in_page(base_addr)); 1592 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); 1593 PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size)); 1594 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); 1595 PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); 1596 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); 1597 1598 /* process group information and build config tables accordingly */ 1599 group_offsets = memblock_virt_alloc(ai->nr_groups * 1600 sizeof(group_offsets[0]), 0); 1601 group_sizes = memblock_virt_alloc(ai->nr_groups * 1602 sizeof(group_sizes[0]), 0); 1603 unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0); 1604 unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0); 1605 1606 for (cpu = 0; cpu < nr_cpu_ids; cpu++) 1607 unit_map[cpu] = UINT_MAX; 1608 1609 pcpu_low_unit_cpu = NR_CPUS; 1610 pcpu_high_unit_cpu = NR_CPUS; 1611 1612 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { 1613 const struct pcpu_group_info *gi = &ai->groups[group]; 1614 1615 group_offsets[group] = gi->base_offset; 1616 group_sizes[group] = gi->nr_units * ai->unit_size; 1617 1618 for (i = 0; i < gi->nr_units; i++) { 1619 cpu = gi->cpu_map[i]; 1620 if (cpu == NR_CPUS) 1621 continue; 1622 1623 PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids); 1624 PCPU_SETUP_BUG_ON(!cpu_possible(cpu)); 1625 PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX); 1626 1627 unit_map[cpu] = unit + i; 1628 unit_off[cpu] = gi->base_offset + i * ai->unit_size; 1629 1630 /* determine low/high unit_cpu */ 1631 if (pcpu_low_unit_cpu == NR_CPUS || 1632 unit_off[cpu] < unit_off[pcpu_low_unit_cpu]) 1633 pcpu_low_unit_cpu = cpu; 1634 if (pcpu_high_unit_cpu == NR_CPUS || 1635 unit_off[cpu] > unit_off[pcpu_high_unit_cpu]) 1636 pcpu_high_unit_cpu = cpu; 1637 } 1638 } 1639 pcpu_nr_units = unit; 1640 1641 for_each_possible_cpu(cpu) 1642 PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX); 1643 1644 /* we're done parsing the input, undefine BUG macro and dump config */ 1645 #undef PCPU_SETUP_BUG_ON 1646 pcpu_dump_alloc_info(KERN_DEBUG, ai); 1647 1648 pcpu_nr_groups = ai->nr_groups; 1649 pcpu_group_offsets = group_offsets; 1650 pcpu_group_sizes = group_sizes; 1651 pcpu_unit_map = unit_map; 1652 pcpu_unit_offsets = unit_off; 1653 1654 /* determine basic parameters */ 1655 pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; 1656 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; 1657 pcpu_atom_size = ai->atom_size; 1658 
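        /*
         * struct pcpu_chunk is trailed by its populated[] page bitmap
         * (see percpu-internal.h), so pcpu_chunk_struct_size below covers
         * sizeof(struct pcpu_chunk) plus one bit per unit page.
         */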
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + 1659 BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); 1660 1661 pcpu_stats_save_ai(ai); 1662 1663 /* 1664 * Allocate chunk slots. The additional last slot is for 1665 * empty chunks. 1666 */ 1667 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; 1668 pcpu_slot = memblock_virt_alloc( 1669 pcpu_nr_slots * sizeof(pcpu_slot[0]), 0); 1670 for (i = 0; i < pcpu_nr_slots; i++) 1671 INIT_LIST_HEAD(&pcpu_slot[i]); 1672 1673 /* 1674 * Initialize static chunk. If reserved_size is zero, the 1675 * static chunk covers static area + dynamic allocation area 1676 * in the first chunk. If reserved_size is not zero, it 1677 * covers static area + reserved area (mostly used for module 1678 * static percpu allocation). 1679 */ 1680 schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); 1681 INIT_LIST_HEAD(&schunk->list); 1682 INIT_LIST_HEAD(&schunk->map_extend_list); 1683 schunk->base_addr = base_addr; 1684 schunk->map = smap; 1685 schunk->map_alloc = ARRAY_SIZE(smap); 1686 schunk->immutable = true; 1687 bitmap_fill(schunk->populated, pcpu_unit_pages); 1688 schunk->nr_populated = pcpu_unit_pages; 1689 1690 if (ai->reserved_size) { 1691 schunk->free_size = ai->reserved_size; 1692 pcpu_reserved_chunk = schunk; 1693 pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size; 1694 } else { 1695 schunk->free_size = dyn_size; 1696 dyn_size = 0; /* dynamic area covered */ 1697 } 1698 schunk->contig_hint = schunk->free_size; 1699 1700 schunk->map[0] = 1; 1701 schunk->map[1] = ai->static_size; 1702 schunk->map_used = 1; 1703 if (schunk->free_size) 1704 schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size; 1705 schunk->map[schunk->map_used] |= 1; 1706 schunk->has_reserved = true; 1707 1708 /* init dynamic chunk if necessary */ 1709 if (dyn_size) { 1710 dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); 1711 INIT_LIST_HEAD(&dchunk->list); 1712 INIT_LIST_HEAD(&dchunk->map_extend_list); 1713 dchunk->base_addr = base_addr; 1714 dchunk->map = dmap; 1715 dchunk->map_alloc = ARRAY_SIZE(dmap); 1716 dchunk->immutable = true; 1717 bitmap_fill(dchunk->populated, pcpu_unit_pages); 1718 dchunk->nr_populated = pcpu_unit_pages; 1719 1720 dchunk->contig_hint = dchunk->free_size = dyn_size; 1721 dchunk->map[0] = 1; 1722 dchunk->map[1] = pcpu_reserved_chunk_limit; 1723 dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1; 1724 dchunk->map_used = 2; 1725 dchunk->has_reserved = true; 1726 } 1727 1728 /* link the first chunk in */ 1729 pcpu_first_chunk = dchunk ?: schunk; 1730 pcpu_nr_empty_pop_pages += 1731 pcpu_count_occupied_pages(pcpu_first_chunk, 1); 1732 pcpu_chunk_relocate(pcpu_first_chunk, -1); 1733 1734 pcpu_stats_chunk_alloc(); 1735 trace_percpu_create_chunk(base_addr); 1736 1737 /* we're done */ 1738 pcpu_base_addr = base_addr; 1739 return 0; 1740 } 1741 1742 #ifdef CONFIG_SMP 1743 1744 const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = { 1745 [PCPU_FC_AUTO] = "auto", 1746 [PCPU_FC_EMBED] = "embed", 1747 [PCPU_FC_PAGE] = "page", 1748 }; 1749 1750 enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; 1751 1752 static int __init percpu_alloc_setup(char *str) 1753 { 1754 if (!str) 1755 return -EINVAL; 1756 1757 if (0) 1758 /* nada */; 1759 #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK 1760 else if (!strcmp(str, "embed")) 1761 pcpu_chosen_fc = PCPU_FC_EMBED; 1762 #endif 1763 #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 1764 else if (!strcmp(str, "page")) 1765 pcpu_chosen_fc = PCPU_FC_PAGE; 1766 #endif 1767 else 1768 
pr_warn("unknown allocator %s specified\n", str); 1769 1770 return 0; 1771 } 1772 early_param("percpu_alloc", percpu_alloc_setup); 1773 1774 /* 1775 * pcpu_embed_first_chunk() is used by the generic percpu setup. 1776 * Build it if needed by the arch config or the generic setup is going 1777 * to be used. 1778 */ 1779 #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ 1780 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) 1781 #define BUILD_EMBED_FIRST_CHUNK 1782 #endif 1783 1784 /* build pcpu_page_first_chunk() iff needed by the arch config */ 1785 #if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) 1786 #define BUILD_PAGE_FIRST_CHUNK 1787 #endif 1788 1789 /* pcpu_build_alloc_info() is used by both embed and page first chunk */ 1790 #if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK) 1791 /** 1792 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs 1793 * @reserved_size: the size of reserved percpu area in bytes 1794 * @dyn_size: minimum free size for dynamic allocation in bytes 1795 * @atom_size: allocation atom size 1796 * @cpu_distance_fn: callback to determine distance between cpus, optional 1797 * 1798 * This function determines grouping of units, their mappings to cpus 1799 * and other parameters considering needed percpu size, allocation 1800 * atom size and distances between CPUs. 1801 * 1802 * Groups are always multiples of atom size and CPUs which are of 1803 * LOCAL_DISTANCE both ways are grouped together and share space for 1804 * units in the same group. The returned configuration is guaranteed 1805 * to have CPUs on different nodes on different groups and >=75% usage 1806 * of allocated virtual address space. 1807 * 1808 * RETURNS: 1809 * On success, pointer to the new allocation_info is returned. On 1810 * failure, ERR_PTR value is returned. 1811 */ 1812 static struct pcpu_alloc_info * __init pcpu_build_alloc_info( 1813 size_t reserved_size, size_t dyn_size, 1814 size_t atom_size, 1815 pcpu_fc_cpu_distance_fn_t cpu_distance_fn) 1816 { 1817 static int group_map[NR_CPUS] __initdata; 1818 static int group_cnt[NR_CPUS] __initdata; 1819 const size_t static_size = __per_cpu_end - __per_cpu_start; 1820 int nr_groups = 1, nr_units = 0; 1821 size_t size_sum, min_unit_size, alloc_size; 1822 int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ 1823 int last_allocs, group, unit; 1824 unsigned int cpu, tcpu; 1825 struct pcpu_alloc_info *ai; 1826 unsigned int *cpu_map; 1827 1828 /* this function may be called multiple times */ 1829 memset(group_map, 0, sizeof(group_map)); 1830 memset(group_cnt, 0, sizeof(group_cnt)); 1831 1832 /* calculate size_sum and ensure dyn_size is enough for early alloc */ 1833 size_sum = PFN_ALIGN(static_size + reserved_size + 1834 max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE)); 1835 dyn_size = size_sum - static_size - reserved_size; 1836 1837 /* 1838 * Determine min_unit_size, alloc_size and max_upa such that 1839 * alloc_size is multiple of atom_size and is the smallest 1840 * which can accommodate 4k aligned segments which are equal to 1841 * or larger than min_unit_size. 

	/* group cpus according to their proximity */
	for_each_possible_cpu(cpu) {
		group = 0;
	next_group:
		for_each_possible_cpu(tcpu) {
			if (cpu == tcpu)
				break;
			if (group_map[tcpu] == group && cpu_distance_fn &&
			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
				group++;
				nr_groups = max(nr_groups, group + 1);
				goto next_group;
			}
		}
		group_map[cpu] = group;
		group_cnt[group]++;
	}

	/*
	 * Expand unit size until address space usage goes over 75%
	 * and then as much as possible without using more address
	 * space.
	 */
	last_allocs = INT_MAX;
	for (upa = max_upa; upa; upa--) {
		int allocs = 0, wasted = 0;

		if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
			continue;

		for (group = 0; group < nr_groups; group++) {
			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
			allocs += this_allocs;
			wasted += this_allocs * upa - group_cnt[group];
		}

		/*
		 * Don't accept if wastage is over 1/3.  The
		 * greater-than comparison ensures upa==1 always
		 * passes the following check.
		 */
		if (wasted > num_possible_cpus() / 3)
			continue;

		/* and then don't consume more memory */
		if (allocs > last_allocs)
			break;
		last_allocs = allocs;
		best_upa = upa;
	}
	upa = best_upa;
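
	/*
	 * Continuing the illustrative example above: with four possible
	 * CPUs in a single group and max_upa = 32, upa values of 32, 16
	 * and 8 are rejected for wasting more units than a third of the
	 * possible CPUs (other values fail the divisibility check), upa
	 * of 4 packs all CPUs into a single 2M allocation with no waste,
	 * and upa of 2 would need more allocations, so best_upa ends up
	 * as 4 (512k units).
	 */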

	/* allocate and fill alloc_info */
	for (group = 0; group < nr_groups; group++)
		nr_units += roundup(group_cnt[group], upa);

	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
	if (!ai)
		return ERR_PTR(-ENOMEM);
	cpu_map = ai->groups[0].cpu_map;

	for (group = 0; group < nr_groups; group++) {
		ai->groups[group].cpu_map = cpu_map;
		cpu_map += roundup(group_cnt[group], upa);
	}

	ai->static_size = static_size;
	ai->reserved_size = reserved_size;
	ai->dyn_size = dyn_size;
	ai->unit_size = alloc_size / upa;
	ai->atom_size = atom_size;
	ai->alloc_size = alloc_size;

	for (group = 0, unit = 0; group_cnt[group]; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];

		/*
		 * Initialize base_offset as if all groups are located
		 * back-to-back.  The caller should update this to
		 * reflect actual allocation.
		 */
		gi->base_offset = unit * ai->unit_size;

		for_each_possible_cpu(cpu)
			if (group_map[cpu] == group)
				gi->cpu_map[gi->nr_units++] = cpu;
		gi->nr_units = roundup(gi->nr_units, upa);
		unit += gi->nr_units;
	}
	BUG_ON(unit != nr_units);

	return ai;
}
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
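
/*
 * Illustrative result of pcpu_build_alloc_info(): on a hypothetical
 * machine with two nodes of two CPUs each and best_upa = 2, two groups
 * are produced, group 0 with cpu_map { 0, 1 } at base_offset 0 and
 * group 1 with cpu_map { 2, 3 } at base_offset 2 * unit_size, for four
 * units in total.  base_offset is later rewritten by the first chunk
 * helpers below to match where each group actually got allocated.
 */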

#if defined(BUILD_EMBED_FIRST_CHUNK)
/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 * @alloc_fn: function to allocate percpu page
 * @free_fn: function to free percpu page
 *
 * This is a helper to ease setting up embedded first percpu chunk and
 * can be called where pcpu_setup_first_chunk() is expected.
 *
 * If this function is used to setup the first chunk, it is allocated
 * by calling @alloc_fn and used as-is without being mapped into
 * vmalloc area.  Allocations are always whole multiples of @atom_size
 * aligned to @atom_size.
 *
 * This enables the first chunk to piggy back on the linear physical
 * mapping which often uses larger page size.  Please note that this
 * can result in very sparse cpu->unit mapping on NUMA machines thus
 * requiring large vmalloc address space.  Don't use this allocator if
 * vmalloc space is not orders of magnitude larger than distances
 * between node memory addresses (ie. 32bit NUMA machines).
 *
 * @dyn_size specifies the minimum dynamic area size.
 *
 * If the needed size is smaller than the minimum or specified unit
 * size, the leftover is returned using @free_fn.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
				  size_t atom_size,
				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
				  pcpu_fc_alloc_fn_t alloc_fn,
				  pcpu_fc_free_fn_t free_fn)
{
	void *base = (void *)ULONG_MAX;
	void **areas = NULL;
	struct pcpu_alloc_info *ai;
	size_t size_sum, areas_size;
	unsigned long max_distance;
	int group, i, highest_group, rc;

	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
				   cpu_distance_fn);
	if (IS_ERR(ai))
		return PTR_ERR(ai);

	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));

	areas = memblock_virt_alloc_nopanic(areas_size, 0);
	if (!areas) {
		rc = -ENOMEM;
		goto out_free;
	}

	/* allocate, copy and determine base address & max_distance */
	highest_group = 0;
	for (group = 0; group < ai->nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];
		unsigned int cpu = NR_CPUS;
		void *ptr;

		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
			cpu = gi->cpu_map[i];
		BUG_ON(cpu == NR_CPUS);

		/* allocate space for the whole group */
		ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
		if (!ptr) {
			rc = -ENOMEM;
			goto out_free_areas;
		}
		/* kmemleak tracks the percpu allocations separately */
		kmemleak_free(ptr);
		areas[group] = ptr;

		base = min(ptr, base);
		if (ptr > areas[highest_group])
			highest_group = group;
	}
	max_distance = areas[highest_group] - base;
	max_distance += ai->unit_size * ai->groups[highest_group].nr_units;

	/* warn if maximum distance is further than 75% of vmalloc space */
	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
		pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
			max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
		/* and fail if we have fallback */
		rc = -EINVAL;
		goto out_free_areas;
#endif
	}
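
	/*
	 * The check above matters because later dynamic chunks are mapped
	 * into vmalloc space at the same relative group offsets as the
	 * first chunk; if the bootmem areas of the groups span most of
	 * the vmalloc range, dynamic percpu allocations could not be
	 * satisfied later on.  Failing here, when the page-based fallback
	 * is available, is preferable to failing at runtime.
	 */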

	/*
	 * Copy data and free unused parts.  This should happen after all
	 * allocations are complete; otherwise, we may end up with
	 * overlapping groups.
	 */
	for (group = 0; group < ai->nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];
		void *ptr = areas[group];

		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
			if (gi->cpu_map[i] == NR_CPUS) {
				/* unused unit, free whole */
				free_fn(ptr, ai->unit_size);
				continue;
			}
			/* copy and return the unused part */
			memcpy(ptr, __per_cpu_load, ai->static_size);
			free_fn(ptr + size_sum, ai->unit_size - size_sum);
		}
	}
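
	/*
	 * Using the illustrative numbers from pcpu_build_alloc_info()
	 * above (512k units, 44k size_sum), the loop just above would
	 * hand roughly 468k back through @free_fn for every used unit
	 * and return padding units (cpu_map slot == NR_CPUS) in full.
	 */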

	/* base address is now known, determine group base offsets */
	for (group = 0; group < ai->nr_groups; group++)
		ai->groups[group].base_offset = areas[group] - base;

	pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
		PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
		ai->dyn_size, ai->unit_size);

	rc = pcpu_setup_first_chunk(ai, base);
	goto out_free;

out_free_areas:
	for (group = 0; group < ai->nr_groups; group++)
		if (areas[group])
			free_fn(areas[group],
				ai->groups[group].nr_units * ai->unit_size);
out_free:
	pcpu_free_alloc_info(ai);
	if (areas)
		memblock_free_early(__pa(areas), areas_size);
	return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */

#ifdef BUILD_PAGE_FIRST_CHUNK
/**
 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
 * @reserved_size: the size of reserved percpu area in bytes
 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 * @free_fn: function to free percpu page, always called with PAGE_SIZE
 * @populate_pte_fn: function to populate pte
 *
 * This is a helper to ease setting up page-remapped first percpu
 * chunk and can be called where pcpu_setup_first_chunk() is expected.
 *
 * This is the basic allocator.  Static percpu area is allocated
 * page-by-page into vmalloc area.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init pcpu_page_first_chunk(size_t reserved_size,
				 pcpu_fc_alloc_fn_t alloc_fn,
				 pcpu_fc_free_fn_t free_fn,
				 pcpu_fc_populate_pte_fn_t populate_pte_fn)
{
	static struct vm_struct vm;
	struct pcpu_alloc_info *ai;
	char psize_str[16];
	int unit_pages;
	size_t pages_size;
	struct page **pages;
	int unit, i, j, rc;
	int upa;
	int nr_g0_units;

	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);

	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
	if (IS_ERR(ai))
		return PTR_ERR(ai);
	BUG_ON(ai->nr_groups != 1);
	upa = ai->alloc_size / ai->unit_size;
	nr_g0_units = roundup(num_possible_cpus(), upa);
	if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) {
		pcpu_free_alloc_info(ai);
		return -EINVAL;
	}

	unit_pages = ai->unit_size >> PAGE_SHIFT;

	/* unaligned allocations can't be freed, round up to page size */
	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
			       sizeof(pages[0]));
	pages = memblock_virt_alloc(pages_size, 0);

	/* allocate pages */
	j = 0;
	for (unit = 0; unit < num_possible_cpus(); unit++) {
		unsigned int cpu = ai->groups[0].cpu_map[unit];
		for (i = 0; i < unit_pages; i++) {
			void *ptr;

			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
			if (!ptr) {
				pr_warn("failed to allocate %s page for cpu%u\n",
					psize_str, cpu);
				goto enomem;
			}
			/* kmemleak tracks the percpu allocations separately */
			kmemleak_free(ptr);
			pages[j++] = virt_to_page(ptr);
		}
	}

	/* allocate vm area, map the pages and copy static data */
	vm.flags = VM_ALLOC;
	vm.size = num_possible_cpus() * ai->unit_size;
	vm_area_register_early(&vm, PAGE_SIZE);

	for (unit = 0; unit < num_possible_cpus(); unit++) {
		unsigned long unit_addr =
			(unsigned long)vm.addr + unit * ai->unit_size;

		for (i = 0; i < unit_pages; i++)
			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));

		/* pte already populated, the following shouldn't fail */
		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
				      unit_pages);
		if (rc < 0)
			panic("failed to map percpu area, err=%d\n", rc);

		/*
		 * FIXME: Archs with virtual cache should flush local
		 * cache for the linear mapping here - something
		 * equivalent to flush_cache_vmap() on the local cpu.
		 * flush_cache_vmap() can't be used as most supporting
		 * data structures are not set up yet.
		 */

		/* copy static data */
		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
	}

	/* we're ready, commit */
	pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n",
		unit_pages, psize_str, vm.addr, ai->static_size,
		ai->reserved_size, ai->dyn_size);

	rc = pcpu_setup_first_chunk(ai, vm.addr);
	goto out_free_ar;

enomem:
	while (--j >= 0)
		free_fn(page_address(pages[j]), PAGE_SIZE);
	rc = -ENOMEM;
out_free_ar:
	memblock_free_early(__pa(pages), pages_size);
	pcpu_free_alloc_info(ai);
	return rc;
}
#endif /* BUILD_PAGE_FIRST_CHUNK */
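
/*
 * Illustrative sketch of the page-by-page layout: assuming four
 * possible CPUs and a hypothetical 32k unit_size, unit_pages is 8,
 * pages[] holds 32 page pointers and the early vm area spans
 * 4 * 32k = 128k of vmalloc space; each unit's pages are mapped at
 * unit * unit_size from vm.addr before the static data is copied in.
 */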

#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
 * Generic SMP percpu area setup.
 *
 * The embedding helper is used because its behavior closely resembles
 * the original non-dynamic generic percpu area setup.  This is
 * important because many archs have addressing restrictions and might
 * fail if the percpu area is located far away from the previous
 * location.  As an added bonus, in non-NUMA cases, embedding is
 * generally a good idea TLB-wise because percpu area can piggy back
 * on the physical linear memory mapping which uses large page
 * mappings on applicable archs.
 */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
				       size_t align)
{
	return memblock_virt_alloc_from_nopanic(
			size, align, __pa(MAX_DMA_ADDRESS));
}

static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
{
	memblock_free_early(__pa(ptr), size);
}

void __init setup_per_cpu_areas(void)
{
	unsigned long delta;
	unsigned int cpu;
	int rc;

	/*
	 * Always reserve area for module percpu variables.  That's
	 * what the legacy allocator did.
	 */
	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
				    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
				    pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
	if (rc < 0)
		panic("Failed to initialize percpu areas.");

	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu)
		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */

#else /* CONFIG_SMP */

/*
 * UP percpu area setup.
 *
 * UP always uses km-based percpu allocator with identity mapping.
 * Static percpu variables are indistinguishable from the usual static
 * variables and don't require any special preparation.
 */
void __init setup_per_cpu_areas(void)
{
	const size_t unit_size =
		roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
					 PERCPU_DYNAMIC_RESERVE));
	struct pcpu_alloc_info *ai;
	void *fc;

	ai = pcpu_alloc_alloc_info(1, 1);
	fc = memblock_virt_alloc_from_nopanic(unit_size,
					      PAGE_SIZE,
					      __pa(MAX_DMA_ADDRESS));
	if (!ai || !fc)
		panic("Failed to allocate memory for percpu areas.");
	/* kmemleak tracks the percpu allocations separately */
	kmemleak_free(fc);

	ai->dyn_size = unit_size;
	ai->unit_size = unit_size;
	ai->atom_size = unit_size;
	ai->alloc_size = unit_size;
	ai->groups[0].nr_units = 1;
	ai->groups[0].cpu_map[0] = 0;

	if (pcpu_setup_first_chunk(ai, fc) < 0)
		panic("Failed to initialize percpu areas.");
}

#endif /* CONFIG_SMP */
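
/*
 * Either way, once the first chunk is set up, reaching a given CPU's
 * copy of a static percpu variable is plain pointer arithmetic: with
 * the generic SMP setup above, for example, the per-CPU address is
 * roughly the variable's link-time address plus __per_cpu_offset[cpu],
 * which itself is (pcpu_base_addr - __per_cpu_start) +
 * pcpu_unit_offsets[cpu].
 */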

/*
 * First and reserved chunks are initialized with temporary allocation
 * map in initdata so that they can be used before slab is online.
 * This function is called after slab is brought up and replaces those
 * with properly allocated maps.
 */
void __init percpu_init_late(void)
{
	struct pcpu_chunk *target_chunks[] =
		{ pcpu_first_chunk, pcpu_reserved_chunk, NULL };
	struct pcpu_chunk *chunk;
	unsigned long flags;
	int i;

	for (i = 0; (chunk = target_chunks[i]); i++) {
		int *map;
		const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]);

		BUILD_BUG_ON(size > PAGE_SIZE);

		map = pcpu_mem_zalloc(size);
		BUG_ON(!map);

		spin_lock_irqsave(&pcpu_lock, flags);
		memcpy(map, chunk->map, size);
		chunk->map = map;
		spin_unlock_irqrestore(&pcpu_lock, flags);
	}
}

/*
 * Percpu allocator is initialized early during boot when neither slab nor
 * workqueue is available.  Plug async management until everything is up
 * and running.
 */
static int __init percpu_enable_async(void)
{
	pcpu_async_enabled = true;
	return 0;
}
subsys_initcall(percpu_enable_async);