/*
 * linux/mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009	SUSE Linux Products GmbH
 * Copyright (C) 2009	Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2.
 *
 * This is percpu allocator which can handle both static and dynamic
 * areas.  Percpu areas are allocated in chunks in vmalloc area.  Each
 * chunk consists of num_possible_cpus() units and the first chunk
 * is used for static percpu variables in the kernel image (special
 * boot time alloc/init handling necessary as these areas need to be
 * brought up before allocation services are running).  Unit grows as
 * necessary and all units grow or shrink in unison.  When a chunk is
 * filled up, another chunk is allocated.  ie. in vmalloc area
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done in offset-size areas of single unit space.  Ie,
 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
 * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
 * percpu base registers pcpu_unit_size apart.
 *
 * There are usually many small percpu allocations, many of them as
 * small as 4 bytes.  The allocator organizes chunks into lists
 * according to free size and tries to allocate from the fullest one.
 * Each chunk keeps the maximum contiguous area size hint which is
 * guaranteed to be equal to or larger than the maximum contiguous
 * area in the chunk.  This helps the allocator not to iterate the
 * chunk maps unnecessarily.
 *
 * Allocation state in each chunk is kept using an array of integers
 * on chunk->map.  A positive value in the map represents a free
 * region and a negative value an allocated one.  Allocation inside a
 * chunk is done by scanning this map sequentially and serving the
 * first matching entry.  This is mostly copied from the
 * percpu_modalloc() allocator.  The chunk a given address belongs to
 * is determined through the index field in the page struct, which
 * contains a pointer back to the chunk.
 *
 * To use this allocator, arch code should do the following:
 *
 * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */
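
/*
 * Illustrative sketch (not part of this file's logic): how a typical
 * caller uses the dynamic percpu allocator implemented below.  All
 * names in the sketch are made up for illustration only.
 */
#if 0
struct example_stats {
        unsigned long   hits;
};

static struct example_stats *example_stats;     /* percpu pointer */

static int example_init(void)
{
        /* one instance of struct example_stats per possible cpu */
        example_stats = alloc_percpu(struct example_stats);
        if (!example_stats)
                return -ENOMEM;
        return 0;
}

static void example_hit(void)
{
        /* access this cpu's copy with preemption disabled */
        per_cpu_ptr(example_stats, get_cpu())->hits++;
        put_cpu();
}

static void example_exit(void)
{
        free_percpu(example_stats);
}
#endif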

#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>

#define PCPU_SLOT_BASE_SHIFT            5       /* 1-31 shares the same slot */
#define PCPU_DFL_MAP_ALLOC              16      /* start a map with 16 ents */

/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)                                        \
        (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr  \
                 + (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)                                         \
        (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr   \
                 - (unsigned long)__per_cpu_start)
#endif

struct pcpu_chunk {
        struct list_head        list;           /* linked to pcpu_slot lists */
        int                     free_size;      /* free bytes in the chunk */
        int                     contig_hint;    /* max contiguous size hint */
        struct vm_struct        *vm;            /* mapped vmalloc region */
        int                     map_used;       /* # of map entries used */
        int                     map_alloc;      /* # of map entries allocated */
        int                     *map;           /* allocation map */
        bool                    immutable;      /* no [de]population allowed */
        struct page             **page;         /* points to page array */
        struct page             *page_ar[];     /* #cpus * UNIT_PAGES */
};

static int pcpu_unit_pages __read_mostly;
static int pcpu_unit_size __read_mostly;
static int pcpu_chunk_size __read_mostly;
static int pcpu_nr_slots __read_mostly;
static size_t pcpu_chunk_struct_size __read_mostly;

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __read_mostly;
EXPORT_SYMBOL_GPL(pcpu_base_addr);

/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
static struct pcpu_chunk *pcpu_first_chunk;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  The amount of
 * reserved offset is in pcpu_reserved_chunk_limit.  When reserved
 * area doesn't exist, the following variables contain NULL and 0
 * respectively.
 */
static struct pcpu_chunk *pcpu_reserved_chunk;
static int pcpu_reserved_chunk_limit;

/*
 * Synchronization rules.
 *
 * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
 * protects allocation/reclaim paths, chunks and chunk->page arrays.
 * The latter is a spinlock and protects the index data structures -
 * chunk slots, chunks and area maps in chunks.
 *
 * During allocation, pcpu_alloc_mutex is kept locked all the time and
 * pcpu_lock is grabbed and released as necessary.  All actual memory
 * allocations are done using GFP_KERNEL with pcpu_lock released.
 *
 * Free path accesses and alters only the index data structures, so it
 * can be safely called from atomic context.  When memory needs to be
 * returned to the system, the free path schedules reclaim_work which
 * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
 * reclaimed, releases both locks and frees the chunks.  Note that it's
 * necessary to grab both locks to remove a chunk from circulation as
 * the allocation path might be referencing the chunk with only
 * pcpu_alloc_mutex locked.
 */
static DEFINE_MUTEX(pcpu_alloc_mutex);  /* protects whole alloc and reclaim */
static DEFINE_SPINLOCK(pcpu_lock);      /* protects index data structures */

static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */

/* reclaim work to release fully free chunks, scheduled from free path */
static void pcpu_reclaim(struct work_struct *work);
static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);

static int __pcpu_size_to_slot(int size)
{
        int highbit = fls(size);        /* size is in bytes */
        return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}

static int pcpu_size_to_slot(int size)
{
        if (size == pcpu_unit_size)
                return pcpu_nr_slots - 1;
        return __pcpu_size_to_slot(size);
}

static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
        if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
                return 0;

        return pcpu_size_to_slot(chunk->free_size);
}
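
/*
 * For example, with PCPU_SLOT_BASE_SHIFT == 5, a chunk with 12 bytes
 * free sits in slot max(fls(12) - 5 + 2, 1) == 1, a chunk with 100
 * bytes free sits in slot 4, and a fully free chunk (free_size ==
 * pcpu_unit_size) always sits in the last slot, pcpu_nr_slots - 1.
 * Chunks with less than sizeof(int) bytes usable are parked in slot
 * 0, which the allocation path never scans.
 */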

static int pcpu_page_idx(unsigned int cpu, int page_idx)
{
        return cpu * pcpu_unit_pages + page_idx;
}

static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
                                      unsigned int cpu, int page_idx)
{
        return &chunk->page[pcpu_page_idx(cpu, page_idx)];
}

static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
                                     unsigned int cpu, int page_idx)
{
        return (unsigned long)chunk->vm->addr +
                (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
}

static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
                                     int page_idx)
{
        return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
}
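
/*
 * Example of the unit addressing above (numbers illustrative): with
 * 4k pages and pcpu_unit_pages == 8, page 3 of cpu 2 lives at index
 * 2 * 8 + 3 == 19 in chunk->page[] and is mapped at
 * chunk->vm->addr + 19 * 4k.  Units of consecutive cpus are thus
 * pcpu_unit_size == 32k apart, which is what lets percpu accessors
 * reach any cpu's copy of an object by adding a constant per-cpu
 * offset to its pointer.
 */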

/* set the pointer to a chunk in a page struct */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
        page->index = (unsigned long)pcpu;
}

/* obtain pointer to a chunk from a page struct */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
        return (struct pcpu_chunk *)page->index;
}

/**
 * pcpu_mem_alloc - allocate memory
 * @size: bytes to allocate
 *
 * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
 * kzalloc() is used; otherwise, vmalloc() is used.  The returned
 * memory is always zeroed.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_mem_alloc(size_t size)
{
        if (size <= PAGE_SIZE)
                return kzalloc(size, GFP_KERNEL);
        else {
                void *ptr = vmalloc(size);
                if (ptr)
                        memset(ptr, 0, size);
                return ptr;
        }
}

/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 * @size: size of the area
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_alloc().
 */
static void pcpu_mem_free(void *ptr, size_t size)
{
        if (size <= PAGE_SIZE)
                kfree(ptr);
        else
                vfree(ptr);
}

/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * This function is called after an allocation or free changed @chunk.
 * New slot according to the changed state is determined and @chunk is
 * moved to the slot.  Note that the reserved chunk is never put on
 * chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
        int nslot = pcpu_chunk_slot(chunk);

        if (chunk != pcpu_reserved_chunk && oslot != nslot) {
                if (oslot < nslot)
                        list_move(&chunk->list, &pcpu_slot[nslot]);
                else
                        list_move_tail(&chunk->list, &pcpu_slot[nslot]);
        }
}

/**
 * pcpu_chunk_addr_search - determine chunk containing specified address
 * @addr: address for which the chunk needs to be determined.
 *
 * RETURNS:
 * The address of the found chunk.
 */
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
        void *first_start = pcpu_first_chunk->vm->addr;

        /* is it in the first chunk? */
        if (addr >= first_start && addr < first_start + pcpu_chunk_size) {
                /* is it in the reserved area? */
                if (addr < first_start + pcpu_reserved_chunk_limit)
                        return pcpu_reserved_chunk;
                return pcpu_first_chunk;
        }

        return pcpu_get_page_chunk(vmalloc_to_page(addr));
}

/**
 * pcpu_extend_area_map - extend area map for allocation
 * @chunk: target chunk
 *
 * Extend area map of @chunk so that it can accommodate an allocation.
 * A single allocation can split an area into three areas, so this
 * function makes sure that @chunk->map has at least two extra slots.
 *
 * CONTEXT:
 * pcpu_alloc_mutex, pcpu_lock.  pcpu_lock is released and reacquired
 * if area map is extended.
 *
 * RETURNS:
 * 0 if noop, 1 if successfully extended, -errno on failure.
 */
static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
{
        int new_alloc;
        int *new;
        size_t size;

        /* has enough? */
        if (chunk->map_alloc >= chunk->map_used + 2)
                return 0;

        spin_unlock_irq(&pcpu_lock);

        new_alloc = PCPU_DFL_MAP_ALLOC;
        while (new_alloc < chunk->map_used + 2)
                new_alloc *= 2;

        new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
        if (!new) {
                spin_lock_irq(&pcpu_lock);
                return -ENOMEM;
        }

        /*
         * Acquire pcpu_lock and switch to new area map.  Only free
         * could have happened in between, so map_used couldn't have
         * grown.
         */
        spin_lock_irq(&pcpu_lock);
        BUG_ON(new_alloc < chunk->map_used + 2);

        size = chunk->map_alloc * sizeof(chunk->map[0]);
        memcpy(new, chunk->map, size);

        /*
         * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
         * one of the first chunks and still using static map.
         */
        if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
                pcpu_mem_free(chunk->map, size);

        chunk->map_alloc = new_alloc;
        chunk->map = new;
        return 0;
}

/**
 * pcpu_split_block - split a map block
 * @chunk: chunk of interest
 * @i: index of map block to split
 * @head: head size in bytes (can be 0)
 * @tail: tail size in bytes (can be 0)
 *
 * Split the @i'th map block into two or three blocks.  If @head is
 * non-zero, a block of @head bytes is inserted before block @i,
 * moving it to @i+1 and reducing its size by @head bytes.
 *
 * If @tail is non-zero, the target block, which can be @i or @i+1
 * depending on @head, is reduced by @tail bytes and a @tail byte
 * block is inserted after the target block.
 *
 * @chunk->map must have enough free slots to accommodate the split.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
                             int head, int tail)
{
        int nr_extra = !!head + !!tail;

        BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra);

        /* insert new subblocks */
        memmove(&chunk->map[i + nr_extra], &chunk->map[i],
                sizeof(chunk->map[0]) * (chunk->map_used - i));
        chunk->map_used += nr_extra;

        if (head) {
                chunk->map[i + 1] = chunk->map[i] - head;
                chunk->map[i++] = head;
        }
        if (tail) {
                chunk->map[i++] -= tail;
                chunk->map[i] = tail;
        }
}

/**
 * pcpu_alloc_area - allocate area from a pcpu_chunk
 * @chunk: chunk of interest
 * @size: wanted size in bytes
 * @align: wanted align
 *
 * Try to allocate @size bytes area aligned at @align from @chunk.
 * Note that this function only allocates the offset.  It doesn't
 * populate or map the area.
 *
 * @chunk->map must have at least two free slots.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * Allocated offset in @chunk on success, -1 if no matching area is
 * found.
 */
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
{
        int oslot = pcpu_chunk_slot(chunk);
        int max_contig = 0;
        int i, off;

        for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
                bool is_last = i + 1 == chunk->map_used;
                int head, tail;

                /* extra for alignment requirement */
                head = ALIGN(off, align) - off;
                BUG_ON(i == 0 && head != 0);

                if (chunk->map[i] < 0)
                        continue;
                if (chunk->map[i] < head + size) {
                        max_contig = max(chunk->map[i], max_contig);
                        continue;
                }

                /*
                 * If head is small or the previous block is free,
                 * merge'em.  Note that 'small' is defined as smaller
                 * than sizeof(int), which is very small but isn't too
                 * uncommon for percpu allocations.
                 */
                if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
                        if (chunk->map[i - 1] > 0)
                                chunk->map[i - 1] += head;
                        else {
                                chunk->map[i - 1] -= head;
                                chunk->free_size -= head;
                        }
                        chunk->map[i] -= head;
                        off += head;
                        head = 0;
                }

                /* if tail is small, just keep it around */
                tail = chunk->map[i] - head - size;
                if (tail < sizeof(int))
                        tail = 0;

                /* split if warranted */
                if (head || tail) {
                        pcpu_split_block(chunk, i, head, tail);
                        if (head) {
                                i++;
                                off += head;
                                max_contig = max(chunk->map[i - 1], max_contig);
                        }
                        if (tail)
                                max_contig = max(chunk->map[i + 1], max_contig);
                }

                /* update hint and mark allocated */
                if (is_last)
                        chunk->contig_hint = max_contig; /* fully scanned */
                else
                        chunk->contig_hint = max(chunk->contig_hint,
                                                 max_contig);

                chunk->free_size -= chunk->map[i];
                chunk->map[i] = -chunk->map[i];

                pcpu_chunk_relocate(chunk, oslot);
                return off;
        }

        chunk->contig_hint = max_contig;        /* fully scanned */
        pcpu_chunk_relocate(chunk, oslot);

        /* tell the upper layer that this chunk has no matching area */
        return -1;
}
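
/*
 * Worked example of the area map handling above and in
 * pcpu_free_area() below (sizes illustrative, 64k unit, alignments
 * small enough that no head padding is needed):
 *
 *   fresh chunk                     map = {  65536 }
 *   alloc 512  -> offset 0          map = {   -512, 65024 }
 *   alloc 256  -> offset 512        map = {   -512,  -256, 64768 }
 *   free offset 0                   map = {    512,  -256, 64768 }
 *   free offset 512                 map = {  65536 }
 *
 * Offsets are implicit: the offset of entry i is the sum of the
 * absolute values of entries 0..i-1.  The last free merges with both
 * neighbours and restores the single free area.
 */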

/**
 * pcpu_free_area - free area to a pcpu_chunk
 * @chunk: chunk of interest
 * @freeme: offset of area to free
 *
 * Free the area starting at @freeme in @chunk.  Note that this
 * function only modifies the allocation map.  It doesn't depopulate
 * or unmap the area.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
{
        int oslot = pcpu_chunk_slot(chunk);
        int i, off;

        for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
                if (off == freeme)
                        break;
        BUG_ON(off != freeme);
        BUG_ON(chunk->map[i] > 0);

        chunk->map[i] = -chunk->map[i];
        chunk->free_size += chunk->map[i];

        /* merge with previous? */
        if (i > 0 && chunk->map[i - 1] >= 0) {
                chunk->map[i - 1] += chunk->map[i];
                chunk->map_used--;
                memmove(&chunk->map[i], &chunk->map[i + 1],
                        (chunk->map_used - i) * sizeof(chunk->map[0]));
                i--;
        }
        /* merge with next? */
        if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
                chunk->map[i] += chunk->map[i + 1];
                chunk->map_used--;
                memmove(&chunk->map[i + 1], &chunk->map[i + 2],
                        (chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
        }

        chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
        pcpu_chunk_relocate(chunk, oslot);
}

/**
 * pcpu_unmap - unmap pages out of a pcpu_chunk
 * @chunk: chunk of interest
 * @page_start: page index of the first page to unmap
 * @page_end: page index of the last page to unmap + 1
 * @flush: whether to flush cache and tlb or not
 *
 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
 * If @flush is true, vcache is flushed before unmapping and tlb
 * after.
 */
static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
                       bool flush)
{
        unsigned int last = num_possible_cpus() - 1;
        unsigned int cpu;

        /* unmap must not be done on immutable chunk */
        WARN_ON(chunk->immutable);

        /*
         * Each flushing trial can be very expensive, issue flush on
         * the whole region at once rather than doing it for each cpu.
         * This could be overkill but is more scalable.
         */
        if (flush)
                flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
                                   pcpu_chunk_addr(chunk, last, page_end));

        for_each_possible_cpu(cpu)
                unmap_kernel_range_noflush(
                                pcpu_chunk_addr(chunk, cpu, page_start),
                                (page_end - page_start) << PAGE_SHIFT);

        /* ditto as flush_cache_vunmap() */
        if (flush)
                flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
                                       pcpu_chunk_addr(chunk, last, page_end));
}

/**
 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
 * @chunk: chunk to depopulate
 * @off: offset to the area to depopulate
 * @size: size of the area to depopulate in bytes
 * @flush: whether to flush cache and tlb or not
 *
 * For each cpu, depopulate and unmap the pages covering
 * [@off, @off + @size) from @chunk.  If @flush is true, vcache is
 * flushed before unmapping and tlb after.
 *
 * CONTEXT:
 * pcpu_alloc_mutex.
 */
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
                                  bool flush)
{
        int page_start = PFN_DOWN(off);
        int page_end = PFN_UP(off + size);
        int unmap_start = -1;
        int uninitialized_var(unmap_end);
        unsigned int cpu;
        int i;

        for (i = page_start; i < page_end; i++) {
                for_each_possible_cpu(cpu) {
                        struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);

                        if (!*pagep)
                                continue;

                        __free_page(*pagep);

                        /*
                         * If it's partial depopulation, it might get
                         * populated or depopulated again.  Mark the
                         * page gone.
                         */
                        *pagep = NULL;

                        unmap_start = unmap_start < 0 ? i : unmap_start;
                        unmap_end = i + 1;
                }
        }

        if (unmap_start >= 0)
                pcpu_unmap(chunk, unmap_start, unmap_end, flush);
}

/**
 * pcpu_map - map pages into a pcpu_chunk
 * @chunk: chunk of interest
 * @page_start: page index of the first page to map
 * @page_end: page index of the last page to map + 1
 *
 * For each cpu, map pages [@page_start,@page_end) into @chunk.
 * vcache is flushed afterwards.
 */
static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
{
        unsigned int last = num_possible_cpus() - 1;
        unsigned int cpu;
        int err;

        /* map must not be done on immutable chunk */
        WARN_ON(chunk->immutable);

        for_each_possible_cpu(cpu) {
                err = map_kernel_range_noflush(
                                pcpu_chunk_addr(chunk, cpu, page_start),
                                (page_end - page_start) << PAGE_SHIFT,
                                PAGE_KERNEL,
                                pcpu_chunk_pagep(chunk, cpu, page_start));
                if (err < 0)
                        return err;
        }

        /* flush at once, please read comments in pcpu_unmap() */
        flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
                         pcpu_chunk_addr(chunk, last, page_end));
        return 0;
}

/**
 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
 * @chunk: chunk of interest
 * @off: offset to the area to populate
 * @size: size of the area to populate in bytes
 *
 * For each cpu, populate and map the pages covering
 * [@off, @off + @size) into @chunk.  The area is cleared on return.
 *
 * CONTEXT:
 * pcpu_alloc_mutex, does GFP_KERNEL allocation.
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
{
        const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
        int page_start = PFN_DOWN(off);
        int page_end = PFN_UP(off + size);
        int map_start = -1;
        int uninitialized_var(map_end);
        unsigned int cpu;
        int i;

        for (i = page_start; i < page_end; i++) {
                if (pcpu_chunk_page_occupied(chunk, i)) {
                        if (map_start >= 0) {
                                if (pcpu_map(chunk, map_start, map_end))
                                        goto err;
                                map_start = -1;
                        }
                        continue;
                }

                map_start = map_start < 0 ? i : map_start;
                map_end = i + 1;

                for_each_possible_cpu(cpu) {
                        struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);

                        *pagep = alloc_pages_node(cpu_to_node(cpu),
                                                  alloc_mask, 0);
                        if (!*pagep)
                                goto err;
                        pcpu_set_page_chunk(*pagep, chunk);
                }
        }

        if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
                goto err;

        for_each_possible_cpu(cpu)
                memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
                       size);

        return 0;
err:
        /* likely under heavy memory pressure, give memory back */
        pcpu_depopulate_chunk(chunk, off, size, true);
        return -ENOMEM;
}

static void free_pcpu_chunk(struct pcpu_chunk *chunk)
{
        if (!chunk)
                return;
        if (chunk->vm)
                free_vm_area(chunk->vm);
        pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
        kfree(chunk);
}

static struct pcpu_chunk *alloc_pcpu_chunk(void)
{
        struct pcpu_chunk *chunk;

        chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
        if (!chunk)
                return NULL;

        chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
        if (!chunk->map) {
                kfree(chunk);
                return NULL;
        }
        chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
        chunk->map[chunk->map_used++] = pcpu_unit_size;
        chunk->page = chunk->page_ar;

        chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
        if (!chunk->vm) {
                free_pcpu_chunk(chunk);
                return NULL;
        }

        INIT_LIST_HEAD(&chunk->list);
        chunk->free_size = pcpu_unit_size;
        chunk->contig_hint = pcpu_unit_size;

        return chunk;
}

/**
 * pcpu_alloc - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 *
 * Allocate percpu area of @size bytes aligned at @align.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_alloc(size_t size, size_t align, bool reserved)
{
        struct pcpu_chunk *chunk;
        int slot, off;

        if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
                WARN(true, "illegal size (%zu) or align (%zu) for "
                     "percpu allocation\n", size, align);
                return NULL;
        }

        mutex_lock(&pcpu_alloc_mutex);
        spin_lock_irq(&pcpu_lock);

        /* serve reserved allocations from the reserved chunk if available */
        if (reserved && pcpu_reserved_chunk) {
                chunk = pcpu_reserved_chunk;
                if (size > chunk->contig_hint ||
                    pcpu_extend_area_map(chunk) < 0)
                        goto fail_unlock;
                off = pcpu_alloc_area(chunk, size, align);
                if (off >= 0)
                        goto area_found;
                goto fail_unlock;
        }

restart:
        /* search through normal chunks */
        for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
                list_for_each_entry(chunk, &pcpu_slot[slot], list) {
                        if (size > chunk->contig_hint)
                                continue;

                        switch (pcpu_extend_area_map(chunk)) {
                        case 0:
                                break;
                        case 1:
                                goto restart;   /* pcpu_lock dropped, restart */
                        default:
                                goto fail_unlock;
                        }

                        off = pcpu_alloc_area(chunk, size, align);
                        if (off >= 0)
                                goto area_found;
                }
        }

        /* hmmm... no space left, create a new chunk */
        spin_unlock_irq(&pcpu_lock);

        chunk = alloc_pcpu_chunk();
        if (!chunk)
                goto fail_unlock_mutex;

        spin_lock_irq(&pcpu_lock);
        pcpu_chunk_relocate(chunk, -1);
        goto restart;

area_found:
        spin_unlock_irq(&pcpu_lock);

        /* populate, map and clear the area */
        if (pcpu_populate_chunk(chunk, off, size)) {
                spin_lock_irq(&pcpu_lock);
                pcpu_free_area(chunk, off);
                goto fail_unlock;
        }

        mutex_unlock(&pcpu_alloc_mutex);

        return __addr_to_pcpu_ptr(chunk->vm->addr + off);

fail_unlock:
        spin_unlock_irq(&pcpu_lock);
fail_unlock_mutex:
        mutex_unlock(&pcpu_alloc_mutex);
        return NULL;
}

/**
 * __alloc_percpu - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Allocate percpu area of @size bytes aligned at @align.  Might
 * sleep.  Might trigger writeouts.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void *__alloc_percpu(size_t size, size_t align)
{
        return pcpu_alloc(size, align, false);
}
EXPORT_SYMBOL_GPL(__alloc_percpu);

/**
 * __alloc_reserved_percpu - allocate reserved percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Allocate percpu area of @size bytes aligned at @align from reserved
 * percpu area if arch has set it up; otherwise, allocation is served
 * from the same dynamic area.  Might sleep.  Might trigger writeouts.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void *__alloc_reserved_percpu(size_t size, size_t align)
{
        return pcpu_alloc(size, align, true);
}
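
/*
 * Illustrative sketch (not part of this file): the module loader is
 * the typical user of the reserved area; its allocation path looks
 * roughly like the function below.  The name is made up for
 * illustration.
 */
#if 0
static void *example_module_percpu_alloc(unsigned long size,
                                         unsigned long align)
{
        void *ptr;

        /* percpu areas cannot be aligned beyond PAGE_SIZE, clamp */
        if (align > PAGE_SIZE)
                align = PAGE_SIZE;

        /* served from the reserved part of the first chunk if set up */
        ptr = __alloc_reserved_percpu(size, align);
        if (!ptr)
                printk(KERN_WARNING
                       "example: could not allocate %lu bytes percpu data\n",
                       size);
        return ptr;
}
#endif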

/**
 * pcpu_reclaim - reclaim fully free chunks, workqueue function
 * @work: unused
 *
 * Reclaim all fully free chunks except for the first one.
 *
 * CONTEXT:
 * workqueue context.
 */
static void pcpu_reclaim(struct work_struct *work)
{
        LIST_HEAD(todo);
        struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
        struct pcpu_chunk *chunk, *next;

        mutex_lock(&pcpu_alloc_mutex);
        spin_lock_irq(&pcpu_lock);

        list_for_each_entry_safe(chunk, next, head, list) {
                WARN_ON(chunk->immutable);

                /* spare the first one */
                if (chunk == list_first_entry(head, struct pcpu_chunk, list))
                        continue;

                list_move(&chunk->list, &todo);
        }

        spin_unlock_irq(&pcpu_lock);
        mutex_unlock(&pcpu_alloc_mutex);

        list_for_each_entry_safe(chunk, next, &todo, list) {
                pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
                free_pcpu_chunk(chunk);
        }
}

/**
 * free_percpu - free percpu area
 * @ptr: pointer to area to free
 *
 * Free percpu area @ptr.
 *
 * CONTEXT:
 * Can be called from atomic context.
 */
void free_percpu(void *ptr)
{
        void *addr = __pcpu_ptr_to_addr(ptr);
        struct pcpu_chunk *chunk;
        unsigned long flags;
        int off;

        if (!ptr)
                return;

        spin_lock_irqsave(&pcpu_lock, flags);

        chunk = pcpu_chunk_addr_search(addr);
        off = addr - chunk->vm->addr;

        pcpu_free_area(chunk, off);

        /* if there is more than one fully free chunk, wake up the grim reaper */
        if (chunk->free_size == pcpu_unit_size) {
                struct pcpu_chunk *pos;

                list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
                        if (pos != chunk) {
                                schedule_work(&pcpu_reclaim_work);
                                break;
                        }
        }

        spin_unlock_irqrestore(&pcpu_lock, flags);
}
EXPORT_SYMBOL_GPL(free_percpu);

/**
 * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @get_page_fn: callback to fetch page pointer
 * @static_size: the size of static percpu area in bytes
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
 * @base_addr: mapped address, NULL for auto
 * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
 *
 * Initialize the first percpu chunk which contains the kernel static
 * percpu area.  This function is to be called from arch percpu area
 * setup path.  The first two parameters are mandatory.  The rest are
 * optional.
 *
 * @get_page_fn() should return pointer to percpu page given cpu
 * number and page number.  It should at least return enough pages to
 * cover the static area.  The returned pages for static area should
 * have been initialized with valid data.  If @unit_size is specified,
 * it can also return pages after the static area.  NULL return
 * indicates end of pages for the cpu.  Note that @get_page_fn() must
 * return the same number of pages for all cpus.
 *
 * @reserved_size, if non-zero, specifies the amount of bytes to
 * reserve after the static area in the first chunk.  This reserves
 * the first chunk such that it's available only through reserved
 * percpu allocation.  This is primarily used to serve module percpu
 * static areas on architectures where the addressing model has
 * limited offset range for symbol relocations to guarantee module
 * percpu symbols fall inside the relocatable range.
 *
 * @dyn_size, if non-negative, determines the number of bytes
 * available for dynamic allocation in the first chunk.  Specifying a
 * non-negative value makes percpu leave alone the area beyond
 * @static_size + @reserved_size + @dyn_size.
 *
 * @unit_size, if non-negative, specifies unit size and must be
 * aligned to PAGE_SIZE and equal to or larger than @static_size +
 * @reserved_size + @dyn_size (when @dyn_size is non-negative).
 *
 * Non-null @base_addr means that the caller already allocated virtual
 * region for the first chunk and mapped it.  percpu must not mess
 * with the chunk.  Note that @base_addr with 0 @unit_size or non-NULL
 * @populate_pte_fn doesn't make any sense.
 *
 * @populate_pte_fn is used to populate the pagetable.  NULL means the
 * caller already populated the pagetable.
 *
 * If the first chunk ends up with both reserved and dynamic areas, it
 * is served by two chunks - one to serve the core static and reserved
 * areas and the other for the dynamic area.  They share the same vm
 * and page map but use different area allocation maps to stay away
 * from each other.  The latter chunk is circulated in the chunk slots
 * and available for dynamic allocation like any other chunk.
 *
 * RETURNS:
 * The determined pcpu_unit_size which can be used to initialize
 * percpu access.
 */
size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
                                     size_t static_size, size_t reserved_size,
                                     ssize_t dyn_size, ssize_t unit_size,
                                     void *base_addr,
                                     pcpu_populate_pte_fn_t populate_pte_fn)
{
        static struct vm_struct first_vm;
        static int smap[2], dmap[2];
        size_t size_sum = static_size + reserved_size +
                          (dyn_size >= 0 ? dyn_size : 0);
        struct pcpu_chunk *schunk, *dchunk = NULL;
        unsigned int cpu;
        int nr_pages;
        int err, i;

        /* sanity checks */
        BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
                     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
        BUG_ON(!static_size);
        if (unit_size >= 0) {
                BUG_ON(unit_size < size_sum);
                BUG_ON(unit_size & ~PAGE_MASK);
                BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
        } else
                BUG_ON(base_addr);
        BUG_ON(base_addr && populate_pte_fn);

        if (unit_size >= 0)
                pcpu_unit_pages = unit_size >> PAGE_SHIFT;
        else
                pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
                                        PFN_UP(size_sum));

        pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
        pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
        pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
                + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);

        if (dyn_size < 0)
                dyn_size = pcpu_unit_size - static_size - reserved_size;

        /*
         * Allocate chunk slots.  The additional last slot is for
         * empty chunks.
         */
        pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
        pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
        for (i = 0; i < pcpu_nr_slots; i++)
                INIT_LIST_HEAD(&pcpu_slot[i]);

        /*
         * Initialize static chunk.  If reserved_size is zero, the
         * static chunk covers static area + dynamic allocation area
         * in the first chunk.  If reserved_size is not zero, it
         * covers static area + reserved area (mostly used for module
         * static percpu allocation).
         */
        schunk = alloc_bootmem(pcpu_chunk_struct_size);
        INIT_LIST_HEAD(&schunk->list);
        schunk->vm = &first_vm;
        schunk->map = smap;
        schunk->map_alloc = ARRAY_SIZE(smap);
        schunk->page = schunk->page_ar;

        if (reserved_size) {
                schunk->free_size = reserved_size;
                pcpu_reserved_chunk = schunk;
                pcpu_reserved_chunk_limit = static_size + reserved_size;
        } else {
                schunk->free_size = dyn_size;
                dyn_size = 0;                   /* dynamic area covered */
        }
        schunk->contig_hint = schunk->free_size;

        schunk->map[schunk->map_used++] = -static_size;
        if (schunk->free_size)
                schunk->map[schunk->map_used++] = schunk->free_size;

        /* init dynamic chunk if necessary */
        if (dyn_size) {
                dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
                INIT_LIST_HEAD(&dchunk->list);
                dchunk->vm = &first_vm;
                dchunk->map = dmap;
                dchunk->map_alloc = ARRAY_SIZE(dmap);
                dchunk->page = schunk->page_ar; /* share page map with schunk */

                dchunk->contig_hint = dchunk->free_size = dyn_size;
                dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
                dchunk->map[dchunk->map_used++] = dchunk->free_size;
        }

        /* allocate vm address */
        first_vm.flags = VM_ALLOC;
        first_vm.size = pcpu_chunk_size;

        if (!base_addr)
                vm_area_register_early(&first_vm, PAGE_SIZE);
        else {
                /*
                 * Pages already mapped.  No need to remap into
                 * vmalloc area.  In this case the first chunks can't
                 * be mapped or unmapped by percpu and are marked
                 * immutable.
                 */
                first_vm.addr = base_addr;
                schunk->immutable = true;
                if (dchunk)
                        dchunk->immutable = true;
        }

        /* assign pages */
        nr_pages = -1;
        for_each_possible_cpu(cpu) {
                for (i = 0; i < pcpu_unit_pages; i++) {
                        struct page *page = get_page_fn(cpu, i);

                        if (!page)
                                break;
                        *pcpu_chunk_pagep(schunk, cpu, i) = page;
                }

                BUG_ON(i < PFN_UP(static_size));

                if (nr_pages < 0)
                        nr_pages = i;
                else
                        BUG_ON(nr_pages != i);
        }

        /* map them */
        if (populate_pte_fn) {
                for_each_possible_cpu(cpu)
                        for (i = 0; i < nr_pages; i++)
                                populate_pte_fn(pcpu_chunk_addr(schunk,
                                                                cpu, i));

                err = pcpu_map(schunk, 0, nr_pages);
                if (err)
                        panic("failed to setup static percpu area, err=%d\n",
                              err);
        }

        /* link the first chunk in */
        pcpu_first_chunk = dchunk ?: schunk;
        pcpu_chunk_relocate(pcpu_first_chunk, -1);

        /* we're done */
        pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
        return pcpu_unit_size;
}
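
/*
 * Illustration of the layout built above (sizes made up): with
 * static_size == 40k, reserved_size == 8k, dyn_size == 20k and
 * unit_size == 68k, the static chunk covers the static and reserved
 * areas with map = { -40k, 8k } and is reachable only through
 * reserved allocations, while the dynamic chunk shares the same vm
 * area and page map, covers the rest with map = { -48k, 20k } and is
 * circulated in the chunk slots like any other chunk.
 */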

/*
 * Embedding first chunk setup helper.
 */
static void *pcpue_ptr __initdata;
static size_t pcpue_size __initdata;
static size_t pcpue_unit_size __initdata;

static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
{
        size_t off = (size_t)pageno << PAGE_SHIFT;

        if (off >= pcpue_size)
                return NULL;

        return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
}

/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @static_size: the size of static percpu area in bytes
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
 *
 * This is a helper to ease setting up an embedded first percpu chunk
 * and can be called where pcpu_setup_first_chunk() is expected.
 *
 * If this function is used to setup the first chunk, it is allocated
 * as a contiguous area using the bootmem allocator and used as-is
 * without being mapped into the vmalloc area.  This enables the first
 * chunk to piggyback on the linear physical mapping which often uses
 * larger page sizes.
 *
 * When @dyn_size is positive, the dynamic area might be larger than
 * specified to fill page alignment.  Also, when @dyn_size is auto,
 * @dyn_size does not fill the whole first chunk but only what's
 * necessary for page alignment after static and reserved areas.
 *
 * If the needed size is smaller than the minimum or specified unit
 * size, the leftover is returned to the bootmem allocator.
 *
 * RETURNS:
 * The determined pcpu_unit_size which can be used to initialize
 * percpu access on success, -errno on failure.
 */
ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
                                      ssize_t dyn_size, ssize_t unit_size)
{
        unsigned int cpu;

        /* determine parameters and allocate */
        pcpue_size = PFN_ALIGN(static_size + reserved_size +
                               (dyn_size >= 0 ? dyn_size : 0));
        if (dyn_size != 0)
                dyn_size = pcpue_size - static_size - reserved_size;

        if (unit_size >= 0) {
                BUG_ON(unit_size < pcpue_size);
                pcpue_unit_size = unit_size;
        } else
                pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);

        pcpue_ptr = __alloc_bootmem_nopanic(
                                        num_possible_cpus() * pcpue_unit_size,
                                        PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
        if (!pcpue_ptr)
                return -ENOMEM;

        /* return the leftover and copy */
        for_each_possible_cpu(cpu) {
                void *ptr = pcpue_ptr + cpu * pcpue_unit_size;

                free_bootmem(__pa(ptr + pcpue_size),
                             pcpue_unit_size - pcpue_size);
                memcpy(ptr, __per_cpu_load, static_size);
        }

        /* we're ready, commit */
        pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
                pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);

        return pcpu_setup_first_chunk(pcpue_get_page, static_size,
                                      reserved_size, dyn_size,
                                      pcpue_unit_size, pcpue_ptr, NULL);
}
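
/*
 * Illustrative sketch (not part of this file): arch percpu setup code
 * typically calls one of the first chunk initializers above and then
 * points its percpu offsets at the new area, roughly as below.  The
 * function name is made up and the offset array shown is the generic
 * __per_cpu_offset[]; SMP architectures may wire this up differently.
 */
#if 0
void __init example_setup_per_cpu_areas(void)
{
        size_t static_size = __per_cpu_end - __per_cpu_start;
        ssize_t unit_size;
        unsigned long delta;
        unsigned int cpu;

        unit_size = pcpu_embed_first_chunk(static_size,
                                           PERCPU_MODULE_RESERVE, -1, -1);
        if (unit_size < 0)
                panic("example: failed to initialize percpu areas");

        /* units sit unit_size apart starting at pcpu_base_addr */
        delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
        for_each_possible_cpu(cpu)
                __per_cpu_offset[cpu] = delta + cpu * unit_size;
}
#endif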