/*
 * linux/mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009		SUSE Linux Products GmbH
 * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2.
 *
 * This is percpu allocator which can handle both static and dynamic
 * areas.  Percpu areas are allocated in chunks in vmalloc area.  Each
 * chunk consists of num_possible_cpus() units and the first chunk
 * is used for static percpu variables in the kernel image (special
 * boot time alloc/init handling necessary as these areas need to be
 * brought up before allocation services are running).  Unit grows as
 * necessary and all units grow or shrink in unison.  When a chunk is
 * filled up, another chunk is allocated.  ie. in vmalloc area
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done in offset-size areas of single unit space.  Ie,
 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
 * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
 * percpu base registers pcpu_unit_size apart.
 *
 * There are usually many small percpu allocations many of them as
 * small as 4 bytes.  The allocator organizes chunks into lists
 * according to free size and tries to allocate from the fullest one.
 * Each chunk keeps the maximum contiguous area size hint which is
 * guaranteed to be equal to or larger than the maximum contiguous
 * area in the chunk.  This helps the allocator not to iterate the
 * chunk maps unnecessarily.
 *
 * Allocation state in each chunk is kept using an array of integers
 * on chunk->map.  A positive value in the map represents a free
 * region and negative allocated.  Allocation inside a chunk is done
 * by scanning this map sequentially and serving the first matching
 * entry.
This is mostly copied from the percpu_modalloc() allocator. 41 * Chunks can be determined from the address using the index field 42 * in the page struct. The index field contains a pointer to the chunk. 43 * 44 * To use this allocator, arch code should do the followings. 45 * 46 * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 47 * 48 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate 49 * regular address to percpu pointer and back if they need to be 50 * different from the default 51 * 52 * - use pcpu_setup_first_chunk() during percpu area initialization to 53 * setup the first chunk containing the kernel static percpu area 54 */ 55 56 #include <linux/bitmap.h> 57 #include <linux/bootmem.h> 58 #include <linux/list.h> 59 #include <linux/mm.h> 60 #include <linux/module.h> 61 #include <linux/mutex.h> 62 #include <linux/percpu.h> 63 #include <linux/pfn.h> 64 #include <linux/slab.h> 65 #include <linux/spinlock.h> 66 #include <linux/vmalloc.h> 67 #include <linux/workqueue.h> 68 69 #include <asm/cacheflush.h> 70 #include <asm/sections.h> 71 #include <asm/tlbflush.h> 72 73 #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ 74 #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ 75 76 /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ 77 #ifndef __addr_to_pcpu_ptr 78 #define __addr_to_pcpu_ptr(addr) \ 79 (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ 80 + (unsigned long)__per_cpu_start) 81 #endif 82 #ifndef __pcpu_ptr_to_addr 83 #define __pcpu_ptr_to_addr(ptr) \ 84 (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ 85 - (unsigned long)__per_cpu_start) 86 #endif 87 88 struct pcpu_chunk { 89 struct list_head list; /* linked to pcpu_slot lists */ 90 int free_size; /* free bytes in the chunk */ 91 int contig_hint; /* max contiguous size hint */ 92 struct vm_struct *vm; /* mapped vmalloc region */ 93 int map_used; /* # of map entries used */ 94 int map_alloc; /* # of map entries 
allocated */ 95 int *map; /* allocation map */ 96 bool immutable; /* no [de]population allowed */ 97 struct page **page; /* points to page array */ 98 struct page *page_ar[]; /* #cpus * UNIT_PAGES */ 99 }; 100 101 static int pcpu_unit_pages __read_mostly; 102 static int pcpu_unit_size __read_mostly; 103 static int pcpu_chunk_size __read_mostly; 104 static int pcpu_nr_slots __read_mostly; 105 static size_t pcpu_chunk_struct_size __read_mostly; 106 107 /* the address of the first chunk which starts with the kernel static area */ 108 void *pcpu_base_addr __read_mostly; 109 EXPORT_SYMBOL_GPL(pcpu_base_addr); 110 111 /* 112 * The first chunk which always exists. Note that unlike other 113 * chunks, this one can be allocated and mapped in several different 114 * ways and thus often doesn't live in the vmalloc area. 115 */ 116 static struct pcpu_chunk *pcpu_first_chunk; 117 118 /* 119 * Optional reserved chunk. This chunk reserves part of the first 120 * chunk and serves it for reserved allocations. The amount of 121 * reserved offset is in pcpu_reserved_chunk_limit. When reserved 122 * area doesn't exist, the following variables contain NULL and 0 123 * respectively. 124 */ 125 static struct pcpu_chunk *pcpu_reserved_chunk; 126 static int pcpu_reserved_chunk_limit; 127 128 /* 129 * Synchronization rules. 130 * 131 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former 132 * protects allocation/reclaim paths, chunks and chunk->page arrays. 133 * The latter is a spinlock and protects the index data structures - 134 * chunk slots, chunks and area maps in chunks. 135 * 136 * During allocation, pcpu_alloc_mutex is kept locked all the time and 137 * pcpu_lock is grabbed and released as necessary. All actual memory 138 * allocations are done using GFP_KERNEL with pcpu_lock released. 139 * 140 * Free path accesses and alters only the index data structures, so it 141 * can be safely called from atomic context. 
When memory needs to be 142 * returned to the system, free path schedules reclaim_work which 143 * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be 144 * reclaimed, release both locks and frees the chunks. Note that it's 145 * necessary to grab both locks to remove a chunk from circulation as 146 * allocation path might be referencing the chunk with only 147 * pcpu_alloc_mutex locked. 148 */ 149 static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ 150 static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ 151 152 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ 153 154 /* reclaim work to release fully free chunks, scheduled from free path */ 155 static void pcpu_reclaim(struct work_struct *work); 156 static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); 157 158 static int __pcpu_size_to_slot(int size) 159 { 160 int highbit = fls(size); /* size is in bytes */ 161 return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); 162 } 163 164 static int pcpu_size_to_slot(int size) 165 { 166 if (size == pcpu_unit_size) 167 return pcpu_nr_slots - 1; 168 return __pcpu_size_to_slot(size); 169 } 170 171 static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) 172 { 173 if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) 174 return 0; 175 176 return pcpu_size_to_slot(chunk->free_size); 177 } 178 179 static int pcpu_page_idx(unsigned int cpu, int page_idx) 180 { 181 return cpu * pcpu_unit_pages + page_idx; 182 } 183 184 static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, 185 unsigned int cpu, int page_idx) 186 { 187 return &chunk->page[pcpu_page_idx(cpu, page_idx)]; 188 } 189 190 static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, 191 unsigned int cpu, int page_idx) 192 { 193 return (unsigned long)chunk->vm->addr + 194 (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); 195 } 196 197 static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, 198 int page_idx) 199 { 
200 return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; 201 } 202 203 /* set the pointer to a chunk in a page struct */ 204 static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) 205 { 206 page->index = (unsigned long)pcpu; 207 } 208 209 /* obtain pointer to a chunk from a page struct */ 210 static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) 211 { 212 return (struct pcpu_chunk *)page->index; 213 } 214 215 /** 216 * pcpu_mem_alloc - allocate memory 217 * @size: bytes to allocate 218 * 219 * Allocate @size bytes. If @size is smaller than PAGE_SIZE, 220 * kzalloc() is used; otherwise, vmalloc() is used. The returned 221 * memory is always zeroed. 222 * 223 * CONTEXT: 224 * Does GFP_KERNEL allocation. 225 * 226 * RETURNS: 227 * Pointer to the allocated area on success, NULL on failure. 228 */ 229 static void *pcpu_mem_alloc(size_t size) 230 { 231 if (size <= PAGE_SIZE) 232 return kzalloc(size, GFP_KERNEL); 233 else { 234 void *ptr = vmalloc(size); 235 if (ptr) 236 memset(ptr, 0, size); 237 return ptr; 238 } 239 } 240 241 /** 242 * pcpu_mem_free - free memory 243 * @ptr: memory to free 244 * @size: size of the area 245 * 246 * Free @ptr. @ptr should have been allocated using pcpu_mem_alloc(). 247 */ 248 static void pcpu_mem_free(void *ptr, size_t size) 249 { 250 if (size <= PAGE_SIZE) 251 kfree(ptr); 252 else 253 vfree(ptr); 254 } 255 256 /** 257 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot 258 * @chunk: chunk of interest 259 * @oslot: the previous slot it was on 260 * 261 * This function is called after an allocation or free changed @chunk. 262 * New slot according to the changed state is determined and @chunk is 263 * moved to the slot. Note that the reserved chunk is never put on 264 * chunk slots. 265 * 266 * CONTEXT: 267 * pcpu_lock. 
268 */ 269 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) 270 { 271 int nslot = pcpu_chunk_slot(chunk); 272 273 if (chunk != pcpu_reserved_chunk && oslot != nslot) { 274 if (oslot < nslot) 275 list_move(&chunk->list, &pcpu_slot[nslot]); 276 else 277 list_move_tail(&chunk->list, &pcpu_slot[nslot]); 278 } 279 } 280 281 /** 282 * pcpu_chunk_addr_search - determine chunk containing specified address 283 * @addr: address for which the chunk needs to be determined. 284 * 285 * RETURNS: 286 * The address of the found chunk. 287 */ 288 static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) 289 { 290 void *first_start = pcpu_first_chunk->vm->addr; 291 292 /* is it in the first chunk? */ 293 if (addr >= first_start && addr < first_start + pcpu_chunk_size) { 294 /* is it in the reserved area? */ 295 if (addr < first_start + pcpu_reserved_chunk_limit) 296 return pcpu_reserved_chunk; 297 return pcpu_first_chunk; 298 } 299 300 return pcpu_get_page_chunk(vmalloc_to_page(addr)); 301 } 302 303 /** 304 * pcpu_extend_area_map - extend area map for allocation 305 * @chunk: target chunk 306 * 307 * Extend area map of @chunk so that it can accomodate an allocation. 308 * A single allocation can split an area into three areas, so this 309 * function makes sure that @chunk->map has at least two extra slots. 310 * 311 * CONTEXT: 312 * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired 313 * if area map is extended. 314 * 315 * RETURNS: 316 * 0 if noop, 1 if successfully extended, -errno on failure. 317 */ 318 static int pcpu_extend_area_map(struct pcpu_chunk *chunk) 319 { 320 int new_alloc; 321 int *new; 322 size_t size; 323 324 /* has enough? 
*/ 325 if (chunk->map_alloc >= chunk->map_used + 2) 326 return 0; 327 328 spin_unlock_irq(&pcpu_lock); 329 330 new_alloc = PCPU_DFL_MAP_ALLOC; 331 while (new_alloc < chunk->map_used + 2) 332 new_alloc *= 2; 333 334 new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); 335 if (!new) { 336 spin_lock_irq(&pcpu_lock); 337 return -ENOMEM; 338 } 339 340 /* 341 * Acquire pcpu_lock and switch to new area map. Only free 342 * could have happened inbetween, so map_used couldn't have 343 * grown. 344 */ 345 spin_lock_irq(&pcpu_lock); 346 BUG_ON(new_alloc < chunk->map_used + 2); 347 348 size = chunk->map_alloc * sizeof(chunk->map[0]); 349 memcpy(new, chunk->map, size); 350 351 /* 352 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is 353 * one of the first chunks and still using static map. 354 */ 355 if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) 356 pcpu_mem_free(chunk->map, size); 357 358 chunk->map_alloc = new_alloc; 359 chunk->map = new; 360 return 0; 361 } 362 363 /** 364 * pcpu_split_block - split a map block 365 * @chunk: chunk of interest 366 * @i: index of map block to split 367 * @head: head size in bytes (can be 0) 368 * @tail: tail size in bytes (can be 0) 369 * 370 * Split the @i'th map block into two or three blocks. If @head is 371 * non-zero, @head bytes block is inserted before block @i moving it 372 * to @i+1 and reducing its size by @head bytes. 373 * 374 * If @tail is non-zero, the target block, which can be @i or @i+1 375 * depending on @head, is reduced by @tail bytes and @tail byte block 376 * is inserted after the target block. 377 * 378 * @chunk->map must have enough free slots to accomodate the split. 379 * 380 * CONTEXT: 381 * pcpu_lock. 
382 */ 383 static void pcpu_split_block(struct pcpu_chunk *chunk, int i, 384 int head, int tail) 385 { 386 int nr_extra = !!head + !!tail; 387 388 BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra); 389 390 /* insert new subblocks */ 391 memmove(&chunk->map[i + nr_extra], &chunk->map[i], 392 sizeof(chunk->map[0]) * (chunk->map_used - i)); 393 chunk->map_used += nr_extra; 394 395 if (head) { 396 chunk->map[i + 1] = chunk->map[i] - head; 397 chunk->map[i++] = head; 398 } 399 if (tail) { 400 chunk->map[i++] -= tail; 401 chunk->map[i] = tail; 402 } 403 } 404 405 /** 406 * pcpu_alloc_area - allocate area from a pcpu_chunk 407 * @chunk: chunk of interest 408 * @size: wanted size in bytes 409 * @align: wanted align 410 * 411 * Try to allocate @size bytes area aligned at @align from @chunk. 412 * Note that this function only allocates the offset. It doesn't 413 * populate or map the area. 414 * 415 * @chunk->map must have at least two free slots. 416 * 417 * CONTEXT: 418 * pcpu_lock. 419 * 420 * RETURNS: 421 * Allocated offset in @chunk on success, -1 if no matching area is 422 * found. 423 */ 424 static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) 425 { 426 int oslot = pcpu_chunk_slot(chunk); 427 int max_contig = 0; 428 int i, off; 429 430 for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { 431 bool is_last = i + 1 == chunk->map_used; 432 int head, tail; 433 434 /* extra for alignment requirement */ 435 head = ALIGN(off, align) - off; 436 BUG_ON(i == 0 && head != 0); 437 438 if (chunk->map[i] < 0) 439 continue; 440 if (chunk->map[i] < head + size) { 441 max_contig = max(chunk->map[i], max_contig); 442 continue; 443 } 444 445 /* 446 * If head is small or the previous block is free, 447 * merge'em. Note that 'small' is defined as smaller 448 * than sizeof(int), which is very small but isn't too 449 * uncommon for percpu allocations. 
450 */ 451 if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) { 452 if (chunk->map[i - 1] > 0) 453 chunk->map[i - 1] += head; 454 else { 455 chunk->map[i - 1] -= head; 456 chunk->free_size -= head; 457 } 458 chunk->map[i] -= head; 459 off += head; 460 head = 0; 461 } 462 463 /* if tail is small, just keep it around */ 464 tail = chunk->map[i] - head - size; 465 if (tail < sizeof(int)) 466 tail = 0; 467 468 /* split if warranted */ 469 if (head || tail) { 470 pcpu_split_block(chunk, i, head, tail); 471 if (head) { 472 i++; 473 off += head; 474 max_contig = max(chunk->map[i - 1], max_contig); 475 } 476 if (tail) 477 max_contig = max(chunk->map[i + 1], max_contig); 478 } 479 480 /* update hint and mark allocated */ 481 if (is_last) 482 chunk->contig_hint = max_contig; /* fully scanned */ 483 else 484 chunk->contig_hint = max(chunk->contig_hint, 485 max_contig); 486 487 chunk->free_size -= chunk->map[i]; 488 chunk->map[i] = -chunk->map[i]; 489 490 pcpu_chunk_relocate(chunk, oslot); 491 return off; 492 } 493 494 chunk->contig_hint = max_contig; /* fully scanned */ 495 pcpu_chunk_relocate(chunk, oslot); 496 497 /* tell the upper layer that this chunk has no matching area */ 498 return -1; 499 } 500 501 /** 502 * pcpu_free_area - free area to a pcpu_chunk 503 * @chunk: chunk of interest 504 * @freeme: offset of area to free 505 * 506 * Free area starting from @freeme to @chunk. Note that this function 507 * only modifies the allocation map. It doesn't depopulate or unmap 508 * the area. 509 * 510 * CONTEXT: 511 * pcpu_lock. 512 */ 513 static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) 514 { 515 int oslot = pcpu_chunk_slot(chunk); 516 int i, off; 517 518 for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) 519 if (off == freeme) 520 break; 521 BUG_ON(off != freeme); 522 BUG_ON(chunk->map[i] > 0); 523 524 chunk->map[i] = -chunk->map[i]; 525 chunk->free_size += chunk->map[i]; 526 527 /* merge with previous? 
*/ 528 if (i > 0 && chunk->map[i - 1] >= 0) { 529 chunk->map[i - 1] += chunk->map[i]; 530 chunk->map_used--; 531 memmove(&chunk->map[i], &chunk->map[i + 1], 532 (chunk->map_used - i) * sizeof(chunk->map[0])); 533 i--; 534 } 535 /* merge with next? */ 536 if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) { 537 chunk->map[i] += chunk->map[i + 1]; 538 chunk->map_used--; 539 memmove(&chunk->map[i + 1], &chunk->map[i + 2], 540 (chunk->map_used - (i + 1)) * sizeof(chunk->map[0])); 541 } 542 543 chunk->contig_hint = max(chunk->map[i], chunk->contig_hint); 544 pcpu_chunk_relocate(chunk, oslot); 545 } 546 547 /** 548 * pcpu_unmap - unmap pages out of a pcpu_chunk 549 * @chunk: chunk of interest 550 * @page_start: page index of the first page to unmap 551 * @page_end: page index of the last page to unmap + 1 552 * @flush_tlb: whether to flush tlb or not 553 * 554 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. 555 * If @flush is true, vcache is flushed before unmapping and tlb 556 * after. 557 */ 558 static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, 559 bool flush_tlb) 560 { 561 unsigned int last = num_possible_cpus() - 1; 562 unsigned int cpu; 563 564 /* unmap must not be done on immutable chunk */ 565 WARN_ON(chunk->immutable); 566 567 /* 568 * Each flushing trial can be very expensive, issue flush on 569 * the whole region at once rather than doing it for each cpu. 570 * This could be an overkill but is more scalable. 
571 */ 572 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), 573 pcpu_chunk_addr(chunk, last, page_end)); 574 575 for_each_possible_cpu(cpu) 576 unmap_kernel_range_noflush( 577 pcpu_chunk_addr(chunk, cpu, page_start), 578 (page_end - page_start) << PAGE_SHIFT); 579 580 /* ditto as flush_cache_vunmap() */ 581 if (flush_tlb) 582 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), 583 pcpu_chunk_addr(chunk, last, page_end)); 584 } 585 586 /** 587 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk 588 * @chunk: chunk to depopulate 589 * @off: offset to the area to depopulate 590 * @size: size of the area to depopulate in bytes 591 * @flush: whether to flush cache and tlb or not 592 * 593 * For each cpu, depopulate and unmap pages [@page_start,@page_end) 594 * from @chunk. If @flush is true, vcache is flushed before unmapping 595 * and tlb after. 596 * 597 * CONTEXT: 598 * pcpu_alloc_mutex. 599 */ 600 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, 601 bool flush) 602 { 603 int page_start = PFN_DOWN(off); 604 int page_end = PFN_UP(off + size); 605 int unmap_start = -1; 606 int uninitialized_var(unmap_end); 607 unsigned int cpu; 608 int i; 609 610 for (i = page_start; i < page_end; i++) { 611 for_each_possible_cpu(cpu) { 612 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); 613 614 if (!*pagep) 615 continue; 616 617 __free_page(*pagep); 618 619 /* 620 * If it's partial depopulation, it might get 621 * populated or depopulated again. Mark the 622 * page gone. 623 */ 624 *pagep = NULL; 625 626 unmap_start = unmap_start < 0 ? 
i : unmap_start; 627 unmap_end = i + 1; 628 } 629 } 630 631 if (unmap_start >= 0) 632 pcpu_unmap(chunk, unmap_start, unmap_end, flush); 633 } 634 635 /** 636 * pcpu_map - map pages into a pcpu_chunk 637 * @chunk: chunk of interest 638 * @page_start: page index of the first page to map 639 * @page_end: page index of the last page to map + 1 640 * 641 * For each cpu, map pages [@page_start,@page_end) into @chunk. 642 * vcache is flushed afterwards. 643 */ 644 static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) 645 { 646 unsigned int last = num_possible_cpus() - 1; 647 unsigned int cpu; 648 int err; 649 650 /* map must not be done on immutable chunk */ 651 WARN_ON(chunk->immutable); 652 653 for_each_possible_cpu(cpu) { 654 err = map_kernel_range_noflush( 655 pcpu_chunk_addr(chunk, cpu, page_start), 656 (page_end - page_start) << PAGE_SHIFT, 657 PAGE_KERNEL, 658 pcpu_chunk_pagep(chunk, cpu, page_start)); 659 if (err < 0) 660 return err; 661 } 662 663 /* flush at once, please read comments in pcpu_unmap() */ 664 flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), 665 pcpu_chunk_addr(chunk, last, page_end)); 666 return 0; 667 } 668 669 /** 670 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk 671 * @chunk: chunk of interest 672 * @off: offset to the area to populate 673 * @size: size of the area to populate in bytes 674 * 675 * For each cpu, populate and map pages [@page_start,@page_end) into 676 * @chunk. The area is cleared on return. 677 * 678 * CONTEXT: 679 * pcpu_alloc_mutex, does GFP_KERNEL allocation. 
680 */ 681 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 682 { 683 const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; 684 int page_start = PFN_DOWN(off); 685 int page_end = PFN_UP(off + size); 686 int map_start = -1; 687 int uninitialized_var(map_end); 688 unsigned int cpu; 689 int i; 690 691 for (i = page_start; i < page_end; i++) { 692 if (pcpu_chunk_page_occupied(chunk, i)) { 693 if (map_start >= 0) { 694 if (pcpu_map(chunk, map_start, map_end)) 695 goto err; 696 map_start = -1; 697 } 698 continue; 699 } 700 701 map_start = map_start < 0 ? i : map_start; 702 map_end = i + 1; 703 704 for_each_possible_cpu(cpu) { 705 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); 706 707 *pagep = alloc_pages_node(cpu_to_node(cpu), 708 alloc_mask, 0); 709 if (!*pagep) 710 goto err; 711 pcpu_set_page_chunk(*pagep, chunk); 712 } 713 } 714 715 if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) 716 goto err; 717 718 for_each_possible_cpu(cpu) 719 memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, 720 size); 721 722 return 0; 723 err: 724 /* likely under heavy memory pressure, give memory back */ 725 pcpu_depopulate_chunk(chunk, off, size, true); 726 return -ENOMEM; 727 } 728 729 static void free_pcpu_chunk(struct pcpu_chunk *chunk) 730 { 731 if (!chunk) 732 return; 733 if (chunk->vm) 734 free_vm_area(chunk->vm); 735 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); 736 kfree(chunk); 737 } 738 739 static struct pcpu_chunk *alloc_pcpu_chunk(void) 740 { 741 struct pcpu_chunk *chunk; 742 743 chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL); 744 if (!chunk) 745 return NULL; 746 747 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); 748 chunk->map_alloc = PCPU_DFL_MAP_ALLOC; 749 chunk->map[chunk->map_used++] = pcpu_unit_size; 750 chunk->page = chunk->page_ar; 751 752 chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); 753 if (!chunk->vm) { 754 free_pcpu_chunk(chunk); 755 return 
NULL; 756 } 757 758 INIT_LIST_HEAD(&chunk->list); 759 chunk->free_size = pcpu_unit_size; 760 chunk->contig_hint = pcpu_unit_size; 761 762 return chunk; 763 } 764 765 /** 766 * pcpu_alloc - the percpu allocator 767 * @size: size of area to allocate in bytes 768 * @align: alignment of area (max PAGE_SIZE) 769 * @reserved: allocate from the reserved chunk if available 770 * 771 * Allocate percpu area of @size bytes aligned at @align. 772 * 773 * CONTEXT: 774 * Does GFP_KERNEL allocation. 775 * 776 * RETURNS: 777 * Percpu pointer to the allocated area on success, NULL on failure. 778 */ 779 static void *pcpu_alloc(size_t size, size_t align, bool reserved) 780 { 781 struct pcpu_chunk *chunk; 782 int slot, off; 783 784 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { 785 WARN(true, "illegal size (%zu) or align (%zu) for " 786 "percpu allocation\n", size, align); 787 return NULL; 788 } 789 790 mutex_lock(&pcpu_alloc_mutex); 791 spin_lock_irq(&pcpu_lock); 792 793 /* serve reserved allocations from the reserved chunk if available */ 794 if (reserved && pcpu_reserved_chunk) { 795 chunk = pcpu_reserved_chunk; 796 if (size > chunk->contig_hint || 797 pcpu_extend_area_map(chunk) < 0) 798 goto fail_unlock; 799 off = pcpu_alloc_area(chunk, size, align); 800 if (off >= 0) 801 goto area_found; 802 goto fail_unlock; 803 } 804 805 restart: 806 /* search through normal chunks */ 807 for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { 808 list_for_each_entry(chunk, &pcpu_slot[slot], list) { 809 if (size > chunk->contig_hint) 810 continue; 811 812 switch (pcpu_extend_area_map(chunk)) { 813 case 0: 814 break; 815 case 1: 816 goto restart; /* pcpu_lock dropped, restart */ 817 default: 818 goto fail_unlock; 819 } 820 821 off = pcpu_alloc_area(chunk, size, align); 822 if (off >= 0) 823 goto area_found; 824 } 825 } 826 827 /* hmmm... 
no space left, create a new chunk */ 828 spin_unlock_irq(&pcpu_lock); 829 830 chunk = alloc_pcpu_chunk(); 831 if (!chunk) 832 goto fail_unlock_mutex; 833 834 spin_lock_irq(&pcpu_lock); 835 pcpu_chunk_relocate(chunk, -1); 836 goto restart; 837 838 area_found: 839 spin_unlock_irq(&pcpu_lock); 840 841 /* populate, map and clear the area */ 842 if (pcpu_populate_chunk(chunk, off, size)) { 843 spin_lock_irq(&pcpu_lock); 844 pcpu_free_area(chunk, off); 845 goto fail_unlock; 846 } 847 848 mutex_unlock(&pcpu_alloc_mutex); 849 850 return __addr_to_pcpu_ptr(chunk->vm->addr + off); 851 852 fail_unlock: 853 spin_unlock_irq(&pcpu_lock); 854 fail_unlock_mutex: 855 mutex_unlock(&pcpu_alloc_mutex); 856 return NULL; 857 } 858 859 /** 860 * __alloc_percpu - allocate dynamic percpu area 861 * @size: size of area to allocate in bytes 862 * @align: alignment of area (max PAGE_SIZE) 863 * 864 * Allocate percpu area of @size bytes aligned at @align. Might 865 * sleep. Might trigger writeouts. 866 * 867 * CONTEXT: 868 * Does GFP_KERNEL allocation. 869 * 870 * RETURNS: 871 * Percpu pointer to the allocated area on success, NULL on failure. 872 */ 873 void *__alloc_percpu(size_t size, size_t align) 874 { 875 return pcpu_alloc(size, align, false); 876 } 877 EXPORT_SYMBOL_GPL(__alloc_percpu); 878 879 /** 880 * __alloc_reserved_percpu - allocate reserved percpu area 881 * @size: size of area to allocate in bytes 882 * @align: alignment of area (max PAGE_SIZE) 883 * 884 * Allocate percpu area of @size bytes aligned at @align from reserved 885 * percpu area if arch has set it up; otherwise, allocation is served 886 * from the same dynamic area. Might sleep. Might trigger writeouts. 887 * 888 * CONTEXT: 889 * Does GFP_KERNEL allocation. 890 * 891 * RETURNS: 892 * Percpu pointer to the allocated area on success, NULL on failure. 
893 */ 894 void *__alloc_reserved_percpu(size_t size, size_t align) 895 { 896 return pcpu_alloc(size, align, true); 897 } 898 899 /** 900 * pcpu_reclaim - reclaim fully free chunks, workqueue function 901 * @work: unused 902 * 903 * Reclaim all fully free chunks except for the first one. 904 * 905 * CONTEXT: 906 * workqueue context. 907 */ 908 static void pcpu_reclaim(struct work_struct *work) 909 { 910 LIST_HEAD(todo); 911 struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; 912 struct pcpu_chunk *chunk, *next; 913 914 mutex_lock(&pcpu_alloc_mutex); 915 spin_lock_irq(&pcpu_lock); 916 917 list_for_each_entry_safe(chunk, next, head, list) { 918 WARN_ON(chunk->immutable); 919 920 /* spare the first one */ 921 if (chunk == list_first_entry(head, struct pcpu_chunk, list)) 922 continue; 923 924 list_move(&chunk->list, &todo); 925 } 926 927 spin_unlock_irq(&pcpu_lock); 928 mutex_unlock(&pcpu_alloc_mutex); 929 930 list_for_each_entry_safe(chunk, next, &todo, list) { 931 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); 932 free_pcpu_chunk(chunk); 933 } 934 } 935 936 /** 937 * free_percpu - free percpu area 938 * @ptr: pointer to area to free 939 * 940 * Free percpu area @ptr. 941 * 942 * CONTEXT: 943 * Can be called from atomic context. 
944 */ 945 void free_percpu(void *ptr) 946 { 947 void *addr = __pcpu_ptr_to_addr(ptr); 948 struct pcpu_chunk *chunk; 949 unsigned long flags; 950 int off; 951 952 if (!ptr) 953 return; 954 955 spin_lock_irqsave(&pcpu_lock, flags); 956 957 chunk = pcpu_chunk_addr_search(addr); 958 off = addr - chunk->vm->addr; 959 960 pcpu_free_area(chunk, off); 961 962 /* if there are more than one fully free chunks, wake up grim reaper */ 963 if (chunk->free_size == pcpu_unit_size) { 964 struct pcpu_chunk *pos; 965 966 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) 967 if (pos != chunk) { 968 schedule_work(&pcpu_reclaim_work); 969 break; 970 } 971 } 972 973 spin_unlock_irqrestore(&pcpu_lock, flags); 974 } 975 EXPORT_SYMBOL_GPL(free_percpu); 976 977 /** 978 * pcpu_setup_first_chunk - initialize the first percpu chunk 979 * @get_page_fn: callback to fetch page pointer 980 * @static_size: the size of static percpu area in bytes 981 * @reserved_size: the size of reserved percpu area in bytes 982 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 983 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto 984 * @base_addr: mapped address, NULL for auto 985 * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary 986 * 987 * Initialize the first percpu chunk which contains the kernel static 988 * perpcu area. This function is to be called from arch percpu area 989 * setup path. The first two parameters are mandatory. The rest are 990 * optional. 991 * 992 * @get_page_fn() should return pointer to percpu page given cpu 993 * number and page number. It should at least return enough pages to 994 * cover the static area. The returned pages for static area should 995 * have been initialized with valid data. If @unit_size is specified, 996 * it can also return pages after the static area. NULL return 997 * indicates end of pages for the cpu. Note that @get_page_fn() must 998 * return the same number of pages for all cpus. 
999 * 1000 * @reserved_size, if non-zero, specifies the amount of bytes to 1001 * reserve after the static area in the first chunk. This reserves 1002 * the first chunk such that it's available only through reserved 1003 * percpu allocation. This is primarily used to serve module percpu 1004 * static areas on architectures where the addressing model has 1005 * limited offset range for symbol relocations to guarantee module 1006 * percpu symbols fall inside the relocatable range. 1007 * 1008 * @dyn_size, if non-negative, determines the number of bytes 1009 * available for dynamic allocation in the first chunk. Specifying 1010 * non-negative value makes percpu leave alone the area beyond 1011 * @static_size + @reserved_size + @dyn_size. 1012 * 1013 * @unit_size, if non-negative, specifies unit size and must be 1014 * aligned to PAGE_SIZE and equal to or larger than @static_size + 1015 * @reserved_size + if non-negative, @dyn_size. 1016 * 1017 * Non-null @base_addr means that the caller already allocated virtual 1018 * region for the first chunk and mapped it. percpu must not mess 1019 * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL 1020 * @populate_pte_fn doesn't make any sense. 1021 * 1022 * @populate_pte_fn is used to populate the pagetable. NULL means the 1023 * caller already populated the pagetable. 1024 * 1025 * If the first chunk ends up with both reserved and dynamic areas, it 1026 * is served by two chunks - one to serve the core static and reserved 1027 * areas and the other for the dynamic area. They share the same vm 1028 * and page map but uses different area allocation map to stay away 1029 * from each other. The latter chunk is circulated in the chunk slots 1030 * and available for dynamic allocation like any other chunks. 1031 * 1032 * RETURNS: 1033 * The determined pcpu_unit_size which can be used to initialize 1034 * percpu access. 
1035 */ 1036 size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, 1037 size_t static_size, size_t reserved_size, 1038 ssize_t dyn_size, ssize_t unit_size, 1039 void *base_addr, 1040 pcpu_populate_pte_fn_t populate_pte_fn) 1041 { 1042 static struct vm_struct first_vm; 1043 static int smap[2], dmap[2]; 1044 size_t size_sum = static_size + reserved_size + 1045 (dyn_size >= 0 ? dyn_size : 0); 1046 struct pcpu_chunk *schunk, *dchunk = NULL; 1047 unsigned int cpu; 1048 int nr_pages; 1049 int err, i; 1050 1051 /* santiy checks */ 1052 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || 1053 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); 1054 BUG_ON(!static_size); 1055 if (unit_size >= 0) { 1056 BUG_ON(unit_size < size_sum); 1057 BUG_ON(unit_size & ~PAGE_MASK); 1058 BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); 1059 } else 1060 BUG_ON(base_addr); 1061 BUG_ON(base_addr && populate_pte_fn); 1062 1063 if (unit_size >= 0) 1064 pcpu_unit_pages = unit_size >> PAGE_SHIFT; 1065 else 1066 pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, 1067 PFN_UP(size_sum)); 1068 1069 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; 1070 pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; 1071 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) 1072 + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); 1073 1074 if (dyn_size < 0) 1075 dyn_size = pcpu_unit_size - static_size - reserved_size; 1076 1077 /* 1078 * Allocate chunk slots. The additional last slot is for 1079 * empty chunks. 1080 */ 1081 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; 1082 pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); 1083 for (i = 0; i < pcpu_nr_slots; i++) 1084 INIT_LIST_HEAD(&pcpu_slot[i]); 1085 1086 /* 1087 * Initialize static chunk. If reserved_size is zero, the 1088 * static chunk covers static area + dynamic allocation area 1089 * in the first chunk. 
If reserved_size is not zero, it 1090 * covers static area + reserved area (mostly used for module 1091 * static percpu allocation). 1092 */ 1093 schunk = alloc_bootmem(pcpu_chunk_struct_size); 1094 INIT_LIST_HEAD(&schunk->list); 1095 schunk->vm = &first_vm; 1096 schunk->map = smap; 1097 schunk->map_alloc = ARRAY_SIZE(smap); 1098 schunk->page = schunk->page_ar; 1099 1100 if (reserved_size) { 1101 schunk->free_size = reserved_size; 1102 pcpu_reserved_chunk = schunk; 1103 pcpu_reserved_chunk_limit = static_size + reserved_size; 1104 } else { 1105 schunk->free_size = dyn_size; 1106 dyn_size = 0; /* dynamic area covered */ 1107 } 1108 schunk->contig_hint = schunk->free_size; 1109 1110 schunk->map[schunk->map_used++] = -static_size; 1111 if (schunk->free_size) 1112 schunk->map[schunk->map_used++] = schunk->free_size; 1113 1114 /* init dynamic chunk if necessary */ 1115 if (dyn_size) { 1116 dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); 1117 INIT_LIST_HEAD(&dchunk->list); 1118 dchunk->vm = &first_vm; 1119 dchunk->map = dmap; 1120 dchunk->map_alloc = ARRAY_SIZE(dmap); 1121 dchunk->page = schunk->page_ar; /* share page map with schunk */ 1122 1123 dchunk->contig_hint = dchunk->free_size = dyn_size; 1124 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; 1125 dchunk->map[dchunk->map_used++] = dchunk->free_size; 1126 } 1127 1128 /* allocate vm address */ 1129 first_vm.flags = VM_ALLOC; 1130 first_vm.size = pcpu_chunk_size; 1131 1132 if (!base_addr) 1133 vm_area_register_early(&first_vm, PAGE_SIZE); 1134 else { 1135 /* 1136 * Pages already mapped. No need to remap into 1137 * vmalloc area. In this case the first chunks can't 1138 * be mapped or unmapped by percpu and are marked 1139 * immutable. 
1140 */ 1141 first_vm.addr = base_addr; 1142 schunk->immutable = true; 1143 if (dchunk) 1144 dchunk->immutable = true; 1145 } 1146 1147 /* assign pages */ 1148 nr_pages = -1; 1149 for_each_possible_cpu(cpu) { 1150 for (i = 0; i < pcpu_unit_pages; i++) { 1151 struct page *page = get_page_fn(cpu, i); 1152 1153 if (!page) 1154 break; 1155 *pcpu_chunk_pagep(schunk, cpu, i) = page; 1156 } 1157 1158 BUG_ON(i < PFN_UP(static_size)); 1159 1160 if (nr_pages < 0) 1161 nr_pages = i; 1162 else 1163 BUG_ON(nr_pages != i); 1164 } 1165 1166 /* map them */ 1167 if (populate_pte_fn) { 1168 for_each_possible_cpu(cpu) 1169 for (i = 0; i < nr_pages; i++) 1170 populate_pte_fn(pcpu_chunk_addr(schunk, 1171 cpu, i)); 1172 1173 err = pcpu_map(schunk, 0, nr_pages); 1174 if (err) 1175 panic("failed to setup static percpu area, err=%d\n", 1176 err); 1177 } 1178 1179 /* link the first chunk in */ 1180 pcpu_first_chunk = dchunk ?: schunk; 1181 pcpu_chunk_relocate(pcpu_first_chunk, -1); 1182 1183 /* we're done */ 1184 pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); 1185 return pcpu_unit_size; 1186 } 1187 1188 /* 1189 * Embedding first chunk setup helper. 
1190 */ 1191 static void *pcpue_ptr __initdata; 1192 static size_t pcpue_size __initdata; 1193 static size_t pcpue_unit_size __initdata; 1194 1195 static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) 1196 { 1197 size_t off = (size_t)pageno << PAGE_SHIFT; 1198 1199 if (off >= pcpue_size) 1200 return NULL; 1201 1202 return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); 1203 } 1204 1205 /** 1206 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem 1207 * @static_size: the size of static percpu area in bytes 1208 * @reserved_size: the size of reserved percpu area in bytes 1209 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 1210 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto 1211 * 1212 * This is a helper to ease setting up embedded first percpu chunk and 1213 * can be called where pcpu_setup_first_chunk() is expected. 1214 * 1215 * If this function is used to setup the first chunk, it is allocated 1216 * as a contiguous area using bootmem allocator and used as-is without 1217 * being mapped into vmalloc area. This enables the first chunk to 1218 * piggy back on the linear physical mapping which often uses larger 1219 * page size. 1220 * 1221 * When @dyn_size is positive, dynamic area might be larger than 1222 * specified to fill page alignment. Also, when @dyn_size is auto, 1223 * @dyn_size does not fill the whole first chunk but only what's 1224 * necessary for page alignment after static and reserved areas. 1225 * 1226 * If the needed size is smaller than the minimum or specified unit 1227 * size, the leftover is returned to the bootmem allocator. 1228 * 1229 * RETURNS: 1230 * The determined pcpu_unit_size which can be used to initialize 1231 * percpu access on success, -errno on failure. 
1232 */ 1233 ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, 1234 ssize_t dyn_size, ssize_t unit_size) 1235 { 1236 size_t chunk_size; 1237 unsigned int cpu; 1238 1239 /* determine parameters and allocate */ 1240 pcpue_size = PFN_ALIGN(static_size + reserved_size + 1241 (dyn_size >= 0 ? dyn_size : 0)); 1242 if (dyn_size != 0) 1243 dyn_size = pcpue_size - static_size - reserved_size; 1244 1245 if (unit_size >= 0) { 1246 BUG_ON(unit_size < pcpue_size); 1247 pcpue_unit_size = unit_size; 1248 } else 1249 pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); 1250 1251 chunk_size = pcpue_unit_size * num_possible_cpus(); 1252 1253 pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, 1254 __pa(MAX_DMA_ADDRESS)); 1255 if (!pcpue_ptr) { 1256 pr_warning("PERCPU: failed to allocate %zu bytes for " 1257 "embedding\n", chunk_size); 1258 return -ENOMEM; 1259 } 1260 1261 /* return the leftover and copy */ 1262 for_each_possible_cpu(cpu) { 1263 void *ptr = pcpue_ptr + cpu * pcpue_unit_size; 1264 1265 free_bootmem(__pa(ptr + pcpue_size), 1266 pcpue_unit_size - pcpue_size); 1267 memcpy(ptr, __per_cpu_load, static_size); 1268 } 1269 1270 /* we're ready, commit */ 1271 pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", 1272 pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); 1273 1274 return pcpu_setup_first_chunk(pcpue_get_page, static_size, 1275 reserved_size, dyn_size, 1276 pcpue_unit_size, pcpue_ptr, NULL); 1277 } 1278