/*
 * SLUB: A slab allocator that limits cache line use instead of queuing
 * objects in per cpu and per node lists.
 *
 * The allocator synchronizes using per slab locks or atomic operations
 * and only uses a centralized lock to manage a pool of partial slabs.
 *
 * (C) 2007 SGI, Christoph Lameter
 * (C) 2011 Linux Foundation, Christoph Lameter
 */

#include <linux/mm.h>
#include <linux/swap.h> /* struct reclaim_state */
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmemcheck.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/memory.h>
#include <linux/math64.h>
#include <linux/fault-inject.h>
#include <linux/stacktrace.h>

#include <trace/events/kmem.h>

/*
 * Lock order:
 *   1. slub_lock (Global Semaphore)
 *   2. node->list_lock
 *   3. slab_lock(page) (Only on some arches and for debugging)
 *
 *   slub_lock
 *
 *   The role of the slub_lock is to protect the list of all the slabs
 *   and to synchronize major metadata changes to slab cache structures.
 *
 *   The slab_lock is only used for debugging and on arches that do not
 *   have the ability to do a cmpxchg_double. It only protects the second
 *   double word in the page struct. Meaning
 *	A. page->freelist	-> List of objects free in a page
 *	B. page->counters	-> Counters of objects
 *	C. page->frozen		-> frozen state
 *
 *   If a slab is frozen then it is exempt from list management. It is not
 *   on any list. The processor that froze the slab is the one who can
 *   perform list operations on the page. Other processors may put objects
 *   onto the freelist but the processor that froze the slab is the only
 *   one that can retrieve the objects from the page's freelist.
 *
 *   The list_lock protects the partial and full list on each node and
 *   the partial slab counter. If taken then no new slabs may be added to
 *   or removed from the lists, nor may the number of partial slabs be
 *   modified. (Note that the total number of slabs is an atomic value
 *   that may be modified without taking the list lock.)
 *
 *   The list_lock is a centralized lock and thus we avoid taking it as
 *   much as possible. As long as SLUB does not have to handle partial
 *   slabs, operations can continue without any centralized lock. E.g.
 *   allocating a long series of objects that fill up slabs does not require
 *   the list lock.
 *   Interrupts are disabled during allocation and deallocation in order to
 *   make the slab allocator safe to use in the context of an irq. In addition
 *   interrupts are disabled to ensure that the processor does not change
 *   while handling per_cpu slabs, due to kernel preemption.
 *
 * SLUB assigns one slab for allocation to each processor.
 * Allocations only occur from these slabs called cpu slabs.
 *
 * Slabs with free elements are kept on a partial list and during regular
 * operations no list for full slabs is used. If an object in a full slab is
 * freed then the slab will show up again on the partial lists.
 * We track full slabs for debugging purposes though because otherwise we
 * cannot scan all objects.
 *
 * Slabs are freed when they become empty. Teardown and setup is
 * minimal so we rely on the page allocators per cpu caches for
 * fast frees and allocs.
 *
 * Overloading of page flags that are otherwise used for LRU management.
 *
 * PageActive		The slab is frozen and exempt from list processing.
 *			This means that the slab is dedicated to a purpose
 *			such as satisfying allocations for a specific
 *			processor. Objects may be freed in the slab while
 *			it is frozen but slab_free will then skip the usual
 *			list operations. It is up to the processor holding
 *			the slab to integrate the slab into the slab lists
 *			when the slab is no longer needed.
 *
 *			One use of this flag is to mark slabs that are
 *			used for allocations. Then such a slab becomes a cpu
 *			slab. The cpu slab may be equipped with an additional
 *			freelist that allows lockless access to
 *			free objects in addition to the regular freelist
 *			that requires the slab lock.
 *
 * PageError		Slab requires special handling due to debug
 *			options set. This moves slab handling out of
 *			the fast path and disables lockless freelists.
 */

#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
		SLAB_TRACE | SLAB_DEBUG_FREE)

static inline int kmem_cache_debug(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_DEBUG
	return unlikely(s->flags & SLAB_DEBUG_FLAGS);
#else
	return 0;
#endif
}

/*
 * Issues still to be resolved:
 *
 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 *
 * - Variable sizing of the per node arrays
 */

/* Enable to test recovery from slab corruption on boot */
#undef SLUB_RESILIENCY_TEST

/* Enable to log cmpxchg failures */
#undef SLUB_DEBUG_CMPXCHG

/*
 * Minimum number of partial slabs. These will be left on the partial
 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 */
#define MIN_PARTIAL 5

/*
 * Maximum number of desirable partial slabs.
 * The existence of more partial slabs makes kmem_cache_shrink
 * sort the partial list by the number of objects in them.
 */
#define MAX_PARTIAL 10

#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
				SLAB_POISON | SLAB_STORE_USER)

/*
 * Debugging flags that require metadata to be stored in the slab. These get
 * disabled when slub_debug=O is used and a cache's min order increases with
 * metadata.
 */
#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)

/*
 * Set of flags that will prevent slab merging
 */
#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
		SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
		SLAB_FAILSLAB)

#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
		SLAB_CACHE_DMA | SLAB_NOTRACK)

#define OO_SHIFT	16
#define OO_MASK		((1 << OO_SHIFT) - 1)
#define MAX_OBJS_PER_PAGE	32767 /* since page.objects is u15 */

/* Internal SLUB flags */
#define __OBJECT_POISON		0x80000000UL /* Poison object */
#define __CMPXCHG_DOUBLE	0x40000000UL /* Use cmpxchg_double */

static int kmem_size = sizeof(struct kmem_cache);

#ifdef CONFIG_SMP
static struct notifier_block slab_notifier;
#endif

static enum {
	DOWN,		/* No slab functionality available */
	PARTIAL,	/* Kmem_cache_node works */
	UP,		/* Everything works but does not show up in sysfs */
	SYSFS		/* Sysfs up */
} slab_state = DOWN;

/* A list of all slab caches on the system */
static DECLARE_RWSEM(slub_lock);
static LIST_HEAD(slab_caches);

/*
 * Tracking user of a slab.
 */
#define TRACK_ADDRS_COUNT 16
struct track {
	unsigned long addr;	/* Called from address */
#ifdef CONFIG_STACKTRACE
	unsigned long addrs[TRACK_ADDRS_COUNT];	/* Called from address */
#endif
	int cpu;		/* Was running on cpu */
	int pid;		/* Pid context */
	unsigned long when;	/* When did the operation occur */
};

enum track_item { TRACK_ALLOC, TRACK_FREE };

#ifdef CONFIG_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void sysfs_slab_remove(struct kmem_cache *);

#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
							{ return 0; }
static inline void sysfs_slab_remove(struct kmem_cache *s)
{
	kfree(s->name);
	kfree(s);
}

#endif

static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
	__this_cpu_inc(s->cpu_slab->stat[si]);
#endif
}

/********************************************************************
 *			Core slab cache functions
 *******************************************************************/

int slab_is_available(void)
{
	return slab_state >= UP;
}

static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
{
	return s->node[node];
}

/* Verify that a pointer has an address that is valid within a slab page */
static inline int check_valid_pointer(struct kmem_cache *s,
				struct page *page, const void *object)
{
	void *base;

	if (!object)
		return 1;

	base = page_address(page);
	if (object < base || object >= base + page->objects * s->size ||
		(object - base) % s->size) {
		return 0;
	}

	return 1;
}
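
/*
 * Illustrative example (values chosen for illustration only): for a cache
 * with s->size == 64 and a page with page->objects == 64, a pointer is
 * considered valid only if it lies in [base, base + 64 * 64) and
 * (object - base) is a multiple of 64.
 */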
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
	return *(void **)(object + s->offset);
}

static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
	void *p;

#ifdef CONFIG_DEBUG_PAGEALLOC
	probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
#else
	p = get_freepointer(s, object);
#endif
	return p;
}

static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
	*(void **)(object + s->offset) = fp;
}

/* Loop over all objects in a slab */
#define for_each_object(__p, __s, __addr, __objects) \
	for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
			__p += (__s)->size)

/* Determine object index from a given position */
static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
{
	return (p - addr) / s->size;
}

static inline size_t slab_ksize(const struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_DEBUG
	/*
	 * Debugging requires use of the padding between object
	 * and whatever may come after it.
	 */
	if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
		return s->objsize;

#endif
	/*
	 * If we have the need to store the freelist pointer
	 * back there or track user information then we can
	 * only use the space before that information.
	 */
	if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
		return s->inuse;
	/*
	 * Else we can use all the padding etc for the allocation
	 */
	return s->size;
}

static inline int order_objects(int order, unsigned long size, int reserved)
{
	return ((PAGE_SIZE << order) - reserved) / size;
}

static inline struct kmem_cache_order_objects oo_make(int order,
		unsigned long size, int reserved)
{
	struct kmem_cache_order_objects x = {
		(order << OO_SHIFT) + order_objects(order, size, reserved)
	};

	return x;
}

static inline int oo_order(struct kmem_cache_order_objects x)
{
	return x.x >> OO_SHIFT;
}

static inline int oo_objects(struct kmem_cache_order_objects x)
{
	return x.x & OO_MASK;
}
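
/*
 * Illustrative example (assumes 4KiB pages): order_objects(1, 256, 0) is
 * (8192 - 0) / 256 == 32, so oo_make(1, 256, 0) packs the pair as
 * (1 << OO_SHIFT) + 32; oo_order() then recovers 1 and oo_objects()
 * recovers 32.
 */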
/*
 * Per slab locking using the pagelock
 */
static __always_inline void slab_lock(struct page *page)
{
	bit_spin_lock(PG_locked, &page->flags);
}

static __always_inline void slab_unlock(struct page *page)
{
	__bit_spin_unlock(PG_locked, &page->flags);
}

/* Interrupts must be disabled (for the fallback code to work right) */
static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
		void *freelist_old, unsigned long counters_old,
		void *freelist_new, unsigned long counters_new,
		const char *n)
{
	VM_BUG_ON(!irqs_disabled());
#ifdef CONFIG_CMPXCHG_DOUBLE
	if (s->flags & __CMPXCHG_DOUBLE) {
		if (cmpxchg_double(&page->freelist,
			freelist_old, counters_old,
			freelist_new, counters_new))
			return 1;
	} else
#endif
	{
		slab_lock(page);
		if (page->freelist == freelist_old && page->counters == counters_old) {
			page->freelist = freelist_new;
			page->counters = counters_new;
			slab_unlock(page);
			return 1;
		}
		slab_unlock(page);
	}

	cpu_relax();
	stat(s, CMPXCHG_DOUBLE_FAIL);

#ifdef SLUB_DEBUG_CMPXCHG
	printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
#endif

	return 0;
}

static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
		void *freelist_old, unsigned long counters_old,
		void *freelist_new, unsigned long counters_new,
		const char *n)
{
#ifdef CONFIG_CMPXCHG_DOUBLE
	if (s->flags & __CMPXCHG_DOUBLE) {
		if (cmpxchg_double(&page->freelist,
			freelist_old, counters_old,
			freelist_new, counters_new))
			return 1;
	} else
#endif
	{
		unsigned long flags;

		local_irq_save(flags);
		slab_lock(page);
		if (page->freelist == freelist_old && page->counters == counters_old) {
			page->freelist = freelist_new;
			page->counters = counters_new;
			slab_unlock(page);
			local_irq_restore(flags);
			return 1;
		}
		slab_unlock(page);
		local_irq_restore(flags);
	}

	cpu_relax();
	stat(s, CMPXCHG_DOUBLE_FAIL);

#ifdef SLUB_DEBUG_CMPXCHG
	printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
#endif

	return 0;
}

#ifdef CONFIG_SLUB_DEBUG
/*
 * Determine a map of objects in use on a page.
 *
 * Node list_lock must be held to guarantee that the page does
 * not vanish from under us.
 */
static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
{
	void *p;
	void *addr = page_address(page);

	for (p = page->freelist; p; p = get_freepointer(s, p))
		set_bit(slab_index(p, s, addr), map);
}

/*
 * Debug settings:
 */
#ifdef CONFIG_SLUB_DEBUG_ON
static int slub_debug = DEBUG_DEFAULT_FLAGS;
#else
static int slub_debug;
#endif

static char *slub_debug_slabs;
static int disable_higher_order_debug;

/*
 * Object debugging
 */
static void print_section(char *text, u8 *addr, unsigned int length)
{
	int i, offset;
	int newline = 1;
	char ascii[17];

	ascii[16] = 0;

	for (i = 0; i < length; i++) {
		if (newline) {
			printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
			newline = 0;
		}
		printk(KERN_CONT " %02x", addr[i]);
		offset = i % 16;
		ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
		if (offset == 15) {
			printk(KERN_CONT " %s\n", ascii);
			newline = 1;
		}
	}
	if (!newline) {
		i %= 16;
		while (i < 16) {
			printk(KERN_CONT "   ");
			ascii[i] = ' ';
			i++;
		}
		printk(KERN_CONT " %s\n", ascii);
	}
}

static struct track *get_track(struct kmem_cache *s, void *object,
	enum track_item alloc)
{
	struct track *p;

	if (s->offset)
		p = object + s->offset + sizeof(void *);
	else
		p = object + s->inuse;

	return p + alloc;
}

static void set_track(struct kmem_cache *s, void *object,
			enum track_item alloc, unsigned long addr)
{
	struct track *p = get_track(s, object, alloc);

	if (addr) {
#ifdef CONFIG_STACKTRACE
		struct stack_trace trace;
		int i;

		trace.nr_entries = 0;
		trace.max_entries = TRACK_ADDRS_COUNT;
		trace.entries = p->addrs;
		trace.skip = 3;
		save_stack_trace(&trace);

		/* See rant in lockdep.c */
		if (trace.nr_entries != 0 &&
		    trace.entries[trace.nr_entries - 1] == ULONG_MAX)
			trace.nr_entries--;

		for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
			p->addrs[i] = 0;
#endif
		p->addr = addr;
		p->cpu = smp_processor_id();
		p->pid = current->pid;
		p->when = jiffies;
	} else
		memset(p, 0, sizeof(struct track));
}

static void init_tracking(struct kmem_cache *s, void *object)
{
	if (!(s->flags & SLAB_STORE_USER))
		return;

	set_track(s, object, TRACK_FREE, 0UL);
	set_track(s, object, TRACK_ALLOC, 0UL);
}

static void print_track(const char *s, struct track *t)
{
	if (!t->addr)
		return;

	printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
		s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
#ifdef CONFIG_STACKTRACE
	{
		int i;
		for (i = 0; i < TRACK_ADDRS_COUNT; i++)
			if (t->addrs[i])
				printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]);
			else
				break;
	}
#endif
}

static void print_tracking(struct kmem_cache *s, void *object)
{
	if (!(s->flags & SLAB_STORE_USER))
		return;

	print_track("Allocated", get_track(s, object, TRACK_ALLOC));
	print_track("Freed", get_track(s, object, TRACK_FREE));
}

static void print_page_info(struct page *page)
{
	printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
		page, page->objects, page->inuse, page->freelist, page->flags);

}

static void slab_bug(struct kmem_cache *s, char *fmt, ...)
{
	va_list args;
	char buf[100];

	va_start(args, fmt);
	vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);
	printk(KERN_ERR "========================================"
			"=====================================\n");
	printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
	printk(KERN_ERR "----------------------------------------"
			"-------------------------------------\n\n");
}

static void slab_fix(struct kmem_cache *s, char *fmt, ...)
{
	va_list args;
	char buf[100];

	va_start(args, fmt);
	vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);
	printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
}

static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
{
	unsigned int off;	/* Offset of last byte */
	u8 *addr = page_address(page);

	print_tracking(s, p);

	print_page_info(page);

	printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
			p, p - addr, get_freepointer(s, p));

	if (p > addr + 16)
		print_section("Bytes b4", p - 16, 16);

	print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE));

	if (s->flags & SLAB_RED_ZONE)
		print_section("Redzone", p + s->objsize,
			s->inuse - s->objsize);

	if (s->offset)
		off = s->offset + sizeof(void *);
	else
		off = s->inuse;

	if (s->flags & SLAB_STORE_USER)
		off += 2 * sizeof(struct track);

	if (off != s->size)
		/* Beginning of the filler is the free pointer */
		print_section("Padding", p + off, s->size - off);

	dump_stack();
}

static void object_err(struct kmem_cache *s, struct page *page,
			u8 *object, char *reason)
{
	slab_bug(s, "%s", reason);
	print_trailer(s, page, object);
}

static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
{
	va_list args;
	char buf[100];

	va_start(args, fmt);
	vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);
	slab_bug(s, "%s", buf);
	print_page_info(page);
	dump_stack();
}

static void init_object(struct kmem_cache *s, void *object, u8 val)
{
	u8 *p = object;

	if (s->flags & __OBJECT_POISON) {
		memset(p, POISON_FREE, s->objsize - 1);
		p[s->objsize - 1] = POISON_END;
	}

	if (s->flags & SLAB_RED_ZONE)
		memset(p + s->objsize, val, s->inuse - s->objsize);
}

static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes)
{
	while (bytes) {
		if (*start != value)
			return start;
		start++;
		bytes--;
	}
	return NULL;
}

static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes)
{
	u64 value64;
	unsigned int words, prefix;

	if (bytes <= 16)
		return check_bytes8(start, value, bytes);

	value64 = value | value << 8 | value << 16 | value << 24;
	value64 = (value64 & 0xffffffff) | value64 << 32;
	prefix = 8 - ((unsigned long)start) % 8;

	if (prefix) {
		u8 *r = check_bytes8(start, value, prefix);
		if (r)
			return r;
		start += prefix;
		bytes -= prefix;
	}

	words = bytes / 8;

	while (words) {
		if (*(u64 *)start != value64)
			return check_bytes8(start, value, 8);
		start += 8;
		words--;
	}

	return check_bytes8(start, value, bytes % 8);
}

static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
						void *from, void *to)
{
	slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
	memset(from, data, to - from);
}

static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
			u8 *object, char *what,
			u8 *start, unsigned int value, unsigned int bytes)
{
	u8 *fault;
	u8 *end;

	fault = check_bytes(start, value, bytes);
	if (!fault)
		return 1;

	end = start + bytes;
	while (end > fault && end[-1] == value)
		end--;

	slab_bug(s, "%s overwritten", what);
	printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
					fault, end - 1, fault[0], value);
	print_trailer(s, page, object);

	restore_bytes(s, what, value, fault, end);
	return 0;
}

/*
 * Object layout:
 *
 * object address
 *	Bytes of the object to be managed.
 *	If the freepointer may overlay the object then the free
 *	pointer is the first word of the object.
 *
 *	Poisoning uses 0x6b (POISON_FREE) and the last byte is
 *	0xa5 (POISON_END)
 *
 * object + s->objsize
 *	Padding to reach word boundary. This is also used for Redzoning.
 *	Padding is extended by another word if Redzoning is enabled and
 *	objsize == inuse.
 *
 *	We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 *	0xcc (RED_ACTIVE) for objects in use.
 *
 * object + s->inuse
 *	Meta data starts here.
 *
 *	A. Free pointer (if we cannot overwrite object on free)
 *	B. Tracking data for SLAB_STORE_USER
 *	C. Padding to reach required alignment boundary or at minimum
 *		one word if debugging is on to be able to detect writes
 *		before the word boundary.
 *
 *	Padding is done using 0x5a (POISON_INUSE)
 *
 * object + s->size
 *	Nothing is used beyond s->size.
 *
 * If slabcaches are merged then the objsize and inuse boundaries are mostly
 * ignored. And therefore no slab options that rely on these boundaries
 * may be used with merged slabcaches.
 */
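
/*
 * Rough sketch of the layout described above (illustrative only):
 *
 *   object           object + s->objsize     object + s->inuse        object + s->size
 *   |<-- payload -->|<-- redzone / pad   -->|<-- freeptr/track/pad -->|
 *
 * With SLAB_POISON the payload is filled with POISON_FREE (0x6b) and ends in
 * POISON_END (0xa5); red zones use RED_INACTIVE (0xbb) while the object is
 * free and RED_ACTIVE (0xcc) while it is allocated.
 */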
static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
{
	unsigned long off = s->inuse;	/* The end of info */

	if (s->offset)
		/* Freepointer is placed after the object. */
		off += sizeof(void *);

	if (s->flags & SLAB_STORE_USER)
		/* We also have user information there */
		off += 2 * sizeof(struct track);

	if (s->size == off)
		return 1;

	return check_bytes_and_report(s, page, p, "Object padding",
				p + off, POISON_INUSE, s->size - off);
}

/* Check the pad bytes at the end of a slab page */
static int slab_pad_check(struct kmem_cache *s, struct page *page)
{
	u8 *start;
	u8 *fault;
	u8 *end;
	int length;
	int remainder;

	if (!(s->flags & SLAB_POISON))
		return 1;

	start = page_address(page);
	length = (PAGE_SIZE << compound_order(page)) - s->reserved;
	end = start + length;
	remainder = length % s->size;
	if (!remainder)
		return 1;

	fault = check_bytes(end - remainder, POISON_INUSE, remainder);
	if (!fault)
		return 1;
	while (end > fault && end[-1] == POISON_INUSE)
		end--;

	slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
	print_section("Padding", end - remainder, remainder);

	restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
	return 0;
}

static int check_object(struct kmem_cache *s, struct page *page,
					void *object, u8 val)
{
	u8 *p = object;
	u8 *endobject = object + s->objsize;

	if (s->flags & SLAB_RED_ZONE) {
		if (!check_bytes_and_report(s, page, object, "Redzone",
			endobject, val, s->inuse - s->objsize))
			return 0;
	} else {
		if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
			check_bytes_and_report(s, page, p, "Alignment padding",
				endobject, POISON_INUSE, s->inuse - s->objsize);
		}
	}

	if (s->flags & SLAB_POISON) {
		if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
			(!check_bytes_and_report(s, page, p, "Poison", p,
					POISON_FREE, s->objsize - 1) ||
			 !check_bytes_and_report(s, page, p, "Poison",
				p + s->objsize - 1, POISON_END, 1)))
			return 0;
		/*
		 * check_pad_bytes cleans up on its own.
		 */
		check_pad_bytes(s, page, p);
	}

	if (!s->offset && val == SLUB_RED_ACTIVE)
		/*
		 * Object and freepointer overlap. Cannot check
		 * freepointer while object is allocated.
		 */
		return 1;

	/* Check free pointer validity */
	if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
		object_err(s, page, p, "Freepointer corrupt");
		/*
		 * No choice but to zap it and thus lose the remainder
		 * of the free objects in this slab. May cause
		 * another error because the object count is now wrong.
		 */
		set_freepointer(s, p, NULL);
		return 0;
	}
	return 1;
}

static int check_slab(struct kmem_cache *s, struct page *page)
{
	int maxobj;

	VM_BUG_ON(!irqs_disabled());

	if (!PageSlab(page)) {
		slab_err(s, page, "Not a valid slab page");
		return 0;
	}

	maxobj = order_objects(compound_order(page), s->size, s->reserved);
	if (page->objects > maxobj) {
		slab_err(s, page, "objects %u > max %u",
			page->objects, maxobj);
		return 0;
	}
	if (page->inuse > page->objects) {
		slab_err(s, page, "inuse %u > max %u",
			page->inuse, page->objects);
		return 0;
	}
	/* Slab_pad_check fixes things up after itself */
	slab_pad_check(s, page);
	return 1;
}

/*
 * Determine if a certain object on a page is on the freelist. Must hold the
 * slab lock to guarantee that the chains are in a consistent state.
 */
static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
{
	int nr = 0;
	void *fp;
	void *object = NULL;
	unsigned long max_objects;

	fp = page->freelist;
	while (fp && nr <= page->objects) {
		if (fp == search)
			return 1;
		if (!check_valid_pointer(s, page, fp)) {
			if (object) {
				object_err(s, page, object,
					"Freechain corrupt");
				set_freepointer(s, object, NULL);
				break;
			} else {
				slab_err(s, page, "Freepointer corrupt");
				page->freelist = NULL;
				page->inuse = page->objects;
				slab_fix(s, "Freelist cleared");
				return 0;
			}
			break;
		}
		object = fp;
		fp = get_freepointer(s, object);
		nr++;
	}

	max_objects = order_objects(compound_order(page), s->size, s->reserved);
	if (max_objects > MAX_OBJS_PER_PAGE)
		max_objects = MAX_OBJS_PER_PAGE;

	if (page->objects != max_objects) {
		slab_err(s, page, "Wrong number of objects. Found %d but "
			"should be %d", page->objects, max_objects);
		page->objects = max_objects;
		slab_fix(s, "Number of objects adjusted.");
	}
	if (page->inuse != page->objects - nr) {
		slab_err(s, page, "Wrong object count. Counter is %d but "
			"counted were %d", page->inuse, page->objects - nr);
		page->inuse = page->objects - nr;
		slab_fix(s, "Object count adjusted.");
	}
	return search == NULL;
}

static void trace(struct kmem_cache *s, struct page *page, void *object,
								int alloc)
{
	if (s->flags & SLAB_TRACE) {
		printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
			s->name,
			alloc ? "alloc" : "free",
			object, page->inuse,
			page->freelist);

		if (!alloc)
			print_section("Object", (void *)object, s->objsize);

		dump_stack();
	}
}

/*
 * Hooks for other subsystems that check memory allocations. In a typical
 * production configuration these hooks all should produce no code at all.
 */
static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
{
	flags &= gfp_allowed_mask;
	lockdep_trace_alloc(flags);
	might_sleep_if(flags & __GFP_WAIT);

	return should_failslab(s->objsize, flags, s->flags);
}

static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
{
	flags &= gfp_allowed_mask;
	kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
	kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
}

static inline void slab_free_hook(struct kmem_cache *s, void *x)
{
	kmemleak_free_recursive(x, s->flags);

	/*
	 * Trouble is that we may no longer disable interrupts in the fast
	 * path. So in order to make the debug calls that expect irqs to be
	 * disabled we need to disable interrupts temporarily.
	 */
#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
	{
		unsigned long flags;

		local_irq_save(flags);
		kmemcheck_slab_free(s, x, s->objsize);
		debug_check_no_locks_freed(x, s->objsize);
		local_irq_restore(flags);
	}
#endif
	if (!(s->flags & SLAB_DEBUG_OBJECTS))
		debug_check_no_obj_freed(x, s->objsize);
}

/*
 * Tracking of fully allocated slabs for debugging purposes.
 *
 * list_lock must be held.
 */
static void add_full(struct kmem_cache *s,
	struct kmem_cache_node *n, struct page *page)
{
	if (!(s->flags & SLAB_STORE_USER))
		return;

	list_add(&page->lru, &n->full);
}

/*
 * list_lock must be held.
 */
static void remove_full(struct kmem_cache *s, struct page *page)
{
	if (!(s->flags & SLAB_STORE_USER))
		return;

	list_del(&page->lru);
}

/* Tracking of the number of slabs for debugging purposes */
static inline unsigned long slabs_node(struct kmem_cache *s, int node)
{
	struct kmem_cache_node *n = get_node(s, node);

	return atomic_long_read(&n->nr_slabs);
}

static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
{
	return atomic_long_read(&n->nr_slabs);
}

static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
{
	struct kmem_cache_node *n = get_node(s, node);

	/*
	 * May be called early in order to allocate a slab for the
	 * kmem_cache_node structure. Solve the chicken-egg
	 * dilemma by deferring the increment of the count during
	 * bootstrap (see early_kmem_cache_node_alloc).
	 */
	if (n) {
		atomic_long_inc(&n->nr_slabs);
		atomic_long_add(objects, &n->total_objects);
	}
}
static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
{
	struct kmem_cache_node *n = get_node(s, node);

	atomic_long_dec(&n->nr_slabs);
	atomic_long_sub(objects, &n->total_objects);
}

/* Object debug checks for alloc/free paths */
static void setup_object_debug(struct kmem_cache *s, struct page *page,
								void *object)
{
	if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
		return;

	init_object(s, object, SLUB_RED_INACTIVE);
	init_tracking(s, object);
}

static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page,
					void *object, unsigned long addr)
{
	if (!check_slab(s, page))
		goto bad;

	if (!check_valid_pointer(s, page, object)) {
		object_err(s, page, object, "Freelist Pointer check fails");
		goto bad;
	}

	if (!check_object(s, page, object, SLUB_RED_INACTIVE))
		goto bad;

	/* Success. Perform special debug activities for allocs */
	if (s->flags & SLAB_STORE_USER)
		set_track(s, object, TRACK_ALLOC, addr);
	trace(s, page, object, 1);
	init_object(s, object, SLUB_RED_ACTIVE);
	return 1;

bad:
	if (PageSlab(page)) {
		/*
		 * If this is a slab page then lets do the best we can
		 * to avoid issues in the future. Marking all objects
		 * as used avoids touching the remaining objects.
		 */
		slab_fix(s, "Marking all objects used");
		page->inuse = page->objects;
		page->freelist = NULL;
	}
	return 0;
}

static noinline int free_debug_processing(struct kmem_cache *s,
		 struct page *page, void *object, unsigned long addr)
{
	unsigned long flags;
	int rc = 0;

	local_irq_save(flags);
	slab_lock(page);

	if (!check_slab(s, page))
		goto fail;

	if (!check_valid_pointer(s, page, object)) {
		slab_err(s, page, "Invalid object pointer 0x%p", object);
		goto fail;
	}

	if (on_freelist(s, page, object)) {
		object_err(s, page, object, "Object already free");
		goto fail;
	}

	if (!check_object(s, page, object, SLUB_RED_ACTIVE))
		goto out;

	if (unlikely(s != page->slab)) {
		if (!PageSlab(page)) {
			slab_err(s, page, "Attempt to free object(0x%p) "
				"outside of slab", object);
		} else if (!page->slab) {
			printk(KERN_ERR
				"SLUB <none>: no slab for object 0x%p.\n",
						object);
			dump_stack();
		} else
			object_err(s, page, object,
					"page slab pointer corrupt.");
		goto fail;
	}

	if (s->flags & SLAB_STORE_USER)
		set_track(s, object, TRACK_FREE, addr);
	trace(s, page, object, 0);
	init_object(s, object, SLUB_RED_INACTIVE);
	rc = 1;
out:
	slab_unlock(page);
	local_irq_restore(flags);
	return rc;

fail:
	slab_fix(s, "Object at 0x%p not freed", object);
	goto out;
}

static int __init setup_slub_debug(char *str)
{
	slub_debug = DEBUG_DEFAULT_FLAGS;
	if (*str++ != '=' || !*str)
		/*
		 * No options specified. Switch on full debugging.
		 */
		goto out;

	if (*str == ',')
		/*
		 * No options but restriction on slabs. This means full
		 * debugging for slabs matching a pattern.
		 */
		goto check_slabs;

	if (tolower(*str) == 'o') {
		/*
		 * Avoid enabling debugging on caches if its minimum order
		 * would increase as a result.
		 */
		disable_higher_order_debug = 1;
		goto out;
	}

	slub_debug = 0;
	if (*str == '-')
		/*
		 * Switch off all debugging measures.
		 */
		goto out;

	/*
	 * Determine which debug features should be switched on
	 */
	for (; *str && *str != ','; str++) {
		switch (tolower(*str)) {
		case 'f':
			slub_debug |= SLAB_DEBUG_FREE;
			break;
		case 'z':
			slub_debug |= SLAB_RED_ZONE;
			break;
		case 'p':
			slub_debug |= SLAB_POISON;
			break;
		case 'u':
			slub_debug |= SLAB_STORE_USER;
			break;
		case 't':
			slub_debug |= SLAB_TRACE;
			break;
		case 'a':
			slub_debug |= SLAB_FAILSLAB;
			break;
		default:
			printk(KERN_ERR "slub_debug option '%c' "
				"unknown. skipped\n", *str);
		}
	}

check_slabs:
	if (*str == ',')
		slub_debug_slabs = str + 1;
out:
	return 1;
}

__setup("slub_debug", setup_slub_debug);
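
/*
 * Example boot parameter usage (illustrative; see Documentation/vm/slub.txt):
 *
 *	slub_debug		enable all debug options for all caches
 *	slub_debug=FZ		sanity checks and red zoning for all caches
 *	slub_debug=,dentry	enable all debug options for the dentry cache only
 *	slub_debug=O		full debugging, except where it would raise a
 *				cache's minimum order
 */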
static unsigned long kmem_cache_flags(unsigned long objsize,
	unsigned long flags, const char *name,
	void (*ctor)(void *))
{
	/*
	 * Enable debugging if selected on the kernel commandline.
	 */
	if (slub_debug && (!slub_debug_slabs ||
		!strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))
		flags |= slub_debug;

	return flags;
}
#else
static inline void setup_object_debug(struct kmem_cache *s,
			struct page *page, void *object) {}

static inline int alloc_debug_processing(struct kmem_cache *s,
	struct page *page, void *object, unsigned long addr) { return 0; }

static inline int free_debug_processing(struct kmem_cache *s,
	struct page *page, void *object, unsigned long addr) { return 0; }

static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
			{ return 1; }
static inline int check_object(struct kmem_cache *s, struct page *page,
			void *object, u8 val) { return 1; }
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
					struct page *page) {}
static inline void remove_full(struct kmem_cache *s, struct page *page) {}
static inline unsigned long kmem_cache_flags(unsigned long objsize,
	unsigned long flags, const char *name,
	void (*ctor)(void *))
{
	return flags;
}
#define slub_debug 0

#define disable_higher_order_debug 0

static inline unsigned long slabs_node(struct kmem_cache *s, int node)
							{ return 0; }
static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
							{ return 0; }
static inline void inc_slabs_node(struct kmem_cache *s, int node,
							int objects) {}
static inline void dec_slabs_node(struct kmem_cache *s, int node,
							int objects) {}

static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
							{ return 0; }

static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
		void *object) {}

static inline void slab_free_hook(struct kmem_cache *s, void *x) {}

#endif /* CONFIG_SLUB_DEBUG */

/*
 * Slab allocation and freeing
 */
static inline struct page *alloc_slab_page(gfp_t flags, int node,
					struct kmem_cache_order_objects oo)
{
	int order = oo_order(oo);

	flags |= __GFP_NOTRACK;

	if (node == NUMA_NO_NODE)
		return alloc_pages(flags, order);
	else
		return alloc_pages_exact_node(node, flags, order);
}

static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
	struct page *page;
	struct kmem_cache_order_objects oo = s->oo;
	gfp_t alloc_gfp;

	flags &= gfp_allowed_mask;

	if (flags & __GFP_WAIT)
		local_irq_enable();

	flags |= s->allocflags;

	/*
	 * Let the initial higher-order allocation fail under memory pressure
	 * so we fall back to the minimum order allocation.
	 */
	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;

	page = alloc_slab_page(alloc_gfp, node, oo);
	if (unlikely(!page)) {
		oo = s->min;
		/*
		 * Allocation may have failed due to fragmentation.
		 * Try a lower order alloc if possible
		 */
		page = alloc_slab_page(flags, node, oo);

		if (page)
			stat(s, ORDER_FALLBACK);
	}

	if (flags & __GFP_WAIT)
		local_irq_disable();

	if (!page)
		return NULL;

	if (kmemcheck_enabled
		&& !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
		int pages = 1 << oo_order(oo);

		kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);

		/*
		 * Objects from caches that have a constructor don't get
		 * cleared when they're allocated, so we need to do it here.
		 */
		if (s->ctor)
			kmemcheck_mark_uninitialized_pages(page, pages);
		else
			kmemcheck_mark_unallocated_pages(page, pages);
	}

	page->objects = oo_objects(oo);
	mod_zone_page_state(page_zone(page),
		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
		1 << oo_order(oo));

	return page;
}

static void setup_object(struct kmem_cache *s, struct page *page,
				void *object)
{
	setup_object_debug(s, page, object);
	if (unlikely(s->ctor))
		s->ctor(object);
}

static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
	struct page *page;
	void *start;
	void *last;
	void *p;

	BUG_ON(flags & GFP_SLAB_BUG_MASK);

	page = allocate_slab(s,
		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
	if (!page)
		goto out;

	inc_slabs_node(s, page_to_nid(page), page->objects);
	page->slab = s;
	page->flags |= 1 << PG_slab;

	start = page_address(page);

	if (unlikely(s->flags & SLAB_POISON))
		memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page));

	last = start;
	for_each_object(p, s, start, page->objects) {
		setup_object(s, page, last);
		set_freepointer(s, last, p);
		last = p;
	}
	setup_object(s, page, last);
	set_freepointer(s, last, NULL);

	page->freelist = start;
	page->inuse = 0;
	page->frozen = 1;
out:
	return page;
}
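
/*
 * Note on new_slab() above (illustrative): the objects of a freshly
 * allocated slab are chained into a freelist in address order,
 * start -> start + s->size -> ... -> NULL, with page->freelist pointing at
 * the first object and page->frozen set so that the page is not yet subject
 * to list management.
 */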
static void __free_slab(struct kmem_cache *s, struct page *page)
{
	int order = compound_order(page);
	int pages = 1 << order;

	if (kmem_cache_debug(s)) {
		void *p;

		slab_pad_check(s, page);
		for_each_object(p, s, page_address(page),
						page->objects)
			check_object(s, page, p, SLUB_RED_INACTIVE);
	}

	kmemcheck_free_shadow(page, compound_order(page));

	mod_zone_page_state(page_zone(page),
		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
		-pages);

	__ClearPageSlab(page);
	reset_page_mapcount(page);
	if (current->reclaim_state)
		current->reclaim_state->reclaimed_slab += pages;
	__free_pages(page, order);
}

#define need_reserve_slab_rcu						\
	(sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))

static void rcu_free_slab(struct rcu_head *h)
{
	struct page *page;

	if (need_reserve_slab_rcu)
		page = virt_to_head_page(h);
	else
		page = container_of((struct list_head *)h, struct page, lru);

	__free_slab(page->slab, page);
}

static void free_slab(struct kmem_cache *s, struct page *page)
{
	if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
		struct rcu_head *head;

		if (need_reserve_slab_rcu) {
			int order = compound_order(page);
			int offset = (PAGE_SIZE << order) - s->reserved;

			VM_BUG_ON(s->reserved != sizeof(*head));
			head = page_address(page) + offset;
		} else {
			/*
			 * RCU free overloads the RCU head over the LRU
			 */
			head = (void *)&page->lru;
		}

		call_rcu(head, rcu_free_slab);
	} else
		__free_slab(s, page);
}

static void discard_slab(struct kmem_cache *s, struct page *page)
{
	dec_slabs_node(s, page_to_nid(page), page->objects);
	free_slab(s, page);
}

/*
 * Management of partially allocated slabs.
 *
 * list_lock must be held.
 */
static inline void add_partial(struct kmem_cache_node *n,
				struct page *page, int tail)
{
	n->nr_partial++;
	if (tail)
		list_add_tail(&page->lru, &n->partial);
	else
		list_add(&page->lru, &n->partial);
}

/*
 * list_lock must be held.
 */
static inline void remove_partial(struct kmem_cache_node *n,
					struct page *page)
{
	list_del(&page->lru);
	n->nr_partial--;
}

/*
 * Lock slab, remove from the partial list and put the object into the
 * per cpu freelist.
 *
 * Must hold list_lock.
 */
static inline int acquire_slab(struct kmem_cache *s,
		struct kmem_cache_node *n, struct page *page)
{
	void *freelist;
	unsigned long counters;
	struct page new;

	/*
	 * Zap the freelist and set the frozen bit.
	 * The old freelist is the list of objects for the
	 * per cpu allocation list.
	 */
	do {
		freelist = page->freelist;
		counters = page->counters;
		new.counters = counters;
		new.inuse = page->objects;

		VM_BUG_ON(new.frozen);
		new.frozen = 1;

	} while (!__cmpxchg_double_slab(s, page,
			freelist, counters,
			NULL, new.counters,
			"lock and freeze"));

	remove_partial(n, page);

	if (freelist) {
		/* Populate the per cpu freelist */
		this_cpu_write(s->cpu_slab->freelist, freelist);
		this_cpu_write(s->cpu_slab->page, page);
		this_cpu_write(s->cpu_slab->node, page_to_nid(page));
		return 1;
	} else {
		/*
		 * Slab page came from the wrong list. No object to allocate
		 * from. Put it onto the correct list and continue partial
		 * scan.
		 */
		printk(KERN_ERR "SLUB: %s : Page without available objects on"
			" partial list\n", s->name);
		return 0;
	}
}

/*
 * Try to allocate a partial slab from a specific node.
 */
static struct page *get_partial_node(struct kmem_cache *s,
					struct kmem_cache_node *n)
{
	struct page *page;

	/*
	 * Racy check. If we mistakenly see no partial slabs then we
	 * just allocate an empty slab. If we mistakenly try to get a
	 * partial slab and there is none available then get_partials()
	 * will return NULL.
	 */
	if (!n || !n->nr_partial)
		return NULL;

	spin_lock(&n->list_lock);
	list_for_each_entry(page, &n->partial, lru)
		if (acquire_slab(s, n, page))
			goto out;
	page = NULL;
out:
	spin_unlock(&n->list_lock);
	return page;
}

/*
 * Get a page from somewhere. Search in increasing NUMA distances.
 */
static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
{
#ifdef CONFIG_NUMA
	struct zonelist *zonelist;
	struct zoneref *z;
	struct zone *zone;
	enum zone_type high_zoneidx = gfp_zone(flags);
	struct page *page;

	/*
	 * The defrag ratio allows a configuration of the tradeoffs between
	 * inter node defragmentation and node local allocations. A lower
	 * defrag_ratio increases the tendency to do local allocations
	 * instead of attempting to obtain partial slabs from other nodes.
	 *
	 * If the defrag_ratio is set to 0 then kmalloc() always
	 * returns node local objects. If the ratio is higher then kmalloc()
	 * may return off node objects because partial slabs are obtained
	 * from other nodes and filled up.
	 *
	 * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
	 * defrag_ratio = 1000) then every (well almost) allocation will
	 * first attempt to defrag slab caches on other nodes. This means
	 * scanning over all nodes to look for partial slabs which may be
	 * expensive if we do it every time we are trying to find a slab
	 * with available objects.
	 */
	if (!s->remote_node_defrag_ratio ||
			get_cycles() % 1024 > s->remote_node_defrag_ratio)
		return NULL;

	get_mems_allowed();
	zonelist = node_zonelist(slab_node(current->mempolicy), flags);
	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
		struct kmem_cache_node *n;

		n = get_node(s, zone_to_nid(zone));

		if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
				n->nr_partial > s->min_partial) {
			page = get_partial_node(s, n);
			if (page) {
				put_mems_allowed();
				return page;
			}
		}
	}
	put_mems_allowed();
#endif
	return NULL;
}

/*
 * Get a partial page, lock it and return it.
 */
static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
{
	struct page *page;
	int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;

	page = get_partial_node(s, get_node(s, searchnode));
	if (page || node != NUMA_NO_NODE)
		return page;

	return get_any_partial(s, flags);
}

#ifdef CONFIG_PREEMPT
/*
 * Calculate the next globally unique transaction for disambiguation
 * during cmpxchg. The transactions start with the cpu number and are then
 * incremented by CONFIG_NR_CPUS.
 */
#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
#else
/*
 * No preemption supported therefore also no need to check for
 * different cpus.
 */
#define TID_STEP 1
#endif

static inline unsigned long next_tid(unsigned long tid)
{
	return tid + TID_STEP;
}

static inline unsigned int tid_to_cpu(unsigned long tid)
{
	return tid % TID_STEP;
}

static inline unsigned long tid_to_event(unsigned long tid)
{
	return tid / TID_STEP;
}

static inline unsigned int init_tid(int cpu)
{
	return cpu;
}
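
/*
 * Illustrative example (assumes CONFIG_NR_CPUS == 4 and CONFIG_PREEMPT):
 * TID_STEP is 4, so cpu 2 uses the tid sequence 2, 6, 10, ...; for
 * tid == 10, tid_to_cpu() yields 2 and tid_to_event() yields 2.
 */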
static inline void note_cmpxchg_failure(const char *n,
		const struct kmem_cache *s, unsigned long tid)
{
#ifdef SLUB_DEBUG_CMPXCHG
	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);

	printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);

#ifdef CONFIG_PREEMPT
	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
		printk("due to cpu change %d -> %d\n",
			tid_to_cpu(tid), tid_to_cpu(actual_tid));
	else
#endif
	if (tid_to_event(tid) != tid_to_event(actual_tid))
		printk("due to cpu running other code. Event %ld->%ld\n",
			tid_to_event(tid), tid_to_event(actual_tid));
	else
		printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
			actual_tid, tid, next_tid(tid));
#endif
	stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
}

void init_kmem_cache_cpus(struct kmem_cache *s)
{
	int cpu;

	for_each_possible_cpu(cpu)
		per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
}

/*
 * Remove the cpu slab
 */
static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
	enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
	struct page *page = c->page;
	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
	int lock = 0;
	enum slab_modes l = M_NONE, m = M_NONE;
	void *freelist;
	void *nextfree;
	int tail = 0;
	struct page new;
	struct page old;

	if (page->freelist) {
		stat(s, DEACTIVATE_REMOTE_FREES);
		tail = 1;
	}

	c->tid = next_tid(c->tid);
	c->page = NULL;
	freelist = c->freelist;
	c->freelist = NULL;

	/*
	 * Stage one: Free all available per cpu objects back
	 * to the page freelist while it is still frozen. Leave the
	 * last one.
	 *
	 * There is no need to take the list_lock because the page
	 * is still frozen.
	 */
	while (freelist && (nextfree = get_freepointer(s, freelist))) {
		void *prior;
		unsigned long counters;

		do {
			prior = page->freelist;
			counters = page->counters;
			set_freepointer(s, freelist, prior);
			new.counters = counters;
			new.inuse--;
			VM_BUG_ON(!new.frozen);

		} while (!__cmpxchg_double_slab(s, page,
			prior, counters,
			freelist, new.counters,
			"drain percpu freelist"));

		freelist = nextfree;
	}

	/*
	 * Stage two: Ensure that the page is unfrozen while the
	 * list presence reflects the actual number of objects
	 * during unfreeze.
	 *
	 * We setup the list membership and then perform a cmpxchg
	 * with the count. If there is a mismatch then the page
	 * is not unfrozen but the page is on the wrong list.
	 *
	 * Then we restart the process which may have to remove
	 * the page from the list that we just put it on again
	 * because the number of objects in the slab may have
	 * changed.
	 */
redo:

	old.freelist = page->freelist;
	old.counters = page->counters;
	VM_BUG_ON(!old.frozen);

	/* Determine target state of the slab */
	new.counters = old.counters;
	if (freelist) {
		new.inuse--;
		set_freepointer(s, freelist, old.freelist);
		new.freelist = freelist;
	} else
		new.freelist = old.freelist;

	new.frozen = 0;

	if (!new.inuse && n->nr_partial > s->min_partial)
		m = M_FREE;
	else if (new.freelist) {
		m = M_PARTIAL;
		if (!lock) {
			lock = 1;
			/*
			 * Taking the spinlock removes the possibility
			 * that acquire_slab() will see a slab page that
			 * is frozen
			 */
			spin_lock(&n->list_lock);
		}
	} else {
		m = M_FULL;
		if (kmem_cache_debug(s) && !lock) {
			lock = 1;
			/*
			 * This also ensures that the scanning of full
			 * slabs from diagnostic functions will not see
			 * any frozen slabs.
			 */
			spin_lock(&n->list_lock);
		}
	}

	if (l != m) {

		if (l == M_PARTIAL)

			remove_partial(n, page);

		else if (l == M_FULL)

			remove_full(s, page);

		if (m == M_PARTIAL) {

			add_partial(n, page, tail);
			stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);

		} else if (m == M_FULL) {

			stat(s, DEACTIVATE_FULL);
			add_full(s, n, page);

		}
	}

	l = m;
	if (!__cmpxchg_double_slab(s, page,
				old.freelist, old.counters,
				new.freelist, new.counters,
				"unfreezing slab"))
		goto redo;

	if (lock)
		spin_unlock(&n->list_lock);

	if (m == M_FREE) {
		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, page);
		stat(s, FREE_SLAB);
	}
}

static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
	stat(s, CPUSLAB_FLUSH);
	deactivate_slab(s, c);
}

/*
 * Flush cpu slab.
 *
 * Called from IPI handler with interrupts disabled.
 */
static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
{
	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);

	if (likely(c && c->page))
		flush_slab(s, c);
}

static void flush_cpu_slab(void *d)
{
	struct kmem_cache *s = d;

	__flush_cpu_slab(s, smp_processor_id());
}

static void flush_all(struct kmem_cache *s)
{
	on_each_cpu(flush_cpu_slab, s, 1);
}

/*
 * Check if the objects in a per cpu structure fit numa
 * locality expectations.
 */
static inline int node_match(struct kmem_cache_cpu *c, int node)
{
#ifdef CONFIG_NUMA
	if (node != NUMA_NO_NODE && c->node != node)
		return 0;
#endif
	return 1;
}

static int count_free(struct page *page)
{
	return page->objects - page->inuse;
}

static unsigned long count_partial(struct kmem_cache_node *n,
					int (*get_count)(struct page *))
{
	unsigned long flags;
	unsigned long x = 0;
	struct page *page;

	spin_lock_irqsave(&n->list_lock, flags);
	list_for_each_entry(page, &n->partial, lru)
		x += get_count(page);
	spin_unlock_irqrestore(&n->list_lock, flags);
	return x;
}

static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
{
#ifdef CONFIG_SLUB_DEBUG
	return atomic_long_read(&n->total_objects);
#else
	return 0;
#endif
}

static noinline void
slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
{
	int node;

	printk(KERN_WARNING
		"SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
		nid, gfpflags);
	printk(KERN_WARNING "  cache: %s, object size: %d, buffer size: %d, "
		"default order: %d, min order: %d\n", s->name, s->objsize,
		s->size, oo_order(s->oo), oo_order(s->min));

	if (oo_order(s->min) > get_order(s->objsize))
		printk(KERN_WARNING "  %s debugging increased min order, use "
			"slub_debug=O to disable.\n", s->name);

	for_each_online_node(node) {
		struct kmem_cache_node *n = get_node(s, node);
		unsigned long nr_slabs;
		unsigned long nr_objs;
		unsigned long nr_free;

		if (!n)
			continue;

		nr_free = count_partial(n, count_free);
		nr_slabs = node_nr_slabs(n);
		nr_objs = node_nr_objs(n);

		printk(KERN_WARNING
			"  node %d: slabs: %ld, objs: %ld, free: %ld\n",
			node, nr_slabs, nr_objs, nr_free);
	}
}

/*
 * Slow path. The lockless freelist is empty or we need to perform
 * debugging duties.
 *
 * Interrupts are disabled.
 *
 * Processing is still very fast if new objects have been freed to the
 * regular freelist. In that case we simply take over the regular freelist
 * as the lockless freelist and zap the regular freelist.
 *
 * If that is not working then we fall back to the partial lists. We take the
 * first element of the freelist as the object to allocate now and move the
 * rest of the freelist to the lockless freelist.
 *
 * And if we were unable to get a new slab from the partial slab lists then
 * we need to allocate a new slab. This is the slowest path since it involves
 * a call to the page allocator and the setup of a new slab.
 */
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c)
{
	void **object;
	struct page *page;
	unsigned long flags;
	struct page new;
	unsigned long counters;

	local_irq_save(flags);
#ifdef CONFIG_PREEMPT
	/*
	 * We may have been preempted and rescheduled on a different
	 * cpu before disabling interrupts. Need to reload cpu area
	 * pointer.
2063 */ 2064 c = this_cpu_ptr(s->cpu_slab); 2065 #endif 2066 2067 /* We handle __GFP_ZERO in the caller */ 2068 gfpflags &= ~__GFP_ZERO; 2069 2070 page = c->page; 2071 if (!page) 2072 goto new_slab; 2073 2074 if (unlikely(!node_match(c, node))) { 2075 stat(s, ALLOC_NODE_MISMATCH); 2076 deactivate_slab(s, c); 2077 goto new_slab; 2078 } 2079 2080 stat(s, ALLOC_SLOWPATH); 2081 2082 do { 2083 object = page->freelist; 2084 counters = page->counters; 2085 new.counters = counters; 2086 VM_BUG_ON(!new.frozen); 2087 2088 /* 2089 * If there is no object left then we use this loop to 2090 * deactivate the slab which is simple since no objects 2091 * are left in the slab and therefore we do not need to 2092 * put the page back onto the partial list. 2093 * 2094 * If there are objects left then we retrieve them 2095 * and use them to refill the per cpu queue. 2096 */ 2097 2098 new.inuse = page->objects; 2099 new.frozen = object != NULL; 2100 2101 } while (!__cmpxchg_double_slab(s, page, 2102 object, counters, 2103 NULL, new.counters, 2104 "__slab_alloc")); 2105 2106 if (unlikely(!object)) { 2107 c->page = NULL; 2108 stat(s, DEACTIVATE_BYPASS); 2109 goto new_slab; 2110 } 2111 2112 stat(s, ALLOC_REFILL); 2113 2114 load_freelist: 2115 VM_BUG_ON(!page->frozen); 2116 c->freelist = get_freepointer(s, object); 2117 c->tid = next_tid(c->tid); 2118 local_irq_restore(flags); 2119 return object; 2120 2121 new_slab: 2122 page = get_partial(s, gfpflags, node); 2123 if (page) { 2124 stat(s, ALLOC_FROM_PARTIAL); 2125 object = c->freelist; 2126 2127 if (kmem_cache_debug(s)) 2128 goto debug; 2129 goto load_freelist; 2130 } 2131 2132 page = new_slab(s, gfpflags, node); 2133 2134 if (page) { 2135 c = __this_cpu_ptr(s->cpu_slab); 2136 if (c->page) 2137 flush_slab(s, c); 2138 2139 /* 2140 * No other reference to the page yet so we can 2141 * muck around with it freely without cmpxchg 2142 */ 2143 object = page->freelist; 2144 page->freelist = NULL; 2145 page->inuse = page->objects; 2146 2147 stat(s, ALLOC_SLAB); 2148 c->node = page_to_nid(page); 2149 c->page = page; 2150 2151 if (kmem_cache_debug(s)) 2152 goto debug; 2153 goto load_freelist; 2154 } 2155 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 2156 slab_out_of_memory(s, gfpflags, node); 2157 local_irq_restore(flags); 2158 return NULL; 2159 2160 debug: 2161 if (!object || !alloc_debug_processing(s, page, object, addr)) 2162 goto new_slab; 2163 2164 c->freelist = get_freepointer(s, object); 2165 deactivate_slab(s, c); 2166 c->page = NULL; 2167 c->node = NUMA_NO_NODE; 2168 local_irq_restore(flags); 2169 return object; 2170 } 2171 2172 /* 2173 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) 2174 * have the fastpath folded into their functions. So no function call 2175 * overhead for requests that can be satisfied on the fastpath. 2176 * 2177 * The fastpath works by first checking if the lockless freelist can be used. 2178 * If not then __slab_alloc is called for slow processing. 2179 * 2180 * Otherwise we can simply pick the next object from the lockless free list. 2181 */ 2182 static __always_inline void *slab_alloc(struct kmem_cache *s, 2183 gfp_t gfpflags, int node, unsigned long addr) 2184 { 2185 void **object; 2186 struct kmem_cache_cpu *c; 2187 unsigned long tid; 2188 2189 if (slab_pre_alloc_hook(s, gfpflags)) 2190 return NULL; 2191 2192 redo: 2193 2194 /* 2195 * Must read kmem_cache cpu data via this cpu ptr. Preemption is 2196 * enabled. We may switch back and forth between cpus while 2197 * reading from one cpu area. 
That does not matter as long 2198 * as we end up on the original cpu again when doing the cmpxchg. 2199 */ 2200 c = __this_cpu_ptr(s->cpu_slab); 2201 2202 /* 2203 * The transaction ids are globally unique per cpu and per operation on 2204 * a per cpu queue. Thus they can guarantee that the cmpxchg_double 2205 * occurs on the right processor and that there was no operation on the 2206 * linked list in between. 2207 */ 2208 tid = c->tid; 2209 barrier(); 2210 2211 object = c->freelist; 2212 if (unlikely(!object || !node_match(c, node))) 2213 2214 object = __slab_alloc(s, gfpflags, node, addr, c); 2215 2216 else { 2217 /* 2218 * The cmpxchg will only match if there was no additional 2219 * operation and if we are on the right processor. 2220 * 2221 * The cmpxchg does the following atomically (without lock semantics!) 2222 * 1. Relocate first pointer to the current per cpu area. 2223 * 2. Verify that tid and freelist have not been changed 2224 * 3. If they were not changed replace tid and freelist 2225 * 2226 * Since this is without lock semantics the protection is only against 2227 * code executing on this cpu *not* from access by other cpus. 2228 */ 2229 if (unlikely(!irqsafe_cpu_cmpxchg_double( 2230 s->cpu_slab->freelist, s->cpu_slab->tid, 2231 object, tid, 2232 get_freepointer_safe(s, object), next_tid(tid)))) { 2233 2234 note_cmpxchg_failure("slab_alloc", s, tid); 2235 goto redo; 2236 } 2237 stat(s, ALLOC_FASTPATH); 2238 } 2239 2240 if (unlikely(gfpflags & __GFP_ZERO) && object) 2241 memset(object, 0, s->objsize); 2242 2243 slab_post_alloc_hook(s, gfpflags, object); 2244 2245 return object; 2246 } 2247 2248 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 2249 { 2250 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); 2251 2252 trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); 2253 2254 return ret; 2255 } 2256 EXPORT_SYMBOL(kmem_cache_alloc); 2257 2258 #ifdef CONFIG_TRACING 2259 void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) 2260 { 2261 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); 2262 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); 2263 return ret; 2264 } 2265 EXPORT_SYMBOL(kmem_cache_alloc_trace); 2266 2267 void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) 2268 { 2269 void *ret = kmalloc_order(size, flags, order); 2270 trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags); 2271 return ret; 2272 } 2273 EXPORT_SYMBOL(kmalloc_order_trace); 2274 #endif 2275 2276 #ifdef CONFIG_NUMA 2277 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) 2278 { 2279 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); 2280 2281 trace_kmem_cache_alloc_node(_RET_IP_, ret, 2282 s->objsize, s->size, gfpflags, node); 2283 2284 return ret; 2285 } 2286 EXPORT_SYMBOL(kmem_cache_alloc_node); 2287 2288 #ifdef CONFIG_TRACING 2289 void *kmem_cache_alloc_node_trace(struct kmem_cache *s, 2290 gfp_t gfpflags, 2291 int node, size_t size) 2292 { 2293 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); 2294 2295 trace_kmalloc_node(_RET_IP_, ret, 2296 size, s->size, gfpflags, node); 2297 return ret; 2298 } 2299 EXPORT_SYMBOL(kmem_cache_alloc_node_trace); 2300 #endif 2301 #endif 2302 2303 /* 2304 * Slow path handling. This may still be called frequently since objects 2305 * have a longer lifetime than the cpu slabs in most processing loads. 2306 * 2307 * So we still attempt to reduce cache line usage. Just take the slab 2308 * lock and free the item.
If there is no additional partial page 2309 * handling required then we can return immediately. 2310 */ 2311 static void __slab_free(struct kmem_cache *s, struct page *page, 2312 void *x, unsigned long addr) 2313 { 2314 void *prior; 2315 void **object = (void *)x; 2316 int was_frozen; 2317 int inuse; 2318 struct page new; 2319 unsigned long counters; 2320 struct kmem_cache_node *n = NULL; 2321 unsigned long uninitialized_var(flags); 2322 2323 stat(s, FREE_SLOWPATH); 2324 2325 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) 2326 return; 2327 2328 do { 2329 prior = page->freelist; 2330 counters = page->counters; 2331 set_freepointer(s, object, prior); 2332 new.counters = counters; 2333 was_frozen = new.frozen; 2334 new.inuse--; 2335 if ((!new.inuse || !prior) && !was_frozen && !n) { 2336 n = get_node(s, page_to_nid(page)); 2337 /* 2338 * Speculatively acquire the list_lock. 2339 * If the cmpxchg does not succeed then we may 2340 * drop the list_lock without any processing. 2341 * 2342 * Otherwise the list_lock will synchronize with 2343 * other processors updating the list of slabs. 2344 */ 2345 spin_lock_irqsave(&n->list_lock, flags); 2346 } 2347 inuse = new.inuse; 2348 2349 } while (!cmpxchg_double_slab(s, page, 2350 prior, counters, 2351 object, new.counters, 2352 "__slab_free")); 2353 2354 if (likely(!n)) { 2355 /* 2356 * The list lock was not taken therefore no list 2357 * activity can be necessary. 2358 */ 2359 if (was_frozen) 2360 stat(s, FREE_FROZEN); 2361 return; 2362 } 2363 2364 /* 2365 * was_frozen may have been set after we acquired the list_lock in 2366 * an earlier loop. So we need to check it here again. 2367 */ 2368 if (was_frozen) 2369 stat(s, FREE_FROZEN); 2370 else { 2371 if (unlikely(!inuse && n->nr_partial > s->min_partial)) 2372 goto slab_empty; 2373 2374 /* 2375 * Objects left in the slab. If it was not on the partial list before 2376 * then add it. 2377 */ 2378 if (unlikely(!prior)) { 2379 remove_full(s, page); 2380 add_partial(n, page, 1); 2381 stat(s, FREE_ADD_PARTIAL); 2382 } 2383 } 2384 spin_unlock_irqrestore(&n->list_lock, flags); 2385 return; 2386 2387 slab_empty: 2388 if (prior) { 2389 /* 2390 * Slab on the partial list. 2391 */ 2392 remove_partial(n, page); 2393 stat(s, FREE_REMOVE_PARTIAL); 2394 } else 2395 /* Slab must be on the full list */ 2396 remove_full(s, page); 2397 2398 spin_unlock_irqrestore(&n->list_lock, flags); 2399 stat(s, FREE_SLAB); 2400 discard_slab(s, page); 2401 } 2402 2403 /* 2404 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that 2405 * can perform fastpath freeing without additional function calls. 2406 * 2407 * The fastpath is only possible if we are freeing to the current cpu slab 2408 * of this processor. This is typically the case if we have just allocated 2409 * the item before. 2410 * 2411 * If fastpath is not possible then fall back to __slab_free where we deal 2412 * with all sorts of special processing. 2413 */ 2414 static __always_inline void slab_free(struct kmem_cache *s, 2415 struct page *page, void *x, unsigned long addr) 2416 { 2417 void **object = (void *)x; 2418 struct kmem_cache_cpu *c; 2419 unsigned long tid; 2420 2421 slab_free_hook(s, x); 2422 2423 redo: 2424 2425 /* 2426 * Determine the per cpu slab of the current cpu. 2427 * The cpu may change afterward. However that does not matter since 2428 * data is retrieved via this pointer. If we are on the same cpu 2429 * during the cmpxchg then the free will succeed.
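 *
 * Condensed sketch of the lockless fastpath below (added for clarity;
 * it restates the code that follows, minus error handling):
 *
 *	tid = c->tid;
 *	barrier();
 *	if (page == c->page) {
 *		set_freepointer(s, object, c->freelist);
 *		irqsafe_cpu_cmpxchg_double(s->cpu_slab->freelist,
 *			s->cpu_slab->tid,
 *			c->freelist, tid,	   <- expected old values
 *			object, next_tid(tid));	   <- new head and new tid
 *	}
 *
 * If we were migrated to another cpu, or another alloc/free completed
 * on this cpu in between, the tid no longer matches, the cmpxchg fails
 * and we retry from the redo label.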
*/ 2431 c = __this_cpu_ptr(s->cpu_slab); 2432 2433 tid = c->tid; 2434 barrier(); 2435 2436 if (likely(page == c->page)) { 2437 set_freepointer(s, object, c->freelist); 2438 2439 if (unlikely(!irqsafe_cpu_cmpxchg_double( 2440 s->cpu_slab->freelist, s->cpu_slab->tid, 2441 c->freelist, tid, 2442 object, next_tid(tid)))) { 2443 2444 note_cmpxchg_failure("slab_free", s, tid); 2445 goto redo; 2446 } 2447 stat(s, FREE_FASTPATH); 2448 } else 2449 __slab_free(s, page, x, addr); 2450 2451 } 2452 2453 void kmem_cache_free(struct kmem_cache *s, void *x) 2454 { 2455 struct page *page; 2456 2457 page = virt_to_head_page(x); 2458 2459 slab_free(s, page, x, _RET_IP_); 2460 2461 trace_kmem_cache_free(_RET_IP_, x); 2462 } 2463 EXPORT_SYMBOL(kmem_cache_free); 2464 2465 /* 2466 * Object placement in a slab is made very easy because we always start at 2467 * offset 0. If we tune the size of the object to the alignment then we can 2468 * get the required alignment by putting one properly sized object after 2469 * another. 2470 * 2471 * Notice that the allocation order determines the sizes of the per cpu 2472 * caches. Each processor always has one slab available for allocations. 2473 * Increasing the allocation order reduces the number of times that slabs 2474 * must be moved on and off the partial lists and is therefore a factor in 2475 * locking overhead. 2476 */ 2477 2478 /* 2479 * Minimum / Maximum order of slab pages. This influences locking overhead 2480 * and slab fragmentation. A higher order reduces the number of partial slabs 2481 * and increases the number of allocations possible without having to 2482 * take the list_lock. 2483 */ 2484 static int slub_min_order; 2485 static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; 2486 static int slub_min_objects; 2487 2488 /* 2489 * Merge control. If this is set then no merging of slab caches will occur. 2490 * (Could be removed. This was introduced to pacify the merge skeptics.) 2491 */ 2492 static int slub_nomerge; 2493 2494 /* 2495 * Calculate the order of allocation given a slab object size. 2496 * 2497 * The order of allocation has significant impact on performance and other 2498 * system components. Generally order 0 allocations should be preferred since 2499 * order 0 does not cause fragmentation in the page allocator. Larger objects 2500 * can be problematic to put into order 0 slabs because there may be too much 2501 * unused space left. We go to a higher order if more than 1/16th of the slab 2502 * would be wasted. 2503 * 2504 * In order to reach satisfactory performance we must ensure that a minimum 2505 * number of objects is in one slab. Otherwise we may generate too much 2506 * activity on the partial lists which requires taking the list_lock. This is 2507 * less a concern for large slabs though which are rarely used. 2508 * 2509 * slub_max_order specifies the order where we begin to stop considering the 2510 * number of objects in a slab as critical. If we reach slub_max_order then 2511 * we try to keep the page order as low as possible. So we accept more waste 2512 * of space in favor of a small page order. 2513 * 2514 * Higher order allocations also allow the placement of more objects in a 2515 * slab and thereby reduce object handling overhead. If the user has 2516 * requested a higher minimum order then we start with that one instead of 2517 * the smallest order which will fit the object.
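 *
 * Worked example (illustrative only; assumes 4KiB pages, no reserved
 * bytes and slub_min_order left at 0): for a 700 byte object,
 *
 *	slab_order(700, 4, 3, 16, 0)
 *
 * starts at order 0, where 5 objects fit and 4096 - 5*700 = 596 bytes
 * are left over; that is more than 4096/16 = 256, so the order is
 * bumped. At order 1, 11 objects fit and 8192 - 11*700 = 492 bytes are
 * wasted, which is within 8192/16 = 512, so order 1 is returned.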
2518 */ 2519 static inline int slab_order(int size, int min_objects, 2520 int max_order, int fract_leftover, int reserved) 2521 { 2522 int order; 2523 int rem; 2524 int min_order = slub_min_order; 2525 2526 if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) 2527 return get_order(size * MAX_OBJS_PER_PAGE) - 1; 2528 2529 for (order = max(min_order, 2530 fls(min_objects * size - 1) - PAGE_SHIFT); 2531 order <= max_order; order++) { 2532 2533 unsigned long slab_size = PAGE_SIZE << order; 2534 2535 if (slab_size < min_objects * size + reserved) 2536 continue; 2537 2538 rem = (slab_size - reserved) % size; 2539 2540 if (rem <= slab_size / fract_leftover) 2541 break; 2542 2543 } 2544 2545 return order; 2546 } 2547 2548 static inline int calculate_order(int size, int reserved) 2549 { 2550 int order; 2551 int min_objects; 2552 int fraction; 2553 int max_objects; 2554 2555 /* 2556 * Attempt to find best configuration for a slab. This 2557 * works by first attempting to generate a layout with 2558 * the best configuration and backing off gradually. 2559 * 2560 * First we reduce the acceptable waste in a slab. Then 2561 * we reduce the minimum objects required in a slab. 2562 */ 2563 min_objects = slub_min_objects; 2564 if (!min_objects) 2565 min_objects = 4 * (fls(nr_cpu_ids) + 1); 2566 max_objects = order_objects(slub_max_order, size, reserved); 2567 min_objects = min(min_objects, max_objects); 2568 2569 while (min_objects > 1) { 2570 fraction = 16; 2571 while (fraction >= 4) { 2572 order = slab_order(size, min_objects, 2573 slub_max_order, fraction, reserved); 2574 if (order <= slub_max_order) 2575 return order; 2576 fraction /= 2; 2577 } 2578 min_objects--; 2579 } 2580 2581 /* 2582 * We were unable to place multiple objects in a slab. Now 2583 * lets see if we can place a single object there. 2584 */ 2585 order = slab_order(size, 1, slub_max_order, 1, reserved); 2586 if (order <= slub_max_order) 2587 return order; 2588 2589 /* 2590 * Doh this slab cannot be placed using slub_max_order. 2591 */ 2592 order = slab_order(size, 1, MAX_ORDER, 1, reserved); 2593 if (order < MAX_ORDER) 2594 return order; 2595 return -ENOSYS; 2596 } 2597 2598 /* 2599 * Figure out what the alignment of the objects will be. 2600 */ 2601 static unsigned long calculate_alignment(unsigned long flags, 2602 unsigned long align, unsigned long size) 2603 { 2604 /* 2605 * If the user wants hardware cache aligned objects then follow that 2606 * suggestion if the object is sufficiently large. 2607 * 2608 * The hardware cache alignment cannot override the specified 2609 * alignment though. If that is greater then use it. 
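 *
 * Worked example (illustrative, assuming a 64 byte cache line): for a
 * 20 byte object with SLAB_HWCACHE_ALIGN the loop below halves ralign
 * from 64 to 32 (20 <= 32) and then stops (20 > 16), so two such
 * objects share one cache line instead of each being padded out to 64
 * bytes. A 40 byte object keeps the full 64 byte alignment.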
2610 */ 2611 if (flags & SLAB_HWCACHE_ALIGN) { 2612 unsigned long ralign = cache_line_size(); 2613 while (size <= ralign / 2) 2614 ralign /= 2; 2615 align = max(align, ralign); 2616 } 2617 2618 if (align < ARCH_SLAB_MINALIGN) 2619 align = ARCH_SLAB_MINALIGN; 2620 2621 return ALIGN(align, sizeof(void *)); 2622 } 2623 2624 static void 2625 init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) 2626 { 2627 n->nr_partial = 0; 2628 spin_lock_init(&n->list_lock); 2629 INIT_LIST_HEAD(&n->partial); 2630 #ifdef CONFIG_SLUB_DEBUG 2631 atomic_long_set(&n->nr_slabs, 0); 2632 atomic_long_set(&n->total_objects, 0); 2633 INIT_LIST_HEAD(&n->full); 2634 #endif 2635 } 2636 2637 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) 2638 { 2639 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < 2640 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); 2641 2642 /* 2643 * Must align to double word boundary for the double cmpxchg 2644 * instructions to work; see __pcpu_double_call_return_bool(). 2645 */ 2646 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2647 2 * sizeof(void *)); 2648 2649 if (!s->cpu_slab) 2650 return 0; 2651 2652 init_kmem_cache_cpus(s); 2653 2654 return 1; 2655 } 2656 2657 static struct kmem_cache *kmem_cache_node; 2658 2659 /* 2660 * No kmalloc_node yet so do it by hand. We know that this is the first 2661 * slab on the node for this slabcache. There are no concurrent accesses 2662 * possible. 2663 * 2664 * Note that this function only works on the kmalloc_node_cache 2665 * when allocating for the kmalloc_node_cache. This is used for bootstrapping 2666 * memory on a fresh node that has no slab structures yet. 2667 */ 2668 static void early_kmem_cache_node_alloc(int node) 2669 { 2670 struct page *page; 2671 struct kmem_cache_node *n; 2672 2673 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); 2674 2675 page = new_slab(kmem_cache_node, GFP_NOWAIT, node); 2676 2677 BUG_ON(!page); 2678 if (page_to_nid(page) != node) { 2679 printk(KERN_ERR "SLUB: Unable to allocate memory from " 2680 "node %d\n", node); 2681 printk(KERN_ERR "SLUB: Allocating a useless per node structure " 2682 "in order to be able to continue\n"); 2683 } 2684 2685 n = page->freelist; 2686 BUG_ON(!n); 2687 page->freelist = get_freepointer(kmem_cache_node, n); 2688 page->inuse++; 2689 page->frozen = 0; 2690 kmem_cache_node->node[node] = n; 2691 #ifdef CONFIG_SLUB_DEBUG 2692 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 2693 init_tracking(kmem_cache_node, n); 2694 #endif 2695 init_kmem_cache_node(n, kmem_cache_node); 2696 inc_slabs_node(kmem_cache_node, node, page->objects); 2697 2698 add_partial(n, page, 0); 2699 } 2700 2701 static void free_kmem_cache_nodes(struct kmem_cache *s) 2702 { 2703 int node; 2704 2705 for_each_node_state(node, N_NORMAL_MEMORY) { 2706 struct kmem_cache_node *n = s->node[node]; 2707 2708 if (n) 2709 kmem_cache_free(kmem_cache_node, n); 2710 2711 s->node[node] = NULL; 2712 } 2713 } 2714 2715 static int init_kmem_cache_nodes(struct kmem_cache *s) 2716 { 2717 int node; 2718 2719 for_each_node_state(node, N_NORMAL_MEMORY) { 2720 struct kmem_cache_node *n; 2721 2722 if (slab_state == DOWN) { 2723 early_kmem_cache_node_alloc(node); 2724 continue; 2725 } 2726 n = kmem_cache_alloc_node(kmem_cache_node, 2727 GFP_KERNEL, node); 2728 2729 if (!n) { 2730 free_kmem_cache_nodes(s); 2731 return 0; 2732 } 2733 2734 s->node[node] = n; 2735 init_kmem_cache_node(n, s); 2736 } 2737 return 1; 2738 } 2739 2740 static void set_min_partial(struct kmem_cache *s, unsigned long min) 2741 { 
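	/*
	 * Illustrative note (added, not in the original): kmem_cache_open()
	 * passes ilog2(s->size) here, so a cache with s->size == 4096 asks
	 * for 12 and is clamped to MAX_PARTIAL (10), while one with
	 * s->size == 16 asks for 4 and is raised to MIN_PARTIAL (5).
	 */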
2742 if (min < MIN_PARTIAL) 2743 min = MIN_PARTIAL; 2744 else if (min > MAX_PARTIAL) 2745 min = MAX_PARTIAL; 2746 s->min_partial = min; 2747 } 2748 2749 /* 2750 * calculate_sizes() determines the order and the distribution of data within 2751 * a slab object. 2752 */ 2753 static int calculate_sizes(struct kmem_cache *s, int forced_order) 2754 { 2755 unsigned long flags = s->flags; 2756 unsigned long size = s->objsize; 2757 unsigned long align = s->align; 2758 int order; 2759 2760 /* 2761 * Round up object size to the next word boundary. We can only 2762 * place the free pointer at word boundaries and this determines 2763 * the possible location of the free pointer. 2764 */ 2765 size = ALIGN(size, sizeof(void *)); 2766 2767 #ifdef CONFIG_SLUB_DEBUG 2768 /* 2769 * Determine if we can poison the object itself. If the user of 2770 * the slab may touch the object after free or before allocation 2771 * then we should never poison the object itself. 2772 */ 2773 if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && 2774 !s->ctor) 2775 s->flags |= __OBJECT_POISON; 2776 else 2777 s->flags &= ~__OBJECT_POISON; 2778 2779 2780 /* 2781 * If we are Redzoning then check if there is some space between the 2782 * end of the object and the free pointer. If not then add an 2783 * additional word to have some bytes to store Redzone information. 2784 */ 2785 if ((flags & SLAB_RED_ZONE) && size == s->objsize) 2786 size += sizeof(void *); 2787 #endif 2788 2789 /* 2790 * With that we have determined the number of bytes in actual use 2791 * by the object. This is the potential offset to the free pointer. 2792 */ 2793 s->inuse = size; 2794 2795 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || 2796 s->ctor)) { 2797 /* 2798 * Relocate free pointer after the object if it is not 2799 * permitted to overwrite the first word of the object on 2800 * kmem_cache_free. 2801 * 2802 * This is the case if we do RCU, have a constructor or 2803 * destructor or are poisoning the objects. 2804 */ 2805 s->offset = size; 2806 size += sizeof(void *); 2807 } 2808 2809 #ifdef CONFIG_SLUB_DEBUG 2810 if (flags & SLAB_STORE_USER) 2811 /* 2812 * Need to store information about allocs and frees after 2813 * the object. 2814 */ 2815 size += 2 * sizeof(struct track); 2816 2817 if (flags & SLAB_RED_ZONE) 2818 /* 2819 * Add some empty padding so that we can catch 2820 * overwrites from earlier objects rather than let 2821 * tracking information or the free pointer be 2822 * corrupted if a user writes before the start 2823 * of the object. 2824 */ 2825 size += sizeof(void *); 2826 #endif 2827 2828 /* 2829 * Determine the alignment based on various parameters that the 2830 * user specified and the dynamic determination of cache line size 2831 * on bootup. 2832 */ 2833 align = calculate_alignment(flags, align, s->objsize); 2834 s->align = align; 2835 2836 /* 2837 * SLUB stores one object immediately after another beginning from 2838 * offset 0. In order to align the objects we have to simply size 2839 * each object to conform to the alignment. 
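 *
 * Worked example (illustrative; 64 bit, no debug flags, default word
 * alignment, cache has a constructor): an objsize of 52 is rounded to
 * 56 (s->inuse), the free pointer is then placed behind the object
 * (s->offset = 56), giving 64 bytes, and the ALIGN() below keeps it at
 * 64. Each object occupies one 64 byte slot: payload in bytes 0-51,
 * padding in 52-55, free pointer in 56-63.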
2840 */ 2841 size = ALIGN(size, align); 2842 s->size = size; 2843 if (forced_order >= 0) 2844 order = forced_order; 2845 else 2846 order = calculate_order(size, s->reserved); 2847 2848 if (order < 0) 2849 return 0; 2850 2851 s->allocflags = 0; 2852 if (order) 2853 s->allocflags |= __GFP_COMP; 2854 2855 if (s->flags & SLAB_CACHE_DMA) 2856 s->allocflags |= SLUB_DMA; 2857 2858 if (s->flags & SLAB_RECLAIM_ACCOUNT) 2859 s->allocflags |= __GFP_RECLAIMABLE; 2860 2861 /* 2862 * Determine the number of objects per slab 2863 */ 2864 s->oo = oo_make(order, size, s->reserved); 2865 s->min = oo_make(get_order(size), size, s->reserved); 2866 if (oo_objects(s->oo) > oo_objects(s->max)) 2867 s->max = s->oo; 2868 2869 return !!oo_objects(s->oo); 2870 2871 } 2872 2873 static int kmem_cache_open(struct kmem_cache *s, 2874 const char *name, size_t size, 2875 size_t align, unsigned long flags, 2876 void (*ctor)(void *)) 2877 { 2878 memset(s, 0, kmem_size); 2879 s->name = name; 2880 s->ctor = ctor; 2881 s->objsize = size; 2882 s->align = align; 2883 s->flags = kmem_cache_flags(size, flags, name, ctor); 2884 s->reserved = 0; 2885 2886 if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) 2887 s->reserved = sizeof(struct rcu_head); 2888 2889 if (!calculate_sizes(s, -1)) 2890 goto error; 2891 if (disable_higher_order_debug) { 2892 /* 2893 * Disable debugging flags that store metadata if the min slab 2894 * order increased. 2895 */ 2896 if (get_order(s->size) > get_order(s->objsize)) { 2897 s->flags &= ~DEBUG_METADATA_FLAGS; 2898 s->offset = 0; 2899 if (!calculate_sizes(s, -1)) 2900 goto error; 2901 } 2902 } 2903 2904 #ifdef CONFIG_CMPXCHG_DOUBLE 2905 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) 2906 /* Enable fast mode */ 2907 s->flags |= __CMPXCHG_DOUBLE; 2908 #endif 2909 2910 /* 2911 * The larger the object size is, the more pages we want on the partial 2912 * list to avoid pounding the page allocator excessively. 2913 */ 2914 set_min_partial(s, ilog2(s->size)); 2915 s->refcount = 1; 2916 #ifdef CONFIG_NUMA 2917 s->remote_node_defrag_ratio = 1000; 2918 #endif 2919 if (!init_kmem_cache_nodes(s)) 2920 goto error; 2921 2922 if (alloc_kmem_cache_cpus(s)) 2923 return 1; 2924 2925 free_kmem_cache_nodes(s); 2926 error: 2927 if (flags & SLAB_PANIC) 2928 panic("Cannot create slab %s size=%lu realsize=%u " 2929 "order=%u offset=%u flags=%lx\n", 2930 s->name, (unsigned long)size, s->size, oo_order(s->oo), 2931 s->offset, flags); 2932 return 0; 2933 } 2934 2935 /* 2936 * Determine the size of a slab object 2937 */ 2938 unsigned int kmem_cache_size(struct kmem_cache *s) 2939 { 2940 return s->objsize; 2941 } 2942 EXPORT_SYMBOL(kmem_cache_size); 2943 2944 static void list_slab_objects(struct kmem_cache *s, struct page *page, 2945 const char *text) 2946 { 2947 #ifdef CONFIG_SLUB_DEBUG 2948 void *addr = page_address(page); 2949 void *p; 2950 unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) * 2951 sizeof(long), GFP_ATOMIC); 2952 if (!map) 2953 return; 2954 slab_err(s, page, "%s", text); 2955 slab_lock(page); 2956 2957 get_map(s, page, map); 2958 for_each_object(p, s, addr, page->objects) { 2959 2960 if (!test_bit(slab_index(p, s, addr), map)) { 2961 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", 2962 p, p - addr); 2963 print_tracking(s, p); 2964 } 2965 } 2966 slab_unlock(page); 2967 kfree(map); 2968 #endif 2969 } 2970 2971 /* 2972 * Attempt to free all partial slabs on a node. 
2973 */ 2974 static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) 2975 { 2976 unsigned long flags; 2977 struct page *page, *h; 2978 2979 spin_lock_irqsave(&n->list_lock, flags); 2980 list_for_each_entry_safe(page, h, &n->partial, lru) { 2981 if (!page->inuse) { 2982 remove_partial(n, page); 2983 discard_slab(s, page); 2984 } else { 2985 list_slab_objects(s, page, 2986 "Objects remaining on kmem_cache_close()"); 2987 } 2988 } 2989 spin_unlock_irqrestore(&n->list_lock, flags); 2990 } 2991 2992 /* 2993 * Release all resources used by a slab cache. 2994 */ 2995 static inline int kmem_cache_close(struct kmem_cache *s) 2996 { 2997 int node; 2998 2999 flush_all(s); 3000 free_percpu(s->cpu_slab); 3001 /* Attempt to free all objects */ 3002 for_each_node_state(node, N_NORMAL_MEMORY) { 3003 struct kmem_cache_node *n = get_node(s, node); 3004 3005 free_partial(s, n); 3006 if (n->nr_partial || slabs_node(s, node)) 3007 return 1; 3008 } 3009 free_kmem_cache_nodes(s); 3010 return 0; 3011 } 3012 3013 /* 3014 * Close a cache and release the kmem_cache structure 3015 * (must be used for caches created using kmem_cache_create) 3016 */ 3017 void kmem_cache_destroy(struct kmem_cache *s) 3018 { 3019 down_write(&slub_lock); 3020 s->refcount--; 3021 if (!s->refcount) { 3022 list_del(&s->list); 3023 if (kmem_cache_close(s)) { 3024 printk(KERN_ERR "SLUB %s: %s called for cache that " 3025 "still has objects.\n", s->name, __func__); 3026 dump_stack(); 3027 } 3028 if (s->flags & SLAB_DESTROY_BY_RCU) 3029 rcu_barrier(); 3030 sysfs_slab_remove(s); 3031 } 3032 up_write(&slub_lock); 3033 } 3034 EXPORT_SYMBOL(kmem_cache_destroy); 3035 3036 /******************************************************************** 3037 * Kmalloc subsystem 3038 *******************************************************************/ 3039 3040 struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; 3041 EXPORT_SYMBOL(kmalloc_caches); 3042 3043 static struct kmem_cache *kmem_cache; 3044 3045 #ifdef CONFIG_ZONE_DMA 3046 static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; 3047 #endif 3048 3049 static int __init setup_slub_min_order(char *str) 3050 { 3051 get_option(&str, &slub_min_order); 3052 3053 return 1; 3054 } 3055 3056 __setup("slub_min_order=", setup_slub_min_order); 3057 3058 static int __init setup_slub_max_order(char *str) 3059 { 3060 get_option(&str, &slub_max_order); 3061 slub_max_order = min(slub_max_order, MAX_ORDER - 1); 3062 3063 return 1; 3064 } 3065 3066 __setup("slub_max_order=", setup_slub_max_order); 3067 3068 static int __init setup_slub_min_objects(char *str) 3069 { 3070 get_option(&str, &slub_min_objects); 3071 3072 return 1; 3073 } 3074 3075 __setup("slub_min_objects=", setup_slub_min_objects); 3076 3077 static int __init setup_slub_nomerge(char *str) 3078 { 3079 slub_nomerge = 1; 3080 return 1; 3081 } 3082 3083 __setup("slub_nomerge", setup_slub_nomerge); 3084 3085 static struct kmem_cache *__init create_kmalloc_cache(const char *name, 3086 int size, unsigned int flags) 3087 { 3088 struct kmem_cache *s; 3089 3090 s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 3091 3092 /* 3093 * This function is called with IRQs disabled during early-boot on 3094 * single CPU so there's no need to take slub_lock here. 
3095 */ 3096 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, 3097 flags, NULL)) 3098 goto panic; 3099 3100 list_add(&s->list, &slab_caches); 3101 return s; 3102 3103 panic: 3104 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); 3105 return NULL; 3106 } 3107 3108 /* 3109 * Conversion table for small slabs sizes / 8 to the index in the 3110 * kmalloc array. This is necessary for slabs < 192 since we have non power 3111 * of two cache sizes there. The size of larger slabs can be determined using 3112 * fls. 3113 */ 3114 static s8 size_index[24] = { 3115 3, /* 8 */ 3116 4, /* 16 */ 3117 5, /* 24 */ 3118 5, /* 32 */ 3119 6, /* 40 */ 3120 6, /* 48 */ 3121 6, /* 56 */ 3122 6, /* 64 */ 3123 1, /* 72 */ 3124 1, /* 80 */ 3125 1, /* 88 */ 3126 1, /* 96 */ 3127 7, /* 104 */ 3128 7, /* 112 */ 3129 7, /* 120 */ 3130 7, /* 128 */ 3131 2, /* 136 */ 3132 2, /* 144 */ 3133 2, /* 152 */ 3134 2, /* 160 */ 3135 2, /* 168 */ 3136 2, /* 176 */ 3137 2, /* 184 */ 3138 2 /* 192 */ 3139 }; 3140 3141 static inline int size_index_elem(size_t bytes) 3142 { 3143 return (bytes - 1) / 8; 3144 } 3145 3146 static struct kmem_cache *get_slab(size_t size, gfp_t flags) 3147 { 3148 int index; 3149 3150 if (size <= 192) { 3151 if (!size) 3152 return ZERO_SIZE_PTR; 3153 3154 index = size_index[size_index_elem(size)]; 3155 } else 3156 index = fls(size - 1); 3157 3158 #ifdef CONFIG_ZONE_DMA 3159 if (unlikely((flags & SLUB_DMA))) 3160 return kmalloc_dma_caches[index]; 3161 3162 #endif 3163 return kmalloc_caches[index]; 3164 } 3165 3166 void *__kmalloc(size_t size, gfp_t flags) 3167 { 3168 struct kmem_cache *s; 3169 void *ret; 3170 3171 if (unlikely(size > SLUB_MAX_SIZE)) 3172 return kmalloc_large(size, flags); 3173 3174 s = get_slab(size, flags); 3175 3176 if (unlikely(ZERO_OR_NULL_PTR(s))) 3177 return s; 3178 3179 ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_); 3180 3181 trace_kmalloc(_RET_IP_, ret, size, s->size, flags); 3182 3183 return ret; 3184 } 3185 EXPORT_SYMBOL(__kmalloc); 3186 3187 #ifdef CONFIG_NUMA 3188 static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 3189 { 3190 struct page *page; 3191 void *ptr = NULL; 3192 3193 flags |= __GFP_COMP | __GFP_NOTRACK; 3194 page = alloc_pages_node(node, flags, get_order(size)); 3195 if (page) 3196 ptr = page_address(page); 3197 3198 kmemleak_alloc(ptr, size, 1, flags); 3199 return ptr; 3200 } 3201 3202 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3203 { 3204 struct kmem_cache *s; 3205 void *ret; 3206 3207 if (unlikely(size > SLUB_MAX_SIZE)) { 3208 ret = kmalloc_large_node(size, flags, node); 3209 3210 trace_kmalloc_node(_RET_IP_, ret, 3211 size, PAGE_SIZE << get_order(size), 3212 flags, node); 3213 3214 return ret; 3215 } 3216 3217 s = get_slab(size, flags); 3218 3219 if (unlikely(ZERO_OR_NULL_PTR(s))) 3220 return s; 3221 3222 ret = slab_alloc(s, flags, node, _RET_IP_); 3223 3224 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); 3225 3226 return ret; 3227 } 3228 EXPORT_SYMBOL(__kmalloc_node); 3229 #endif 3230 3231 size_t ksize(const void *object) 3232 { 3233 struct page *page; 3234 3235 if (unlikely(object == ZERO_SIZE_PTR)) 3236 return 0; 3237 3238 page = virt_to_head_page(object); 3239 3240 if (unlikely(!PageSlab(page))) { 3241 WARN_ON(!PageCompound(page)); 3242 return PAGE_SIZE << compound_order(page); 3243 } 3244 3245 return slab_ksize(page->slab); 3246 } 3247 EXPORT_SYMBOL(ksize); 3248 3249 #ifdef CONFIG_SLUB_DEBUG 3250 bool verify_mem_not_deleted(const void *x) 3251 { 3252 struct page *page; 3253 void 
*object = (void *)x; 3254 unsigned long flags; 3255 bool rv; 3256 3257 if (unlikely(ZERO_OR_NULL_PTR(x))) 3258 return false; 3259 3260 local_irq_save(flags); 3261 3262 page = virt_to_head_page(x); 3263 if (unlikely(!PageSlab(page))) { 3264 /* maybe it was from stack? */ 3265 rv = true; 3266 goto out_unlock; 3267 } 3268 3269 slab_lock(page); 3270 if (on_freelist(page->slab, page, object)) { 3271 object_err(page->slab, page, object, "Object is on free-list"); 3272 rv = false; 3273 } else { 3274 rv = true; 3275 } 3276 slab_unlock(page); 3277 3278 out_unlock: 3279 local_irq_restore(flags); 3280 return rv; 3281 } 3282 EXPORT_SYMBOL(verify_mem_not_deleted); 3283 #endif 3284 3285 void kfree(const void *x) 3286 { 3287 struct page *page; 3288 void *object = (void *)x; 3289 3290 trace_kfree(_RET_IP_, x); 3291 3292 if (unlikely(ZERO_OR_NULL_PTR(x))) 3293 return; 3294 3295 page = virt_to_head_page(x); 3296 if (unlikely(!PageSlab(page))) { 3297 BUG_ON(!PageCompound(page)); 3298 kmemleak_free(x); 3299 put_page(page); 3300 return; 3301 } 3302 slab_free(page->slab, page, object, _RET_IP_); 3303 } 3304 EXPORT_SYMBOL(kfree); 3305 3306 /* 3307 * kmem_cache_shrink removes empty slabs from the partial lists and sorts 3308 * the remaining slabs by the number of items in use. The slabs with the 3309 * most items in use come first. New allocations will then fill those up 3310 * and thus they can be removed from the partial lists. 3311 * 3312 * The slabs with the least items are placed last. This results in them 3313 * being allocated from last increasing the chance that the last objects 3314 * are freed in them. 3315 */ 3316 int kmem_cache_shrink(struct kmem_cache *s) 3317 { 3318 int node; 3319 int i; 3320 struct kmem_cache_node *n; 3321 struct page *page; 3322 struct page *t; 3323 int objects = oo_objects(s->max); 3324 struct list_head *slabs_by_inuse = 3325 kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); 3326 unsigned long flags; 3327 3328 if (!slabs_by_inuse) 3329 return -ENOMEM; 3330 3331 flush_all(s); 3332 for_each_node_state(node, N_NORMAL_MEMORY) { 3333 n = get_node(s, node); 3334 3335 if (!n->nr_partial) 3336 continue; 3337 3338 for (i = 0; i < objects; i++) 3339 INIT_LIST_HEAD(slabs_by_inuse + i); 3340 3341 spin_lock_irqsave(&n->list_lock, flags); 3342 3343 /* 3344 * Build lists indexed by the items in use in each slab. 3345 * 3346 * Note that concurrent frees may occur while we hold the 3347 * list_lock. page->inuse here is the upper limit. 3348 */ 3349 list_for_each_entry_safe(page, t, &n->partial, lru) { 3350 if (!page->inuse) { 3351 remove_partial(n, page); 3352 discard_slab(s, page); 3353 } else { 3354 list_move(&page->lru, 3355 slabs_by_inuse + page->inuse); 3356 } 3357 } 3358 3359 /* 3360 * Rebuild the partial list with the slabs filled up most 3361 * first and the least used slabs at the end. 
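 *
 * For example (illustrative): with 16 objects per slab, a slab that
 * still has 15 objects in use is spliced near the head of n->partial
 * and will be preferred when cpu slabs are refilled, while a slab with
 * only one object left in use ends up at the tail, where it has the
 * best chance of becoming empty and being discarded.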
*/ 3363 for (i = objects - 1; i >= 0; i--) 3364 list_splice(slabs_by_inuse + i, n->partial.prev); 3365 3366 spin_unlock_irqrestore(&n->list_lock, flags); 3367 } 3368 3369 kfree(slabs_by_inuse); 3370 return 0; 3371 } 3372 EXPORT_SYMBOL(kmem_cache_shrink); 3373 3374 #if defined(CONFIG_MEMORY_HOTPLUG) 3375 static int slab_mem_going_offline_callback(void *arg) 3376 { 3377 struct kmem_cache *s; 3378 3379 down_read(&slub_lock); 3380 list_for_each_entry(s, &slab_caches, list) 3381 kmem_cache_shrink(s); 3382 up_read(&slub_lock); 3383 3384 return 0; 3385 } 3386 3387 static void slab_mem_offline_callback(void *arg) 3388 { 3389 struct kmem_cache_node *n; 3390 struct kmem_cache *s; 3391 struct memory_notify *marg = arg; 3392 int offline_node; 3393 3394 offline_node = marg->status_change_nid; 3395 3396 /* 3397 * If the node still has available memory then we still need its 3398 * kmem_cache_node, so there is nothing to do here. 3399 */ 3400 if (offline_node < 0) 3401 return; 3402 3403 down_read(&slub_lock); 3404 list_for_each_entry(s, &slab_caches, list) { 3405 n = get_node(s, offline_node); 3406 if (n) { 3407 /* 3408 * if n->nr_slabs > 0, slabs still exist on the node 3409 * that is going down. We were unable to free them, 3410 * and the offline_pages() function shouldn't call this 3411 * callback. So, we must fail. 3412 */ 3413 BUG_ON(slabs_node(s, offline_node)); 3414 3415 s->node[offline_node] = NULL; 3416 kmem_cache_free(kmem_cache_node, n); 3417 } 3418 } 3419 up_read(&slub_lock); 3420 } 3421 3422 static int slab_mem_going_online_callback(void *arg) 3423 { 3424 struct kmem_cache_node *n; 3425 struct kmem_cache *s; 3426 struct memory_notify *marg = arg; 3427 int nid = marg->status_change_nid; 3428 int ret = 0; 3429 3430 /* 3431 * If the node's memory is already available, then kmem_cache_node is 3432 * already created. Nothing to do. 3433 */ 3434 if (nid < 0) 3435 return 0; 3436 3437 /* 3438 * We are bringing a node online. No memory is available yet. We must 3439 * allocate a kmem_cache_node structure in order to bring the node 3440 * online. 3441 */ 3442 down_read(&slub_lock); 3443 list_for_each_entry(s, &slab_caches, list) { 3444 /* 3445 * XXX: kmem_cache_alloc_node will fallback to other nodes 3446 * since memory is not yet available from the node that 3447 * is brought up.
3448 */ 3449 n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); 3450 if (!n) { 3451 ret = -ENOMEM; 3452 goto out; 3453 } 3454 init_kmem_cache_node(n, s); 3455 s->node[nid] = n; 3456 } 3457 out: 3458 up_read(&slub_lock); 3459 return ret; 3460 } 3461 3462 static int slab_memory_callback(struct notifier_block *self, 3463 unsigned long action, void *arg) 3464 { 3465 int ret = 0; 3466 3467 switch (action) { 3468 case MEM_GOING_ONLINE: 3469 ret = slab_mem_going_online_callback(arg); 3470 break; 3471 case MEM_GOING_OFFLINE: 3472 ret = slab_mem_going_offline_callback(arg); 3473 break; 3474 case MEM_OFFLINE: 3475 case MEM_CANCEL_ONLINE: 3476 slab_mem_offline_callback(arg); 3477 break; 3478 case MEM_ONLINE: 3479 case MEM_CANCEL_OFFLINE: 3480 break; 3481 } 3482 if (ret) 3483 ret = notifier_from_errno(ret); 3484 else 3485 ret = NOTIFY_OK; 3486 return ret; 3487 } 3488 3489 #endif /* CONFIG_MEMORY_HOTPLUG */ 3490 3491 /******************************************************************** 3492 * Basic setup of slabs 3493 *******************************************************************/ 3494 3495 /* 3496 * Used for early kmem_cache structures that were allocated using 3497 * the page allocator 3498 */ 3499 3500 static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) 3501 { 3502 int node; 3503 3504 list_add(&s->list, &slab_caches); 3505 s->refcount = -1; 3506 3507 for_each_node_state(node, N_NORMAL_MEMORY) { 3508 struct kmem_cache_node *n = get_node(s, node); 3509 struct page *p; 3510 3511 if (n) { 3512 list_for_each_entry(p, &n->partial, lru) 3513 p->slab = s; 3514 3515 #ifdef CONFIG_SLUB_DEBUG 3516 list_for_each_entry(p, &n->full, lru) 3517 p->slab = s; 3518 #endif 3519 } 3520 } 3521 } 3522 3523 void __init kmem_cache_init(void) 3524 { 3525 int i; 3526 int caches = 0; 3527 struct kmem_cache *temp_kmem_cache; 3528 int order; 3529 struct kmem_cache *temp_kmem_cache_node; 3530 unsigned long kmalloc_size; 3531 3532 kmem_size = offsetof(struct kmem_cache, node) + 3533 nr_node_ids * sizeof(struct kmem_cache_node *); 3534 3535 /* Allocate two kmem_caches from the page allocator */ 3536 kmalloc_size = ALIGN(kmem_size, cache_line_size()); 3537 order = get_order(2 * kmalloc_size); 3538 kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order); 3539 3540 /* 3541 * Must first have the slab cache available for the allocations of the 3542 * struct kmem_cache_node's. There is special bootstrap code in 3543 * kmem_cache_open for slab_state == DOWN. 3544 */ 3545 kmem_cache_node = (void *)kmem_cache + kmalloc_size; 3546 3547 kmem_cache_open(kmem_cache_node, "kmem_cache_node", 3548 sizeof(struct kmem_cache_node), 3549 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 3550 3551 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 3552 3553 /* Able to allocate the per node structures */ 3554 slab_state = PARTIAL; 3555 3556 temp_kmem_cache = kmem_cache; 3557 kmem_cache_open(kmem_cache, "kmem_cache", kmem_size, 3558 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 3559 kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 3560 memcpy(kmem_cache, temp_kmem_cache, kmem_size); 3561 3562 /* 3563 * Allocate kmem_cache_node properly from the kmem_cache slab. 3564 * kmem_cache_node is separately allocated so no need to 3565 * update any list pointers. 
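 *
 * Recap of the bootstrap sequence in this function (added for clarity):
 *
 *	1. carve boot copies of kmem_cache and kmem_cache_node out of
 *	   pages taken directly from the page allocator
 *	2. kmem_cache_open(kmem_cache_node, ...) using the slab_state ==
 *	   DOWN path in early_kmem_cache_node_alloc()
 *	3. slab_state = PARTIAL, then kmem_cache_open(kmem_cache, ...)
 *	4. re-allocate both structures from the now functional kmem_cache
 *	   slab, memcpy the boot copies over and repair the page->slab
 *	   back pointers via kmem_cache_bootstrap_fixup()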
3566 */ 3567 temp_kmem_cache_node = kmem_cache_node; 3568 3569 kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 3570 memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size); 3571 3572 kmem_cache_bootstrap_fixup(kmem_cache_node); 3573 3574 caches++; 3575 kmem_cache_bootstrap_fixup(kmem_cache); 3576 caches++; 3577 /* Free temporary boot structure */ 3578 free_pages((unsigned long)temp_kmem_cache, order); 3579 3580 /* Now we can use the kmem_cache to allocate kmalloc slabs */ 3581 3582 /* 3583 * Patch up the size_index table if we have strange large alignment 3584 * requirements for the kmalloc array. This is only the case for 3585 * MIPS it seems. The standard arches will not generate any code here. 3586 * 3587 * Largest permitted alignment is 256 bytes due to the way we 3588 * handle the index determination for the smaller caches. 3589 * 3590 * Make sure that nothing crazy happens if someone starts tinkering 3591 * around with ARCH_KMALLOC_MINALIGN 3592 */ 3593 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || 3594 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); 3595 3596 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { 3597 int elem = size_index_elem(i); 3598 if (elem >= ARRAY_SIZE(size_index)) 3599 break; 3600 size_index[elem] = KMALLOC_SHIFT_LOW; 3601 } 3602 3603 if (KMALLOC_MIN_SIZE == 64) { 3604 /* 3605 * The 96 byte size cache is not used if the alignment 3606 * is 64 byte. 3607 */ 3608 for (i = 64 + 8; i <= 96; i += 8) 3609 size_index[size_index_elem(i)] = 7; 3610 } else if (KMALLOC_MIN_SIZE == 128) { 3611 /* 3612 * The 192 byte sized cache is not used if the alignment 3613 * is 128 byte. Redirect kmalloc to use the 256 byte cache 3614 * instead. 3615 */ 3616 for (i = 128 + 8; i <= 192; i += 8) 3617 size_index[size_index_elem(i)] = 8; 3618 } 3619 3620 /* Caches that are not of the two-to-the-power-of size */ 3621 if (KMALLOC_MIN_SIZE <= 32) { 3622 kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0); 3623 caches++; 3624 } 3625 3626 if (KMALLOC_MIN_SIZE <= 64) { 3627 kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0); 3628 caches++; 3629 } 3630 3631 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3632 kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0); 3633 caches++; 3634 } 3635 3636 slab_state = UP; 3637 3638 /* Provide the correct kmalloc names now that the caches are up */ 3639 if (KMALLOC_MIN_SIZE <= 32) { 3640 kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT); 3641 BUG_ON(!kmalloc_caches[1]->name); 3642 } 3643 3644 if (KMALLOC_MIN_SIZE <= 64) { 3645 kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT); 3646 BUG_ON(!kmalloc_caches[2]->name); 3647 } 3648 3649 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3650 char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); 3651 3652 BUG_ON(!s); 3653 kmalloc_caches[i]->name = s; 3654 } 3655 3656 #ifdef CONFIG_SMP 3657 register_cpu_notifier(&slab_notifier); 3658 #endif 3659 3660 #ifdef CONFIG_ZONE_DMA 3661 for (i = 0; i < SLUB_PAGE_SHIFT; i++) { 3662 struct kmem_cache *s = kmalloc_caches[i]; 3663 3664 if (s && s->size) { 3665 char *name = kasprintf(GFP_NOWAIT, 3666 "dma-kmalloc-%d", s->objsize); 3667 3668 BUG_ON(!name); 3669 kmalloc_dma_caches[i] = create_kmalloc_cache(name, 3670 s->objsize, SLAB_CACHE_DMA); 3671 } 3672 } 3673 #endif 3674 printk(KERN_INFO 3675 "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 3676 " CPUs=%d, Nodes=%d\n", 3677 caches, cache_line_size(), 3678 slub_min_order, slub_max_order, slub_min_objects, 3679 nr_cpu_ids, 
nr_node_ids); 3680 } 3681 3682 void __init kmem_cache_init_late(void) 3683 { 3684 } 3685 3686 /* 3687 * Find a mergeable slab cache 3688 */ 3689 static int slab_unmergeable(struct kmem_cache *s) 3690 { 3691 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) 3692 return 1; 3693 3694 if (s->ctor) 3695 return 1; 3696 3697 /* 3698 * We may have set a slab to be unmergeable during bootstrap. 3699 */ 3700 if (s->refcount < 0) 3701 return 1; 3702 3703 return 0; 3704 } 3705 3706 static struct kmem_cache *find_mergeable(size_t size, 3707 size_t align, unsigned long flags, const char *name, 3708 void (*ctor)(void *)) 3709 { 3710 struct kmem_cache *s; 3711 3712 if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) 3713 return NULL; 3714 3715 if (ctor) 3716 return NULL; 3717 3718 size = ALIGN(size, sizeof(void *)); 3719 align = calculate_alignment(flags, align, size); 3720 size = ALIGN(size, align); 3721 flags = kmem_cache_flags(size, flags, name, NULL); 3722 3723 list_for_each_entry(s, &slab_caches, list) { 3724 if (slab_unmergeable(s)) 3725 continue; 3726 3727 if (size > s->size) 3728 continue; 3729 3730 if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) 3731 continue; 3732 /* 3733 * Check if alignment is compatible. 3734 * Courtesy of Adrian Drzewiecki 3735 */ 3736 if ((s->size & ~(align - 1)) != s->size) 3737 continue; 3738 3739 if (s->size - size >= sizeof(void *)) 3740 continue; 3741 3742 return s; 3743 } 3744 return NULL; 3745 } 3746 3747 struct kmem_cache *kmem_cache_create(const char *name, size_t size, 3748 size_t align, unsigned long flags, void (*ctor)(void *)) 3749 { 3750 struct kmem_cache *s; 3751 char *n; 3752 3753 if (WARN_ON(!name)) 3754 return NULL; 3755 3756 down_write(&slub_lock); 3757 s = find_mergeable(size, align, flags, name, ctor); 3758 if (s) { 3759 s->refcount++; 3760 /* 3761 * Adjust the object sizes so that we clear 3762 * the complete object on kzalloc. 3763 */ 3764 s->objsize = max(s->objsize, (int)size); 3765 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3766 3767 if (sysfs_slab_alias(s, name)) { 3768 s->refcount--; 3769 goto err; 3770 } 3771 up_write(&slub_lock); 3772 return s; 3773 } 3774 3775 n = kstrdup(name, GFP_KERNEL); 3776 if (!n) 3777 goto err; 3778 3779 s = kmalloc(kmem_size, GFP_KERNEL); 3780 if (s) { 3781 if (kmem_cache_open(s, n, 3782 size, align, flags, ctor)) { 3783 list_add(&s->list, &slab_caches); 3784 if (sysfs_slab_add(s)) { 3785 list_del(&s->list); 3786 kfree(n); 3787 kfree(s); 3788 goto err; 3789 } 3790 up_write(&slub_lock); 3791 return s; 3792 } 3793 kfree(n); 3794 kfree(s); 3795 } 3796 err: 3797 up_write(&slub_lock); 3798 3799 if (flags & SLAB_PANIC) 3800 panic("Cannot create slabcache %s\n", name); 3801 else 3802 s = NULL; 3803 return s; 3804 } 3805 EXPORT_SYMBOL(kmem_cache_create); 3806 3807 #ifdef CONFIG_SMP 3808 /* 3809 * Use the cpu notifier to insure that the cpu slabs are flushed when 3810 * necessary. 
3811 */ 3812 static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, 3813 unsigned long action, void *hcpu) 3814 { 3815 long cpu = (long)hcpu; 3816 struct kmem_cache *s; 3817 unsigned long flags; 3818 3819 switch (action) { 3820 case CPU_UP_CANCELED: 3821 case CPU_UP_CANCELED_FROZEN: 3822 case CPU_DEAD: 3823 case CPU_DEAD_FROZEN: 3824 down_read(&slub_lock); 3825 list_for_each_entry(s, &slab_caches, list) { 3826 local_irq_save(flags); 3827 __flush_cpu_slab(s, cpu); 3828 local_irq_restore(flags); 3829 } 3830 up_read(&slub_lock); 3831 break; 3832 default: 3833 break; 3834 } 3835 return NOTIFY_OK; 3836 } 3837 3838 static struct notifier_block __cpuinitdata slab_notifier = { 3839 .notifier_call = slab_cpuup_callback 3840 }; 3841 3842 #endif 3843 3844 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) 3845 { 3846 struct kmem_cache *s; 3847 void *ret; 3848 3849 if (unlikely(size > SLUB_MAX_SIZE)) 3850 return kmalloc_large(size, gfpflags); 3851 3852 s = get_slab(size, gfpflags); 3853 3854 if (unlikely(ZERO_OR_NULL_PTR(s))) 3855 return s; 3856 3857 ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); 3858 3859 /* Honor the call site pointer we received. */ 3860 trace_kmalloc(caller, ret, size, s->size, gfpflags); 3861 3862 return ret; 3863 } 3864 3865 #ifdef CONFIG_NUMA 3866 void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 3867 int node, unsigned long caller) 3868 { 3869 struct kmem_cache *s; 3870 void *ret; 3871 3872 if (unlikely(size > SLUB_MAX_SIZE)) { 3873 ret = kmalloc_large_node(size, gfpflags, node); 3874 3875 trace_kmalloc_node(caller, ret, 3876 size, PAGE_SIZE << get_order(size), 3877 gfpflags, node); 3878 3879 return ret; 3880 } 3881 3882 s = get_slab(size, gfpflags); 3883 3884 if (unlikely(ZERO_OR_NULL_PTR(s))) 3885 return s; 3886 3887 ret = slab_alloc(s, gfpflags, node, caller); 3888 3889 /* Honor the call site pointer we received. 
*/ 3890 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); 3891 3892 return ret; 3893 } 3894 #endif 3895 3896 #ifdef CONFIG_SYSFS 3897 static int count_inuse(struct page *page) 3898 { 3899 return page->inuse; 3900 } 3901 3902 static int count_total(struct page *page) 3903 { 3904 return page->objects; 3905 } 3906 #endif 3907 3908 #ifdef CONFIG_SLUB_DEBUG 3909 static int validate_slab(struct kmem_cache *s, struct page *page, 3910 unsigned long *map) 3911 { 3912 void *p; 3913 void *addr = page_address(page); 3914 3915 if (!check_slab(s, page) || 3916 !on_freelist(s, page, NULL)) 3917 return 0; 3918 3919 /* Now we know that a valid freelist exists */ 3920 bitmap_zero(map, page->objects); 3921 3922 get_map(s, page, map); 3923 for_each_object(p, s, addr, page->objects) { 3924 if (test_bit(slab_index(p, s, addr), map)) 3925 if (!check_object(s, page, p, SLUB_RED_INACTIVE)) 3926 return 0; 3927 } 3928 3929 for_each_object(p, s, addr, page->objects) 3930 if (!test_bit(slab_index(p, s, addr), map)) 3931 if (!check_object(s, page, p, SLUB_RED_ACTIVE)) 3932 return 0; 3933 return 1; 3934 } 3935 3936 static void validate_slab_slab(struct kmem_cache *s, struct page *page, 3937 unsigned long *map) 3938 { 3939 slab_lock(page); 3940 validate_slab(s, page, map); 3941 slab_unlock(page); 3942 } 3943 3944 static int validate_slab_node(struct kmem_cache *s, 3945 struct kmem_cache_node *n, unsigned long *map) 3946 { 3947 unsigned long count = 0; 3948 struct page *page; 3949 unsigned long flags; 3950 3951 spin_lock_irqsave(&n->list_lock, flags); 3952 3953 list_for_each_entry(page, &n->partial, lru) { 3954 validate_slab_slab(s, page, map); 3955 count++; 3956 } 3957 if (count != n->nr_partial) 3958 printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " 3959 "counter=%ld\n", s->name, count, n->nr_partial); 3960 3961 if (!(s->flags & SLAB_STORE_USER)) 3962 goto out; 3963 3964 list_for_each_entry(page, &n->full, lru) { 3965 validate_slab_slab(s, page, map); 3966 count++; 3967 } 3968 if (count != atomic_long_read(&n->nr_slabs)) 3969 printk(KERN_ERR "SLUB: %s %ld slabs counted but " 3970 "counter=%ld\n", s->name, count, 3971 atomic_long_read(&n->nr_slabs)); 3972 3973 out: 3974 spin_unlock_irqrestore(&n->list_lock, flags); 3975 return count; 3976 } 3977 3978 static long validate_slab_cache(struct kmem_cache *s) 3979 { 3980 int node; 3981 unsigned long count = 0; 3982 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * 3983 sizeof(unsigned long), GFP_KERNEL); 3984 3985 if (!map) 3986 return -ENOMEM; 3987 3988 flush_all(s); 3989 for_each_node_state(node, N_NORMAL_MEMORY) { 3990 struct kmem_cache_node *n = get_node(s, node); 3991 3992 count += validate_slab_node(s, n, map); 3993 } 3994 kfree(map); 3995 return count; 3996 } 3997 /* 3998 * Generate lists of code addresses where slabcache objects are allocated 3999 * and freed. 
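 *
 * The report is emitted by list_locations() below, one line per call
 * site, roughly of the form (illustrative):
 *
 *	<count> <symbol>+<offset> age=<min>/<avg>/<max> pid=<min>-<max>
 *	        cpus=<cpulist> nodes=<nodelist>
 *
 * where age is the number of jiffies since the tracked allocation or
 * free, and the cpus=/nodes= fields only appear on multi cpu/node
 * systems.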
4000 */ 4001 4002 struct location { 4003 unsigned long count; 4004 unsigned long addr; 4005 long long sum_time; 4006 long min_time; 4007 long max_time; 4008 long min_pid; 4009 long max_pid; 4010 DECLARE_BITMAP(cpus, NR_CPUS); 4011 nodemask_t nodes; 4012 }; 4013 4014 struct loc_track { 4015 unsigned long max; 4016 unsigned long count; 4017 struct location *loc; 4018 }; 4019 4020 static void free_loc_track(struct loc_track *t) 4021 { 4022 if (t->max) 4023 free_pages((unsigned long)t->loc, 4024 get_order(sizeof(struct location) * t->max)); 4025 } 4026 4027 static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) 4028 { 4029 struct location *l; 4030 int order; 4031 4032 order = get_order(sizeof(struct location) * max); 4033 4034 l = (void *)__get_free_pages(flags, order); 4035 if (!l) 4036 return 0; 4037 4038 if (t->count) { 4039 memcpy(l, t->loc, sizeof(struct location) * t->count); 4040 free_loc_track(t); 4041 } 4042 t->max = max; 4043 t->loc = l; 4044 return 1; 4045 } 4046 4047 static int add_location(struct loc_track *t, struct kmem_cache *s, 4048 const struct track *track) 4049 { 4050 long start, end, pos; 4051 struct location *l; 4052 unsigned long caddr; 4053 unsigned long age = jiffies - track->when; 4054 4055 start = -1; 4056 end = t->count; 4057 4058 for ( ; ; ) { 4059 pos = start + (end - start + 1) / 2; 4060 4061 /* 4062 * There is nothing at "end". If we end up there 4063 * we need to add something to before end. 4064 */ 4065 if (pos == end) 4066 break; 4067 4068 caddr = t->loc[pos].addr; 4069 if (track->addr == caddr) { 4070 4071 l = &t->loc[pos]; 4072 l->count++; 4073 if (track->when) { 4074 l->sum_time += age; 4075 if (age < l->min_time) 4076 l->min_time = age; 4077 if (age > l->max_time) 4078 l->max_time = age; 4079 4080 if (track->pid < l->min_pid) 4081 l->min_pid = track->pid; 4082 if (track->pid > l->max_pid) 4083 l->max_pid = track->pid; 4084 4085 cpumask_set_cpu(track->cpu, 4086 to_cpumask(l->cpus)); 4087 } 4088 node_set(page_to_nid(virt_to_page(track)), l->nodes); 4089 return 1; 4090 } 4091 4092 if (track->addr < caddr) 4093 end = pos; 4094 else 4095 start = pos; 4096 } 4097 4098 /* 4099 * Not found. Insert new tracking element. 
4100 */ 4101 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC)) 4102 return 0; 4103 4104 l = t->loc + pos; 4105 if (pos < t->count) 4106 memmove(l + 1, l, 4107 (t->count - pos) * sizeof(struct location)); 4108 t->count++; 4109 l->count = 1; 4110 l->addr = track->addr; 4111 l->sum_time = age; 4112 l->min_time = age; 4113 l->max_time = age; 4114 l->min_pid = track->pid; 4115 l->max_pid = track->pid; 4116 cpumask_clear(to_cpumask(l->cpus)); 4117 cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); 4118 nodes_clear(l->nodes); 4119 node_set(page_to_nid(virt_to_page(track)), l->nodes); 4120 return 1; 4121 } 4122 4123 static void process_slab(struct loc_track *t, struct kmem_cache *s, 4124 struct page *page, enum track_item alloc, 4125 unsigned long *map) 4126 { 4127 void *addr = page_address(page); 4128 void *p; 4129 4130 bitmap_zero(map, page->objects); 4131 get_map(s, page, map); 4132 4133 for_each_object(p, s, addr, page->objects) 4134 if (!test_bit(slab_index(p, s, addr), map)) 4135 add_location(t, s, get_track(s, p, alloc)); 4136 } 4137 4138 static int list_locations(struct kmem_cache *s, char *buf, 4139 enum track_item alloc) 4140 { 4141 int len = 0; 4142 unsigned long i; 4143 struct loc_track t = { 0, 0, NULL }; 4144 int node; 4145 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * 4146 sizeof(unsigned long), GFP_KERNEL); 4147 4148 if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), 4149 GFP_TEMPORARY)) { 4150 kfree(map); 4151 return sprintf(buf, "Out of memory\n"); 4152 } 4153 /* Push back cpu slabs */ 4154 flush_all(s); 4155 4156 for_each_node_state(node, N_NORMAL_MEMORY) { 4157 struct kmem_cache_node *n = get_node(s, node); 4158 unsigned long flags; 4159 struct page *page; 4160 4161 if (!atomic_long_read(&n->nr_slabs)) 4162 continue; 4163 4164 spin_lock_irqsave(&n->list_lock, flags); 4165 list_for_each_entry(page, &n->partial, lru) 4166 process_slab(&t, s, page, alloc, map); 4167 list_for_each_entry(page, &n->full, lru) 4168 process_slab(&t, s, page, alloc, map); 4169 spin_unlock_irqrestore(&n->list_lock, flags); 4170 } 4171 4172 for (i = 0; i < t.count; i++) { 4173 struct location *l = &t.loc[i]; 4174 4175 if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) 4176 break; 4177 len += sprintf(buf + len, "%7ld ", l->count); 4178 4179 if (l->addr) 4180 len += sprintf(buf + len, "%pS", (void *)l->addr); 4181 else 4182 len += sprintf(buf + len, "<not-available>"); 4183 4184 if (l->sum_time != l->min_time) { 4185 len += sprintf(buf + len, " age=%ld/%ld/%ld", 4186 l->min_time, 4187 (long)div_u64(l->sum_time, l->count), 4188 l->max_time); 4189 } else 4190 len += sprintf(buf + len, " age=%ld", 4191 l->min_time); 4192 4193 if (l->min_pid != l->max_pid) 4194 len += sprintf(buf + len, " pid=%ld-%ld", 4195 l->min_pid, l->max_pid); 4196 else 4197 len += sprintf(buf + len, " pid=%ld", 4198 l->min_pid); 4199 4200 if (num_online_cpus() > 1 && 4201 !cpumask_empty(to_cpumask(l->cpus)) && 4202 len < PAGE_SIZE - 60) { 4203 len += sprintf(buf + len, " cpus="); 4204 len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, 4205 to_cpumask(l->cpus)); 4206 } 4207 4208 if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && 4209 len < PAGE_SIZE - 60) { 4210 len += sprintf(buf + len, " nodes="); 4211 len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, 4212 l->nodes); 4213 } 4214 4215 len += sprintf(buf + len, "\n"); 4216 } 4217 4218 free_loc_track(&t); 4219 kfree(map); 4220 if (!t.count) 4221 len += sprintf(buf, "No data\n"); 4222 return len; 4223 } 4224 #endif 4225 
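/*
 * Illustrative usage of the debug machinery above (added note; the
 * corresponding sysfs attributes are defined further down in this
 * file, and the exact paths are an assumption of this example). With
 * CONFIG_SLUB_DEBUG and slub_debug=U enabled for a cache, the collected
 * call sites are typically inspected via sysfs along the lines of:
 *
 *	echo 1 > /sys/kernel/slab/kmalloc-64/validate
 *	cat /sys/kernel/slab/kmalloc-64/alloc_calls
 *	cat /sys/kernel/slab/kmalloc-64/free_calls
 *
 * where validate invokes validate_slab_cache() and the *_calls files
 * are generated by list_locations().
 */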
4226 #ifdef SLUB_RESILIENCY_TEST 4227 static void resiliency_test(void) 4228 { 4229 u8 *p; 4230 4231 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10); 4232 4233 printk(KERN_ERR "SLUB resiliency testing\n"); 4234 printk(KERN_ERR "-----------------------\n"); 4235 printk(KERN_ERR "A. Corruption after allocation\n"); 4236 4237 p = kzalloc(16, GFP_KERNEL); 4238 p[16] = 0x12; 4239 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" 4240 " 0x12->0x%p\n\n", p + 16); 4241 4242 validate_slab_cache(kmalloc_caches[4]); 4243 4244 /* Hmmm... The next two are dangerous */ 4245 p = kzalloc(32, GFP_KERNEL); 4246 p[32 + sizeof(void *)] = 0x34; 4247 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" 4248 " 0x34 -> -0x%p\n", p); 4249 printk(KERN_ERR 4250 "If allocated object is overwritten then not detectable\n\n"); 4251 4252 validate_slab_cache(kmalloc_caches[5]); 4253 p = kzalloc(64, GFP_KERNEL); 4254 p += 64 + (get_cycles() & 0xff) * sizeof(void *); 4255 *p = 0x56; 4256 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", 4257 p); 4258 printk(KERN_ERR 4259 "If allocated object is overwritten then not detectable\n\n"); 4260 validate_slab_cache(kmalloc_caches[6]); 4261 4262 printk(KERN_ERR "\nB. Corruption after free\n"); 4263 p = kzalloc(128, GFP_KERNEL); 4264 kfree(p); 4265 *p = 0x78; 4266 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); 4267 validate_slab_cache(kmalloc_caches[7]); 4268 4269 p = kzalloc(256, GFP_KERNEL); 4270 kfree(p); 4271 p[50] = 0x9a; 4272 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", 4273 p); 4274 validate_slab_cache(kmalloc_caches[8]); 4275 4276 p = kzalloc(512, GFP_KERNEL); 4277 kfree(p); 4278 p[512] = 0xab; 4279 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); 4280 validate_slab_cache(kmalloc_caches[9]); 4281 } 4282 #else 4283 #ifdef CONFIG_SYSFS 4284 static void resiliency_test(void) {}; 4285 #endif 4286 #endif 4287 4288 #ifdef CONFIG_SYSFS 4289 enum slab_stat_type { 4290 SL_ALL, /* All slabs */ 4291 SL_PARTIAL, /* Only partially allocated slabs */ 4292 SL_CPU, /* Only slabs used for cpu caches */ 4293 SL_OBJECTS, /* Determine allocated objects not slabs */ 4294 SL_TOTAL /* Determine object capacity not slabs */ 4295 }; 4296 4297 #define SO_ALL (1 << SL_ALL) 4298 #define SO_PARTIAL (1 << SL_PARTIAL) 4299 #define SO_CPU (1 << SL_CPU) 4300 #define SO_OBJECTS (1 << SL_OBJECTS) 4301 #define SO_TOTAL (1 << SL_TOTAL) 4302 4303 static ssize_t show_slab_objects(struct kmem_cache *s, 4304 char *buf, unsigned long flags) 4305 { 4306 unsigned long total = 0; 4307 int node; 4308 int x; 4309 unsigned long *nodes; 4310 unsigned long *per_cpu; 4311 4312 nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); 4313 if (!nodes) 4314 return -ENOMEM; 4315 per_cpu = nodes + nr_node_ids; 4316 4317 if (flags & SO_CPU) { 4318 int cpu; 4319 4320 for_each_possible_cpu(cpu) { 4321 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4322 4323 if (!c || c->node < 0) 4324 continue; 4325 4326 if (c->page) { 4327 if (flags & SO_TOTAL) 4328 x = c->page->objects; 4329 else if (flags & SO_OBJECTS) 4330 x = c->page->inuse; 4331 else 4332 x = 1; 4333 4334 total += x; 4335 nodes[c->node] += x; 4336 } 4337 per_cpu[c->node]++; 4338 } 4339 } 4340 4341 lock_memory_hotplug(); 4342 #ifdef CONFIG_SLUB_DEBUG 4343 if (flags & SO_ALL) { 4344 for_each_node_state(node, N_NORMAL_MEMORY) { 4345 struct kmem_cache_node *n = get_node(s, node); 4346 4347 if (flags & SO_TOTAL) 4348 x = 
atomic_long_read(&n->total_objects); 4349 else if (flags & SO_OBJECTS) 4350 x = atomic_long_read(&n->total_objects) - 4351 count_partial(n, count_free); 4352 4353 else 4354 x = atomic_long_read(&n->nr_slabs); 4355 total += x; 4356 nodes[node] += x; 4357 } 4358 4359 } else 4360 #endif 4361 if (flags & SO_PARTIAL) { 4362 for_each_node_state(node, N_NORMAL_MEMORY) { 4363 struct kmem_cache_node *n = get_node(s, node); 4364 4365 if (flags & SO_TOTAL) 4366 x = count_partial(n, count_total); 4367 else if (flags & SO_OBJECTS) 4368 x = count_partial(n, count_inuse); 4369 else 4370 x = n->nr_partial; 4371 total += x; 4372 nodes[node] += x; 4373 } 4374 } 4375 x = sprintf(buf, "%lu", total); 4376 #ifdef CONFIG_NUMA 4377 for_each_node_state(node, N_NORMAL_MEMORY) 4378 if (nodes[node]) 4379 x += sprintf(buf + x, " N%d=%lu", 4380 node, nodes[node]); 4381 #endif 4382 unlock_memory_hotplug(); 4383 kfree(nodes); 4384 return x + sprintf(buf + x, "\n"); 4385 } 4386 4387 #ifdef CONFIG_SLUB_DEBUG 4388 static int any_slab_objects(struct kmem_cache *s) 4389 { 4390 int node; 4391 4392 for_each_online_node(node) { 4393 struct kmem_cache_node *n = get_node(s, node); 4394 4395 if (!n) 4396 continue; 4397 4398 if (atomic_long_read(&n->total_objects)) 4399 return 1; 4400 } 4401 return 0; 4402 } 4403 #endif 4404 4405 #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 4406 #define to_slab(n) container_of(n, struct kmem_cache, kobj) 4407 4408 struct slab_attribute { 4409 struct attribute attr; 4410 ssize_t (*show)(struct kmem_cache *s, char *buf); 4411 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); 4412 }; 4413 4414 #define SLAB_ATTR_RO(_name) \ 4415 static struct slab_attribute _name##_attr = __ATTR_RO(_name) 4416 4417 #define SLAB_ATTR(_name) \ 4418 static struct slab_attribute _name##_attr = \ 4419 __ATTR(_name, 0644, _name##_show, _name##_store) 4420 4421 static ssize_t slab_size_show(struct kmem_cache *s, char *buf) 4422 { 4423 return sprintf(buf, "%d\n", s->size); 4424 } 4425 SLAB_ATTR_RO(slab_size); 4426 4427 static ssize_t align_show(struct kmem_cache *s, char *buf) 4428 { 4429 return sprintf(buf, "%d\n", s->align); 4430 } 4431 SLAB_ATTR_RO(align); 4432 4433 static ssize_t object_size_show(struct kmem_cache *s, char *buf) 4434 { 4435 return sprintf(buf, "%d\n", s->objsize); 4436 } 4437 SLAB_ATTR_RO(object_size); 4438 4439 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) 4440 { 4441 return sprintf(buf, "%d\n", oo_objects(s->oo)); 4442 } 4443 SLAB_ATTR_RO(objs_per_slab); 4444 4445 static ssize_t order_store(struct kmem_cache *s, 4446 const char *buf, size_t length) 4447 { 4448 unsigned long order; 4449 int err; 4450 4451 err = strict_strtoul(buf, 10, &order); 4452 if (err) 4453 return err; 4454 4455 if (order > slub_max_order || order < slub_min_order) 4456 return -EINVAL; 4457 4458 calculate_sizes(s, order); 4459 return length; 4460 } 4461 4462 static ssize_t order_show(struct kmem_cache *s, char *buf) 4463 { 4464 return sprintf(buf, "%d\n", oo_order(s->oo)); 4465 } 4466 SLAB_ATTR(order); 4467 4468 static ssize_t min_partial_show(struct kmem_cache *s, char *buf) 4469 { 4470 return sprintf(buf, "%lu\n", s->min_partial); 4471 } 4472 4473 static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, 4474 size_t length) 4475 { 4476 unsigned long min; 4477 int err; 4478 4479 err = strict_strtoul(buf, 10, &min); 4480 if (err) 4481 return err; 4482 4483 set_min_partial(s, min); 4484 return length; 4485 } 4486 SLAB_ATTR(min_partial); 4487 4488 static 
ssize_t ctor_show(struct kmem_cache *s, char *buf) 4489 { 4490 if (!s->ctor) 4491 return 0; 4492 return sprintf(buf, "%pS\n", s->ctor); 4493 } 4494 SLAB_ATTR_RO(ctor); 4495 4496 static ssize_t aliases_show(struct kmem_cache *s, char *buf) 4497 { 4498 return sprintf(buf, "%d\n", s->refcount - 1); 4499 } 4500 SLAB_ATTR_RO(aliases); 4501 4502 static ssize_t partial_show(struct kmem_cache *s, char *buf) 4503 { 4504 return show_slab_objects(s, buf, SO_PARTIAL); 4505 } 4506 SLAB_ATTR_RO(partial); 4507 4508 static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) 4509 { 4510 return show_slab_objects(s, buf, SO_CPU); 4511 } 4512 SLAB_ATTR_RO(cpu_slabs); 4513 4514 static ssize_t objects_show(struct kmem_cache *s, char *buf) 4515 { 4516 return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); 4517 } 4518 SLAB_ATTR_RO(objects); 4519 4520 static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) 4521 { 4522 return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS); 4523 } 4524 SLAB_ATTR_RO(objects_partial); 4525 4526 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4527 { 4528 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4529 } 4530 4531 static ssize_t reclaim_account_store(struct kmem_cache *s, 4532 const char *buf, size_t length) 4533 { 4534 s->flags &= ~SLAB_RECLAIM_ACCOUNT; 4535 if (buf[0] == '1') 4536 s->flags |= SLAB_RECLAIM_ACCOUNT; 4537 return length; 4538 } 4539 SLAB_ATTR(reclaim_account); 4540 4541 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) 4542 { 4543 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); 4544 } 4545 SLAB_ATTR_RO(hwcache_align); 4546 4547 #ifdef CONFIG_ZONE_DMA 4548 static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) 4549 { 4550 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); 4551 } 4552 SLAB_ATTR_RO(cache_dma); 4553 #endif 4554 4555 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 4556 { 4557 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); 4558 } 4559 SLAB_ATTR_RO(destroy_by_rcu); 4560 4561 static ssize_t reserved_show(struct kmem_cache *s, char *buf) 4562 { 4563 return sprintf(buf, "%d\n", s->reserved); 4564 } 4565 SLAB_ATTR_RO(reserved); 4566 4567 #ifdef CONFIG_SLUB_DEBUG 4568 static ssize_t slabs_show(struct kmem_cache *s, char *buf) 4569 { 4570 return show_slab_objects(s, buf, SO_ALL); 4571 } 4572 SLAB_ATTR_RO(slabs); 4573 4574 static ssize_t total_objects_show(struct kmem_cache *s, char *buf) 4575 { 4576 return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); 4577 } 4578 SLAB_ATTR_RO(total_objects); 4579 4580 static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) 4581 { 4582 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 4583 } 4584 4585 static ssize_t sanity_checks_store(struct kmem_cache *s, 4586 const char *buf, size_t length) 4587 { 4588 s->flags &= ~SLAB_DEBUG_FREE; 4589 if (buf[0] == '1') { 4590 s->flags &= ~__CMPXCHG_DOUBLE; 4591 s->flags |= SLAB_DEBUG_FREE; 4592 } 4593 return length; 4594 } 4595 SLAB_ATTR(sanity_checks); 4596 4597 static ssize_t trace_show(struct kmem_cache *s, char *buf) 4598 { 4599 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); 4600 } 4601 4602 static ssize_t trace_store(struct kmem_cache *s, const char *buf, 4603 size_t length) 4604 { 4605 s->flags &= ~SLAB_TRACE; 4606 if (buf[0] == '1') { 4607 s->flags &= ~__CMPXCHG_DOUBLE; 4608 s->flags |= SLAB_TRACE; 4609 } 4610 return length; 4611 } 4612 SLAB_ATTR(trace); 4613 4614 static ssize_t red_zone_show(struct kmem_cache *s, 
char *buf) 4615 { 4616 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); 4617 } 4618 4619 static ssize_t red_zone_store(struct kmem_cache *s, 4620 const char *buf, size_t length) 4621 { 4622 if (any_slab_objects(s)) 4623 return -EBUSY; 4624 4625 s->flags &= ~SLAB_RED_ZONE; 4626 if (buf[0] == '1') { 4627 s->flags &= ~__CMPXCHG_DOUBLE; 4628 s->flags |= SLAB_RED_ZONE; 4629 } 4630 calculate_sizes(s, -1); 4631 return length; 4632 } 4633 SLAB_ATTR(red_zone); 4634 4635 static ssize_t poison_show(struct kmem_cache *s, char *buf) 4636 { 4637 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); 4638 } 4639 4640 static ssize_t poison_store(struct kmem_cache *s, 4641 const char *buf, size_t length) 4642 { 4643 if (any_slab_objects(s)) 4644 return -EBUSY; 4645 4646 s->flags &= ~SLAB_POISON; 4647 if (buf[0] == '1') { 4648 s->flags &= ~__CMPXCHG_DOUBLE; 4649 s->flags |= SLAB_POISON; 4650 } 4651 calculate_sizes(s, -1); 4652 return length; 4653 } 4654 SLAB_ATTR(poison); 4655 4656 static ssize_t store_user_show(struct kmem_cache *s, char *buf) 4657 { 4658 return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); 4659 } 4660 4661 static ssize_t store_user_store(struct kmem_cache *s, 4662 const char *buf, size_t length) 4663 { 4664 if (any_slab_objects(s)) 4665 return -EBUSY; 4666 4667 s->flags &= ~SLAB_STORE_USER; 4668 if (buf[0] == '1') { 4669 s->flags &= ~__CMPXCHG_DOUBLE; 4670 s->flags |= SLAB_STORE_USER; 4671 } 4672 calculate_sizes(s, -1); 4673 return length; 4674 } 4675 SLAB_ATTR(store_user); 4676 4677 static ssize_t validate_show(struct kmem_cache *s, char *buf) 4678 { 4679 return 0; 4680 } 4681 4682 static ssize_t validate_store(struct kmem_cache *s, 4683 const char *buf, size_t length) 4684 { 4685 int ret = -EINVAL; 4686 4687 if (buf[0] == '1') { 4688 ret = validate_slab_cache(s); 4689 if (ret >= 0) 4690 ret = length; 4691 } 4692 return ret; 4693 } 4694 SLAB_ATTR(validate); 4695 4696 static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) 4697 { 4698 if (!(s->flags & SLAB_STORE_USER)) 4699 return -ENOSYS; 4700 return list_locations(s, buf, TRACK_ALLOC); 4701 } 4702 SLAB_ATTR_RO(alloc_calls); 4703 4704 static ssize_t free_calls_show(struct kmem_cache *s, char *buf) 4705 { 4706 if (!(s->flags & SLAB_STORE_USER)) 4707 return -ENOSYS; 4708 return list_locations(s, buf, TRACK_FREE); 4709 } 4710 SLAB_ATTR_RO(free_calls); 4711 #endif /* CONFIG_SLUB_DEBUG */ 4712 4713 #ifdef CONFIG_FAILSLAB 4714 static ssize_t failslab_show(struct kmem_cache *s, char *buf) 4715 { 4716 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); 4717 } 4718 4719 static ssize_t failslab_store(struct kmem_cache *s, const char *buf, 4720 size_t length) 4721 { 4722 s->flags &= ~SLAB_FAILSLAB; 4723 if (buf[0] == '1') 4724 s->flags |= SLAB_FAILSLAB; 4725 return length; 4726 } 4727 SLAB_ATTR(failslab); 4728 #endif 4729 4730 static ssize_t shrink_show(struct kmem_cache *s, char *buf) 4731 { 4732 return 0; 4733 } 4734 4735 static ssize_t shrink_store(struct kmem_cache *s, 4736 const char *buf, size_t length) 4737 { 4738 if (buf[0] == '1') { 4739 int rc = kmem_cache_shrink(s); 4740 4741 if (rc) 4742 return rc; 4743 } else 4744 return -EINVAL; 4745 return length; 4746 } 4747 SLAB_ATTR(shrink); 4748 4749 #ifdef CONFIG_NUMA 4750 static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) 4751 { 4752 return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10); 4753 } 4754 4755 static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, 4756 const char *buf, size_t length) 4757 { 4758 
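	/*
	 * The ratio is stored internally in tenths of a percent (0-1000)
	 * but is read and written through sysfs as a percentage (0-100),
	 * hence the "* 10" here and the "/ 10" in the _show function above.
	 */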
unsigned long ratio; 4759 int err; 4760 4761 err = strict_strtoul(buf, 10, &ratio); 4762 if (err) 4763 return err; 4764 4765 if (ratio <= 100) 4766 s->remote_node_defrag_ratio = ratio * 10; 4767 4768 return length; 4769 } 4770 SLAB_ATTR(remote_node_defrag_ratio); 4771 #endif 4772 4773 #ifdef CONFIG_SLUB_STATS 4774 static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) 4775 { 4776 unsigned long sum = 0; 4777 int cpu; 4778 int len; 4779 int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); 4780 4781 if (!data) 4782 return -ENOMEM; 4783 4784 for_each_online_cpu(cpu) { 4785 unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; 4786 4787 data[cpu] = x; 4788 sum += x; 4789 } 4790 4791 len = sprintf(buf, "%lu", sum); 4792 4793 #ifdef CONFIG_SMP 4794 for_each_online_cpu(cpu) { 4795 if (data[cpu] && len < PAGE_SIZE - 20) 4796 len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]); 4797 } 4798 #endif 4799 kfree(data); 4800 return len + sprintf(buf + len, "\n"); 4801 } 4802 4803 static void clear_stat(struct kmem_cache *s, enum stat_item si) 4804 { 4805 int cpu; 4806 4807 for_each_online_cpu(cpu) 4808 per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; 4809 } 4810 4811 #define STAT_ATTR(si, text) \ 4812 static ssize_t text##_show(struct kmem_cache *s, char *buf) \ 4813 { \ 4814 return show_stat(s, buf, si); \ 4815 } \ 4816 static ssize_t text##_store(struct kmem_cache *s, \ 4817 const char *buf, size_t length) \ 4818 { \ 4819 if (buf[0] != '0') \ 4820 return -EINVAL; \ 4821 clear_stat(s, si); \ 4822 return length; \ 4823 } \ 4824 SLAB_ATTR(text); \ 4825 4826 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); 4827 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); 4828 STAT_ATTR(FREE_FASTPATH, free_fastpath); 4829 STAT_ATTR(FREE_SLOWPATH, free_slowpath); 4830 STAT_ATTR(FREE_FROZEN, free_frozen); 4831 STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); 4832 STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); 4833 STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 4834 STAT_ATTR(ALLOC_SLAB, alloc_slab); 4835 STAT_ATTR(ALLOC_REFILL, alloc_refill); 4836 STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); 4837 STAT_ATTR(FREE_SLAB, free_slab); 4838 STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); 4839 STAT_ATTR(DEACTIVATE_FULL, deactivate_full); 4840 STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); 4841 STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); 4842 STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); 4843 STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); 4844 STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); 4845 STAT_ATTR(ORDER_FALLBACK, order_fallback); 4846 STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); 4847 STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); 4848 #endif 4849 4850 static struct attribute *slab_attrs[] = { 4851 &slab_size_attr.attr, 4852 &object_size_attr.attr, 4853 &objs_per_slab_attr.attr, 4854 &order_attr.attr, 4855 &min_partial_attr.attr, 4856 &objects_attr.attr, 4857 &objects_partial_attr.attr, 4858 &partial_attr.attr, 4859 &cpu_slabs_attr.attr, 4860 &ctor_attr.attr, 4861 &aliases_attr.attr, 4862 &align_attr.attr, 4863 &hwcache_align_attr.attr, 4864 &reclaim_account_attr.attr, 4865 &destroy_by_rcu_attr.attr, 4866 &shrink_attr.attr, 4867 &reserved_attr.attr, 4868 #ifdef CONFIG_SLUB_DEBUG 4869 &total_objects_attr.attr, 4870 &slabs_attr.attr, 4871 &sanity_checks_attr.attr, 4872 &trace_attr.attr, 4873 &red_zone_attr.attr, 4874 &poison_attr.attr, 4875 &store_user_attr.attr, 4876 &validate_attr.attr, 4877 &alloc_calls_attr.attr, 4878 &free_calls_attr.attr, 4879 #endif 
4880 #ifdef CONFIG_ZONE_DMA 4881 &cache_dma_attr.attr, 4882 #endif 4883 #ifdef CONFIG_NUMA 4884 &remote_node_defrag_ratio_attr.attr, 4885 #endif 4886 #ifdef CONFIG_SLUB_STATS 4887 &alloc_fastpath_attr.attr, 4888 &alloc_slowpath_attr.attr, 4889 &free_fastpath_attr.attr, 4890 &free_slowpath_attr.attr, 4891 &free_frozen_attr.attr, 4892 &free_add_partial_attr.attr, 4893 &free_remove_partial_attr.attr, 4894 &alloc_from_partial_attr.attr, 4895 &alloc_slab_attr.attr, 4896 &alloc_refill_attr.attr, 4897 &alloc_node_mismatch_attr.attr, 4898 &free_slab_attr.attr, 4899 &cpuslab_flush_attr.attr, 4900 &deactivate_full_attr.attr, 4901 &deactivate_empty_attr.attr, 4902 &deactivate_to_head_attr.attr, 4903 &deactivate_to_tail_attr.attr, 4904 &deactivate_remote_frees_attr.attr, 4905 &deactivate_bypass_attr.attr, 4906 &order_fallback_attr.attr, 4907 &cmpxchg_double_fail_attr.attr, 4908 &cmpxchg_double_cpu_fail_attr.attr, 4909 #endif 4910 #ifdef CONFIG_FAILSLAB 4911 &failslab_attr.attr, 4912 #endif 4913 4914 NULL 4915 }; 4916 4917 static struct attribute_group slab_attr_group = { 4918 .attrs = slab_attrs, 4919 }; 4920 4921 static ssize_t slab_attr_show(struct kobject *kobj, 4922 struct attribute *attr, 4923 char *buf) 4924 { 4925 struct slab_attribute *attribute; 4926 struct kmem_cache *s; 4927 int err; 4928 4929 attribute = to_slab_attr(attr); 4930 s = to_slab(kobj); 4931 4932 if (!attribute->show) 4933 return -EIO; 4934 4935 err = attribute->show(s, buf); 4936 4937 return err; 4938 } 4939 4940 static ssize_t slab_attr_store(struct kobject *kobj, 4941 struct attribute *attr, 4942 const char *buf, size_t len) 4943 { 4944 struct slab_attribute *attribute; 4945 struct kmem_cache *s; 4946 int err; 4947 4948 attribute = to_slab_attr(attr); 4949 s = to_slab(kobj); 4950 4951 if (!attribute->store) 4952 return -EIO; 4953 4954 err = attribute->store(s, buf, len); 4955 4956 return err; 4957 } 4958 4959 static void kmem_cache_release(struct kobject *kobj) 4960 { 4961 struct kmem_cache *s = to_slab(kobj); 4962 4963 kfree(s->name); 4964 kfree(s); 4965 } 4966 4967 static const struct sysfs_ops slab_sysfs_ops = { 4968 .show = slab_attr_show, 4969 .store = slab_attr_store, 4970 }; 4971 4972 static struct kobj_type slab_ktype = { 4973 .sysfs_ops = &slab_sysfs_ops, 4974 .release = kmem_cache_release 4975 }; 4976 4977 static int uevent_filter(struct kset *kset, struct kobject *kobj) 4978 { 4979 struct kobj_type *ktype = get_ktype(kobj); 4980 4981 if (ktype == &slab_ktype) 4982 return 1; 4983 return 0; 4984 } 4985 4986 static const struct kset_uevent_ops slab_uevent_ops = { 4987 .filter = uevent_filter, 4988 }; 4989 4990 static struct kset *slab_kset; 4991 4992 #define ID_STR_LENGTH 64 4993 4994 /* Create a unique string id for a slab cache: 4995 * 4996 * Format :[flags-]size 4997 */ 4998 static char *create_unique_id(struct kmem_cache *s) 4999 { 5000 char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); 5001 char *p = name; 5002 5003 BUG_ON(!name); 5004 5005 *p++ = ':'; 5006 /* 5007 * First flags affecting slabcache operations. We will only 5008 * get here for aliasable slabs so we do not need to support 5009 * too many flags. The flags here must cover all flags that 5010 * are matched during merging to guarantee that the id is 5011 * unique. 
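	 * For instance, a 4096 byte DMA cache without SLAB_NOTRACK set
	 * would get the id ":dt-0004096" (illustrative; see the flag
	 * characters below).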
5012 */ 5013 if (s->flags & SLAB_CACHE_DMA) 5014 *p++ = 'd'; 5015 if (s->flags & SLAB_RECLAIM_ACCOUNT) 5016 *p++ = 'a'; 5017 if (s->flags & SLAB_DEBUG_FREE) 5018 *p++ = 'F'; 5019 if (!(s->flags & SLAB_NOTRACK)) 5020 *p++ = 't'; 5021 if (p != name + 1) 5022 *p++ = '-'; 5023 p += sprintf(p, "%07d", s->size); 5024 BUG_ON(p > name + ID_STR_LENGTH - 1); 5025 return name; 5026 } 5027 5028 static int sysfs_slab_add(struct kmem_cache *s) 5029 { 5030 int err; 5031 const char *name; 5032 int unmergeable; 5033 5034 if (slab_state < SYSFS) 5035 /* Defer until later */ 5036 return 0; 5037 5038 unmergeable = slab_unmergeable(s); 5039 if (unmergeable) { 5040 /* 5041 * Slabcache can never be merged so we can use the name proper. 5042 * This is typically the case for debug situations. In that 5043 * case we can catch duplicate names easily. 5044 */ 5045 sysfs_remove_link(&slab_kset->kobj, s->name); 5046 name = s->name; 5047 } else { 5048 /* 5049 * Create a unique name for the slab as a target 5050 * for the symlinks. 5051 */ 5052 name = create_unique_id(s); 5053 } 5054 5055 s->kobj.kset = slab_kset; 5056 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); 5057 if (err) { 5058 kobject_put(&s->kobj); 5059 return err; 5060 } 5061 5062 err = sysfs_create_group(&s->kobj, &slab_attr_group); 5063 if (err) { 5064 kobject_del(&s->kobj); 5065 kobject_put(&s->kobj); 5066 return err; 5067 } 5068 kobject_uevent(&s->kobj, KOBJ_ADD); 5069 if (!unmergeable) { 5070 /* Setup first alias */ 5071 sysfs_slab_alias(s, s->name); 5072 kfree(name); 5073 } 5074 return 0; 5075 } 5076 5077 static void sysfs_slab_remove(struct kmem_cache *s) 5078 { 5079 if (slab_state < SYSFS) 5080 /* 5081 * Sysfs has not been setup yet so no need to remove the 5082 * cache from sysfs. 5083 */ 5084 return; 5085 5086 kobject_uevent(&s->kobj, KOBJ_REMOVE); 5087 kobject_del(&s->kobj); 5088 kobject_put(&s->kobj); 5089 } 5090 5091 /* 5092 * Need to buffer aliases during bootup until sysfs becomes 5093 * available lest we lose that information. 5094 */ 5095 struct saved_alias { 5096 struct kmem_cache *s; 5097 const char *name; 5098 struct saved_alias *next; 5099 }; 5100 5101 static struct saved_alias *alias_list; 5102 5103 static int sysfs_slab_alias(struct kmem_cache *s, const char *name) 5104 { 5105 struct saved_alias *al; 5106 5107 if (slab_state == SYSFS) { 5108 /* 5109 * If we have a leftover link then remove it. 
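		 * (e.g. a link that still points at an earlier cache of
		 * the same name which has since been destroyed).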
		 */
		sysfs_remove_link(&slab_kset->kobj, name);
		return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
	}

	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
	if (!al)
		return -ENOMEM;

	al->s = s;
	al->name = name;
	al->next = alias_list;
	alias_list = al;
	return 0;
}

static int __init slab_sysfs_init(void)
{
	struct kmem_cache *s;
	int err;

	down_write(&slub_lock);

	slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
	if (!slab_kset) {
		up_write(&slub_lock);
		printk(KERN_ERR "Cannot register slab subsystem.\n");
		return -ENOSYS;
	}

	slab_state = SYSFS;

	list_for_each_entry(s, &slab_caches, list) {
		err = sysfs_slab_add(s);
		if (err)
			printk(KERN_ERR "SLUB: Unable to add boot slab %s"
						" to sysfs\n", s->name);
	}

	while (alias_list) {
		struct saved_alias *al = alias_list;

		alias_list = alias_list->next;
		err = sysfs_slab_alias(al->s, al->name);
		if (err)
			printk(KERN_ERR "SLUB: Unable to add boot slab alias"
					" %s to sysfs\n", al->name);
		kfree(al);
	}

	up_write(&slub_lock);
	resiliency_test();
	return 0;
}

__initcall(slab_sysfs_init);
#endif /* CONFIG_SYSFS */

/*
 * The /proc/slabinfo ABI
 */
#ifdef CONFIG_SLABINFO
static void print_slabinfo_header(struct seq_file *m)
{
	seq_puts(m, "slabinfo - version: 2.1\n");
	seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
		"<objperslab> <pagesperslab>");
	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
	seq_putc(m, '\n');
}

static void *s_start(struct seq_file *m, loff_t *pos)
{
	loff_t n = *pos;

	down_read(&slub_lock);
	if (!n)
		print_slabinfo_header(m);

	return seq_list_start(&slab_caches, *pos);
}

static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
	return seq_list_next(p, &slab_caches, pos);
}

static void s_stop(struct seq_file *m, void *p)
{
	up_read(&slub_lock);
}

static int s_show(struct seq_file *m, void *p)
{
	unsigned long nr_partials = 0;
	unsigned long nr_slabs = 0;
	unsigned long nr_inuse = 0;
	unsigned long nr_objs = 0;
	unsigned long nr_free = 0;
	struct kmem_cache *s;
	int node;

	s = list_entry(p, struct kmem_cache, list);

	for_each_online_node(node) {
		struct kmem_cache_node *n = get_node(s, node);

		if (!n)
			continue;

		nr_partials += n->nr_partial;
		nr_slabs += atomic_long_read(&n->nr_slabs);
		nr_objs += atomic_long_read(&n->total_objects);
		nr_free += count_partial(n, count_free);
	}

	nr_inuse = nr_objs - nr_free;

	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
		   nr_objs, s->size, oo_objects(s->oo),
		   (1 << oo_order(s->oo)));
	seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
	seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
		   0UL);
	seq_putc(m, '\n');
	return 0;
}

static const struct seq_operations slabinfo_op = {
	.start = s_start,
	.next = s_next,
	.stop = s_stop,
	.show = s_show,
};

static int slabinfo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &slabinfo_op);
}

static const struct file_operations proc_slabinfo_operations = {
	.open = slabinfo_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
};

static int __init slab_proc_init(void)
{
	proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations);
	return 0;
}
module_init(slab_proc_init);
#endif /* CONFIG_SLABINFO */