/*
 * SLUB: A slab allocator that limits cache line use instead of queuing
 * objects in per cpu and per node lists.
 *
 * The allocator synchronizes using per slab locks or atomic operations
 * and only uses a centralized lock to manage a pool of partial slabs.
 *
 * (C) 2007 SGI, Christoph Lameter
 * (C) 2011 Linux Foundation, Christoph Lameter
 */

#include <linux/mm.h>
#include <linux/swap.h> /* struct reclaim_state */
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
#include <linux/proc_fs.h>
#include <linux/notifier.h>
#include <linux/seq_file.h>
#include <linux/kmemcheck.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/memory.h>
#include <linux/math64.h>
#include <linux/fault-inject.h>
#include <linux/stacktrace.h>
#include <linux/prefetch.h>
#include <linux/memcontrol.h>

#include <trace/events/kmem.h>

#include "internal.h"

/*
 * Lock order:
 *   1. slab_mutex (Global Mutex)
 *   2. node->list_lock
 *   3. slab_lock(page) (Only on some arches and for debugging)
 *
 *   slab_mutex
 *
 *   The role of the slab_mutex is to protect the list of all the slabs
 *   and to synchronize major metadata changes to slab cache structures.
 *
 *   The slab_lock is only used for debugging and on arches that do not
 *   have the ability to do a cmpxchg_double. It only protects the second
 *   double word in the page struct. Meaning
 *	A. page->freelist	-> List of free objects in a page
 *	B. page->counters	-> Counters of objects
 *	C. page->frozen		-> frozen state
 *
 *   If a slab is frozen then it is exempt from list management. It is not
 *   on any list. The processor that froze the slab is the one who can
 *   perform list operations on the page. Other processors may put objects
 *   onto the freelist but the processor that froze the slab is the only
 *   one that can retrieve the objects from the page's freelist.
 *
 *   The list_lock protects the partial and full list on each node and
 *   the partial slab counter. If taken then no new slabs may be added to or
 *   removed from the lists nor may the number of partial slabs be modified.
 *   (Note that the total number of slabs is an atomic value that may be
 *   modified without taking the list lock).
 *
 *   The list_lock is a centralized lock and thus we avoid taking it as
 *   much as possible. As long as SLUB does not have to handle partial
 *   slabs, operations can continue without any centralized lock. For
 *   example, allocating a long series of objects that fill up slabs does
 *   not require the list lock.
 *   Interrupts are disabled during allocation and deallocation in order to
 *   make the slab allocator safe to use in the context of an irq. In addition
 *   interrupts are disabled to ensure that the processor does not change
 *   while handling per_cpu slabs, due to kernel preemption.
 *
 * SLUB assigns one slab for allocation to each processor.
 * Allocations only occur from these slabs called cpu slabs.
 *
 * Slabs with free elements are kept on a partial list and during regular
 * operations no list for full slabs is used. If an object in a full slab is
 * freed then the slab will show up again on the partial lists.
 * We track full slabs for debugging purposes though because otherwise we
 * cannot scan all objects.
 *
 * Slabs are freed when they become empty. Teardown and setup are
 * minimal, so we rely on the page allocator's per cpu caches for
 * fast frees and allocs.
 *
 * Overloading of page flags that are otherwise used for LRU management.
 *
 * PageActive		The slab is frozen and exempt from list processing.
 *			This means that the slab is dedicated to a purpose
 *			such as satisfying allocations for a specific
 *			processor. Objects may be freed in the slab while
 *			it is frozen but slab_free will then skip the usual
 *			list operations. It is up to the processor holding
 *			the slab to integrate the slab into the slab lists
 *			when the slab is no longer needed.
 *
 *			One use of this flag is to mark slabs that are
 *			used for allocations. Then such a slab becomes a cpu
 *			slab. The cpu slab may be equipped with an additional
 *			freelist that allows lockless access to
 *			free objects in addition to the regular freelist
 *			that requires the slab lock.
 *
 * PageError		Slab requires special handling due to debug
 *			options set. This moves slab handling out of
 *			the fast path and disables lockless freelists.
 */

static inline int kmem_cache_debug(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_DEBUG
	return unlikely(s->flags & SLAB_DEBUG_FLAGS);
#else
	return 0;
#endif
}

static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
	return !kmem_cache_debug(s);
#else
	return false;
#endif
}

/*
 * Issues still to be resolved:
 *
 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 *
 * - Variable sizing of the per node arrays
 */

/* Enable to test recovery from slab corruption on boot */
#undef SLUB_RESILIENCY_TEST

/* Enable to log cmpxchg failures */
#undef SLUB_DEBUG_CMPXCHG

/*
 * Minimum number of partial slabs. These will be left on the partial
 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 */
#define MIN_PARTIAL 5

/*
 * Maximum number of desirable partial slabs.
 * The existence of more partial slabs makes kmem_cache_shrink
 * sort the partial list by the number of objects in them.
 */
#define MAX_PARTIAL 10

#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
				SLAB_POISON | SLAB_STORE_USER)

/*
 * Debugging flags that require metadata to be stored in the slab. These get
 * disabled when slub_debug=O is used and a cache's min order increases with
 * metadata.
169 */ 170 #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) 171 172 /* 173 * Set of flags that will prevent slab merging 174 */ 175 #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 176 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ 177 SLAB_FAILSLAB) 178 179 #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 180 SLAB_CACHE_DMA | SLAB_NOTRACK) 181 182 #define OO_SHIFT 16 183 #define OO_MASK ((1 << OO_SHIFT) - 1) 184 #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ 185 186 /* Internal SLUB flags */ 187 #define __OBJECT_POISON 0x80000000UL /* Poison object */ 188 #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ 189 190 #ifdef CONFIG_SMP 191 static struct notifier_block slab_notifier; 192 #endif 193 194 /* 195 * Tracking user of a slab. 196 */ 197 #define TRACK_ADDRS_COUNT 16 198 struct track { 199 unsigned long addr; /* Called from address */ 200 #ifdef CONFIG_STACKTRACE 201 unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ 202 #endif 203 int cpu; /* Was running on cpu */ 204 int pid; /* Pid context */ 205 unsigned long when; /* When did the operation occur */ 206 }; 207 208 enum track_item { TRACK_ALLOC, TRACK_FREE }; 209 210 #ifdef CONFIG_SYSFS 211 static int sysfs_slab_add(struct kmem_cache *); 212 static int sysfs_slab_alias(struct kmem_cache *, const char *); 213 static void sysfs_slab_remove(struct kmem_cache *); 214 static void memcg_propagate_slab_attrs(struct kmem_cache *s); 215 #else 216 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } 217 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) 218 { return 0; } 219 static inline void sysfs_slab_remove(struct kmem_cache *s) { } 220 221 static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } 222 #endif 223 224 static inline void stat(const struct kmem_cache *s, enum stat_item si) 225 { 226 #ifdef CONFIG_SLUB_STATS 227 __this_cpu_inc(s->cpu_slab->stat[si]); 228 #endif 229 } 230 231 /******************************************************************** 232 * Core slab cache functions 233 *******************************************************************/ 234 235 static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 236 { 237 return s->node[node]; 238 } 239 240 /* Verify that a pointer has an address that is valid within a slab page */ 241 static inline int check_valid_pointer(struct kmem_cache *s, 242 struct page *page, const void *object) 243 { 244 void *base; 245 246 if (!object) 247 return 1; 248 249 base = page_address(page); 250 if (object < base || object >= base + page->objects * s->size || 251 (object - base) % s->size) { 252 return 0; 253 } 254 255 return 1; 256 } 257 258 static inline void *get_freepointer(struct kmem_cache *s, void *object) 259 { 260 return *(void **)(object + s->offset); 261 } 262 263 static void prefetch_freepointer(const struct kmem_cache *s, void *object) 264 { 265 prefetch(object + s->offset); 266 } 267 268 static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) 269 { 270 void *p; 271 272 #ifdef CONFIG_DEBUG_PAGEALLOC 273 probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); 274 #else 275 p = get_freepointer(s, object); 276 #endif 277 return p; 278 } 279 280 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) 281 { 282 *(void **)(object + s->offset) = fp; 283 } 284 285 /* Loop over all objects in a slab */ 286 #define for_each_object(__p, __s, 
__addr, __objects) \ 287 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ 288 __p += (__s)->size) 289 290 /* Determine object index from a given position */ 291 static inline int slab_index(void *p, struct kmem_cache *s, void *addr) 292 { 293 return (p - addr) / s->size; 294 } 295 296 static inline size_t slab_ksize(const struct kmem_cache *s) 297 { 298 #ifdef CONFIG_SLUB_DEBUG 299 /* 300 * Debugging requires use of the padding between object 301 * and whatever may come after it. 302 */ 303 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 304 return s->object_size; 305 306 #endif 307 /* 308 * If we have the need to store the freelist pointer 309 * back there or track user information then we can 310 * only use the space before that information. 311 */ 312 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) 313 return s->inuse; 314 /* 315 * Else we can use all the padding etc for the allocation 316 */ 317 return s->size; 318 } 319 320 static inline int order_objects(int order, unsigned long size, int reserved) 321 { 322 return ((PAGE_SIZE << order) - reserved) / size; 323 } 324 325 static inline struct kmem_cache_order_objects oo_make(int order, 326 unsigned long size, int reserved) 327 { 328 struct kmem_cache_order_objects x = { 329 (order << OO_SHIFT) + order_objects(order, size, reserved) 330 }; 331 332 return x; 333 } 334 335 static inline int oo_order(struct kmem_cache_order_objects x) 336 { 337 return x.x >> OO_SHIFT; 338 } 339 340 static inline int oo_objects(struct kmem_cache_order_objects x) 341 { 342 return x.x & OO_MASK; 343 } 344 345 /* 346 * Per slab locking using the pagelock 347 */ 348 static __always_inline void slab_lock(struct page *page) 349 { 350 bit_spin_lock(PG_locked, &page->flags); 351 } 352 353 static __always_inline void slab_unlock(struct page *page) 354 { 355 __bit_spin_unlock(PG_locked, &page->flags); 356 } 357 358 /* Interrupts must be disabled (for the fallback code to work right) */ 359 static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, 360 void *freelist_old, unsigned long counters_old, 361 void *freelist_new, unsigned long counters_new, 362 const char *n) 363 { 364 VM_BUG_ON(!irqs_disabled()); 365 #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ 366 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 367 if (s->flags & __CMPXCHG_DOUBLE) { 368 if (cmpxchg_double(&page->freelist, &page->counters, 369 freelist_old, counters_old, 370 freelist_new, counters_new)) 371 return 1; 372 } else 373 #endif 374 { 375 slab_lock(page); 376 if (page->freelist == freelist_old && 377 page->counters == counters_old) { 378 page->freelist = freelist_new; 379 page->counters = counters_new; 380 slab_unlock(page); 381 return 1; 382 } 383 slab_unlock(page); 384 } 385 386 cpu_relax(); 387 stat(s, CMPXCHG_DOUBLE_FAIL); 388 389 #ifdef SLUB_DEBUG_CMPXCHG 390 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); 391 #endif 392 393 return 0; 394 } 395 396 static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, 397 void *freelist_old, unsigned long counters_old, 398 void *freelist_new, unsigned long counters_new, 399 const char *n) 400 { 401 #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ 402 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 403 if (s->flags & __CMPXCHG_DOUBLE) { 404 if (cmpxchg_double(&page->freelist, &page->counters, 405 freelist_old, counters_old, 406 freelist_new, counters_new)) 407 return 1; 408 } else 409 #endif 410 { 411 unsigned long flags; 412 413 local_irq_save(flags); 414 slab_lock(page); 415 if 
(page->freelist == freelist_old && 416 page->counters == counters_old) { 417 page->freelist = freelist_new; 418 page->counters = counters_new; 419 slab_unlock(page); 420 local_irq_restore(flags); 421 return 1; 422 } 423 slab_unlock(page); 424 local_irq_restore(flags); 425 } 426 427 cpu_relax(); 428 stat(s, CMPXCHG_DOUBLE_FAIL); 429 430 #ifdef SLUB_DEBUG_CMPXCHG 431 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); 432 #endif 433 434 return 0; 435 } 436 437 #ifdef CONFIG_SLUB_DEBUG 438 /* 439 * Determine a map of object in use on a page. 440 * 441 * Node listlock must be held to guarantee that the page does 442 * not vanish from under us. 443 */ 444 static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) 445 { 446 void *p; 447 void *addr = page_address(page); 448 449 for (p = page->freelist; p; p = get_freepointer(s, p)) 450 set_bit(slab_index(p, s, addr), map); 451 } 452 453 /* 454 * Debug settings: 455 */ 456 #ifdef CONFIG_SLUB_DEBUG_ON 457 static int slub_debug = DEBUG_DEFAULT_FLAGS; 458 #else 459 static int slub_debug; 460 #endif 461 462 static char *slub_debug_slabs; 463 static int disable_higher_order_debug; 464 465 /* 466 * Object debugging 467 */ 468 static void print_section(char *text, u8 *addr, unsigned int length) 469 { 470 print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, 471 length, 1); 472 } 473 474 static struct track *get_track(struct kmem_cache *s, void *object, 475 enum track_item alloc) 476 { 477 struct track *p; 478 479 if (s->offset) 480 p = object + s->offset + sizeof(void *); 481 else 482 p = object + s->inuse; 483 484 return p + alloc; 485 } 486 487 static void set_track(struct kmem_cache *s, void *object, 488 enum track_item alloc, unsigned long addr) 489 { 490 struct track *p = get_track(s, object, alloc); 491 492 if (addr) { 493 #ifdef CONFIG_STACKTRACE 494 struct stack_trace trace; 495 int i; 496 497 trace.nr_entries = 0; 498 trace.max_entries = TRACK_ADDRS_COUNT; 499 trace.entries = p->addrs; 500 trace.skip = 3; 501 save_stack_trace(&trace); 502 503 /* See rant in lockdep.c */ 504 if (trace.nr_entries != 0 && 505 trace.entries[trace.nr_entries - 1] == ULONG_MAX) 506 trace.nr_entries--; 507 508 for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) 509 p->addrs[i] = 0; 510 #endif 511 p->addr = addr; 512 p->cpu = smp_processor_id(); 513 p->pid = current->pid; 514 p->when = jiffies; 515 } else 516 memset(p, 0, sizeof(struct track)); 517 } 518 519 static void init_tracking(struct kmem_cache *s, void *object) 520 { 521 if (!(s->flags & SLAB_STORE_USER)) 522 return; 523 524 set_track(s, object, TRACK_FREE, 0UL); 525 set_track(s, object, TRACK_ALLOC, 0UL); 526 } 527 528 static void print_track(const char *s, struct track *t) 529 { 530 if (!t->addr) 531 return; 532 533 printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", 534 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); 535 #ifdef CONFIG_STACKTRACE 536 { 537 int i; 538 for (i = 0; i < TRACK_ADDRS_COUNT; i++) 539 if (t->addrs[i]) 540 printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); 541 else 542 break; 543 } 544 #endif 545 } 546 547 static void print_tracking(struct kmem_cache *s, void *object) 548 { 549 if (!(s->flags & SLAB_STORE_USER)) 550 return; 551 552 print_track("Allocated", get_track(s, object, TRACK_ALLOC)); 553 print_track("Freed", get_track(s, object, TRACK_FREE)); 554 } 555 556 static void print_page_info(struct page *page) 557 { 558 printk(KERN_ERR 559 "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", 560 page, page->objects, 
page->inuse, page->freelist, page->flags); 561 562 } 563 564 static void slab_bug(struct kmem_cache *s, char *fmt, ...) 565 { 566 va_list args; 567 char buf[100]; 568 569 va_start(args, fmt); 570 vsnprintf(buf, sizeof(buf), fmt, args); 571 va_end(args); 572 printk(KERN_ERR "========================================" 573 "=====================================\n"); 574 printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); 575 printk(KERN_ERR "----------------------------------------" 576 "-------------------------------------\n\n"); 577 578 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 579 } 580 581 static void slab_fix(struct kmem_cache *s, char *fmt, ...) 582 { 583 va_list args; 584 char buf[100]; 585 586 va_start(args, fmt); 587 vsnprintf(buf, sizeof(buf), fmt, args); 588 va_end(args); 589 printk(KERN_ERR "FIX %s: %s\n", s->name, buf); 590 } 591 592 static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) 593 { 594 unsigned int off; /* Offset of last byte */ 595 u8 *addr = page_address(page); 596 597 print_tracking(s, p); 598 599 print_page_info(page); 600 601 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", 602 p, p - addr, get_freepointer(s, p)); 603 604 if (p > addr + 16) 605 print_section("Bytes b4 ", p - 16, 16); 606 607 print_section("Object ", p, min_t(unsigned long, s->object_size, 608 PAGE_SIZE)); 609 if (s->flags & SLAB_RED_ZONE) 610 print_section("Redzone ", p + s->object_size, 611 s->inuse - s->object_size); 612 613 if (s->offset) 614 off = s->offset + sizeof(void *); 615 else 616 off = s->inuse; 617 618 if (s->flags & SLAB_STORE_USER) 619 off += 2 * sizeof(struct track); 620 621 if (off != s->size) 622 /* Beginning of the filler is the free pointer */ 623 print_section("Padding ", p + off, s->size - off); 624 625 dump_stack(); 626 } 627 628 static void object_err(struct kmem_cache *s, struct page *page, 629 u8 *object, char *reason) 630 { 631 slab_bug(s, "%s", reason); 632 print_trailer(s, page, object); 633 } 634 635 static void slab_err(struct kmem_cache *s, struct page *page, 636 const char *fmt, ...) 637 { 638 va_list args; 639 char buf[100]; 640 641 va_start(args, fmt); 642 vsnprintf(buf, sizeof(buf), fmt, args); 643 va_end(args); 644 slab_bug(s, "%s", buf); 645 print_page_info(page); 646 dump_stack(); 647 } 648 649 static void init_object(struct kmem_cache *s, void *object, u8 val) 650 { 651 u8 *p = object; 652 653 if (s->flags & __OBJECT_POISON) { 654 memset(p, POISON_FREE, s->object_size - 1); 655 p[s->object_size - 1] = POISON_END; 656 } 657 658 if (s->flags & SLAB_RED_ZONE) 659 memset(p + s->object_size, val, s->inuse - s->object_size); 660 } 661 662 static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 663 void *from, void *to) 664 { 665 slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data); 666 memset(from, data, to - from); 667 } 668 669 static int check_bytes_and_report(struct kmem_cache *s, struct page *page, 670 u8 *object, char *what, 671 u8 *start, unsigned int value, unsigned int bytes) 672 { 673 u8 *fault; 674 u8 *end; 675 676 fault = memchr_inv(start, value, bytes); 677 if (!fault) 678 return 1; 679 680 end = start + bytes; 681 while (end > fault && end[-1] == value) 682 end--; 683 684 slab_bug(s, "%s overwritten", what); 685 printk(KERN_ERR "INFO: 0x%p-0x%p. 
First byte 0x%x instead of 0x%x\n", 686 fault, end - 1, fault[0], value); 687 print_trailer(s, page, object); 688 689 restore_bytes(s, what, value, fault, end); 690 return 0; 691 } 692 693 /* 694 * Object layout: 695 * 696 * object address 697 * Bytes of the object to be managed. 698 * If the freepointer may overlay the object then the free 699 * pointer is the first word of the object. 700 * 701 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 702 * 0xa5 (POISON_END) 703 * 704 * object + s->object_size 705 * Padding to reach word boundary. This is also used for Redzoning. 706 * Padding is extended by another word if Redzoning is enabled and 707 * object_size == inuse. 708 * 709 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with 710 * 0xcc (RED_ACTIVE) for objects in use. 711 * 712 * object + s->inuse 713 * Meta data starts here. 714 * 715 * A. Free pointer (if we cannot overwrite object on free) 716 * B. Tracking data for SLAB_STORE_USER 717 * C. Padding to reach required alignment boundary or at mininum 718 * one word if debugging is on to be able to detect writes 719 * before the word boundary. 720 * 721 * Padding is done using 0x5a (POISON_INUSE) 722 * 723 * object + s->size 724 * Nothing is used beyond s->size. 725 * 726 * If slabcaches are merged then the object_size and inuse boundaries are mostly 727 * ignored. And therefore no slab options that rely on these boundaries 728 * may be used with merged slabcaches. 729 */ 730 731 static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) 732 { 733 unsigned long off = s->inuse; /* The end of info */ 734 735 if (s->offset) 736 /* Freepointer is placed after the object. */ 737 off += sizeof(void *); 738 739 if (s->flags & SLAB_STORE_USER) 740 /* We also have user information there */ 741 off += 2 * sizeof(struct track); 742 743 if (s->size == off) 744 return 1; 745 746 return check_bytes_and_report(s, page, p, "Object padding", 747 p + off, POISON_INUSE, s->size - off); 748 } 749 750 /* Check the pad bytes at the end of a slab page */ 751 static int slab_pad_check(struct kmem_cache *s, struct page *page) 752 { 753 u8 *start; 754 u8 *fault; 755 u8 *end; 756 int length; 757 int remainder; 758 759 if (!(s->flags & SLAB_POISON)) 760 return 1; 761 762 start = page_address(page); 763 length = (PAGE_SIZE << compound_order(page)) - s->reserved; 764 end = start + length; 765 remainder = length % s->size; 766 if (!remainder) 767 return 1; 768 769 fault = memchr_inv(end - remainder, POISON_INUSE, remainder); 770 if (!fault) 771 return 1; 772 while (end > fault && end[-1] == POISON_INUSE) 773 end--; 774 775 slab_err(s, page, "Padding overwritten. 
0x%p-0x%p", fault, end - 1); 776 print_section("Padding ", end - remainder, remainder); 777 778 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); 779 return 0; 780 } 781 782 static int check_object(struct kmem_cache *s, struct page *page, 783 void *object, u8 val) 784 { 785 u8 *p = object; 786 u8 *endobject = object + s->object_size; 787 788 if (s->flags & SLAB_RED_ZONE) { 789 if (!check_bytes_and_report(s, page, object, "Redzone", 790 endobject, val, s->inuse - s->object_size)) 791 return 0; 792 } else { 793 if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { 794 check_bytes_and_report(s, page, p, "Alignment padding", 795 endobject, POISON_INUSE, 796 s->inuse - s->object_size); 797 } 798 } 799 800 if (s->flags & SLAB_POISON) { 801 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && 802 (!check_bytes_and_report(s, page, p, "Poison", p, 803 POISON_FREE, s->object_size - 1) || 804 !check_bytes_and_report(s, page, p, "Poison", 805 p + s->object_size - 1, POISON_END, 1))) 806 return 0; 807 /* 808 * check_pad_bytes cleans up on its own. 809 */ 810 check_pad_bytes(s, page, p); 811 } 812 813 if (!s->offset && val == SLUB_RED_ACTIVE) 814 /* 815 * Object and freepointer overlap. Cannot check 816 * freepointer while object is allocated. 817 */ 818 return 1; 819 820 /* Check free pointer validity */ 821 if (!check_valid_pointer(s, page, get_freepointer(s, p))) { 822 object_err(s, page, p, "Freepointer corrupt"); 823 /* 824 * No choice but to zap it and thus lose the remainder 825 * of the free objects in this slab. May cause 826 * another error because the object count is now wrong. 827 */ 828 set_freepointer(s, p, NULL); 829 return 0; 830 } 831 return 1; 832 } 833 834 static int check_slab(struct kmem_cache *s, struct page *page) 835 { 836 int maxobj; 837 838 VM_BUG_ON(!irqs_disabled()); 839 840 if (!PageSlab(page)) { 841 slab_err(s, page, "Not a valid slab page"); 842 return 0; 843 } 844 845 maxobj = order_objects(compound_order(page), s->size, s->reserved); 846 if (page->objects > maxobj) { 847 slab_err(s, page, "objects %u > max %u", 848 s->name, page->objects, maxobj); 849 return 0; 850 } 851 if (page->inuse > page->objects) { 852 slab_err(s, page, "inuse %u > max %u", 853 s->name, page->inuse, page->objects); 854 return 0; 855 } 856 /* Slab_pad_check fixes things up after itself */ 857 slab_pad_check(s, page); 858 return 1; 859 } 860 861 /* 862 * Determine if a certain object on a page is on the freelist. Must hold the 863 * slab lock to guarantee that the chains are in a consistent state. 864 */ 865 static int on_freelist(struct kmem_cache *s, struct page *page, void *search) 866 { 867 int nr = 0; 868 void *fp; 869 void *object = NULL; 870 unsigned long max_objects; 871 872 fp = page->freelist; 873 while (fp && nr <= page->objects) { 874 if (fp == search) 875 return 1; 876 if (!check_valid_pointer(s, page, fp)) { 877 if (object) { 878 object_err(s, page, object, 879 "Freechain corrupt"); 880 set_freepointer(s, object, NULL); 881 } else { 882 slab_err(s, page, "Freepointer corrupt"); 883 page->freelist = NULL; 884 page->inuse = page->objects; 885 slab_fix(s, "Freelist cleared"); 886 return 0; 887 } 888 break; 889 } 890 object = fp; 891 fp = get_freepointer(s, object); 892 nr++; 893 } 894 895 max_objects = order_objects(compound_order(page), s->size, s->reserved); 896 if (max_objects > MAX_OBJS_PER_PAGE) 897 max_objects = MAX_OBJS_PER_PAGE; 898 899 if (page->objects != max_objects) { 900 slab_err(s, page, "Wrong number of objects. 
Found %d but " 901 "should be %d", page->objects, max_objects); 902 page->objects = max_objects; 903 slab_fix(s, "Number of objects adjusted."); 904 } 905 if (page->inuse != page->objects - nr) { 906 slab_err(s, page, "Wrong object count. Counter is %d but " 907 "counted were %d", page->inuse, page->objects - nr); 908 page->inuse = page->objects - nr; 909 slab_fix(s, "Object count adjusted."); 910 } 911 return search == NULL; 912 } 913 914 static void trace(struct kmem_cache *s, struct page *page, void *object, 915 int alloc) 916 { 917 if (s->flags & SLAB_TRACE) { 918 printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", 919 s->name, 920 alloc ? "alloc" : "free", 921 object, page->inuse, 922 page->freelist); 923 924 if (!alloc) 925 print_section("Object ", (void *)object, 926 s->object_size); 927 928 dump_stack(); 929 } 930 } 931 932 /* 933 * Hooks for other subsystems that check memory allocations. In a typical 934 * production configuration these hooks all should produce no code at all. 935 */ 936 static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) 937 { 938 flags &= gfp_allowed_mask; 939 lockdep_trace_alloc(flags); 940 might_sleep_if(flags & __GFP_WAIT); 941 942 return should_failslab(s->object_size, flags, s->flags); 943 } 944 945 static inline void slab_post_alloc_hook(struct kmem_cache *s, 946 gfp_t flags, void *object) 947 { 948 flags &= gfp_allowed_mask; 949 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); 950 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); 951 } 952 953 static inline void slab_free_hook(struct kmem_cache *s, void *x) 954 { 955 kmemleak_free_recursive(x, s->flags); 956 957 /* 958 * Trouble is that we may no longer disable interupts in the fast path 959 * So in order to make the debug calls that expect irqs to be 960 * disabled we need to disable interrupts temporarily. 961 */ 962 #if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) 963 { 964 unsigned long flags; 965 966 local_irq_save(flags); 967 kmemcheck_slab_free(s, x, s->object_size); 968 debug_check_no_locks_freed(x, s->object_size); 969 local_irq_restore(flags); 970 } 971 #endif 972 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 973 debug_check_no_obj_freed(x, s->object_size); 974 } 975 976 /* 977 * Tracking of fully allocated slabs for debugging purposes. 978 * 979 * list_lock must be held. 980 */ 981 static void add_full(struct kmem_cache *s, 982 struct kmem_cache_node *n, struct page *page) 983 { 984 if (!(s->flags & SLAB_STORE_USER)) 985 return; 986 987 list_add(&page->lru, &n->full); 988 } 989 990 /* 991 * list_lock must be held. 992 */ 993 static void remove_full(struct kmem_cache *s, struct page *page) 994 { 995 if (!(s->flags & SLAB_STORE_USER)) 996 return; 997 998 list_del(&page->lru); 999 } 1000 1001 /* Tracking of the number of slabs for debugging purposes */ 1002 static inline unsigned long slabs_node(struct kmem_cache *s, int node) 1003 { 1004 struct kmem_cache_node *n = get_node(s, node); 1005 1006 return atomic_long_read(&n->nr_slabs); 1007 } 1008 1009 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) 1010 { 1011 return atomic_long_read(&n->nr_slabs); 1012 } 1013 1014 static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) 1015 { 1016 struct kmem_cache_node *n = get_node(s, node); 1017 1018 /* 1019 * May be called early in order to allocate a slab for the 1020 * kmem_cache_node structure. 
Solve the chicken-egg 1021 * dilemma by deferring the increment of the count during 1022 * bootstrap (see early_kmem_cache_node_alloc). 1023 */ 1024 if (likely(n)) { 1025 atomic_long_inc(&n->nr_slabs); 1026 atomic_long_add(objects, &n->total_objects); 1027 } 1028 } 1029 static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) 1030 { 1031 struct kmem_cache_node *n = get_node(s, node); 1032 1033 atomic_long_dec(&n->nr_slabs); 1034 atomic_long_sub(objects, &n->total_objects); 1035 } 1036 1037 /* Object debug checks for alloc/free paths */ 1038 static void setup_object_debug(struct kmem_cache *s, struct page *page, 1039 void *object) 1040 { 1041 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) 1042 return; 1043 1044 init_object(s, object, SLUB_RED_INACTIVE); 1045 init_tracking(s, object); 1046 } 1047 1048 static noinline int alloc_debug_processing(struct kmem_cache *s, 1049 struct page *page, 1050 void *object, unsigned long addr) 1051 { 1052 if (!check_slab(s, page)) 1053 goto bad; 1054 1055 if (!check_valid_pointer(s, page, object)) { 1056 object_err(s, page, object, "Freelist Pointer check fails"); 1057 goto bad; 1058 } 1059 1060 if (!check_object(s, page, object, SLUB_RED_INACTIVE)) 1061 goto bad; 1062 1063 /* Success perform special debug activities for allocs */ 1064 if (s->flags & SLAB_STORE_USER) 1065 set_track(s, object, TRACK_ALLOC, addr); 1066 trace(s, page, object, 1); 1067 init_object(s, object, SLUB_RED_ACTIVE); 1068 return 1; 1069 1070 bad: 1071 if (PageSlab(page)) { 1072 /* 1073 * If this is a slab page then lets do the best we can 1074 * to avoid issues in the future. Marking all objects 1075 * as used avoids touching the remaining objects. 1076 */ 1077 slab_fix(s, "Marking all objects used"); 1078 page->inuse = page->objects; 1079 page->freelist = NULL; 1080 } 1081 return 0; 1082 } 1083 1084 static noinline struct kmem_cache_node *free_debug_processing( 1085 struct kmem_cache *s, struct page *page, void *object, 1086 unsigned long addr, unsigned long *flags) 1087 { 1088 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1089 1090 spin_lock_irqsave(&n->list_lock, *flags); 1091 slab_lock(page); 1092 1093 if (!check_slab(s, page)) 1094 goto fail; 1095 1096 if (!check_valid_pointer(s, page, object)) { 1097 slab_err(s, page, "Invalid object pointer 0x%p", object); 1098 goto fail; 1099 } 1100 1101 if (on_freelist(s, page, object)) { 1102 object_err(s, page, object, "Object already free"); 1103 goto fail; 1104 } 1105 1106 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1107 goto out; 1108 1109 if (unlikely(s != page->slab_cache)) { 1110 if (!PageSlab(page)) { 1111 slab_err(s, page, "Attempt to free object(0x%p) " 1112 "outside of slab", object); 1113 } else if (!page->slab_cache) { 1114 printk(KERN_ERR 1115 "SLUB <none>: no slab for object 0x%p.\n", 1116 object); 1117 dump_stack(); 1118 } else 1119 object_err(s, page, object, 1120 "page slab pointer corrupt."); 1121 goto fail; 1122 } 1123 1124 if (s->flags & SLAB_STORE_USER) 1125 set_track(s, object, TRACK_FREE, addr); 1126 trace(s, page, object, 0); 1127 init_object(s, object, SLUB_RED_INACTIVE); 1128 out: 1129 slab_unlock(page); 1130 /* 1131 * Keep node_lock to preserve integrity 1132 * until the object is actually freed 1133 */ 1134 return n; 1135 1136 fail: 1137 slab_unlock(page); 1138 spin_unlock_irqrestore(&n->list_lock, *flags); 1139 slab_fix(s, "Object at 0x%p not freed", object); 1140 return NULL; 1141 } 1142 1143 static int __init setup_slub_debug(char *str) 1144 { 1145 
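	/*
	 * A rough sketch of the accepted syntax, derived from the option
	 * characters handled below (illustrative, not a complete list;
	 * "dentry" is just an example cache name):
	 *
	 *	slub_debug		enable all debug options on all caches
	 *	slub_debug=FZ		only SLAB_DEBUG_FREE and SLAB_RED_ZONE
	 *	slub_debug=,dentry	full debugging, but only for caches
	 *				whose name starts with "dentry"
	 *	slub_debug=O		full debugging, except on caches whose
	 *				minimum order would have to grow to
	 *				hold the debug metadata
	 *	slub_debug=-		switch all debugging off
	 */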
slub_debug = DEBUG_DEFAULT_FLAGS; 1146 if (*str++ != '=' || !*str) 1147 /* 1148 * No options specified. Switch on full debugging. 1149 */ 1150 goto out; 1151 1152 if (*str == ',') 1153 /* 1154 * No options but restriction on slabs. This means full 1155 * debugging for slabs matching a pattern. 1156 */ 1157 goto check_slabs; 1158 1159 if (tolower(*str) == 'o') { 1160 /* 1161 * Avoid enabling debugging on caches if its minimum order 1162 * would increase as a result. 1163 */ 1164 disable_higher_order_debug = 1; 1165 goto out; 1166 } 1167 1168 slub_debug = 0; 1169 if (*str == '-') 1170 /* 1171 * Switch off all debugging measures. 1172 */ 1173 goto out; 1174 1175 /* 1176 * Determine which debug features should be switched on 1177 */ 1178 for (; *str && *str != ','; str++) { 1179 switch (tolower(*str)) { 1180 case 'f': 1181 slub_debug |= SLAB_DEBUG_FREE; 1182 break; 1183 case 'z': 1184 slub_debug |= SLAB_RED_ZONE; 1185 break; 1186 case 'p': 1187 slub_debug |= SLAB_POISON; 1188 break; 1189 case 'u': 1190 slub_debug |= SLAB_STORE_USER; 1191 break; 1192 case 't': 1193 slub_debug |= SLAB_TRACE; 1194 break; 1195 case 'a': 1196 slub_debug |= SLAB_FAILSLAB; 1197 break; 1198 default: 1199 printk(KERN_ERR "slub_debug option '%c' " 1200 "unknown. skipped\n", *str); 1201 } 1202 } 1203 1204 check_slabs: 1205 if (*str == ',') 1206 slub_debug_slabs = str + 1; 1207 out: 1208 return 1; 1209 } 1210 1211 __setup("slub_debug", setup_slub_debug); 1212 1213 static unsigned long kmem_cache_flags(unsigned long object_size, 1214 unsigned long flags, const char *name, 1215 void (*ctor)(void *)) 1216 { 1217 /* 1218 * Enable debugging if selected on the kernel commandline. 1219 */ 1220 if (slub_debug && (!slub_debug_slabs || 1221 !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))) 1222 flags |= slub_debug; 1223 1224 return flags; 1225 } 1226 #else 1227 static inline void setup_object_debug(struct kmem_cache *s, 1228 struct page *page, void *object) {} 1229 1230 static inline int alloc_debug_processing(struct kmem_cache *s, 1231 struct page *page, void *object, unsigned long addr) { return 0; } 1232 1233 static inline struct kmem_cache_node *free_debug_processing( 1234 struct kmem_cache *s, struct page *page, void *object, 1235 unsigned long addr, unsigned long *flags) { return NULL; } 1236 1237 static inline int slab_pad_check(struct kmem_cache *s, struct page *page) 1238 { return 1; } 1239 static inline int check_object(struct kmem_cache *s, struct page *page, 1240 void *object, u8 val) { return 1; } 1241 static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, 1242 struct page *page) {} 1243 static inline void remove_full(struct kmem_cache *s, struct page *page) {} 1244 static inline unsigned long kmem_cache_flags(unsigned long object_size, 1245 unsigned long flags, const char *name, 1246 void (*ctor)(void *)) 1247 { 1248 return flags; 1249 } 1250 #define slub_debug 0 1251 1252 #define disable_higher_order_debug 0 1253 1254 static inline unsigned long slabs_node(struct kmem_cache *s, int node) 1255 { return 0; } 1256 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) 1257 { return 0; } 1258 static inline void inc_slabs_node(struct kmem_cache *s, int node, 1259 int objects) {} 1260 static inline void dec_slabs_node(struct kmem_cache *s, int node, 1261 int objects) {} 1262 1263 static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) 1264 { return 0; } 1265 1266 static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, 1267 void *object) {} 
1268 1269 static inline void slab_free_hook(struct kmem_cache *s, void *x) {} 1270 1271 #endif /* CONFIG_SLUB_DEBUG */ 1272 1273 /* 1274 * Slab allocation and freeing 1275 */ 1276 static inline struct page *alloc_slab_page(gfp_t flags, int node, 1277 struct kmem_cache_order_objects oo) 1278 { 1279 int order = oo_order(oo); 1280 1281 flags |= __GFP_NOTRACK; 1282 1283 if (node == NUMA_NO_NODE) 1284 return alloc_pages(flags, order); 1285 else 1286 return alloc_pages_exact_node(node, flags, order); 1287 } 1288 1289 static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 1290 { 1291 struct page *page; 1292 struct kmem_cache_order_objects oo = s->oo; 1293 gfp_t alloc_gfp; 1294 1295 flags &= gfp_allowed_mask; 1296 1297 if (flags & __GFP_WAIT) 1298 local_irq_enable(); 1299 1300 flags |= s->allocflags; 1301 1302 /* 1303 * Let the initial higher-order allocation fail under memory pressure 1304 * so we fall-back to the minimum order allocation. 1305 */ 1306 alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; 1307 1308 page = alloc_slab_page(alloc_gfp, node, oo); 1309 if (unlikely(!page)) { 1310 oo = s->min; 1311 /* 1312 * Allocation may have failed due to fragmentation. 1313 * Try a lower order alloc if possible 1314 */ 1315 page = alloc_slab_page(flags, node, oo); 1316 1317 if (page) 1318 stat(s, ORDER_FALLBACK); 1319 } 1320 1321 if (kmemcheck_enabled && page 1322 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1323 int pages = 1 << oo_order(oo); 1324 1325 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); 1326 1327 /* 1328 * Objects from caches that have a constructor don't get 1329 * cleared when they're allocated, so we need to do it here. 1330 */ 1331 if (s->ctor) 1332 kmemcheck_mark_uninitialized_pages(page, pages); 1333 else 1334 kmemcheck_mark_unallocated_pages(page, pages); 1335 } 1336 1337 if (flags & __GFP_WAIT) 1338 local_irq_disable(); 1339 if (!page) 1340 return NULL; 1341 1342 page->objects = oo_objects(oo); 1343 mod_zone_page_state(page_zone(page), 1344 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
1345 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1346 1 << oo_order(oo)); 1347 1348 return page; 1349 } 1350 1351 static void setup_object(struct kmem_cache *s, struct page *page, 1352 void *object) 1353 { 1354 setup_object_debug(s, page, object); 1355 if (unlikely(s->ctor)) 1356 s->ctor(object); 1357 } 1358 1359 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1360 { 1361 struct page *page; 1362 void *start; 1363 void *last; 1364 void *p; 1365 int order; 1366 1367 BUG_ON(flags & GFP_SLAB_BUG_MASK); 1368 1369 page = allocate_slab(s, 1370 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); 1371 if (!page) 1372 goto out; 1373 1374 order = compound_order(page); 1375 inc_slabs_node(s, page_to_nid(page), page->objects); 1376 memcg_bind_pages(s, order); 1377 page->slab_cache = s; 1378 __SetPageSlab(page); 1379 if (page->pfmemalloc) 1380 SetPageSlabPfmemalloc(page); 1381 1382 start = page_address(page); 1383 1384 if (unlikely(s->flags & SLAB_POISON)) 1385 memset(start, POISON_INUSE, PAGE_SIZE << order); 1386 1387 last = start; 1388 for_each_object(p, s, start, page->objects) { 1389 setup_object(s, page, last); 1390 set_freepointer(s, last, p); 1391 last = p; 1392 } 1393 setup_object(s, page, last); 1394 set_freepointer(s, last, NULL); 1395 1396 page->freelist = start; 1397 page->inuse = page->objects; 1398 page->frozen = 1; 1399 out: 1400 return page; 1401 } 1402 1403 static void __free_slab(struct kmem_cache *s, struct page *page) 1404 { 1405 int order = compound_order(page); 1406 int pages = 1 << order; 1407 1408 if (kmem_cache_debug(s)) { 1409 void *p; 1410 1411 slab_pad_check(s, page); 1412 for_each_object(p, s, page_address(page), 1413 page->objects) 1414 check_object(s, page, p, SLUB_RED_INACTIVE); 1415 } 1416 1417 kmemcheck_free_shadow(page, compound_order(page)); 1418 1419 mod_zone_page_state(page_zone(page), 1420 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1421 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1422 -pages); 1423 1424 __ClearPageSlabPfmemalloc(page); 1425 __ClearPageSlab(page); 1426 1427 memcg_release_pages(s, order); 1428 page_mapcount_reset(page); 1429 if (current->reclaim_state) 1430 current->reclaim_state->reclaimed_slab += pages; 1431 __free_memcg_kmem_pages(page, order); 1432 } 1433 1434 #define need_reserve_slab_rcu \ 1435 (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) 1436 1437 static void rcu_free_slab(struct rcu_head *h) 1438 { 1439 struct page *page; 1440 1441 if (need_reserve_slab_rcu) 1442 page = virt_to_head_page(h); 1443 else 1444 page = container_of((struct list_head *)h, struct page, lru); 1445 1446 __free_slab(page->slab_cache, page); 1447 } 1448 1449 static void free_slab(struct kmem_cache *s, struct page *page) 1450 { 1451 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { 1452 struct rcu_head *head; 1453 1454 if (need_reserve_slab_rcu) { 1455 int order = compound_order(page); 1456 int offset = (PAGE_SIZE << order) - s->reserved; 1457 1458 VM_BUG_ON(s->reserved != sizeof(*head)); 1459 head = page_address(page) + offset; 1460 } else { 1461 /* 1462 * RCU free overloads the RCU head over the LRU 1463 */ 1464 head = (void *)&page->lru; 1465 } 1466 1467 call_rcu(head, rcu_free_slab); 1468 } else 1469 __free_slab(s, page); 1470 } 1471 1472 static void discard_slab(struct kmem_cache *s, struct page *page) 1473 { 1474 dec_slabs_node(s, page_to_nid(page), page->objects); 1475 free_slab(s, page); 1476 } 1477 1478 /* 1479 * Management of partially allocated slabs. 1480 * 1481 * list_lock must be held. 
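 *
 * A minimal usage sketch, mirroring what deactivate_slab() and
 * unfreeze_partials() below do when they move a slab to the node's
 * partial list (with interrupts already disabled):
 *
 *	spin_lock(&n->list_lock);
 *	add_partial(n, page, DEACTIVATE_TO_TAIL);
 *	spin_unlock(&n->list_lock);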
1482 */ 1483 static inline void add_partial(struct kmem_cache_node *n, 1484 struct page *page, int tail) 1485 { 1486 n->nr_partial++; 1487 if (tail == DEACTIVATE_TO_TAIL) 1488 list_add_tail(&page->lru, &n->partial); 1489 else 1490 list_add(&page->lru, &n->partial); 1491 } 1492 1493 /* 1494 * list_lock must be held. 1495 */ 1496 static inline void remove_partial(struct kmem_cache_node *n, 1497 struct page *page) 1498 { 1499 list_del(&page->lru); 1500 n->nr_partial--; 1501 } 1502 1503 /* 1504 * Remove slab from the partial list, freeze it and 1505 * return the pointer to the freelist. 1506 * 1507 * Returns a list of objects or NULL if it fails. 1508 * 1509 * Must hold list_lock since we modify the partial list. 1510 */ 1511 static inline void *acquire_slab(struct kmem_cache *s, 1512 struct kmem_cache_node *n, struct page *page, 1513 int mode, int *objects) 1514 { 1515 void *freelist; 1516 unsigned long counters; 1517 struct page new; 1518 1519 /* 1520 * Zap the freelist and set the frozen bit. 1521 * The old freelist is the list of objects for the 1522 * per cpu allocation list. 1523 */ 1524 freelist = page->freelist; 1525 counters = page->counters; 1526 new.counters = counters; 1527 *objects = new.objects - new.inuse; 1528 if (mode) { 1529 new.inuse = page->objects; 1530 new.freelist = NULL; 1531 } else { 1532 new.freelist = freelist; 1533 } 1534 1535 VM_BUG_ON(new.frozen); 1536 new.frozen = 1; 1537 1538 if (!__cmpxchg_double_slab(s, page, 1539 freelist, counters, 1540 new.freelist, new.counters, 1541 "acquire_slab")) 1542 return NULL; 1543 1544 remove_partial(n, page); 1545 WARN_ON(!freelist); 1546 return freelist; 1547 } 1548 1549 static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); 1550 static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); 1551 1552 /* 1553 * Try to allocate a partial slab from a specific node. 1554 */ 1555 static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, 1556 struct kmem_cache_cpu *c, gfp_t flags) 1557 { 1558 struct page *page, *page2; 1559 void *object = NULL; 1560 int available = 0; 1561 int objects; 1562 1563 /* 1564 * Racy check. If we mistakenly see no partial slabs then we 1565 * just allocate an empty slab. If we mistakenly try to get a 1566 * partial slab and there is none available then get_partials() 1567 * will return NULL. 1568 */ 1569 if (!n || !n->nr_partial) 1570 return NULL; 1571 1572 spin_lock(&n->list_lock); 1573 list_for_each_entry_safe(page, page2, &n->partial, lru) { 1574 void *t; 1575 1576 if (!pfmemalloc_match(page, flags)) 1577 continue; 1578 1579 t = acquire_slab(s, n, page, object == NULL, &objects); 1580 if (!t) 1581 break; 1582 1583 available += objects; 1584 if (!object) { 1585 c->page = page; 1586 stat(s, ALLOC_FROM_PARTIAL); 1587 object = t; 1588 } else { 1589 put_cpu_partial(s, page, 0); 1590 stat(s, CPU_PARTIAL_NODE); 1591 } 1592 if (!kmem_cache_has_cpu_partial(s) 1593 || available > s->cpu_partial / 2) 1594 break; 1595 1596 } 1597 spin_unlock(&n->list_lock); 1598 return object; 1599 } 1600 1601 /* 1602 * Get a page from somewhere. Search in increasing NUMA distances. 
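 *
 * The remote search is gated by s->remote_node_defrag_ratio, see the
 * check at the top of the function. As a rough worked example, assuming
 * get_cycles() % 1024 is spread roughly uniformly: with the sysfs file
 * defrag_ratio set to 50 the stored ratio is 500, so about half of the
 * calls return NULL right away and the caller falls back to allocating
 * a new, node local slab instead of scanning other nodes.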
1603 */ 1604 static void *get_any_partial(struct kmem_cache *s, gfp_t flags, 1605 struct kmem_cache_cpu *c) 1606 { 1607 #ifdef CONFIG_NUMA 1608 struct zonelist *zonelist; 1609 struct zoneref *z; 1610 struct zone *zone; 1611 enum zone_type high_zoneidx = gfp_zone(flags); 1612 void *object; 1613 unsigned int cpuset_mems_cookie; 1614 1615 /* 1616 * The defrag ratio allows a configuration of the tradeoffs between 1617 * inter node defragmentation and node local allocations. A lower 1618 * defrag_ratio increases the tendency to do local allocations 1619 * instead of attempting to obtain partial slabs from other nodes. 1620 * 1621 * If the defrag_ratio is set to 0 then kmalloc() always 1622 * returns node local objects. If the ratio is higher then kmalloc() 1623 * may return off node objects because partial slabs are obtained 1624 * from other nodes and filled up. 1625 * 1626 * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes 1627 * defrag_ratio = 1000) then every (well almost) allocation will 1628 * first attempt to defrag slab caches on other nodes. This means 1629 * scanning over all nodes to look for partial slabs which may be 1630 * expensive if we do it every time we are trying to find a slab 1631 * with available objects. 1632 */ 1633 if (!s->remote_node_defrag_ratio || 1634 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1635 return NULL; 1636 1637 do { 1638 cpuset_mems_cookie = get_mems_allowed(); 1639 zonelist = node_zonelist(slab_node(), flags); 1640 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1641 struct kmem_cache_node *n; 1642 1643 n = get_node(s, zone_to_nid(zone)); 1644 1645 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1646 n->nr_partial > s->min_partial) { 1647 object = get_partial_node(s, n, c, flags); 1648 if (object) { 1649 /* 1650 * Return the object even if 1651 * put_mems_allowed indicated that 1652 * the cpuset mems_allowed was 1653 * updated in parallel. It's a 1654 * harmless race between the alloc 1655 * and the cpuset update. 1656 */ 1657 put_mems_allowed(cpuset_mems_cookie); 1658 return object; 1659 } 1660 } 1661 } 1662 } while (!put_mems_allowed(cpuset_mems_cookie)); 1663 #endif 1664 return NULL; 1665 } 1666 1667 /* 1668 * Get a partial page, lock it and return it. 1669 */ 1670 static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, 1671 struct kmem_cache_cpu *c) 1672 { 1673 void *object; 1674 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; 1675 1676 object = get_partial_node(s, get_node(s, searchnode), c, flags); 1677 if (object || node != NUMA_NO_NODE) 1678 return object; 1679 1680 return get_any_partial(s, flags, c); 1681 } 1682 1683 #ifdef CONFIG_PREEMPT 1684 /* 1685 * Calculate the next globally unique transaction for disambiguiation 1686 * during cmpxchg. The transactions start with the cpu number and are then 1687 * incremented by CONFIG_NR_CPUS. 1688 */ 1689 #define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) 1690 #else 1691 /* 1692 * No preemption supported therefore also no need to check for 1693 * different cpus. 
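 *
 * TID_STEP is 1 in this case: tid_to_cpu() below always returns 0 and
 * tid_to_event() is the identity, so the tid degenerates into a plain
 * per cpu event counter.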
1694 */ 1695 #define TID_STEP 1 1696 #endif 1697 1698 static inline unsigned long next_tid(unsigned long tid) 1699 { 1700 return tid + TID_STEP; 1701 } 1702 1703 static inline unsigned int tid_to_cpu(unsigned long tid) 1704 { 1705 return tid % TID_STEP; 1706 } 1707 1708 static inline unsigned long tid_to_event(unsigned long tid) 1709 { 1710 return tid / TID_STEP; 1711 } 1712 1713 static inline unsigned int init_tid(int cpu) 1714 { 1715 return cpu; 1716 } 1717 1718 static inline void note_cmpxchg_failure(const char *n, 1719 const struct kmem_cache *s, unsigned long tid) 1720 { 1721 #ifdef SLUB_DEBUG_CMPXCHG 1722 unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); 1723 1724 printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); 1725 1726 #ifdef CONFIG_PREEMPT 1727 if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) 1728 printk("due to cpu change %d -> %d\n", 1729 tid_to_cpu(tid), tid_to_cpu(actual_tid)); 1730 else 1731 #endif 1732 if (tid_to_event(tid) != tid_to_event(actual_tid)) 1733 printk("due to cpu running other code. Event %ld->%ld\n", 1734 tid_to_event(tid), tid_to_event(actual_tid)); 1735 else 1736 printk("for unknown reason: actual=%lx was=%lx target=%lx\n", 1737 actual_tid, tid, next_tid(tid)); 1738 #endif 1739 stat(s, CMPXCHG_DOUBLE_CPU_FAIL); 1740 } 1741 1742 static void init_kmem_cache_cpus(struct kmem_cache *s) 1743 { 1744 int cpu; 1745 1746 for_each_possible_cpu(cpu) 1747 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); 1748 } 1749 1750 /* 1751 * Remove the cpu slab 1752 */ 1753 static void deactivate_slab(struct kmem_cache *s, struct page *page, 1754 void *freelist) 1755 { 1756 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; 1757 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1758 int lock = 0; 1759 enum slab_modes l = M_NONE, m = M_NONE; 1760 void *nextfree; 1761 int tail = DEACTIVATE_TO_HEAD; 1762 struct page new; 1763 struct page old; 1764 1765 if (page->freelist) { 1766 stat(s, DEACTIVATE_REMOTE_FREES); 1767 tail = DEACTIVATE_TO_TAIL; 1768 } 1769 1770 /* 1771 * Stage one: Free all available per cpu objects back 1772 * to the page freelist while it is still frozen. Leave the 1773 * last one. 1774 * 1775 * There is no need to take the list->lock because the page 1776 * is still frozen. 1777 */ 1778 while (freelist && (nextfree = get_freepointer(s, freelist))) { 1779 void *prior; 1780 unsigned long counters; 1781 1782 do { 1783 prior = page->freelist; 1784 counters = page->counters; 1785 set_freepointer(s, freelist, prior); 1786 new.counters = counters; 1787 new.inuse--; 1788 VM_BUG_ON(!new.frozen); 1789 1790 } while (!__cmpxchg_double_slab(s, page, 1791 prior, counters, 1792 freelist, new.counters, 1793 "drain percpu freelist")); 1794 1795 freelist = nextfree; 1796 } 1797 1798 /* 1799 * Stage two: Ensure that the page is unfrozen while the 1800 * list presence reflects the actual number of objects 1801 * during unfreeze. 1802 * 1803 * We setup the list membership and then perform a cmpxchg 1804 * with the count. If there is a mismatch then the page 1805 * is not unfrozen but the page is on the wrong list. 1806 * 1807 * Then we restart the process which may have to remove 1808 * the page from the list that we just put it on again 1809 * because the number of objects in the slab may have 1810 * changed. 
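 *
 * In short, the target state computed below is:
 *	M_FREE		the slab is now empty and the node already holds
 *			more than s->min_partial partial slabs: discard it
 *	M_PARTIAL	the slab still has free objects: put it back on
 *			the node's partial list
 *	M_FULL		no free objects are left: add it to the full list,
 *			which only tracks anything when SLAB_STORE_USER
 *			debugging is enabled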
1811 */ 1812 redo: 1813 1814 old.freelist = page->freelist; 1815 old.counters = page->counters; 1816 VM_BUG_ON(!old.frozen); 1817 1818 /* Determine target state of the slab */ 1819 new.counters = old.counters; 1820 if (freelist) { 1821 new.inuse--; 1822 set_freepointer(s, freelist, old.freelist); 1823 new.freelist = freelist; 1824 } else 1825 new.freelist = old.freelist; 1826 1827 new.frozen = 0; 1828 1829 if (!new.inuse && n->nr_partial > s->min_partial) 1830 m = M_FREE; 1831 else if (new.freelist) { 1832 m = M_PARTIAL; 1833 if (!lock) { 1834 lock = 1; 1835 /* 1836 * Taking the spinlock removes the possiblity 1837 * that acquire_slab() will see a slab page that 1838 * is frozen 1839 */ 1840 spin_lock(&n->list_lock); 1841 } 1842 } else { 1843 m = M_FULL; 1844 if (kmem_cache_debug(s) && !lock) { 1845 lock = 1; 1846 /* 1847 * This also ensures that the scanning of full 1848 * slabs from diagnostic functions will not see 1849 * any frozen slabs. 1850 */ 1851 spin_lock(&n->list_lock); 1852 } 1853 } 1854 1855 if (l != m) { 1856 1857 if (l == M_PARTIAL) 1858 1859 remove_partial(n, page); 1860 1861 else if (l == M_FULL) 1862 1863 remove_full(s, page); 1864 1865 if (m == M_PARTIAL) { 1866 1867 add_partial(n, page, tail); 1868 stat(s, tail); 1869 1870 } else if (m == M_FULL) { 1871 1872 stat(s, DEACTIVATE_FULL); 1873 add_full(s, n, page); 1874 1875 } 1876 } 1877 1878 l = m; 1879 if (!__cmpxchg_double_slab(s, page, 1880 old.freelist, old.counters, 1881 new.freelist, new.counters, 1882 "unfreezing slab")) 1883 goto redo; 1884 1885 if (lock) 1886 spin_unlock(&n->list_lock); 1887 1888 if (m == M_FREE) { 1889 stat(s, DEACTIVATE_EMPTY); 1890 discard_slab(s, page); 1891 stat(s, FREE_SLAB); 1892 } 1893 } 1894 1895 /* 1896 * Unfreeze all the cpu partial slabs. 1897 * 1898 * This function must be called with interrupts disabled 1899 * for the cpu using c (or some other guarantee must be there 1900 * to guarantee no concurrent accesses). 1901 */ 1902 static void unfreeze_partials(struct kmem_cache *s, 1903 struct kmem_cache_cpu *c) 1904 { 1905 #ifdef CONFIG_SLUB_CPU_PARTIAL 1906 struct kmem_cache_node *n = NULL, *n2 = NULL; 1907 struct page *page, *discard_page = NULL; 1908 1909 while ((page = c->partial)) { 1910 struct page new; 1911 struct page old; 1912 1913 c->partial = page->next; 1914 1915 n2 = get_node(s, page_to_nid(page)); 1916 if (n != n2) { 1917 if (n) 1918 spin_unlock(&n->list_lock); 1919 1920 n = n2; 1921 spin_lock(&n->list_lock); 1922 } 1923 1924 do { 1925 1926 old.freelist = page->freelist; 1927 old.counters = page->counters; 1928 VM_BUG_ON(!old.frozen); 1929 1930 new.counters = old.counters; 1931 new.freelist = old.freelist; 1932 1933 new.frozen = 0; 1934 1935 } while (!__cmpxchg_double_slab(s, page, 1936 old.freelist, old.counters, 1937 new.freelist, new.counters, 1938 "unfreezing slab")); 1939 1940 if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) { 1941 page->next = discard_page; 1942 discard_page = page; 1943 } else { 1944 add_partial(n, page, DEACTIVATE_TO_TAIL); 1945 stat(s, FREE_ADD_PARTIAL); 1946 } 1947 } 1948 1949 if (n) 1950 spin_unlock(&n->list_lock); 1951 1952 while (discard_page) { 1953 page = discard_page; 1954 discard_page = discard_page->next; 1955 1956 stat(s, DEACTIVATE_EMPTY); 1957 discard_slab(s, page); 1958 stat(s, FREE_SLAB); 1959 } 1960 #endif 1961 } 1962 1963 /* 1964 * Put a page that was just frozen (in __slab_free) into a partial page 1965 * slot if available. This is done without interrupts disabled and without 1966 * preemption disabled. 
The cmpxchg is racy and may put the partial page 1967 * onto a random cpus partial slot. 1968 * 1969 * If we did not find a slot then simply move all the partials to the 1970 * per node partial list. 1971 */ 1972 static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) 1973 { 1974 #ifdef CONFIG_SLUB_CPU_PARTIAL 1975 struct page *oldpage; 1976 int pages; 1977 int pobjects; 1978 1979 do { 1980 pages = 0; 1981 pobjects = 0; 1982 oldpage = this_cpu_read(s->cpu_slab->partial); 1983 1984 if (oldpage) { 1985 pobjects = oldpage->pobjects; 1986 pages = oldpage->pages; 1987 if (drain && pobjects > s->cpu_partial) { 1988 unsigned long flags; 1989 /* 1990 * partial array is full. Move the existing 1991 * set to the per node partial list. 1992 */ 1993 local_irq_save(flags); 1994 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); 1995 local_irq_restore(flags); 1996 oldpage = NULL; 1997 pobjects = 0; 1998 pages = 0; 1999 stat(s, CPU_PARTIAL_DRAIN); 2000 } 2001 } 2002 2003 pages++; 2004 pobjects += page->objects - page->inuse; 2005 2006 page->pages = pages; 2007 page->pobjects = pobjects; 2008 page->next = oldpage; 2009 2010 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) 2011 != oldpage); 2012 #endif 2013 } 2014 2015 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 2016 { 2017 stat(s, CPUSLAB_FLUSH); 2018 deactivate_slab(s, c->page, c->freelist); 2019 2020 c->tid = next_tid(c->tid); 2021 c->page = NULL; 2022 c->freelist = NULL; 2023 } 2024 2025 /* 2026 * Flush cpu slab. 2027 * 2028 * Called from IPI handler with interrupts disabled. 2029 */ 2030 static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 2031 { 2032 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 2033 2034 if (likely(c)) { 2035 if (c->page) 2036 flush_slab(s, c); 2037 2038 unfreeze_partials(s, c); 2039 } 2040 } 2041 2042 static void flush_cpu_slab(void *d) 2043 { 2044 struct kmem_cache *s = d; 2045 2046 __flush_cpu_slab(s, smp_processor_id()); 2047 } 2048 2049 static bool has_cpu_slab(int cpu, void *info) 2050 { 2051 struct kmem_cache *s = info; 2052 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 2053 2054 return c->page || c->partial; 2055 } 2056 2057 static void flush_all(struct kmem_cache *s) 2058 { 2059 on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); 2060 } 2061 2062 /* 2063 * Check if the objects in a per cpu structure fit numa 2064 * locality expectations. 
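 *
 * With CONFIG_NUMA, a NULL page never matches, node == NUMA_NO_NODE
 * matches any page, and any other node only matches pages that actually
 * live on that node. Without CONFIG_NUMA the check is compiled away and
 * always reports a match.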
2065 */ 2066 static inline int node_match(struct page *page, int node) 2067 { 2068 #ifdef CONFIG_NUMA 2069 if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node)) 2070 return 0; 2071 #endif 2072 return 1; 2073 } 2074 2075 static int count_free(struct page *page) 2076 { 2077 return page->objects - page->inuse; 2078 } 2079 2080 static unsigned long count_partial(struct kmem_cache_node *n, 2081 int (*get_count)(struct page *)) 2082 { 2083 unsigned long flags; 2084 unsigned long x = 0; 2085 struct page *page; 2086 2087 spin_lock_irqsave(&n->list_lock, flags); 2088 list_for_each_entry(page, &n->partial, lru) 2089 x += get_count(page); 2090 spin_unlock_irqrestore(&n->list_lock, flags); 2091 return x; 2092 } 2093 2094 static inline unsigned long node_nr_objs(struct kmem_cache_node *n) 2095 { 2096 #ifdef CONFIG_SLUB_DEBUG 2097 return atomic_long_read(&n->total_objects); 2098 #else 2099 return 0; 2100 #endif 2101 } 2102 2103 static noinline void 2104 slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) 2105 { 2106 int node; 2107 2108 printk(KERN_WARNING 2109 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", 2110 nid, gfpflags); 2111 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " 2112 "default order: %d, min order: %d\n", s->name, s->object_size, 2113 s->size, oo_order(s->oo), oo_order(s->min)); 2114 2115 if (oo_order(s->min) > get_order(s->object_size)) 2116 printk(KERN_WARNING " %s debugging increased min order, use " 2117 "slub_debug=O to disable.\n", s->name); 2118 2119 for_each_online_node(node) { 2120 struct kmem_cache_node *n = get_node(s, node); 2121 unsigned long nr_slabs; 2122 unsigned long nr_objs; 2123 unsigned long nr_free; 2124 2125 if (!n) 2126 continue; 2127 2128 nr_free = count_partial(n, count_free); 2129 nr_slabs = node_nr_slabs(n); 2130 nr_objs = node_nr_objs(n); 2131 2132 printk(KERN_WARNING 2133 " node %d: slabs: %ld, objs: %ld, free: %ld\n", 2134 node, nr_slabs, nr_objs, nr_free); 2135 } 2136 } 2137 2138 static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, 2139 int node, struct kmem_cache_cpu **pc) 2140 { 2141 void *freelist; 2142 struct kmem_cache_cpu *c = *pc; 2143 struct page *page; 2144 2145 freelist = get_partial(s, flags, node, c); 2146 2147 if (freelist) 2148 return freelist; 2149 2150 page = new_slab(s, flags, node); 2151 if (page) { 2152 c = __this_cpu_ptr(s->cpu_slab); 2153 if (c->page) 2154 flush_slab(s, c); 2155 2156 /* 2157 * No other reference to the page yet so we can 2158 * muck around with it freely without cmpxchg 2159 */ 2160 freelist = page->freelist; 2161 page->freelist = NULL; 2162 2163 stat(s, ALLOC_SLAB); 2164 c->page = page; 2165 *pc = c; 2166 } else 2167 freelist = NULL; 2168 2169 return freelist; 2170 } 2171 2172 static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) 2173 { 2174 if (unlikely(PageSlabPfmemalloc(page))) 2175 return gfp_pfmemalloc_allowed(gfpflags); 2176 2177 return true; 2178 } 2179 2180 /* 2181 * Check the page->freelist of a page and either transfer the freelist to the 2182 * per cpu freelist or deactivate the page. 2183 * 2184 * The page is still frozen if the return value is not NULL. 2185 * 2186 * If this function returns NULL then the page has been unfrozen. 2187 * 2188 * This function must be called with interrupt disabled. 
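 *
 * Conceptually the cmpxchg loop below performs, in one atomic step
 * (illustrative pseudo code):
 *
 *	freelist = page->freelist;
 *	page->freelist = NULL;
 *	page->inuse = page->objects;
 *	page->frozen = (freelist != NULL);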
2189 */ 2190 static inline void *get_freelist(struct kmem_cache *s, struct page *page) 2191 { 2192 struct page new; 2193 unsigned long counters; 2194 void *freelist; 2195 2196 do { 2197 freelist = page->freelist; 2198 counters = page->counters; 2199 2200 new.counters = counters; 2201 VM_BUG_ON(!new.frozen); 2202 2203 new.inuse = page->objects; 2204 new.frozen = freelist != NULL; 2205 2206 } while (!__cmpxchg_double_slab(s, page, 2207 freelist, counters, 2208 NULL, new.counters, 2209 "get_freelist")); 2210 2211 return freelist; 2212 } 2213 2214 /* 2215 * Slow path. The lockless freelist is empty or we need to perform 2216 * debugging duties. 2217 * 2218 * Processing is still very fast if new objects have been freed to the 2219 * regular freelist. In that case we simply take over the regular freelist 2220 * as the lockless freelist and zap the regular freelist. 2221 * 2222 * If that is not working then we fall back to the partial lists. We take the 2223 * first element of the freelist as the object to allocate now and move the 2224 * rest of the freelist to the lockless freelist. 2225 * 2226 * And if we were unable to get a new slab from the partial slab lists then 2227 * we need to allocate a new slab. This is the slowest path since it involves 2228 * a call to the page allocator and the setup of a new slab. 2229 */ 2230 static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 2231 unsigned long addr, struct kmem_cache_cpu *c) 2232 { 2233 void *freelist; 2234 struct page *page; 2235 unsigned long flags; 2236 2237 local_irq_save(flags); 2238 #ifdef CONFIG_PREEMPT 2239 /* 2240 * We may have been preempted and rescheduled on a different 2241 * cpu before disabling interrupts. Need to reload cpu area 2242 * pointer. 2243 */ 2244 c = this_cpu_ptr(s->cpu_slab); 2245 #endif 2246 2247 page = c->page; 2248 if (!page) 2249 goto new_slab; 2250 redo: 2251 2252 if (unlikely(!node_match(page, node))) { 2253 stat(s, ALLOC_NODE_MISMATCH); 2254 deactivate_slab(s, page, c->freelist); 2255 c->page = NULL; 2256 c->freelist = NULL; 2257 goto new_slab; 2258 } 2259 2260 /* 2261 * By rights, we should be searching for a slab page that was 2262 * PFMEMALLOC but right now, we are losing the pfmemalloc 2263 * information when the page leaves the per-cpu allocator 2264 */ 2265 if (unlikely(!pfmemalloc_match(page, gfpflags))) { 2266 deactivate_slab(s, page, c->freelist); 2267 c->page = NULL; 2268 c->freelist = NULL; 2269 goto new_slab; 2270 } 2271 2272 /* must check again c->freelist in case of cpu migration or IRQ */ 2273 freelist = c->freelist; 2274 if (freelist) 2275 goto load_freelist; 2276 2277 stat(s, ALLOC_SLOWPATH); 2278 2279 freelist = get_freelist(s, page); 2280 2281 if (!freelist) { 2282 c->page = NULL; 2283 stat(s, DEACTIVATE_BYPASS); 2284 goto new_slab; 2285 } 2286 2287 stat(s, ALLOC_REFILL); 2288 2289 load_freelist: 2290 /* 2291 * freelist is pointing to the list of objects to be used. 2292 * page is pointing to the page from which the objects are obtained. 2293 * That page must be frozen for per cpu allocations to work. 
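	 *
	 * The first object on that freelist is returned to the caller;
	 * c->freelist is advanced to the following object and the
	 * transaction id is bumped so that a racing fastpath cmpxchg
	 * cannot succeed against the stale state.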
2294 */ 2295 VM_BUG_ON(!c->page->frozen); 2296 c->freelist = get_freepointer(s, freelist); 2297 c->tid = next_tid(c->tid); 2298 local_irq_restore(flags); 2299 return freelist; 2300 2301 new_slab: 2302 2303 if (c->partial) { 2304 page = c->page = c->partial; 2305 c->partial = page->next; 2306 stat(s, CPU_PARTIAL_ALLOC); 2307 c->freelist = NULL; 2308 goto redo; 2309 } 2310 2311 freelist = new_slab_objects(s, gfpflags, node, &c); 2312 2313 if (unlikely(!freelist)) { 2314 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 2315 slab_out_of_memory(s, gfpflags, node); 2316 2317 local_irq_restore(flags); 2318 return NULL; 2319 } 2320 2321 page = c->page; 2322 if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) 2323 goto load_freelist; 2324 2325 /* Only entered in the debug case */ 2326 if (kmem_cache_debug(s) && 2327 !alloc_debug_processing(s, page, freelist, addr)) 2328 goto new_slab; /* Slab failed checks. Next slab needed */ 2329 2330 deactivate_slab(s, page, get_freepointer(s, freelist)); 2331 c->page = NULL; 2332 c->freelist = NULL; 2333 local_irq_restore(flags); 2334 return freelist; 2335 } 2336 2337 /* 2338 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) 2339 * have the fastpath folded into their functions. So no function call 2340 * overhead for requests that can be satisfied on the fastpath. 2341 * 2342 * The fastpath works by first checking if the lockless freelist can be used. 2343 * If not then __slab_alloc is called for slow processing. 2344 * 2345 * Otherwise we can simply pick the next object from the lockless free list. 2346 */ 2347 static __always_inline void *slab_alloc_node(struct kmem_cache *s, 2348 gfp_t gfpflags, int node, unsigned long addr) 2349 { 2350 void **object; 2351 struct kmem_cache_cpu *c; 2352 struct page *page; 2353 unsigned long tid; 2354 2355 if (slab_pre_alloc_hook(s, gfpflags)) 2356 return NULL; 2357 2358 s = memcg_kmem_get_cache(s, gfpflags); 2359 redo: 2360 /* 2361 * Must read kmem_cache cpu data via this cpu ptr. Preemption is 2362 * enabled. We may switch back and forth between cpus while 2363 * reading from one cpu area. That does not matter as long 2364 * as we end up on the original cpu again when doing the cmpxchg. 2365 * 2366 * Preemption is disabled for the retrieval of the tid because that 2367 * must occur from the current processor. We cannot allow rescheduling 2368 * on a different processor between the determination of the pointer 2369 * and the retrieval of the tid. 2370 */ 2371 preempt_disable(); 2372 c = __this_cpu_ptr(s->cpu_slab); 2373 2374 /* 2375 * The transaction ids are globally unique per cpu and per operation on 2376 * a per cpu queue. Thus they can be guarantee that the cmpxchg_double 2377 * occurs on the right processor and that there was no operation on the 2378 * linked list in between. 2379 */ 2380 tid = c->tid; 2381 preempt_enable(); 2382 2383 object = c->freelist; 2384 page = c->page; 2385 if (unlikely(!object || !node_match(page, node))) 2386 object = __slab_alloc(s, gfpflags, node, addr, c); 2387 2388 else { 2389 void *next_object = get_freepointer_safe(s, object); 2390 2391 /* 2392 * The cmpxchg will only match if there was no additional 2393 * operation and if we are on the right processor. 2394 * 2395 * The cmpxchg does the following atomically (without lock 2396 * semantics!) 2397 * 1. Relocate first pointer to the current per cpu area. 2398 * 2. Verify that tid and freelist have not been changed 2399 * 3. 
If they were not changed replace tid and freelist 2400 * 2401 * Since this is without lock semantics the protection is only 2402 * against code executing on this cpu *not* from access by 2403 * other cpus. 2404 */ 2405 if (unlikely(!this_cpu_cmpxchg_double( 2406 s->cpu_slab->freelist, s->cpu_slab->tid, 2407 object, tid, 2408 next_object, next_tid(tid)))) { 2409 2410 note_cmpxchg_failure("slab_alloc", s, tid); 2411 goto redo; 2412 } 2413 prefetch_freepointer(s, next_object); 2414 stat(s, ALLOC_FASTPATH); 2415 } 2416 2417 if (unlikely(gfpflags & __GFP_ZERO) && object) 2418 memset(object, 0, s->object_size); 2419 2420 slab_post_alloc_hook(s, gfpflags, object); 2421 2422 return object; 2423 } 2424 2425 static __always_inline void *slab_alloc(struct kmem_cache *s, 2426 gfp_t gfpflags, unsigned long addr) 2427 { 2428 return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr); 2429 } 2430 2431 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 2432 { 2433 void *ret = slab_alloc(s, gfpflags, _RET_IP_); 2434 2435 trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, 2436 s->size, gfpflags); 2437 2438 return ret; 2439 } 2440 EXPORT_SYMBOL(kmem_cache_alloc); 2441 2442 #ifdef CONFIG_TRACING 2443 void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) 2444 { 2445 void *ret = slab_alloc(s, gfpflags, _RET_IP_); 2446 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); 2447 return ret; 2448 } 2449 EXPORT_SYMBOL(kmem_cache_alloc_trace); 2450 #endif 2451 2452 #ifdef CONFIG_NUMA 2453 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) 2454 { 2455 void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); 2456 2457 trace_kmem_cache_alloc_node(_RET_IP_, ret, 2458 s->object_size, s->size, gfpflags, node); 2459 2460 return ret; 2461 } 2462 EXPORT_SYMBOL(kmem_cache_alloc_node); 2463 2464 #ifdef CONFIG_TRACING 2465 void *kmem_cache_alloc_node_trace(struct kmem_cache *s, 2466 gfp_t gfpflags, 2467 int node, size_t size) 2468 { 2469 void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); 2470 2471 trace_kmalloc_node(_RET_IP_, ret, 2472 size, s->size, gfpflags, node); 2473 return ret; 2474 } 2475 EXPORT_SYMBOL(kmem_cache_alloc_node_trace); 2476 #endif 2477 #endif 2478 2479 /* 2480 * Slow patch handling. This may still be called frequently since objects 2481 * have a longer lifetime than the cpu slabs in most processing loads. 2482 * 2483 * So we still attempt to reduce cache line usage. Just take the slab 2484 * lock and free the item. If there is no additional partial page 2485 * handling required then we can return immediately. 
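 *
 * The core of the free is a cmpxchg_double on the page; roughly
 * (illustrative sketch that ignores the frozen and debug cases
 * handled below):
 *
 *	set_freepointer(s, object, page->freelist);
 *	cmpxchg page->{freelist, counters}:
 *		freelist:	prior -> object
 *		inuse:		inuse -> inuse - 1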
2486 */ 2487 static void __slab_free(struct kmem_cache *s, struct page *page, 2488 void *x, unsigned long addr) 2489 { 2490 void *prior; 2491 void **object = (void *)x; 2492 int was_frozen; 2493 struct page new; 2494 unsigned long counters; 2495 struct kmem_cache_node *n = NULL; 2496 unsigned long uninitialized_var(flags); 2497 2498 stat(s, FREE_SLOWPATH); 2499 2500 if (kmem_cache_debug(s) && 2501 !(n = free_debug_processing(s, page, x, addr, &flags))) 2502 return; 2503 2504 do { 2505 if (unlikely(n)) { 2506 spin_unlock_irqrestore(&n->list_lock, flags); 2507 n = NULL; 2508 } 2509 prior = page->freelist; 2510 counters = page->counters; 2511 set_freepointer(s, object, prior); 2512 new.counters = counters; 2513 was_frozen = new.frozen; 2514 new.inuse--; 2515 if ((!new.inuse || !prior) && !was_frozen) { 2516 2517 if (kmem_cache_has_cpu_partial(s) && !prior) 2518 2519 /* 2520 * Slab was on no list before and will be 2521 * partially empty 2522 * We can defer the list move and instead 2523 * freeze it. 2524 */ 2525 new.frozen = 1; 2526 2527 else { /* Needs to be taken off a list */ 2528 2529 n = get_node(s, page_to_nid(page)); 2530 /* 2531 * Speculatively acquire the list_lock. 2532 * If the cmpxchg does not succeed then we may 2533 * drop the list_lock without any processing. 2534 * 2535 * Otherwise the list_lock will synchronize with 2536 * other processors updating the list of slabs. 2537 */ 2538 spin_lock_irqsave(&n->list_lock, flags); 2539 2540 } 2541 } 2542 2543 } while (!cmpxchg_double_slab(s, page, 2544 prior, counters, 2545 object, new.counters, 2546 "__slab_free")); 2547 2548 if (likely(!n)) { 2549 2550 /* 2551 * If we just froze the page then put it onto the 2552 * per cpu partial list. 2553 */ 2554 if (new.frozen && !was_frozen) { 2555 put_cpu_partial(s, page, 1); 2556 stat(s, CPU_PARTIAL_FREE); 2557 } 2558 /* 2559 * The list lock was not taken therefore no list 2560 * activity can be necessary. 2561 */ 2562 if (was_frozen) 2563 stat(s, FREE_FROZEN); 2564 return; 2565 } 2566 2567 if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) 2568 goto slab_empty; 2569 2570 /* 2571 * Objects left in the slab. If it was not on the partial list before 2572 * then add it. 2573 */ 2574 if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { 2575 if (kmem_cache_debug(s)) 2576 remove_full(s, page); 2577 add_partial(n, page, DEACTIVATE_TO_TAIL); 2578 stat(s, FREE_ADD_PARTIAL); 2579 } 2580 spin_unlock_irqrestore(&n->list_lock, flags); 2581 return; 2582 2583 slab_empty: 2584 if (prior) { 2585 /* 2586 * Slab on the partial list. 2587 */ 2588 remove_partial(n, page); 2589 stat(s, FREE_REMOVE_PARTIAL); 2590 } else 2591 /* Slab must be on the full list */ 2592 remove_full(s, page); 2593 2594 spin_unlock_irqrestore(&n->list_lock, flags); 2595 stat(s, FREE_SLAB); 2596 discard_slab(s, page); 2597 } 2598 2599 /* 2600 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that 2601 * can perform fastpath freeing without additional function calls. 2602 * 2603 * The fastpath is only possible if we are freeing to the current cpu slab 2604 * of this processor. This typically the case if we have just allocated 2605 * the item before. 2606 * 2607 * If fastpath is not possible then fall back to __slab_free where we deal 2608 * with all sorts of special processing. 
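 *
 * None of this is visible to callers; typical usage is simply
 * (illustrative, the "foo" names are made up):
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0, 0, NULL);
 *	p = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cache, p);
 *
 * and the free takes the fastpath whenever p belongs to the slab
 * currently assigned to this cpu.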
2609 */ 2610 static __always_inline void slab_free(struct kmem_cache *s, 2611 struct page *page, void *x, unsigned long addr) 2612 { 2613 void **object = (void *)x; 2614 struct kmem_cache_cpu *c; 2615 unsigned long tid; 2616 2617 slab_free_hook(s, x); 2618 2619 redo: 2620 /* 2621 * Determine the currently cpus per cpu slab. 2622 * The cpu may change afterward. However that does not matter since 2623 * data is retrieved via this pointer. If we are on the same cpu 2624 * during the cmpxchg then the free will succedd. 2625 */ 2626 preempt_disable(); 2627 c = __this_cpu_ptr(s->cpu_slab); 2628 2629 tid = c->tid; 2630 preempt_enable(); 2631 2632 if (likely(page == c->page)) { 2633 set_freepointer(s, object, c->freelist); 2634 2635 if (unlikely(!this_cpu_cmpxchg_double( 2636 s->cpu_slab->freelist, s->cpu_slab->tid, 2637 c->freelist, tid, 2638 object, next_tid(tid)))) { 2639 2640 note_cmpxchg_failure("slab_free", s, tid); 2641 goto redo; 2642 } 2643 stat(s, FREE_FASTPATH); 2644 } else 2645 __slab_free(s, page, x, addr); 2646 2647 } 2648 2649 void kmem_cache_free(struct kmem_cache *s, void *x) 2650 { 2651 s = cache_from_obj(s, x); 2652 if (!s) 2653 return; 2654 slab_free(s, virt_to_head_page(x), x, _RET_IP_); 2655 trace_kmem_cache_free(_RET_IP_, x); 2656 } 2657 EXPORT_SYMBOL(kmem_cache_free); 2658 2659 /* 2660 * Object placement in a slab is made very easy because we always start at 2661 * offset 0. If we tune the size of the object to the alignment then we can 2662 * get the required alignment by putting one properly sized object after 2663 * another. 2664 * 2665 * Notice that the allocation order determines the sizes of the per cpu 2666 * caches. Each processor has always one slab available for allocations. 2667 * Increasing the allocation order reduces the number of times that slabs 2668 * must be moved on and off the partial lists and is therefore a factor in 2669 * locking overhead. 2670 */ 2671 2672 /* 2673 * Mininum / Maximum order of slab pages. This influences locking overhead 2674 * and slab fragmentation. A higher order reduces the number of partial slabs 2675 * and increases the number of allocations possible without having to 2676 * take the list_lock. 2677 */ 2678 static int slub_min_order; 2679 static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; 2680 static int slub_min_objects; 2681 2682 /* 2683 * Merge control. If this is set then no merging of slab caches will occur. 2684 * (Could be removed. This was introduced to pacify the merge skeptics.) 2685 */ 2686 static int slub_nomerge; 2687 2688 /* 2689 * Calculate the order of allocation given an slab object size. 2690 * 2691 * The order of allocation has significant impact on performance and other 2692 * system components. Generally order 0 allocations should be preferred since 2693 * order 0 does not cause fragmentation in the page allocator. Larger objects 2694 * be problematic to put into order 0 slabs because there may be too much 2695 * unused space left. We go to a higher order if more than 1/16th of the slab 2696 * would be wasted. 2697 * 2698 * In order to reach satisfactory performance we must ensure that a minimum 2699 * number of objects is in one slab. Otherwise we may generate too much 2700 * activity on the partial lists which requires taking the list_lock. This is 2701 * less a concern for large slabs though which are rarely used. 2702 * 2703 * slub_max_order specifies the order where we begin to stop considering the 2704 * number of objects in a slab as critical. 
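 *
 * Worked example (illustrative, assuming 4K pages): for 256 byte
 * objects with a minimum of 8 objects per slab, an order 0 slab
 * already holds 16 objects with no wasted remainder, so order 0 is
 * chosen and higher orders are never considered.
 *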
If we reach slub_max_order then 2705 * we try to keep the page order as low as possible. So we accept more waste 2706 * of space in favor of a small page order. 2707 * 2708 * Higher order allocations also allow the placement of more objects in a 2709 * slab and thereby reduce object handling overhead. If the user has 2710 * requested a higher mininum order then we start with that one instead of 2711 * the smallest order which will fit the object. 2712 */ 2713 static inline int slab_order(int size, int min_objects, 2714 int max_order, int fract_leftover, int reserved) 2715 { 2716 int order; 2717 int rem; 2718 int min_order = slub_min_order; 2719 2720 if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) 2721 return get_order(size * MAX_OBJS_PER_PAGE) - 1; 2722 2723 for (order = max(min_order, 2724 fls(min_objects * size - 1) - PAGE_SHIFT); 2725 order <= max_order; order++) { 2726 2727 unsigned long slab_size = PAGE_SIZE << order; 2728 2729 if (slab_size < min_objects * size + reserved) 2730 continue; 2731 2732 rem = (slab_size - reserved) % size; 2733 2734 if (rem <= slab_size / fract_leftover) 2735 break; 2736 2737 } 2738 2739 return order; 2740 } 2741 2742 static inline int calculate_order(int size, int reserved) 2743 { 2744 int order; 2745 int min_objects; 2746 int fraction; 2747 int max_objects; 2748 2749 /* 2750 * Attempt to find best configuration for a slab. This 2751 * works by first attempting to generate a layout with 2752 * the best configuration and backing off gradually. 2753 * 2754 * First we reduce the acceptable waste in a slab. Then 2755 * we reduce the minimum objects required in a slab. 2756 */ 2757 min_objects = slub_min_objects; 2758 if (!min_objects) 2759 min_objects = 4 * (fls(nr_cpu_ids) + 1); 2760 max_objects = order_objects(slub_max_order, size, reserved); 2761 min_objects = min(min_objects, max_objects); 2762 2763 while (min_objects > 1) { 2764 fraction = 16; 2765 while (fraction >= 4) { 2766 order = slab_order(size, min_objects, 2767 slub_max_order, fraction, reserved); 2768 if (order <= slub_max_order) 2769 return order; 2770 fraction /= 2; 2771 } 2772 min_objects--; 2773 } 2774 2775 /* 2776 * We were unable to place multiple objects in a slab. Now 2777 * lets see if we can place a single object there. 2778 */ 2779 order = slab_order(size, 1, slub_max_order, 1, reserved); 2780 if (order <= slub_max_order) 2781 return order; 2782 2783 /* 2784 * Doh this slab cannot be placed using slub_max_order. 2785 */ 2786 order = slab_order(size, 1, MAX_ORDER, 1, reserved); 2787 if (order < MAX_ORDER) 2788 return order; 2789 return -ENOSYS; 2790 } 2791 2792 static void 2793 init_kmem_cache_node(struct kmem_cache_node *n) 2794 { 2795 n->nr_partial = 0; 2796 spin_lock_init(&n->list_lock); 2797 INIT_LIST_HEAD(&n->partial); 2798 #ifdef CONFIG_SLUB_DEBUG 2799 atomic_long_set(&n->nr_slabs, 0); 2800 atomic_long_set(&n->total_objects, 0); 2801 INIT_LIST_HEAD(&n->full); 2802 #endif 2803 } 2804 2805 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) 2806 { 2807 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < 2808 KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu)); 2809 2810 /* 2811 * Must align to double word boundary for the double cmpxchg 2812 * instructions to work; see __pcpu_double_call_return_bool(). 
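	 * For example (illustrative, 64 bit): 2 * sizeof(void *) == 16,
	 * which matches the 16 byte alignment that cmpxchg16b requires.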
2813 */ 2814 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2815 2 * sizeof(void *)); 2816 2817 if (!s->cpu_slab) 2818 return 0; 2819 2820 init_kmem_cache_cpus(s); 2821 2822 return 1; 2823 } 2824 2825 static struct kmem_cache *kmem_cache_node; 2826 2827 /* 2828 * No kmalloc_node yet so do it by hand. We know that this is the first 2829 * slab on the node for this slabcache. There are no concurrent accesses 2830 * possible. 2831 * 2832 * Note that this function only works on the kmalloc_node_cache 2833 * when allocating for the kmalloc_node_cache. This is used for bootstrapping 2834 * memory on a fresh node that has no slab structures yet. 2835 */ 2836 static void early_kmem_cache_node_alloc(int node) 2837 { 2838 struct page *page; 2839 struct kmem_cache_node *n; 2840 2841 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); 2842 2843 page = new_slab(kmem_cache_node, GFP_NOWAIT, node); 2844 2845 BUG_ON(!page); 2846 if (page_to_nid(page) != node) { 2847 printk(KERN_ERR "SLUB: Unable to allocate memory from " 2848 "node %d\n", node); 2849 printk(KERN_ERR "SLUB: Allocating a useless per node structure " 2850 "in order to be able to continue\n"); 2851 } 2852 2853 n = page->freelist; 2854 BUG_ON(!n); 2855 page->freelist = get_freepointer(kmem_cache_node, n); 2856 page->inuse = 1; 2857 page->frozen = 0; 2858 kmem_cache_node->node[node] = n; 2859 #ifdef CONFIG_SLUB_DEBUG 2860 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 2861 init_tracking(kmem_cache_node, n); 2862 #endif 2863 init_kmem_cache_node(n); 2864 inc_slabs_node(kmem_cache_node, node, page->objects); 2865 2866 add_partial(n, page, DEACTIVATE_TO_HEAD); 2867 } 2868 2869 static void free_kmem_cache_nodes(struct kmem_cache *s) 2870 { 2871 int node; 2872 2873 for_each_node_state(node, N_NORMAL_MEMORY) { 2874 struct kmem_cache_node *n = s->node[node]; 2875 2876 if (n) 2877 kmem_cache_free(kmem_cache_node, n); 2878 2879 s->node[node] = NULL; 2880 } 2881 } 2882 2883 static int init_kmem_cache_nodes(struct kmem_cache *s) 2884 { 2885 int node; 2886 2887 for_each_node_state(node, N_NORMAL_MEMORY) { 2888 struct kmem_cache_node *n; 2889 2890 if (slab_state == DOWN) { 2891 early_kmem_cache_node_alloc(node); 2892 continue; 2893 } 2894 n = kmem_cache_alloc_node(kmem_cache_node, 2895 GFP_KERNEL, node); 2896 2897 if (!n) { 2898 free_kmem_cache_nodes(s); 2899 return 0; 2900 } 2901 2902 s->node[node] = n; 2903 init_kmem_cache_node(n); 2904 } 2905 return 1; 2906 } 2907 2908 static void set_min_partial(struct kmem_cache *s, unsigned long min) 2909 { 2910 if (min < MIN_PARTIAL) 2911 min = MIN_PARTIAL; 2912 else if (min > MAX_PARTIAL) 2913 min = MAX_PARTIAL; 2914 s->min_partial = min; 2915 } 2916 2917 /* 2918 * calculate_sizes() determines the order and the distribution of data within 2919 * a slab object. 2920 */ 2921 static int calculate_sizes(struct kmem_cache *s, int forced_order) 2922 { 2923 unsigned long flags = s->flags; 2924 unsigned long size = s->object_size; 2925 int order; 2926 2927 /* 2928 * Round up object size to the next word boundary. We can only 2929 * place the free pointer at word boundaries and this determines 2930 * the possible location of the free pointer. 2931 */ 2932 size = ALIGN(size, sizeof(void *)); 2933 2934 #ifdef CONFIG_SLUB_DEBUG 2935 /* 2936 * Determine if we can poison the object itself. If the user of 2937 * the slab may touch the object after free or before allocation 2938 * then we should never poison the object itself. 
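	 *
	 * A cache with a constructor is one such case: the constructed
	 * state must survive a free/alloc cycle, so the object is not
	 * poisoned and (further below) the free pointer is placed after
	 * the object instead of inside it.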
2939 */ 2940 if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && 2941 !s->ctor) 2942 s->flags |= __OBJECT_POISON; 2943 else 2944 s->flags &= ~__OBJECT_POISON; 2945 2946 2947 /* 2948 * If we are Redzoning then check if there is some space between the 2949 * end of the object and the free pointer. If not then add an 2950 * additional word to have some bytes to store Redzone information. 2951 */ 2952 if ((flags & SLAB_RED_ZONE) && size == s->object_size) 2953 size += sizeof(void *); 2954 #endif 2955 2956 /* 2957 * With that we have determined the number of bytes in actual use 2958 * by the object. This is the potential offset to the free pointer. 2959 */ 2960 s->inuse = size; 2961 2962 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || 2963 s->ctor)) { 2964 /* 2965 * Relocate free pointer after the object if it is not 2966 * permitted to overwrite the first word of the object on 2967 * kmem_cache_free. 2968 * 2969 * This is the case if we do RCU, have a constructor or 2970 * destructor or are poisoning the objects. 2971 */ 2972 s->offset = size; 2973 size += sizeof(void *); 2974 } 2975 2976 #ifdef CONFIG_SLUB_DEBUG 2977 if (flags & SLAB_STORE_USER) 2978 /* 2979 * Need to store information about allocs and frees after 2980 * the object. 2981 */ 2982 size += 2 * sizeof(struct track); 2983 2984 if (flags & SLAB_RED_ZONE) 2985 /* 2986 * Add some empty padding so that we can catch 2987 * overwrites from earlier objects rather than let 2988 * tracking information or the free pointer be 2989 * corrupted if a user writes before the start 2990 * of the object. 2991 */ 2992 size += sizeof(void *); 2993 #endif 2994 2995 /* 2996 * SLUB stores one object immediately after another beginning from 2997 * offset 0. In order to align the objects we have to simply size 2998 * each object to conform to the alignment. 2999 */ 3000 size = ALIGN(size, s->align); 3001 s->size = size; 3002 if (forced_order >= 0) 3003 order = forced_order; 3004 else 3005 order = calculate_order(size, s->reserved); 3006 3007 if (order < 0) 3008 return 0; 3009 3010 s->allocflags = 0; 3011 if (order) 3012 s->allocflags |= __GFP_COMP; 3013 3014 if (s->flags & SLAB_CACHE_DMA) 3015 s->allocflags |= GFP_DMA; 3016 3017 if (s->flags & SLAB_RECLAIM_ACCOUNT) 3018 s->allocflags |= __GFP_RECLAIMABLE; 3019 3020 /* 3021 * Determine the number of objects per slab 3022 */ 3023 s->oo = oo_make(order, size, s->reserved); 3024 s->min = oo_make(get_order(size), size, s->reserved); 3025 if (oo_objects(s->oo) > oo_objects(s->max)) 3026 s->max = s->oo; 3027 3028 return !!oo_objects(s->oo); 3029 } 3030 3031 static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) 3032 { 3033 s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); 3034 s->reserved = 0; 3035 3036 if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) 3037 s->reserved = sizeof(struct rcu_head); 3038 3039 if (!calculate_sizes(s, -1)) 3040 goto error; 3041 if (disable_higher_order_debug) { 3042 /* 3043 * Disable debugging flags that store metadata if the min slab 3044 * order increased. 
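		 *
		 * Example (illustrative): a 4096 byte object fits exactly
		 * into an order 0 slab; red zoning and tracking data would
		 * push it to order 1, so with slub_debug=O those flags are
		 * dropped again for such a cache.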
3045 */ 3046 if (get_order(s->size) > get_order(s->object_size)) { 3047 s->flags &= ~DEBUG_METADATA_FLAGS; 3048 s->offset = 0; 3049 if (!calculate_sizes(s, -1)) 3050 goto error; 3051 } 3052 } 3053 3054 #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ 3055 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 3056 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) 3057 /* Enable fast mode */ 3058 s->flags |= __CMPXCHG_DOUBLE; 3059 #endif 3060 3061 /* 3062 * The larger the object size is, the more pages we want on the partial 3063 * list to avoid pounding the page allocator excessively. 3064 */ 3065 set_min_partial(s, ilog2(s->size) / 2); 3066 3067 /* 3068 * cpu_partial determined the maximum number of objects kept in the 3069 * per cpu partial lists of a processor. 3070 * 3071 * Per cpu partial lists mainly contain slabs that just have one 3072 * object freed. If they are used for allocation then they can be 3073 * filled up again with minimal effort. The slab will never hit the 3074 * per node partial lists and therefore no locking will be required. 3075 * 3076 * This setting also determines 3077 * 3078 * A) The number of objects from per cpu partial slabs dumped to the 3079 * per node list when we reach the limit. 3080 * B) The number of objects in cpu partial slabs to extract from the 3081 * per node list when we run out of per cpu objects. We only fetch 3082 * 50% to keep some capacity around for frees. 3083 */ 3084 if (!kmem_cache_has_cpu_partial(s)) 3085 s->cpu_partial = 0; 3086 else if (s->size >= PAGE_SIZE) 3087 s->cpu_partial = 2; 3088 else if (s->size >= 1024) 3089 s->cpu_partial = 6; 3090 else if (s->size >= 256) 3091 s->cpu_partial = 13; 3092 else 3093 s->cpu_partial = 30; 3094 3095 #ifdef CONFIG_NUMA 3096 s->remote_node_defrag_ratio = 1000; 3097 #endif 3098 if (!init_kmem_cache_nodes(s)) 3099 goto error; 3100 3101 if (alloc_kmem_cache_cpus(s)) 3102 return 0; 3103 3104 free_kmem_cache_nodes(s); 3105 error: 3106 if (flags & SLAB_PANIC) 3107 panic("Cannot create slab %s size=%lu realsize=%u " 3108 "order=%u offset=%u flags=%lx\n", 3109 s->name, (unsigned long)s->size, s->size, 3110 oo_order(s->oo), s->offset, flags); 3111 return -EINVAL; 3112 } 3113 3114 static void list_slab_objects(struct kmem_cache *s, struct page *page, 3115 const char *text) 3116 { 3117 #ifdef CONFIG_SLUB_DEBUG 3118 void *addr = page_address(page); 3119 void *p; 3120 unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) * 3121 sizeof(long), GFP_ATOMIC); 3122 if (!map) 3123 return; 3124 slab_err(s, page, text, s->name); 3125 slab_lock(page); 3126 3127 get_map(s, page, map); 3128 for_each_object(p, s, addr, page->objects) { 3129 3130 if (!test_bit(slab_index(p, s, addr), map)) { 3131 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", 3132 p, p - addr); 3133 print_tracking(s, p); 3134 } 3135 } 3136 slab_unlock(page); 3137 kfree(map); 3138 #endif 3139 } 3140 3141 /* 3142 * Attempt to free all partial slabs on a node. 3143 * This is called from kmem_cache_close(). We must be the last thread 3144 * using the cache and therefore we do not need to lock anymore. 3145 */ 3146 static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) 3147 { 3148 struct page *page, *h; 3149 3150 list_for_each_entry_safe(page, h, &n->partial, lru) { 3151 if (!page->inuse) { 3152 remove_partial(n, page); 3153 discard_slab(s, page); 3154 } else { 3155 list_slab_objects(s, page, 3156 "Objects remaining in %s on kmem_cache_close()"); 3157 } 3158 } 3159 } 3160 3161 /* 3162 * Release all resources used by a slab cache. 
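 *
 * Returns 0 on success, or 1 if slabs with live objects remain on some
 * node, in which case the cache cannot be destroyed.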
3163 */ 3164 static inline int kmem_cache_close(struct kmem_cache *s) 3165 { 3166 int node; 3167 3168 flush_all(s); 3169 /* Attempt to free all objects */ 3170 for_each_node_state(node, N_NORMAL_MEMORY) { 3171 struct kmem_cache_node *n = get_node(s, node); 3172 3173 free_partial(s, n); 3174 if (n->nr_partial || slabs_node(s, node)) 3175 return 1; 3176 } 3177 free_percpu(s->cpu_slab); 3178 free_kmem_cache_nodes(s); 3179 return 0; 3180 } 3181 3182 int __kmem_cache_shutdown(struct kmem_cache *s) 3183 { 3184 int rc = kmem_cache_close(s); 3185 3186 if (!rc) { 3187 /* 3188 * We do the same lock strategy around sysfs_slab_add, see 3189 * __kmem_cache_create. Because this is pretty much the last 3190 * operation we do and the lock will be released shortly after 3191 * that in slab_common.c, we could just move sysfs_slab_remove 3192 * to a later point in common code. We should do that when we 3193 * have a common sysfs framework for all allocators. 3194 */ 3195 mutex_unlock(&slab_mutex); 3196 sysfs_slab_remove(s); 3197 mutex_lock(&slab_mutex); 3198 } 3199 3200 return rc; 3201 } 3202 3203 /******************************************************************** 3204 * Kmalloc subsystem 3205 *******************************************************************/ 3206 3207 static int __init setup_slub_min_order(char *str) 3208 { 3209 get_option(&str, &slub_min_order); 3210 3211 return 1; 3212 } 3213 3214 __setup("slub_min_order=", setup_slub_min_order); 3215 3216 static int __init setup_slub_max_order(char *str) 3217 { 3218 get_option(&str, &slub_max_order); 3219 slub_max_order = min(slub_max_order, MAX_ORDER - 1); 3220 3221 return 1; 3222 } 3223 3224 __setup("slub_max_order=", setup_slub_max_order); 3225 3226 static int __init setup_slub_min_objects(char *str) 3227 { 3228 get_option(&str, &slub_min_objects); 3229 3230 return 1; 3231 } 3232 3233 __setup("slub_min_objects=", setup_slub_min_objects); 3234 3235 static int __init setup_slub_nomerge(char *str) 3236 { 3237 slub_nomerge = 1; 3238 return 1; 3239 } 3240 3241 __setup("slub_nomerge", setup_slub_nomerge); 3242 3243 void *__kmalloc(size_t size, gfp_t flags) 3244 { 3245 struct kmem_cache *s; 3246 void *ret; 3247 3248 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) 3249 return kmalloc_large(size, flags); 3250 3251 s = kmalloc_slab(size, flags); 3252 3253 if (unlikely(ZERO_OR_NULL_PTR(s))) 3254 return s; 3255 3256 ret = slab_alloc(s, flags, _RET_IP_); 3257 3258 trace_kmalloc(_RET_IP_, ret, size, s->size, flags); 3259 3260 return ret; 3261 } 3262 EXPORT_SYMBOL(__kmalloc); 3263 3264 #ifdef CONFIG_NUMA 3265 static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 3266 { 3267 struct page *page; 3268 void *ptr = NULL; 3269 3270 flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; 3271 page = alloc_pages_node(node, flags, get_order(size)); 3272 if (page) 3273 ptr = page_address(page); 3274 3275 kmemleak_alloc(ptr, size, 1, flags); 3276 return ptr; 3277 } 3278 3279 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3280 { 3281 struct kmem_cache *s; 3282 void *ret; 3283 3284 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { 3285 ret = kmalloc_large_node(size, flags, node); 3286 3287 trace_kmalloc_node(_RET_IP_, ret, 3288 size, PAGE_SIZE << get_order(size), 3289 flags, node); 3290 3291 return ret; 3292 } 3293 3294 s = kmalloc_slab(size, flags); 3295 3296 if (unlikely(ZERO_OR_NULL_PTR(s))) 3297 return s; 3298 3299 ret = slab_alloc_node(s, flags, node, _RET_IP_); 3300 3301 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); 3302 3303 return ret; 
3304 } 3305 EXPORT_SYMBOL(__kmalloc_node); 3306 #endif 3307 3308 size_t ksize(const void *object) 3309 { 3310 struct page *page; 3311 3312 if (unlikely(object == ZERO_SIZE_PTR)) 3313 return 0; 3314 3315 page = virt_to_head_page(object); 3316 3317 if (unlikely(!PageSlab(page))) { 3318 WARN_ON(!PageCompound(page)); 3319 return PAGE_SIZE << compound_order(page); 3320 } 3321 3322 return slab_ksize(page->slab_cache); 3323 } 3324 EXPORT_SYMBOL(ksize); 3325 3326 void kfree(const void *x) 3327 { 3328 struct page *page; 3329 void *object = (void *)x; 3330 3331 trace_kfree(_RET_IP_, x); 3332 3333 if (unlikely(ZERO_OR_NULL_PTR(x))) 3334 return; 3335 3336 page = virt_to_head_page(x); 3337 if (unlikely(!PageSlab(page))) { 3338 BUG_ON(!PageCompound(page)); 3339 kmemleak_free(x); 3340 __free_memcg_kmem_pages(page, compound_order(page)); 3341 return; 3342 } 3343 slab_free(page->slab_cache, page, object, _RET_IP_); 3344 } 3345 EXPORT_SYMBOL(kfree); 3346 3347 /* 3348 * kmem_cache_shrink removes empty slabs from the partial lists and sorts 3349 * the remaining slabs by the number of items in use. The slabs with the 3350 * most items in use come first. New allocations will then fill those up 3351 * and thus they can be removed from the partial lists. 3352 * 3353 * The slabs with the least items are placed last. This results in them 3354 * being allocated from last increasing the chance that the last objects 3355 * are freed in them. 3356 */ 3357 int kmem_cache_shrink(struct kmem_cache *s) 3358 { 3359 int node; 3360 int i; 3361 struct kmem_cache_node *n; 3362 struct page *page; 3363 struct page *t; 3364 int objects = oo_objects(s->max); 3365 struct list_head *slabs_by_inuse = 3366 kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); 3367 unsigned long flags; 3368 3369 if (!slabs_by_inuse) 3370 return -ENOMEM; 3371 3372 flush_all(s); 3373 for_each_node_state(node, N_NORMAL_MEMORY) { 3374 n = get_node(s, node); 3375 3376 if (!n->nr_partial) 3377 continue; 3378 3379 for (i = 0; i < objects; i++) 3380 INIT_LIST_HEAD(slabs_by_inuse + i); 3381 3382 spin_lock_irqsave(&n->list_lock, flags); 3383 3384 /* 3385 * Build lists indexed by the items in use in each slab. 3386 * 3387 * Note that concurrent frees may occur while we hold the 3388 * list_lock. page->inuse here is the upper limit. 3389 */ 3390 list_for_each_entry_safe(page, t, &n->partial, lru) { 3391 list_move(&page->lru, slabs_by_inuse + page->inuse); 3392 if (!page->inuse) 3393 n->nr_partial--; 3394 } 3395 3396 /* 3397 * Rebuild the partial list with the slabs filled up most 3398 * first and the least used slabs at the end. 
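		 *
		 * Slabs with the highest inuse count are spliced back
		 * first and therefore end up at the head; bucket 0
		 * (completely free slabs) is skipped here and those
		 * slabs are discarded below.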
3399 */ 3400 for (i = objects - 1; i > 0; i--) 3401 list_splice(slabs_by_inuse + i, n->partial.prev); 3402 3403 spin_unlock_irqrestore(&n->list_lock, flags); 3404 3405 /* Release empty slabs */ 3406 list_for_each_entry_safe(page, t, slabs_by_inuse, lru) 3407 discard_slab(s, page); 3408 } 3409 3410 kfree(slabs_by_inuse); 3411 return 0; 3412 } 3413 EXPORT_SYMBOL(kmem_cache_shrink); 3414 3415 static int slab_mem_going_offline_callback(void *arg) 3416 { 3417 struct kmem_cache *s; 3418 3419 mutex_lock(&slab_mutex); 3420 list_for_each_entry(s, &slab_caches, list) 3421 kmem_cache_shrink(s); 3422 mutex_unlock(&slab_mutex); 3423 3424 return 0; 3425 } 3426 3427 static void slab_mem_offline_callback(void *arg) 3428 { 3429 struct kmem_cache_node *n; 3430 struct kmem_cache *s; 3431 struct memory_notify *marg = arg; 3432 int offline_node; 3433 3434 offline_node = marg->status_change_nid_normal; 3435 3436 /* 3437 * If the node still has available memory. we need kmem_cache_node 3438 * for it yet. 3439 */ 3440 if (offline_node < 0) 3441 return; 3442 3443 mutex_lock(&slab_mutex); 3444 list_for_each_entry(s, &slab_caches, list) { 3445 n = get_node(s, offline_node); 3446 if (n) { 3447 /* 3448 * if n->nr_slabs > 0, slabs still exist on the node 3449 * that is going down. We were unable to free them, 3450 * and offline_pages() function shouldn't call this 3451 * callback. So, we must fail. 3452 */ 3453 BUG_ON(slabs_node(s, offline_node)); 3454 3455 s->node[offline_node] = NULL; 3456 kmem_cache_free(kmem_cache_node, n); 3457 } 3458 } 3459 mutex_unlock(&slab_mutex); 3460 } 3461 3462 static int slab_mem_going_online_callback(void *arg) 3463 { 3464 struct kmem_cache_node *n; 3465 struct kmem_cache *s; 3466 struct memory_notify *marg = arg; 3467 int nid = marg->status_change_nid_normal; 3468 int ret = 0; 3469 3470 /* 3471 * If the node's memory is already available, then kmem_cache_node is 3472 * already created. Nothing to do. 3473 */ 3474 if (nid < 0) 3475 return 0; 3476 3477 /* 3478 * We are bringing a node online. No memory is available yet. We must 3479 * allocate a kmem_cache_node structure in order to bring the node 3480 * online. 3481 */ 3482 mutex_lock(&slab_mutex); 3483 list_for_each_entry(s, &slab_caches, list) { 3484 /* 3485 * XXX: kmem_cache_alloc_node will fallback to other nodes 3486 * since memory is not yet available from the node that 3487 * is brought up. 
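		 * (Which is why plain kmem_cache_alloc() is used below;
		 * any node may supply the new kmem_cache_node structure.)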
3488 */ 3489 n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); 3490 if (!n) { 3491 ret = -ENOMEM; 3492 goto out; 3493 } 3494 init_kmem_cache_node(n); 3495 s->node[nid] = n; 3496 } 3497 out: 3498 mutex_unlock(&slab_mutex); 3499 return ret; 3500 } 3501 3502 static int slab_memory_callback(struct notifier_block *self, 3503 unsigned long action, void *arg) 3504 { 3505 int ret = 0; 3506 3507 switch (action) { 3508 case MEM_GOING_ONLINE: 3509 ret = slab_mem_going_online_callback(arg); 3510 break; 3511 case MEM_GOING_OFFLINE: 3512 ret = slab_mem_going_offline_callback(arg); 3513 break; 3514 case MEM_OFFLINE: 3515 case MEM_CANCEL_ONLINE: 3516 slab_mem_offline_callback(arg); 3517 break; 3518 case MEM_ONLINE: 3519 case MEM_CANCEL_OFFLINE: 3520 break; 3521 } 3522 if (ret) 3523 ret = notifier_from_errno(ret); 3524 else 3525 ret = NOTIFY_OK; 3526 return ret; 3527 } 3528 3529 static struct notifier_block slab_memory_callback_nb = { 3530 .notifier_call = slab_memory_callback, 3531 .priority = SLAB_CALLBACK_PRI, 3532 }; 3533 3534 /******************************************************************** 3535 * Basic setup of slabs 3536 *******************************************************************/ 3537 3538 /* 3539 * Used for early kmem_cache structures that were allocated using 3540 * the page allocator. Allocate them properly then fix up the pointers 3541 * that may be pointing to the wrong kmem_cache structure. 3542 */ 3543 3544 static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) 3545 { 3546 int node; 3547 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 3548 3549 memcpy(s, static_cache, kmem_cache->object_size); 3550 3551 /* 3552 * This runs very early, and only the boot processor is supposed to be 3553 * up. Even if it weren't true, IRQs are not up so we couldn't fire 3554 * IPIs around. 3555 */ 3556 __flush_cpu_slab(s, smp_processor_id()); 3557 for_each_node_state(node, N_NORMAL_MEMORY) { 3558 struct kmem_cache_node *n = get_node(s, node); 3559 struct page *p; 3560 3561 if (n) { 3562 list_for_each_entry(p, &n->partial, lru) 3563 p->slab_cache = s; 3564 3565 #ifdef CONFIG_SLUB_DEBUG 3566 list_for_each_entry(p, &n->full, lru) 3567 p->slab_cache = s; 3568 #endif 3569 } 3570 } 3571 list_add(&s->list, &slab_caches); 3572 return s; 3573 } 3574 3575 void __init kmem_cache_init(void) 3576 { 3577 static __initdata struct kmem_cache boot_kmem_cache, 3578 boot_kmem_cache_node; 3579 3580 if (debug_guardpage_minorder()) 3581 slub_max_order = 0; 3582 3583 kmem_cache_node = &boot_kmem_cache_node; 3584 kmem_cache = &boot_kmem_cache; 3585 3586 create_boot_cache(kmem_cache_node, "kmem_cache_node", 3587 sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); 3588 3589 register_hotmemory_notifier(&slab_memory_callback_nb); 3590 3591 /* Able to allocate the per node structures */ 3592 slab_state = PARTIAL; 3593 3594 create_boot_cache(kmem_cache, "kmem_cache", 3595 offsetof(struct kmem_cache, node) + 3596 nr_node_ids * sizeof(struct kmem_cache_node *), 3597 SLAB_HWCACHE_ALIGN); 3598 3599 kmem_cache = bootstrap(&boot_kmem_cache); 3600 3601 /* 3602 * Allocate kmem_cache_node properly from the kmem_cache slab. 3603 * kmem_cache_node is separately allocated so no need to 3604 * update any list pointers. 
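	 *
	 * Bootstrap order so far, as a summary: the static
	 * boot_kmem_cache_node and boot_kmem_cache got allocation going,
	 * kmem_cache was then reallocated from itself via bootstrap(),
	 * and the line below does the same for kmem_cache_node.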
3605 */ 3606 kmem_cache_node = bootstrap(&boot_kmem_cache_node); 3607 3608 /* Now we can use the kmem_cache to allocate kmalloc slabs */ 3609 create_kmalloc_caches(0); 3610 3611 #ifdef CONFIG_SMP 3612 register_cpu_notifier(&slab_notifier); 3613 #endif 3614 3615 printk(KERN_INFO 3616 "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d," 3617 " CPUs=%d, Nodes=%d\n", 3618 cache_line_size(), 3619 slub_min_order, slub_max_order, slub_min_objects, 3620 nr_cpu_ids, nr_node_ids); 3621 } 3622 3623 void __init kmem_cache_init_late(void) 3624 { 3625 } 3626 3627 /* 3628 * Find a mergeable slab cache 3629 */ 3630 static int slab_unmergeable(struct kmem_cache *s) 3631 { 3632 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) 3633 return 1; 3634 3635 if (s->ctor) 3636 return 1; 3637 3638 /* 3639 * We may have set a slab to be unmergeable during bootstrap. 3640 */ 3641 if (s->refcount < 0) 3642 return 1; 3643 3644 return 0; 3645 } 3646 3647 static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, 3648 size_t align, unsigned long flags, const char *name, 3649 void (*ctor)(void *)) 3650 { 3651 struct kmem_cache *s; 3652 3653 if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) 3654 return NULL; 3655 3656 if (ctor) 3657 return NULL; 3658 3659 size = ALIGN(size, sizeof(void *)); 3660 align = calculate_alignment(flags, align, size); 3661 size = ALIGN(size, align); 3662 flags = kmem_cache_flags(size, flags, name, NULL); 3663 3664 list_for_each_entry(s, &slab_caches, list) { 3665 if (slab_unmergeable(s)) 3666 continue; 3667 3668 if (size > s->size) 3669 continue; 3670 3671 if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) 3672 continue; 3673 /* 3674 * Check if alignment is compatible. 3675 * Courtesy of Adrian Drzewiecki 3676 */ 3677 if ((s->size & ~(align - 1)) != s->size) 3678 continue; 3679 3680 if (s->size - size >= sizeof(void *)) 3681 continue; 3682 3683 if (!cache_match_memcg(s, memcg)) 3684 continue; 3685 3686 return s; 3687 } 3688 return NULL; 3689 } 3690 3691 struct kmem_cache * 3692 __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, 3693 size_t align, unsigned long flags, void (*ctor)(void *)) 3694 { 3695 struct kmem_cache *s; 3696 3697 s = find_mergeable(memcg, size, align, flags, name, ctor); 3698 if (s) { 3699 s->refcount++; 3700 /* 3701 * Adjust the object sizes so that we clear 3702 * the complete object on kzalloc. 3703 */ 3704 s->object_size = max(s->object_size, (int)size); 3705 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3706 3707 if (sysfs_slab_alias(s, name)) { 3708 s->refcount--; 3709 s = NULL; 3710 } 3711 } 3712 3713 return s; 3714 } 3715 3716 int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) 3717 { 3718 int err; 3719 3720 err = kmem_cache_open(s, flags); 3721 if (err) 3722 return err; 3723 3724 /* Mutex is not taken during early boot */ 3725 if (slab_state <= UP) 3726 return 0; 3727 3728 memcg_propagate_slab_attrs(s); 3729 mutex_unlock(&slab_mutex); 3730 err = sysfs_slab_add(s); 3731 mutex_lock(&slab_mutex); 3732 3733 if (err) 3734 kmem_cache_close(s); 3735 3736 return err; 3737 } 3738 3739 #ifdef CONFIG_SMP 3740 /* 3741 * Use the cpu notifier to insure that the cpu slabs are flushed when 3742 * necessary. 
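 *
 * Only the teardown events matter: on CPU_DEAD / CPU_UP_CANCELED (and
 * their _FROZEN variants) the dead cpu's slab and its per cpu partial
 * list are flushed back to the node lists for every cache.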
3743 */ 3744 static int slab_cpuup_callback(struct notifier_block *nfb, 3745 unsigned long action, void *hcpu) 3746 { 3747 long cpu = (long)hcpu; 3748 struct kmem_cache *s; 3749 unsigned long flags; 3750 3751 switch (action) { 3752 case CPU_UP_CANCELED: 3753 case CPU_UP_CANCELED_FROZEN: 3754 case CPU_DEAD: 3755 case CPU_DEAD_FROZEN: 3756 mutex_lock(&slab_mutex); 3757 list_for_each_entry(s, &slab_caches, list) { 3758 local_irq_save(flags); 3759 __flush_cpu_slab(s, cpu); 3760 local_irq_restore(flags); 3761 } 3762 mutex_unlock(&slab_mutex); 3763 break; 3764 default: 3765 break; 3766 } 3767 return NOTIFY_OK; 3768 } 3769 3770 static struct notifier_block slab_notifier = { 3771 .notifier_call = slab_cpuup_callback 3772 }; 3773 3774 #endif 3775 3776 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) 3777 { 3778 struct kmem_cache *s; 3779 void *ret; 3780 3781 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) 3782 return kmalloc_large(size, gfpflags); 3783 3784 s = kmalloc_slab(size, gfpflags); 3785 3786 if (unlikely(ZERO_OR_NULL_PTR(s))) 3787 return s; 3788 3789 ret = slab_alloc(s, gfpflags, caller); 3790 3791 /* Honor the call site pointer we received. */ 3792 trace_kmalloc(caller, ret, size, s->size, gfpflags); 3793 3794 return ret; 3795 } 3796 3797 #ifdef CONFIG_NUMA 3798 void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 3799 int node, unsigned long caller) 3800 { 3801 struct kmem_cache *s; 3802 void *ret; 3803 3804 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { 3805 ret = kmalloc_large_node(size, gfpflags, node); 3806 3807 trace_kmalloc_node(caller, ret, 3808 size, PAGE_SIZE << get_order(size), 3809 gfpflags, node); 3810 3811 return ret; 3812 } 3813 3814 s = kmalloc_slab(size, gfpflags); 3815 3816 if (unlikely(ZERO_OR_NULL_PTR(s))) 3817 return s; 3818 3819 ret = slab_alloc_node(s, gfpflags, node, caller); 3820 3821 /* Honor the call site pointer we received. 
*/ 3822 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); 3823 3824 return ret; 3825 } 3826 #endif 3827 3828 #ifdef CONFIG_SYSFS 3829 static int count_inuse(struct page *page) 3830 { 3831 return page->inuse; 3832 } 3833 3834 static int count_total(struct page *page) 3835 { 3836 return page->objects; 3837 } 3838 #endif 3839 3840 #ifdef CONFIG_SLUB_DEBUG 3841 static int validate_slab(struct kmem_cache *s, struct page *page, 3842 unsigned long *map) 3843 { 3844 void *p; 3845 void *addr = page_address(page); 3846 3847 if (!check_slab(s, page) || 3848 !on_freelist(s, page, NULL)) 3849 return 0; 3850 3851 /* Now we know that a valid freelist exists */ 3852 bitmap_zero(map, page->objects); 3853 3854 get_map(s, page, map); 3855 for_each_object(p, s, addr, page->objects) { 3856 if (test_bit(slab_index(p, s, addr), map)) 3857 if (!check_object(s, page, p, SLUB_RED_INACTIVE)) 3858 return 0; 3859 } 3860 3861 for_each_object(p, s, addr, page->objects) 3862 if (!test_bit(slab_index(p, s, addr), map)) 3863 if (!check_object(s, page, p, SLUB_RED_ACTIVE)) 3864 return 0; 3865 return 1; 3866 } 3867 3868 static void validate_slab_slab(struct kmem_cache *s, struct page *page, 3869 unsigned long *map) 3870 { 3871 slab_lock(page); 3872 validate_slab(s, page, map); 3873 slab_unlock(page); 3874 } 3875 3876 static int validate_slab_node(struct kmem_cache *s, 3877 struct kmem_cache_node *n, unsigned long *map) 3878 { 3879 unsigned long count = 0; 3880 struct page *page; 3881 unsigned long flags; 3882 3883 spin_lock_irqsave(&n->list_lock, flags); 3884 3885 list_for_each_entry(page, &n->partial, lru) { 3886 validate_slab_slab(s, page, map); 3887 count++; 3888 } 3889 if (count != n->nr_partial) 3890 printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " 3891 "counter=%ld\n", s->name, count, n->nr_partial); 3892 3893 if (!(s->flags & SLAB_STORE_USER)) 3894 goto out; 3895 3896 list_for_each_entry(page, &n->full, lru) { 3897 validate_slab_slab(s, page, map); 3898 count++; 3899 } 3900 if (count != atomic_long_read(&n->nr_slabs)) 3901 printk(KERN_ERR "SLUB: %s %ld slabs counted but " 3902 "counter=%ld\n", s->name, count, 3903 atomic_long_read(&n->nr_slabs)); 3904 3905 out: 3906 spin_unlock_irqrestore(&n->list_lock, flags); 3907 return count; 3908 } 3909 3910 static long validate_slab_cache(struct kmem_cache *s) 3911 { 3912 int node; 3913 unsigned long count = 0; 3914 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * 3915 sizeof(unsigned long), GFP_KERNEL); 3916 3917 if (!map) 3918 return -ENOMEM; 3919 3920 flush_all(s); 3921 for_each_node_state(node, N_NORMAL_MEMORY) { 3922 struct kmem_cache_node *n = get_node(s, node); 3923 3924 count += validate_slab_node(s, n, map); 3925 } 3926 kfree(map); 3927 return count; 3928 } 3929 /* 3930 * Generate lists of code addresses where slabcache objects are allocated 3931 * and freed. 
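 *
 * These lists back the alloc_calls and free_calls files under
 * /sys/kernel/slab/<cache>/ and only carry data when the cache stores
 * user tracking information (SLAB_STORE_USER, e.g. slub_debug=U).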
3932 */ 3933 3934 struct location { 3935 unsigned long count; 3936 unsigned long addr; 3937 long long sum_time; 3938 long min_time; 3939 long max_time; 3940 long min_pid; 3941 long max_pid; 3942 DECLARE_BITMAP(cpus, NR_CPUS); 3943 nodemask_t nodes; 3944 }; 3945 3946 struct loc_track { 3947 unsigned long max; 3948 unsigned long count; 3949 struct location *loc; 3950 }; 3951 3952 static void free_loc_track(struct loc_track *t) 3953 { 3954 if (t->max) 3955 free_pages((unsigned long)t->loc, 3956 get_order(sizeof(struct location) * t->max)); 3957 } 3958 3959 static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) 3960 { 3961 struct location *l; 3962 int order; 3963 3964 order = get_order(sizeof(struct location) * max); 3965 3966 l = (void *)__get_free_pages(flags, order); 3967 if (!l) 3968 return 0; 3969 3970 if (t->count) { 3971 memcpy(l, t->loc, sizeof(struct location) * t->count); 3972 free_loc_track(t); 3973 } 3974 t->max = max; 3975 t->loc = l; 3976 return 1; 3977 } 3978 3979 static int add_location(struct loc_track *t, struct kmem_cache *s, 3980 const struct track *track) 3981 { 3982 long start, end, pos; 3983 struct location *l; 3984 unsigned long caddr; 3985 unsigned long age = jiffies - track->when; 3986 3987 start = -1; 3988 end = t->count; 3989 3990 for ( ; ; ) { 3991 pos = start + (end - start + 1) / 2; 3992 3993 /* 3994 * There is nothing at "end". If we end up there 3995 * we need to add something to before end. 3996 */ 3997 if (pos == end) 3998 break; 3999 4000 caddr = t->loc[pos].addr; 4001 if (track->addr == caddr) { 4002 4003 l = &t->loc[pos]; 4004 l->count++; 4005 if (track->when) { 4006 l->sum_time += age; 4007 if (age < l->min_time) 4008 l->min_time = age; 4009 if (age > l->max_time) 4010 l->max_time = age; 4011 4012 if (track->pid < l->min_pid) 4013 l->min_pid = track->pid; 4014 if (track->pid > l->max_pid) 4015 l->max_pid = track->pid; 4016 4017 cpumask_set_cpu(track->cpu, 4018 to_cpumask(l->cpus)); 4019 } 4020 node_set(page_to_nid(virt_to_page(track)), l->nodes); 4021 return 1; 4022 } 4023 4024 if (track->addr < caddr) 4025 end = pos; 4026 else 4027 start = pos; 4028 } 4029 4030 /* 4031 * Not found. Insert new tracking element. 
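	 * The array is kept sorted by call site address, so the binary
	 * search above is O(log n) and the insertion below shifts the
	 * tail of the array up by one slot with memmove().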
4032 */ 4033 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC)) 4034 return 0; 4035 4036 l = t->loc + pos; 4037 if (pos < t->count) 4038 memmove(l + 1, l, 4039 (t->count - pos) * sizeof(struct location)); 4040 t->count++; 4041 l->count = 1; 4042 l->addr = track->addr; 4043 l->sum_time = age; 4044 l->min_time = age; 4045 l->max_time = age; 4046 l->min_pid = track->pid; 4047 l->max_pid = track->pid; 4048 cpumask_clear(to_cpumask(l->cpus)); 4049 cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); 4050 nodes_clear(l->nodes); 4051 node_set(page_to_nid(virt_to_page(track)), l->nodes); 4052 return 1; 4053 } 4054 4055 static void process_slab(struct loc_track *t, struct kmem_cache *s, 4056 struct page *page, enum track_item alloc, 4057 unsigned long *map) 4058 { 4059 void *addr = page_address(page); 4060 void *p; 4061 4062 bitmap_zero(map, page->objects); 4063 get_map(s, page, map); 4064 4065 for_each_object(p, s, addr, page->objects) 4066 if (!test_bit(slab_index(p, s, addr), map)) 4067 add_location(t, s, get_track(s, p, alloc)); 4068 } 4069 4070 static int list_locations(struct kmem_cache *s, char *buf, 4071 enum track_item alloc) 4072 { 4073 int len = 0; 4074 unsigned long i; 4075 struct loc_track t = { 0, 0, NULL }; 4076 int node; 4077 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * 4078 sizeof(unsigned long), GFP_KERNEL); 4079 4080 if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), 4081 GFP_TEMPORARY)) { 4082 kfree(map); 4083 return sprintf(buf, "Out of memory\n"); 4084 } 4085 /* Push back cpu slabs */ 4086 flush_all(s); 4087 4088 for_each_node_state(node, N_NORMAL_MEMORY) { 4089 struct kmem_cache_node *n = get_node(s, node); 4090 unsigned long flags; 4091 struct page *page; 4092 4093 if (!atomic_long_read(&n->nr_slabs)) 4094 continue; 4095 4096 spin_lock_irqsave(&n->list_lock, flags); 4097 list_for_each_entry(page, &n->partial, lru) 4098 process_slab(&t, s, page, alloc, map); 4099 list_for_each_entry(page, &n->full, lru) 4100 process_slab(&t, s, page, alloc, map); 4101 spin_unlock_irqrestore(&n->list_lock, flags); 4102 } 4103 4104 for (i = 0; i < t.count; i++) { 4105 struct location *l = &t.loc[i]; 4106 4107 if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) 4108 break; 4109 len += sprintf(buf + len, "%7ld ", l->count); 4110 4111 if (l->addr) 4112 len += sprintf(buf + len, "%pS", (void *)l->addr); 4113 else 4114 len += sprintf(buf + len, "<not-available>"); 4115 4116 if (l->sum_time != l->min_time) { 4117 len += sprintf(buf + len, " age=%ld/%ld/%ld", 4118 l->min_time, 4119 (long)div_u64(l->sum_time, l->count), 4120 l->max_time); 4121 } else 4122 len += sprintf(buf + len, " age=%ld", 4123 l->min_time); 4124 4125 if (l->min_pid != l->max_pid) 4126 len += sprintf(buf + len, " pid=%ld-%ld", 4127 l->min_pid, l->max_pid); 4128 else 4129 len += sprintf(buf + len, " pid=%ld", 4130 l->min_pid); 4131 4132 if (num_online_cpus() > 1 && 4133 !cpumask_empty(to_cpumask(l->cpus)) && 4134 len < PAGE_SIZE - 60) { 4135 len += sprintf(buf + len, " cpus="); 4136 len += cpulist_scnprintf(buf + len, 4137 PAGE_SIZE - len - 50, 4138 to_cpumask(l->cpus)); 4139 } 4140 4141 if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && 4142 len < PAGE_SIZE - 60) { 4143 len += sprintf(buf + len, " nodes="); 4144 len += nodelist_scnprintf(buf + len, 4145 PAGE_SIZE - len - 50, 4146 l->nodes); 4147 } 4148 4149 len += sprintf(buf + len, "\n"); 4150 } 4151 4152 free_loc_track(&t); 4153 kfree(map); 4154 if (!t.count) 4155 len += sprintf(buf, "No data\n"); 4156 return len; 4157 } 4158 
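/*
 * Output of list_locations() as seen through sysfs, one location per
 * line (illustrative example only, the numbers are made up):
 *
 *	   4382 alloc_skb+0x38/0x1a0 age=12/523/20443 pid=1-1742 cpus=0-3 nodes=0
 *
 * i.e. object count, call site, min/avg/max age in jiffies, the pid
 * range and the cpus/nodes the requests came from.
 */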
#endif 4159 4160 #ifdef SLUB_RESILIENCY_TEST 4161 static void resiliency_test(void) 4162 { 4163 u8 *p; 4164 4165 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); 4166 4167 printk(KERN_ERR "SLUB resiliency testing\n"); 4168 printk(KERN_ERR "-----------------------\n"); 4169 printk(KERN_ERR "A. Corruption after allocation\n"); 4170 4171 p = kzalloc(16, GFP_KERNEL); 4172 p[16] = 0x12; 4173 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" 4174 " 0x12->0x%p\n\n", p + 16); 4175 4176 validate_slab_cache(kmalloc_caches[4]); 4177 4178 /* Hmmm... The next two are dangerous */ 4179 p = kzalloc(32, GFP_KERNEL); 4180 p[32 + sizeof(void *)] = 0x34; 4181 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" 4182 " 0x34 -> 0x%p\n", p); 4183 printk(KERN_ERR 4184 "If allocated object is overwritten then not detectable\n\n"); 4185 4186 validate_slab_cache(kmalloc_caches[5]); 4187 p = kzalloc(64, GFP_KERNEL); 4188 p += 64 + (get_cycles() & 0xff) * sizeof(void *); 4189 *p = 0x56; 4190 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", 4191 p); 4192 printk(KERN_ERR 4193 "If allocated object is overwritten then not detectable\n\n"); 4194 validate_slab_cache(kmalloc_caches[6]); 4195 4196 printk(KERN_ERR "\nB. Corruption after free\n"); 4197 p = kzalloc(128, GFP_KERNEL); 4198 kfree(p); 4199 *p = 0x78; 4200 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); 4201 validate_slab_cache(kmalloc_caches[7]); 4202 4203 p = kzalloc(256, GFP_KERNEL); 4204 kfree(p); 4205 p[50] = 0x9a; 4206 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", 4207 p); 4208 validate_slab_cache(kmalloc_caches[8]); 4209 4210 p = kzalloc(512, GFP_KERNEL); 4211 kfree(p); 4212 p[512] = 0xab; 4213 printk(KERN_ERR "\n3.
kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); 4214 validate_slab_cache(kmalloc_caches[9]); 4215 } 4216 #else 4217 #ifdef CONFIG_SYSFS 4218 static void resiliency_test(void) {}; 4219 #endif 4220 #endif 4221 4222 #ifdef CONFIG_SYSFS 4223 enum slab_stat_type { 4224 SL_ALL, /* All slabs */ 4225 SL_PARTIAL, /* Only partially allocated slabs */ 4226 SL_CPU, /* Only slabs used for cpu caches */ 4227 SL_OBJECTS, /* Determine allocated objects not slabs */ 4228 SL_TOTAL /* Determine object capacity not slabs */ 4229 }; 4230 4231 #define SO_ALL (1 << SL_ALL) 4232 #define SO_PARTIAL (1 << SL_PARTIAL) 4233 #define SO_CPU (1 << SL_CPU) 4234 #define SO_OBJECTS (1 << SL_OBJECTS) 4235 #define SO_TOTAL (1 << SL_TOTAL) 4236 4237 static ssize_t show_slab_objects(struct kmem_cache *s, 4238 char *buf, unsigned long flags) 4239 { 4240 unsigned long total = 0; 4241 int node; 4242 int x; 4243 unsigned long *nodes; 4244 4245 nodes = kzalloc(sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); 4246 if (!nodes) 4247 return -ENOMEM; 4248 4249 if (flags & SO_CPU) { 4250 int cpu; 4251 4252 for_each_possible_cpu(cpu) { 4253 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, 4254 cpu); 4255 int node; 4256 struct page *page; 4257 4258 page = ACCESS_ONCE(c->page); 4259 if (!page) 4260 continue; 4261 4262 node = page_to_nid(page); 4263 if (flags & SO_TOTAL) 4264 x = page->objects; 4265 else if (flags & SO_OBJECTS) 4266 x = page->inuse; 4267 else 4268 x = 1; 4269 4270 total += x; 4271 nodes[node] += x; 4272 4273 page = ACCESS_ONCE(c->partial); 4274 if (page) { 4275 x = page->pobjects; 4276 total += x; 4277 nodes[node] += x; 4278 } 4279 } 4280 } 4281 4282 lock_memory_hotplug(); 4283 #ifdef CONFIG_SLUB_DEBUG 4284 if (flags & SO_ALL) { 4285 for_each_node_state(node, N_NORMAL_MEMORY) { 4286 struct kmem_cache_node *n = get_node(s, node); 4287 4288 if (flags & SO_TOTAL) 4289 x = atomic_long_read(&n->total_objects); 4290 else if (flags & SO_OBJECTS) 4291 x = atomic_long_read(&n->total_objects) - 4292 count_partial(n, count_free); 4293 else 4294 x = atomic_long_read(&n->nr_slabs); 4295 total += x; 4296 nodes[node] += x; 4297 } 4298 4299 } else 4300 #endif 4301 if (flags & SO_PARTIAL) { 4302 for_each_node_state(node, N_NORMAL_MEMORY) { 4303 struct kmem_cache_node *n = get_node(s, node); 4304 4305 if (flags & SO_TOTAL) 4306 x = count_partial(n, count_total); 4307 else if (flags & SO_OBJECTS) 4308 x = count_partial(n, count_inuse); 4309 else 4310 x = n->nr_partial; 4311 total += x; 4312 nodes[node] += x; 4313 } 4314 } 4315 x = sprintf(buf, "%lu", total); 4316 #ifdef CONFIG_NUMA 4317 for_each_node_state(node, N_NORMAL_MEMORY) 4318 if (nodes[node]) 4319 x += sprintf(buf + x, " N%d=%lu", 4320 node, nodes[node]); 4321 #endif 4322 unlock_memory_hotplug(); 4323 kfree(nodes); 4324 return x + sprintf(buf + x, "\n"); 4325 } 4326 4327 #ifdef CONFIG_SLUB_DEBUG 4328 static int any_slab_objects(struct kmem_cache *s) 4329 { 4330 int node; 4331 4332 for_each_online_node(node) { 4333 struct kmem_cache_node *n = get_node(s, node); 4334 4335 if (!n) 4336 continue; 4337 4338 if (atomic_long_read(&n->total_objects)) 4339 return 1; 4340 } 4341 return 0; 4342 } 4343 #endif 4344 4345 #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 4346 #define to_slab(n) container_of(n, struct kmem_cache, kobj) 4347 4348 struct slab_attribute { 4349 struct attribute attr; 4350 ssize_t (*show)(struct kmem_cache *s, char *buf); 4351 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); 4352 }; 4353 4354 #define SLAB_ATTR_RO(_name) \ 4355 
static struct slab_attribute _name##_attr = \ 4356 __ATTR(_name, 0400, _name##_show, NULL) 4357 4358 #define SLAB_ATTR(_name) \ 4359 static struct slab_attribute _name##_attr = \ 4360 __ATTR(_name, 0600, _name##_show, _name##_store) 4361 4362 static ssize_t slab_size_show(struct kmem_cache *s, char *buf) 4363 { 4364 return sprintf(buf, "%d\n", s->size); 4365 } 4366 SLAB_ATTR_RO(slab_size); 4367 4368 static ssize_t align_show(struct kmem_cache *s, char *buf) 4369 { 4370 return sprintf(buf, "%d\n", s->align); 4371 } 4372 SLAB_ATTR_RO(align); 4373 4374 static ssize_t object_size_show(struct kmem_cache *s, char *buf) 4375 { 4376 return sprintf(buf, "%d\n", s->object_size); 4377 } 4378 SLAB_ATTR_RO(object_size); 4379 4380 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) 4381 { 4382 return sprintf(buf, "%d\n", oo_objects(s->oo)); 4383 } 4384 SLAB_ATTR_RO(objs_per_slab); 4385 4386 static ssize_t order_store(struct kmem_cache *s, 4387 const char *buf, size_t length) 4388 { 4389 unsigned long order; 4390 int err; 4391 4392 err = kstrtoul(buf, 10, &order); 4393 if (err) 4394 return err; 4395 4396 if (order > slub_max_order || order < slub_min_order) 4397 return -EINVAL; 4398 4399 calculate_sizes(s, order); 4400 return length; 4401 } 4402 4403 static ssize_t order_show(struct kmem_cache *s, char *buf) 4404 { 4405 return sprintf(buf, "%d\n", oo_order(s->oo)); 4406 } 4407 SLAB_ATTR(order); 4408 4409 static ssize_t min_partial_show(struct kmem_cache *s, char *buf) 4410 { 4411 return sprintf(buf, "%lu\n", s->min_partial); 4412 } 4413 4414 static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, 4415 size_t length) 4416 { 4417 unsigned long min; 4418 int err; 4419 4420 err = kstrtoul(buf, 10, &min); 4421 if (err) 4422 return err; 4423 4424 set_min_partial(s, min); 4425 return length; 4426 } 4427 SLAB_ATTR(min_partial); 4428 4429 static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) 4430 { 4431 return sprintf(buf, "%u\n", s->cpu_partial); 4432 } 4433 4434 static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, 4435 size_t length) 4436 { 4437 unsigned long objects; 4438 int err; 4439 4440 err = kstrtoul(buf, 10, &objects); 4441 if (err) 4442 return err; 4443 if (objects && !kmem_cache_has_cpu_partial(s)) 4444 return -EINVAL; 4445 4446 s->cpu_partial = objects; 4447 flush_all(s); 4448 return length; 4449 } 4450 SLAB_ATTR(cpu_partial); 4451 4452 static ssize_t ctor_show(struct kmem_cache *s, char *buf) 4453 { 4454 if (!s->ctor) 4455 return 0; 4456 return sprintf(buf, "%pS\n", s->ctor); 4457 } 4458 SLAB_ATTR_RO(ctor); 4459 4460 static ssize_t aliases_show(struct kmem_cache *s, char *buf) 4461 { 4462 return sprintf(buf, "%d\n", s->refcount - 1); 4463 } 4464 SLAB_ATTR_RO(aliases); 4465 4466 static ssize_t partial_show(struct kmem_cache *s, char *buf) 4467 { 4468 return show_slab_objects(s, buf, SO_PARTIAL); 4469 } 4470 SLAB_ATTR_RO(partial); 4471 4472 static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) 4473 { 4474 return show_slab_objects(s, buf, SO_CPU); 4475 } 4476 SLAB_ATTR_RO(cpu_slabs); 4477 4478 static ssize_t objects_show(struct kmem_cache *s, char *buf) 4479 { 4480 return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); 4481 } 4482 SLAB_ATTR_RO(objects); 4483 4484 static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) 4485 { 4486 return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS); 4487 } 4488 SLAB_ATTR_RO(objects_partial); 4489 4490 static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) 4491 { 4492 int 
objects = 0; 4493 int pages = 0; 4494 int cpu; 4495 int len; 4496 4497 for_each_online_cpu(cpu) { 4498 struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; 4499 4500 if (page) { 4501 pages += page->pages; 4502 objects += page->pobjects; 4503 } 4504 } 4505 4506 len = sprintf(buf, "%d(%d)", objects, pages); 4507 4508 #ifdef CONFIG_SMP 4509 for_each_online_cpu(cpu) { 4510 struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; 4511 4512 if (page && len < PAGE_SIZE - 20) 4513 len += sprintf(buf + len, " C%d=%d(%d)", cpu, 4514 page->pobjects, page->pages); 4515 } 4516 #endif 4517 return len + sprintf(buf + len, "\n"); 4518 } 4519 SLAB_ATTR_RO(slabs_cpu_partial); 4520 4521 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4522 { 4523 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4524 } 4525 4526 static ssize_t reclaim_account_store(struct kmem_cache *s, 4527 const char *buf, size_t length) 4528 { 4529 s->flags &= ~SLAB_RECLAIM_ACCOUNT; 4530 if (buf[0] == '1') 4531 s->flags |= SLAB_RECLAIM_ACCOUNT; 4532 return length; 4533 } 4534 SLAB_ATTR(reclaim_account); 4535 4536 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) 4537 { 4538 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); 4539 } 4540 SLAB_ATTR_RO(hwcache_align); 4541 4542 #ifdef CONFIG_ZONE_DMA 4543 static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) 4544 { 4545 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); 4546 } 4547 SLAB_ATTR_RO(cache_dma); 4548 #endif 4549 4550 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 4551 { 4552 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); 4553 } 4554 SLAB_ATTR_RO(destroy_by_rcu); 4555 4556 static ssize_t reserved_show(struct kmem_cache *s, char *buf) 4557 { 4558 return sprintf(buf, "%d\n", s->reserved); 4559 } 4560 SLAB_ATTR_RO(reserved); 4561 4562 #ifdef CONFIG_SLUB_DEBUG 4563 static ssize_t slabs_show(struct kmem_cache *s, char *buf) 4564 { 4565 return show_slab_objects(s, buf, SO_ALL); 4566 } 4567 SLAB_ATTR_RO(slabs); 4568 4569 static ssize_t total_objects_show(struct kmem_cache *s, char *buf) 4570 { 4571 return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); 4572 } 4573 SLAB_ATTR_RO(total_objects); 4574 4575 static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) 4576 { 4577 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 4578 } 4579 4580 static ssize_t sanity_checks_store(struct kmem_cache *s, 4581 const char *buf, size_t length) 4582 { 4583 s->flags &= ~SLAB_DEBUG_FREE; 4584 if (buf[0] == '1') { 4585 s->flags &= ~__CMPXCHG_DOUBLE; 4586 s->flags |= SLAB_DEBUG_FREE; 4587 } 4588 return length; 4589 } 4590 SLAB_ATTR(sanity_checks); 4591 4592 static ssize_t trace_show(struct kmem_cache *s, char *buf) 4593 { 4594 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); 4595 } 4596 4597 static ssize_t trace_store(struct kmem_cache *s, const char *buf, 4598 size_t length) 4599 { 4600 s->flags &= ~SLAB_TRACE; 4601 if (buf[0] == '1') { 4602 s->flags &= ~__CMPXCHG_DOUBLE; 4603 s->flags |= SLAB_TRACE; 4604 } 4605 return length; 4606 } 4607 SLAB_ATTR(trace); 4608 4609 static ssize_t red_zone_show(struct kmem_cache *s, char *buf) 4610 { 4611 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); 4612 } 4613 4614 static ssize_t red_zone_store(struct kmem_cache *s, 4615 const char *buf, size_t length) 4616 { 4617 if (any_slab_objects(s)) 4618 return -EBUSY; 4619 4620 s->flags &= ~SLAB_RED_ZONE; 4621 if (buf[0] == '1') { 4622 s->flags &= ~__CMPXCHG_DOUBLE; 
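		/*
		 * Clearing __CMPXCHG_DOUBLE above drops the lockless
		 * cmpxchg_double fast path: once red zoning (or any other
		 * debug check) is enabled, the cache has to go through the
		 * slower slab_lock protected paths.  The other debug
		 * attribute stores follow the same pattern.
		 */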
4623 s->flags |= SLAB_RED_ZONE; 4624 } 4625 calculate_sizes(s, -1); 4626 return length; 4627 } 4628 SLAB_ATTR(red_zone); 4629 4630 static ssize_t poison_show(struct kmem_cache *s, char *buf) 4631 { 4632 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); 4633 } 4634 4635 static ssize_t poison_store(struct kmem_cache *s, 4636 const char *buf, size_t length) 4637 { 4638 if (any_slab_objects(s)) 4639 return -EBUSY; 4640 4641 s->flags &= ~SLAB_POISON; 4642 if (buf[0] == '1') { 4643 s->flags &= ~__CMPXCHG_DOUBLE; 4644 s->flags |= SLAB_POISON; 4645 } 4646 calculate_sizes(s, -1); 4647 return length; 4648 } 4649 SLAB_ATTR(poison); 4650 4651 static ssize_t store_user_show(struct kmem_cache *s, char *buf) 4652 { 4653 return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); 4654 } 4655 4656 static ssize_t store_user_store(struct kmem_cache *s, 4657 const char *buf, size_t length) 4658 { 4659 if (any_slab_objects(s)) 4660 return -EBUSY; 4661 4662 s->flags &= ~SLAB_STORE_USER; 4663 if (buf[0] == '1') { 4664 s->flags &= ~__CMPXCHG_DOUBLE; 4665 s->flags |= SLAB_STORE_USER; 4666 } 4667 calculate_sizes(s, -1); 4668 return length; 4669 } 4670 SLAB_ATTR(store_user); 4671 4672 static ssize_t validate_show(struct kmem_cache *s, char *buf) 4673 { 4674 return 0; 4675 } 4676 4677 static ssize_t validate_store(struct kmem_cache *s, 4678 const char *buf, size_t length) 4679 { 4680 int ret = -EINVAL; 4681 4682 if (buf[0] == '1') { 4683 ret = validate_slab_cache(s); 4684 if (ret >= 0) 4685 ret = length; 4686 } 4687 return ret; 4688 } 4689 SLAB_ATTR(validate); 4690 4691 static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) 4692 { 4693 if (!(s->flags & SLAB_STORE_USER)) 4694 return -ENOSYS; 4695 return list_locations(s, buf, TRACK_ALLOC); 4696 } 4697 SLAB_ATTR_RO(alloc_calls); 4698 4699 static ssize_t free_calls_show(struct kmem_cache *s, char *buf) 4700 { 4701 if (!(s->flags & SLAB_STORE_USER)) 4702 return -ENOSYS; 4703 return list_locations(s, buf, TRACK_FREE); 4704 } 4705 SLAB_ATTR_RO(free_calls); 4706 #endif /* CONFIG_SLUB_DEBUG */ 4707 4708 #ifdef CONFIG_FAILSLAB 4709 static ssize_t failslab_show(struct kmem_cache *s, char *buf) 4710 { 4711 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); 4712 } 4713 4714 static ssize_t failslab_store(struct kmem_cache *s, const char *buf, 4715 size_t length) 4716 { 4717 s->flags &= ~SLAB_FAILSLAB; 4718 if (buf[0] == '1') 4719 s->flags |= SLAB_FAILSLAB; 4720 return length; 4721 } 4722 SLAB_ATTR(failslab); 4723 #endif 4724 4725 static ssize_t shrink_show(struct kmem_cache *s, char *buf) 4726 { 4727 return 0; 4728 } 4729 4730 static ssize_t shrink_store(struct kmem_cache *s, 4731 const char *buf, size_t length) 4732 { 4733 if (buf[0] == '1') { 4734 int rc = kmem_cache_shrink(s); 4735 4736 if (rc) 4737 return rc; 4738 } else 4739 return -EINVAL; 4740 return length; 4741 } 4742 SLAB_ATTR(shrink); 4743 4744 #ifdef CONFIG_NUMA 4745 static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) 4746 { 4747 return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10); 4748 } 4749 4750 static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, 4751 const char *buf, size_t length) 4752 { 4753 unsigned long ratio; 4754 int err; 4755 4756 err = kstrtoul(buf, 10, &ratio); 4757 if (err) 4758 return err; 4759 4760 if (ratio <= 100) 4761 s->remote_node_defrag_ratio = ratio * 10; 4762 4763 return length; 4764 } 4765 SLAB_ATTR(remote_node_defrag_ratio); 4766 #endif 4767 4768 #ifdef CONFIG_SLUB_STATS 4769 static int show_stat(struct kmem_cache 
*s, char *buf, enum stat_item si) 4770 { 4771 unsigned long sum = 0; 4772 int cpu; 4773 int len; 4774 int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); 4775 4776 if (!data) 4777 return -ENOMEM; 4778 4779 for_each_online_cpu(cpu) { 4780 unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; 4781 4782 data[cpu] = x; 4783 sum += x; 4784 } 4785 4786 len = sprintf(buf, "%lu", sum); 4787 4788 #ifdef CONFIG_SMP 4789 for_each_online_cpu(cpu) { 4790 if (data[cpu] && len < PAGE_SIZE - 20) 4791 len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]); 4792 } 4793 #endif 4794 kfree(data); 4795 return len + sprintf(buf + len, "\n"); 4796 } 4797 4798 static void clear_stat(struct kmem_cache *s, enum stat_item si) 4799 { 4800 int cpu; 4801 4802 for_each_online_cpu(cpu) 4803 per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; 4804 } 4805 4806 #define STAT_ATTR(si, text) \ 4807 static ssize_t text##_show(struct kmem_cache *s, char *buf) \ 4808 { \ 4809 return show_stat(s, buf, si); \ 4810 } \ 4811 static ssize_t text##_store(struct kmem_cache *s, \ 4812 const char *buf, size_t length) \ 4813 { \ 4814 if (buf[0] != '0') \ 4815 return -EINVAL; \ 4816 clear_stat(s, si); \ 4817 return length; \ 4818 } \ 4819 SLAB_ATTR(text); \ 4820 4821 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); 4822 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); 4823 STAT_ATTR(FREE_FASTPATH, free_fastpath); 4824 STAT_ATTR(FREE_SLOWPATH, free_slowpath); 4825 STAT_ATTR(FREE_FROZEN, free_frozen); 4826 STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); 4827 STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); 4828 STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 4829 STAT_ATTR(ALLOC_SLAB, alloc_slab); 4830 STAT_ATTR(ALLOC_REFILL, alloc_refill); 4831 STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); 4832 STAT_ATTR(FREE_SLAB, free_slab); 4833 STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); 4834 STAT_ATTR(DEACTIVATE_FULL, deactivate_full); 4835 STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); 4836 STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); 4837 STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); 4838 STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); 4839 STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); 4840 STAT_ATTR(ORDER_FALLBACK, order_fallback); 4841 STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); 4842 STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); 4843 STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); 4844 STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); 4845 STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); 4846 STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); 4847 #endif 4848 4849 static struct attribute *slab_attrs[] = { 4850 &slab_size_attr.attr, 4851 &object_size_attr.attr, 4852 &objs_per_slab_attr.attr, 4853 &order_attr.attr, 4854 &min_partial_attr.attr, 4855 &cpu_partial_attr.attr, 4856 &objects_attr.attr, 4857 &objects_partial_attr.attr, 4858 &partial_attr.attr, 4859 &cpu_slabs_attr.attr, 4860 &ctor_attr.attr, 4861 &aliases_attr.attr, 4862 &align_attr.attr, 4863 &hwcache_align_attr.attr, 4864 &reclaim_account_attr.attr, 4865 &destroy_by_rcu_attr.attr, 4866 &shrink_attr.attr, 4867 &reserved_attr.attr, 4868 &slabs_cpu_partial_attr.attr, 4869 #ifdef CONFIG_SLUB_DEBUG 4870 &total_objects_attr.attr, 4871 &slabs_attr.attr, 4872 &sanity_checks_attr.attr, 4873 &trace_attr.attr, 4874 &red_zone_attr.attr, 4875 &poison_attr.attr, 4876 &store_user_attr.attr, 4877 &validate_attr.attr, 4878 &alloc_calls_attr.attr, 4879 &free_calls_attr.attr, 4880 #endif 4881 #ifdef CONFIG_ZONE_DMA 4882 &cache_dma_attr.attr, 4883 #endif 4884 #ifdef 
CONFIG_NUMA 4885 &remote_node_defrag_ratio_attr.attr, 4886 #endif 4887 #ifdef CONFIG_SLUB_STATS 4888 &alloc_fastpath_attr.attr, 4889 &alloc_slowpath_attr.attr, 4890 &free_fastpath_attr.attr, 4891 &free_slowpath_attr.attr, 4892 &free_frozen_attr.attr, 4893 &free_add_partial_attr.attr, 4894 &free_remove_partial_attr.attr, 4895 &alloc_from_partial_attr.attr, 4896 &alloc_slab_attr.attr, 4897 &alloc_refill_attr.attr, 4898 &alloc_node_mismatch_attr.attr, 4899 &free_slab_attr.attr, 4900 &cpuslab_flush_attr.attr, 4901 &deactivate_full_attr.attr, 4902 &deactivate_empty_attr.attr, 4903 &deactivate_to_head_attr.attr, 4904 &deactivate_to_tail_attr.attr, 4905 &deactivate_remote_frees_attr.attr, 4906 &deactivate_bypass_attr.attr, 4907 &order_fallback_attr.attr, 4908 &cmpxchg_double_fail_attr.attr, 4909 &cmpxchg_double_cpu_fail_attr.attr, 4910 &cpu_partial_alloc_attr.attr, 4911 &cpu_partial_free_attr.attr, 4912 &cpu_partial_node_attr.attr, 4913 &cpu_partial_drain_attr.attr, 4914 #endif 4915 #ifdef CONFIG_FAILSLAB 4916 &failslab_attr.attr, 4917 #endif 4918 4919 NULL 4920 }; 4921 4922 static struct attribute_group slab_attr_group = { 4923 .attrs = slab_attrs, 4924 }; 4925 4926 static ssize_t slab_attr_show(struct kobject *kobj, 4927 struct attribute *attr, 4928 char *buf) 4929 { 4930 struct slab_attribute *attribute; 4931 struct kmem_cache *s; 4932 int err; 4933 4934 attribute = to_slab_attr(attr); 4935 s = to_slab(kobj); 4936 4937 if (!attribute->show) 4938 return -EIO; 4939 4940 err = attribute->show(s, buf); 4941 4942 return err; 4943 } 4944 4945 static ssize_t slab_attr_store(struct kobject *kobj, 4946 struct attribute *attr, 4947 const char *buf, size_t len) 4948 { 4949 struct slab_attribute *attribute; 4950 struct kmem_cache *s; 4951 int err; 4952 4953 attribute = to_slab_attr(attr); 4954 s = to_slab(kobj); 4955 4956 if (!attribute->store) 4957 return -EIO; 4958 4959 err = attribute->store(s, buf, len); 4960 #ifdef CONFIG_MEMCG_KMEM 4961 if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { 4962 int i; 4963 4964 mutex_lock(&slab_mutex); 4965 if (s->max_attr_size < len) 4966 s->max_attr_size = len; 4967 4968 /* 4969 * This is a best effort propagation, so this function's return 4970 * value will be determined by the parent cache only. This is 4971 * basically because not all attributes will have a well 4972 * defined semantics for rollbacks - most of the actions will 4973 * have permanent effects. 4974 * 4975 * Returning the error value of any of the children that fail 4976 * is not 100 % defined, in the sense that users seeing the 4977 * error code won't be able to know anything about the state of 4978 * the cache. 4979 * 4980 * Only returning the error code for the parent cache at least 4981 * has well defined semantics. The cache being written to 4982 * directly either failed or succeeded, in which case we loop 4983 * through the descendants with best-effort propagation. 4984 */ 4985 for_each_memcg_cache_index(i) { 4986 struct kmem_cache *c = cache_from_memcg(s, i); 4987 if (c) 4988 attribute->store(c, buf, len); 4989 } 4990 mutex_unlock(&slab_mutex); 4991 } 4992 #endif 4993 return err; 4994 } 4995 4996 static void memcg_propagate_slab_attrs(struct kmem_cache *s) 4997 { 4998 #ifdef CONFIG_MEMCG_KMEM 4999 int i; 5000 char *buffer = NULL; 5001 5002 if (!is_root_cache(s)) 5003 return; 5004 5005 /* 5006 * This mean this cache had no attribute written. 
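 * (s->max_attr_size is only raised in slab_attr_store(), i.e. after at
 * least one attribute has actually been written through sysfs.)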
Therefore, no point 5007 * in copying default values around 5008 */ 5009 if (!s->max_attr_size) 5010 return; 5011 5012 for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { 5013 char mbuf[64]; 5014 char *buf; 5015 struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); 5016 5017 if (!attr || !attr->store || !attr->show) 5018 continue; 5019 5020 /* 5021 * It is really bad that we have to allocate here, so we will 5022 * do it only as a fallback. If we actually allocate, though, 5023 * we can just use the allocated buffer until the end. 5024 * 5025 * Most of the slub attributes will tend to be very small in 5026 * size, but sysfs allows buffers up to a page, so they can 5027 * theoretically happen. 5028 */ 5029 if (buffer) 5030 buf = buffer; 5031 else if (s->max_attr_size < ARRAY_SIZE(mbuf)) 5032 buf = mbuf; 5033 else { 5034 buffer = (char *) get_zeroed_page(GFP_KERNEL); 5035 if (WARN_ON(!buffer)) 5036 continue; 5037 buf = buffer; 5038 } 5039 5040 attr->show(s->memcg_params->root_cache, buf); 5041 attr->store(s, buf, strlen(buf)); 5042 } 5043 5044 if (buffer) 5045 free_page((unsigned long)buffer); 5046 #endif 5047 } 5048 5049 static const struct sysfs_ops slab_sysfs_ops = { 5050 .show = slab_attr_show, 5051 .store = slab_attr_store, 5052 }; 5053 5054 static struct kobj_type slab_ktype = { 5055 .sysfs_ops = &slab_sysfs_ops, 5056 }; 5057 5058 static int uevent_filter(struct kset *kset, struct kobject *kobj) 5059 { 5060 struct kobj_type *ktype = get_ktype(kobj); 5061 5062 if (ktype == &slab_ktype) 5063 return 1; 5064 return 0; 5065 } 5066 5067 static const struct kset_uevent_ops slab_uevent_ops = { 5068 .filter = uevent_filter, 5069 }; 5070 5071 static struct kset *slab_kset; 5072 5073 #define ID_STR_LENGTH 64 5074 5075 /* Create a unique string id for a slab cache: 5076 * 5077 * Format :[flags-]size 5078 */ 5079 static char *create_unique_id(struct kmem_cache *s) 5080 { 5081 char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); 5082 char *p = name; 5083 5084 BUG_ON(!name); 5085 5086 *p++ = ':'; 5087 /* 5088 * First flags affecting slabcache operations. We will only 5089 * get here for aliasable slabs so we do not need to support 5090 * too many flags. The flags here must cover all flags that 5091 * are matched during merging to guarantee that the id is 5092 * unique. 5093 */ 5094 if (s->flags & SLAB_CACHE_DMA) 5095 *p++ = 'd'; 5096 if (s->flags & SLAB_RECLAIM_ACCOUNT) 5097 *p++ = 'a'; 5098 if (s->flags & SLAB_DEBUG_FREE) 5099 *p++ = 'F'; 5100 if (!(s->flags & SLAB_NOTRACK)) 5101 *p++ = 't'; 5102 if (p != name + 1) 5103 *p++ = '-'; 5104 p += sprintf(p, "%07d", s->size); 5105 5106 #ifdef CONFIG_MEMCG_KMEM 5107 if (!is_root_cache(s)) 5108 p += sprintf(p, "-%08d", 5109 memcg_cache_id(s->memcg_params->memcg)); 5110 #endif 5111 5112 BUG_ON(p > name + ID_STR_LENGTH - 1); 5113 return name; 5114 } 5115 5116 static int sysfs_slab_add(struct kmem_cache *s) 5117 { 5118 int err; 5119 const char *name; 5120 int unmergeable = slab_unmergeable(s); 5121 5122 if (unmergeable) { 5123 /* 5124 * Slabcache can never be merged so we can use the name proper. 5125 * This is typically the case for debug situations. In that 5126 * case we can catch duplicate names easily. 5127 */ 5128 sysfs_remove_link(&slab_kset->kobj, s->name); 5129 name = s->name; 5130 } else { 5131 /* 5132 * Create a unique name for the slab as a target 5133 * for the symlinks. 
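 * The id comes from create_unique_id() and follows the ":[flags-]size"
 * format documented there; for example, a mergeable cache of 192-byte
 * objects without SLAB_NOTRACK would typically show up as ":t-0000192"
 * (illustrative only).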
5134 */ 5135 name = create_unique_id(s); 5136 } 5137 5138 s->kobj.kset = slab_kset; 5139 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); 5140 if (err) { 5141 kobject_put(&s->kobj); 5142 return err; 5143 } 5144 5145 err = sysfs_create_group(&s->kobj, &slab_attr_group); 5146 if (err) { 5147 kobject_del(&s->kobj); 5148 kobject_put(&s->kobj); 5149 return err; 5150 } 5151 kobject_uevent(&s->kobj, KOBJ_ADD); 5152 if (!unmergeable) { 5153 /* Setup first alias */ 5154 sysfs_slab_alias(s, s->name); 5155 kfree(name); 5156 } 5157 return 0; 5158 } 5159 5160 static void sysfs_slab_remove(struct kmem_cache *s) 5161 { 5162 if (slab_state < FULL) 5163 /* 5164 * Sysfs has not been setup yet so no need to remove the 5165 * cache from sysfs. 5166 */ 5167 return; 5168 5169 kobject_uevent(&s->kobj, KOBJ_REMOVE); 5170 kobject_del(&s->kobj); 5171 kobject_put(&s->kobj); 5172 } 5173 5174 /* 5175 * Need to buffer aliases during bootup until sysfs becomes 5176 * available lest we lose that information. 5177 */ 5178 struct saved_alias { 5179 struct kmem_cache *s; 5180 const char *name; 5181 struct saved_alias *next; 5182 }; 5183 5184 static struct saved_alias *alias_list; 5185 5186 static int sysfs_slab_alias(struct kmem_cache *s, const char *name) 5187 { 5188 struct saved_alias *al; 5189 5190 if (slab_state == FULL) { 5191 /* 5192 * If we have a leftover link then remove it. 5193 */ 5194 sysfs_remove_link(&slab_kset->kobj, name); 5195 return sysfs_create_link(&slab_kset->kobj, &s->kobj, name); 5196 } 5197 5198 al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL); 5199 if (!al) 5200 return -ENOMEM; 5201 5202 al->s = s; 5203 al->name = name; 5204 al->next = alias_list; 5205 alias_list = al; 5206 return 0; 5207 } 5208 5209 static int __init slab_sysfs_init(void) 5210 { 5211 struct kmem_cache *s; 5212 int err; 5213 5214 mutex_lock(&slab_mutex); 5215 5216 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); 5217 if (!slab_kset) { 5218 mutex_unlock(&slab_mutex); 5219 printk(KERN_ERR "Cannot register slab subsystem.\n"); 5220 return -ENOSYS; 5221 } 5222 5223 slab_state = FULL; 5224 5225 list_for_each_entry(s, &slab_caches, list) { 5226 err = sysfs_slab_add(s); 5227 if (err) 5228 printk(KERN_ERR "SLUB: Unable to add boot slab %s" 5229 " to sysfs\n", s->name); 5230 } 5231 5232 while (alias_list) { 5233 struct saved_alias *al = alias_list; 5234 5235 alias_list = alias_list->next; 5236 err = sysfs_slab_alias(al->s, al->name); 5237 if (err) 5238 printk(KERN_ERR "SLUB: Unable to add boot slab alias" 5239 " %s to sysfs\n", al->name); 5240 kfree(al); 5241 } 5242 5243 mutex_unlock(&slab_mutex); 5244 resiliency_test(); 5245 return 0; 5246 } 5247 5248 __initcall(slab_sysfs_init); 5249 #endif /* CONFIG_SYSFS */ 5250 5251 /* 5252 * The /proc/slabinfo ABI 5253 */ 5254 #ifdef CONFIG_SLABINFO 5255 void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) 5256 { 5257 unsigned long nr_slabs = 0; 5258 unsigned long nr_objs = 0; 5259 unsigned long nr_free = 0; 5260 int node; 5261 5262 for_each_online_node(node) { 5263 struct kmem_cache_node *n = get_node(s, node); 5264 5265 if (!n) 5266 continue; 5267 5268 nr_slabs += node_nr_slabs(n); 5269 nr_objs += node_nr_objs(n); 5270 nr_free += count_partial(n, count_free); 5271 } 5272 5273 sinfo->active_objs = nr_objs - nr_free; 5274 sinfo->num_objs = nr_objs; 5275 sinfo->active_slabs = nr_slabs; 5276 sinfo->num_slabs = nr_slabs; 5277 sinfo->objects_per_slab = oo_objects(s->oo); 5278 sinfo->cache_order = oo_order(s->oo); 5279 } 5280 5281 void 
slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) 5282 { 5283 } 5284 5285 ssize_t slabinfo_write(struct file *file, const char __user *buffer, 5286 size_t count, loff_t *ppos) 5287 { 5288 return -EIO; 5289 } 5290 #endif /* CONFIG_SLABINFO */ 5291
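/*
 * Illustrative only, not compiled: a minimal user-space sketch of how the
 * interfaces above are typically consumed.  The cache name "kmalloc-64" and
 * the /sys/kernel/slab location reflect a common configuration but are
 * assumptions here; /proc/slabinfo only exists with CONFIG_SLABINFO.
 */
#if 0
#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f;

	/* Read a read-only sysfs attribute (served by objs_per_slab_show()). */
	f = fopen("/sys/kernel/slab/kmalloc-64/objs_per_slab", "r");
	if (f) {
		if (fgets(line, sizeof(line), f))
			printf("objs_per_slab: %s", line);
		fclose(f);
	}

	/* Writing "1" triggers validate_store() -> validate_slab_cache(). */
	f = fopen("/sys/kernel/slab/kmalloc-64/validate", "w");
	if (f) {
		fputs("1", f);
		fclose(f);
	}

	/* The /proc/slabinfo ABI is read-only here: slabinfo_write() returns -EIO. */
	f = fopen("/proc/slabinfo", "r");
	if (f) {
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
	}
	return 0;
}
#endif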