/*
 * SLUB: A slab allocator that limits cache line use instead of queuing
 * objects in per cpu and per node lists.
 *
 * The allocator synchronizes using per slab locks or atomic operations
 * and only uses a centralized lock to manage a pool of partial slabs.
 *
 * (C) 2007 SGI, Christoph Lameter
 * (C) 2011 Linux Foundation, Christoph Lameter
 */

#include <linux/mm.h>
#include <linux/swap.h> /* struct reclaim_state */
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
#include <linux/proc_fs.h>
#include <linux/notifier.h>
#include <linux/seq_file.h>
#include <linux/kmemcheck.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/memory.h>
#include <linux/math64.h>
#include <linux/fault-inject.h>
#include <linux/stacktrace.h>
#include <linux/prefetch.h>
#include <linux/memcontrol.h>

#include <trace/events/kmem.h>

#include "internal.h"

/*
 * Lock order:
 *   1. slab_mutex (Global Mutex)
 *   2. node->list_lock
 *   3. slab_lock(page) (Only on some arches and for debugging)
 *
 *   slab_mutex
 *
 *   The role of the slab_mutex is to protect the list of all the slabs
 *   and to synchronize major metadata changes to slab cache structures.
 *
 *   The slab_lock is only used for debugging and on arches that do not
 *   have the ability to do a cmpxchg_double. It only protects the second
 *   double word in the page struct. Meaning
 *	A. page->freelist	-> List of free objects in a page
 *	B. page->counters	-> Counters of objects
 *	C. page->frozen		-> frozen state
 *
 *   If a slab is frozen then it is exempt from list management. It is not
 *   on any list. The processor that froze the slab is the one who can
 *   perform list operations on the page. Other processors may put objects
 *   onto the freelist but the processor that froze the slab is the only
 *   one that can retrieve the objects from the page's freelist.
 *
 *   The list_lock protects the partial and full list on each node and
 *   the partial slab counter. If taken then no new slabs may be added or
 *   removed from the lists, nor may the number of partial slabs be modified.
 *   (Note that the total number of slabs is an atomic value that may be
 *   modified without taking the list lock).
 *
 *   The list_lock is a centralized lock and thus we avoid taking it as
 *   much as possible. As long as SLUB does not have to handle partial
 *   slabs, operations can continue without any centralized lock. F.e.
 *   allocating a long series of objects that fill up slabs does not require
 *   the list lock.
 *   Interrupts are disabled during allocation and deallocation in order to
 *   make the slab allocator safe to use in the context of an irq. In addition
 *   interrupts are disabled to ensure that the processor does not change
 *   while handling per_cpu slabs, due to kernel preemption.
 *
 * SLUB assigns one slab for allocation to each processor.
 * Allocations only occur from these slabs called cpu slabs.
 *
 * Slabs with free elements are kept on a partial list and during regular
 * operations no list for full slabs is used. If an object in a full slab is
 * freed then the slab will show up again on the partial lists.
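 *
 * A minimal illustrative sketch only (it condenses the pattern used by
 * __cmpxchg_double_slab() and deactivate_slab() further down in this file)
 * of how the freelist and the counters word of a slab page are updated as
 * one atomic pair when an object is pushed back onto the page freelist:
 *
 *	struct page new;
 *	do {
 *		prior = page->freelist;
 *		counters = page->counters;
 *		set_freepointer(s, object, prior);
 *		new.counters = counters;
 *		new.inuse--;
 *	} while (!cmpxchg_double_slab(s, page, prior, counters,
 *					object, new.counters, "sketch"));
 *
 * Because both words are replaced in a single step, the freelist and the
 * object counts can never be observed out of sync.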
 * We track full slabs for debugging purposes though because otherwise we
 * cannot scan all objects.
 *
 * Slabs are freed when they become empty. Teardown and setup is
 * minimal so we rely on the page allocator's per cpu caches for
 * fast frees and allocs.
 *
 * Overloading of page flags that are otherwise used for LRU management.
 *
 * PageActive		The slab is frozen and exempt from list processing.
 *			This means that the slab is dedicated to a purpose
 *			such as satisfying allocations for a specific
 *			processor. Objects may be freed in the slab while
 *			it is frozen but slab_free will then skip the usual
 *			list operations. It is up to the processor holding
 *			the slab to integrate the slab into the slab lists
 *			when the slab is no longer needed.
 *
 *			One use of this flag is to mark slabs that are
 *			used for allocations. Then such a slab becomes a cpu
 *			slab. The cpu slab may be equipped with an additional
 *			freelist that allows lockless access to
 *			free objects in addition to the regular freelist
 *			that requires the slab lock.
 *
 * PageError		Slab requires special handling due to debug
 *			options set. This moves slab handling out of
 *			the fast path and disables lockless freelists.
 */

static inline int kmem_cache_debug(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_DEBUG
	return unlikely(s->flags & SLAB_DEBUG_FLAGS);
#else
	return 0;
#endif
}

static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
	return !kmem_cache_debug(s);
#else
	return false;
#endif
}

/*
 * Issues still to be resolved:
 *
 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 *
 * - Variable sizing of the per node arrays
 */

/* Enable to test recovery from slab corruption on boot */
#undef SLUB_RESILIENCY_TEST

/* Enable to log cmpxchg failures */
#undef SLUB_DEBUG_CMPXCHG

/*
 * Minimum number of partial slabs. These will be left on the partial
 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 */
#define MIN_PARTIAL 5

/*
 * Maximum number of desirable partial slabs.
 * The existence of more partial slabs makes kmem_cache_shrink
 * sort the partial list by the number of objects in use.
 */
#define MAX_PARTIAL 10

#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
				SLAB_POISON | SLAB_STORE_USER)

/*
 * Debugging flags that require metadata to be stored in the slab. These get
 * disabled when slub_debug=O is used and a cache's min order increases with
 * metadata.
169 */ 170 #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) 171 172 /* 173 * Set of flags that will prevent slab merging 174 */ 175 #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 176 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ 177 SLAB_FAILSLAB) 178 179 #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 180 SLAB_CACHE_DMA | SLAB_NOTRACK) 181 182 #define OO_SHIFT 16 183 #define OO_MASK ((1 << OO_SHIFT) - 1) 184 #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ 185 186 /* Internal SLUB flags */ 187 #define __OBJECT_POISON 0x80000000UL /* Poison object */ 188 #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ 189 190 #ifdef CONFIG_SMP 191 static struct notifier_block slab_notifier; 192 #endif 193 194 /* 195 * Tracking user of a slab. 196 */ 197 #define TRACK_ADDRS_COUNT 16 198 struct track { 199 unsigned long addr; /* Called from address */ 200 #ifdef CONFIG_STACKTRACE 201 unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ 202 #endif 203 int cpu; /* Was running on cpu */ 204 int pid; /* Pid context */ 205 unsigned long when; /* When did the operation occur */ 206 }; 207 208 enum track_item { TRACK_ALLOC, TRACK_FREE }; 209 210 #ifdef CONFIG_SYSFS 211 static int sysfs_slab_add(struct kmem_cache *); 212 static int sysfs_slab_alias(struct kmem_cache *, const char *); 213 static void sysfs_slab_remove(struct kmem_cache *); 214 static void memcg_propagate_slab_attrs(struct kmem_cache *s); 215 #else 216 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } 217 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) 218 { return 0; } 219 static inline void sysfs_slab_remove(struct kmem_cache *s) { } 220 221 static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } 222 #endif 223 224 static inline void stat(const struct kmem_cache *s, enum stat_item si) 225 { 226 #ifdef CONFIG_SLUB_STATS 227 __this_cpu_inc(s->cpu_slab->stat[si]); 228 #endif 229 } 230 231 /******************************************************************** 232 * Core slab cache functions 233 *******************************************************************/ 234 235 static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 236 { 237 return s->node[node]; 238 } 239 240 /* Verify that a pointer has an address that is valid within a slab page */ 241 static inline int check_valid_pointer(struct kmem_cache *s, 242 struct page *page, const void *object) 243 { 244 void *base; 245 246 if (!object) 247 return 1; 248 249 base = page_address(page); 250 if (object < base || object >= base + page->objects * s->size || 251 (object - base) % s->size) { 252 return 0; 253 } 254 255 return 1; 256 } 257 258 static inline void *get_freepointer(struct kmem_cache *s, void *object) 259 { 260 return *(void **)(object + s->offset); 261 } 262 263 static void prefetch_freepointer(const struct kmem_cache *s, void *object) 264 { 265 prefetch(object + s->offset); 266 } 267 268 static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) 269 { 270 void *p; 271 272 #ifdef CONFIG_DEBUG_PAGEALLOC 273 probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); 274 #else 275 p = get_freepointer(s, object); 276 #endif 277 return p; 278 } 279 280 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) 281 { 282 *(void **)(object + s->offset) = fp; 283 } 284 285 /* Loop over all objects in a slab */ 286 #define for_each_object(__p, __s, 
__addr, __objects) \ 287 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ 288 __p += (__s)->size) 289 290 /* Determine object index from a given position */ 291 static inline int slab_index(void *p, struct kmem_cache *s, void *addr) 292 { 293 return (p - addr) / s->size; 294 } 295 296 static inline size_t slab_ksize(const struct kmem_cache *s) 297 { 298 #ifdef CONFIG_SLUB_DEBUG 299 /* 300 * Debugging requires use of the padding between object 301 * and whatever may come after it. 302 */ 303 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 304 return s->object_size; 305 306 #endif 307 /* 308 * If we have the need to store the freelist pointer 309 * back there or track user information then we can 310 * only use the space before that information. 311 */ 312 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) 313 return s->inuse; 314 /* 315 * Else we can use all the padding etc for the allocation 316 */ 317 return s->size; 318 } 319 320 static inline int order_objects(int order, unsigned long size, int reserved) 321 { 322 return ((PAGE_SIZE << order) - reserved) / size; 323 } 324 325 static inline struct kmem_cache_order_objects oo_make(int order, 326 unsigned long size, int reserved) 327 { 328 struct kmem_cache_order_objects x = { 329 (order << OO_SHIFT) + order_objects(order, size, reserved) 330 }; 331 332 return x; 333 } 334 335 static inline int oo_order(struct kmem_cache_order_objects x) 336 { 337 return x.x >> OO_SHIFT; 338 } 339 340 static inline int oo_objects(struct kmem_cache_order_objects x) 341 { 342 return x.x & OO_MASK; 343 } 344 345 /* 346 * Per slab locking using the pagelock 347 */ 348 static __always_inline void slab_lock(struct page *page) 349 { 350 bit_spin_lock(PG_locked, &page->flags); 351 } 352 353 static __always_inline void slab_unlock(struct page *page) 354 { 355 __bit_spin_unlock(PG_locked, &page->flags); 356 } 357 358 /* Interrupts must be disabled (for the fallback code to work right) */ 359 static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, 360 void *freelist_old, unsigned long counters_old, 361 void *freelist_new, unsigned long counters_new, 362 const char *n) 363 { 364 VM_BUG_ON(!irqs_disabled()); 365 #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ 366 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 367 if (s->flags & __CMPXCHG_DOUBLE) { 368 if (cmpxchg_double(&page->freelist, &page->counters, 369 freelist_old, counters_old, 370 freelist_new, counters_new)) 371 return 1; 372 } else 373 #endif 374 { 375 slab_lock(page); 376 if (page->freelist == freelist_old && 377 page->counters == counters_old) { 378 page->freelist = freelist_new; 379 page->counters = counters_new; 380 slab_unlock(page); 381 return 1; 382 } 383 slab_unlock(page); 384 } 385 386 cpu_relax(); 387 stat(s, CMPXCHG_DOUBLE_FAIL); 388 389 #ifdef SLUB_DEBUG_CMPXCHG 390 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); 391 #endif 392 393 return 0; 394 } 395 396 static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, 397 void *freelist_old, unsigned long counters_old, 398 void *freelist_new, unsigned long counters_new, 399 const char *n) 400 { 401 #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ 402 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 403 if (s->flags & __CMPXCHG_DOUBLE) { 404 if (cmpxchg_double(&page->freelist, &page->counters, 405 freelist_old, counters_old, 406 freelist_new, counters_new)) 407 return 1; 408 } else 409 #endif 410 { 411 unsigned long flags; 412 413 local_irq_save(flags); 414 slab_lock(page); 415 if 
(page->freelist == freelist_old && 416 page->counters == counters_old) { 417 page->freelist = freelist_new; 418 page->counters = counters_new; 419 slab_unlock(page); 420 local_irq_restore(flags); 421 return 1; 422 } 423 slab_unlock(page); 424 local_irq_restore(flags); 425 } 426 427 cpu_relax(); 428 stat(s, CMPXCHG_DOUBLE_FAIL); 429 430 #ifdef SLUB_DEBUG_CMPXCHG 431 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); 432 #endif 433 434 return 0; 435 } 436 437 #ifdef CONFIG_SLUB_DEBUG 438 /* 439 * Determine a map of object in use on a page. 440 * 441 * Node listlock must be held to guarantee that the page does 442 * not vanish from under us. 443 */ 444 static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) 445 { 446 void *p; 447 void *addr = page_address(page); 448 449 for (p = page->freelist; p; p = get_freepointer(s, p)) 450 set_bit(slab_index(p, s, addr), map); 451 } 452 453 /* 454 * Debug settings: 455 */ 456 #ifdef CONFIG_SLUB_DEBUG_ON 457 static int slub_debug = DEBUG_DEFAULT_FLAGS; 458 #else 459 static int slub_debug; 460 #endif 461 462 static char *slub_debug_slabs; 463 static int disable_higher_order_debug; 464 465 /* 466 * Object debugging 467 */ 468 static void print_section(char *text, u8 *addr, unsigned int length) 469 { 470 print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, 471 length, 1); 472 } 473 474 static struct track *get_track(struct kmem_cache *s, void *object, 475 enum track_item alloc) 476 { 477 struct track *p; 478 479 if (s->offset) 480 p = object + s->offset + sizeof(void *); 481 else 482 p = object + s->inuse; 483 484 return p + alloc; 485 } 486 487 static void set_track(struct kmem_cache *s, void *object, 488 enum track_item alloc, unsigned long addr) 489 { 490 struct track *p = get_track(s, object, alloc); 491 492 if (addr) { 493 #ifdef CONFIG_STACKTRACE 494 struct stack_trace trace; 495 int i; 496 497 trace.nr_entries = 0; 498 trace.max_entries = TRACK_ADDRS_COUNT; 499 trace.entries = p->addrs; 500 trace.skip = 3; 501 save_stack_trace(&trace); 502 503 /* See rant in lockdep.c */ 504 if (trace.nr_entries != 0 && 505 trace.entries[trace.nr_entries - 1] == ULONG_MAX) 506 trace.nr_entries--; 507 508 for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) 509 p->addrs[i] = 0; 510 #endif 511 p->addr = addr; 512 p->cpu = smp_processor_id(); 513 p->pid = current->pid; 514 p->when = jiffies; 515 } else 516 memset(p, 0, sizeof(struct track)); 517 } 518 519 static void init_tracking(struct kmem_cache *s, void *object) 520 { 521 if (!(s->flags & SLAB_STORE_USER)) 522 return; 523 524 set_track(s, object, TRACK_FREE, 0UL); 525 set_track(s, object, TRACK_ALLOC, 0UL); 526 } 527 528 static void print_track(const char *s, struct track *t) 529 { 530 if (!t->addr) 531 return; 532 533 printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", 534 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); 535 #ifdef CONFIG_STACKTRACE 536 { 537 int i; 538 for (i = 0; i < TRACK_ADDRS_COUNT; i++) 539 if (t->addrs[i]) 540 printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); 541 else 542 break; 543 } 544 #endif 545 } 546 547 static void print_tracking(struct kmem_cache *s, void *object) 548 { 549 if (!(s->flags & SLAB_STORE_USER)) 550 return; 551 552 print_track("Allocated", get_track(s, object, TRACK_ALLOC)); 553 print_track("Freed", get_track(s, object, TRACK_FREE)); 554 } 555 556 static void print_page_info(struct page *page) 557 { 558 printk(KERN_ERR 559 "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", 560 page, page->objects, 
page->inuse, page->freelist, page->flags); 561 562 } 563 564 static void slab_bug(struct kmem_cache *s, char *fmt, ...) 565 { 566 va_list args; 567 char buf[100]; 568 569 va_start(args, fmt); 570 vsnprintf(buf, sizeof(buf), fmt, args); 571 va_end(args); 572 printk(KERN_ERR "========================================" 573 "=====================================\n"); 574 printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); 575 printk(KERN_ERR "----------------------------------------" 576 "-------------------------------------\n\n"); 577 578 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 579 } 580 581 static void slab_fix(struct kmem_cache *s, char *fmt, ...) 582 { 583 va_list args; 584 char buf[100]; 585 586 va_start(args, fmt); 587 vsnprintf(buf, sizeof(buf), fmt, args); 588 va_end(args); 589 printk(KERN_ERR "FIX %s: %s\n", s->name, buf); 590 } 591 592 static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) 593 { 594 unsigned int off; /* Offset of last byte */ 595 u8 *addr = page_address(page); 596 597 print_tracking(s, p); 598 599 print_page_info(page); 600 601 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", 602 p, p - addr, get_freepointer(s, p)); 603 604 if (p > addr + 16) 605 print_section("Bytes b4 ", p - 16, 16); 606 607 print_section("Object ", p, min_t(unsigned long, s->object_size, 608 PAGE_SIZE)); 609 if (s->flags & SLAB_RED_ZONE) 610 print_section("Redzone ", p + s->object_size, 611 s->inuse - s->object_size); 612 613 if (s->offset) 614 off = s->offset + sizeof(void *); 615 else 616 off = s->inuse; 617 618 if (s->flags & SLAB_STORE_USER) 619 off += 2 * sizeof(struct track); 620 621 if (off != s->size) 622 /* Beginning of the filler is the free pointer */ 623 print_section("Padding ", p + off, s->size - off); 624 625 dump_stack(); 626 } 627 628 static void object_err(struct kmem_cache *s, struct page *page, 629 u8 *object, char *reason) 630 { 631 slab_bug(s, "%s", reason); 632 print_trailer(s, page, object); 633 } 634 635 static void slab_err(struct kmem_cache *s, struct page *page, 636 const char *fmt, ...) 637 { 638 va_list args; 639 char buf[100]; 640 641 va_start(args, fmt); 642 vsnprintf(buf, sizeof(buf), fmt, args); 643 va_end(args); 644 slab_bug(s, "%s", buf); 645 print_page_info(page); 646 dump_stack(); 647 } 648 649 static void init_object(struct kmem_cache *s, void *object, u8 val) 650 { 651 u8 *p = object; 652 653 if (s->flags & __OBJECT_POISON) { 654 memset(p, POISON_FREE, s->object_size - 1); 655 p[s->object_size - 1] = POISON_END; 656 } 657 658 if (s->flags & SLAB_RED_ZONE) 659 memset(p + s->object_size, val, s->inuse - s->object_size); 660 } 661 662 static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 663 void *from, void *to) 664 { 665 slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data); 666 memset(from, data, to - from); 667 } 668 669 static int check_bytes_and_report(struct kmem_cache *s, struct page *page, 670 u8 *object, char *what, 671 u8 *start, unsigned int value, unsigned int bytes) 672 { 673 u8 *fault; 674 u8 *end; 675 676 fault = memchr_inv(start, value, bytes); 677 if (!fault) 678 return 1; 679 680 end = start + bytes; 681 while (end > fault && end[-1] == value) 682 end--; 683 684 slab_bug(s, "%s overwritten", what); 685 printk(KERN_ERR "INFO: 0x%p-0x%p. 
First byte 0x%x instead of 0x%x\n",
			fault, end - 1, fault[0], value);
	print_trailer(s, page, object);

	restore_bytes(s, what, value, fault, end);
	return 0;
}

/*
 * Object layout:
 *
 * object address
 * 	Bytes of the object to be managed.
 * 	If the freepointer may overlay the object then the free
 * 	pointer is the first word of the object.
 *
 * 	Poisoning uses 0x6b (POISON_FREE) and the last byte is
 * 	0xa5 (POISON_END)
 *
 * object + s->object_size
 * 	Padding to reach word boundary. This is also used for Redzoning.
 * 	Padding is extended by another word if Redzoning is enabled and
 * 	object_size == inuse.
 *
 * 	We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 * 	0xcc (RED_ACTIVE) for objects in use.
 *
 * object + s->inuse
 * 	Meta data starts here.
 *
 * 	A. Free pointer (if we cannot overwrite object on free)
 * 	B. Tracking data for SLAB_STORE_USER
 * 	C. Padding to reach required alignment boundary or at minimum
 * 		one word if debugging is on to be able to detect writes
 * 		before the word boundary.
 *
 *	Padding is done using 0x5a (POISON_INUSE)
 *
 * object + s->size
 * 	Nothing is used beyond s->size.
 *
 * If slabcaches are merged then the object_size and inuse boundaries are
 * mostly ignored, and therefore no slab options that rely on these
 * boundaries may be used with merged slabcaches.
 */

static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
{
	unsigned long off = s->inuse;	/* The end of info */

	if (s->offset)
		/* Freepointer is placed after the object. */
		off += sizeof(void *);

	if (s->flags & SLAB_STORE_USER)
		/* We also have user information there */
		off += 2 * sizeof(struct track);

	if (s->size == off)
		return 1;

	return check_bytes_and_report(s, page, p, "Object padding",
				p + off, POISON_INUSE, s->size - off);
}

/* Check the pad bytes at the end of a slab page */
static int slab_pad_check(struct kmem_cache *s, struct page *page)
{
	u8 *start;
	u8 *fault;
	u8 *end;
	int length;
	int remainder;

	if (!(s->flags & SLAB_POISON))
		return 1;

	start = page_address(page);
	length = (PAGE_SIZE << compound_order(page)) - s->reserved;
	end = start + length;
	remainder = length % s->size;
	if (!remainder)
		return 1;

	fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
	if (!fault)
		return 1;
	while (end > fault && end[-1] == POISON_INUSE)
		end--;

	slab_err(s, page, "Padding overwritten. 
0x%p-0x%p", fault, end - 1); 776 print_section("Padding ", end - remainder, remainder); 777 778 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); 779 return 0; 780 } 781 782 static int check_object(struct kmem_cache *s, struct page *page, 783 void *object, u8 val) 784 { 785 u8 *p = object; 786 u8 *endobject = object + s->object_size; 787 788 if (s->flags & SLAB_RED_ZONE) { 789 if (!check_bytes_and_report(s, page, object, "Redzone", 790 endobject, val, s->inuse - s->object_size)) 791 return 0; 792 } else { 793 if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { 794 check_bytes_and_report(s, page, p, "Alignment padding", 795 endobject, POISON_INUSE, 796 s->inuse - s->object_size); 797 } 798 } 799 800 if (s->flags & SLAB_POISON) { 801 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && 802 (!check_bytes_and_report(s, page, p, "Poison", p, 803 POISON_FREE, s->object_size - 1) || 804 !check_bytes_and_report(s, page, p, "Poison", 805 p + s->object_size - 1, POISON_END, 1))) 806 return 0; 807 /* 808 * check_pad_bytes cleans up on its own. 809 */ 810 check_pad_bytes(s, page, p); 811 } 812 813 if (!s->offset && val == SLUB_RED_ACTIVE) 814 /* 815 * Object and freepointer overlap. Cannot check 816 * freepointer while object is allocated. 817 */ 818 return 1; 819 820 /* Check free pointer validity */ 821 if (!check_valid_pointer(s, page, get_freepointer(s, p))) { 822 object_err(s, page, p, "Freepointer corrupt"); 823 /* 824 * No choice but to zap it and thus lose the remainder 825 * of the free objects in this slab. May cause 826 * another error because the object count is now wrong. 827 */ 828 set_freepointer(s, p, NULL); 829 return 0; 830 } 831 return 1; 832 } 833 834 static int check_slab(struct kmem_cache *s, struct page *page) 835 { 836 int maxobj; 837 838 VM_BUG_ON(!irqs_disabled()); 839 840 if (!PageSlab(page)) { 841 slab_err(s, page, "Not a valid slab page"); 842 return 0; 843 } 844 845 maxobj = order_objects(compound_order(page), s->size, s->reserved); 846 if (page->objects > maxobj) { 847 slab_err(s, page, "objects %u > max %u", 848 s->name, page->objects, maxobj); 849 return 0; 850 } 851 if (page->inuse > page->objects) { 852 slab_err(s, page, "inuse %u > max %u", 853 s->name, page->inuse, page->objects); 854 return 0; 855 } 856 /* Slab_pad_check fixes things up after itself */ 857 slab_pad_check(s, page); 858 return 1; 859 } 860 861 /* 862 * Determine if a certain object on a page is on the freelist. Must hold the 863 * slab lock to guarantee that the chains are in a consistent state. 864 */ 865 static int on_freelist(struct kmem_cache *s, struct page *page, void *search) 866 { 867 int nr = 0; 868 void *fp; 869 void *object = NULL; 870 unsigned long max_objects; 871 872 fp = page->freelist; 873 while (fp && nr <= page->objects) { 874 if (fp == search) 875 return 1; 876 if (!check_valid_pointer(s, page, fp)) { 877 if (object) { 878 object_err(s, page, object, 879 "Freechain corrupt"); 880 set_freepointer(s, object, NULL); 881 } else { 882 slab_err(s, page, "Freepointer corrupt"); 883 page->freelist = NULL; 884 page->inuse = page->objects; 885 slab_fix(s, "Freelist cleared"); 886 return 0; 887 } 888 break; 889 } 890 object = fp; 891 fp = get_freepointer(s, object); 892 nr++; 893 } 894 895 max_objects = order_objects(compound_order(page), s->size, s->reserved); 896 if (max_objects > MAX_OBJS_PER_PAGE) 897 max_objects = MAX_OBJS_PER_PAGE; 898 899 if (page->objects != max_objects) { 900 slab_err(s, page, "Wrong number of objects. 
Found %d but " 901 "should be %d", page->objects, max_objects); 902 page->objects = max_objects; 903 slab_fix(s, "Number of objects adjusted."); 904 } 905 if (page->inuse != page->objects - nr) { 906 slab_err(s, page, "Wrong object count. Counter is %d but " 907 "counted were %d", page->inuse, page->objects - nr); 908 page->inuse = page->objects - nr; 909 slab_fix(s, "Object count adjusted."); 910 } 911 return search == NULL; 912 } 913 914 static void trace(struct kmem_cache *s, struct page *page, void *object, 915 int alloc) 916 { 917 if (s->flags & SLAB_TRACE) { 918 printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", 919 s->name, 920 alloc ? "alloc" : "free", 921 object, page->inuse, 922 page->freelist); 923 924 if (!alloc) 925 print_section("Object ", (void *)object, 926 s->object_size); 927 928 dump_stack(); 929 } 930 } 931 932 /* 933 * Hooks for other subsystems that check memory allocations. In a typical 934 * production configuration these hooks all should produce no code at all. 935 */ 936 static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) 937 { 938 kmemleak_alloc(ptr, size, 1, flags); 939 } 940 941 static inline void kfree_hook(const void *x) 942 { 943 kmemleak_free(x); 944 } 945 946 static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) 947 { 948 flags &= gfp_allowed_mask; 949 lockdep_trace_alloc(flags); 950 might_sleep_if(flags & __GFP_WAIT); 951 952 return should_failslab(s->object_size, flags, s->flags); 953 } 954 955 static inline void slab_post_alloc_hook(struct kmem_cache *s, 956 gfp_t flags, void *object) 957 { 958 flags &= gfp_allowed_mask; 959 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); 960 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); 961 } 962 963 static inline void slab_free_hook(struct kmem_cache *s, void *x) 964 { 965 kmemleak_free_recursive(x, s->flags); 966 967 /* 968 * Trouble is that we may no longer disable interrupts in the fast path 969 * So in order to make the debug calls that expect irqs to be 970 * disabled we need to disable interrupts temporarily. 971 */ 972 #if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) 973 { 974 unsigned long flags; 975 976 local_irq_save(flags); 977 kmemcheck_slab_free(s, x, s->object_size); 978 debug_check_no_locks_freed(x, s->object_size); 979 local_irq_restore(flags); 980 } 981 #endif 982 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 983 debug_check_no_obj_freed(x, s->object_size); 984 } 985 986 /* 987 * Tracking of fully allocated slabs for debugging purposes. 988 * 989 * list_lock must be held. 990 */ 991 static void add_full(struct kmem_cache *s, 992 struct kmem_cache_node *n, struct page *page) 993 { 994 if (!(s->flags & SLAB_STORE_USER)) 995 return; 996 997 list_add(&page->lru, &n->full); 998 } 999 1000 /* 1001 * list_lock must be held. 
1002 */ 1003 static void remove_full(struct kmem_cache *s, struct page *page) 1004 { 1005 if (!(s->flags & SLAB_STORE_USER)) 1006 return; 1007 1008 list_del(&page->lru); 1009 } 1010 1011 /* Tracking of the number of slabs for debugging purposes */ 1012 static inline unsigned long slabs_node(struct kmem_cache *s, int node) 1013 { 1014 struct kmem_cache_node *n = get_node(s, node); 1015 1016 return atomic_long_read(&n->nr_slabs); 1017 } 1018 1019 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) 1020 { 1021 return atomic_long_read(&n->nr_slabs); 1022 } 1023 1024 static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) 1025 { 1026 struct kmem_cache_node *n = get_node(s, node); 1027 1028 /* 1029 * May be called early in order to allocate a slab for the 1030 * kmem_cache_node structure. Solve the chicken-egg 1031 * dilemma by deferring the increment of the count during 1032 * bootstrap (see early_kmem_cache_node_alloc). 1033 */ 1034 if (likely(n)) { 1035 atomic_long_inc(&n->nr_slabs); 1036 atomic_long_add(objects, &n->total_objects); 1037 } 1038 } 1039 static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) 1040 { 1041 struct kmem_cache_node *n = get_node(s, node); 1042 1043 atomic_long_dec(&n->nr_slabs); 1044 atomic_long_sub(objects, &n->total_objects); 1045 } 1046 1047 /* Object debug checks for alloc/free paths */ 1048 static void setup_object_debug(struct kmem_cache *s, struct page *page, 1049 void *object) 1050 { 1051 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) 1052 return; 1053 1054 init_object(s, object, SLUB_RED_INACTIVE); 1055 init_tracking(s, object); 1056 } 1057 1058 static noinline int alloc_debug_processing(struct kmem_cache *s, 1059 struct page *page, 1060 void *object, unsigned long addr) 1061 { 1062 if (!check_slab(s, page)) 1063 goto bad; 1064 1065 if (!check_valid_pointer(s, page, object)) { 1066 object_err(s, page, object, "Freelist Pointer check fails"); 1067 goto bad; 1068 } 1069 1070 if (!check_object(s, page, object, SLUB_RED_INACTIVE)) 1071 goto bad; 1072 1073 /* Success perform special debug activities for allocs */ 1074 if (s->flags & SLAB_STORE_USER) 1075 set_track(s, object, TRACK_ALLOC, addr); 1076 trace(s, page, object, 1); 1077 init_object(s, object, SLUB_RED_ACTIVE); 1078 return 1; 1079 1080 bad: 1081 if (PageSlab(page)) { 1082 /* 1083 * If this is a slab page then lets do the best we can 1084 * to avoid issues in the future. Marking all objects 1085 * as used avoids touching the remaining objects. 
1086 */ 1087 slab_fix(s, "Marking all objects used"); 1088 page->inuse = page->objects; 1089 page->freelist = NULL; 1090 } 1091 return 0; 1092 } 1093 1094 static noinline struct kmem_cache_node *free_debug_processing( 1095 struct kmem_cache *s, struct page *page, void *object, 1096 unsigned long addr, unsigned long *flags) 1097 { 1098 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1099 1100 spin_lock_irqsave(&n->list_lock, *flags); 1101 slab_lock(page); 1102 1103 if (!check_slab(s, page)) 1104 goto fail; 1105 1106 if (!check_valid_pointer(s, page, object)) { 1107 slab_err(s, page, "Invalid object pointer 0x%p", object); 1108 goto fail; 1109 } 1110 1111 if (on_freelist(s, page, object)) { 1112 object_err(s, page, object, "Object already free"); 1113 goto fail; 1114 } 1115 1116 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1117 goto out; 1118 1119 if (unlikely(s != page->slab_cache)) { 1120 if (!PageSlab(page)) { 1121 slab_err(s, page, "Attempt to free object(0x%p) " 1122 "outside of slab", object); 1123 } else if (!page->slab_cache) { 1124 printk(KERN_ERR 1125 "SLUB <none>: no slab for object 0x%p.\n", 1126 object); 1127 dump_stack(); 1128 } else 1129 object_err(s, page, object, 1130 "page slab pointer corrupt."); 1131 goto fail; 1132 } 1133 1134 if (s->flags & SLAB_STORE_USER) 1135 set_track(s, object, TRACK_FREE, addr); 1136 trace(s, page, object, 0); 1137 init_object(s, object, SLUB_RED_INACTIVE); 1138 out: 1139 slab_unlock(page); 1140 /* 1141 * Keep node_lock to preserve integrity 1142 * until the object is actually freed 1143 */ 1144 return n; 1145 1146 fail: 1147 slab_unlock(page); 1148 spin_unlock_irqrestore(&n->list_lock, *flags); 1149 slab_fix(s, "Object at 0x%p not freed", object); 1150 return NULL; 1151 } 1152 1153 static int __init setup_slub_debug(char *str) 1154 { 1155 slub_debug = DEBUG_DEFAULT_FLAGS; 1156 if (*str++ != '=' || !*str) 1157 /* 1158 * No options specified. Switch on full debugging. 1159 */ 1160 goto out; 1161 1162 if (*str == ',') 1163 /* 1164 * No options but restriction on slabs. This means full 1165 * debugging for slabs matching a pattern. 1166 */ 1167 goto check_slabs; 1168 1169 if (tolower(*str) == 'o') { 1170 /* 1171 * Avoid enabling debugging on caches if its minimum order 1172 * would increase as a result. 1173 */ 1174 disable_higher_order_debug = 1; 1175 goto out; 1176 } 1177 1178 slub_debug = 0; 1179 if (*str == '-') 1180 /* 1181 * Switch off all debugging measures. 1182 */ 1183 goto out; 1184 1185 /* 1186 * Determine which debug features should be switched on 1187 */ 1188 for (; *str && *str != ','; str++) { 1189 switch (tolower(*str)) { 1190 case 'f': 1191 slub_debug |= SLAB_DEBUG_FREE; 1192 break; 1193 case 'z': 1194 slub_debug |= SLAB_RED_ZONE; 1195 break; 1196 case 'p': 1197 slub_debug |= SLAB_POISON; 1198 break; 1199 case 'u': 1200 slub_debug |= SLAB_STORE_USER; 1201 break; 1202 case 't': 1203 slub_debug |= SLAB_TRACE; 1204 break; 1205 case 'a': 1206 slub_debug |= SLAB_FAILSLAB; 1207 break; 1208 default: 1209 printk(KERN_ERR "slub_debug option '%c' " 1210 "unknown. skipped\n", *str); 1211 } 1212 } 1213 1214 check_slabs: 1215 if (*str == ',') 1216 slub_debug_slabs = str + 1; 1217 out: 1218 return 1; 1219 } 1220 1221 __setup("slub_debug", setup_slub_debug); 1222 1223 static unsigned long kmem_cache_flags(unsigned long object_size, 1224 unsigned long flags, const char *name, 1225 void (*ctor)(void *)) 1226 { 1227 /* 1228 * Enable debugging if selected on the kernel commandline. 
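 *
 * For illustration, based on the option letters parsed in
 * setup_slub_debug() above, the boot parameter has the form
 * slub_debug=<flags>[,<slab name prefix>], e.g.:
 *
 *	slub_debug		enable all debug options for every cache
 *	slub_debug=FZP		sanity checks, red zoning and poisoning
 *	slub_debug=,dentry	full debugging, but only for caches whose
 *				name starts with "dentry"
 *	slub_debug=-		switch all debugging off
 *	slub_debug=O		do not enable debugging on caches whose
 *				minimum order would grow because of the
 *				debug metadata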
1229 */ 1230 if (slub_debug && (!slub_debug_slabs || (name && 1231 !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))) 1232 flags |= slub_debug; 1233 1234 return flags; 1235 } 1236 #else 1237 static inline void setup_object_debug(struct kmem_cache *s, 1238 struct page *page, void *object) {} 1239 1240 static inline int alloc_debug_processing(struct kmem_cache *s, 1241 struct page *page, void *object, unsigned long addr) { return 0; } 1242 1243 static inline struct kmem_cache_node *free_debug_processing( 1244 struct kmem_cache *s, struct page *page, void *object, 1245 unsigned long addr, unsigned long *flags) { return NULL; } 1246 1247 static inline int slab_pad_check(struct kmem_cache *s, struct page *page) 1248 { return 1; } 1249 static inline int check_object(struct kmem_cache *s, struct page *page, 1250 void *object, u8 val) { return 1; } 1251 static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, 1252 struct page *page) {} 1253 static inline void remove_full(struct kmem_cache *s, struct page *page) {} 1254 static inline unsigned long kmem_cache_flags(unsigned long object_size, 1255 unsigned long flags, const char *name, 1256 void (*ctor)(void *)) 1257 { 1258 return flags; 1259 } 1260 #define slub_debug 0 1261 1262 #define disable_higher_order_debug 0 1263 1264 static inline unsigned long slabs_node(struct kmem_cache *s, int node) 1265 { return 0; } 1266 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) 1267 { return 0; } 1268 static inline void inc_slabs_node(struct kmem_cache *s, int node, 1269 int objects) {} 1270 static inline void dec_slabs_node(struct kmem_cache *s, int node, 1271 int objects) {} 1272 1273 static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) 1274 { 1275 kmemleak_alloc(ptr, size, 1, flags); 1276 } 1277 1278 static inline void kfree_hook(const void *x) 1279 { 1280 kmemleak_free(x); 1281 } 1282 1283 static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) 1284 { return 0; } 1285 1286 static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, 1287 void *object) 1288 { 1289 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, 1290 flags & gfp_allowed_mask); 1291 } 1292 1293 static inline void slab_free_hook(struct kmem_cache *s, void *x) 1294 { 1295 kmemleak_free_recursive(x, s->flags); 1296 } 1297 1298 #endif /* CONFIG_SLUB_DEBUG */ 1299 1300 /* 1301 * Slab allocation and freeing 1302 */ 1303 static inline struct page *alloc_slab_page(gfp_t flags, int node, 1304 struct kmem_cache_order_objects oo) 1305 { 1306 int order = oo_order(oo); 1307 1308 flags |= __GFP_NOTRACK; 1309 1310 if (node == NUMA_NO_NODE) 1311 return alloc_pages(flags, order); 1312 else 1313 return alloc_pages_exact_node(node, flags, order); 1314 } 1315 1316 static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 1317 { 1318 struct page *page; 1319 struct kmem_cache_order_objects oo = s->oo; 1320 gfp_t alloc_gfp; 1321 1322 flags &= gfp_allowed_mask; 1323 1324 if (flags & __GFP_WAIT) 1325 local_irq_enable(); 1326 1327 flags |= s->allocflags; 1328 1329 /* 1330 * Let the initial higher-order allocation fail under memory pressure 1331 * so we fall-back to the minimum order allocation. 1332 */ 1333 alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; 1334 1335 page = alloc_slab_page(alloc_gfp, node, oo); 1336 if (unlikely(!page)) { 1337 oo = s->min; 1338 /* 1339 * Allocation may have failed due to fragmentation. 
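		 * (the first attempt above used s->oo with __GFP_NOWARN and
		 * __GFP_NORETRY and without __GFP_NOFAIL, so it backs off
		 * quickly instead of retrying hard).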
1340 * Try a lower order alloc if possible 1341 */ 1342 page = alloc_slab_page(flags, node, oo); 1343 1344 if (page) 1345 stat(s, ORDER_FALLBACK); 1346 } 1347 1348 if (kmemcheck_enabled && page 1349 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1350 int pages = 1 << oo_order(oo); 1351 1352 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); 1353 1354 /* 1355 * Objects from caches that have a constructor don't get 1356 * cleared when they're allocated, so we need to do it here. 1357 */ 1358 if (s->ctor) 1359 kmemcheck_mark_uninitialized_pages(page, pages); 1360 else 1361 kmemcheck_mark_unallocated_pages(page, pages); 1362 } 1363 1364 if (flags & __GFP_WAIT) 1365 local_irq_disable(); 1366 if (!page) 1367 return NULL; 1368 1369 page->objects = oo_objects(oo); 1370 mod_zone_page_state(page_zone(page), 1371 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1372 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1373 1 << oo_order(oo)); 1374 1375 return page; 1376 } 1377 1378 static void setup_object(struct kmem_cache *s, struct page *page, 1379 void *object) 1380 { 1381 setup_object_debug(s, page, object); 1382 if (unlikely(s->ctor)) 1383 s->ctor(object); 1384 } 1385 1386 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1387 { 1388 struct page *page; 1389 void *start; 1390 void *last; 1391 void *p; 1392 int order; 1393 1394 BUG_ON(flags & GFP_SLAB_BUG_MASK); 1395 1396 page = allocate_slab(s, 1397 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); 1398 if (!page) 1399 goto out; 1400 1401 order = compound_order(page); 1402 inc_slabs_node(s, page_to_nid(page), page->objects); 1403 memcg_bind_pages(s, order); 1404 page->slab_cache = s; 1405 __SetPageSlab(page); 1406 if (page->pfmemalloc) 1407 SetPageSlabPfmemalloc(page); 1408 1409 start = page_address(page); 1410 1411 if (unlikely(s->flags & SLAB_POISON)) 1412 memset(start, POISON_INUSE, PAGE_SIZE << order); 1413 1414 last = start; 1415 for_each_object(p, s, start, page->objects) { 1416 setup_object(s, page, last); 1417 set_freepointer(s, last, p); 1418 last = p; 1419 } 1420 setup_object(s, page, last); 1421 set_freepointer(s, last, NULL); 1422 1423 page->freelist = start; 1424 page->inuse = page->objects; 1425 page->frozen = 1; 1426 out: 1427 return page; 1428 } 1429 1430 static void __free_slab(struct kmem_cache *s, struct page *page) 1431 { 1432 int order = compound_order(page); 1433 int pages = 1 << order; 1434 1435 if (kmem_cache_debug(s)) { 1436 void *p; 1437 1438 slab_pad_check(s, page); 1439 for_each_object(p, s, page_address(page), 1440 page->objects) 1441 check_object(s, page, p, SLUB_RED_INACTIVE); 1442 } 1443 1444 kmemcheck_free_shadow(page, compound_order(page)); 1445 1446 mod_zone_page_state(page_zone(page), 1447 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
1448 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1449 -pages); 1450 1451 __ClearPageSlabPfmemalloc(page); 1452 __ClearPageSlab(page); 1453 1454 memcg_release_pages(s, order); 1455 page_mapcount_reset(page); 1456 if (current->reclaim_state) 1457 current->reclaim_state->reclaimed_slab += pages; 1458 __free_memcg_kmem_pages(page, order); 1459 } 1460 1461 #define need_reserve_slab_rcu \ 1462 (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) 1463 1464 static void rcu_free_slab(struct rcu_head *h) 1465 { 1466 struct page *page; 1467 1468 if (need_reserve_slab_rcu) 1469 page = virt_to_head_page(h); 1470 else 1471 page = container_of((struct list_head *)h, struct page, lru); 1472 1473 __free_slab(page->slab_cache, page); 1474 } 1475 1476 static void free_slab(struct kmem_cache *s, struct page *page) 1477 { 1478 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { 1479 struct rcu_head *head; 1480 1481 if (need_reserve_slab_rcu) { 1482 int order = compound_order(page); 1483 int offset = (PAGE_SIZE << order) - s->reserved; 1484 1485 VM_BUG_ON(s->reserved != sizeof(*head)); 1486 head = page_address(page) + offset; 1487 } else { 1488 /* 1489 * RCU free overloads the RCU head over the LRU 1490 */ 1491 head = (void *)&page->lru; 1492 } 1493 1494 call_rcu(head, rcu_free_slab); 1495 } else 1496 __free_slab(s, page); 1497 } 1498 1499 static void discard_slab(struct kmem_cache *s, struct page *page) 1500 { 1501 dec_slabs_node(s, page_to_nid(page), page->objects); 1502 free_slab(s, page); 1503 } 1504 1505 /* 1506 * Management of partially allocated slabs. 1507 * 1508 * list_lock must be held. 1509 */ 1510 static inline void add_partial(struct kmem_cache_node *n, 1511 struct page *page, int tail) 1512 { 1513 n->nr_partial++; 1514 if (tail == DEACTIVATE_TO_TAIL) 1515 list_add_tail(&page->lru, &n->partial); 1516 else 1517 list_add(&page->lru, &n->partial); 1518 } 1519 1520 /* 1521 * list_lock must be held. 1522 */ 1523 static inline void remove_partial(struct kmem_cache_node *n, 1524 struct page *page) 1525 { 1526 list_del(&page->lru); 1527 n->nr_partial--; 1528 } 1529 1530 /* 1531 * Remove slab from the partial list, freeze it and 1532 * return the pointer to the freelist. 1533 * 1534 * Returns a list of objects or NULL if it fails. 1535 * 1536 * Must hold list_lock since we modify the partial list. 1537 */ 1538 static inline void *acquire_slab(struct kmem_cache *s, 1539 struct kmem_cache_node *n, struct page *page, 1540 int mode, int *objects) 1541 { 1542 void *freelist; 1543 unsigned long counters; 1544 struct page new; 1545 1546 /* 1547 * Zap the freelist and set the frozen bit. 1548 * The old freelist is the list of objects for the 1549 * per cpu allocation list. 1550 */ 1551 freelist = page->freelist; 1552 counters = page->counters; 1553 new.counters = counters; 1554 *objects = new.objects - new.inuse; 1555 if (mode) { 1556 new.inuse = page->objects; 1557 new.freelist = NULL; 1558 } else { 1559 new.freelist = freelist; 1560 } 1561 1562 VM_BUG_ON(new.frozen); 1563 new.frozen = 1; 1564 1565 if (!__cmpxchg_double_slab(s, page, 1566 freelist, counters, 1567 new.freelist, new.counters, 1568 "acquire_slab")) 1569 return NULL; 1570 1571 remove_partial(n, page); 1572 WARN_ON(!freelist); 1573 return freelist; 1574 } 1575 1576 static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); 1577 static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); 1578 1579 /* 1580 * Try to allocate a partial slab from a specific node. 
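 *
 * A rough sketch of what the loop below does with each slab it manages to
 * acquire from n->partial:
 *
 *	first slab	-> becomes the new cpu slab (c->page); its freelist
 *			   is returned to the caller
 *	further slabs	-> parked on the per cpu partial list via
 *			   put_cpu_partial(), until roughly
 *			   s->cpu_partial / 2 free objects are cached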
1581 */ 1582 static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, 1583 struct kmem_cache_cpu *c, gfp_t flags) 1584 { 1585 struct page *page, *page2; 1586 void *object = NULL; 1587 int available = 0; 1588 int objects; 1589 1590 /* 1591 * Racy check. If we mistakenly see no partial slabs then we 1592 * just allocate an empty slab. If we mistakenly try to get a 1593 * partial slab and there is none available then get_partials() 1594 * will return NULL. 1595 */ 1596 if (!n || !n->nr_partial) 1597 return NULL; 1598 1599 spin_lock(&n->list_lock); 1600 list_for_each_entry_safe(page, page2, &n->partial, lru) { 1601 void *t; 1602 1603 if (!pfmemalloc_match(page, flags)) 1604 continue; 1605 1606 t = acquire_slab(s, n, page, object == NULL, &objects); 1607 if (!t) 1608 break; 1609 1610 available += objects; 1611 if (!object) { 1612 c->page = page; 1613 stat(s, ALLOC_FROM_PARTIAL); 1614 object = t; 1615 } else { 1616 put_cpu_partial(s, page, 0); 1617 stat(s, CPU_PARTIAL_NODE); 1618 } 1619 if (!kmem_cache_has_cpu_partial(s) 1620 || available > s->cpu_partial / 2) 1621 break; 1622 1623 } 1624 spin_unlock(&n->list_lock); 1625 return object; 1626 } 1627 1628 /* 1629 * Get a page from somewhere. Search in increasing NUMA distances. 1630 */ 1631 static void *get_any_partial(struct kmem_cache *s, gfp_t flags, 1632 struct kmem_cache_cpu *c) 1633 { 1634 #ifdef CONFIG_NUMA 1635 struct zonelist *zonelist; 1636 struct zoneref *z; 1637 struct zone *zone; 1638 enum zone_type high_zoneidx = gfp_zone(flags); 1639 void *object; 1640 unsigned int cpuset_mems_cookie; 1641 1642 /* 1643 * The defrag ratio allows a configuration of the tradeoffs between 1644 * inter node defragmentation and node local allocations. A lower 1645 * defrag_ratio increases the tendency to do local allocations 1646 * instead of attempting to obtain partial slabs from other nodes. 1647 * 1648 * If the defrag_ratio is set to 0 then kmalloc() always 1649 * returns node local objects. If the ratio is higher then kmalloc() 1650 * may return off node objects because partial slabs are obtained 1651 * from other nodes and filled up. 1652 * 1653 * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes 1654 * defrag_ratio = 1000) then every (well almost) allocation will 1655 * first attempt to defrag slab caches on other nodes. This means 1656 * scanning over all nodes to look for partial slabs which may be 1657 * expensive if we do it every time we are trying to find a slab 1658 * with available objects. 1659 */ 1660 if (!s->remote_node_defrag_ratio || 1661 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1662 return NULL; 1663 1664 do { 1665 cpuset_mems_cookie = get_mems_allowed(); 1666 zonelist = node_zonelist(slab_node(), flags); 1667 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1668 struct kmem_cache_node *n; 1669 1670 n = get_node(s, zone_to_nid(zone)); 1671 1672 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1673 n->nr_partial > s->min_partial) { 1674 object = get_partial_node(s, n, c, flags); 1675 if (object) { 1676 /* 1677 * Return the object even if 1678 * put_mems_allowed indicated that 1679 * the cpuset mems_allowed was 1680 * updated in parallel. It's a 1681 * harmless race between the alloc 1682 * and the cpuset update. 1683 */ 1684 put_mems_allowed(cpuset_mems_cookie); 1685 return object; 1686 } 1687 } 1688 } 1689 } while (!put_mems_allowed(cpuset_mems_cookie)); 1690 #endif 1691 return NULL; 1692 } 1693 1694 /* 1695 * Get a partial page, lock it and return it. 
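 *
 * Note (summarizing the code below): a request for a specific node never
 * falls back to other nodes here; only NUMA_NO_NODE requests may go on to
 * get_any_partial(), which in turn is throttled by remote_node_defrag_ratio.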
 */
static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
		struct kmem_cache_cpu *c)
{
	void *object;
	int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;

	object = get_partial_node(s, get_node(s, searchnode), c, flags);
	if (object || node != NUMA_NO_NODE)
		return object;

	return get_any_partial(s, flags, c);
}

#ifdef CONFIG_PREEMPT
/*
 * Calculate the next globally unique transaction for disambiguation
 * during cmpxchg. The transactions start with the cpu number and are then
 * incremented by CONFIG_NR_CPUS.
 */
#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
#else
/*
 * No preemption supported therefore also no need to check for
 * different cpus.
 */
#define TID_STEP 1
#endif

static inline unsigned long next_tid(unsigned long tid)
{
	return tid + TID_STEP;
}

static inline unsigned int tid_to_cpu(unsigned long tid)
{
	return tid % TID_STEP;
}

static inline unsigned long tid_to_event(unsigned long tid)
{
	return tid / TID_STEP;
}

static inline unsigned int init_tid(int cpu)
{
	return cpu;
}

static inline void note_cmpxchg_failure(const char *n,
		const struct kmem_cache *s, unsigned long tid)
{
#ifdef SLUB_DEBUG_CMPXCHG
	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);

	printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);

#ifdef CONFIG_PREEMPT
	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
		printk("due to cpu change %d -> %d\n",
			tid_to_cpu(tid), tid_to_cpu(actual_tid));
	else
#endif
	if (tid_to_event(tid) != tid_to_event(actual_tid))
		printk("due to cpu running other code. Event %ld->%ld\n",
			tid_to_event(tid), tid_to_event(actual_tid));
	else
		printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
			actual_tid, tid, next_tid(tid));
#endif
	stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
}

static void init_kmem_cache_cpus(struct kmem_cache *s)
{
	int cpu;

	for_each_possible_cpu(cpu)
		per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
}

/*
 * Remove the cpu slab
 */
static void deactivate_slab(struct kmem_cache *s, struct page *page,
				void *freelist)
{
	enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
	int lock = 0;
	enum slab_modes l = M_NONE, m = M_NONE;
	void *nextfree;
	int tail = DEACTIVATE_TO_HEAD;
	struct page new;
	struct page old;

	if (page->freelist) {
		stat(s, DEACTIVATE_REMOTE_FREES);
		tail = DEACTIVATE_TO_TAIL;
	}

	/*
	 * Stage one: Free all available per cpu objects back
	 * to the page freelist while it is still frozen. Leave the
	 * last one.
	 *
	 * There is no need to take the list->lock because the page
	 * is still frozen.
	 */
	while (freelist && (nextfree = get_freepointer(s, freelist))) {
		void *prior;
		unsigned long counters;

		do {
			prior = page->freelist;
			counters = page->counters;
			set_freepointer(s, freelist, prior);
			new.counters = counters;
			new.inuse--;
			VM_BUG_ON(!new.frozen);

		} while (!__cmpxchg_double_slab(s, page,
			prior, counters,
			freelist, new.counters,
			"drain percpu freelist"));

		freelist = nextfree;
	}

	/*
	 * Stage two: Ensure that the page is unfrozen while the
	 * list presence reflects the actual number of objects
	 * during unfreeze.
	 *
	 * We set up the list membership and then perform a cmpxchg
	 * with the count. If there is a mismatch then the page
	 * is not unfrozen but the page is on the wrong list.
	 *
	 * Then we restart the process which may have to remove
	 * the page from the list that we just put it on again
	 * because the number of objects in the slab may have
	 * changed.
	 */
redo:

	old.freelist = page->freelist;
	old.counters = page->counters;
	VM_BUG_ON(!old.frozen);

	/* Determine target state of the slab */
	new.counters = old.counters;
	if (freelist) {
		new.inuse--;
		set_freepointer(s, freelist, old.freelist);
		new.freelist = freelist;
	} else
		new.freelist = old.freelist;

	new.frozen = 0;

	if (!new.inuse && n->nr_partial > s->min_partial)
		m = M_FREE;
	else if (new.freelist) {
		m = M_PARTIAL;
		if (!lock) {
			lock = 1;
			/*
			 * Taking the spinlock removes the possibility
			 * that acquire_slab() will see a slab page that
			 * is frozen
			 */
			spin_lock(&n->list_lock);
		}
	} else {
		m = M_FULL;
		if (kmem_cache_debug(s) && !lock) {
			lock = 1;
			/*
			 * This also ensures that the scanning of full
			 * slabs from diagnostic functions will not see
			 * any frozen slabs.
			 */
			spin_lock(&n->list_lock);
		}
	}

	if (l != m) {
		if (l == M_PARTIAL)
			remove_partial(n, page);
		else if (l == M_FULL)
			remove_full(s, page);

		if (m == M_PARTIAL) {
			add_partial(n, page, tail);
			stat(s, tail);
		} else if (m == M_FULL) {
			stat(s, DEACTIVATE_FULL);
			add_full(s, n, page);
		}
	}

	l = m;
	if (!__cmpxchg_double_slab(s, page,
				old.freelist, old.counters,
				new.freelist, new.counters,
				"unfreezing slab"))
		goto redo;

	if (lock)
		spin_unlock(&n->list_lock);

	if (m == M_FREE) {
		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, page);
		stat(s, FREE_SLAB);
	}
}

/*
 * Unfreeze all the cpu partial slabs.
 *
 * This function must be called with interrupts disabled
 * for the cpu using c (or some other guarantee must be in place
 * that prevents concurrent accesses).
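 *
 * Both callers visible later in this file satisfy this requirement:
 * __flush_cpu_slab() runs with interrupts off (e.g. from the IPI issued by
 * flush_all()), and put_cpu_partial() wraps the call in
 * local_irq_save()/local_irq_restore().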
1928 */ 1929 static void unfreeze_partials(struct kmem_cache *s, 1930 struct kmem_cache_cpu *c) 1931 { 1932 #ifdef CONFIG_SLUB_CPU_PARTIAL 1933 struct kmem_cache_node *n = NULL, *n2 = NULL; 1934 struct page *page, *discard_page = NULL; 1935 1936 while ((page = c->partial)) { 1937 struct page new; 1938 struct page old; 1939 1940 c->partial = page->next; 1941 1942 n2 = get_node(s, page_to_nid(page)); 1943 if (n != n2) { 1944 if (n) 1945 spin_unlock(&n->list_lock); 1946 1947 n = n2; 1948 spin_lock(&n->list_lock); 1949 } 1950 1951 do { 1952 1953 old.freelist = page->freelist; 1954 old.counters = page->counters; 1955 VM_BUG_ON(!old.frozen); 1956 1957 new.counters = old.counters; 1958 new.freelist = old.freelist; 1959 1960 new.frozen = 0; 1961 1962 } while (!__cmpxchg_double_slab(s, page, 1963 old.freelist, old.counters, 1964 new.freelist, new.counters, 1965 "unfreezing slab")); 1966 1967 if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) { 1968 page->next = discard_page; 1969 discard_page = page; 1970 } else { 1971 add_partial(n, page, DEACTIVATE_TO_TAIL); 1972 stat(s, FREE_ADD_PARTIAL); 1973 } 1974 } 1975 1976 if (n) 1977 spin_unlock(&n->list_lock); 1978 1979 while (discard_page) { 1980 page = discard_page; 1981 discard_page = discard_page->next; 1982 1983 stat(s, DEACTIVATE_EMPTY); 1984 discard_slab(s, page); 1985 stat(s, FREE_SLAB); 1986 } 1987 #endif 1988 } 1989 1990 /* 1991 * Put a page that was just frozen (in __slab_free) into a partial page 1992 * slot if available. This is done without interrupts disabled and without 1993 * preemption disabled. The cmpxchg is racy and may put the partial page 1994 * onto a random cpus partial slot. 1995 * 1996 * If we did not find a slot then simply move all the partials to the 1997 * per node partial list. 1998 */ 1999 static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) 2000 { 2001 #ifdef CONFIG_SLUB_CPU_PARTIAL 2002 struct page *oldpage; 2003 int pages; 2004 int pobjects; 2005 2006 do { 2007 pages = 0; 2008 pobjects = 0; 2009 oldpage = this_cpu_read(s->cpu_slab->partial); 2010 2011 if (oldpage) { 2012 pobjects = oldpage->pobjects; 2013 pages = oldpage->pages; 2014 if (drain && pobjects > s->cpu_partial) { 2015 unsigned long flags; 2016 /* 2017 * partial array is full. Move the existing 2018 * set to the per node partial list. 2019 */ 2020 local_irq_save(flags); 2021 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); 2022 local_irq_restore(flags); 2023 oldpage = NULL; 2024 pobjects = 0; 2025 pages = 0; 2026 stat(s, CPU_PARTIAL_DRAIN); 2027 } 2028 } 2029 2030 pages++; 2031 pobjects += page->objects - page->inuse; 2032 2033 page->pages = pages; 2034 page->pobjects = pobjects; 2035 page->next = oldpage; 2036 2037 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) 2038 != oldpage); 2039 #endif 2040 } 2041 2042 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 2043 { 2044 stat(s, CPUSLAB_FLUSH); 2045 deactivate_slab(s, c->page, c->freelist); 2046 2047 c->tid = next_tid(c->tid); 2048 c->page = NULL; 2049 c->freelist = NULL; 2050 } 2051 2052 /* 2053 * Flush cpu slab. 2054 * 2055 * Called from IPI handler with interrupts disabled. 
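 *
 * flush_all() below only sends the IPI to cpus for which has_cpu_slab()
 * reports a cpu slab or a cpu partial list, so cpus holding nothing for
 * this cache are not interrupted.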
2056 */ 2057 static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 2058 { 2059 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 2060 2061 if (likely(c)) { 2062 if (c->page) 2063 flush_slab(s, c); 2064 2065 unfreeze_partials(s, c); 2066 } 2067 } 2068 2069 static void flush_cpu_slab(void *d) 2070 { 2071 struct kmem_cache *s = d; 2072 2073 __flush_cpu_slab(s, smp_processor_id()); 2074 } 2075 2076 static bool has_cpu_slab(int cpu, void *info) 2077 { 2078 struct kmem_cache *s = info; 2079 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 2080 2081 return c->page || c->partial; 2082 } 2083 2084 static void flush_all(struct kmem_cache *s) 2085 { 2086 on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); 2087 } 2088 2089 /* 2090 * Check if the objects in a per cpu structure fit numa 2091 * locality expectations. 2092 */ 2093 static inline int node_match(struct page *page, int node) 2094 { 2095 #ifdef CONFIG_NUMA 2096 if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node)) 2097 return 0; 2098 #endif 2099 return 1; 2100 } 2101 2102 static int count_free(struct page *page) 2103 { 2104 return page->objects - page->inuse; 2105 } 2106 2107 static unsigned long count_partial(struct kmem_cache_node *n, 2108 int (*get_count)(struct page *)) 2109 { 2110 unsigned long flags; 2111 unsigned long x = 0; 2112 struct page *page; 2113 2114 spin_lock_irqsave(&n->list_lock, flags); 2115 list_for_each_entry(page, &n->partial, lru) 2116 x += get_count(page); 2117 spin_unlock_irqrestore(&n->list_lock, flags); 2118 return x; 2119 } 2120 2121 static inline unsigned long node_nr_objs(struct kmem_cache_node *n) 2122 { 2123 #ifdef CONFIG_SLUB_DEBUG 2124 return atomic_long_read(&n->total_objects); 2125 #else 2126 return 0; 2127 #endif 2128 } 2129 2130 static noinline void 2131 slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) 2132 { 2133 int node; 2134 2135 printk(KERN_WARNING 2136 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", 2137 nid, gfpflags); 2138 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " 2139 "default order: %d, min order: %d\n", s->name, s->object_size, 2140 s->size, oo_order(s->oo), oo_order(s->min)); 2141 2142 if (oo_order(s->min) > get_order(s->object_size)) 2143 printk(KERN_WARNING " %s debugging increased min order, use " 2144 "slub_debug=O to disable.\n", s->name); 2145 2146 for_each_online_node(node) { 2147 struct kmem_cache_node *n = get_node(s, node); 2148 unsigned long nr_slabs; 2149 unsigned long nr_objs; 2150 unsigned long nr_free; 2151 2152 if (!n) 2153 continue; 2154 2155 nr_free = count_partial(n, count_free); 2156 nr_slabs = node_nr_slabs(n); 2157 nr_objs = node_nr_objs(n); 2158 2159 printk(KERN_WARNING 2160 " node %d: slabs: %ld, objs: %ld, free: %ld\n", 2161 node, nr_slabs, nr_objs, nr_free); 2162 } 2163 } 2164 2165 static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, 2166 int node, struct kmem_cache_cpu **pc) 2167 { 2168 void *freelist; 2169 struct kmem_cache_cpu *c = *pc; 2170 struct page *page; 2171 2172 freelist = get_partial(s, flags, node, c); 2173 2174 if (freelist) 2175 return freelist; 2176 2177 page = new_slab(s, flags, node); 2178 if (page) { 2179 c = __this_cpu_ptr(s->cpu_slab); 2180 if (c->page) 2181 flush_slab(s, c); 2182 2183 /* 2184 * No other reference to the page yet so we can 2185 * muck around with it freely without cmpxchg 2186 */ 2187 freelist = page->freelist; 2188 page->freelist = NULL; 2189 2190 stat(s, ALLOC_SLAB); 2191 c->page = page; 2192 *pc = c; 
2193 } else 2194 freelist = NULL; 2195 2196 return freelist; 2197 } 2198 2199 static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) 2200 { 2201 if (unlikely(PageSlabPfmemalloc(page))) 2202 return gfp_pfmemalloc_allowed(gfpflags); 2203 2204 return true; 2205 } 2206 2207 /* 2208 * Check the page->freelist of a page and either transfer the freelist to the 2209 * per cpu freelist or deactivate the page. 2210 * 2211 * The page is still frozen if the return value is not NULL. 2212 * 2213 * If this function returns NULL then the page has been unfrozen. 2214 * 2215 * This function must be called with interrupt disabled. 2216 */ 2217 static inline void *get_freelist(struct kmem_cache *s, struct page *page) 2218 { 2219 struct page new; 2220 unsigned long counters; 2221 void *freelist; 2222 2223 do { 2224 freelist = page->freelist; 2225 counters = page->counters; 2226 2227 new.counters = counters; 2228 VM_BUG_ON(!new.frozen); 2229 2230 new.inuse = page->objects; 2231 new.frozen = freelist != NULL; 2232 2233 } while (!__cmpxchg_double_slab(s, page, 2234 freelist, counters, 2235 NULL, new.counters, 2236 "get_freelist")); 2237 2238 return freelist; 2239 } 2240 2241 /* 2242 * Slow path. The lockless freelist is empty or we need to perform 2243 * debugging duties. 2244 * 2245 * Processing is still very fast if new objects have been freed to the 2246 * regular freelist. In that case we simply take over the regular freelist 2247 * as the lockless freelist and zap the regular freelist. 2248 * 2249 * If that is not working then we fall back to the partial lists. We take the 2250 * first element of the freelist as the object to allocate now and move the 2251 * rest of the freelist to the lockless freelist. 2252 * 2253 * And if we were unable to get a new slab from the partial slab lists then 2254 * we need to allocate a new slab. This is the slowest path since it involves 2255 * a call to the page allocator and the setup of a new slab. 2256 */ 2257 static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 2258 unsigned long addr, struct kmem_cache_cpu *c) 2259 { 2260 void *freelist; 2261 struct page *page; 2262 unsigned long flags; 2263 2264 local_irq_save(flags); 2265 #ifdef CONFIG_PREEMPT 2266 /* 2267 * We may have been preempted and rescheduled on a different 2268 * cpu before disabling interrupts. Need to reload cpu area 2269 * pointer. 
2270 */ 2271 c = this_cpu_ptr(s->cpu_slab); 2272 #endif 2273 2274 page = c->page; 2275 if (!page) 2276 goto new_slab; 2277 redo: 2278 2279 if (unlikely(!node_match(page, node))) { 2280 stat(s, ALLOC_NODE_MISMATCH); 2281 deactivate_slab(s, page, c->freelist); 2282 c->page = NULL; 2283 c->freelist = NULL; 2284 goto new_slab; 2285 } 2286 2287 /* 2288 * By rights, we should be searching for a slab page that was 2289 * PFMEMALLOC but right now, we are losing the pfmemalloc 2290 * information when the page leaves the per-cpu allocator 2291 */ 2292 if (unlikely(!pfmemalloc_match(page, gfpflags))) { 2293 deactivate_slab(s, page, c->freelist); 2294 c->page = NULL; 2295 c->freelist = NULL; 2296 goto new_slab; 2297 } 2298 2299 /* must check again c->freelist in case of cpu migration or IRQ */ 2300 freelist = c->freelist; 2301 if (freelist) 2302 goto load_freelist; 2303 2304 stat(s, ALLOC_SLOWPATH); 2305 2306 freelist = get_freelist(s, page); 2307 2308 if (!freelist) { 2309 c->page = NULL; 2310 stat(s, DEACTIVATE_BYPASS); 2311 goto new_slab; 2312 } 2313 2314 stat(s, ALLOC_REFILL); 2315 2316 load_freelist: 2317 /* 2318 * freelist is pointing to the list of objects to be used. 2319 * page is pointing to the page from which the objects are obtained. 2320 * That page must be frozen for per cpu allocations to work. 2321 */ 2322 VM_BUG_ON(!c->page->frozen); 2323 c->freelist = get_freepointer(s, freelist); 2324 c->tid = next_tid(c->tid); 2325 local_irq_restore(flags); 2326 return freelist; 2327 2328 new_slab: 2329 2330 if (c->partial) { 2331 page = c->page = c->partial; 2332 c->partial = page->next; 2333 stat(s, CPU_PARTIAL_ALLOC); 2334 c->freelist = NULL; 2335 goto redo; 2336 } 2337 2338 freelist = new_slab_objects(s, gfpflags, node, &c); 2339 2340 if (unlikely(!freelist)) { 2341 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 2342 slab_out_of_memory(s, gfpflags, node); 2343 2344 local_irq_restore(flags); 2345 return NULL; 2346 } 2347 2348 page = c->page; 2349 if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) 2350 goto load_freelist; 2351 2352 /* Only entered in the debug case */ 2353 if (kmem_cache_debug(s) && 2354 !alloc_debug_processing(s, page, freelist, addr)) 2355 goto new_slab; /* Slab failed checks. Next slab needed */ 2356 2357 deactivate_slab(s, page, get_freepointer(s, freelist)); 2358 c->page = NULL; 2359 c->freelist = NULL; 2360 local_irq_restore(flags); 2361 return freelist; 2362 } 2363 2364 /* 2365 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) 2366 * have the fastpath folded into their functions. So no function call 2367 * overhead for requests that can be satisfied on the fastpath. 2368 * 2369 * The fastpath works by first checking if the lockless freelist can be used. 2370 * If not then __slab_alloc is called for slow processing. 2371 * 2372 * Otherwise we can simply pick the next object from the lockless free list. 2373 */ 2374 static __always_inline void *slab_alloc_node(struct kmem_cache *s, 2375 gfp_t gfpflags, int node, unsigned long addr) 2376 { 2377 void **object; 2378 struct kmem_cache_cpu *c; 2379 struct page *page; 2380 unsigned long tid; 2381 2382 if (slab_pre_alloc_hook(s, gfpflags)) 2383 return NULL; 2384 2385 s = memcg_kmem_get_cache(s, gfpflags); 2386 redo: 2387 /* 2388 * Must read kmem_cache cpu data via this cpu ptr. Preemption is 2389 * enabled. We may switch back and forth between cpus while 2390 * reading from one cpu area. 
That does not matter as long 2391 * as we end up on the original cpu again when doing the cmpxchg. 2392 * 2393 * Preemption is disabled for the retrieval of the tid because that 2394 * must occur from the current processor. We cannot allow rescheduling 2395 * on a different processor between the determination of the pointer 2396 * and the retrieval of the tid. 2397 */ 2398 preempt_disable(); 2399 c = __this_cpu_ptr(s->cpu_slab); 2400 2401 /* 2402 * The transaction ids are globally unique per cpu and per operation on 2403 * a per cpu queue. Thus they can be guarantee that the cmpxchg_double 2404 * occurs on the right processor and that there was no operation on the 2405 * linked list in between. 2406 */ 2407 tid = c->tid; 2408 preempt_enable(); 2409 2410 object = c->freelist; 2411 page = c->page; 2412 if (unlikely(!object || !node_match(page, node))) 2413 object = __slab_alloc(s, gfpflags, node, addr, c); 2414 2415 else { 2416 void *next_object = get_freepointer_safe(s, object); 2417 2418 /* 2419 * The cmpxchg will only match if there was no additional 2420 * operation and if we are on the right processor. 2421 * 2422 * The cmpxchg does the following atomically (without lock 2423 * semantics!) 2424 * 1. Relocate first pointer to the current per cpu area. 2425 * 2. Verify that tid and freelist have not been changed 2426 * 3. If they were not changed replace tid and freelist 2427 * 2428 * Since this is without lock semantics the protection is only 2429 * against code executing on this cpu *not* from access by 2430 * other cpus. 2431 */ 2432 if (unlikely(!this_cpu_cmpxchg_double( 2433 s->cpu_slab->freelist, s->cpu_slab->tid, 2434 object, tid, 2435 next_object, next_tid(tid)))) { 2436 2437 note_cmpxchg_failure("slab_alloc", s, tid); 2438 goto redo; 2439 } 2440 prefetch_freepointer(s, next_object); 2441 stat(s, ALLOC_FASTPATH); 2442 } 2443 2444 if (unlikely(gfpflags & __GFP_ZERO) && object) 2445 memset(object, 0, s->object_size); 2446 2447 slab_post_alloc_hook(s, gfpflags, object); 2448 2449 return object; 2450 } 2451 2452 static __always_inline void *slab_alloc(struct kmem_cache *s, 2453 gfp_t gfpflags, unsigned long addr) 2454 { 2455 return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr); 2456 } 2457 2458 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 2459 { 2460 void *ret = slab_alloc(s, gfpflags, _RET_IP_); 2461 2462 trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, 2463 s->size, gfpflags); 2464 2465 return ret; 2466 } 2467 EXPORT_SYMBOL(kmem_cache_alloc); 2468 2469 #ifdef CONFIG_TRACING 2470 void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) 2471 { 2472 void *ret = slab_alloc(s, gfpflags, _RET_IP_); 2473 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); 2474 return ret; 2475 } 2476 EXPORT_SYMBOL(kmem_cache_alloc_trace); 2477 #endif 2478 2479 #ifdef CONFIG_NUMA 2480 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) 2481 { 2482 void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); 2483 2484 trace_kmem_cache_alloc_node(_RET_IP_, ret, 2485 s->object_size, s->size, gfpflags, node); 2486 2487 return ret; 2488 } 2489 EXPORT_SYMBOL(kmem_cache_alloc_node); 2490 2491 #ifdef CONFIG_TRACING 2492 void *kmem_cache_alloc_node_trace(struct kmem_cache *s, 2493 gfp_t gfpflags, 2494 int node, size_t size) 2495 { 2496 void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); 2497 2498 trace_kmalloc_node(_RET_IP_, ret, 2499 size, s->size, gfpflags, node); 2500 return ret; 2501 } 2502 EXPORT_SYMBOL(kmem_cache_alloc_node_trace); 
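/*
 * Illustrative usage sketch (not part of SLUB): how a hypothetical caller
 * might use the node aware allocation API exported above. The cache name,
 * the "foo" structure and the error handling are made up for the example.
 *
 *	struct foo {
 *		int a;
 *		struct list_head list;
 *	};
 *	static struct kmem_cache *foo_cache;
 *	struct foo *f;
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
 *					SLAB_HWCACHE_ALIGN, NULL);
 *	if (!foo_cache)
 *		return -ENOMEM;
 *
 *	f = kmem_cache_alloc_node(foo_cache, GFP_KERNEL, numa_node_id());
 *	if (!f)
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free(foo_cache, f);
 *	kmem_cache_destroy(foo_cache);
 */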
2503 #endif 2504 #endif 2505 2506 /* 2507 * Slow patch handling. This may still be called frequently since objects 2508 * have a longer lifetime than the cpu slabs in most processing loads. 2509 * 2510 * So we still attempt to reduce cache line usage. Just take the slab 2511 * lock and free the item. If there is no additional partial page 2512 * handling required then we can return immediately. 2513 */ 2514 static void __slab_free(struct kmem_cache *s, struct page *page, 2515 void *x, unsigned long addr) 2516 { 2517 void *prior; 2518 void **object = (void *)x; 2519 int was_frozen; 2520 struct page new; 2521 unsigned long counters; 2522 struct kmem_cache_node *n = NULL; 2523 unsigned long uninitialized_var(flags); 2524 2525 stat(s, FREE_SLOWPATH); 2526 2527 if (kmem_cache_debug(s) && 2528 !(n = free_debug_processing(s, page, x, addr, &flags))) 2529 return; 2530 2531 do { 2532 if (unlikely(n)) { 2533 spin_unlock_irqrestore(&n->list_lock, flags); 2534 n = NULL; 2535 } 2536 prior = page->freelist; 2537 counters = page->counters; 2538 set_freepointer(s, object, prior); 2539 new.counters = counters; 2540 was_frozen = new.frozen; 2541 new.inuse--; 2542 if ((!new.inuse || !prior) && !was_frozen) { 2543 2544 if (kmem_cache_has_cpu_partial(s) && !prior) 2545 2546 /* 2547 * Slab was on no list before and will be 2548 * partially empty 2549 * We can defer the list move and instead 2550 * freeze it. 2551 */ 2552 new.frozen = 1; 2553 2554 else { /* Needs to be taken off a list */ 2555 2556 n = get_node(s, page_to_nid(page)); 2557 /* 2558 * Speculatively acquire the list_lock. 2559 * If the cmpxchg does not succeed then we may 2560 * drop the list_lock without any processing. 2561 * 2562 * Otherwise the list_lock will synchronize with 2563 * other processors updating the list of slabs. 2564 */ 2565 spin_lock_irqsave(&n->list_lock, flags); 2566 2567 } 2568 } 2569 2570 } while (!cmpxchg_double_slab(s, page, 2571 prior, counters, 2572 object, new.counters, 2573 "__slab_free")); 2574 2575 if (likely(!n)) { 2576 2577 /* 2578 * If we just froze the page then put it onto the 2579 * per cpu partial list. 2580 */ 2581 if (new.frozen && !was_frozen) { 2582 put_cpu_partial(s, page, 1); 2583 stat(s, CPU_PARTIAL_FREE); 2584 } 2585 /* 2586 * The list lock was not taken therefore no list 2587 * activity can be necessary. 2588 */ 2589 if (was_frozen) 2590 stat(s, FREE_FROZEN); 2591 return; 2592 } 2593 2594 if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) 2595 goto slab_empty; 2596 2597 /* 2598 * Objects left in the slab. If it was not on the partial list before 2599 * then add it. 2600 */ 2601 if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { 2602 if (kmem_cache_debug(s)) 2603 remove_full(s, page); 2604 add_partial(n, page, DEACTIVATE_TO_TAIL); 2605 stat(s, FREE_ADD_PARTIAL); 2606 } 2607 spin_unlock_irqrestore(&n->list_lock, flags); 2608 return; 2609 2610 slab_empty: 2611 if (prior) { 2612 /* 2613 * Slab on the partial list. 2614 */ 2615 remove_partial(n, page); 2616 stat(s, FREE_REMOVE_PARTIAL); 2617 } else 2618 /* Slab must be on the full list */ 2619 remove_full(s, page); 2620 2621 spin_unlock_irqrestore(&n->list_lock, flags); 2622 stat(s, FREE_SLAB); 2623 discard_slab(s, page); 2624 } 2625 2626 /* 2627 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that 2628 * can perform fastpath freeing without additional function calls. 2629 * 2630 * The fastpath is only possible if we are freeing to the current cpu slab 2631 * of this processor. 
This is typically the case if we have just allocated
2632 * the item before.
2633 *
2634 * If fastpath is not possible then fall back to __slab_free where we deal
2635 * with all sorts of special processing.
2636 */
2637 static __always_inline void slab_free(struct kmem_cache *s,
2638 struct page *page, void *x, unsigned long addr)
2639 {
2640 void **object = (void *)x;
2641 struct kmem_cache_cpu *c;
2642 unsigned long tid;
2643
2644 slab_free_hook(s, x);
2645
2646 redo:
2647 /*
2648 * Determine the current cpu's per cpu slab.
2649 * The cpu may change afterward. However that does not matter since
2650 * data is retrieved via this pointer. If we are on the same cpu
2651 * during the cmpxchg then the free will succeed.
2652 */
2653 preempt_disable();
2654 c = __this_cpu_ptr(s->cpu_slab);
2655
2656 tid = c->tid;
2657 preempt_enable();
2658
2659 if (likely(page == c->page)) {
2660 set_freepointer(s, object, c->freelist);
2661
2662 if (unlikely(!this_cpu_cmpxchg_double(
2663 s->cpu_slab->freelist, s->cpu_slab->tid,
2664 c->freelist, tid,
2665 object, next_tid(tid)))) {
2666
2667 note_cmpxchg_failure("slab_free", s, tid);
2668 goto redo;
2669 }
2670 stat(s, FREE_FASTPATH);
2671 } else
2672 __slab_free(s, page, x, addr);
2673
2674 }
2675
2676 void kmem_cache_free(struct kmem_cache *s, void *x)
2677 {
2678 s = cache_from_obj(s, x);
2679 if (!s)
2680 return;
2681 slab_free(s, virt_to_head_page(x), x, _RET_IP_);
2682 trace_kmem_cache_free(_RET_IP_, x);
2683 }
2684 EXPORT_SYMBOL(kmem_cache_free);
2685
2686 /*
2687 * Object placement in a slab is made very easy because we always start at
2688 * offset 0. If we tune the size of the object to the alignment then we can
2689 * get the required alignment by putting one properly sized object after
2690 * another.
2691 *
2692 * Notice that the allocation order determines the sizes of the per cpu
2693 * caches. Each processor always has one slab available for allocations.
2694 * Increasing the allocation order reduces the number of times that slabs
2695 * must be moved on and off the partial lists and is therefore a factor in
2696 * locking overhead.
2697 */
2698
2699 /*
2700 * Minimum / Maximum order of slab pages. This influences locking overhead
2701 * and slab fragmentation. A higher order reduces the number of partial slabs
2702 * and increases the number of allocations possible without having to
2703 * take the list_lock.
2704 */
2705 static int slub_min_order;
2706 static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
2707 static int slub_min_objects;
2708
2709 /*
2710 * Merge control. If this is set then no merging of slab caches will occur.
2711 * (Could be removed. This was introduced to pacify the merge skeptics.)
2712 */
2713 static int slub_nomerge;
2714
2715 /*
2716 * Calculate the order of allocation given a slab object size.
2717 *
2718 * The order of allocation has significant impact on performance and other
2719 * system components. Generally order 0 allocations should be preferred since
2720 * order 0 does not cause fragmentation in the page allocator. Larger objects
2721 * can be problematic to put into order 0 slabs because there may be too much
2722 * unused space left. We go to a higher order if more than 1/16th of the slab
2723 * would be wasted.
2724 *
2725 * In order to reach satisfactory performance we must ensure that a minimum
2726 * number of objects is in one slab. Otherwise we may generate too much
2727 * activity on the partial lists which requires taking the list_lock.
This is 2728 * less a concern for large slabs though which are rarely used. 2729 * 2730 * slub_max_order specifies the order where we begin to stop considering the 2731 * number of objects in a slab as critical. If we reach slub_max_order then 2732 * we try to keep the page order as low as possible. So we accept more waste 2733 * of space in favor of a small page order. 2734 * 2735 * Higher order allocations also allow the placement of more objects in a 2736 * slab and thereby reduce object handling overhead. If the user has 2737 * requested a higher mininum order then we start with that one instead of 2738 * the smallest order which will fit the object. 2739 */ 2740 static inline int slab_order(int size, int min_objects, 2741 int max_order, int fract_leftover, int reserved) 2742 { 2743 int order; 2744 int rem; 2745 int min_order = slub_min_order; 2746 2747 if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) 2748 return get_order(size * MAX_OBJS_PER_PAGE) - 1; 2749 2750 for (order = max(min_order, 2751 fls(min_objects * size - 1) - PAGE_SHIFT); 2752 order <= max_order; order++) { 2753 2754 unsigned long slab_size = PAGE_SIZE << order; 2755 2756 if (slab_size < min_objects * size + reserved) 2757 continue; 2758 2759 rem = (slab_size - reserved) % size; 2760 2761 if (rem <= slab_size / fract_leftover) 2762 break; 2763 2764 } 2765 2766 return order; 2767 } 2768 2769 static inline int calculate_order(int size, int reserved) 2770 { 2771 int order; 2772 int min_objects; 2773 int fraction; 2774 int max_objects; 2775 2776 /* 2777 * Attempt to find best configuration for a slab. This 2778 * works by first attempting to generate a layout with 2779 * the best configuration and backing off gradually. 2780 * 2781 * First we reduce the acceptable waste in a slab. Then 2782 * we reduce the minimum objects required in a slab. 2783 */ 2784 min_objects = slub_min_objects; 2785 if (!min_objects) 2786 min_objects = 4 * (fls(nr_cpu_ids) + 1); 2787 max_objects = order_objects(slub_max_order, size, reserved); 2788 min_objects = min(min_objects, max_objects); 2789 2790 while (min_objects > 1) { 2791 fraction = 16; 2792 while (fraction >= 4) { 2793 order = slab_order(size, min_objects, 2794 slub_max_order, fraction, reserved); 2795 if (order <= slub_max_order) 2796 return order; 2797 fraction /= 2; 2798 } 2799 min_objects--; 2800 } 2801 2802 /* 2803 * We were unable to place multiple objects in a slab. Now 2804 * lets see if we can place a single object there. 2805 */ 2806 order = slab_order(size, 1, slub_max_order, 1, reserved); 2807 if (order <= slub_max_order) 2808 return order; 2809 2810 /* 2811 * Doh this slab cannot be placed using slub_max_order. 2812 */ 2813 order = slab_order(size, 1, MAX_ORDER, 1, reserved); 2814 if (order < MAX_ORDER) 2815 return order; 2816 return -ENOSYS; 2817 } 2818 2819 static void 2820 init_kmem_cache_node(struct kmem_cache_node *n) 2821 { 2822 n->nr_partial = 0; 2823 spin_lock_init(&n->list_lock); 2824 INIT_LIST_HEAD(&n->partial); 2825 #ifdef CONFIG_SLUB_DEBUG 2826 atomic_long_set(&n->nr_slabs, 0); 2827 atomic_long_set(&n->total_objects, 0); 2828 INIT_LIST_HEAD(&n->full); 2829 #endif 2830 } 2831 2832 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) 2833 { 2834 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < 2835 KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu)); 2836 2837 /* 2838 * Must align to double word boundary for the double cmpxchg 2839 * instructions to work; see __pcpu_double_call_return_bool(). 
2840 */ 2841 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2842 2 * sizeof(void *)); 2843 2844 if (!s->cpu_slab) 2845 return 0; 2846 2847 init_kmem_cache_cpus(s); 2848 2849 return 1; 2850 } 2851 2852 static struct kmem_cache *kmem_cache_node; 2853 2854 /* 2855 * No kmalloc_node yet so do it by hand. We know that this is the first 2856 * slab on the node for this slabcache. There are no concurrent accesses 2857 * possible. 2858 * 2859 * Note that this function only works on the kmem_cache_node 2860 * when allocating for the kmem_cache_node. This is used for bootstrapping 2861 * memory on a fresh node that has no slab structures yet. 2862 */ 2863 static void early_kmem_cache_node_alloc(int node) 2864 { 2865 struct page *page; 2866 struct kmem_cache_node *n; 2867 2868 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); 2869 2870 page = new_slab(kmem_cache_node, GFP_NOWAIT, node); 2871 2872 BUG_ON(!page); 2873 if (page_to_nid(page) != node) { 2874 printk(KERN_ERR "SLUB: Unable to allocate memory from " 2875 "node %d\n", node); 2876 printk(KERN_ERR "SLUB: Allocating a useless per node structure " 2877 "in order to be able to continue\n"); 2878 } 2879 2880 n = page->freelist; 2881 BUG_ON(!n); 2882 page->freelist = get_freepointer(kmem_cache_node, n); 2883 page->inuse = 1; 2884 page->frozen = 0; 2885 kmem_cache_node->node[node] = n; 2886 #ifdef CONFIG_SLUB_DEBUG 2887 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 2888 init_tracking(kmem_cache_node, n); 2889 #endif 2890 init_kmem_cache_node(n); 2891 inc_slabs_node(kmem_cache_node, node, page->objects); 2892 2893 add_partial(n, page, DEACTIVATE_TO_HEAD); 2894 } 2895 2896 static void free_kmem_cache_nodes(struct kmem_cache *s) 2897 { 2898 int node; 2899 2900 for_each_node_state(node, N_NORMAL_MEMORY) { 2901 struct kmem_cache_node *n = s->node[node]; 2902 2903 if (n) 2904 kmem_cache_free(kmem_cache_node, n); 2905 2906 s->node[node] = NULL; 2907 } 2908 } 2909 2910 static int init_kmem_cache_nodes(struct kmem_cache *s) 2911 { 2912 int node; 2913 2914 for_each_node_state(node, N_NORMAL_MEMORY) { 2915 struct kmem_cache_node *n; 2916 2917 if (slab_state == DOWN) { 2918 early_kmem_cache_node_alloc(node); 2919 continue; 2920 } 2921 n = kmem_cache_alloc_node(kmem_cache_node, 2922 GFP_KERNEL, node); 2923 2924 if (!n) { 2925 free_kmem_cache_nodes(s); 2926 return 0; 2927 } 2928 2929 s->node[node] = n; 2930 init_kmem_cache_node(n); 2931 } 2932 return 1; 2933 } 2934 2935 static void set_min_partial(struct kmem_cache *s, unsigned long min) 2936 { 2937 if (min < MIN_PARTIAL) 2938 min = MIN_PARTIAL; 2939 else if (min > MAX_PARTIAL) 2940 min = MAX_PARTIAL; 2941 s->min_partial = min; 2942 } 2943 2944 /* 2945 * calculate_sizes() determines the order and the distribution of data within 2946 * a slab object. 2947 */ 2948 static int calculate_sizes(struct kmem_cache *s, int forced_order) 2949 { 2950 unsigned long flags = s->flags; 2951 unsigned long size = s->object_size; 2952 int order; 2953 2954 /* 2955 * Round up object size to the next word boundary. We can only 2956 * place the free pointer at word boundaries and this determines 2957 * the possible location of the free pointer. 2958 */ 2959 size = ALIGN(size, sizeof(void *)); 2960 2961 #ifdef CONFIG_SLUB_DEBUG 2962 /* 2963 * Determine if we can poison the object itself. If the user of 2964 * the slab may touch the object after free or before allocation 2965 * then we should never poison the object itself. 
2966 */ 2967 if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && 2968 !s->ctor) 2969 s->flags |= __OBJECT_POISON; 2970 else 2971 s->flags &= ~__OBJECT_POISON; 2972 2973 2974 /* 2975 * If we are Redzoning then check if there is some space between the 2976 * end of the object and the free pointer. If not then add an 2977 * additional word to have some bytes to store Redzone information. 2978 */ 2979 if ((flags & SLAB_RED_ZONE) && size == s->object_size) 2980 size += sizeof(void *); 2981 #endif 2982 2983 /* 2984 * With that we have determined the number of bytes in actual use 2985 * by the object. This is the potential offset to the free pointer. 2986 */ 2987 s->inuse = size; 2988 2989 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || 2990 s->ctor)) { 2991 /* 2992 * Relocate free pointer after the object if it is not 2993 * permitted to overwrite the first word of the object on 2994 * kmem_cache_free. 2995 * 2996 * This is the case if we do RCU, have a constructor or 2997 * destructor or are poisoning the objects. 2998 */ 2999 s->offset = size; 3000 size += sizeof(void *); 3001 } 3002 3003 #ifdef CONFIG_SLUB_DEBUG 3004 if (flags & SLAB_STORE_USER) 3005 /* 3006 * Need to store information about allocs and frees after 3007 * the object. 3008 */ 3009 size += 2 * sizeof(struct track); 3010 3011 if (flags & SLAB_RED_ZONE) 3012 /* 3013 * Add some empty padding so that we can catch 3014 * overwrites from earlier objects rather than let 3015 * tracking information or the free pointer be 3016 * corrupted if a user writes before the start 3017 * of the object. 3018 */ 3019 size += sizeof(void *); 3020 #endif 3021 3022 /* 3023 * SLUB stores one object immediately after another beginning from 3024 * offset 0. In order to align the objects we have to simply size 3025 * each object to conform to the alignment. 3026 */ 3027 size = ALIGN(size, s->align); 3028 s->size = size; 3029 if (forced_order >= 0) 3030 order = forced_order; 3031 else 3032 order = calculate_order(size, s->reserved); 3033 3034 if (order < 0) 3035 return 0; 3036 3037 s->allocflags = 0; 3038 if (order) 3039 s->allocflags |= __GFP_COMP; 3040 3041 if (s->flags & SLAB_CACHE_DMA) 3042 s->allocflags |= GFP_DMA; 3043 3044 if (s->flags & SLAB_RECLAIM_ACCOUNT) 3045 s->allocflags |= __GFP_RECLAIMABLE; 3046 3047 /* 3048 * Determine the number of objects per slab 3049 */ 3050 s->oo = oo_make(order, size, s->reserved); 3051 s->min = oo_make(get_order(size), size, s->reserved); 3052 if (oo_objects(s->oo) > oo_objects(s->max)) 3053 s->max = s->oo; 3054 3055 return !!oo_objects(s->oo); 3056 } 3057 3058 static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) 3059 { 3060 s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); 3061 s->reserved = 0; 3062 3063 if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) 3064 s->reserved = sizeof(struct rcu_head); 3065 3066 if (!calculate_sizes(s, -1)) 3067 goto error; 3068 if (disable_higher_order_debug) { 3069 /* 3070 * Disable debugging flags that store metadata if the min slab 3071 * order increased. 
3072 */ 3073 if (get_order(s->size) > get_order(s->object_size)) { 3074 s->flags &= ~DEBUG_METADATA_FLAGS; 3075 s->offset = 0; 3076 if (!calculate_sizes(s, -1)) 3077 goto error; 3078 } 3079 } 3080 3081 #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ 3082 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 3083 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) 3084 /* Enable fast mode */ 3085 s->flags |= __CMPXCHG_DOUBLE; 3086 #endif 3087 3088 /* 3089 * The larger the object size is, the more pages we want on the partial 3090 * list to avoid pounding the page allocator excessively. 3091 */ 3092 set_min_partial(s, ilog2(s->size) / 2); 3093 3094 /* 3095 * cpu_partial determined the maximum number of objects kept in the 3096 * per cpu partial lists of a processor. 3097 * 3098 * Per cpu partial lists mainly contain slabs that just have one 3099 * object freed. If they are used for allocation then they can be 3100 * filled up again with minimal effort. The slab will never hit the 3101 * per node partial lists and therefore no locking will be required. 3102 * 3103 * This setting also determines 3104 * 3105 * A) The number of objects from per cpu partial slabs dumped to the 3106 * per node list when we reach the limit. 3107 * B) The number of objects in cpu partial slabs to extract from the 3108 * per node list when we run out of per cpu objects. We only fetch 3109 * 50% to keep some capacity around for frees. 3110 */ 3111 if (!kmem_cache_has_cpu_partial(s)) 3112 s->cpu_partial = 0; 3113 else if (s->size >= PAGE_SIZE) 3114 s->cpu_partial = 2; 3115 else if (s->size >= 1024) 3116 s->cpu_partial = 6; 3117 else if (s->size >= 256) 3118 s->cpu_partial = 13; 3119 else 3120 s->cpu_partial = 30; 3121 3122 #ifdef CONFIG_NUMA 3123 s->remote_node_defrag_ratio = 1000; 3124 #endif 3125 if (!init_kmem_cache_nodes(s)) 3126 goto error; 3127 3128 if (alloc_kmem_cache_cpus(s)) 3129 return 0; 3130 3131 free_kmem_cache_nodes(s); 3132 error: 3133 if (flags & SLAB_PANIC) 3134 panic("Cannot create slab %s size=%lu realsize=%u " 3135 "order=%u offset=%u flags=%lx\n", 3136 s->name, (unsigned long)s->size, s->size, 3137 oo_order(s->oo), s->offset, flags); 3138 return -EINVAL; 3139 } 3140 3141 static void list_slab_objects(struct kmem_cache *s, struct page *page, 3142 const char *text) 3143 { 3144 #ifdef CONFIG_SLUB_DEBUG 3145 void *addr = page_address(page); 3146 void *p; 3147 unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) * 3148 sizeof(long), GFP_ATOMIC); 3149 if (!map) 3150 return; 3151 slab_err(s, page, text, s->name); 3152 slab_lock(page); 3153 3154 get_map(s, page, map); 3155 for_each_object(p, s, addr, page->objects) { 3156 3157 if (!test_bit(slab_index(p, s, addr), map)) { 3158 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", 3159 p, p - addr); 3160 print_tracking(s, p); 3161 } 3162 } 3163 slab_unlock(page); 3164 kfree(map); 3165 #endif 3166 } 3167 3168 /* 3169 * Attempt to free all partial slabs on a node. 3170 * This is called from kmem_cache_close(). We must be the last thread 3171 * using the cache and therefore we do not need to lock anymore. 3172 */ 3173 static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) 3174 { 3175 struct page *page, *h; 3176 3177 list_for_each_entry_safe(page, h, &n->partial, lru) { 3178 if (!page->inuse) { 3179 remove_partial(n, page); 3180 discard_slab(s, page); 3181 } else { 3182 list_slab_objects(s, page, 3183 "Objects remaining in %s on kmem_cache_close()"); 3184 } 3185 } 3186 } 3187 3188 /* 3189 * Release all resources used by a slab cache. 
3190 */ 3191 static inline int kmem_cache_close(struct kmem_cache *s) 3192 { 3193 int node; 3194 3195 flush_all(s); 3196 /* Attempt to free all objects */ 3197 for_each_node_state(node, N_NORMAL_MEMORY) { 3198 struct kmem_cache_node *n = get_node(s, node); 3199 3200 free_partial(s, n); 3201 if (n->nr_partial || slabs_node(s, node)) 3202 return 1; 3203 } 3204 free_percpu(s->cpu_slab); 3205 free_kmem_cache_nodes(s); 3206 return 0; 3207 } 3208 3209 int __kmem_cache_shutdown(struct kmem_cache *s) 3210 { 3211 int rc = kmem_cache_close(s); 3212 3213 if (!rc) { 3214 /* 3215 * We do the same lock strategy around sysfs_slab_add, see 3216 * __kmem_cache_create. Because this is pretty much the last 3217 * operation we do and the lock will be released shortly after 3218 * that in slab_common.c, we could just move sysfs_slab_remove 3219 * to a later point in common code. We should do that when we 3220 * have a common sysfs framework for all allocators. 3221 */ 3222 mutex_unlock(&slab_mutex); 3223 sysfs_slab_remove(s); 3224 mutex_lock(&slab_mutex); 3225 } 3226 3227 return rc; 3228 } 3229 3230 /******************************************************************** 3231 * Kmalloc subsystem 3232 *******************************************************************/ 3233 3234 static int __init setup_slub_min_order(char *str) 3235 { 3236 get_option(&str, &slub_min_order); 3237 3238 return 1; 3239 } 3240 3241 __setup("slub_min_order=", setup_slub_min_order); 3242 3243 static int __init setup_slub_max_order(char *str) 3244 { 3245 get_option(&str, &slub_max_order); 3246 slub_max_order = min(slub_max_order, MAX_ORDER - 1); 3247 3248 return 1; 3249 } 3250 3251 __setup("slub_max_order=", setup_slub_max_order); 3252 3253 static int __init setup_slub_min_objects(char *str) 3254 { 3255 get_option(&str, &slub_min_objects); 3256 3257 return 1; 3258 } 3259 3260 __setup("slub_min_objects=", setup_slub_min_objects); 3261 3262 static int __init setup_slub_nomerge(char *str) 3263 { 3264 slub_nomerge = 1; 3265 return 1; 3266 } 3267 3268 __setup("slub_nomerge", setup_slub_nomerge); 3269 3270 void *__kmalloc(size_t size, gfp_t flags) 3271 { 3272 struct kmem_cache *s; 3273 void *ret; 3274 3275 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) 3276 return kmalloc_large(size, flags); 3277 3278 s = kmalloc_slab(size, flags); 3279 3280 if (unlikely(ZERO_OR_NULL_PTR(s))) 3281 return s; 3282 3283 ret = slab_alloc(s, flags, _RET_IP_); 3284 3285 trace_kmalloc(_RET_IP_, ret, size, s->size, flags); 3286 3287 return ret; 3288 } 3289 EXPORT_SYMBOL(__kmalloc); 3290 3291 #ifdef CONFIG_NUMA 3292 static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 3293 { 3294 struct page *page; 3295 void *ptr = NULL; 3296 3297 flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; 3298 page = alloc_pages_node(node, flags, get_order(size)); 3299 if (page) 3300 ptr = page_address(page); 3301 3302 kmalloc_large_node_hook(ptr, size, flags); 3303 return ptr; 3304 } 3305 3306 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3307 { 3308 struct kmem_cache *s; 3309 void *ret; 3310 3311 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { 3312 ret = kmalloc_large_node(size, flags, node); 3313 3314 trace_kmalloc_node(_RET_IP_, ret, 3315 size, PAGE_SIZE << get_order(size), 3316 flags, node); 3317 3318 return ret; 3319 } 3320 3321 s = kmalloc_slab(size, flags); 3322 3323 if (unlikely(ZERO_OR_NULL_PTR(s))) 3324 return s; 3325 3326 ret = slab_alloc_node(s, flags, node, _RET_IP_); 3327 3328 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); 3329 3330 return 
ret; 3331 } 3332 EXPORT_SYMBOL(__kmalloc_node); 3333 #endif 3334 3335 size_t ksize(const void *object) 3336 { 3337 struct page *page; 3338 3339 if (unlikely(object == ZERO_SIZE_PTR)) 3340 return 0; 3341 3342 page = virt_to_head_page(object); 3343 3344 if (unlikely(!PageSlab(page))) { 3345 WARN_ON(!PageCompound(page)); 3346 return PAGE_SIZE << compound_order(page); 3347 } 3348 3349 return slab_ksize(page->slab_cache); 3350 } 3351 EXPORT_SYMBOL(ksize); 3352 3353 void kfree(const void *x) 3354 { 3355 struct page *page; 3356 void *object = (void *)x; 3357 3358 trace_kfree(_RET_IP_, x); 3359 3360 if (unlikely(ZERO_OR_NULL_PTR(x))) 3361 return; 3362 3363 page = virt_to_head_page(x); 3364 if (unlikely(!PageSlab(page))) { 3365 BUG_ON(!PageCompound(page)); 3366 kfree_hook(x); 3367 __free_memcg_kmem_pages(page, compound_order(page)); 3368 return; 3369 } 3370 slab_free(page->slab_cache, page, object, _RET_IP_); 3371 } 3372 EXPORT_SYMBOL(kfree); 3373 3374 /* 3375 * kmem_cache_shrink removes empty slabs from the partial lists and sorts 3376 * the remaining slabs by the number of items in use. The slabs with the 3377 * most items in use come first. New allocations will then fill those up 3378 * and thus they can be removed from the partial lists. 3379 * 3380 * The slabs with the least items are placed last. This results in them 3381 * being allocated from last increasing the chance that the last objects 3382 * are freed in them. 3383 */ 3384 int kmem_cache_shrink(struct kmem_cache *s) 3385 { 3386 int node; 3387 int i; 3388 struct kmem_cache_node *n; 3389 struct page *page; 3390 struct page *t; 3391 int objects = oo_objects(s->max); 3392 struct list_head *slabs_by_inuse = 3393 kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); 3394 unsigned long flags; 3395 3396 if (!slabs_by_inuse) 3397 return -ENOMEM; 3398 3399 flush_all(s); 3400 for_each_node_state(node, N_NORMAL_MEMORY) { 3401 n = get_node(s, node); 3402 3403 if (!n->nr_partial) 3404 continue; 3405 3406 for (i = 0; i < objects; i++) 3407 INIT_LIST_HEAD(slabs_by_inuse + i); 3408 3409 spin_lock_irqsave(&n->list_lock, flags); 3410 3411 /* 3412 * Build lists indexed by the items in use in each slab. 3413 * 3414 * Note that concurrent frees may occur while we hold the 3415 * list_lock. page->inuse here is the upper limit. 3416 */ 3417 list_for_each_entry_safe(page, t, &n->partial, lru) { 3418 list_move(&page->lru, slabs_by_inuse + page->inuse); 3419 if (!page->inuse) 3420 n->nr_partial--; 3421 } 3422 3423 /* 3424 * Rebuild the partial list with the slabs filled up most 3425 * first and the least used slabs at the end. 
3426 */ 3427 for (i = objects - 1; i > 0; i--) 3428 list_splice(slabs_by_inuse + i, n->partial.prev); 3429 3430 spin_unlock_irqrestore(&n->list_lock, flags); 3431 3432 /* Release empty slabs */ 3433 list_for_each_entry_safe(page, t, slabs_by_inuse, lru) 3434 discard_slab(s, page); 3435 } 3436 3437 kfree(slabs_by_inuse); 3438 return 0; 3439 } 3440 EXPORT_SYMBOL(kmem_cache_shrink); 3441 3442 static int slab_mem_going_offline_callback(void *arg) 3443 { 3444 struct kmem_cache *s; 3445 3446 mutex_lock(&slab_mutex); 3447 list_for_each_entry(s, &slab_caches, list) 3448 kmem_cache_shrink(s); 3449 mutex_unlock(&slab_mutex); 3450 3451 return 0; 3452 } 3453 3454 static void slab_mem_offline_callback(void *arg) 3455 { 3456 struct kmem_cache_node *n; 3457 struct kmem_cache *s; 3458 struct memory_notify *marg = arg; 3459 int offline_node; 3460 3461 offline_node = marg->status_change_nid_normal; 3462 3463 /* 3464 * If the node still has available memory. we need kmem_cache_node 3465 * for it yet. 3466 */ 3467 if (offline_node < 0) 3468 return; 3469 3470 mutex_lock(&slab_mutex); 3471 list_for_each_entry(s, &slab_caches, list) { 3472 n = get_node(s, offline_node); 3473 if (n) { 3474 /* 3475 * if n->nr_slabs > 0, slabs still exist on the node 3476 * that is going down. We were unable to free them, 3477 * and offline_pages() function shouldn't call this 3478 * callback. So, we must fail. 3479 */ 3480 BUG_ON(slabs_node(s, offline_node)); 3481 3482 s->node[offline_node] = NULL; 3483 kmem_cache_free(kmem_cache_node, n); 3484 } 3485 } 3486 mutex_unlock(&slab_mutex); 3487 } 3488 3489 static int slab_mem_going_online_callback(void *arg) 3490 { 3491 struct kmem_cache_node *n; 3492 struct kmem_cache *s; 3493 struct memory_notify *marg = arg; 3494 int nid = marg->status_change_nid_normal; 3495 int ret = 0; 3496 3497 /* 3498 * If the node's memory is already available, then kmem_cache_node is 3499 * already created. Nothing to do. 3500 */ 3501 if (nid < 0) 3502 return 0; 3503 3504 /* 3505 * We are bringing a node online. No memory is available yet. We must 3506 * allocate a kmem_cache_node structure in order to bring the node 3507 * online. 3508 */ 3509 mutex_lock(&slab_mutex); 3510 list_for_each_entry(s, &slab_caches, list) { 3511 /* 3512 * XXX: kmem_cache_alloc_node will fallback to other nodes 3513 * since memory is not yet available from the node that 3514 * is brought up. 
3515 */ 3516 n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); 3517 if (!n) { 3518 ret = -ENOMEM; 3519 goto out; 3520 } 3521 init_kmem_cache_node(n); 3522 s->node[nid] = n; 3523 } 3524 out: 3525 mutex_unlock(&slab_mutex); 3526 return ret; 3527 } 3528 3529 static int slab_memory_callback(struct notifier_block *self, 3530 unsigned long action, void *arg) 3531 { 3532 int ret = 0; 3533 3534 switch (action) { 3535 case MEM_GOING_ONLINE: 3536 ret = slab_mem_going_online_callback(arg); 3537 break; 3538 case MEM_GOING_OFFLINE: 3539 ret = slab_mem_going_offline_callback(arg); 3540 break; 3541 case MEM_OFFLINE: 3542 case MEM_CANCEL_ONLINE: 3543 slab_mem_offline_callback(arg); 3544 break; 3545 case MEM_ONLINE: 3546 case MEM_CANCEL_OFFLINE: 3547 break; 3548 } 3549 if (ret) 3550 ret = notifier_from_errno(ret); 3551 else 3552 ret = NOTIFY_OK; 3553 return ret; 3554 } 3555 3556 static struct notifier_block slab_memory_callback_nb = { 3557 .notifier_call = slab_memory_callback, 3558 .priority = SLAB_CALLBACK_PRI, 3559 }; 3560 3561 /******************************************************************** 3562 * Basic setup of slabs 3563 *******************************************************************/ 3564 3565 /* 3566 * Used for early kmem_cache structures that were allocated using 3567 * the page allocator. Allocate them properly then fix up the pointers 3568 * that may be pointing to the wrong kmem_cache structure. 3569 */ 3570 3571 static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) 3572 { 3573 int node; 3574 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 3575 3576 memcpy(s, static_cache, kmem_cache->object_size); 3577 3578 /* 3579 * This runs very early, and only the boot processor is supposed to be 3580 * up. Even if it weren't true, IRQs are not up so we couldn't fire 3581 * IPIs around. 3582 */ 3583 __flush_cpu_slab(s, smp_processor_id()); 3584 for_each_node_state(node, N_NORMAL_MEMORY) { 3585 struct kmem_cache_node *n = get_node(s, node); 3586 struct page *p; 3587 3588 if (n) { 3589 list_for_each_entry(p, &n->partial, lru) 3590 p->slab_cache = s; 3591 3592 #ifdef CONFIG_SLUB_DEBUG 3593 list_for_each_entry(p, &n->full, lru) 3594 p->slab_cache = s; 3595 #endif 3596 } 3597 } 3598 list_add(&s->list, &slab_caches); 3599 return s; 3600 } 3601 3602 void __init kmem_cache_init(void) 3603 { 3604 static __initdata struct kmem_cache boot_kmem_cache, 3605 boot_kmem_cache_node; 3606 3607 if (debug_guardpage_minorder()) 3608 slub_max_order = 0; 3609 3610 kmem_cache_node = &boot_kmem_cache_node; 3611 kmem_cache = &boot_kmem_cache; 3612 3613 create_boot_cache(kmem_cache_node, "kmem_cache_node", 3614 sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); 3615 3616 register_hotmemory_notifier(&slab_memory_callback_nb); 3617 3618 /* Able to allocate the per node structures */ 3619 slab_state = PARTIAL; 3620 3621 create_boot_cache(kmem_cache, "kmem_cache", 3622 offsetof(struct kmem_cache, node) + 3623 nr_node_ids * sizeof(struct kmem_cache_node *), 3624 SLAB_HWCACHE_ALIGN); 3625 3626 kmem_cache = bootstrap(&boot_kmem_cache); 3627 3628 /* 3629 * Allocate kmem_cache_node properly from the kmem_cache slab. 3630 * kmem_cache_node is separately allocated so no need to 3631 * update any list pointers. 
3632 */ 3633 kmem_cache_node = bootstrap(&boot_kmem_cache_node); 3634 3635 /* Now we can use the kmem_cache to allocate kmalloc slabs */ 3636 create_kmalloc_caches(0); 3637 3638 #ifdef CONFIG_SMP 3639 register_cpu_notifier(&slab_notifier); 3640 #endif 3641 3642 printk(KERN_INFO 3643 "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d," 3644 " CPUs=%d, Nodes=%d\n", 3645 cache_line_size(), 3646 slub_min_order, slub_max_order, slub_min_objects, 3647 nr_cpu_ids, nr_node_ids); 3648 } 3649 3650 void __init kmem_cache_init_late(void) 3651 { 3652 } 3653 3654 /* 3655 * Find a mergeable slab cache 3656 */ 3657 static int slab_unmergeable(struct kmem_cache *s) 3658 { 3659 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) 3660 return 1; 3661 3662 if (s->ctor) 3663 return 1; 3664 3665 /* 3666 * We may have set a slab to be unmergeable during bootstrap. 3667 */ 3668 if (s->refcount < 0) 3669 return 1; 3670 3671 return 0; 3672 } 3673 3674 static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, 3675 size_t align, unsigned long flags, const char *name, 3676 void (*ctor)(void *)) 3677 { 3678 struct kmem_cache *s; 3679 3680 if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) 3681 return NULL; 3682 3683 if (ctor) 3684 return NULL; 3685 3686 size = ALIGN(size, sizeof(void *)); 3687 align = calculate_alignment(flags, align, size); 3688 size = ALIGN(size, align); 3689 flags = kmem_cache_flags(size, flags, name, NULL); 3690 3691 list_for_each_entry(s, &slab_caches, list) { 3692 if (slab_unmergeable(s)) 3693 continue; 3694 3695 if (size > s->size) 3696 continue; 3697 3698 if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) 3699 continue; 3700 /* 3701 * Check if alignment is compatible. 3702 * Courtesy of Adrian Drzewiecki 3703 */ 3704 if ((s->size & ~(align - 1)) != s->size) 3705 continue; 3706 3707 if (s->size - size >= sizeof(void *)) 3708 continue; 3709 3710 if (!cache_match_memcg(s, memcg)) 3711 continue; 3712 3713 return s; 3714 } 3715 return NULL; 3716 } 3717 3718 struct kmem_cache * 3719 __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, 3720 size_t align, unsigned long flags, void (*ctor)(void *)) 3721 { 3722 struct kmem_cache *s; 3723 3724 s = find_mergeable(memcg, size, align, flags, name, ctor); 3725 if (s) { 3726 s->refcount++; 3727 /* 3728 * Adjust the object sizes so that we clear 3729 * the complete object on kzalloc. 3730 */ 3731 s->object_size = max(s->object_size, (int)size); 3732 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3733 3734 if (sysfs_slab_alias(s, name)) { 3735 s->refcount--; 3736 s = NULL; 3737 } 3738 } 3739 3740 return s; 3741 } 3742 3743 int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) 3744 { 3745 int err; 3746 3747 err = kmem_cache_open(s, flags); 3748 if (err) 3749 return err; 3750 3751 /* Mutex is not taken during early boot */ 3752 if (slab_state <= UP) 3753 return 0; 3754 3755 memcg_propagate_slab_attrs(s); 3756 mutex_unlock(&slab_mutex); 3757 err = sysfs_slab_add(s); 3758 mutex_lock(&slab_mutex); 3759 3760 if (err) 3761 kmem_cache_close(s); 3762 3763 return err; 3764 } 3765 3766 #ifdef CONFIG_SMP 3767 /* 3768 * Use the cpu notifier to insure that the cpu slabs are flushed when 3769 * necessary. 
3770 */ 3771 static int slab_cpuup_callback(struct notifier_block *nfb, 3772 unsigned long action, void *hcpu) 3773 { 3774 long cpu = (long)hcpu; 3775 struct kmem_cache *s; 3776 unsigned long flags; 3777 3778 switch (action) { 3779 case CPU_UP_CANCELED: 3780 case CPU_UP_CANCELED_FROZEN: 3781 case CPU_DEAD: 3782 case CPU_DEAD_FROZEN: 3783 mutex_lock(&slab_mutex); 3784 list_for_each_entry(s, &slab_caches, list) { 3785 local_irq_save(flags); 3786 __flush_cpu_slab(s, cpu); 3787 local_irq_restore(flags); 3788 } 3789 mutex_unlock(&slab_mutex); 3790 break; 3791 default: 3792 break; 3793 } 3794 return NOTIFY_OK; 3795 } 3796 3797 static struct notifier_block slab_notifier = { 3798 .notifier_call = slab_cpuup_callback 3799 }; 3800 3801 #endif 3802 3803 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) 3804 { 3805 struct kmem_cache *s; 3806 void *ret; 3807 3808 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) 3809 return kmalloc_large(size, gfpflags); 3810 3811 s = kmalloc_slab(size, gfpflags); 3812 3813 if (unlikely(ZERO_OR_NULL_PTR(s))) 3814 return s; 3815 3816 ret = slab_alloc(s, gfpflags, caller); 3817 3818 /* Honor the call site pointer we received. */ 3819 trace_kmalloc(caller, ret, size, s->size, gfpflags); 3820 3821 return ret; 3822 } 3823 3824 #ifdef CONFIG_NUMA 3825 void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 3826 int node, unsigned long caller) 3827 { 3828 struct kmem_cache *s; 3829 void *ret; 3830 3831 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { 3832 ret = kmalloc_large_node(size, gfpflags, node); 3833 3834 trace_kmalloc_node(caller, ret, 3835 size, PAGE_SIZE << get_order(size), 3836 gfpflags, node); 3837 3838 return ret; 3839 } 3840 3841 s = kmalloc_slab(size, gfpflags); 3842 3843 if (unlikely(ZERO_OR_NULL_PTR(s))) 3844 return s; 3845 3846 ret = slab_alloc_node(s, gfpflags, node, caller); 3847 3848 /* Honor the call site pointer we received. 
*/ 3849 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); 3850 3851 return ret; 3852 } 3853 #endif 3854 3855 #ifdef CONFIG_SYSFS 3856 static int count_inuse(struct page *page) 3857 { 3858 return page->inuse; 3859 } 3860 3861 static int count_total(struct page *page) 3862 { 3863 return page->objects; 3864 } 3865 #endif 3866 3867 #ifdef CONFIG_SLUB_DEBUG 3868 static int validate_slab(struct kmem_cache *s, struct page *page, 3869 unsigned long *map) 3870 { 3871 void *p; 3872 void *addr = page_address(page); 3873 3874 if (!check_slab(s, page) || 3875 !on_freelist(s, page, NULL)) 3876 return 0; 3877 3878 /* Now we know that a valid freelist exists */ 3879 bitmap_zero(map, page->objects); 3880 3881 get_map(s, page, map); 3882 for_each_object(p, s, addr, page->objects) { 3883 if (test_bit(slab_index(p, s, addr), map)) 3884 if (!check_object(s, page, p, SLUB_RED_INACTIVE)) 3885 return 0; 3886 } 3887 3888 for_each_object(p, s, addr, page->objects) 3889 if (!test_bit(slab_index(p, s, addr), map)) 3890 if (!check_object(s, page, p, SLUB_RED_ACTIVE)) 3891 return 0; 3892 return 1; 3893 } 3894 3895 static void validate_slab_slab(struct kmem_cache *s, struct page *page, 3896 unsigned long *map) 3897 { 3898 slab_lock(page); 3899 validate_slab(s, page, map); 3900 slab_unlock(page); 3901 } 3902 3903 static int validate_slab_node(struct kmem_cache *s, 3904 struct kmem_cache_node *n, unsigned long *map) 3905 { 3906 unsigned long count = 0; 3907 struct page *page; 3908 unsigned long flags; 3909 3910 spin_lock_irqsave(&n->list_lock, flags); 3911 3912 list_for_each_entry(page, &n->partial, lru) { 3913 validate_slab_slab(s, page, map); 3914 count++; 3915 } 3916 if (count != n->nr_partial) 3917 printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " 3918 "counter=%ld\n", s->name, count, n->nr_partial); 3919 3920 if (!(s->flags & SLAB_STORE_USER)) 3921 goto out; 3922 3923 list_for_each_entry(page, &n->full, lru) { 3924 validate_slab_slab(s, page, map); 3925 count++; 3926 } 3927 if (count != atomic_long_read(&n->nr_slabs)) 3928 printk(KERN_ERR "SLUB: %s %ld slabs counted but " 3929 "counter=%ld\n", s->name, count, 3930 atomic_long_read(&n->nr_slabs)); 3931 3932 out: 3933 spin_unlock_irqrestore(&n->list_lock, flags); 3934 return count; 3935 } 3936 3937 static long validate_slab_cache(struct kmem_cache *s) 3938 { 3939 int node; 3940 unsigned long count = 0; 3941 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * 3942 sizeof(unsigned long), GFP_KERNEL); 3943 3944 if (!map) 3945 return -ENOMEM; 3946 3947 flush_all(s); 3948 for_each_node_state(node, N_NORMAL_MEMORY) { 3949 struct kmem_cache_node *n = get_node(s, node); 3950 3951 count += validate_slab_node(s, n, map); 3952 } 3953 kfree(map); 3954 return count; 3955 } 3956 /* 3957 * Generate lists of code addresses where slabcache objects are allocated 3958 * and freed. 
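 *
 * The collected entries are kept in a loc_track array sorted by call site
 * address, so add_location() below can find or insert an entry with a
 * binary search instead of a linear scan over all locations.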
3959 */ 3960 3961 struct location { 3962 unsigned long count; 3963 unsigned long addr; 3964 long long sum_time; 3965 long min_time; 3966 long max_time; 3967 long min_pid; 3968 long max_pid; 3969 DECLARE_BITMAP(cpus, NR_CPUS); 3970 nodemask_t nodes; 3971 }; 3972 3973 struct loc_track { 3974 unsigned long max; 3975 unsigned long count; 3976 struct location *loc; 3977 }; 3978 3979 static void free_loc_track(struct loc_track *t) 3980 { 3981 if (t->max) 3982 free_pages((unsigned long)t->loc, 3983 get_order(sizeof(struct location) * t->max)); 3984 } 3985 3986 static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) 3987 { 3988 struct location *l; 3989 int order; 3990 3991 order = get_order(sizeof(struct location) * max); 3992 3993 l = (void *)__get_free_pages(flags, order); 3994 if (!l) 3995 return 0; 3996 3997 if (t->count) { 3998 memcpy(l, t->loc, sizeof(struct location) * t->count); 3999 free_loc_track(t); 4000 } 4001 t->max = max; 4002 t->loc = l; 4003 return 1; 4004 } 4005 4006 static int add_location(struct loc_track *t, struct kmem_cache *s, 4007 const struct track *track) 4008 { 4009 long start, end, pos; 4010 struct location *l; 4011 unsigned long caddr; 4012 unsigned long age = jiffies - track->when; 4013 4014 start = -1; 4015 end = t->count; 4016 4017 for ( ; ; ) { 4018 pos = start + (end - start + 1) / 2; 4019 4020 /* 4021 * There is nothing at "end". If we end up there 4022 * we need to add something to before end. 4023 */ 4024 if (pos == end) 4025 break; 4026 4027 caddr = t->loc[pos].addr; 4028 if (track->addr == caddr) { 4029 4030 l = &t->loc[pos]; 4031 l->count++; 4032 if (track->when) { 4033 l->sum_time += age; 4034 if (age < l->min_time) 4035 l->min_time = age; 4036 if (age > l->max_time) 4037 l->max_time = age; 4038 4039 if (track->pid < l->min_pid) 4040 l->min_pid = track->pid; 4041 if (track->pid > l->max_pid) 4042 l->max_pid = track->pid; 4043 4044 cpumask_set_cpu(track->cpu, 4045 to_cpumask(l->cpus)); 4046 } 4047 node_set(page_to_nid(virt_to_page(track)), l->nodes); 4048 return 1; 4049 } 4050 4051 if (track->addr < caddr) 4052 end = pos; 4053 else 4054 start = pos; 4055 } 4056 4057 /* 4058 * Not found. Insert new tracking element. 
4059 */ 4060 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC)) 4061 return 0; 4062 4063 l = t->loc + pos; 4064 if (pos < t->count) 4065 memmove(l + 1, l, 4066 (t->count - pos) * sizeof(struct location)); 4067 t->count++; 4068 l->count = 1; 4069 l->addr = track->addr; 4070 l->sum_time = age; 4071 l->min_time = age; 4072 l->max_time = age; 4073 l->min_pid = track->pid; 4074 l->max_pid = track->pid; 4075 cpumask_clear(to_cpumask(l->cpus)); 4076 cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); 4077 nodes_clear(l->nodes); 4078 node_set(page_to_nid(virt_to_page(track)), l->nodes); 4079 return 1; 4080 } 4081 4082 static void process_slab(struct loc_track *t, struct kmem_cache *s, 4083 struct page *page, enum track_item alloc, 4084 unsigned long *map) 4085 { 4086 void *addr = page_address(page); 4087 void *p; 4088 4089 bitmap_zero(map, page->objects); 4090 get_map(s, page, map); 4091 4092 for_each_object(p, s, addr, page->objects) 4093 if (!test_bit(slab_index(p, s, addr), map)) 4094 add_location(t, s, get_track(s, p, alloc)); 4095 } 4096 4097 static int list_locations(struct kmem_cache *s, char *buf, 4098 enum track_item alloc) 4099 { 4100 int len = 0; 4101 unsigned long i; 4102 struct loc_track t = { 0, 0, NULL }; 4103 int node; 4104 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * 4105 sizeof(unsigned long), GFP_KERNEL); 4106 4107 if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), 4108 GFP_TEMPORARY)) { 4109 kfree(map); 4110 return sprintf(buf, "Out of memory\n"); 4111 } 4112 /* Push back cpu slabs */ 4113 flush_all(s); 4114 4115 for_each_node_state(node, N_NORMAL_MEMORY) { 4116 struct kmem_cache_node *n = get_node(s, node); 4117 unsigned long flags; 4118 struct page *page; 4119 4120 if (!atomic_long_read(&n->nr_slabs)) 4121 continue; 4122 4123 spin_lock_irqsave(&n->list_lock, flags); 4124 list_for_each_entry(page, &n->partial, lru) 4125 process_slab(&t, s, page, alloc, map); 4126 list_for_each_entry(page, &n->full, lru) 4127 process_slab(&t, s, page, alloc, map); 4128 spin_unlock_irqrestore(&n->list_lock, flags); 4129 } 4130 4131 for (i = 0; i < t.count; i++) { 4132 struct location *l = &t.loc[i]; 4133 4134 if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) 4135 break; 4136 len += sprintf(buf + len, "%7ld ", l->count); 4137 4138 if (l->addr) 4139 len += sprintf(buf + len, "%pS", (void *)l->addr); 4140 else 4141 len += sprintf(buf + len, "<not-available>"); 4142 4143 if (l->sum_time != l->min_time) { 4144 len += sprintf(buf + len, " age=%ld/%ld/%ld", 4145 l->min_time, 4146 (long)div_u64(l->sum_time, l->count), 4147 l->max_time); 4148 } else 4149 len += sprintf(buf + len, " age=%ld", 4150 l->min_time); 4151 4152 if (l->min_pid != l->max_pid) 4153 len += sprintf(buf + len, " pid=%ld-%ld", 4154 l->min_pid, l->max_pid); 4155 else 4156 len += sprintf(buf + len, " pid=%ld", 4157 l->min_pid); 4158 4159 if (num_online_cpus() > 1 && 4160 !cpumask_empty(to_cpumask(l->cpus)) && 4161 len < PAGE_SIZE - 60) { 4162 len += sprintf(buf + len, " cpus="); 4163 len += cpulist_scnprintf(buf + len, 4164 PAGE_SIZE - len - 50, 4165 to_cpumask(l->cpus)); 4166 } 4167 4168 if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && 4169 len < PAGE_SIZE - 60) { 4170 len += sprintf(buf + len, " nodes="); 4171 len += nodelist_scnprintf(buf + len, 4172 PAGE_SIZE - len - 50, 4173 l->nodes); 4174 } 4175 4176 len += sprintf(buf + len, "\n"); 4177 } 4178 4179 free_loc_track(&t); 4180 kfree(map); 4181 if (!t.count) 4182 len += sprintf(buf, "No data\n"); 4183 return len; 4184 } 4185 
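/*
 * The buffer filled in above is what the per-cache alloc_calls and
 * free_calls sysfs attributes return. With a cache that has SLAB_STORE_USER
 * enabled the data can be inspected from user space, e.g. (cache name and
 * sample line are hypothetical):
 *
 *	# cat /sys/kernel/slab/kmalloc-64/alloc_calls
 *	    120 alloc_inode+0x3a/0x90 age=20/613/2532 pid=1-1521 cpus=0-3 nodes=0
 *
 * Each line corresponds to one struct location: hit count, call site,
 * min/avg/max object age, pid range and the cpus/nodes the tracked
 * allocations came from.
 */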
#endif 4186 4187 #ifdef SLUB_RESILIENCY_TEST 4188 static void resiliency_test(void) 4189 { 4190 u8 *p; 4191 4192 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); 4193 4194 printk(KERN_ERR "SLUB resiliency testing\n"); 4195 printk(KERN_ERR "-----------------------\n"); 4196 printk(KERN_ERR "A. Corruption after allocation\n"); 4197 4198 p = kzalloc(16, GFP_KERNEL); 4199 p[16] = 0x12; 4200 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" 4201 " 0x12->0x%p\n\n", p + 16); 4202 4203 validate_slab_cache(kmalloc_caches[4]); 4204 4205 /* Hmmm... The next two are dangerous */ 4206 p = kzalloc(32, GFP_KERNEL); 4207 p[32 + sizeof(void *)] = 0x34; 4208 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" 4209 " 0x34 -> 0x%p\n", p); 4210 printk(KERN_ERR 4211 "If the allocated object is overwritten then this is not detectable\n\n"); 4212 4213 validate_slab_cache(kmalloc_caches[5]); 4214 p = kzalloc(64, GFP_KERNEL); 4215 p += 64 + (get_cycles() & 0xff) * sizeof(void *); 4216 *p = 0x56; 4217 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", 4218 p); 4219 printk(KERN_ERR 4220 "If the allocated object is overwritten then this is not detectable\n\n"); 4221 validate_slab_cache(kmalloc_caches[6]); 4222 4223 printk(KERN_ERR "\nB. Corruption after free\n"); 4224 p = kzalloc(128, GFP_KERNEL); 4225 kfree(p); 4226 *p = 0x78; 4227 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); 4228 validate_slab_cache(kmalloc_caches[7]); 4229 4230 p = kzalloc(256, GFP_KERNEL); 4231 kfree(p); 4232 p[50] = 0x9a; 4233 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", 4234 p); 4235 validate_slab_cache(kmalloc_caches[8]); 4236 4237 p = kzalloc(512, GFP_KERNEL); 4238 kfree(p); 4239 p[512] = 0xab; 4240 printk(KERN_ERR "\n3.
kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); 4241 validate_slab_cache(kmalloc_caches[9]); 4242 } 4243 #else 4244 #ifdef CONFIG_SYSFS 4245 static void resiliency_test(void) {}; 4246 #endif 4247 #endif 4248 4249 #ifdef CONFIG_SYSFS 4250 enum slab_stat_type { 4251 SL_ALL, /* All slabs */ 4252 SL_PARTIAL, /* Only partially allocated slabs */ 4253 SL_CPU, /* Only slabs used for cpu caches */ 4254 SL_OBJECTS, /* Determine allocated objects not slabs */ 4255 SL_TOTAL /* Determine object capacity not slabs */ 4256 }; 4257 4258 #define SO_ALL (1 << SL_ALL) 4259 #define SO_PARTIAL (1 << SL_PARTIAL) 4260 #define SO_CPU (1 << SL_CPU) 4261 #define SO_OBJECTS (1 << SL_OBJECTS) 4262 #define SO_TOTAL (1 << SL_TOTAL) 4263 4264 static ssize_t show_slab_objects(struct kmem_cache *s, 4265 char *buf, unsigned long flags) 4266 { 4267 unsigned long total = 0; 4268 int node; 4269 int x; 4270 unsigned long *nodes; 4271 4272 nodes = kzalloc(sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); 4273 if (!nodes) 4274 return -ENOMEM; 4275 4276 if (flags & SO_CPU) { 4277 int cpu; 4278 4279 for_each_possible_cpu(cpu) { 4280 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, 4281 cpu); 4282 int node; 4283 struct page *page; 4284 4285 page = ACCESS_ONCE(c->page); 4286 if (!page) 4287 continue; 4288 4289 node = page_to_nid(page); 4290 if (flags & SO_TOTAL) 4291 x = page->objects; 4292 else if (flags & SO_OBJECTS) 4293 x = page->inuse; 4294 else 4295 x = 1; 4296 4297 total += x; 4298 nodes[node] += x; 4299 4300 page = ACCESS_ONCE(c->partial); 4301 if (page) { 4302 x = page->pobjects; 4303 total += x; 4304 nodes[node] += x; 4305 } 4306 } 4307 } 4308 4309 lock_memory_hotplug(); 4310 #ifdef CONFIG_SLUB_DEBUG 4311 if (flags & SO_ALL) { 4312 for_each_node_state(node, N_NORMAL_MEMORY) { 4313 struct kmem_cache_node *n = get_node(s, node); 4314 4315 if (flags & SO_TOTAL) 4316 x = atomic_long_read(&n->total_objects); 4317 else if (flags & SO_OBJECTS) 4318 x = atomic_long_read(&n->total_objects) - 4319 count_partial(n, count_free); 4320 else 4321 x = atomic_long_read(&n->nr_slabs); 4322 total += x; 4323 nodes[node] += x; 4324 } 4325 4326 } else 4327 #endif 4328 if (flags & SO_PARTIAL) { 4329 for_each_node_state(node, N_NORMAL_MEMORY) { 4330 struct kmem_cache_node *n = get_node(s, node); 4331 4332 if (flags & SO_TOTAL) 4333 x = count_partial(n, count_total); 4334 else if (flags & SO_OBJECTS) 4335 x = count_partial(n, count_inuse); 4336 else 4337 x = n->nr_partial; 4338 total += x; 4339 nodes[node] += x; 4340 } 4341 } 4342 x = sprintf(buf, "%lu", total); 4343 #ifdef CONFIG_NUMA 4344 for_each_node_state(node, N_NORMAL_MEMORY) 4345 if (nodes[node]) 4346 x += sprintf(buf + x, " N%d=%lu", 4347 node, nodes[node]); 4348 #endif 4349 unlock_memory_hotplug(); 4350 kfree(nodes); 4351 return x + sprintf(buf + x, "\n"); 4352 } 4353 4354 #ifdef CONFIG_SLUB_DEBUG 4355 static int any_slab_objects(struct kmem_cache *s) 4356 { 4357 int node; 4358 4359 for_each_online_node(node) { 4360 struct kmem_cache_node *n = get_node(s, node); 4361 4362 if (!n) 4363 continue; 4364 4365 if (atomic_long_read(&n->total_objects)) 4366 return 1; 4367 } 4368 return 0; 4369 } 4370 #endif 4371 4372 #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 4373 #define to_slab(n) container_of(n, struct kmem_cache, kobj) 4374 4375 struct slab_attribute { 4376 struct attribute attr; 4377 ssize_t (*show)(struct kmem_cache *s, char *buf); 4378 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); 4379 }; 4380 4381 #define SLAB_ATTR_RO(_name) \ 4382 
static struct slab_attribute _name##_attr = \ 4383 __ATTR(_name, 0400, _name##_show, NULL) 4384 4385 #define SLAB_ATTR(_name) \ 4386 static struct slab_attribute _name##_attr = \ 4387 __ATTR(_name, 0600, _name##_show, _name##_store) 4388 4389 static ssize_t slab_size_show(struct kmem_cache *s, char *buf) 4390 { 4391 return sprintf(buf, "%d\n", s->size); 4392 } 4393 SLAB_ATTR_RO(slab_size); 4394 4395 static ssize_t align_show(struct kmem_cache *s, char *buf) 4396 { 4397 return sprintf(buf, "%d\n", s->align); 4398 } 4399 SLAB_ATTR_RO(align); 4400 4401 static ssize_t object_size_show(struct kmem_cache *s, char *buf) 4402 { 4403 return sprintf(buf, "%d\n", s->object_size); 4404 } 4405 SLAB_ATTR_RO(object_size); 4406 4407 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) 4408 { 4409 return sprintf(buf, "%d\n", oo_objects(s->oo)); 4410 } 4411 SLAB_ATTR_RO(objs_per_slab); 4412 4413 static ssize_t order_store(struct kmem_cache *s, 4414 const char *buf, size_t length) 4415 { 4416 unsigned long order; 4417 int err; 4418 4419 err = kstrtoul(buf, 10, &order); 4420 if (err) 4421 return err; 4422 4423 if (order > slub_max_order || order < slub_min_order) 4424 return -EINVAL; 4425 4426 calculate_sizes(s, order); 4427 return length; 4428 } 4429 4430 static ssize_t order_show(struct kmem_cache *s, char *buf) 4431 { 4432 return sprintf(buf, "%d\n", oo_order(s->oo)); 4433 } 4434 SLAB_ATTR(order); 4435 4436 static ssize_t min_partial_show(struct kmem_cache *s, char *buf) 4437 { 4438 return sprintf(buf, "%lu\n", s->min_partial); 4439 } 4440 4441 static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, 4442 size_t length) 4443 { 4444 unsigned long min; 4445 int err; 4446 4447 err = kstrtoul(buf, 10, &min); 4448 if (err) 4449 return err; 4450 4451 set_min_partial(s, min); 4452 return length; 4453 } 4454 SLAB_ATTR(min_partial); 4455 4456 static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) 4457 { 4458 return sprintf(buf, "%u\n", s->cpu_partial); 4459 } 4460 4461 static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, 4462 size_t length) 4463 { 4464 unsigned long objects; 4465 int err; 4466 4467 err = kstrtoul(buf, 10, &objects); 4468 if (err) 4469 return err; 4470 if (objects && !kmem_cache_has_cpu_partial(s)) 4471 return -EINVAL; 4472 4473 s->cpu_partial = objects; 4474 flush_all(s); 4475 return length; 4476 } 4477 SLAB_ATTR(cpu_partial); 4478 4479 static ssize_t ctor_show(struct kmem_cache *s, char *buf) 4480 { 4481 if (!s->ctor) 4482 return 0; 4483 return sprintf(buf, "%pS\n", s->ctor); 4484 } 4485 SLAB_ATTR_RO(ctor); 4486 4487 static ssize_t aliases_show(struct kmem_cache *s, char *buf) 4488 { 4489 return sprintf(buf, "%d\n", s->refcount - 1); 4490 } 4491 SLAB_ATTR_RO(aliases); 4492 4493 static ssize_t partial_show(struct kmem_cache *s, char *buf) 4494 { 4495 return show_slab_objects(s, buf, SO_PARTIAL); 4496 } 4497 SLAB_ATTR_RO(partial); 4498 4499 static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) 4500 { 4501 return show_slab_objects(s, buf, SO_CPU); 4502 } 4503 SLAB_ATTR_RO(cpu_slabs); 4504 4505 static ssize_t objects_show(struct kmem_cache *s, char *buf) 4506 { 4507 return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); 4508 } 4509 SLAB_ATTR_RO(objects); 4510 4511 static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) 4512 { 4513 return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS); 4514 } 4515 SLAB_ATTR_RO(objects_partial); 4516 4517 static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) 4518 { 4519 int 
objects = 0; 4520 int pages = 0; 4521 int cpu; 4522 int len; 4523 4524 for_each_online_cpu(cpu) { 4525 struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; 4526 4527 if (page) { 4528 pages += page->pages; 4529 objects += page->pobjects; 4530 } 4531 } 4532 4533 len = sprintf(buf, "%d(%d)", objects, pages); 4534 4535 #ifdef CONFIG_SMP 4536 for_each_online_cpu(cpu) { 4537 struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; 4538 4539 if (page && len < PAGE_SIZE - 20) 4540 len += sprintf(buf + len, " C%d=%d(%d)", cpu, 4541 page->pobjects, page->pages); 4542 } 4543 #endif 4544 return len + sprintf(buf + len, "\n"); 4545 } 4546 SLAB_ATTR_RO(slabs_cpu_partial); 4547 4548 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4549 { 4550 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4551 } 4552 4553 static ssize_t reclaim_account_store(struct kmem_cache *s, 4554 const char *buf, size_t length) 4555 { 4556 s->flags &= ~SLAB_RECLAIM_ACCOUNT; 4557 if (buf[0] == '1') 4558 s->flags |= SLAB_RECLAIM_ACCOUNT; 4559 return length; 4560 } 4561 SLAB_ATTR(reclaim_account); 4562 4563 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) 4564 { 4565 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); 4566 } 4567 SLAB_ATTR_RO(hwcache_align); 4568 4569 #ifdef CONFIG_ZONE_DMA 4570 static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) 4571 { 4572 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); 4573 } 4574 SLAB_ATTR_RO(cache_dma); 4575 #endif 4576 4577 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 4578 { 4579 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); 4580 } 4581 SLAB_ATTR_RO(destroy_by_rcu); 4582 4583 static ssize_t reserved_show(struct kmem_cache *s, char *buf) 4584 { 4585 return sprintf(buf, "%d\n", s->reserved); 4586 } 4587 SLAB_ATTR_RO(reserved); 4588 4589 #ifdef CONFIG_SLUB_DEBUG 4590 static ssize_t slabs_show(struct kmem_cache *s, char *buf) 4591 { 4592 return show_slab_objects(s, buf, SO_ALL); 4593 } 4594 SLAB_ATTR_RO(slabs); 4595 4596 static ssize_t total_objects_show(struct kmem_cache *s, char *buf) 4597 { 4598 return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); 4599 } 4600 SLAB_ATTR_RO(total_objects); 4601 4602 static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) 4603 { 4604 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 4605 } 4606 4607 static ssize_t sanity_checks_store(struct kmem_cache *s, 4608 const char *buf, size_t length) 4609 { 4610 s->flags &= ~SLAB_DEBUG_FREE; 4611 if (buf[0] == '1') { 4612 s->flags &= ~__CMPXCHG_DOUBLE; 4613 s->flags |= SLAB_DEBUG_FREE; 4614 } 4615 return length; 4616 } 4617 SLAB_ATTR(sanity_checks); 4618 4619 static ssize_t trace_show(struct kmem_cache *s, char *buf) 4620 { 4621 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); 4622 } 4623 4624 static ssize_t trace_store(struct kmem_cache *s, const char *buf, 4625 size_t length) 4626 { 4627 s->flags &= ~SLAB_TRACE; 4628 if (buf[0] == '1') { 4629 s->flags &= ~__CMPXCHG_DOUBLE; 4630 s->flags |= SLAB_TRACE; 4631 } 4632 return length; 4633 } 4634 SLAB_ATTR(trace); 4635 4636 static ssize_t red_zone_show(struct kmem_cache *s, char *buf) 4637 { 4638 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); 4639 } 4640 4641 static ssize_t red_zone_store(struct kmem_cache *s, 4642 const char *buf, size_t length) 4643 { 4644 if (any_slab_objects(s)) 4645 return -EBUSY; 4646 4647 s->flags &= ~SLAB_RED_ZONE; 4648 if (buf[0] == '1') { 4649 s->flags &= ~__CMPXCHG_DOUBLE; 
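/* runtime debug processing is incompatible with the lockless cmpxchg_double fast path, so that capability flag is dropped whenever a debug option is enabled here (the same pattern as the other runtime debug toggles) */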
4650 s->flags |= SLAB_RED_ZONE; 4651 } 4652 calculate_sizes(s, -1); 4653 return length; 4654 } 4655 SLAB_ATTR(red_zone); 4656 4657 static ssize_t poison_show(struct kmem_cache *s, char *buf) 4658 { 4659 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); 4660 } 4661 4662 static ssize_t poison_store(struct kmem_cache *s, 4663 const char *buf, size_t length) 4664 { 4665 if (any_slab_objects(s)) 4666 return -EBUSY; 4667 4668 s->flags &= ~SLAB_POISON; 4669 if (buf[0] == '1') { 4670 s->flags &= ~__CMPXCHG_DOUBLE; 4671 s->flags |= SLAB_POISON; 4672 } 4673 calculate_sizes(s, -1); 4674 return length; 4675 } 4676 SLAB_ATTR(poison); 4677 4678 static ssize_t store_user_show(struct kmem_cache *s, char *buf) 4679 { 4680 return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); 4681 } 4682 4683 static ssize_t store_user_store(struct kmem_cache *s, 4684 const char *buf, size_t length) 4685 { 4686 if (any_slab_objects(s)) 4687 return -EBUSY; 4688 4689 s->flags &= ~SLAB_STORE_USER; 4690 if (buf[0] == '1') { 4691 s->flags &= ~__CMPXCHG_DOUBLE; 4692 s->flags |= SLAB_STORE_USER; 4693 } 4694 calculate_sizes(s, -1); 4695 return length; 4696 } 4697 SLAB_ATTR(store_user); 4698 4699 static ssize_t validate_show(struct kmem_cache *s, char *buf) 4700 { 4701 return 0; 4702 } 4703 4704 static ssize_t validate_store(struct kmem_cache *s, 4705 const char *buf, size_t length) 4706 { 4707 int ret = -EINVAL; 4708 4709 if (buf[0] == '1') { 4710 ret = validate_slab_cache(s); 4711 if (ret >= 0) 4712 ret = length; 4713 } 4714 return ret; 4715 } 4716 SLAB_ATTR(validate); 4717 4718 static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) 4719 { 4720 if (!(s->flags & SLAB_STORE_USER)) 4721 return -ENOSYS; 4722 return list_locations(s, buf, TRACK_ALLOC); 4723 } 4724 SLAB_ATTR_RO(alloc_calls); 4725 4726 static ssize_t free_calls_show(struct kmem_cache *s, char *buf) 4727 { 4728 if (!(s->flags & SLAB_STORE_USER)) 4729 return -ENOSYS; 4730 return list_locations(s, buf, TRACK_FREE); 4731 } 4732 SLAB_ATTR_RO(free_calls); 4733 #endif /* CONFIG_SLUB_DEBUG */ 4734 4735 #ifdef CONFIG_FAILSLAB 4736 static ssize_t failslab_show(struct kmem_cache *s, char *buf) 4737 { 4738 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); 4739 } 4740 4741 static ssize_t failslab_store(struct kmem_cache *s, const char *buf, 4742 size_t length) 4743 { 4744 s->flags &= ~SLAB_FAILSLAB; 4745 if (buf[0] == '1') 4746 s->flags |= SLAB_FAILSLAB; 4747 return length; 4748 } 4749 SLAB_ATTR(failslab); 4750 #endif 4751 4752 static ssize_t shrink_show(struct kmem_cache *s, char *buf) 4753 { 4754 return 0; 4755 } 4756 4757 static ssize_t shrink_store(struct kmem_cache *s, 4758 const char *buf, size_t length) 4759 { 4760 if (buf[0] == '1') { 4761 int rc = kmem_cache_shrink(s); 4762 4763 if (rc) 4764 return rc; 4765 } else 4766 return -EINVAL; 4767 return length; 4768 } 4769 SLAB_ATTR(shrink); 4770 4771 #ifdef CONFIG_NUMA 4772 static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) 4773 { 4774 return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10); 4775 } 4776 4777 static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, 4778 const char *buf, size_t length) 4779 { 4780 unsigned long ratio; 4781 int err; 4782 4783 err = kstrtoul(buf, 10, &ratio); 4784 if (err) 4785 return err; 4786 4787 if (ratio <= 100) 4788 s->remote_node_defrag_ratio = ratio * 10; 4789 4790 return length; 4791 } 4792 SLAB_ATTR(remote_node_defrag_ratio); 4793 #endif 4794 4795 #ifdef CONFIG_SLUB_STATS 4796 static int show_stat(struct kmem_cache 
*s, char *buf, enum stat_item si) 4797 { 4798 unsigned long sum = 0; 4799 int cpu; 4800 int len; 4801 int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); 4802 4803 if (!data) 4804 return -ENOMEM; 4805 4806 for_each_online_cpu(cpu) { 4807 unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; 4808 4809 data[cpu] = x; 4810 sum += x; 4811 } 4812 4813 len = sprintf(buf, "%lu", sum); 4814 4815 #ifdef CONFIG_SMP 4816 for_each_online_cpu(cpu) { 4817 if (data[cpu] && len < PAGE_SIZE - 20) 4818 len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]); 4819 } 4820 #endif 4821 kfree(data); 4822 return len + sprintf(buf + len, "\n"); 4823 } 4824 4825 static void clear_stat(struct kmem_cache *s, enum stat_item si) 4826 { 4827 int cpu; 4828 4829 for_each_online_cpu(cpu) 4830 per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; 4831 } 4832 4833 #define STAT_ATTR(si, text) \ 4834 static ssize_t text##_show(struct kmem_cache *s, char *buf) \ 4835 { \ 4836 return show_stat(s, buf, si); \ 4837 } \ 4838 static ssize_t text##_store(struct kmem_cache *s, \ 4839 const char *buf, size_t length) \ 4840 { \ 4841 if (buf[0] != '0') \ 4842 return -EINVAL; \ 4843 clear_stat(s, si); \ 4844 return length; \ 4845 } \ 4846 SLAB_ATTR(text); \ 4847 4848 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); 4849 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); 4850 STAT_ATTR(FREE_FASTPATH, free_fastpath); 4851 STAT_ATTR(FREE_SLOWPATH, free_slowpath); 4852 STAT_ATTR(FREE_FROZEN, free_frozen); 4853 STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); 4854 STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); 4855 STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 4856 STAT_ATTR(ALLOC_SLAB, alloc_slab); 4857 STAT_ATTR(ALLOC_REFILL, alloc_refill); 4858 STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); 4859 STAT_ATTR(FREE_SLAB, free_slab); 4860 STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); 4861 STAT_ATTR(DEACTIVATE_FULL, deactivate_full); 4862 STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); 4863 STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); 4864 STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); 4865 STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); 4866 STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); 4867 STAT_ATTR(ORDER_FALLBACK, order_fallback); 4868 STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); 4869 STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); 4870 STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); 4871 STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); 4872 STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); 4873 STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); 4874 #endif 4875 4876 static struct attribute *slab_attrs[] = { 4877 &slab_size_attr.attr, 4878 &object_size_attr.attr, 4879 &objs_per_slab_attr.attr, 4880 &order_attr.attr, 4881 &min_partial_attr.attr, 4882 &cpu_partial_attr.attr, 4883 &objects_attr.attr, 4884 &objects_partial_attr.attr, 4885 &partial_attr.attr, 4886 &cpu_slabs_attr.attr, 4887 &ctor_attr.attr, 4888 &aliases_attr.attr, 4889 &align_attr.attr, 4890 &hwcache_align_attr.attr, 4891 &reclaim_account_attr.attr, 4892 &destroy_by_rcu_attr.attr, 4893 &shrink_attr.attr, 4894 &reserved_attr.attr, 4895 &slabs_cpu_partial_attr.attr, 4896 #ifdef CONFIG_SLUB_DEBUG 4897 &total_objects_attr.attr, 4898 &slabs_attr.attr, 4899 &sanity_checks_attr.attr, 4900 &trace_attr.attr, 4901 &red_zone_attr.attr, 4902 &poison_attr.attr, 4903 &store_user_attr.attr, 4904 &validate_attr.attr, 4905 &alloc_calls_attr.attr, 4906 &free_calls_attr.attr, 4907 #endif 4908 #ifdef CONFIG_ZONE_DMA 4909 &cache_dma_attr.attr, 4910 #endif 4911 #ifdef 
CONFIG_NUMA 4912 &remote_node_defrag_ratio_attr.attr, 4913 #endif 4914 #ifdef CONFIG_SLUB_STATS 4915 &alloc_fastpath_attr.attr, 4916 &alloc_slowpath_attr.attr, 4917 &free_fastpath_attr.attr, 4918 &free_slowpath_attr.attr, 4919 &free_frozen_attr.attr, 4920 &free_add_partial_attr.attr, 4921 &free_remove_partial_attr.attr, 4922 &alloc_from_partial_attr.attr, 4923 &alloc_slab_attr.attr, 4924 &alloc_refill_attr.attr, 4925 &alloc_node_mismatch_attr.attr, 4926 &free_slab_attr.attr, 4927 &cpuslab_flush_attr.attr, 4928 &deactivate_full_attr.attr, 4929 &deactivate_empty_attr.attr, 4930 &deactivate_to_head_attr.attr, 4931 &deactivate_to_tail_attr.attr, 4932 &deactivate_remote_frees_attr.attr, 4933 &deactivate_bypass_attr.attr, 4934 &order_fallback_attr.attr, 4935 &cmpxchg_double_fail_attr.attr, 4936 &cmpxchg_double_cpu_fail_attr.attr, 4937 &cpu_partial_alloc_attr.attr, 4938 &cpu_partial_free_attr.attr, 4939 &cpu_partial_node_attr.attr, 4940 &cpu_partial_drain_attr.attr, 4941 #endif 4942 #ifdef CONFIG_FAILSLAB 4943 &failslab_attr.attr, 4944 #endif 4945 4946 NULL 4947 }; 4948 4949 static struct attribute_group slab_attr_group = { 4950 .attrs = slab_attrs, 4951 }; 4952 4953 static ssize_t slab_attr_show(struct kobject *kobj, 4954 struct attribute *attr, 4955 char *buf) 4956 { 4957 struct slab_attribute *attribute; 4958 struct kmem_cache *s; 4959 int err; 4960 4961 attribute = to_slab_attr(attr); 4962 s = to_slab(kobj); 4963 4964 if (!attribute->show) 4965 return -EIO; 4966 4967 err = attribute->show(s, buf); 4968 4969 return err; 4970 } 4971 4972 static ssize_t slab_attr_store(struct kobject *kobj, 4973 struct attribute *attr, 4974 const char *buf, size_t len) 4975 { 4976 struct slab_attribute *attribute; 4977 struct kmem_cache *s; 4978 int err; 4979 4980 attribute = to_slab_attr(attr); 4981 s = to_slab(kobj); 4982 4983 if (!attribute->store) 4984 return -EIO; 4985 4986 err = attribute->store(s, buf, len); 4987 #ifdef CONFIG_MEMCG_KMEM 4988 if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { 4989 int i; 4990 4991 mutex_lock(&slab_mutex); 4992 if (s->max_attr_size < len) 4993 s->max_attr_size = len; 4994 4995 /* 4996 * This is a best effort propagation, so this function's return 4997 * value will be determined by the parent cache only. This is 4998 * basically because not all attributes have well 4999 * defined semantics for rollbacks - most of the actions have 5000 * permanent effects. 5001 * 5002 * Returning the error value of any of the children that fail 5003 * is not well defined, in the sense that users seeing the 5004 * error code won't be able to know anything about the state of 5005 * the cache. 5006 * 5007 * Only returning the error code for the parent cache at least 5008 * has well defined semantics. The write to the parent cache 5009 * either failed or succeeded; on success we then loop 5010 * through the descendants with best-effort propagation. 5011 */ 5012 for_each_memcg_cache_index(i) { 5013 struct kmem_cache *c = cache_from_memcg_idx(s, i); 5014 if (c) 5015 attribute->store(c, buf, len); 5016 } 5017 mutex_unlock(&slab_mutex); 5018 } 5019 #endif 5020 return err; 5021 } 5022 5023 static void memcg_propagate_slab_attrs(struct kmem_cache *s) 5024 { 5025 #ifdef CONFIG_MEMCG_KMEM 5026 int i; 5027 char *buffer = NULL; 5028 5029 if (!is_root_cache(s)) 5030 return; 5031 5032 /* 5033 * This means this cache had no attribute written.
Therefore, no point 5034 * in copying default values around 5035 */ 5036 if (!s->max_attr_size) 5037 return; 5038 5039 for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { 5040 char mbuf[64]; 5041 char *buf; 5042 struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); 5043 5044 if (!attr || !attr->store || !attr->show) 5045 continue; 5046 5047 /* 5048 * It is really bad that we have to allocate here, so we will 5049 * do it only as a fallback. If we actually allocate, though, 5050 * we can just use the allocated buffer until the end. 5051 * 5052 * Most of the slub attributes will tend to be very small in 5053 * size, but sysfs allows buffers up to a page, so they can 5054 * theoretically happen. 5055 */ 5056 if (buffer) 5057 buf = buffer; 5058 else if (s->max_attr_size < ARRAY_SIZE(mbuf)) 5059 buf = mbuf; 5060 else { 5061 buffer = (char *) get_zeroed_page(GFP_KERNEL); 5062 if (WARN_ON(!buffer)) 5063 continue; 5064 buf = buffer; 5065 } 5066 5067 attr->show(s->memcg_params->root_cache, buf); 5068 attr->store(s, buf, strlen(buf)); 5069 } 5070 5071 if (buffer) 5072 free_page((unsigned long)buffer); 5073 #endif 5074 } 5075 5076 static const struct sysfs_ops slab_sysfs_ops = { 5077 .show = slab_attr_show, 5078 .store = slab_attr_store, 5079 }; 5080 5081 static struct kobj_type slab_ktype = { 5082 .sysfs_ops = &slab_sysfs_ops, 5083 }; 5084 5085 static int uevent_filter(struct kset *kset, struct kobject *kobj) 5086 { 5087 struct kobj_type *ktype = get_ktype(kobj); 5088 5089 if (ktype == &slab_ktype) 5090 return 1; 5091 return 0; 5092 } 5093 5094 static const struct kset_uevent_ops slab_uevent_ops = { 5095 .filter = uevent_filter, 5096 }; 5097 5098 static struct kset *slab_kset; 5099 5100 #define ID_STR_LENGTH 64 5101 5102 /* Create a unique string id for a slab cache: 5103 * 5104 * Format :[flags-]size 5105 */ 5106 static char *create_unique_id(struct kmem_cache *s) 5107 { 5108 char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); 5109 char *p = name; 5110 5111 BUG_ON(!name); 5112 5113 *p++ = ':'; 5114 /* 5115 * First flags affecting slabcache operations. We will only 5116 * get here for aliasable slabs so we do not need to support 5117 * too many flags. The flags here must cover all flags that 5118 * are matched during merging to guarantee that the id is 5119 * unique. 5120 */ 5121 if (s->flags & SLAB_CACHE_DMA) 5122 *p++ = 'd'; 5123 if (s->flags & SLAB_RECLAIM_ACCOUNT) 5124 *p++ = 'a'; 5125 if (s->flags & SLAB_DEBUG_FREE) 5126 *p++ = 'F'; 5127 if (!(s->flags & SLAB_NOTRACK)) 5128 *p++ = 't'; 5129 if (p != name + 1) 5130 *p++ = '-'; 5131 p += sprintf(p, "%07d", s->size); 5132 5133 #ifdef CONFIG_MEMCG_KMEM 5134 if (!is_root_cache(s)) 5135 p += sprintf(p, "-%08d", 5136 memcg_cache_id(s->memcg_params->memcg)); 5137 #endif 5138 5139 BUG_ON(p > name + ID_STR_LENGTH - 1); 5140 return name; 5141 } 5142 5143 static int sysfs_slab_add(struct kmem_cache *s) 5144 { 5145 int err; 5146 const char *name; 5147 int unmergeable = slab_unmergeable(s); 5148 5149 if (unmergeable) { 5150 /* 5151 * Slabcache can never be merged so we can use the name proper. 5152 * This is typically the case for debug situations. In that 5153 * case we can catch duplicate names easily. 5154 */ 5155 sysfs_remove_link(&slab_kset->kobj, s->name); 5156 name = s->name; 5157 } else { 5158 /* 5159 * Create a unique name for the slab as a target 5160 * for the symlinks. 
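* The name produced by create_unique_id() above has the form ":[flags-]size" (for instance, a plain 64-byte cache would typically get ":t-0000064"), which gives each mergeable cache a distinct sysfs directory; the human-readable cache names are then added as alias symlinks pointing at it.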
5161 */ 5162 name = create_unique_id(s); 5163 } 5164 5165 s->kobj.kset = slab_kset; 5166 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); 5167 if (err) { 5168 kobject_put(&s->kobj); 5169 return err; 5170 } 5171 5172 err = sysfs_create_group(&s->kobj, &slab_attr_group); 5173 if (err) { 5174 kobject_del(&s->kobj); 5175 kobject_put(&s->kobj); 5176 return err; 5177 } 5178 kobject_uevent(&s->kobj, KOBJ_ADD); 5179 if (!unmergeable) { 5180 /* Setup first alias */ 5181 sysfs_slab_alias(s, s->name); 5182 kfree(name); 5183 } 5184 return 0; 5185 } 5186 5187 static void sysfs_slab_remove(struct kmem_cache *s) 5188 { 5189 if (slab_state < FULL) 5190 /* 5191 * Sysfs has not been setup yet so no need to remove the 5192 * cache from sysfs. 5193 */ 5194 return; 5195 5196 kobject_uevent(&s->kobj, KOBJ_REMOVE); 5197 kobject_del(&s->kobj); 5198 kobject_put(&s->kobj); 5199 } 5200 5201 /* 5202 * Need to buffer aliases during bootup until sysfs becomes 5203 * available lest we lose that information. 5204 */ 5205 struct saved_alias { 5206 struct kmem_cache *s; 5207 const char *name; 5208 struct saved_alias *next; 5209 }; 5210 5211 static struct saved_alias *alias_list; 5212 5213 static int sysfs_slab_alias(struct kmem_cache *s, const char *name) 5214 { 5215 struct saved_alias *al; 5216 5217 if (slab_state == FULL) { 5218 /* 5219 * If we have a leftover link then remove it. 5220 */ 5221 sysfs_remove_link(&slab_kset->kobj, name); 5222 return sysfs_create_link(&slab_kset->kobj, &s->kobj, name); 5223 } 5224 5225 al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL); 5226 if (!al) 5227 return -ENOMEM; 5228 5229 al->s = s; 5230 al->name = name; 5231 al->next = alias_list; 5232 alias_list = al; 5233 return 0; 5234 } 5235 5236 static int __init slab_sysfs_init(void) 5237 { 5238 struct kmem_cache *s; 5239 int err; 5240 5241 mutex_lock(&slab_mutex); 5242 5243 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); 5244 if (!slab_kset) { 5245 mutex_unlock(&slab_mutex); 5246 printk(KERN_ERR "Cannot register slab subsystem.\n"); 5247 return -ENOSYS; 5248 } 5249 5250 slab_state = FULL; 5251 5252 list_for_each_entry(s, &slab_caches, list) { 5253 err = sysfs_slab_add(s); 5254 if (err) 5255 printk(KERN_ERR "SLUB: Unable to add boot slab %s" 5256 " to sysfs\n", s->name); 5257 } 5258 5259 while (alias_list) { 5260 struct saved_alias *al = alias_list; 5261 5262 alias_list = alias_list->next; 5263 err = sysfs_slab_alias(al->s, al->name); 5264 if (err) 5265 printk(KERN_ERR "SLUB: Unable to add boot slab alias" 5266 " %s to sysfs\n", al->name); 5267 kfree(al); 5268 } 5269 5270 mutex_unlock(&slab_mutex); 5271 resiliency_test(); 5272 return 0; 5273 } 5274 5275 __initcall(slab_sysfs_init); 5276 #endif /* CONFIG_SYSFS */ 5277 5278 /* 5279 * The /proc/slabinfo ABI 5280 */ 5281 #ifdef CONFIG_SLABINFO 5282 void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) 5283 { 5284 unsigned long nr_slabs = 0; 5285 unsigned long nr_objs = 0; 5286 unsigned long nr_free = 0; 5287 int node; 5288 5289 for_each_online_node(node) { 5290 struct kmem_cache_node *n = get_node(s, node); 5291 5292 if (!n) 5293 continue; 5294 5295 nr_slabs += node_nr_slabs(n); 5296 nr_objs += node_nr_objs(n); 5297 nr_free += count_partial(n, count_free); 5298 } 5299 5300 sinfo->active_objs = nr_objs - nr_free; 5301 sinfo->num_objs = nr_objs; 5302 sinfo->active_slabs = nr_slabs; 5303 sinfo->num_slabs = nr_slabs; 5304 sinfo->objects_per_slab = oo_objects(s->oo); 5305 sinfo->cache_order = oo_order(s->oo); 5306 } 5307 5308 void 
slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) 5309 { 5310 } 5311 5312 ssize_t slabinfo_write(struct file *file, const char __user *buffer, 5313 size_t count, loff_t *ppos) 5314 { 5315 return -EIO; 5316 } 5317 #endif /* CONFIG_SLABINFO */ 5318
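/*
 * Usage sketch (illustrative only, not part of the build): the attributes
 * registered above appear under /sys/kernel/slab/<cache>/ since slab_kset
 * is added below kernel_kobj. Assuming a cache named "kmalloc-64" exists,
 * typical interaction from userspace would be
 *
 *	cat /sys/kernel/slab/kmalloc-64/order
 *	echo 1 > /sys/kernel/slab/kmalloc-64/validate
 *	cat /sys/kernel/slab/kmalloc-64/alloc_calls
 *
 * Reads are dispatched through slab_attr_show() and writes through
 * slab_attr_store(); alloc_calls/free_calls additionally require
 * SLAB_STORE_USER (e.g. enabled via the slub_debug boot option).
 */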