/*
 * Slab allocator functions that are independent of the allocator strategy
 *
 * (C) 2012 Christoph Lameter <cl@linux.com>
 */
#include <linux/slab.h>

#include <linux/mm.h>
#include <linux/poison.h>
#include <linux/interrupt.h>
#include <linux/memory.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/uaccess.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
#include <linux/memcontrol.h>
#include <trace/events/kmem.h>

#include "slab.h"

enum slab_state slab_state;
LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;

#ifdef CONFIG_DEBUG_VM
static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
				   size_t size)
{
	struct kmem_cache *s = NULL;

	if (!name || in_interrupt() || size < sizeof(void *) ||
	    size > KMALLOC_MAX_SIZE) {
		pr_err("kmem_cache_create(%s) integrity check failed\n", name);
		return -EINVAL;
	}

	list_for_each_entry(s, &slab_caches, list) {
		char tmp;
		int res;

		/*
		 * This happens when the module gets unloaded and doesn't
		 * destroy its slab cache and no-one else reuses the vmalloc
		 * area of the module.  Print a warning.
		 */
		res = probe_kernel_address(s->name, tmp);
		if (res) {
			pr_err("Slab cache with size %d has lost its name\n",
			       s->object_size);
			continue;
		}

#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
		/*
		 * For simplicity, we won't check this in the list of memcg
		 * caches. We have control over memcg naming, and if there
		 * aren't duplicates in the global list, there won't be any
		 * duplicates in the memcg lists as well.
		 */
		if (!memcg && !strcmp(s->name, name)) {
			pr_err("%s (%s): Cache name already exists.\n",
			       __func__, name);
			dump_stack();
			s = NULL;
			return -EINVAL;
		}
#endif
	}

	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
	return 0;
}
#else
static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg,
					  const char *name, size_t size)
{
	return 0;
}
#endif

#ifdef CONFIG_MEMCG_KMEM
int memcg_update_all_caches(int num_memcgs)
{
	struct kmem_cache *s;
	int ret = 0;
	mutex_lock(&slab_mutex);

	list_for_each_entry(s, &slab_caches, list) {
		if (!is_root_cache(s))
			continue;

		ret = memcg_update_cache_size(s, num_memcgs);
		/*
		 * See comment in memcontrol.c, memcg_update_cache_size:
		 * Instead of freeing the memory, we'll just leave the caches
		 * up to this point in an updated state.
		 */
		if (ret)
			goto out;
	}

	memcg_update_array_size(num_memcgs);
out:
	mutex_unlock(&slab_mutex);
	return ret;
}
#endif

/*
 * Figure out what the alignment of the objects will be given a set of
 * flags, a user specified alignment and the size of the objects.
 */
unsigned long calculate_alignment(unsigned long flags,
		unsigned long align, unsigned long size)
{
	/*
	 * If the user wants hardware cache aligned objects then follow that
	 * suggestion if the object is sufficiently large.
	 *
	 * The hardware cache alignment cannot override the specified
	 * alignment though. If that is greater then use it.
	 */
	if (flags & SLAB_HWCACHE_ALIGN) {
		unsigned long ralign = cache_line_size();
		while (size <= ralign / 2)
			ralign /= 2;
		align = max(align, ralign);
	}

	if (align < ARCH_SLAB_MINALIGN)
		align = ARCH_SLAB_MINALIGN;

	return ALIGN(align, sizeof(void *));
}
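
/*
 * Worked example (illustrative only, not from this file; the cache line size
 * and requested alignment are assumed): with cache_line_size() == 64,
 * SLAB_HWCACHE_ALIGN set, size == 24 and a requested align of 8, ralign
 * starts at 64 and is halved while size <= ralign / 2: 64 -> 32 (24 <= 32),
 * then the loop stops because 24 > 16.  align becomes max(8, 32) == 32, so
 * two 24 byte objects share a cache line instead of each one padding out a
 * full line.
 */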

/*
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within an interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */

struct kmem_cache *
kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
			size_t align, unsigned long flags, void (*ctor)(void *),
			struct kmem_cache *parent_cache)
{
	struct kmem_cache *s = NULL;
	int err;

	get_online_cpus();
	mutex_lock(&slab_mutex);

	err = kmem_cache_sanity_check(memcg, name, size);
	if (err)
		goto out_unlock;

	if (memcg) {
		/*
		 * Since per-memcg caches are created asynchronously on first
		 * allocation (see memcg_kmem_get_cache()), several threads can
		 * try to create the same cache, but only one of them may
		 * succeed. Therefore if we get here and see the cache has
		 * already been created, we silently return NULL.
		 */
		if (cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg)))
			goto out_unlock;
	}

	/*
	 * Some allocators will constrain the set of valid flags to a subset
	 * of all flags. We expect them to define CACHE_CREATE_MASK in this
	 * case, and we'll just provide them with a sanitized version of the
	 * passed flags.
	 */
	flags &= CACHE_CREATE_MASK;

	s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
	if (s)
		goto out_unlock;

	err = -ENOMEM;
	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
	if (!s)
		goto out_unlock;

	s->object_size = s->size = size;
	s->align = calculate_alignment(flags, align, size);
	s->ctor = ctor;

	s->name = kstrdup(name, GFP_KERNEL);
	if (!s->name)
		goto out_free_cache;

	err = memcg_alloc_cache_params(memcg, s, parent_cache);
	if (err)
		goto out_free_cache;

	err = __kmem_cache_create(s, flags);
	if (err)
		goto out_free_cache;

	s->refcount = 1;
	list_add(&s->list, &slab_caches);
	memcg_register_cache(s);

out_unlock:
	mutex_unlock(&slab_mutex);
	put_online_cpus();

	if (err) {
		/*
		 * There is no point in flooding logs with warnings or
		 * especially crashing the system if we fail to create a cache
		 * for a memcg. In this case we will be accounting the memcg
		 * allocation to the root cgroup until we succeed to create its
		 * own cache, but it isn't that critical.
		 */
		if (!memcg) {
			if (flags & SLAB_PANIC)
				panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
					name, err);
			else {
				printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d\n",
					name, err);
				dump_stack();
			}
		}
		return NULL;
	}
	return s;

out_free_cache:
	memcg_free_cache_params(s);
	kfree(s->name);
	kmem_cache_free(kmem_cache, s);
	goto out_unlock;
}

struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
		  unsigned long flags, void (*ctor)(void *))
{
	return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
}
EXPORT_SYMBOL(kmem_cache_create);
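
/*
 * Typical usage (illustrative sketch only, not part of this file; "struct
 * foo", the cache name and the chosen flags are made up).  A module creates
 * the cache once, allocates and frees objects with kmem_cache_alloc() and
 * kmem_cache_free(), and destroys the cache only after every object has been
 * returned:
 *
 *	static struct kmem_cache *foo_cachep;
 *
 *	static int __init foo_init(void)
 *	{
 *		foo_cachep = kmem_cache_create("foo", sizeof(struct foo),
 *					       0, SLAB_HWCACHE_ALIGN, NULL);
 *		return foo_cachep ? 0 : -ENOMEM;
 *	}
 *
 *	static void __exit foo_exit(void)
 *	{
 *		kmem_cache_destroy(foo_cachep);
 *	}
 *
 * and, at runtime:
 *
 *	struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cachep, f);
 */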

void kmem_cache_destroy(struct kmem_cache *s)
{
	/* Destroy all the children caches if we aren't a memcg cache */
	kmem_cache_destroy_memcg_children(s);

	get_online_cpus();
	mutex_lock(&slab_mutex);
	s->refcount--;
	if (!s->refcount) {
		list_del(&s->list);

		if (!__kmem_cache_shutdown(s)) {
			memcg_unregister_cache(s);
			mutex_unlock(&slab_mutex);
			if (s->flags & SLAB_DESTROY_BY_RCU)
				rcu_barrier();

			memcg_free_cache_params(s);
			kfree(s->name);
			kmem_cache_free(kmem_cache, s);
		} else {
			list_add(&s->list, &slab_caches);
			mutex_unlock(&slab_mutex);
			printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n",
			       s->name);
			dump_stack();
		}
	} else {
		mutex_unlock(&slab_mutex);
	}
	put_online_cpus();
}
EXPORT_SYMBOL(kmem_cache_destroy);

int slab_is_available(void)
{
	return slab_state >= UP;
}

#ifndef CONFIG_SLOB
/* Create a cache during boot when no slab services are available yet */
void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
		unsigned long flags)
{
	int err;

	s->name = name;
	s->size = s->object_size = size;
	s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
	err = __kmem_cache_create(s, flags);

	if (err)
		panic("Creation of kmalloc slab %s size=%zu failed. Reason %d\n",
					name, size, err);

	s->refcount = -1;	/* Exempt from merging for now */
}

struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
				unsigned long flags)
{
	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);

	if (!s)
		panic("Out of memory when creating slab %s\n", name);

	create_boot_cache(s, name, size, flags);
	list_add(&s->list, &slab_caches);
	s->refcount = 1;
	return s;
}

struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
EXPORT_SYMBOL(kmalloc_caches);

#ifdef CONFIG_ZONE_DMA
struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
EXPORT_SYMBOL(kmalloc_dma_caches);
#endif

/*
 * Conversion table for small slab sizes / 8 to the index in the
 * kmalloc array. This is necessary for slabs < 192 since we have non power
 * of two cache sizes there. The size of larger slabs can be determined using
 * fls.
 */
static s8 size_index[24] = {
	3,	/* 8 */
	4,	/* 16 */
	5,	/* 24 */
	5,	/* 32 */
	6,	/* 40 */
	6,	/* 48 */
	6,	/* 56 */
	6,	/* 64 */
	1,	/* 72 */
	1,	/* 80 */
	1,	/* 88 */
	1,	/* 96 */
	7,	/* 104 */
	7,	/* 112 */
	7,	/* 120 */
	7,	/* 128 */
	2,	/* 136 */
	2,	/* 144 */
	2,	/* 152 */
	2,	/* 160 */
	2,	/* 168 */
	2,	/* 176 */
	2,	/* 184 */
	2	/* 192 */
};

static inline int size_index_elem(size_t bytes)
{
	return (bytes - 1) / 8;
}
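
/*
 * Example lookups (illustrative, assuming the table above has not been
 * patched at boot for a larger KMALLOC_MIN_SIZE): a 40 byte request has
 * size_index_elem(40) == (40 - 1) / 8 == 4 and size_index[4] == 6, so it is
 * served from the 64 byte (2^6) cache; a 90 byte request has elem 11 and
 * size_index[11] == 1, i.e. it comes from the 96 byte cache.
 */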

/*
 * Find the kmem_cache structure that serves a given size of
 * allocation
 */
struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
{
	int index;

	if (unlikely(size > KMALLOC_MAX_SIZE)) {
		WARN_ON_ONCE(!(flags & __GFP_NOWARN));
		return NULL;
	}

	if (size <= 192) {
		if (!size)
			return ZERO_SIZE_PTR;

		index = size_index[size_index_elem(size)];
	} else
		index = fls(size - 1);

#ifdef CONFIG_ZONE_DMA
	if (unlikely((flags & GFP_DMA)))
		return kmalloc_dma_caches[index];

#endif
	return kmalloc_caches[index];
}

/*
 * Create the kmalloc array. Some of the regular kmalloc arrays
 * may already have been created because they were needed to
 * enable allocations for slab creation.
 */
void __init create_kmalloc_caches(unsigned long flags)
{
	int i;

	/*
	 * Patch up the size_index table if we have strange large alignment
	 * requirements for the kmalloc array. This is only the case for
	 * MIPS it seems. The standard arches will not generate any code here.
	 *
	 * Largest permitted alignment is 256 bytes due to the way we
	 * handle the index determination for the smaller caches.
	 *
	 * Make sure that nothing crazy happens if someone starts tinkering
	 * around with ARCH_KMALLOC_MINALIGN.
	 */
	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
		(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));

	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
		int elem = size_index_elem(i);

		if (elem >= ARRAY_SIZE(size_index))
			break;
		size_index[elem] = KMALLOC_SHIFT_LOW;
	}

	if (KMALLOC_MIN_SIZE >= 64) {
		/*
		 * The 96 byte sized cache is not used if the alignment
		 * is 64 bytes.
		 */
		for (i = 64 + 8; i <= 96; i += 8)
			size_index[size_index_elem(i)] = 7;

	}

	if (KMALLOC_MIN_SIZE >= 128) {
		/*
		 * The 192 byte sized cache is not used if the alignment
		 * is 128 bytes. Redirect kmalloc to use the 256 byte cache
		 * instead.
		 */
		for (i = 128 + 8; i <= 192; i += 8)
			size_index[size_index_elem(i)] = 8;
	}
	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
		if (!kmalloc_caches[i]) {
			kmalloc_caches[i] = create_kmalloc_cache(NULL,
							1 << i, flags);
		}

		/*
		 * Caches that are not of a power-of-two size have to be
		 * created immediately after the earlier power-of-two caches.
		 */
		if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
			kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags);

		if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
			kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags);
	}

	/* Kmalloc array is now usable */
	slab_state = UP;

	for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
		struct kmem_cache *s = kmalloc_caches[i];
		char *n;

		if (s) {
			n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i));

			BUG_ON(!n);
			s->name = n;
		}
	}

#ifdef CONFIG_ZONE_DMA
	for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
		struct kmem_cache *s = kmalloc_caches[i];

		if (s) {
			int size = kmalloc_size(i);
			char *n = kasprintf(GFP_NOWAIT,
					    "dma-kmalloc-%d", size);

			BUG_ON(!n);
			kmalloc_dma_caches[i] = create_kmalloc_cache(n,
				size, SLAB_CACHE_DMA | flags);
		}
	}
#endif
}
#endif /* !CONFIG_SLOB */
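
/*
 * Illustrative result (configuration dependent; shown for the common case
 * KMALLOC_MIN_SIZE == 8): after create_kmalloc_caches(), kmalloc_caches[3]
 * through kmalloc_caches[KMALLOC_SHIFT_HIGH] hold the power-of-two caches
 * kmalloc-8, kmalloc-16, ..., while the two odd sizes live at
 * kmalloc_caches[1] (kmalloc-96) and kmalloc_caches[2] (kmalloc-192); index 0
 * stays unused.  kmalloc_slab() then maps e.g. a 100 byte request to index 7
 * (kmalloc-128) via size_index[], and a 300 byte request to index
 * fls(299) == 9 (kmalloc-512).
 */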

#ifdef CONFIG_TRACING
void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
{
	void *ret = kmalloc_order(size, flags, order);
	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
	return ret;
}
EXPORT_SYMBOL(kmalloc_order_trace);
#endif

#ifdef CONFIG_SLABINFO

#ifdef CONFIG_SLAB
#define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR)
#else
#define SLABINFO_RIGHTS S_IRUSR
#endif

void print_slabinfo_header(struct seq_file *m)
{
	/*
	 * Output format version, so at least we can change it
	 * without _too_ many complaints.
	 */
#ifdef CONFIG_DEBUG_SLAB
	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
#else
	seq_puts(m, "slabinfo - version: 2.1\n");
#endif
	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
		 "<objperslab> <pagesperslab>");
	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
#ifdef CONFIG_DEBUG_SLAB
	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
		 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
#endif
	seq_putc(m, '\n');
}

static void *s_start(struct seq_file *m, loff_t *pos)
{
	loff_t n = *pos;

	mutex_lock(&slab_mutex);
	if (!n)
		print_slabinfo_header(m);

	return seq_list_start(&slab_caches, *pos);
}

void *slab_next(struct seq_file *m, void *p, loff_t *pos)
{
	return seq_list_next(p, &slab_caches, pos);
}

void slab_stop(struct seq_file *m, void *p)
{
	mutex_unlock(&slab_mutex);
}

static void
memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
{
	struct kmem_cache *c;
	struct slabinfo sinfo;
	int i;

	if (!is_root_cache(s))
		return;

	for_each_memcg_cache_index(i) {
		c = cache_from_memcg_idx(s, i);
		if (!c)
			continue;

		memset(&sinfo, 0, sizeof(sinfo));
		get_slabinfo(c, &sinfo);

		info->active_slabs += sinfo.active_slabs;
		info->num_slabs += sinfo.num_slabs;
		info->shared_avail += sinfo.shared_avail;
		info->active_objs += sinfo.active_objs;
		info->num_objs += sinfo.num_objs;
	}
}

int cache_show(struct kmem_cache *s, struct seq_file *m)
{
	struct slabinfo sinfo;

	memset(&sinfo, 0, sizeof(sinfo));
	get_slabinfo(s, &sinfo);

	memcg_accumulate_slabinfo(s, &sinfo);

	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
		   cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
		   sinfo.objects_per_slab, (1 << sinfo.cache_order));

	seq_printf(m, " : tunables %4u %4u %4u",
		   sinfo.limit, sinfo.batchcount, sinfo.shared);
	seq_printf(m, " : slabdata %6lu %6lu %6lu",
		   sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
	slabinfo_show_stats(m, s);
	seq_putc(m, '\n');
	return 0;
}
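
/*
 * Example /proc/slabinfo line (made-up numbers, shown only to illustrate how
 * the header printed above lines up with the seq_printf() calls in
 * cache_show(); 1184 objects at 32 per slab gives the 37 slabs shown):
 *
 *	kmalloc-128         1184   1184    128   32    1 : tunables    0    0    0 : slabdata     37     37      0
 */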

static int s_show(struct seq_file *m, void *p)
{
	struct kmem_cache *s = list_entry(p, struct kmem_cache, list);

	if (!is_root_cache(s))
		return 0;
	return cache_show(s, m);
}

/*
 * slabinfo_op - iterator that generates /proc/slabinfo
 *
 * Output layout:
 * cache-name
 * num-active-objs
 * total-objs
 * object size
 * num-active-slabs
 * total-slabs
 * num-pages-per-slab
 * + further values on SMP and with statistics enabled
 */
static const struct seq_operations slabinfo_op = {
	.start = s_start,
	.next = slab_next,
	.stop = slab_stop,
	.show = s_show,
};

static int slabinfo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &slabinfo_op);
}

static const struct file_operations proc_slabinfo_operations = {
	.open = slabinfo_open,
	.read = seq_read,
	.write = slabinfo_write,
	.llseek = seq_lseek,
	.release = seq_release,
};

static int __init slab_proc_init(void)
{
	proc_create("slabinfo", SLABINFO_RIGHTS, NULL,
		    &proc_slabinfo_operations);
	return 0;
}
module_init(slab_proc_init);
#endif /* CONFIG_SLABINFO */