// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic stack depot for storing stack traces.
 *
 * Some debugging tools need to save stack traces of certain events which can
 * later be presented to the user. For example, KASAN needs to save alloc and
 * free stacks for each object, but storing two stack traces per object
 * requires too much memory (e.g. SLUB_DEBUG needs 256 bytes per object for
 * that).
 *
 * Instead, stack depot maintains a hashtable of unique stacktraces. Since alloc
 * and free stacks repeat a lot, we save about 100x space.
 * Stacks are never removed from the depot, so we store them contiguously, one
 * after another, in contiguous memory allocations.
 *
 * Author: Alexander Potapenko <glider@google.com>
 * Copyright (C) 2016 Google, Inc.
 *
 * Based on code by Dmitry Chernenkov.
 */

#define pr_fmt(fmt) "stackdepot: " fmt

#include <linux/gfp.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/stacktrace.h>
#include <linux/stackdepot.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/memblock.h>
#include <linux/kasan-enabled.h>

#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8)

#define DEPOT_VALID_BITS 1
#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */
#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER))
#define DEPOT_STACK_ALIGN 4
#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN)
#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_VALID_BITS - \
			       DEPOT_OFFSET_BITS - STACK_DEPOT_EXTRA_BITS)
#define DEPOT_POOLS_CAP 8192
#define DEPOT_MAX_POOLS \
	(((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \
	 (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP)

/* The compact structure to store the reference to stacks. */
union handle_parts {
	depot_stack_handle_t handle;
	struct {
		u32 pool_index	: DEPOT_POOL_INDEX_BITS;
		u32 offset	: DEPOT_OFFSET_BITS;
		u32 valid	: DEPOT_VALID_BITS;
		u32 extra	: STACK_DEPOT_EXTRA_BITS;
	};
};
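
/*
 * Worked example (illustrative, not a definition): with 4 KB pages
 * (PAGE_SHIFT == 12), a 32-bit depot_stack_handle_t, and assuming
 * STACK_DEPOT_EXTRA_BITS is 5 (its value comes from <linux/stackdepot.h>),
 * the handle splits up as:
 *
 *	DEPOT_OFFSET_BITS     = 2 + 12 - 4 = 10 (offset in 16-byte units,
 *						 enough to address a 16 KB pool)
 *	DEPOT_POOL_INDEX_BITS = 32 - 1 - 10 - 5 = 16
 *	DEPOT_MAX_POOLS       = min(2^16, 8192) = 8192
 *
 * which caps the depot at 8192 pools * 16 KB = 128 MB of stored stacks.
 */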

struct stack_record {
	struct stack_record *next;	/* Link in the hashtable */
	u32 hash;			/* Hash in the hashtable */
	u32 size;			/* Number of frames in the stack */
	union handle_parts handle;
	unsigned long entries[];	/* Variable-sized array of entries. */
};

static bool stack_depot_disabled;
static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
static bool __stack_depot_early_init_passed __initdata;

/* Use one hash table bucket per 16 KB of memory. */
#define STACK_HASH_TABLE_SCALE 14
/* Limit the number of buckets to between 4K and 1M. */
#define STACK_BUCKET_NUMBER_ORDER_MIN 12
#define STACK_BUCKET_NUMBER_ORDER_MAX 20
/* Initial seed for jhash2. */
#define STACK_HASH_SEED 0x9747b28c
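
/*
 * Worked example (illustrative): with 4 KB pages and a scale of 14, a system
 * with 4 GiB of low memory (roughly 2^20 pages) makes stack_depot_init()
 * compute 2^20 >> (14 - 12) = 2^18 buckets, i.e. one bucket per 16 KB of
 * memory, which falls inside the 4K..1M clamp above.
 */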

/* Hash table of pointers to stored stack traces. */
static struct stack_record **stack_table;
/* Fixed order of the number of table buckets. Used when KASAN is enabled. */
static unsigned int stack_bucket_number_order;
/* Hash mask for indexing the table. */
static unsigned int stack_hash_mask;

/* Array of memory regions that store stack traces. */
static void *stack_pools[DEPOT_MAX_POOLS];
/* Currently used pool in stack_pools. */
static int pool_index;
/* Offset to the unused space in the currently used pool. */
static size_t pool_offset;
/* Lock that protects the variables above. */
static DEFINE_RAW_SPINLOCK(pool_lock);
/*
 * Stack depot tries to keep an extra pool allocated even before it runs out
 * of space in the currently used pool.
 * This flag marks whether the next extra pool still needs to be allocated and
 * initialized. It has the value 0 when either the next pool is already
 * initialized or the limit on the number of pools is reached.
 */
static int next_pool_required = 1;

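/*
 * Handler for the "stack_depot_disable" boot parameter, registered via
 * early_param() below.
 */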
static int __init disable_stack_depot(char *str)
{
	int ret;

	ret = kstrtobool(str, &stack_depot_disabled);
	if (!ret && stack_depot_disabled) {
		pr_info("disabled\n");
		stack_table = NULL;
	}
	return 0;
}
early_param("stack_depot_disable", disable_stack_depot);

void __init stack_depot_request_early_init(void)
{
	/* Too late to request early init now. */
	WARN_ON(__stack_depot_early_init_passed);

	__stack_depot_early_init_requested = true;
}

/* Allocates a hash table via memblock. Can only be used during early boot. */
int __init stack_depot_early_init(void)
{
	unsigned long entries = 0;

	/* This function must be called only once, from mm_init(). */
	if (WARN_ON(__stack_depot_early_init_passed))
		return 0;
	__stack_depot_early_init_passed = true;

	/*
	 * If KASAN is enabled, use the maximum order: KASAN is frequently used
	 * in fuzzing scenarios, which leads to a large number of different
	 * stack traces being stored in stack depot.
	 */
	if (kasan_enabled() && !stack_bucket_number_order)
		stack_bucket_number_order = STACK_BUCKET_NUMBER_ORDER_MAX;

	if (!__stack_depot_early_init_requested || stack_depot_disabled)
		return 0;

	/*
	 * If stack_bucket_number_order is not set, leave entries as 0 to rely
	 * on the automatic calculations performed by alloc_large_system_hash.
	 */
	if (stack_bucket_number_order)
		entries = 1UL << stack_bucket_number_order;
	pr_info("allocating hash table via alloc_large_system_hash\n");
	stack_table = alloc_large_system_hash("stackdepot",
						sizeof(struct stack_record *),
						entries,
						STACK_HASH_TABLE_SCALE,
						HASH_EARLY | HASH_ZERO,
						NULL,
						&stack_hash_mask,
						1UL << STACK_BUCKET_NUMBER_ORDER_MIN,
						1UL << STACK_BUCKET_NUMBER_ORDER_MAX);
	if (!stack_table) {
		pr_err("hash table allocation failed, disabling\n");
		stack_depot_disabled = true;
		return -ENOMEM;
	}

	return 0;
}

/* Allocates a hash table via kvcalloc. Can be used after boot. */
int stack_depot_init(void)
{
	static DEFINE_MUTEX(stack_depot_init_mutex);
	unsigned long entries;
	int ret = 0;

	mutex_lock(&stack_depot_init_mutex);

	if (stack_depot_disabled || stack_table)
		goto out_unlock;

	/*
	 * Similarly to stack_depot_early_init, use stack_bucket_number_order
	 * if assigned, and rely on automatic scaling otherwise.
	 */
	if (stack_bucket_number_order) {
		entries = 1UL << stack_bucket_number_order;
	} else {
		int scale = STACK_HASH_TABLE_SCALE;

		entries = nr_free_buffer_pages();
		entries = roundup_pow_of_two(entries);

		if (scale > PAGE_SHIFT)
			entries >>= (scale - PAGE_SHIFT);
		else
			entries <<= (PAGE_SHIFT - scale);
	}

	if (entries < 1UL << STACK_BUCKET_NUMBER_ORDER_MIN)
		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MIN;
	if (entries > 1UL << STACK_BUCKET_NUMBER_ORDER_MAX)
		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MAX;

	pr_info("allocating hash table of %lu entries via kvcalloc\n", entries);
	stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL);
	if (!stack_table) {
		pr_err("hash table allocation failed, disabling\n");
		stack_depot_disabled = true;
		ret = -ENOMEM;
		goto out_unlock;
	}
	stack_hash_mask = entries - 1;

out_unlock:
	mutex_unlock(&stack_depot_init_mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(stack_depot_init);
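
/*
 * Illustrative usage (not part of this file): a caller that cannot rely on
 * CONFIG_STACKDEPOT_ALWAYS_INIT typically initializes the depot lazily before
 * its first save:
 *
 *	if (stack_depot_init())
 *		return -ENOMEM;
 *	handle = stack_depot_save(entries, nr_entries, GFP_KERNEL);
 *
 * where entries/nr_entries come from stack_trace_save().
 */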

/* Uses preallocated memory to initialize a new stack depot pool. */
static void depot_init_pool(void **prealloc)
{
	/*
	 * If the next pool is already initialized or the maximum number of
	 * pools is reached, do not use the preallocated memory.
	 * smp_load_acquire() here pairs with smp_store_release() below and
	 * in depot_alloc_stack().
	 */
	if (!smp_load_acquire(&next_pool_required))
		return;

	/* Check if the current pool is not yet allocated. */
	if (stack_pools[pool_index] == NULL) {
		/* Use the preallocated memory for the current pool. */
		stack_pools[pool_index] = *prealloc;
		*prealloc = NULL;
	} else {
		/*
		 * Otherwise, use the preallocated memory for the next pool
		 * as long as we do not exceed the maximum number of pools.
		 */
		if (pool_index + 1 < DEPOT_MAX_POOLS) {
			stack_pools[pool_index + 1] = *prealloc;
			*prealloc = NULL;
		}
		/*
		 * At this point, either the next pool is initialized or the
		 * maximum number of pools is reached. In either case, take
		 * note that initializing another pool is not required.
		 * This smp_store_release() pairs with smp_load_acquire() above
		 * and in stack_depot_save().
		 */
		smp_store_release(&next_pool_required, 0);
	}
}

/* Allocates a new stack in a stack depot pool. */
static struct stack_record *
depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
{
	struct stack_record *stack;
	size_t required_size = struct_size(stack, entries, size);

	required_size = ALIGN(required_size, 1 << DEPOT_STACK_ALIGN);

	/* Check if there is not enough space in the current pool. */
	if (unlikely(pool_offset + required_size > DEPOT_POOL_SIZE)) {
		/* Bail out if we reached the pool limit. */
		if (unlikely(pool_index + 1 >= DEPOT_MAX_POOLS)) {
			WARN_ONCE(1, "Stack depot reached limit capacity");
			return NULL;
		}

		/* Move on to the next pool. */
		pool_index++;
		pool_offset = 0;
		/*
		 * If the maximum number of pools is not reached, take note
		 * that the next pool needs to be initialized.
		 * smp_store_release() here pairs with smp_load_acquire() in
		 * stack_depot_save() and depot_init_pool().
		 */
		if (pool_index + 1 < DEPOT_MAX_POOLS)
			smp_store_release(&next_pool_required, 1);
	}

	/* Assign the preallocated memory to a pool if required. */
	if (*prealloc)
		depot_init_pool(prealloc);

	/* Check if we have a pool to save the stack trace. */
	if (stack_pools[pool_index] == NULL)
		return NULL;

	/* Save the stack trace. */
	stack = stack_pools[pool_index] + pool_offset;
	stack->hash = hash;
	stack->size = size;
	stack->handle.pool_index = pool_index;
	stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN;
	stack->handle.valid = 1;
	stack->handle.extra = 0;
	memcpy(stack->entries, entries, flex_array_size(stack, entries, size));
	pool_offset += required_size;

	return stack;
}

/* Calculate hash for a stack */
static inline u32 hash_stack(unsigned long *entries, unsigned int size)
{
	return jhash2((u32 *)entries,
		      array_size(size, sizeof(*entries)) / sizeof(u32),
		      STACK_HASH_SEED);
}

/*
 * Use our own, non-instrumented version of memcmp().
 *
 * We actually don't care about the order, just the equality.
 */
static inline
int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2,
			unsigned int n)
{
	for ( ; n-- ; u1++, u2++) {
		if (*u1 != *u2)
			return 1;
	}
	return 0;
}

/* Find a stack that is equal to the one stored in entries in the hash */
static inline struct stack_record *find_stack(struct stack_record *bucket,
					     unsigned long *entries, int size,
					     u32 hash)
{
	struct stack_record *found;

	for (found = bucket; found; found = found->next) {
		if (found->hash == hash &&
		    found->size == size &&
		    !stackdepot_memcmp(entries, found->entries, size))
			return found;
	}
	return NULL;
}

/**
 * __stack_depot_save - Save a stack trace from an array
 *
 * @entries:		Pointer to storage array
 * @nr_entries:		Size of the storage array
 * @alloc_flags:	Allocation gfp flags
 * @can_alloc:		Allocate stack pools (increased chance of failure if false)
 *
 * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is
 * %true, stack depot is allowed to replenish the stack pool in case no space is
 * left (allocates using GFP flags of @alloc_flags). If @can_alloc is %false,
 * stack depot avoids any allocations and will fail if no space is left to store
 * the stack trace.
 *
 * If the stack trace in @entries is from an interrupt, only the portion up to
 * interrupt entry is saved.
 *
 * Context: Any context, but setting @can_alloc to %false is required if
 *          alloc_pages() cannot be used from the current context. Currently
 *          this is the case from contexts where neither %GFP_ATOMIC nor
 *          %GFP_NOWAIT can be used (NMI, raw_spin_lock).
 *
 * Return: The handle of the stack struct stored in depot, 0 on failure.
 */
depot_stack_handle_t __stack_depot_save(unsigned long *entries,
					unsigned int nr_entries,
					gfp_t alloc_flags, bool can_alloc)
{
	struct stack_record *found = NULL, **bucket;
	union handle_parts retval = { .handle = 0 };
	struct page *page = NULL;
	void *prealloc = NULL;
	unsigned long flags;
	u32 hash;

	/*
	 * If this stack trace is from an interrupt, including anything before
	 * interrupt entry usually leads to unbounded stackdepot growth.
	 *
	 * Because use of filter_irq_stacks() is a requirement to ensure
	 * stackdepot can efficiently deduplicate interrupt stacks, always call
	 * filter_irq_stacks() here to simplify all callers' use of stackdepot.
	 */
	nr_entries = filter_irq_stacks(entries, nr_entries);

	if (unlikely(nr_entries == 0) || stack_depot_disabled)
		goto fast_exit;

	hash = hash_stack(entries, nr_entries);
	bucket = &stack_table[hash & stack_hash_mask];

	/*
	 * Fast path: look the stack trace up without locking.
	 * The smp_load_acquire() here pairs with smp_store_release() to
	 * |bucket| below.
	 */
	found = find_stack(smp_load_acquire(bucket), entries,
			   nr_entries, hash);
	if (found)
		goto exit;

	/*
	 * Check if another stack pool needs to be initialized. If so, allocate
	 * the memory now - we won't be able to do that under the lock.
	 *
	 * The smp_load_acquire() here pairs with smp_store_release() to
	 * |next_pool_required| in depot_alloc_stack() and depot_init_pool().
	 */
	if (unlikely(can_alloc && smp_load_acquire(&next_pool_required))) {
		/*
		 * Zero out zone modifiers, as we don't have specific zone
		 * requirements. Keep the flags related to allocation in atomic
		 * contexts and I/O.
		 */
		alloc_flags &= ~GFP_ZONEMASK;
		alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
		alloc_flags |= __GFP_NOWARN;
		page = alloc_pages(alloc_flags, DEPOT_POOL_ORDER);
		if (page)
			prealloc = page_address(page);
	}

	raw_spin_lock_irqsave(&pool_lock, flags);

	found = find_stack(*bucket, entries, nr_entries, hash);
	if (!found) {
		struct stack_record *new = depot_alloc_stack(entries, nr_entries, hash, &prealloc);

		if (new) {
			new->next = *bucket;
			/*
			 * This smp_store_release() pairs with
			 * smp_load_acquire() from |bucket| above.
			 */
			smp_store_release(bucket, new);
			found = new;
		}
	} else if (prealloc) {
		/*
		 * We didn't need to store this stack trace, but let's keep
		 * the preallocated memory for the future.
		 */
		depot_init_pool(&prealloc);
	}

	raw_spin_unlock_irqrestore(&pool_lock, flags);
exit:
	if (prealloc) {
		/* Nobody used this memory, ok to free it. */
		free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
	}
	if (found)
		retval.handle = found->handle.handle;
fast_exit:
	return retval.handle;
}
EXPORT_SYMBOL_GPL(__stack_depot_save);

/**
 * stack_depot_save - Save a stack trace from an array
 *
 * @entries:		Pointer to storage array
 * @nr_entries:		Size of the storage array
 * @alloc_flags:	Allocation gfp flags
 *
 * Context: Contexts where allocations via alloc_pages() are allowed.
 *          See __stack_depot_save() for more details.
 *
 * Return: The handle of the stack struct stored in depot, 0 on failure.
 */
depot_stack_handle_t stack_depot_save(unsigned long *entries,
				      unsigned int nr_entries,
				      gfp_t alloc_flags)
{
	return __stack_depot_save(entries, nr_entries, alloc_flags, true);
}
EXPORT_SYMBOL_GPL(stack_depot_save);
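
/*
 * Illustrative round trip (example code, not part of this file): a typical
 * user captures a trace with stack_trace_save(), deduplicates it via
 * stack_depot_save(), and later expands the 32-bit handle again:
 *
 *	unsigned long entries[16];
 *	unsigned long *saved;
 *	unsigned int nr, saved_nr;
 *	depot_stack_handle_t handle;
 *
 *	nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
 *	handle = stack_depot_save(entries, nr, GFP_KERNEL);
 *
 *	saved_nr = stack_depot_fetch(handle, &saved);
 *	stack_trace_print(saved, saved_nr, 0);
 *
 * stack_depot_print(handle) below is shorthand for the last two steps.
 */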

/**
 * stack_depot_fetch - Fetch stack entries from a depot
 *
 * @handle:		Stack depot handle which was returned from
 *			stack_depot_save().
 * @entries:		Pointer to store the entries address
 *
 * Return: The number of trace entries for this depot.
 */
unsigned int stack_depot_fetch(depot_stack_handle_t handle,
			       unsigned long **entries)
{
	union handle_parts parts = { .handle = handle };
	void *pool;
	size_t offset = parts.offset << DEPOT_STACK_ALIGN;
	struct stack_record *stack;

	*entries = NULL;
	if (!handle)
		return 0;

	if (parts.pool_index > pool_index) {
		WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
			parts.pool_index, pool_index, handle);
		return 0;
	}
	pool = stack_pools[parts.pool_index];
	if (!pool)
		return 0;
	stack = pool + offset;

	*entries = stack->entries;
	return stack->size;
}
EXPORT_SYMBOL_GPL(stack_depot_fetch);

/**
 * stack_depot_print - Print stack entries from a depot
 *
 * @stack:		Stack depot handle which was returned from
 *			stack_depot_save().
 */
void stack_depot_print(depot_stack_handle_t stack)
{
	unsigned long *entries;
	unsigned int nr_entries;

	nr_entries = stack_depot_fetch(stack, &entries);
	if (nr_entries > 0)
		stack_trace_print(entries, nr_entries, 0);
}
EXPORT_SYMBOL_GPL(stack_depot_print);

/**
 * stack_depot_snprint - Print stack entries from a depot into a buffer
 *
 * @handle:	Stack depot handle which was returned from
 *		stack_depot_save().
 * @buf:	Pointer to the print buffer
 * @size:	Size of the print buffer
 * @spaces:	Number of leading spaces to print
 *
 * Return:	Number of bytes printed.
 */
int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
		       int spaces)
{
	unsigned long *entries;
	unsigned int nr_entries;

	nr_entries = stack_depot_fetch(handle, &entries);
	return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries,
						spaces) : 0;
}
EXPORT_SYMBOL_GPL(stack_depot_snprint);

/**
 * stack_depot_set_extra_bits - Set extra bits in a stack depot handle
 *
 * @handle:	Stack depot handle returned from stack_depot_save()
 * @extra_bits:	Value to set the extra bits
 *
 * Return: Stack depot handle with extra bits set
 *
 * Stack depot handles have a few unused bits, which can be used for storing
 * user-specific information. These bits are transparent to the stack depot.
 */
depot_stack_handle_t __must_check stack_depot_set_extra_bits(
			depot_stack_handle_t handle, unsigned int extra_bits)
{
	union handle_parts parts = { .handle = handle };

	/* Don't set extra bits on empty handles. */
	if (!handle)
		return 0;

	parts.extra = extra_bits;
	return parts.handle;
}
EXPORT_SYMBOL(stack_depot_set_extra_bits);

/**
 * stack_depot_get_extra_bits - Retrieve extra bits from a stack depot handle
 *
 * @handle:	Stack depot handle with extra bits saved
 *
 * Return: Extra bits retrieved from the stack depot handle
 */
unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle)
{
	union handle_parts parts = { .handle = handle };

	return parts.extra;
}
EXPORT_SYMBOL(stack_depot_get_extra_bits);
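
/*
 * Illustrative use of the extra bits (example code, not part of this file):
 * callers may stash a small tag, up to STACK_DEPOT_EXTRA_BITS wide, in the
 * otherwise unused handle bits and recover it later:
 *
 *	handle = stack_depot_set_extra_bits(handle, tag);
 *	...
 *	tag = stack_depot_get_extra_bits(handle);
 *
 * A handle with extra bits set still resolves to the same stack trace via
 * stack_depot_fetch(), which only decodes the pool index and offset.
 */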