// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Debugging module statistics.
 *
 * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
 */

#include <linux/module.h>
#include <uapi/linux/module.h>
#include <linux/string.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/debugfs.h>
#include <linux/rculist.h>
#include <linux/math.h>

#include "internal.h"

/**
 * DOC: module debugging statistics overview
 *
 * Enabling CONFIG_MODULE_STATS enables module debugging statistics which
 * are useful to monitor and root cause memory pressure issues with module
 * loading. These statistics also help us improve production workloads.
 *
 * The module debugging statistics currently supported help keep track of
 * module loading failures so that improvements can be made either to kernel
 * module auto-loading usage (request_module()) or to interactions with
 * userspace. Statistics are provided to track all possible failures in the
 * finit_module() path and the memory wasted in the process. Each of the
 * failure counters is associated with a type of module loading failure which
 * is known to incur a certain amount of memory allocation loss. In the worst
 * case loading a module will fail after a 3-step memory allocation process:
 *
 *   a) memory allocated with kernel_read_file_from_fd()
 *   b) module decompression processes the file read from
 *      kernel_read_file_from_fd(), and vmap() is used to map
 *      the decompressed module to a new local buffer which represents
 *      a copy of the decompressed module passed from userspace. The buffer
 *      from kernel_read_file_from_fd() is freed right away.
 *   c) layout_and_allocate() allocates space for the final resting
 *      place where we would keep the module if it were to be processed
 *      successfully.
 *
 * If a failure occurs after these three different allocations, only one
 * counter will be incremented, with the sum of the allocated bytes freed
 * during this failure. Likewise, if module loading fails right after step
 * b), a separate counter is used and incremented with the bytes freed and
 * not used during both of those allocations. A worked example with made up
 * numbers follows this comment.
 *
 * Virtual memory space can be limited; for example, on x86 it defaults to
 * 128 MiB. We should strive to limit wasted virtual memory allocations
 * whenever possible. These module debugging statistics help to evaluate how
 * much memory is being wasted on bootup due to module loading failures.
 *
 * All counters are designed to be incremental. Atomic counters are used to
 * keep things simple and to avoid delays and deadlocks.
 */
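
/*
 * Worked example of the worst case above, with made up sizes (the real
 * numbers depend entirely on the module): take a module whose uncompressed
 * image is 8 MiB and whose compressed file is 2 MiB, and assume it fails in
 * add_unformed_module(). Roughly 2 MiB (initial kernel_read_file_from_fd()
 * of the compressed file) + 8 MiB (decompression buffer) + 8 MiB
 * (layout_and_allocate()) of virtual memory end up allocated and then freed
 * with no functional use, and the counters below account for about 18 MiB
 * of wasted virtual memory for this single failure.
 */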

/**
 * DOC: dup_failed_modules - tracks duplicate failed modules
 *
 * Linked list of modules which failed to be loaded because a module with the
 * same name was already being processed or was already loaded. The
 * finit_module() system call incurs heavy virtual memory allocations. In the
 * worst case a finit_module() system call can end up allocating virtual
 * memory 3 times:
 *
 *   1) the kernel_read_file_from_fd() call uses vmalloc()
 *   2) optional module decompression uses vmap()
 *   3) layout_and_allocate() can use vzalloc() or an arch-specific variation
 *      of vmalloc() to deal with ELF sections requiring special permissions
 *
 * In practice on a typical boot today most finit_module() calls fail due to
 * the module with the same name already being loaded or about to be
 * processed. All virtual memory allocated to these failed modules will be
 * freed with no functional use.
 *
 * To help with this, the dup_failed_modules list allows us to track modules
 * which failed to load due to the fact that a module was already loaded or
 * being processed. There are only two points at which we can fail such
 * calls; we list them below along with the number of virtual memory
 * allocation calls:
 *
 *   a) FAIL_DUP_MOD_BECOMING: at the end of early_mod_check() before
 *      layout_and_allocate().
 *      - with module decompression: 2 virtual memory allocation calls
 *      - without module decompression: 1 virtual memory allocation call
 *   b) FAIL_DUP_MOD_LOAD: after layout_and_allocate() on add_unformed_module()
 *      - with module decompression: 3 virtual memory allocation calls
 *      - without module decompression: 2 virtual memory allocation calls
 *
 * We should strive to get this list to be as small as possible. If this list
 * is not empty it is a reflection of work or optimizations possible either
 * in-kernel or in userspace.
 */
static LIST_HEAD(dup_failed_modules);

/**
 * DOC: module statistics debugfs counters
 *
 * The total amount of wasted virtual memory allocation space during module
 * loading can be computed by adding up:
 *
 * * @invalid_kread_bytes +
 *   @invalid_decompress_bytes +
 *   @invalid_becoming_bytes +
 *   @invalid_mod_bytes
 *
 * The following debugfs counters are available to inspect module loading
 * failures; a note on reading them from userspace follows this comment:
 *
 * * total_mod_size: total bytes ever used by all modules we've dealt with on
 *   this system
 * * total_text_size: total bytes of the .text and .init.text ELF section
 *   sizes we've dealt with on this system
 * * invalid_kread_bytes: bytes allocated and then freed on failures which
 *   happen due to the initial kernel_read_file_from_fd(), which uses
 *   vmalloc(). These failures should typically not happen unless your system
 *   is under memory pressure.
 * * invalid_decompress_bytes: number of bytes allocated and freed due to
 *   memory allocations in the module decompression path that use vmap().
 *   These typically should not happen unless your system is under memory
 *   pressure.
 * * invalid_becoming_bytes: total number of bytes allocated and freed which
 *   were used to read the kernel module userspace wants us to load, before
 *   we promote it to be processed and added to our @modules linked list.
 *   These failures can happen if a check fails in between a successful
 *   kernel_read_file_from_fd() call and right before we allocate our private
 *   memory for the module, which would be kept if the module is successfully
 *   loaded. The most common reason for this failure is when userspace is
 *   racing to load a module which it does not yet see loaded.
 *   The first module to succeed in add_unformed_module() will add a module
 *   to our &modules list and subsequent loads of modules with the same name
 *   will error out at the end of early_mod_check(). The check for
 *   module_patient_check_exists() at the end of early_mod_check() prevents
 *   duplicate allocations on layout_and_allocate() for modules already being
 *   processed. These duplicate failed modules are non-fatal, however they
 *   typically are indicative of userspace not yet seeing the module as
 *   loaded and unnecessarily trying to load a module before the kernel even
 *   has a chance to begin to process prior requests. Although duplicate
 *   failures can be non-fatal, we should try to reduce vmalloc() pressure
 *   proactively, so ideally after boot this will be as close to 0 as
 *   possible. If module decompression was used we also add to this counter
 *   the cost of the initial kernel_read_file_from_fd() of the compressed
 *   module. If module decompression was not used the value represents the
 *   total allocated and freed bytes in kernel_read_file_from_fd() calls for
 *   these types of failures. These failures can occur because of:
 *
 *   * module_sig_check() - module signature checks
 *   * elf_validity_cache_copy() - some ELF validation issue
 *   * early_mod_check():
 *
 *     * blacklisting
 *     * failed to rewrite section headers
 *     * version magic
 *     * live patch requirements didn't check out
 *     * the module was detected as being already present
 *
 * * invalid_mod_bytes: the total number of bytes allocated and freed due to
 *   failures after we did all the sanity checks of the module which
 *   userspace passed to us and after our first check that the module is
 *   unique. A module can still fail to load if we detect the module is
 *   already loaded after we allocate space for it with layout_and_allocate();
 *   we do this check right before making the module live and running its
 *   initialization routines. Note that if you have a failure of this type it
 *   also means the respective kernel_read_file_from_fd() memory space was
 *   also freed and not used, and so we increment this counter with twice
 *   the size of the module. Additionally, if you used module decompression
 *   the size of the compressed module is also added to this counter.
 *
 * * modcount: how many modules we've loaded in our kernel lifetime
 * * failed_kreads: how many modules failed due to a failed
 *   kernel_read_file_from_fd()
 * * failed_decompress: how many failed module decompression attempts we've
 *   had. These really should not happen unless your compression /
 *   decompression is broken.
 * * failed_becoming: how many modules failed after kernel_read_file_from_fd()
 *   and before we allocate memory for them with layout_and_allocate(). This
 *   counter is never incremented if you manage to validate the module and
 *   call layout_and_allocate() for it.
 * * failed_load_modules: how many modules failed once we've allocated our
 *   private space for our module using layout_and_allocate(). These failures
 *   should hopefully mostly be dealt with already. Races in theory could
 *   still exist here, but it would just mean the kernel had started
 *   processing two threads concurrently up to early_mod_check() and one
 *   thread won. These failures are good signs the kernel or userspace is
 *   doing something seriously stupid or that could be improved. We should
 *   strive to fix these, but it is perhaps not easy to fix them. A recent
 *   example is the module requests incurred for frequency modules: a
 *   separate module request was being issued for each CPU on a system.
 */
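
/*
 * Note on reading these counters: module_stats_init() below exposes each
 * counter as its own read-only debugfs file next to an aggregated "stats"
 * file. Assuming debugfs is mounted at the usual /sys/kernel/debug and that
 * mod_debugfs_root corresponds to a "modules" directory there (that dentry
 * is created elsewhere in the module loader), they can be inspected from
 * userspace with something like:
 *
 *	cat /sys/kernel/debug/modules/stats
 *	cat /sys/kernel/debug/modules/invalid_becoming_bytes
 */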

atomic_long_t total_mod_size;
atomic_long_t total_text_size;
atomic_long_t invalid_kread_bytes;
atomic_long_t invalid_decompress_bytes;
static atomic_long_t invalid_becoming_bytes;
static atomic_long_t invalid_mod_bytes;
atomic_t modcount;
atomic_t failed_kreads;
atomic_t failed_decompress;
static atomic_t failed_becoming;
static atomic_t failed_load_modules;

static const char *mod_fail_to_str(struct mod_fail_load *mod_fail)
{
	if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask) &&
	    test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask))
		return "Becoming & Load";
	if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask))
		return "Becoming";
	if (test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask))
		return "Load";
	return "Bug-on-stats";
}

/*
 * Account for a module which failed after layout_and_allocate(): the module
 * image we were working on and the final module layout were both allocated
 * and freed, hence info->len is counted twice; for compressed modules the
 * initial compressed read is added on top.
 */
void mod_stat_bump_invalid(struct load_info *info, int flags)
{
	atomic_long_add(info->len * 2, &invalid_mod_bytes);
	atomic_inc(&failed_load_modules);
#if defined(CONFIG_MODULE_DECOMPRESS)
	if (flags & MODULE_INIT_COMPRESSED_FILE)
		atomic_long_add(info->compressed_len, &invalid_mod_bytes);
#endif
}

/*
 * Account for a module which failed after a successful
 * kernel_read_file_from_fd() but before layout_and_allocate().
 */
void mod_stat_bump_becoming(struct load_info *info, int flags)
{
	atomic_inc(&failed_becoming);
	atomic_long_add(info->len, &invalid_becoming_bytes);
#if defined(CONFIG_MODULE_DECOMPRESS)
	if (flags & MODULE_INIT_COMPRESSED_FILE)
		atomic_long_add(info->compressed_len, &invalid_becoming_bytes);
#endif
}

/*
 * Record a module which failed to load because a module with the same name
 * was already loaded or being processed. Called with module_mutex held, as
 * the lockdep_is_held() annotation on the list walk documents.
 */
int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason)
{
	struct mod_fail_load *mod_fail;

	list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list,
				lockdep_is_held(&module_mutex)) {
		if (!strcmp(mod_fail->name, name)) {
			atomic_long_inc(&mod_fail->count);
			__set_bit(reason, &mod_fail->dup_fail_mask);
			goto out;
		}
	}

	mod_fail = kzalloc(sizeof(*mod_fail), GFP_KERNEL);
	if (!mod_fail)
		return -ENOMEM;
	memcpy(mod_fail->name, name, strlen(name));
	__set_bit(reason, &mod_fail->dup_fail_mask);
	atomic_long_inc(&mod_fail->count);
	list_add_rcu(&mod_fail->list, &dup_failed_modules);
out:
	return 0;
}
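
/*
 * Simplified sketch (not the actual call sites, which live in the module
 * loader proper) of how the helpers above are meant to be used together on
 * the two duplicate-failure paths described in the kernel-doc above:
 *
 *	a duplicate found at the end of early_mod_check(), i.e. before
 *	layout_and_allocate():
 *
 *		mod_stat_bump_becoming(info, flags);
 *		try_add_failed_module(info->name, FAIL_DUP_MOD_BECOMING);
 *
 *	a duplicate found in add_unformed_module(), i.e. after
 *	layout_and_allocate():
 *
 *		mod_stat_bump_invalid(info, flags);
 *		try_add_failed_module(info->name, FAIL_DUP_MOD_LOAD);
 */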

/*
 * At 64 bytes per module and assuming a 1024 byte preamble we can fit
 * 112 module prints within 8k.
 *
 * 1024 + (64*112) = 8k
 */
#define MAX_PREAMBLE 1024
#define MAX_FAILED_MOD_PRINT 112
#define MAX_BYTES_PER_MOD 64
static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf,
				   size_t count, loff_t *ppos)
{
	struct mod_fail_load *mod_fail;
	unsigned int len, size, count_failed = 0;
	char *buf;
	int ret;
	u32 live_mod_count, fkreads, fdecompress, fbecoming, floads;
	unsigned long total_size, text_size, ikread_bytes, ibecoming_bytes,
		      idecompress_bytes, imod_bytes, total_virtual_lost;

	live_mod_count = atomic_read(&modcount);
	fkreads = atomic_read(&failed_kreads);
	fdecompress = atomic_read(&failed_decompress);
	fbecoming = atomic_read(&failed_becoming);
	floads = atomic_read(&failed_load_modules);

	total_size = atomic_long_read(&total_mod_size);
	text_size = atomic_long_read(&total_text_size);
	ikread_bytes = atomic_long_read(&invalid_kread_bytes);
	idecompress_bytes = atomic_long_read(&invalid_decompress_bytes);
	ibecoming_bytes = atomic_long_read(&invalid_becoming_bytes);
	imod_bytes = atomic_long_read(&invalid_mod_bytes);

	total_virtual_lost = ikread_bytes + idecompress_bytes + ibecoming_bytes + imod_bytes;

	size = MAX_PREAMBLE + min((unsigned int)(floads + fbecoming),
				  (unsigned int)MAX_FAILED_MOD_PRINT) * MAX_BYTES_PER_MOD;
	buf = kzalloc(size, GFP_KERNEL);
	if (buf == NULL)
		return -ENOMEM;

	/* The beginning of our debug preamble */
	len = scnprintf(buf, size, "%25s\t%u\n", "Mods ever loaded", live_mod_count);

	len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on kread", fkreads);

	len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on decompress",
			 fdecompress);
	len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on becoming", fbecoming);

	len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on load", floads);

	len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total module size", total_size);
	len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total mod text size", text_size);

	len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kread bytes", ikread_bytes);

	len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed decompress bytes",
			 idecompress_bytes);

	len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed becoming bytes", ibecoming_bytes);

	len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kmod bytes", imod_bytes);

	len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Virtual mem wasted bytes", total_virtual_lost);

	if (live_mod_count && total_size) {
		len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod size",
				 DIV_ROUND_UP(total_size, live_mod_count));
	}

	if (live_mod_count && text_size) {
		len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod text size",
				 DIV_ROUND_UP(text_size, live_mod_count));
	}

	/*
	 * We use WARN_ON_ONCE() for the counters to ensure we always have
	 * parity for keeping tabs on a type of failure with one type of byte
	 * counter. The counter for imod_bytes does not increase for fkreads
	 * failures, for example, and so on.
	 */

	WARN_ON_ONCE(ikread_bytes && !fkreads);
	if (fkreads && ikread_bytes) {
		len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail kread bytes",
				 DIV_ROUND_UP(ikread_bytes, fkreads));
	}

	WARN_ON_ONCE(ibecoming_bytes && !fbecoming);
	if (fbecoming && ibecoming_bytes) {
		len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail becoming bytes",
				 DIV_ROUND_UP(ibecoming_bytes, fbecoming));
	}

	WARN_ON_ONCE(idecompress_bytes && !fdecompress);
	if (fdecompress && idecompress_bytes) {
		len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail decomp bytes",
				 DIV_ROUND_UP(idecompress_bytes, fdecompress));
	}

	WARN_ON_ONCE(imod_bytes && !floads);
	if (floads && imod_bytes) {
		len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average fail load bytes",
				 DIV_ROUND_UP(imod_bytes, floads));
	}

	/* End of our debug preamble header. */

	/* Catch when we've gone beyond our expected preamble */
	WARN_ON_ONCE(len >= MAX_PREAMBLE);

	if (list_empty(&dup_failed_modules))
		goto out;

	len += scnprintf(buf + len, size - len, "Duplicate failed modules:\n");
	len += scnprintf(buf + len, size - len, "%25s\t%15s\t%25s\n",
			 "Module-name", "How-many-times", "Reason");
	mutex_lock(&module_mutex);

	list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list) {
		if (WARN_ON_ONCE(++count_failed >= MAX_FAILED_MOD_PRINT))
			goto out_unlock;
		len += scnprintf(buf + len, size - len, "%25s\t%15lu\t%25s\n", mod_fail->name,
				 atomic_long_read(&mod_fail->count), mod_fail_to_str(mod_fail));
	}
out_unlock:
	mutex_unlock(&module_mutex);
out:
	ret = simple_read_from_buffer(user_buf, count, ppos, buf, len);
	kfree(buf);
	return ret;
}
#undef MAX_PREAMBLE
#undef MAX_FAILED_MOD_PRINT
#undef MAX_BYTES_PER_MOD

static const struct file_operations fops_mod_stats = {
	.read = read_file_mod_stats,
	.open = simple_open,
	.owner = THIS_MODULE,
	.llseek = default_llseek,
};

#define mod_debug_add_ulong(name) debugfs_create_ulong(#name, 0400, mod_debugfs_root, (unsigned long *) &name.counter)
#define mod_debug_add_atomic(name) debugfs_create_atomic_t(#name, 0400, mod_debugfs_root, &name)
static int __init module_stats_init(void)
{
	mod_debug_add_ulong(total_mod_size);
	mod_debug_add_ulong(total_text_size);
	mod_debug_add_ulong(invalid_kread_bytes);
	mod_debug_add_ulong(invalid_decompress_bytes);
	mod_debug_add_ulong(invalid_becoming_bytes);
	mod_debug_add_ulong(invalid_mod_bytes);

	mod_debug_add_atomic(modcount);
	mod_debug_add_atomic(failed_kreads);
	mod_debug_add_atomic(failed_decompress);
	mod_debug_add_atomic(failed_becoming);
	mod_debug_add_atomic(failed_load_modules);

	debugfs_create_file("stats", 0400, mod_debugfs_root, mod_debugfs_root, &fops_mod_stats);

	return 0;
}
#undef mod_debug_add_ulong
#undef mod_debug_add_atomic
module_init(module_stats_init);
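
/*
 * Illustration only, with made up numbers and a made up module name: the
 * "stats" file produced by read_file_mod_stats() is a label/value preamble
 * followed, when duplicates exist, by a table of duplicate failed modules.
 * A trimmed-down read could look roughly like:
 *
 *	         Mods ever loaded	110
 *	  Mods failed on becoming	40
 *	      Mods failed on load	2
 *	    Failed becoming bytes	1048576
 *	        Failed kmod bytes	69632
 *	 Virtual mem wasted bytes	1118208
 *	...
 *	Duplicate failed modules:
 *	              Module-name	 How-many-times	                   Reason
 *	                  foo_bar	              3	                 Becoming
 */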