cec.c (9632a3299bb1897f01c6a485ff035b20e61d7ae1) | cec.c (b8b5ca6600dec2a4f1e50ca9d3cf9e1d032870cd) |
---|---|
1// SPDX-License-Identifier: GPL-2.0 2#include <linux/mm.h> 3#include <linux/gfp.h> 4#include <linux/kernel.h> 5#include <linux/workqueue.h> 6 7#include <asm/mce.h> 8 --- 23 unchanged lines hidden (view full) --- 32 * 33 * Spring cleaning is what we do when we reach a certain number CLEAN_ELEMS of 34 * elements entered into the array, during which, we're decaying all elements. 35 * If, after decay, an element gets inserted again, its generation is set to 11b 36 * to make sure it has higher numerical count than other, older elements and 37 * thus emulate an LRU-like behavior when deleting elements to free up space 38 * in the page. 39 * | 1// SPDX-License-Identifier: GPL-2.0 2#include <linux/mm.h> 3#include <linux/gfp.h> 4#include <linux/kernel.h> 5#include <linux/workqueue.h> 6 7#include <asm/mce.h> 8 --- 23 unchanged lines hidden (view full) --- 32 * 33 * Spring cleaning is what we do when we reach a certain number CLEAN_ELEMS of 34 * elements entered into the array, during which, we're decaying all elements. 35 * If, after decay, an element gets inserted again, its generation is set to 11b 36 * to make sure it has higher numerical count than other, older elements and 37 * thus emulate an LRU-like behavior when deleting elements to free up space 38 * in the page. 39 * |
40 * When an element reaches its max count of count_threshold, we try to poison 41 * it by assuming that errors triggered count_threshold times in a single page 42 * are excessive and that page shouldn't be used anymore. count_threshold is | 40 * When an element reaches its max count of action_threshold, we try to poison 41 * it by assuming that errors triggered action_threshold times in a single page 42 * are excessive and that page shouldn't be used anymore. action_threshold is |
43 * initialized to COUNT_MASK which is the maximum. 44 * 45 * That error event entry causes cec_add_elem() to return !0 value and thus 46 * signal to its callers to log the error. 47 * 48 * To the question why we've chosen a page and moving elements around with 49 * memmove(), it is because it is a very simple structure to handle and max data 50 * movement is 4K which on highly optimized modern CPUs is almost unnoticeable. --- 66 unchanged lines hidden (view full) --- 117 __u32 flags; 118 }; 119} ce_arr; 120 121static DEFINE_MUTEX(ce_mutex); 122static u64 dfs_pfn; 123 124/* Amount of errors after which we offline */ | 43 * initialized to COUNT_MASK which is the maximum. 44 * 45 * That error event entry causes cec_add_elem() to return !0 value and thus 46 * signal to its callers to log the error. 47 * 48 * To the question why we've chosen a page and moving elements around with 49 * memmove(), it is because it is a very simple structure to handle and max data 50 * movement is 4K which on highly optimized modern CPUs is almost unnoticeable. --- 66 unchanged lines hidden (view full) --- 117 __u32 flags; 118 }; 119} ce_arr; 120 121static DEFINE_MUTEX(ce_mutex); 122static u64 dfs_pfn; 123 124/* Amount of errors after which we offline */ |
125static unsigned int count_threshold = COUNT_MASK; | 125static u64 action_threshold = COUNT_MASK; |
126 127/* Each element "decays" each decay_interval which is 24hrs by default. */ 128#define CEC_DECAY_DEFAULT_INTERVAL 24 * 60 * 60 /* 24 hrs */ 129#define CEC_DECAY_MIN_INTERVAL 1 * 60 * 60 /* 1h */ 130#define CEC_DECAY_MAX_INTERVAL 30 * 24 * 60 * 60 /* one month */ 131static struct delayed_work cec_work; 132static u64 decay_interval = CEC_DECAY_DEFAULT_INTERVAL; 133 --- 206 unchanged lines hidden (view full) --- 340 } 341 342 /* Add/refresh element generation and increment count */ 343 ca->array[to] |= DECAY_MASK << COUNT_BITS; 344 ca->array[to]++; 345 346 /* Check action threshold and soft-offline, if reached. */ 347 count = COUNT(ca->array[to]); | 126 127/* Each element "decays" each decay_interval which is 24hrs by default. */ 128#define CEC_DECAY_DEFAULT_INTERVAL 24 * 60 * 60 /* 24 hrs */ 129#define CEC_DECAY_MIN_INTERVAL 1 * 60 * 60 /* 1h */ 130#define CEC_DECAY_MAX_INTERVAL 30 * 24 * 60 * 60 /* one month */ 131static struct delayed_work cec_work; 132static u64 decay_interval = CEC_DECAY_DEFAULT_INTERVAL; 133 --- 206 unchanged lines hidden (view full) --- 340 } 341 342 /* Add/refresh element generation and increment count */ 343 ca->array[to] |= DECAY_MASK << COUNT_BITS; 344 ca->array[to]++; 345 346 /* Check action threshold and soft-offline, if reached. */ 347 count = COUNT(ca->array[to]); |
348 if (count >= count_threshold) { | 348 if (count >= action_threshold) { |
349 u64 pfn = ca->array[to] >> PAGE_SHIFT; 350 351 if (!pfn_valid(pfn)) { 352 pr_warn("CEC: Invalid pfn: 0x%llx\n", pfn); 353 } else { 354 /* We have reached max count for this page, soft-offline it. */ 355 pr_err("Soft-offlining pfn: 0x%llx\n", pfn); 356 memory_failure_queue(pfn, MF_SOFT_OFFLINE); --- 54 unchanged lines hidden (view full) --- 411 decay_interval = val; 412 413 cec_mod_work(decay_interval); 414 415 return 0; 416} 417DEFINE_DEBUGFS_ATTRIBUTE(decay_interval_ops, u64_get, decay_interval_set, "%lld\n"); 418 | 349 u64 pfn = ca->array[to] >> PAGE_SHIFT; 350 351 if (!pfn_valid(pfn)) { 352 pr_warn("CEC: Invalid pfn: 0x%llx\n", pfn); 353 } else { 354 /* We have reached max count for this page, soft-offline it. */ 355 pr_err("Soft-offlining pfn: 0x%llx\n", pfn); 356 memory_failure_queue(pfn, MF_SOFT_OFFLINE); --- 54 unchanged lines hidden (view full) --- 411 decay_interval = val; 412 413 cec_mod_work(decay_interval); 414 415 return 0; 416} 417DEFINE_DEBUGFS_ATTRIBUTE(decay_interval_ops, u64_get, decay_interval_set, "%lld\n"); 418 |
419static int count_threshold_set(void *data, u64 val) | 419static int action_threshold_set(void *data, u64 val) |
420{ 421 *(u64 *)data = val; 422 423 if (val > COUNT_MASK) 424 val = COUNT_MASK; 425 | 420{ 421 *(u64 *)data = val; 422 423 if (val > COUNT_MASK) 424 val = COUNT_MASK; 425 |
426 count_threshold = val; | 426 action_threshold = val; |
427 428 return 0; 429} | 427 428 return 0; 429} |
430DEFINE_DEBUGFS_ATTRIBUTE(count_threshold_ops, u64_get, count_threshold_set, "%lld\n"); | 430DEFINE_DEBUGFS_ATTRIBUTE(action_threshold_ops, u64_get, action_threshold_set, "%lld\n"); |
431 432static int array_dump(struct seq_file *m, void *v) 433{ 434 struct ce_array *ca = &ce_arr; 435 int i; 436 437 mutex_lock(&ce_mutex); 438 --- 9 unchanged lines hidden (view full) --- 448 seq_printf(m, "Stats:\nCEs: %llu\nofflined pages: %llu\n", 449 ca->ces_entered, ca->pfns_poisoned); 450 451 seq_printf(m, "Flags: 0x%x\n", ca->flags); 452 453 seq_printf(m, "Decay interval: %lld seconds\n", decay_interval); 454 seq_printf(m, "Decays: %lld\n", ca->decays_done); 455 | 431 432static int array_dump(struct seq_file *m, void *v) 433{ 434 struct ce_array *ca = &ce_arr; 435 int i; 436 437 mutex_lock(&ce_mutex); 438 --- 9 unchanged lines hidden (view full) --- 448 seq_printf(m, "Stats:\nCEs: %llu\nofflined pages: %llu\n", 449 ca->ces_entered, ca->pfns_poisoned); 450 451 seq_printf(m, "Flags: 0x%x\n", ca->flags); 452 453 seq_printf(m, "Decay interval: %lld seconds\n", decay_interval); 454 seq_printf(m, "Decays: %lld\n", ca->decays_done); 455 |
456 seq_printf(m, "Action threshold: %d\n", count_threshold); | 456 seq_printf(m, "Action threshold: %lld\n", action_threshold); |
457 458 mutex_unlock(&ce_mutex); 459 460 return 0; 461} 462 463static int array_open(struct inode *inode, struct file *filp) 464{ --- 32 unchanged lines hidden (view full) --- 497 498 decay = debugfs_create_file("decay_interval", S_IRUSR | S_IWUSR, d, 499 &decay_interval, &decay_interval_ops); 500 if (!decay) { 501 pr_warn("Error creating decay_interval debugfs node!\n"); 502 goto err; 503 } 504 | 457 458 mutex_unlock(&ce_mutex); 459 460 return 0; 461} 462 463static int array_open(struct inode *inode, struct file *filp) 464{ --- 32 unchanged lines hidden (view full) --- 497 498 decay = debugfs_create_file("decay_interval", S_IRUSR | S_IWUSR, d, 499 &decay_interval, &decay_interval_ops); 500 if (!decay) { 501 pr_warn("Error creating decay_interval debugfs node!\n"); 502 goto err; 503 } 504 |
505 count = debugfs_create_file("count_threshold", S_IRUSR | S_IWUSR, d, 506 &count_threshold, &count_threshold_ops); | 505 count = debugfs_create_file("action_threshold", S_IRUSR | S_IWUSR, d, 506 &action_threshold, &action_threshold_ops); |
507 if (!count) { | 507 if (!count) { |
508 pr_warn("Error creating count_threshold debugfs node!\n"); | 508 pr_warn("Error creating action_threshold debugfs node!\n"); |
509 goto err; 510 } 511 512 513 return 0; 514 515err: 516 debugfs_remove_recursive(d); --- 41 unchanged lines hidden --- | 509 goto err; 510 } 511 512 513 return 0; 514 515err: 516 debugfs_remove_recursive(d); --- 41 unchanged lines hidden --- |