xref: /openbmc/linux/drivers/edac/edac_device.c (revision 9aa2cba7)
1 
2 /*
3  * edac_device.c
4  * (C) 2007 www.douglaskthompson.com
5  *
6  * This file may be distributed under the terms of the
7  * GNU General Public License.
8  *
9  * Written by Doug Thompson <norsk5@xmission.com>
10  *
11  * edac_device API implementation
12  * 19 Jan 2007
13  */
14 
15 #include <asm/page.h>
16 #include <linux/uaccess.h>
17 #include <linux/ctype.h>
18 #include <linux/highmem.h>
19 #include <linux/init.h>
20 #include <linux/jiffies.h>
21 #include <linux/module.h>
22 #include <linux/slab.h>
23 #include <linux/smp.h>
24 #include <linux/spinlock.h>
25 #include <linux/sysctl.h>
26 #include <linux/timer.h>
27 
28 #include "edac_device.h"
29 #include "edac_module.h"
30 
/*
 * 'device_ctls_mutex' protects every manipulation of the global
 * 'edac_device_list'.
 */
34 static DEFINE_MUTEX(device_ctls_mutex);
35 static LIST_HEAD(edac_device_list);
36 
37 /* Default workqueue processing interval on this instance, in msecs */
38 #define DEFAULT_POLL_INTERVAL 1000
39 
40 #ifdef CONFIG_EDAC_DEBUG
41 static void edac_device_dump_device(struct edac_device_ctl_info *edac_dev)
42 {
43 	edac_dbg(3, "\tedac_dev = %p dev_idx=%d\n",
44 		 edac_dev, edac_dev->dev_idx);
45 	edac_dbg(4, "\tedac_dev->edac_check = %p\n", edac_dev->edac_check);
46 	edac_dbg(3, "\tdev = %p\n", edac_dev->dev);
47 	edac_dbg(3, "\tmod_name:ctl_name = %s:%s\n",
48 		 edac_dev->mod_name, edac_dev->ctl_name);
49 	edac_dbg(3, "\tpvt_info = %p\n\n", edac_dev->pvt_info);
50 }
51 #endif				/* CONFIG_EDAC_DEBUG */
52 
53 /*
54  * @off_val: zero, 1, or other based offset
55  */
56 struct edac_device_ctl_info *
57 edac_device_alloc_ctl_info(unsigned pvt_sz, char *dev_name, unsigned nr_instances,
58 			   char *blk_name, unsigned nr_blocks, unsigned off_val,
59 			   struct edac_dev_sysfs_block_attribute *attrib_spec,
60 			   unsigned nr_attrib, int device_index)
61 {
62 	struct edac_dev_sysfs_block_attribute *dev_attrib, *attrib_p, *attrib;
63 	struct edac_device_block *dev_blk, *blk_p, *blk;
64 	struct edac_device_instance *dev_inst, *inst;
65 	struct edac_device_ctl_info *dev_ctl;
66 	unsigned instance, block, attr;
67 	void *pvt;
68 	int err;
69 
70 	edac_dbg(4, "instances=%d blocks=%d\n", nr_instances, nr_blocks);
71 
72 	dev_ctl = kzalloc(sizeof(struct edac_device_ctl_info), GFP_KERNEL);
73 	if (!dev_ctl)
74 		return NULL;
75 
76 	dev_inst = kcalloc(nr_instances, sizeof(struct edac_device_instance), GFP_KERNEL);
77 	if (!dev_inst)
78 		goto free;
79 
80 	dev_ctl->instances = dev_inst;
81 
82 	dev_blk = kcalloc(nr_instances * nr_blocks, sizeof(struct edac_device_block), GFP_KERNEL);
83 	if (!dev_blk)
84 		goto free;
85 
86 	dev_ctl->blocks = dev_blk;
87 
88 	if (nr_attrib) {
89 		dev_attrib = kcalloc(nr_attrib, sizeof(struct edac_dev_sysfs_block_attribute),
90 				     GFP_KERNEL);
91 		if (!dev_attrib)
92 			goto free;
93 
94 		dev_ctl->attribs = dev_attrib;
95 	}
96 
97 	if (pvt_sz) {
98 		pvt = kzalloc(pvt_sz, GFP_KERNEL);
99 		if (!pvt)
100 			goto free;
101 
102 		dev_ctl->pvt_info = pvt;
103 	}
104 
105 	dev_ctl->dev_idx	= device_index;
106 	dev_ctl->nr_instances	= nr_instances;
107 
108 	/* Default logging of CEs and UEs */
109 	dev_ctl->log_ce = 1;
110 	dev_ctl->log_ue = 1;
111 
112 	/* Name of this edac device */
113 	snprintf(dev_ctl->name, sizeof(dev_ctl->name),"%s", dev_name);
114 
115 	/* Initialize every Instance */
116 	for (instance = 0; instance < nr_instances; instance++) {
117 		inst = &dev_inst[instance];
118 		inst->ctl = dev_ctl;
119 		inst->nr_blocks = nr_blocks;
120 		blk_p = &dev_blk[instance * nr_blocks];
121 		inst->blocks = blk_p;
122 
123 		/* name of this instance */
124 		snprintf(inst->name, sizeof(inst->name), "%s%u", dev_name, instance);
125 
126 		/* Initialize every block in each instance */
127 		for (block = 0; block < nr_blocks; block++) {
128 			blk = &blk_p[block];
129 			blk->instance = inst;
130 			snprintf(blk->name, sizeof(blk->name),
131 				 "%s%d", blk_name, block + off_val);
132 
133 			edac_dbg(4, "instance=%d inst_p=%p block=#%d block_p=%p name='%s'\n",
134 				 instance, inst, block, blk, blk->name);
135 
136 			/* if there are NO attributes OR no attribute pointer
137 			 * then continue on to next block iteration
138 			 */
139 			if ((nr_attrib == 0) || (attrib_spec == NULL))
140 				continue;
141 
142 			/* setup the attribute array for this block */
143 			blk->nr_attribs = nr_attrib;
144 			attrib_p = &dev_attrib[block*nr_instances*nr_attrib];
145 			blk->block_attributes = attrib_p;
146 
147 			edac_dbg(4, "THIS BLOCK_ATTRIB=%p\n",
148 				 blk->block_attributes);
149 
150 			/* Initialize every user specified attribute in this
151 			 * block with the data the caller passed in
152 			 * Each block gets its own copy of pointers,
153 			 * and its unique 'value'
154 			 */
155 			for (attr = 0; attr < nr_attrib; attr++) {
156 				attrib = &attrib_p[attr];
157 
158 				/* populate the unique per attrib
159 				 * with the code pointers and info
160 				 */
161 				attrib->attr = attrib_spec[attr].attr;
162 				attrib->show = attrib_spec[attr].show;
163 				attrib->store = attrib_spec[attr].store;
164 
165 				attrib->block = blk;	/* up link */
166 
167 				edac_dbg(4, "alloc-attrib=%p attrib_name='%s' attrib-spec=%p spec-name=%s\n",
168 					 attrib, attrib->attr.name,
169 					 &attrib_spec[attr],
170 					 attrib_spec[attr].attr.name
171 					);
172 			}
173 		}
174 	}
175 
176 	/* Mark this instance as merely ALLOCATED */
177 	dev_ctl->op_state = OP_ALLOC;
178 
179 	/*
180 	 * Initialize the 'root' kobj for the edac_device controller
181 	 */
182 	err = edac_device_register_sysfs_main_kobj(dev_ctl);
183 	if (err)
184 		goto free;
185 
186 	/* at this point, the root kobj is valid, and in order to
187 	 * 'free' the object, then the function:
188 	 *	edac_device_unregister_sysfs_main_kobj() must be called
189 	 * which will perform kobj unregistration and the actual free
190 	 * will occur during the kobject callback operation
191 	 */
192 
193 	return dev_ctl;
194 
195 free:
196 	__edac_device_free_ctl_info(dev_ctl);
197 
198 	return NULL;
199 }
200 EXPORT_SYMBOL_GPL(edac_device_alloc_ctl_info);
201 
/*
 * edac_device_free_ctl_info
 *	release 'ctl_info' by handing it to
 *	edac_device_unregister_sysfs_main_kobj()
 */
void
edac_device_free_ctl_info(struct edac_device_ctl_info *ctl_info)
{
	edac_device_unregister_sysfs_main_kobj(ctl_info);
}
EXPORT_SYMBOL_GPL(edac_device_free_ctl_info);
207 
208 /*
209  * find_edac_device_by_dev
210  *	scans the edac_device list for a specific 'struct device *'
211  *
212  *	lock to be held prior to call:	device_ctls_mutex
213  *
214  *	Return:
215  *		pointer to control structure managing 'dev'
216  *		NULL if not found on list
217  */
218 static struct edac_device_ctl_info *find_edac_device_by_dev(struct device *dev)
219 {
220 	struct edac_device_ctl_info *edac_dev;
221 	struct list_head *item;
222 
223 	edac_dbg(0, "\n");
224 
225 	list_for_each(item, &edac_device_list) {
226 		edac_dev = list_entry(item, struct edac_device_ctl_info, link);
227 
228 		if (edac_dev->dev == dev)
229 			return edac_dev;
230 	}
231 
232 	return NULL;
233 }
234 
235 /*
236  * add_edac_dev_to_global_list
237  *	Before calling this function, caller must
238  *	assign a unique value to edac_dev->dev_idx.
239  *
240  *	lock to be held prior to call:	device_ctls_mutex
241  *
242  *	Return:
243  *		0 on success
244  *		1 on failure.
245  */
246 static int add_edac_dev_to_global_list(struct edac_device_ctl_info *edac_dev)
247 {
248 	struct list_head *item, *insert_before;
249 	struct edac_device_ctl_info *rover;
250 
251 	insert_before = &edac_device_list;
252 
253 	/* Determine if already on the list */
254 	rover = find_edac_device_by_dev(edac_dev->dev);
255 	if (unlikely(rover != NULL))
256 		goto fail0;
257 
258 	/* Insert in ascending order by 'dev_idx', so find position */
259 	list_for_each(item, &edac_device_list) {
260 		rover = list_entry(item, struct edac_device_ctl_info, link);
261 
262 		if (rover->dev_idx >= edac_dev->dev_idx) {
263 			if (unlikely(rover->dev_idx == edac_dev->dev_idx))
264 				goto fail1;
265 
266 			insert_before = item;
267 			break;
268 		}
269 	}
270 
271 	list_add_tail_rcu(&edac_dev->link, insert_before);
272 	return 0;
273 
274 fail0:
275 	edac_printk(KERN_WARNING, EDAC_MC,
276 			"%s (%s) %s %s already assigned %d\n",
277 			dev_name(rover->dev), edac_dev_name(rover),
278 			rover->mod_name, rover->ctl_name, rover->dev_idx);
279 	return 1;
280 
281 fail1:
282 	edac_printk(KERN_WARNING, EDAC_MC,
283 			"bug in low-level driver: attempt to assign\n"
284 			"    duplicate dev_idx %d in %s()\n", rover->dev_idx,
285 			__func__);
286 	return 1;
287 }
288 
289 /*
290  * del_edac_device_from_global_list
291  */
292 static void del_edac_device_from_global_list(struct edac_device_ctl_info
293 						*edac_device)
294 {
295 	list_del_rcu(&edac_device->link);
296 
297 	/* these are for safe removal of devices from global list while
298 	 * NMI handlers may be traversing list
299 	 */
300 	synchronize_rcu();
301 	INIT_LIST_HEAD(&edac_device->link);
302 }
303 
304 /*
305  * edac_device_workq_function
306  *	performs the operation scheduled by a workq request
307  *
308  *	this workq is embedded within an edac_device_ctl_info
309  *	structure, that needs to be polled for possible error events.
310  *
311  *	This operation is to acquire the list mutex lock
312  *	(thus preventing insertation or deletion)
313  *	and then call the device's poll function IFF this device is
314  *	running polled and there is a poll function defined.
315  */
316 static void edac_device_workq_function(struct work_struct *work_req)
317 {
318 	struct delayed_work *d_work = to_delayed_work(work_req);
319 	struct edac_device_ctl_info *edac_dev = to_edac_device_ctl_work(d_work);
320 
321 	mutex_lock(&device_ctls_mutex);
322 
323 	/* If we are being removed, bail out immediately */
324 	if (edac_dev->op_state == OP_OFFLINE) {
325 		mutex_unlock(&device_ctls_mutex);
326 		return;
327 	}
328 
329 	/* Only poll controllers that are running polled and have a check */
330 	if ((edac_dev->op_state == OP_RUNNING_POLL) &&
331 		(edac_dev->edac_check != NULL)) {
332 			edac_dev->edac_check(edac_dev);
333 	}
334 
335 	mutex_unlock(&device_ctls_mutex);
336 
337 	/* Reschedule the workq for the next time period to start again
338 	 * if the number of msec is for 1 sec, then adjust to the next
339 	 * whole one second to save timers firing all over the period
340 	 * between integral seconds
341 	 */
342 	if (edac_dev->poll_msec == DEFAULT_POLL_INTERVAL)
343 		edac_queue_work(&edac_dev->work, round_jiffies_relative(edac_dev->delay));
344 	else
345 		edac_queue_work(&edac_dev->work, edac_dev->delay);
346 }
347 
348 /*
349  * edac_device_workq_setup
350  *	initialize a workq item for this edac_device instance
351  *	passing in the new delay period in msec
352  */
353 static void edac_device_workq_setup(struct edac_device_ctl_info *edac_dev,
354 				    unsigned msec)
355 {
356 	edac_dbg(0, "\n");
357 
358 	/* take the arg 'msec' and set it into the control structure
359 	 * to used in the time period calculation
360 	 * then calc the number of jiffies that represents
361 	 */
362 	edac_dev->poll_msec = msec;
363 	edac_dev->delay = msecs_to_jiffies(msec);
364 
365 	INIT_DELAYED_WORK(&edac_dev->work, edac_device_workq_function);
366 
367 	/* optimize here for the 1 second case, which will be normal value, to
368 	 * fire ON the 1 second time event. This helps reduce all sorts of
369 	 * timers firing on sub-second basis, while they are happy
370 	 * to fire together on the 1 second exactly
371 	 */
372 	if (edac_dev->poll_msec == DEFAULT_POLL_INTERVAL)
373 		edac_queue_work(&edac_dev->work, round_jiffies_relative(edac_dev->delay));
374 	else
375 		edac_queue_work(&edac_dev->work, edac_dev->delay);
376 }
377 
378 /*
379  * edac_device_workq_teardown
380  *	stop the workq processing on this edac_dev
381  */
382 static void edac_device_workq_teardown(struct edac_device_ctl_info *edac_dev)
383 {
384 	if (!edac_dev->edac_check)
385 		return;
386 
387 	edac_dev->op_state = OP_OFFLINE;
388 
389 	edac_stop_work(&edac_dev->work);
390 }
391 
392 /*
393  * edac_device_reset_delay_period
394  *
395  *	need to stop any outstanding workq queued up at this time
396  *	because we will be resetting the sleep time.
397  *	Then restart the workq on the new delay
398  */
399 void edac_device_reset_delay_period(struct edac_device_ctl_info *edac_dev,
400 				    unsigned long msec)
401 {
402 	edac_dev->poll_msec = msec;
403 	edac_dev->delay	    = msecs_to_jiffies(msec);
404 
405 	/* See comment in edac_device_workq_setup() above */
406 	if (edac_dev->poll_msec == DEFAULT_POLL_INTERVAL)
407 		edac_mod_work(&edac_dev->work, round_jiffies_relative(edac_dev->delay));
408 	else
409 		edac_mod_work(&edac_dev->work, edac_dev->delay);
410 }
411 
412 int edac_device_alloc_index(void)
413 {
414 	static atomic_t device_indexes = ATOMIC_INIT(0);
415 
416 	return atomic_inc_return(&device_indexes) - 1;
417 }
418 EXPORT_SYMBOL_GPL(edac_device_alloc_index);
419 
420 int edac_device_add_device(struct edac_device_ctl_info *edac_dev)
421 {
422 	edac_dbg(0, "\n");
423 
424 #ifdef CONFIG_EDAC_DEBUG
425 	if (edac_debug_level >= 3)
426 		edac_device_dump_device(edac_dev);
427 #endif
428 	mutex_lock(&device_ctls_mutex);
429 
430 	if (add_edac_dev_to_global_list(edac_dev))
431 		goto fail0;
432 
433 	/* set load time so that error rate can be tracked */
434 	edac_dev->start_time = jiffies;
435 
436 	/* create this instance's sysfs entries */
437 	if (edac_device_create_sysfs(edac_dev)) {
438 		edac_device_printk(edac_dev, KERN_WARNING,
439 					"failed to create sysfs device\n");
440 		goto fail1;
441 	}
442 
443 	/* If there IS a check routine, then we are running POLLED */
444 	if (edac_dev->edac_check != NULL) {
445 		/* This instance is NOW RUNNING */
446 		edac_dev->op_state = OP_RUNNING_POLL;
447 
448 		edac_device_workq_setup(edac_dev, edac_dev->poll_msec ?: DEFAULT_POLL_INTERVAL);
449 	} else {
450 		edac_dev->op_state = OP_RUNNING_INTERRUPT;
451 	}
452 
453 	/* Report action taken */
454 	edac_device_printk(edac_dev, KERN_INFO,
455 		"Giving out device to module %s controller %s: DEV %s (%s)\n",
456 		edac_dev->mod_name, edac_dev->ctl_name, edac_dev->dev_name,
457 		edac_op_state_to_string(edac_dev->op_state));
458 
459 	mutex_unlock(&device_ctls_mutex);
460 	return 0;
461 
462 fail1:
463 	/* Some error, so remove the entry from the lsit */
464 	del_edac_device_from_global_list(edac_dev);
465 
466 fail0:
467 	mutex_unlock(&device_ctls_mutex);
468 	return 1;
469 }
470 EXPORT_SYMBOL_GPL(edac_device_add_device);
471 
472 struct edac_device_ctl_info *edac_device_del_device(struct device *dev)
473 {
474 	struct edac_device_ctl_info *edac_dev;
475 
476 	edac_dbg(0, "\n");
477 
478 	mutex_lock(&device_ctls_mutex);
479 
480 	/* Find the structure on the list, if not there, then leave */
481 	edac_dev = find_edac_device_by_dev(dev);
482 	if (edac_dev == NULL) {
483 		mutex_unlock(&device_ctls_mutex);
484 		return NULL;
485 	}
486 
487 	/* mark this instance as OFFLINE */
488 	edac_dev->op_state = OP_OFFLINE;
489 
490 	/* deregister from global list */
491 	del_edac_device_from_global_list(edac_dev);
492 
493 	mutex_unlock(&device_ctls_mutex);
494 
495 	/* clear workq processing on this instance */
496 	edac_device_workq_teardown(edac_dev);
497 
498 	/* Tear down the sysfs entries for this instance */
499 	edac_device_remove_sysfs(edac_dev);
500 
501 	edac_printk(KERN_INFO, EDAC_MC,
502 		"Removed device %d for %s %s: DEV %s\n",
503 		edac_dev->dev_idx,
504 		edac_dev->mod_name, edac_dev->ctl_name, edac_dev_name(edac_dev));
505 
506 	return edac_dev;
507 }
508 EXPORT_SYMBOL_GPL(edac_device_del_device);
509 
510 static inline int edac_device_get_log_ce(struct edac_device_ctl_info *edac_dev)
511 {
512 	return edac_dev->log_ce;
513 }
514 
515 static inline int edac_device_get_log_ue(struct edac_device_ctl_info *edac_dev)
516 {
517 	return edac_dev->log_ue;
518 }
519 
520 static inline int edac_device_get_panic_on_ue(struct edac_device_ctl_info
521 					*edac_dev)
522 {
523 	return edac_dev->panic_on_ue;
524 }
525 
526 void edac_device_handle_ce_count(struct edac_device_ctl_info *edac_dev,
527 				 unsigned int count, int inst_nr, int block_nr,
528 				 const char *msg)
529 {
530 	struct edac_device_instance *instance;
531 	struct edac_device_block *block = NULL;
532 
533 	if (!count)
534 		return;
535 
536 	if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) {
537 		edac_device_printk(edac_dev, KERN_ERR,
538 				"INTERNAL ERROR: 'instance' out of range "
539 				"(%d >= %d)\n", inst_nr,
540 				edac_dev->nr_instances);
541 		return;
542 	}
543 
544 	instance = edac_dev->instances + inst_nr;
545 
546 	if ((block_nr >= instance->nr_blocks) || (block_nr < 0)) {
547 		edac_device_printk(edac_dev, KERN_ERR,
548 				"INTERNAL ERROR: instance %d 'block' "
549 				"out of range (%d >= %d)\n",
550 				inst_nr, block_nr,
551 				instance->nr_blocks);
552 		return;
553 	}
554 
555 	if (instance->nr_blocks > 0) {
556 		block = instance->blocks + block_nr;
557 		block->counters.ce_count += count;
558 	}
559 
560 	/* Propagate the count up the 'totals' tree */
561 	instance->counters.ce_count += count;
562 	edac_dev->counters.ce_count += count;
563 
564 	if (edac_device_get_log_ce(edac_dev))
565 		edac_device_printk(edac_dev, KERN_WARNING,
566 				   "CE: %s instance: %s block: %s count: %d '%s'\n",
567 				   edac_dev->ctl_name, instance->name,
568 				   block ? block->name : "N/A", count, msg);
569 }
570 EXPORT_SYMBOL_GPL(edac_device_handle_ce_count);
571 
572 void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
573 				 unsigned int count, int inst_nr, int block_nr,
574 				 const char *msg)
575 {
576 	struct edac_device_instance *instance;
577 	struct edac_device_block *block = NULL;
578 
579 	if (!count)
580 		return;
581 
582 	if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) {
583 		edac_device_printk(edac_dev, KERN_ERR,
584 				"INTERNAL ERROR: 'instance' out of range "
585 				"(%d >= %d)\n", inst_nr,
586 				edac_dev->nr_instances);
587 		return;
588 	}
589 
590 	instance = edac_dev->instances + inst_nr;
591 
592 	if ((block_nr >= instance->nr_blocks) || (block_nr < 0)) {
593 		edac_device_printk(edac_dev, KERN_ERR,
594 				"INTERNAL ERROR: instance %d 'block' "
595 				"out of range (%d >= %d)\n",
596 				inst_nr, block_nr,
597 				instance->nr_blocks);
598 		return;
599 	}
600 
601 	if (instance->nr_blocks > 0) {
602 		block = instance->blocks + block_nr;
603 		block->counters.ue_count += count;
604 	}
605 
606 	/* Propagate the count up the 'totals' tree */
607 	instance->counters.ue_count += count;
608 	edac_dev->counters.ue_count += count;
609 
610 	if (edac_device_get_log_ue(edac_dev))
611 		edac_device_printk(edac_dev, KERN_EMERG,
612 				   "UE: %s instance: %s block: %s count: %d '%s'\n",
613 				   edac_dev->ctl_name, instance->name,
614 				   block ? block->name : "N/A", count, msg);
615 
616 	if (edac_device_get_panic_on_ue(edac_dev))
617 		panic("EDAC %s: UE instance: %s block %s count: %d '%s'\n",
618 		      edac_dev->ctl_name, instance->name,
619 		      block ? block->name : "N/A", count, msg);
620 }
621 EXPORT_SYMBOL_GPL(edac_device_handle_ue_count);
622