xref: /openbmc/linux/drivers/edac/edac_mc.c (revision d0b73b48)
1 /*
2  * edac_mc kernel module
3  * (C) 2005, 2006 Linux Networx (http://lnxi.com)
4  * This file may be distributed under the terms of the
5  * GNU General Public License.
6  *
7  * Written by Thayne Harbaugh
8  * Based on work by Dan Hollis <goemon at anime dot net> and others.
9  *	http://www.anime.net/~goemon/linux-ecc/
10  *
11  * Modified by Dave Peterson and Doug Thompson
12  *
13  */
14 
15 #include <linux/module.h>
16 #include <linux/proc_fs.h>
17 #include <linux/kernel.h>
18 #include <linux/types.h>
19 #include <linux/smp.h>
20 #include <linux/init.h>
21 #include <linux/sysctl.h>
22 #include <linux/highmem.h>
23 #include <linux/timer.h>
24 #include <linux/slab.h>
25 #include <linux/jiffies.h>
26 #include <linux/spinlock.h>
27 #include <linux/list.h>
28 #include <linux/ctype.h>
29 #include <linux/edac.h>
30 #include <linux/bitops.h>
31 #include <asm/uaccess.h>
32 #include <asm/page.h>
33 #include <asm/edac.h>
34 #include "edac_core.h"
35 #include "edac_module.h"
36 
37 #define CREATE_TRACE_POINTS
38 #define TRACE_INCLUDE_PATH ../../include/ras
39 #include <ras/ras_event.h>
40 
41 /* lock to memory controller's control array */
42 static DEFINE_MUTEX(mem_ctls_mutex);
43 static LIST_HEAD(mc_devices);
44 
45 unsigned edac_dimm_info_location(struct dimm_info *dimm, char *buf,
46 			         unsigned len)
47 {
48 	struct mem_ctl_info *mci = dimm->mci;
49 	int i, n, count = 0;
50 	char *p = buf;
51 
52 	for (i = 0; i < mci->n_layers; i++) {
53 		n = snprintf(p, len, "%s %d ",
54 			      edac_layer_name[mci->layers[i].type],
55 			      dimm->location[i]);
56 		p += n;
57 		len -= n;
58 		count += n;
59 		if (!len)
60 			break;
61 	}
62 
63 	return count;
64 }
65 
66 #ifdef CONFIG_EDAC_DEBUG
67 
68 static void edac_mc_dump_channel(struct rank_info *chan)
69 {
70 	edac_dbg(4, "  channel->chan_idx = %d\n", chan->chan_idx);
71 	edac_dbg(4, "    channel = %p\n", chan);
72 	edac_dbg(4, "    channel->csrow = %p\n", chan->csrow);
73 	edac_dbg(4, "    channel->dimm = %p\n", chan->dimm);
74 }
75 
76 static void edac_mc_dump_dimm(struct dimm_info *dimm, int number)
77 {
78 	char location[80];
79 
80 	edac_dimm_info_location(dimm, location, sizeof(location));
81 
82 	edac_dbg(4, "%s%i: %smapped as virtual row %d, chan %d\n",
83 		 dimm->mci->mem_is_per_rank ? "rank" : "dimm",
84 		 number, location, dimm->csrow, dimm->cschannel);
85 	edac_dbg(4, "  dimm = %p\n", dimm);
86 	edac_dbg(4, "  dimm->label = '%s'\n", dimm->label);
87 	edac_dbg(4, "  dimm->nr_pages = 0x%x\n", dimm->nr_pages);
88 	edac_dbg(4, "  dimm->grain = %d\n", dimm->grain);
90 }
91 
92 static void edac_mc_dump_csrow(struct csrow_info *csrow)
93 {
94 	edac_dbg(4, "csrow->csrow_idx = %d\n", csrow->csrow_idx);
95 	edac_dbg(4, "  csrow = %p\n", csrow);
96 	edac_dbg(4, "  csrow->first_page = 0x%lx\n", csrow->first_page);
97 	edac_dbg(4, "  csrow->last_page = 0x%lx\n", csrow->last_page);
98 	edac_dbg(4, "  csrow->page_mask = 0x%lx\n", csrow->page_mask);
99 	edac_dbg(4, "  csrow->nr_channels = %d\n", csrow->nr_channels);
100 	edac_dbg(4, "  csrow->channels = %p\n", csrow->channels);
101 	edac_dbg(4, "  csrow->mci = %p\n", csrow->mci);
102 }
103 
104 static void edac_mc_dump_mci(struct mem_ctl_info *mci)
105 {
106 	edac_dbg(3, "\tmci = %p\n", mci);
107 	edac_dbg(3, "\tmci->mtype_cap = %lx\n", mci->mtype_cap);
108 	edac_dbg(3, "\tmci->edac_ctl_cap = %lx\n", mci->edac_ctl_cap);
109 	edac_dbg(3, "\tmci->edac_cap = %lx\n", mci->edac_cap);
110 	edac_dbg(4, "\tmci->edac_check = %p\n", mci->edac_check);
111 	edac_dbg(3, "\tmci->nr_csrows = %d, csrows = %p\n",
112 		 mci->nr_csrows, mci->csrows);
113 	edac_dbg(3, "\tmci->nr_dimms = %d, dimms = %p\n",
114 		 mci->tot_dimms, mci->dimms);
115 	edac_dbg(3, "\tdev = %p\n", mci->pdev);
116 	edac_dbg(3, "\tmod_name:ctl_name = %s:%s\n",
117 		 mci->mod_name, mci->ctl_name);
118 	edac_dbg(3, "\tpvt_info = %p\n\n", mci->pvt_info);
119 }
120 
121 #endif				/* CONFIG_EDAC_DEBUG */
122 
123 /*
124  * keep those in sync with the enum mem_type
125  */
126 const char *edac_mem_types[] = {
127 	"Empty csrow",
128 	"Reserved csrow type",
129 	"Unknown csrow type",
130 	"Fast page mode RAM",
131 	"Extended data out RAM",
132 	"Burst Extended data out RAM",
133 	"Single data rate SDRAM",
134 	"Registered single data rate SDRAM",
135 	"Double data rate SDRAM",
136 	"Registered Double data rate SDRAM",
137 	"Rambus DRAM",
138 	"Unbuffered DDR2 RAM",
139 	"Fully buffered DDR2",
140 	"Registered DDR2 RAM",
141 	"Rambus XDR",
142 	"Unbuffered DDR3 RAM",
143 	"Registered DDR3 RAM",
144 };
145 EXPORT_SYMBOL_GPL(edac_mem_types);
146 
147 /**
148  * edac_align_ptr - Prepares the pointer offsets for a single-shot allocation
149  * @p:		pointer to a pointer with the memory offset to be used. At
150  *		return, this will be incremented to point to the next offset
151  * @size:	Size of the data structure to be reserved
152  * @n_elems:	Number of elements that should be reserved
153  *
154  * If 'size' is a constant, the compiler will optimize this whole function
155  * down to either a no-op or the addition of a constant to the value of '*p'.
156  *
157  * The 'p' pointer is needed to keep advancing further in memory to the
158  * proper offsets when allocating the struct together with its embedded
159  * structs, as edac_device_alloc_ctl_info() does, for example.
161  *
162  * At return, the pointer 'p' will be incremented to be used on a next call
163  * to this function.
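 *
 * For illustration, here is a condensed sketch of the pattern used by
 * edac_mc_alloc() below ('struct hdr' and 'n_counters' are illustrative
 * names, not part of this API):
 *
 *	void *ptr = NULL, *base;
 *	struct hdr *hdr = edac_align_ptr(&ptr, sizeof(*hdr), 1);
 *	u32 *counters = edac_align_ptr(&ptr, sizeof(u32), n_counters);
 *	unsigned long size = (unsigned long)ptr;
 *
 *	base = kzalloc(size, GFP_KERNEL);
 *	if (base) {
 *		hdr = (struct hdr *)((char *)base + (unsigned long)hdr);
 *		counters = (u32 *)((char *)base + (unsigned long)counters);
 *	}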
164  */
165 void *edac_align_ptr(void **p, unsigned size, int n_elems)
166 {
167 	unsigned align, r;
168 	void *ptr = *p;
169 
170 	*p += size * n_elems;
171 
172 	/*
173 	 * 'p' can possibly be an unaligned item X such that sizeof(X) is
174 	 * 'size'.  Adjust 'p' so that its alignment is at least as
175 	 * stringent as what the compiler would provide for X and return
176 	 * the aligned result.
177 	 * Here we assume that the alignment of a "long long" is the most
178 	 * stringent alignment that the compiler will ever provide by default.
179 	 * As far as I know, this is a reasonable assumption.
180 	 */
181 	if (size > sizeof(long))
182 		align = sizeof(long long);
183 	else if (size > sizeof(int))
184 		align = sizeof(long);
185 	else if (size > sizeof(short))
186 		align = sizeof(int);
187 	else if (size > sizeof(char))
188 		align = sizeof(short);
189 	else
190 		return (char *)ptr;
191 
192 	r = (unsigned long)ptr % align;
193 
194 	if (r == 0)
195 		return (char *)ptr;
196 
197 	*p += align - r;
198 
199 	return (void *)(((unsigned long)ptr) + align - r);
200 }
201 
202 static void _edac_mc_free(struct mem_ctl_info *mci)
203 {
204 	int i, chn, row;
205 	struct csrow_info *csr;
206 	const unsigned int tot_dimms = mci->tot_dimms;
207 	const unsigned int tot_channels = mci->num_cschannel;
208 	const unsigned int tot_csrows = mci->nr_csrows;
209 
210 	if (mci->dimms) {
211 		for (i = 0; i < tot_dimms; i++)
212 			kfree(mci->dimms[i]);
213 		kfree(mci->dimms);
214 	}
215 	if (mci->csrows) {
216 		for (row = 0; row < tot_csrows; row++) {
217 			csr = mci->csrows[row];
218 			if (csr) {
219 				if (csr->channels) {
220 					for (chn = 0; chn < tot_channels; chn++)
221 						kfree(csr->channels[chn]);
222 					kfree(csr->channels);
223 				}
224 				kfree(csr);
225 			}
226 		}
227 		kfree(mci->csrows);
228 	}
229 	kfree(mci);
230 }
231 
232 /**
233  * edac_mc_alloc: Allocate and partially fill a struct mem_ctl_info structure
234  * @mc_num:		Memory controller number
235  * @n_layers:		Number of MC hierarchy layers
236  * @layers:		Describes each layer as seen by the Memory Controller
237  * @sz_pvt:		size of private storage needed
238  *
240  * Everything is kmalloc'ed as one big chunk - more efficient.
241  * Only can be used if all structures have the same lifetime - otherwise
242  * you have to allocate and initialize your own structures.
243  *
244  * Use edac_mc_free() to free mc structures allocated by this function.
245  *
246  * NOTE: drivers handle multi-rank memories in different ways: in some
247  * drivers, one multi-rank memory stick is mapped as one entry, while, in
248  * others, a single multi-rank memory stick would be mapped into several
249  * entries. Currently, this function allocates multiple struct dimm_info
250  * in such scenarios, as grouping the ranks would require driver changes.
251  *
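 * For illustration only, a typical two-layer setup might look like this
 * (the geometry, 'nr_csrows', 'nr_channels' and 'struct pvt_data' are
 * made-up names, not taken from any particular driver):
 *
 *	struct edac_mc_layer layers[2];
 *
 *	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
 *	layers[0].size = nr_csrows;
 *	layers[0].is_virt_csrow = true;
 *	layers[1].type = EDAC_MC_LAYER_CHANNEL;
 *	layers[1].size = nr_channels;
 *	layers[1].is_virt_csrow = false;
 *
 *	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers,
 *			    sizeof(struct pvt_data));
 *	if (!mci)
 *		return -ENOMEM;
 *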
252  * Returns:
253  *	On failure: NULL
254  *	On success: struct mem_ctl_info pointer
255  */
256 struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
257 				   unsigned n_layers,
258 				   struct edac_mc_layer *layers,
259 				   unsigned sz_pvt)
260 {
261 	struct mem_ctl_info *mci;
262 	struct edac_mc_layer *layer;
263 	struct csrow_info *csr;
264 	struct rank_info *chan;
265 	struct dimm_info *dimm;
266 	u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
267 	unsigned pos[EDAC_MAX_LAYERS];
268 	unsigned size, tot_dimms = 1, count = 1;
269 	unsigned tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
270 	void *pvt, *p, *ptr = NULL;
271 	int i, j, row, chn, n, len, off;
272 	bool per_rank = false;
273 
274 	BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0);
275 	/*
276 	 * Calculate the total amount of dimms and csrows/cschannels while
277 	 * in the old API emulation mode
278 	 */
279 	for (i = 0; i < n_layers; i++) {
280 		tot_dimms *= layers[i].size;
281 		if (layers[i].is_virt_csrow)
282 			tot_csrows *= layers[i].size;
283 		else
284 			tot_channels *= layers[i].size;
285 
286 		if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
287 			per_rank = true;
288 	}
289 
290 	/* Figure out the offsets of the various items from the start of an mc
291 	 * structure.  We want the alignment of each item to be at least as
292 	 * stringent as what the compiler would provide if we could simply
293 	 * hardcode everything into a single struct.
294 	 */
295 	mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
296 	layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
297 	for (i = 0; i < n_layers; i++) {
298 		count *= layers[i].size;
299 		edac_dbg(4, "errcount layer %d size %d\n", i, count);
300 		ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
301 		ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
302 		tot_errcount += 2 * count;
303 	}
304 
305 	edac_dbg(4, "allocating %d error counters\n", tot_errcount);
306 	pvt = edac_align_ptr(&ptr, sz_pvt, 1);
307 	size = ((unsigned long)pvt) + sz_pvt;
308 
309 	edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
310 		 size,
311 		 tot_dimms,
312 		 per_rank ? "ranks" : "dimms",
313 		 tot_csrows * tot_channels);
314 
315 	mci = kzalloc(size, GFP_KERNEL);
316 	if (mci == NULL)
317 		return NULL;
318 
319 	/* Adjust pointers so they point within the memory we just allocated
320 	 * rather than an imaginary chunk of memory located at address 0.
321 	 */
322 	layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
323 	for (i = 0; i < n_layers; i++) {
324 		mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i]));
325 		mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i]));
326 	}
327 	pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
328 
329 	/* setup index and various internal pointers */
330 	mci->mc_idx = mc_num;
331 	mci->tot_dimms = tot_dimms;
332 	mci->pvt_info = pvt;
333 	mci->n_layers = n_layers;
334 	mci->layers = layer;
335 	memcpy(mci->layers, layers, sizeof(*layer) * n_layers);
336 	mci->nr_csrows = tot_csrows;
337 	mci->num_cschannel = tot_channels;
338 	mci->mem_is_per_rank = per_rank;
339 
340 	/*
341 	 * Allocate and fill the csrow/channels structs
342 	 */
343 	mci->csrows = kcalloc(tot_csrows, sizeof(*mci->csrows), GFP_KERNEL);
344 	if (!mci->csrows)
345 		goto error;
346 	for (row = 0; row < tot_csrows; row++) {
347 		csr = kzalloc(sizeof(**mci->csrows), GFP_KERNEL);
348 		if (!csr)
349 			goto error;
350 		mci->csrows[row] = csr;
351 		csr->csrow_idx = row;
352 		csr->mci = mci;
353 		csr->nr_channels = tot_channels;
354 		csr->channels = kcalloc(tot_channels, sizeof(*csr->channels),
355 					GFP_KERNEL);
356 		if (!csr->channels)
357 			goto error;
358 
359 		for (chn = 0; chn < tot_channels; chn++) {
360 			chan = kzalloc(sizeof(**csr->channels), GFP_KERNEL);
361 			if (!chan)
362 				goto error;
363 			csr->channels[chn] = chan;
364 			chan->chan_idx = chn;
365 			chan->csrow = csr;
366 		}
367 	}
368 
369 	/*
370 	 * Allocate and fill the dimm structs
371 	 */
372 	mci->dimms  = kcalloc(tot_dimms, sizeof(*mci->dimms), GFP_KERNEL);
373 	if (!mci->dimms)
374 		goto error;
375 
376 	memset(&pos, 0, sizeof(pos));
377 	row = 0;
378 	chn = 0;
379 	for (i = 0; i < tot_dimms; i++) {
380 		chan = mci->csrows[row]->channels[chn];
381 		off = EDAC_DIMM_OFF(layer, n_layers, pos[0], pos[1], pos[2]);
382 		if (off < 0 || off >= tot_dimms) {
383 			edac_mc_printk(mci, KERN_ERR, "EDAC core bug: EDAC_DIMM_OFF is trying to do an illegal data access\n");
384 			goto error;
385 		}
386 
387 		dimm = kzalloc(sizeof(**mci->dimms), GFP_KERNEL);
388 		if (!dimm)
389 			goto error;
390 		mci->dimms[off] = dimm;
391 		dimm->mci = mci;
392 
393 		/*
394 		 * Copy DIMM location and initialize it.
395 		 */
396 		len = sizeof(dimm->label);
397 		p = dimm->label;
398 		n = snprintf(p, len, "mc#%u", mc_num);
399 		p += n;
400 		len -= n;
401 		for (j = 0; j < n_layers; j++) {
402 			n = snprintf(p, len, "%s#%u",
403 				     edac_layer_name[layers[j].type],
404 				     pos[j]);
405 			p += n;
406 			len -= n;
407 			dimm->location[j] = pos[j];
408 
409 			if (len <= 0)
410 				break;
411 		}
412 
413 		/* Link it to the csrows old API data */
414 		chan->dimm = dimm;
415 		dimm->csrow = row;
416 		dimm->cschannel = chn;
417 
418 		/* Increment csrow location */
419 		if (layers[0].is_virt_csrow) {
420 			chn++;
421 			if (chn == tot_channels) {
422 				chn = 0;
423 				row++;
424 			}
425 		} else {
426 			row++;
427 			if (row == tot_csrows) {
428 				row = 0;
429 				chn++;
430 			}
431 		}
432 
433 		/* Increment dimm location */
434 		for (j = n_layers - 1; j >= 0; j--) {
435 			pos[j]++;
436 			if (pos[j] < layers[j].size)
437 				break;
438 			pos[j] = 0;
439 		}
440 	}
441 
442 	mci->op_state = OP_ALLOC;
443 
444 	/* at this point the mci is fully allocated but not yet registered
445 	 * with sysfs; until edac_mc_add_mc() registers it, edac_mc_free()
446 	 * (or the error path above) releases it via _edac_mc_free()
447 	 */
450 
451 	return mci;
452 
453 error:
454 	_edac_mc_free(mci);
455 
456 	return NULL;
457 }
458 EXPORT_SYMBOL_GPL(edac_mc_alloc);
459 
460 /**
461  * edac_mc_free
462  *	'Free' a previously allocated 'mci' structure
463  * @mci: pointer to a struct mem_ctl_info structure
464  */
465 void edac_mc_free(struct mem_ctl_info *mci)
466 {
467 	edac_dbg(1, "\n");
468 
469 	/* If we're not yet registered with sysfs, free only what was allocated
470 	 * in edac_mc_alloc().
471 	 */
472 	if (!device_is_registered(&mci->dev)) {
473 		_edac_mc_free(mci);
474 		return;
475 	}
476 
477 	/* the mci instance is freed here, when the sysfs object is dropped */
478 	edac_unregister_sysfs(mci);
479 }
480 EXPORT_SYMBOL_GPL(edac_mc_free);
481 
482 
483 /**
484  * find_mci_by_dev
485  *
486  *	scan list of controllers looking for the one that manages
487  *	the 'dev' device
488  * @dev: pointer to a struct device related with the MCI
489  */
490 struct mem_ctl_info *find_mci_by_dev(struct device *dev)
491 {
492 	struct mem_ctl_info *mci;
493 	struct list_head *item;
494 
495 	edac_dbg(3, "\n");
496 
497 	list_for_each(item, &mc_devices) {
498 		mci = list_entry(item, struct mem_ctl_info, link);
499 
500 		if (mci->pdev == dev)
501 			return mci;
502 	}
503 
504 	return NULL;
505 }
506 EXPORT_SYMBOL_GPL(find_mci_by_dev);
507 
508 /*
509  * handler for EDAC to check if NMI type handler has asserted interrupt
510  */
511 static int edac_mc_assert_error_check_and_clear(void)
512 {
513 	int old_state;
514 
515 	if (edac_op_state == EDAC_OPSTATE_POLL)
516 		return 1;
517 
518 	old_state = edac_err_assert;
519 	edac_err_assert = 0;
520 
521 	return old_state;
522 }
523 
524 /*
525  * edac_mc_workq_function
526  *	performs the operation scheduled by a workq request
527  */
528 static void edac_mc_workq_function(struct work_struct *work_req)
529 {
530 	struct delayed_work *d_work = to_delayed_work(work_req);
531 	struct mem_ctl_info *mci = to_edac_mem_ctl_work(d_work);
532 
533 	mutex_lock(&mem_ctls_mutex);
534 
535 	/* if this control struct has moved to offline state, we are done */
536 	if (mci->op_state == OP_OFFLINE) {
537 		mutex_unlock(&mem_ctls_mutex);
538 		return;
539 	}
540 
541 	/* Only poll controllers that are running polled and have a check */
542 	if (edac_mc_assert_error_check_and_clear() && (mci->edac_check != NULL))
543 		mci->edac_check(mci);
544 
545 	mutex_unlock(&mem_ctls_mutex);
546 
547 	/* Reschedule */
548 	queue_delayed_work(edac_workqueue, &mci->work,
549 			msecs_to_jiffies(edac_mc_get_poll_msec()));
550 }
551 
552 /*
553  * edac_mc_workq_setup
554  *	initialize a workq item for this mci
555  *	passing in the new delay period in msec
556  *
557  *	locking model:
558  *
559  *		called with the mem_ctls_mutex held
560  */
561 static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec)
562 {
563 	edac_dbg(0, "\n");
564 
565 	/* if this instance is not in the POLL state, then simply return */
566 	if (mci->op_state != OP_RUNNING_POLL)
567 		return;
568 
569 	INIT_DELAYED_WORK(&mci->work, edac_mc_workq_function);
570 	mod_delayed_work(edac_workqueue, &mci->work, msecs_to_jiffies(msec));
571 }
572 
573 /*
574  * edac_mc_workq_teardown
575  *	stop the workq processing on this mci
576  *
577  *	locking model:
578  *
579  *		called WITHOUT lock held
580  */
581 static void edac_mc_workq_teardown(struct mem_ctl_info *mci)
582 {
583 	int status;
584 
585 	if (mci->op_state != OP_RUNNING_POLL)
586 		return;
587 
588 	status = cancel_delayed_work(&mci->work);
589 	if (status == 0) {
590 		edac_dbg(0, "not canceled, flush the queue\n");
591 
592 		/* workq instance might be running, wait for it */
593 		flush_workqueue(edac_workqueue);
594 	}
595 }
596 
597 /*
598  * edac_mc_reset_delay_period(int value)
599  *
600  *	user space has updated our poll period value, need to
601  *	reset our workq delays
602  */
603 void edac_mc_reset_delay_period(int value)
604 {
605 	struct mem_ctl_info *mci;
606 	struct list_head *item;
607 
608 	mutex_lock(&mem_ctls_mutex);
609 
610 	list_for_each(item, &mc_devices) {
611 		mci = list_entry(item, struct mem_ctl_info, link);
612 
613 		edac_mc_workq_setup(mci, (unsigned long) value);
614 	}
615 
616 	mutex_unlock(&mem_ctls_mutex);
617 }
618 
619 
620 
621 /* Return 0 on success, 1 on failure.
622  * Before calling this function, caller must
623  * assign a unique value to mci->mc_idx.
624  *
625  *	locking model:
626  *
627  *		called with the mem_ctls_mutex lock held
628  */
629 static int add_mc_to_global_list(struct mem_ctl_info *mci)
630 {
631 	struct list_head *item, *insert_before;
632 	struct mem_ctl_info *p;
633 
634 	insert_before = &mc_devices;
635 
636 	p = find_mci_by_dev(mci->pdev);
637 	if (unlikely(p != NULL))
638 		goto fail0;
639 
640 	list_for_each(item, &mc_devices) {
641 		p = list_entry(item, struct mem_ctl_info, link);
642 
643 		if (p->mc_idx >= mci->mc_idx) {
644 			if (unlikely(p->mc_idx == mci->mc_idx))
645 				goto fail1;
646 
647 			insert_before = item;
648 			break;
649 		}
650 	}
651 
652 	list_add_tail_rcu(&mci->link, insert_before);
653 	atomic_inc(&edac_handlers);
654 	return 0;
655 
656 fail0:
657 	edac_printk(KERN_WARNING, EDAC_MC,
658 		"%s (%s) %s %s already assigned %d\n", dev_name(p->pdev),
659 		edac_dev_name(mci), p->mod_name, p->ctl_name, p->mc_idx);
660 	return 1;
661 
662 fail1:
663 	edac_printk(KERN_WARNING, EDAC_MC,
664 		"bug in low-level driver: attempt to assign\n"
665 		"    duplicate mc_idx %d in %s()\n", p->mc_idx, __func__);
666 	return 1;
667 }
668 
669 static void del_mc_from_global_list(struct mem_ctl_info *mci)
670 {
671 	atomic_dec(&edac_handlers);
672 	list_del_rcu(&mci->link);
673 
674 	/* these are for safe removal of devices from global list while
675 	 * NMI handlers may be traversing list
676 	 */
677 	synchronize_rcu();
678 	INIT_LIST_HEAD(&mci->link);
679 }
680 
681 /**
682  * edac_mc_find: Search for a mem_ctl_info structure whose index is 'idx'.
683  *
684  * If found, return a pointer to the structure.
685  * Else return NULL.
686  *
687  * Caller must hold mem_ctls_mutex.
688  */
689 struct mem_ctl_info *edac_mc_find(int idx)
690 {
691 	struct list_head *item;
692 	struct mem_ctl_info *mci;
693 
694 	list_for_each(item, &mc_devices) {
695 		mci = list_entry(item, struct mem_ctl_info, link);
696 
697 		if (mci->mc_idx >= idx) {
698 			if (mci->mc_idx == idx)
699 				return mci;
700 
701 			break;
702 		}
703 	}
704 
705 	return NULL;
706 }
707 EXPORT_SYMBOL(edac_mc_find);
708 
709 /**
710  * edac_mc_add_mc: Insert the 'mci' structure into the mci global list and
711  *                 create sysfs entries associated with mci structure
712  * @mci: pointer to the mci structure to be added to the list
713  *
714  * Return:
715  *	0	Success
716  *	!0	Failure
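 *
 * This is typically called at the end of a driver's probe routine, once
 * the mci fields are filled in; setting ->edac_check selects polled mode,
 * leaving it NULL selects interrupt mode. A rough sketch ('example_edac'
 * and 'example_check' are placeholder names):
 *
 *	mci->pdev = &pdev->dev;
 *	mci->mod_name = "example_edac";
 *	mci->ctl_name = "example_ctl";
 *	mci->edac_check = example_check;
 *
 *	if (edac_mc_add_mc(mci)) {
 *		edac_mc_free(mci);
 *		return -ENODEV;
 *	}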
717  */
718 
719 /* FIXME - should a warning be printed if no error detection? correction? */
720 int edac_mc_add_mc(struct mem_ctl_info *mci)
721 {
722 	edac_dbg(0, "\n");
723 
724 #ifdef CONFIG_EDAC_DEBUG
725 	if (edac_debug_level >= 3)
726 		edac_mc_dump_mci(mci);
727 
728 	if (edac_debug_level >= 4) {
729 		int i;
730 
731 		for (i = 0; i < mci->nr_csrows; i++) {
732 			struct csrow_info *csrow = mci->csrows[i];
733 			u32 nr_pages = 0;
734 			int j;
735 
736 			for (j = 0; j < csrow->nr_channels; j++)
737 				nr_pages += csrow->channels[j]->dimm->nr_pages;
738 			if (!nr_pages)
739 				continue;
740 			edac_mc_dump_csrow(csrow);
741 			for (j = 0; j < csrow->nr_channels; j++)
742 				if (csrow->channels[j]->dimm->nr_pages)
743 					edac_mc_dump_channel(csrow->channels[j]);
744 		}
745 		for (i = 0; i < mci->tot_dimms; i++)
746 			if (mci->dimms[i]->nr_pages)
747 				edac_mc_dump_dimm(mci->dimms[i], i);
748 	}
749 #endif
750 	mutex_lock(&mem_ctls_mutex);
751 
752 	if (add_mc_to_global_list(mci))
753 		goto fail0;
754 
755 	/* set load time so that error rate can be tracked */
756 	mci->start_time = jiffies;
757 
758 	if (edac_create_sysfs_mci_device(mci)) {
759 		edac_mc_printk(mci, KERN_WARNING,
760 			"failed to create sysfs device\n");
761 		goto fail1;
762 	}
763 
764 	/* If there IS a check routine, then we are running POLLED */
765 	if (mci->edac_check != NULL) {
766 		/* This instance is NOW RUNNING */
767 		mci->op_state = OP_RUNNING_POLL;
768 
769 		edac_mc_workq_setup(mci, edac_mc_get_poll_msec());
770 	} else {
771 		mci->op_state = OP_RUNNING_INTERRUPT;
772 	}
773 
774 	/* Report action taken */
775 	edac_mc_printk(mci, KERN_INFO, "Giving out device to '%s' '%s':"
776 		" DEV %s\n", mci->mod_name, mci->ctl_name, edac_dev_name(mci));
777 
778 	mutex_unlock(&mem_ctls_mutex);
779 	return 0;
780 
781 fail1:
782 	del_mc_from_global_list(mci);
783 
784 fail0:
785 	mutex_unlock(&mem_ctls_mutex);
786 	return 1;
787 }
788 EXPORT_SYMBOL_GPL(edac_mc_add_mc);
789 
790 /**
791  * edac_mc_del_mc: Remove sysfs entries for specified mci structure and
792  *                 remove mci structure from global list
793  * @dev: Pointer to 'struct device' representing the mci structure to remove.
794  *
795  * Return pointer to removed mci structure, or NULL if device not found.
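 *
 * A driver's remove path typically pairs this with edac_mc_free(), e.g.
 * (sketch only):
 *
 *	mci = edac_mc_del_mc(&pdev->dev);
 *	if (mci)
 *		edac_mc_free(mci);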
796  */
797 struct mem_ctl_info *edac_mc_del_mc(struct device *dev)
798 {
799 	struct mem_ctl_info *mci;
800 
801 	edac_dbg(0, "\n");
802 
803 	mutex_lock(&mem_ctls_mutex);
804 
805 	/* find the requested mci struct in the global list */
806 	mci = find_mci_by_dev(dev);
807 	if (mci == NULL) {
808 		mutex_unlock(&mem_ctls_mutex);
809 		return NULL;
810 	}
811 
812 	del_mc_from_global_list(mci);
813 	mutex_unlock(&mem_ctls_mutex);
814 
815 	/* flush workq processes */
816 	edac_mc_workq_teardown(mci);
817 
818 	/* marking MCI offline */
819 	mci->op_state = OP_OFFLINE;
820 
821 	/* remove from sysfs */
822 	edac_remove_sysfs_mci_device(mci);
823 
824 	edac_printk(KERN_INFO, EDAC_MC,
825 		"Removed device %d for %s %s: DEV %s\n", mci->mc_idx,
826 		mci->mod_name, mci->ctl_name, edac_dev_name(mci));
827 
828 	return mci;
829 }
830 EXPORT_SYMBOL_GPL(edac_mc_del_mc);
831 
832 static void edac_mc_scrub_block(unsigned long page, unsigned long offset,
833 				u32 size)
834 {
835 	struct page *pg;
836 	void *virt_addr;
837 	unsigned long flags = 0;
838 
839 	edac_dbg(3, "\n");
840 
841 	/* ECC error page was not in our memory. Ignore it. */
842 	if (!pfn_valid(page))
843 		return;
844 
845 	/* Find the actual page structure then map it and fix */
846 	pg = pfn_to_page(page);
847 
848 	if (PageHighMem(pg))
849 		local_irq_save(flags);
850 
851 	virt_addr = kmap_atomic(pg);
852 
853 	/* Perform architecture specific atomic scrub operation */
854 	atomic_scrub(virt_addr + offset, size);
855 
856 	/* Unmap and complete */
857 	kunmap_atomic(virt_addr);
858 
859 	if (PageHighMem(pg))
860 		local_irq_restore(flags);
861 }
862 
863 /* FIXME - should return -1 */
864 int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, unsigned long page)
865 {
866 	struct csrow_info **csrows = mci->csrows;
867 	int row, i, j, n;
868 
869 	edac_dbg(1, "MC%d: 0x%lx\n", mci->mc_idx, page);
870 	row = -1;
871 
872 	for (i = 0; i < mci->nr_csrows; i++) {
873 		struct csrow_info *csrow = csrows[i];
874 		n = 0;
875 		for (j = 0; j < csrow->nr_channels; j++) {
876 			struct dimm_info *dimm = csrow->channels[j]->dimm;
877 			n += dimm->nr_pages;
878 		}
879 		if (n == 0)
880 			continue;
881 
882 		edac_dbg(3, "MC%d: first(0x%lx) page(0x%lx) last(0x%lx) mask(0x%lx)\n",
883 			 mci->mc_idx,
884 			 csrow->first_page, page, csrow->last_page,
885 			 csrow->page_mask);
886 
887 		if ((page >= csrow->first_page) &&
888 		    (page <= csrow->last_page) &&
889 		    ((page & csrow->page_mask) ==
890 		     (csrow->first_page & csrow->page_mask))) {
891 			row = i;
892 			break;
893 		}
894 	}
895 
896 	if (row == -1)
897 		edac_mc_printk(mci, KERN_ERR,
898 			"could not look up page error address %lx\n",
899 			(unsigned long)page);
900 
901 	return row;
902 }
903 EXPORT_SYMBOL_GPL(edac_mc_find_csrow_by_page);
904 
905 const char *edac_layer_name[] = {
906 	[EDAC_MC_LAYER_BRANCH] = "branch",
907 	[EDAC_MC_LAYER_CHANNEL] = "channel",
908 	[EDAC_MC_LAYER_SLOT] = "slot",
909 	[EDAC_MC_LAYER_CHIP_SELECT] = "csrow",
910 };
911 EXPORT_SYMBOL_GPL(edac_layer_name);
912 
913 static void edac_inc_ce_error(struct mem_ctl_info *mci,
914 			      bool enable_per_layer_report,
915 			      const int pos[EDAC_MAX_LAYERS],
916 			      const u16 count)
917 {
918 	int i, index = 0;
919 
920 	mci->ce_mc += count;
921 
922 	if (!enable_per_layer_report) {
923 		mci->ce_noinfo_count += count;
924 		return;
925 	}
926 
927 	for (i = 0; i < mci->n_layers; i++) {
928 		if (pos[i] < 0)
929 			break;
930 		index += pos[i];
931 		mci->ce_per_layer[i][index] += count;
932 
933 		if (i < mci->n_layers - 1)
934 			index *= mci->layers[i + 1].size;
935 	}
936 }
937 
938 static void edac_inc_ue_error(struct mem_ctl_info *mci,
939 				    bool enable_per_layer_report,
940 				    const int pos[EDAC_MAX_LAYERS],
941 				    const u16 count)
942 {
943 	int i, index = 0;
944 
945 	mci->ue_mc += count;
946 
947 	if (!enable_per_layer_report) {
948 		mci->ue_noinfo_count += count;
949 		return;
950 	}
951 
952 	for (i = 0; i < mci->n_layers; i++) {
953 		if (pos[i] < 0)
954 			break;
955 		index += pos[i];
956 		mci->ue_per_layer[i][index] += count;
957 
958 		if (i < mci->n_layers - 1)
959 			index *= mci->layers[i + 1].size;
960 	}
961 }
962 
963 static void edac_ce_error(struct mem_ctl_info *mci,
964 			  const u16 error_count,
965 			  const int pos[EDAC_MAX_LAYERS],
966 			  const char *msg,
967 			  const char *location,
968 			  const char *label,
969 			  const char *detail,
970 			  const char *other_detail,
971 			  const bool enable_per_layer_report,
972 			  const unsigned long page_frame_number,
973 			  const unsigned long offset_in_page,
974 			  long grain)
975 {
976 	unsigned long remapped_page;
977 	char *msg_aux = "";
978 
979 	if (*msg)
980 		msg_aux = " ";
981 
982 	if (edac_mc_get_log_ce()) {
983 		if (other_detail && *other_detail)
984 			edac_mc_printk(mci, KERN_WARNING,
985 				       "%d CE %s%son %s (%s %s - %s)\n",
986 				       error_count, msg, msg_aux, label,
987 				       location, detail, other_detail);
988 		else
989 			edac_mc_printk(mci, KERN_WARNING,
990 				       "%d CE %s%son %s (%s %s)\n",
991 				       error_count, msg, msg_aux, label,
992 				       location, detail);
993 	}
994 	edac_inc_ce_error(mci, enable_per_layer_report, pos, error_count);
995 
996 	if (mci->scrub_mode & SCRUB_SW_SRC) {
997 		/*
998 		 * Some memory controllers (called MCs below) can remap
999 		 * memory so that it is still available at a different
1000 		 * address when PCI devices map into memory.
1001 		 * MCs that can't do this lose the memory where PCI
1002 		 * devices are mapped. This mapping is MC-dependent
1003 		 * and so we call back into the MC driver for it to
1004 		 * map the MC page to a physical (CPU) page which can
1005 		 * then be mapped to a virtual page - which can then
1006 		 * be scrubbed.
1007 		 */
1008 		remapped_page = mci->ctl_page_to_phys ?
1009 			mci->ctl_page_to_phys(mci, page_frame_number) :
1010 			page_frame_number;
1011 
1012 		edac_mc_scrub_block(remapped_page,
1013 					offset_in_page, grain);
1014 	}
1015 }
1016 
1017 static void edac_ue_error(struct mem_ctl_info *mci,
1018 			  const u16 error_count,
1019 			  const int pos[EDAC_MAX_LAYERS],
1020 			  const char *msg,
1021 			  const char *location,
1022 			  const char *label,
1023 			  const char *detail,
1024 			  const char *other_detail,
1025 			  const bool enable_per_layer_report)
1026 {
1027 	char *msg_aux = "";
1028 
1029 	if (*msg)
1030 		msg_aux = " ";
1031 
1032 	if (edac_mc_get_log_ue()) {
1033 		if (other_detail && *other_detail)
1034 			edac_mc_printk(mci, KERN_WARNING,
1035 				       "%d UE %s%son %s (%s %s - %s)\n",
1036 				       error_count, msg, msg_aux, label,
1037 				       location, detail, other_detail);
1038 		else
1039 			edac_mc_printk(mci, KERN_WARNING,
1040 				       "%d UE %s%son %s (%s %s)\n",
1041 				       error_count, msg, msg_aux, label,
1042 				       location, detail);
1043 	}
1044 
1045 	if (edac_mc_get_panic_on_ue()) {
1046 		if (other_detail && *other_detail)
1047 			panic("UE %s%son %s (%s%s - %s)\n",
1048 			      msg, msg_aux, label, location, detail, other_detail);
1049 		else
1050 			panic("UE %s%son %s (%s%s)\n",
1051 			      msg, msg_aux, label, location, detail);
1052 	}
1053 
1054 	edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count);
1055 }
1056 
1057 #define OTHER_LABEL " or "
1058 
1059 /**
1060  * edac_mc_handle_error - reports a memory event to userspace
1061  *
1062  * @type:		severity of the error (CE/UE/Fatal)
1063  * @mci:		a struct mem_ctl_info pointer
1064  * @error_count:	Number of errors of the same type
1065  * @page_frame_number:	mem page where the error occurred
1066  * @offset_in_page:	offset of the error inside the page
1067  * @syndrome:		ECC syndrome
1068  * @top_layer:		Memory layer[0] position
1069  * @mid_layer:		Memory layer[1] position
1070  * @low_layer:		Memory layer[2] position
1071  * @msg:		Message meaningful to the end users that
1072  *			explains the event
1073  * @other_detail:	Technical details about the event that
1074  *			may help hardware manufacturers and
1075  *			EDAC developers to analyse the event
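 *
 * For example, a driver that has decoded one corrected error down to a
 * csrow/channel pair might report it as below (the 'err_*' variables are
 * illustrative):
 *
 *	edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
 *			     err_page, err_offset, err_syndrome,
 *			     csrow, channel, -1,
 *			     "ECC error", "");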
1076  */
1077 void edac_mc_handle_error(const enum hw_event_mc_err_type type,
1078 			  struct mem_ctl_info *mci,
1079 			  const u16 error_count,
1080 			  const unsigned long page_frame_number,
1081 			  const unsigned long offset_in_page,
1082 			  const unsigned long syndrome,
1083 			  const int top_layer,
1084 			  const int mid_layer,
1085 			  const int low_layer,
1086 			  const char *msg,
1087 			  const char *other_detail)
1088 {
1089 	/* FIXME: too much for stack: move it to some pre-allocated area */
1090 	char detail[80], location[80];
1091 	char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * mci->tot_dimms];
1092 	char *p;
1093 	int row = -1, chan = -1;
1094 	int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
1095 	int i;
1096 	long grain;
1097 	bool enable_per_layer_report = false;
1098 	u8 grain_bits;
1099 
1100 	edac_dbg(3, "MC%d\n", mci->mc_idx);
1101 
1102 	/*
1103 	 * Check if the event report is consistent and if the memory
1104 	 * location is known. If it is known, enable_per_layer_report will be
1105 	 * true, the DIMM(s) label info will be filled and the per-layer
1106 	 * error counters will be incremented.
1107 	 */
1108 	for (i = 0; i < mci->n_layers; i++) {
1109 		if (pos[i] >= (int)mci->layers[i].size) {
1110 
1111 			edac_mc_printk(mci, KERN_ERR,
1112 				       "INTERNAL ERROR: %s value is out of range (%d >= %d)\n",
1113 				       edac_layer_name[mci->layers[i].type],
1114 				       pos[i], mci->layers[i].size);
1115 			/*
1116 			 * Instead of just returning it, let's use what's
1117 			 * known about the error. The increment routines and
1118 			 * the DIMM filter logic will do the right thing by
1119 			 * pointing at the likely damaged DIMMs.
1120 			 */
1121 			pos[i] = -1;
1122 		}
1123 		if (pos[i] >= 0)
1124 			enable_per_layer_report = true;
1125 	}
1126 
1127 	/*
1128 	 * Get the dimm label/grain that applies to the match criteria.
1129 	 * As the error algorithm may not be able to point to just one memory
1130 	 * stick, the logic here will get all possible labels that could
1131 	 * potentially be affected by the error.
1132 	 * On FB-DIMM memory controllers, for uncorrected errors, it is common
1133 	 * to have only the MC channel and the MC dimm (also called "branch")
1134 	 * but the channel is not known, as the memory is arranged in pairs,
1135 	 * where each memory belongs to a separate channel within the same
1136 	 * branch.
1137 	 */
1138 	grain = 0;
1139 	p = label;
1140 	*p = '\0';
1141 
1142 	for (i = 0; i < mci->tot_dimms; i++) {
1143 		struct dimm_info *dimm = mci->dimms[i];
1144 
1145 		if (top_layer >= 0 && top_layer != dimm->location[0])
1146 			continue;
1147 		if (mid_layer >= 0 && mid_layer != dimm->location[1])
1148 			continue;
1149 		if (low_layer >= 0 && low_layer != dimm->location[2])
1150 			continue;
1151 
1152 		/* get the max grain, over the error match range */
1153 		if (dimm->grain > grain)
1154 			grain = dimm->grain;
1155 
1156 		/*
1157 		 * If the error is memory-controller wide, there's no need to
1158 		 * seek for the affected DIMMs because the whole
1159 		 * channel/memory controller/...  may be affected.
1160 		 * Also, don't show errors for empty DIMM slots.
1161 		 */
1162 		if (enable_per_layer_report && dimm->nr_pages) {
1163 			if (p != label) {
1164 				strcpy(p, OTHER_LABEL);
1165 				p += strlen(OTHER_LABEL);
1166 			}
1167 			strcpy(p, dimm->label);
1168 			p += strlen(p);
1169 			*p = '\0';
1170 
1171 			/*
1172 			 * get csrow/channel of the DIMM, in order to allow
1173 			 * incrementing the compat API counters
1174 			 */
1175 			edac_dbg(4, "%s csrows map: (%d,%d)\n",
1176 				 mci->mem_is_per_rank ? "rank" : "dimm",
1177 				 dimm->csrow, dimm->cschannel);
1178 			if (row == -1)
1179 				row = dimm->csrow;
1180 			else if (row >= 0 && row != dimm->csrow)
1181 				row = -2;
1182 
1183 			if (chan == -1)
1184 				chan = dimm->cschannel;
1185 			else if (chan >= 0 && chan != dimm->cschannel)
1186 				chan = -2;
1187 		}
1188 	}
1189 
1190 	if (!enable_per_layer_report) {
1191 		strcpy(label, "any memory");
1192 	} else {
1193 		edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
1194 		if (p == label)
1195 			strcpy(label, "unknown memory");
1196 		if (type == HW_EVENT_ERR_CORRECTED) {
1197 			if (row >= 0) {
1198 				mci->csrows[row]->ce_count += error_count;
1199 				if (chan >= 0)
1200 					mci->csrows[row]->channels[chan]->ce_count += error_count;
1201 			}
1202 		} else
1203 			if (row >= 0)
1204 				mci->csrows[row]->ue_count += error_count;
1205 	}
1206 
1207 	/* Fill the RAM location data */
1208 	p = location;
1209 
1210 	for (i = 0; i < mci->n_layers; i++) {
1211 		if (pos[i] < 0)
1212 			continue;
1213 
1214 		p += sprintf(p, "%s:%d ",
1215 			     edac_layer_name[mci->layers[i].type],
1216 			     pos[i]);
1217 	}
1218 	if (p > location)
1219 		*(p - 1) = '\0';
1220 
1221 	/* Report the error via the trace interface */
1222 	grain_bits = fls_long(grain) + 1;
1223 	trace_mc_event(type, msg, label, error_count,
1224 		       mci->mc_idx, top_layer, mid_layer, low_layer,
1225 		       PAGES_TO_MiB(page_frame_number) | offset_in_page,
1226 		       grain_bits, syndrome, other_detail);
1227 
1228 	/* Memory type dependent details about the error */
1229 	if (type == HW_EVENT_ERR_CORRECTED) {
1230 		snprintf(detail, sizeof(detail),
1231 			"page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
1232 			page_frame_number, offset_in_page,
1233 			grain, syndrome);
1234 		edac_ce_error(mci, error_count, pos, msg, location, label,
1235 			      detail, other_detail, enable_per_layer_report,
1236 			      page_frame_number, offset_in_page, grain);
1237 	} else {
1238 		snprintf(detail, sizeof(detail),
1239 			"page:0x%lx offset:0x%lx grain:%ld",
1240 			page_frame_number, offset_in_page, grain);
1241 
1242 		edac_ue_error(mci, error_count, pos, msg, location, label,
1243 			      detail, other_detail, enable_per_layer_report);
1244 	}
1245 }
1246 EXPORT_SYMBOL_GPL(edac_mc_handle_error);
1247