xref: /openbmc/linux/drivers/edac/edac_mc.c (revision b34e08d5)
1 /*
2  * edac_mc kernel module
3  * (C) 2005, 2006 Linux Networx (http://lnxi.com)
4  * This file may be distributed under the terms of the
5  * GNU General Public License.
6  *
7  * Written by Thayne Harbaugh
8  * Based on work by Dan Hollis <goemon at anime dot net> and others.
9  *	http://www.anime.net/~goemon/linux-ecc/
10  *
11  * Modified by Dave Peterson and Doug Thompson
12  *
13  */
14 
15 #include <linux/module.h>
16 #include <linux/proc_fs.h>
17 #include <linux/kernel.h>
18 #include <linux/types.h>
19 #include <linux/smp.h>
20 #include <linux/init.h>
21 #include <linux/sysctl.h>
22 #include <linux/highmem.h>
23 #include <linux/timer.h>
24 #include <linux/slab.h>
25 #include <linux/jiffies.h>
26 #include <linux/spinlock.h>
27 #include <linux/list.h>
28 #include <linux/ctype.h>
29 #include <linux/edac.h>
30 #include <linux/bitops.h>
31 #include <asm/uaccess.h>
32 #include <asm/page.h>
33 #include <asm/edac.h>
34 #include "edac_core.h"
35 #include "edac_module.h"
36 
37 #define CREATE_TRACE_POINTS
38 #define TRACE_INCLUDE_PATH ../../include/ras
39 #include <ras/ras_event.h>
40 
41 /* lock to memory controller's control array */
42 static DEFINE_MUTEX(mem_ctls_mutex);
43 static LIST_HEAD(mc_devices);
44 
45 /*
46  * Used to lock EDAC MC to just one module, avoiding the use of two
47  *	drivers (e.g. apei/ghes and i7core_edac) at the same time.
48  */
49 static void const *edac_mc_owner;
50 
51 static struct bus_type mc_bus[EDAC_MAX_MCS];
52 
53 unsigned edac_dimm_info_location(struct dimm_info *dimm, char *buf,
54 			         unsigned len)
55 {
56 	struct mem_ctl_info *mci = dimm->mci;
57 	int i, n, count = 0;
58 	char *p = buf;
59 
60 	for (i = 0; i < mci->n_layers; i++) {
61 		n = snprintf(p, len, "%s %d ",
62 			      edac_layer_name[mci->layers[i].type],
63 			      dimm->location[i]);
64 		p += n;
65 		len -= n;
66 		count += n;
67 		if (!len)
68 			break;
69 	}
70 
71 	return count;
72 }
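/*
 * Illustrative note (not part of the original source): for a controller
 * described with csrow/channel layers, the buffer filled above ends up
 * reading e.g. "csrow 2 channel 1 " - one "<layer name> <position> " pair
 * per layer, using the names from edac_layer_name[].
 */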
73 
74 #ifdef CONFIG_EDAC_DEBUG
75 
76 static void edac_mc_dump_channel(struct rank_info *chan)
77 {
78 	edac_dbg(4, "  channel->chan_idx = %d\n", chan->chan_idx);
79 	edac_dbg(4, "    channel = %p\n", chan);
80 	edac_dbg(4, "    channel->csrow = %p\n", chan->csrow);
81 	edac_dbg(4, "    channel->dimm = %p\n", chan->dimm);
82 }
83 
84 static void edac_mc_dump_dimm(struct dimm_info *dimm, int number)
85 {
86 	char location[80];
87 
88 	edac_dimm_info_location(dimm, location, sizeof(location));
89 
90 	edac_dbg(4, "%s%i: %smapped as virtual row %d, chan %d\n",
91 		 dimm->mci->csbased ? "rank" : "dimm",
92 		 number, location, dimm->csrow, dimm->cschannel);
93 	edac_dbg(4, "  dimm = %p\n", dimm);
94 	edac_dbg(4, "  dimm->label = '%s'\n", dimm->label);
95 	edac_dbg(4, "  dimm->nr_pages = 0x%x\n", dimm->nr_pages);
96 	edac_dbg(4, "  dimm->grain = %d\n", dimm->grain);
98 }
99 
100 static void edac_mc_dump_csrow(struct csrow_info *csrow)
101 {
102 	edac_dbg(4, "csrow->csrow_idx = %d\n", csrow->csrow_idx);
103 	edac_dbg(4, "  csrow = %p\n", csrow);
104 	edac_dbg(4, "  csrow->first_page = 0x%lx\n", csrow->first_page);
105 	edac_dbg(4, "  csrow->last_page = 0x%lx\n", csrow->last_page);
106 	edac_dbg(4, "  csrow->page_mask = 0x%lx\n", csrow->page_mask);
107 	edac_dbg(4, "  csrow->nr_channels = %d\n", csrow->nr_channels);
108 	edac_dbg(4, "  csrow->channels = %p\n", csrow->channels);
109 	edac_dbg(4, "  csrow->mci = %p\n", csrow->mci);
110 }
111 
112 static void edac_mc_dump_mci(struct mem_ctl_info *mci)
113 {
114 	edac_dbg(3, "\tmci = %p\n", mci);
115 	edac_dbg(3, "\tmci->mtype_cap = %lx\n", mci->mtype_cap);
116 	edac_dbg(3, "\tmci->edac_ctl_cap = %lx\n", mci->edac_ctl_cap);
117 	edac_dbg(3, "\tmci->edac_cap = %lx\n", mci->edac_cap);
118 	edac_dbg(4, "\tmci->edac_check = %p\n", mci->edac_check);
119 	edac_dbg(3, "\tmci->nr_csrows = %d, csrows = %p\n",
120 		 mci->nr_csrows, mci->csrows);
121 	edac_dbg(3, "\tmci->nr_dimms = %d, dimms = %p\n",
122 		 mci->tot_dimms, mci->dimms);
123 	edac_dbg(3, "\tdev = %p\n", mci->pdev);
124 	edac_dbg(3, "\tmod_name:ctl_name = %s:%s\n",
125 		 mci->mod_name, mci->ctl_name);
126 	edac_dbg(3, "\tpvt_info = %p\n\n", mci->pvt_info);
127 }
128 
129 #endif				/* CONFIG_EDAC_DEBUG */
130 
131 /*
132  * keep those in sync with the enum mem_type
133  */
134 const char *edac_mem_types[] = {
135 	"Empty csrow",
136 	"Reserved csrow type",
137 	"Unknown csrow type",
138 	"Fast page mode RAM",
139 	"Extended data out RAM",
140 	"Burst Extended data out RAM",
141 	"Single data rate SDRAM",
142 	"Registered single data rate SDRAM",
143 	"Double data rate SDRAM",
144 	"Registered Double data rate SDRAM",
145 	"Rambus DRAM",
146 	"Unbuffered DDR2 RAM",
147 	"Fully buffered DDR2",
148 	"Registered DDR2 RAM",
149 	"Rambus XDR",
150 	"Unbuffered DDR3 RAM",
151 	"Registered DDR3 RAM",
152 };
153 EXPORT_SYMBOL_GPL(edac_mem_types);
154 
155 /**
156  * edac_align_ptr - Prepares the pointer offsets for a single-shot allocation
157  * @p:		pointer to a pointer with the memory offset to be used. At
158  *		return, this will be incremented to point to the next offset
159  * @size:	Size of the data structure to be reserved
160  * @n_elems:	Number of elements that should be reserved
161  *
162  * If 'size' is a constant, the compiler will optimize this whole function
163  * down to either a no-op or the addition of a constant to the value of '*p'.
164  *
165  * The 'p' pointer is needed to keep advancing in memory to the
166  * proper offsets when allocating a struct together with its embedded
167  * structs, as edac_device_alloc_ctl_info() does in edac_device.c,
168  * for example.
169  *
170  * On return, the pointer 'p' will have been incremented, ready to be
171  * used on the next call to this function.
172  */
173 void *edac_align_ptr(void **p, unsigned size, int n_elems)
174 {
175 	unsigned align, r;
176 	void *ptr = *p;
177 
178 	*p += size * n_elems;
179 
180 	/*
181 	 * 'p' can possibly be an unaligned item X such that sizeof(X) is
182 	 * 'size'.  Adjust 'p' so that its alignment is at least as
183 	 * stringent as what the compiler would provide for X and return
184 	 * the aligned result.
185 	 * Here we assume that the alignment of a "long long" is the most
186 	 * stringent alignment that the compiler will ever provide by default.
187 	 * As far as I know, this is a reasonable assumption.
188 	 */
189 	if (size > sizeof(long))
190 		align = sizeof(long long);
191 	else if (size > sizeof(int))
192 		align = sizeof(long);
193 	else if (size > sizeof(short))
194 		align = sizeof(int);
195 	else if (size > sizeof(char))
196 		align = sizeof(short);
197 	else
198 		return (char *)ptr;
199 
200 	r = (unsigned long)p % align;
201 
202 	if (r == 0)
203 		return (char *)ptr;
204 
205 	*p += align - r;
206 
207 	return (void *)(((unsigned long)ptr) + align - r);
208 }
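/*
 * Condensed usage sketch (for illustration only, mirroring edac_mc_alloc()
 * below): the function is first run against a NULL base, so each returned
 * pointer is really just an aligned offset; the offsets are rebased once
 * the single chunk has been allocated:
 *
 *	void *ptr = NULL;
 *
 *	mci   = edac_align_ptr(&ptr, sizeof(*mci), 1);
 *	layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
 *	pvt   = edac_align_ptr(&ptr, sz_pvt, 1);
 *	size  = ((unsigned long)pvt) + sz_pvt;
 *
 *	mci   = kzalloc(size, GFP_KERNEL);
 *	layer = (struct edac_mc_layer *)((char *)mci + (unsigned long)layer);
 */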
209 
210 static void _edac_mc_free(struct mem_ctl_info *mci)
211 {
212 	int i, chn, row;
213 	struct csrow_info *csr;
214 	const unsigned int tot_dimms = mci->tot_dimms;
215 	const unsigned int tot_channels = mci->num_cschannel;
216 	const unsigned int tot_csrows = mci->nr_csrows;
217 
218 	if (mci->dimms) {
219 		for (i = 0; i < tot_dimms; i++)
220 			kfree(mci->dimms[i]);
221 		kfree(mci->dimms);
222 	}
223 	if (mci->csrows) {
224 		for (row = 0; row < tot_csrows; row++) {
225 			csr = mci->csrows[row];
226 			if (csr) {
227 				if (csr->channels) {
228 					for (chn = 0; chn < tot_channels; chn++)
229 						kfree(csr->channels[chn]);
230 					kfree(csr->channels);
231 				}
232 				kfree(csr);
233 			}
234 		}
235 		kfree(mci->csrows);
236 	}
237 	kfree(mci);
238 }
239 
240 /**
241  * edac_mc_alloc: Allocate and partially fill a struct mem_ctl_info structure
242  * @mc_num:		Memory controller number
243  * @n_layers:		Number of MC hierarchy layers
244  * @layers:		Describes each layer as seen by the Memory Controller
245  * @sz_pvt:		size of private storage needed
246  *
247  *
248  * Everything is kmalloc'ed as one big chunk - more efficient.
249  * Only can be used if all structures have the same lifetime - otherwise
250  * you have to allocate and initialize your own structures.
251  *
252  * Use edac_mc_free() to free mc structures allocated by this function.
253  *
254  * NOTE: drivers handle multi-rank memories in different ways: in some
255  * drivers, one multi-rank memory stick is mapped as one entry, while, in
256  * others, a single multi-rank memory stick would be mapped into several
257  * entries. Currently, this function allocates multiple struct dimm_info
258  * in such scenarios, as grouping the multiple ranks requires driver changes.
259  *
260  * Returns:
261  *	On failure: NULL
262  *	On success: struct mem_ctl_info pointer
263  */
264 struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
265 				   unsigned n_layers,
266 				   struct edac_mc_layer *layers,
267 				   unsigned sz_pvt)
268 {
269 	struct mem_ctl_info *mci;
270 	struct edac_mc_layer *layer;
271 	struct csrow_info *csr;
272 	struct rank_info *chan;
273 	struct dimm_info *dimm;
274 	u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
275 	unsigned pos[EDAC_MAX_LAYERS];
276 	unsigned size, tot_dimms = 1, count = 1;
277 	unsigned tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
278 	void *pvt, *p, *ptr = NULL;
279 	int i, j, row, chn, n, len, off;
280 	bool per_rank = false;
281 
282 	BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0);
283 	/*
284 	 * Calculate the total number of dimms and csrows/cschannels while
285 	 * in the old API emulation mode
286 	 */
287 	for (i = 0; i < n_layers; i++) {
288 		tot_dimms *= layers[i].size;
289 		if (layers[i].is_virt_csrow)
290 			tot_csrows *= layers[i].size;
291 		else
292 			tot_channels *= layers[i].size;
293 
294 		if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
295 			per_rank = true;
296 	}
297 
298 	/* Figure out the offsets of the various items from the start of an mc
299 	 * structure.  We want the alignment of each item to be at least as
300 	 * stringent as what the compiler would provide if we could simply
301 	 * hardcode everything into a single struct.
302 	 */
303 	mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
304 	layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
305 	for (i = 0; i < n_layers; i++) {
306 		count *= layers[i].size;
307 		edac_dbg(4, "errcount layer %d size %d\n", i, count);
308 		ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
309 		ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
310 		tot_errcount += 2 * count;
311 	}
312 
313 	edac_dbg(4, "allocating %d error counters\n", tot_errcount);
314 	pvt = edac_align_ptr(&ptr, sz_pvt, 1);
315 	size = ((unsigned long)pvt) + sz_pvt;
316 
317 	edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
318 		 size,
319 		 tot_dimms,
320 		 per_rank ? "ranks" : "dimms",
321 		 tot_csrows * tot_channels);
322 
323 	mci = kzalloc(size, GFP_KERNEL);
324 	if (mci == NULL)
325 		return NULL;
326 
327 	/* Adjust pointers so they point within the memory we just allocated
328 	 * rather than an imaginary chunk of memory located at address 0.
329 	 */
330 	layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
331 	for (i = 0; i < n_layers; i++) {
332 		mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i]));
333 		mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i]));
334 	}
335 	pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
336 
337 	/* setup index and various internal pointers */
338 	mci->mc_idx = mc_num;
339 	mci->tot_dimms = tot_dimms;
340 	mci->pvt_info = pvt;
341 	mci->n_layers = n_layers;
342 	mci->layers = layer;
343 	memcpy(mci->layers, layers, sizeof(*layer) * n_layers);
344 	mci->nr_csrows = tot_csrows;
345 	mci->num_cschannel = tot_channels;
346 	mci->csbased = per_rank;
347 
348 	/*
349 	 * Allocate and fill the csrow/channels structs
350 	 */
351 	mci->csrows = kcalloc(tot_csrows, sizeof(*mci->csrows), GFP_KERNEL);
352 	if (!mci->csrows)
353 		goto error;
354 	for (row = 0; row < tot_csrows; row++) {
355 		csr = kzalloc(sizeof(**mci->csrows), GFP_KERNEL);
356 		if (!csr)
357 			goto error;
358 		mci->csrows[row] = csr;
359 		csr->csrow_idx = row;
360 		csr->mci = mci;
361 		csr->nr_channels = tot_channels;
362 		csr->channels = kcalloc(tot_channels, sizeof(*csr->channels),
363 					GFP_KERNEL);
364 		if (!csr->channels)
365 			goto error;
366 
367 		for (chn = 0; chn < tot_channels; chn++) {
368 			chan = kzalloc(sizeof(**csr->channels), GFP_KERNEL);
369 			if (!chan)
370 				goto error;
371 			csr->channels[chn] = chan;
372 			chan->chan_idx = chn;
373 			chan->csrow = csr;
374 		}
375 	}
376 
377 	/*
378 	 * Allocate and fill the dimm structs
379 	 */
380 	mci->dimms  = kcalloc(tot_dimms, sizeof(*mci->dimms), GFP_KERNEL);
381 	if (!mci->dimms)
382 		goto error;
383 
384 	memset(&pos, 0, sizeof(pos));
385 	row = 0;
386 	chn = 0;
387 	for (i = 0; i < tot_dimms; i++) {
388 		chan = mci->csrows[row]->channels[chn];
389 		off = EDAC_DIMM_OFF(layer, n_layers, pos[0], pos[1], pos[2]);
390 		if (off < 0 || off >= tot_dimms) {
391 			edac_mc_printk(mci, KERN_ERR, "EDAC core bug: EDAC_DIMM_OFF is trying to do an illegal data access\n");
392 			goto error;
393 		}
394 
395 		dimm = kzalloc(sizeof(**mci->dimms), GFP_KERNEL);
396 		if (!dimm)
397 			goto error;
398 		mci->dimms[off] = dimm;
399 		dimm->mci = mci;
400 
401 		/*
402 		 * Copy DIMM location and initialize it.
403 		 */
404 		len = sizeof(dimm->label);
405 		p = dimm->label;
406 		n = snprintf(p, len, "mc#%u", mc_num);
407 		p += n;
408 		len -= n;
409 		for (j = 0; j < n_layers; j++) {
410 			n = snprintf(p, len, "%s#%u",
411 				     edac_layer_name[layers[j].type],
412 				     pos[j]);
413 			p += n;
414 			len -= n;
415 			dimm->location[j] = pos[j];
416 
417 			if (len <= 0)
418 				break;
419 		}
420 
421 		/* Link it to the csrows old API data */
422 		chan->dimm = dimm;
423 		dimm->csrow = row;
424 		dimm->cschannel = chn;
425 
426 		/* Increment csrow location */
427 		if (layers[0].is_virt_csrow) {
428 			chn++;
429 			if (chn == tot_channels) {
430 				chn = 0;
431 				row++;
432 			}
433 		} else {
434 			row++;
435 			if (row == tot_csrows) {
436 				row = 0;
437 				chn++;
438 			}
439 		}
440 
441 		/* Increment dimm location */
442 		for (j = n_layers - 1; j >= 0; j--) {
443 			pos[j]++;
444 			if (pos[j] < layers[j].size)
445 				break;
446 			pos[j] = 0;
447 		}
448 	}
449 
450 	mci->op_state = OP_ALLOC;
451 
452 	return mci;
453 
454 error:
455 	_edac_mc_free(mci);
456 
457 	return NULL;
458 }
459 EXPORT_SYMBOL_GPL(edac_mc_alloc);
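/*
 * Usage sketch (illustrative only, not taken from this file): a driver
 * describing two csrows of two channels each, with a hypothetical
 * 'struct my_pvt' of private data, would set up its layers like this:
 *
 *	struct edac_mc_layer layers[2];
 *	struct mem_ctl_info *mci;
 *
 *	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
 *	layers[0].size = 2;
 *	layers[0].is_virt_csrow = true;
 *	layers[1].type = EDAC_MC_LAYER_CHANNEL;
 *	layers[1].size = 2;
 *	layers[1].is_virt_csrow = false;
 *
 *	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers,
 *			    sizeof(struct my_pvt));
 *	if (!mci)
 *		return -ENOMEM;
 *
 * Real drivers follow the same pattern with their own layer geometry.
 */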
460 
461 /**
462  * edac_mc_free
463  *	'Free' a previously allocated 'mci' structure
464  * @mci: pointer to a struct mem_ctl_info structure
465  */
466 void edac_mc_free(struct mem_ctl_info *mci)
467 {
468 	edac_dbg(1, "\n");
469 
470 	/* If we're not yet registered with sysfs, free only what was allocated
471 	 * in edac_mc_alloc().
472 	 */
473 	if (!device_is_registered(&mci->dev)) {
474 		_edac_mc_free(mci);
475 		return;
476 	}
477 
478 	/* the mci instance is freed here, when the sysfs object is dropped */
479 	edac_unregister_sysfs(mci);
480 }
481 EXPORT_SYMBOL_GPL(edac_mc_free);
482 
483 
484 /**
485  * find_mci_by_dev
486  *
487  *	scan list of controllers looking for the one that manages
488  *	the 'dev' device
489  * @dev: pointer to a struct device related to the MCI
490  */
491 struct mem_ctl_info *find_mci_by_dev(struct device *dev)
492 {
493 	struct mem_ctl_info *mci;
494 	struct list_head *item;
495 
496 	edac_dbg(3, "\n");
497 
498 	list_for_each(item, &mc_devices) {
499 		mci = list_entry(item, struct mem_ctl_info, link);
500 
501 		if (mci->pdev == dev)
502 			return mci;
503 	}
504 
505 	return NULL;
506 }
507 EXPORT_SYMBOL_GPL(find_mci_by_dev);
508 
509 /*
510  * handler for EDAC to check if an NMI-type handler has asserted an interrupt
511  */
512 static int edac_mc_assert_error_check_and_clear(void)
513 {
514 	int old_state;
515 
516 	if (edac_op_state == EDAC_OPSTATE_POLL)
517 		return 1;
518 
519 	old_state = edac_err_assert;
520 	edac_err_assert = 0;
521 
522 	return old_state;
523 }
524 
525 /*
526  * edac_mc_workq_function
527  *	performs the operation scheduled by a workq request
528  */
529 static void edac_mc_workq_function(struct work_struct *work_req)
530 {
531 	struct delayed_work *d_work = to_delayed_work(work_req);
532 	struct mem_ctl_info *mci = to_edac_mem_ctl_work(d_work);
533 
534 	mutex_lock(&mem_ctls_mutex);
535 
536 	/* if this control struct has moved to offline state, we are done */
537 	if (mci->op_state == OP_OFFLINE) {
538 		mutex_unlock(&mem_ctls_mutex);
539 		return;
540 	}
541 
542 	/* Only poll controllers that are running polled and have a check */
543 	if (edac_mc_assert_error_check_and_clear() && (mci->edac_check != NULL))
544 		mci->edac_check(mci);
545 
546 	mutex_unlock(&mem_ctls_mutex);
547 
548 	/* Reschedule */
549 	queue_delayed_work(edac_workqueue, &mci->work,
550 			msecs_to_jiffies(edac_mc_get_poll_msec()));
551 }
552 
553 /*
554  * edac_mc_workq_setup
555  *	initialize a workq item for this mci
556  *	passing in the new delay period in msec
557  *
558  *	locking model:
559  *
560  *		called with the mem_ctls_mutex held
561  */
562 static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec,
563 				bool init)
564 {
565 	edac_dbg(0, "\n");
566 
567 	/* if this instance is not in the POLL state, then simply return */
568 	if (mci->op_state != OP_RUNNING_POLL)
569 		return;
570 
571 	if (init)
572 		INIT_DELAYED_WORK(&mci->work, edac_mc_workq_function);
573 
574 	mod_delayed_work(edac_workqueue, &mci->work, msecs_to_jiffies(msec));
575 }
576 
577 /*
578  * edac_mc_workq_teardown
579  *	stop the workq processing on this mci
580  *
581  *	locking model:
582  *
583  *		called WITHOUT lock held
584  */
585 static void edac_mc_workq_teardown(struct mem_ctl_info *mci)
586 {
587 	int status;
588 
589 	if (mci->op_state != OP_RUNNING_POLL)
590 		return;
591 
592 	status = cancel_delayed_work(&mci->work);
593 	if (status == 0) {
594 		edac_dbg(0, "not canceled, flush the queue\n");
595 
596 		/* workq instance might be running, wait for it */
597 		flush_workqueue(edac_workqueue);
598 	}
599 }
600 
601 /*
602  * edac_mc_reset_delay_period(unsigned long value)
603  *
604  *	user space has updated our poll period value, need to
605  *	reset our workq delays
606  */
607 void edac_mc_reset_delay_period(unsigned long value)
608 {
609 	struct mem_ctl_info *mci;
610 	struct list_head *item;
611 
612 	mutex_lock(&mem_ctls_mutex);
613 
614 	list_for_each(item, &mc_devices) {
615 		mci = list_entry(item, struct mem_ctl_info, link);
616 
617 		edac_mc_workq_setup(mci, value, false);
618 	}
619 
620 	mutex_unlock(&mem_ctls_mutex);
621 }
622 
623 
624 
625 /* Return 0 on success, 1 on failure.
626  * Before calling this function, caller must
627  * assign a unique value to mci->mc_idx.
628  *
629  *	locking model:
630  *
631  *		called with the mem_ctls_mutex lock held
632  */
633 static int add_mc_to_global_list(struct mem_ctl_info *mci)
634 {
635 	struct list_head *item, *insert_before;
636 	struct mem_ctl_info *p;
637 
638 	insert_before = &mc_devices;
639 
640 	p = find_mci_by_dev(mci->pdev);
641 	if (unlikely(p != NULL))
642 		goto fail0;
643 
644 	list_for_each(item, &mc_devices) {
645 		p = list_entry(item, struct mem_ctl_info, link);
646 
647 		if (p->mc_idx >= mci->mc_idx) {
648 			if (unlikely(p->mc_idx == mci->mc_idx))
649 				goto fail1;
650 
651 			insert_before = item;
652 			break;
653 		}
654 	}
655 
656 	list_add_tail_rcu(&mci->link, insert_before);
657 	atomic_inc(&edac_handlers);
658 	return 0;
659 
660 fail0:
661 	edac_printk(KERN_WARNING, EDAC_MC,
662 		"%s (%s) %s %s already assigned %d\n", dev_name(p->pdev),
663 		edac_dev_name(mci), p->mod_name, p->ctl_name, p->mc_idx);
664 	return 1;
665 
666 fail1:
667 	edac_printk(KERN_WARNING, EDAC_MC,
668 		"bug in low-level driver: attempt to assign\n"
669 		"    duplicate mc_idx %d in %s()\n", p->mc_idx, __func__);
670 	return 1;
671 }
672 
673 static int del_mc_from_global_list(struct mem_ctl_info *mci)
674 {
675 	int handlers = atomic_dec_return(&edac_handlers);
676 	list_del_rcu(&mci->link);
677 
678 	/* these are for safe removal of devices from the global list while
679 	 * NMI handlers may be traversing the list
680 	 */
681 	synchronize_rcu();
682 	INIT_LIST_HEAD(&mci->link);
683 
684 	return handlers;
685 }
686 
687 /**
688  * edac_mc_find: Search for a mem_ctl_info structure whose index is 'idx'.
689  *
690  * If found, return a pointer to the structure.
691  * Else return NULL.
692  *
693  * Caller must hold mem_ctls_mutex.
694  */
695 struct mem_ctl_info *edac_mc_find(int idx)
696 {
697 	struct list_head *item;
698 	struct mem_ctl_info *mci;
699 
700 	list_for_each(item, &mc_devices) {
701 		mci = list_entry(item, struct mem_ctl_info, link);
702 
703 		if (mci->mc_idx >= idx) {
704 			if (mci->mc_idx == idx)
705 				return mci;
706 
707 			break;
708 		}
709 	}
710 
711 	return NULL;
712 }
713 EXPORT_SYMBOL(edac_mc_find);
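/*
 * Illustrative use (not from this file): a driver that needs to attach to
 * an already registered controller by index, subject to the locking rule
 * documented above, can simply do:
 *
 *	struct mem_ctl_info *mci = edac_mc_find(0);
 *
 *	if (!mci)
 *		return -ENODEV;
 */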
714 
715 /**
716  * edac_mc_add_mc: Insert the 'mci' structure into the mci global list and
717  *                 create sysfs entries associated with mci structure
718  * @mci: pointer to the mci structure to be added to the list
719  *
720  * Return:
721  *	0	Success
722  *	!0	Failure
723  */
724 
725 /* FIXME - should a warning be printed if no error detection? correction? */
726 int edac_mc_add_mc(struct mem_ctl_info *mci)
727 {
728 	int ret = -EINVAL;
729 	edac_dbg(0, "\n");
730 
731 	if (mci->mc_idx >= EDAC_MAX_MCS) {
732 		pr_warn_once("Too many memory controllers: %d\n", mci->mc_idx);
733 		return -ENODEV;
734 	}
735 
736 #ifdef CONFIG_EDAC_DEBUG
737 	if (edac_debug_level >= 3)
738 		edac_mc_dump_mci(mci);
739 
740 	if (edac_debug_level >= 4) {
741 		int i;
742 
743 		for (i = 0; i < mci->nr_csrows; i++) {
744 			struct csrow_info *csrow = mci->csrows[i];
745 			u32 nr_pages = 0;
746 			int j;
747 
748 			for (j = 0; j < csrow->nr_channels; j++)
749 				nr_pages += csrow->channels[j]->dimm->nr_pages;
750 			if (!nr_pages)
751 				continue;
752 			edac_mc_dump_csrow(csrow);
753 			for (j = 0; j < csrow->nr_channels; j++)
754 				if (csrow->channels[j]->dimm->nr_pages)
755 					edac_mc_dump_channel(csrow->channels[j]);
756 		}
757 		for (i = 0; i < mci->tot_dimms; i++)
758 			if (mci->dimms[i]->nr_pages)
759 				edac_mc_dump_dimm(mci->dimms[i], i);
760 	}
761 #endif
762 	mutex_lock(&mem_ctls_mutex);
763 
764 	if (edac_mc_owner && edac_mc_owner != mci->mod_name) {
765 		ret = -EPERM;
766 		goto fail0;
767 	}
768 
769 	if (add_mc_to_global_list(mci))
770 		goto fail0;
771 
772 	/* set load time so that error rate can be tracked */
773 	mci->start_time = jiffies;
774 
775 	mci->bus = &mc_bus[mci->mc_idx];
776 
777 	if (edac_create_sysfs_mci_device(mci)) {
778 		edac_mc_printk(mci, KERN_WARNING,
779 			"failed to create sysfs device\n");
780 		goto fail1;
781 	}
782 
783 	/* If there IS a check routine, then we are running POLLED */
784 	if (mci->edac_check != NULL) {
785 		/* This instance is NOW RUNNING */
786 		mci->op_state = OP_RUNNING_POLL;
787 
788 		edac_mc_workq_setup(mci, edac_mc_get_poll_msec(), true);
789 	} else {
790 		mci->op_state = OP_RUNNING_INTERRUPT;
791 	}
792 
793 	/* Report action taken */
794 	edac_mc_printk(mci, KERN_INFO,
795 		"Giving out device to module %s controller %s: DEV %s (%s)\n",
796 		mci->mod_name, mci->ctl_name, mci->dev_name,
797 		edac_op_state_to_string(mci->op_state));
798 
799 	edac_mc_owner = mci->mod_name;
800 
801 	mutex_unlock(&mem_ctls_mutex);
802 	return 0;
803 
804 fail1:
805 	del_mc_from_global_list(mci);
806 
807 fail0:
808 	mutex_unlock(&mem_ctls_mutex);
809 	return ret;
810 }
811 EXPORT_SYMBOL_GPL(edac_mc_add_mc);
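/*
 * Typical probe-time sequence (illustrative sketch, not part of this file):
 * after edac_mc_alloc() and after filling mci->pdev, mtype_cap, ctl_name,
 * the dimm/csrow data and so on, a driver hands the controller over:
 *
 *	mci->pdev = &pdev->dev;
 *	mci->mod_name = "my_edac";
 *
 *	if (edac_mc_add_mc(mci)) {
 *		edac_mc_free(mci);
 *		return -ENODEV;
 *	}
 *
 * "my_edac" is a placeholder module name. From this point on, errors are
 * reported either through mci->edac_check() polling or by the driver
 * calling edac_mc_handle_error() from its interrupt path.
 */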
812 
813 /**
814  * edac_mc_del_mc: Remove sysfs entries for specified mci structure and
815  *                 remove mci structure from global list
816  * @dev: Pointer to the 'struct device' representing the mci structure to remove.
817  *
818  * Return pointer to removed mci structure, or NULL if device not found.
819  */
820 struct mem_ctl_info *edac_mc_del_mc(struct device *dev)
821 {
822 	struct mem_ctl_info *mci;
823 
824 	edac_dbg(0, "\n");
825 
826 	mutex_lock(&mem_ctls_mutex);
827 
828 	/* find the requested mci struct in the global list */
829 	mci = find_mci_by_dev(dev);
830 	if (mci == NULL) {
831 		mutex_unlock(&mem_ctls_mutex);
832 		return NULL;
833 	}
834 
835 	if (!del_mc_from_global_list(mci))
836 		edac_mc_owner = NULL;
837 	mutex_unlock(&mem_ctls_mutex);
838 
839 	/* flush workq processes */
840 	edac_mc_workq_teardown(mci);
841 
842 	/* marking MCI offline */
843 	mci->op_state = OP_OFFLINE;
844 
845 	/* remove from sysfs */
846 	edac_remove_sysfs_mci_device(mci);
847 
848 	edac_printk(KERN_INFO, EDAC_MC,
849 		"Removed device %d for %s %s: DEV %s\n", mci->mc_idx,
850 		mci->mod_name, mci->ctl_name, edac_dev_name(mci));
851 
852 	return mci;
853 }
854 EXPORT_SYMBOL_GPL(edac_mc_del_mc);
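/*
 * Typical remove-time sequence (illustrative sketch): the driver looks the
 * controller up by its device and releases it:
 *
 *	struct mem_ctl_info *mci = edac_mc_del_mc(&pdev->dev);
 *
 *	if (mci)
 *		edac_mc_free(mci);
 *
 * edac_mc_del_mc() stops polling and removes the sysfs entries;
 * edac_mc_free() then drops the remaining memory.
 */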
855 
856 static void edac_mc_scrub_block(unsigned long page, unsigned long offset,
857 				u32 size)
858 {
859 	struct page *pg;
860 	void *virt_addr;
861 	unsigned long flags = 0;
862 
863 	edac_dbg(3, "\n");
864 
865 	/* ECC error page was not in our memory. Ignore it. */
866 	if (!pfn_valid(page))
867 		return;
868 
869 	/* Find the actual page structure then map it and fix */
870 	pg = pfn_to_page(page);
871 
872 	if (PageHighMem(pg))
873 		local_irq_save(flags);
874 
875 	virt_addr = kmap_atomic(pg);
876 
877 	/* Perform architecture specific atomic scrub operation */
878 	atomic_scrub(virt_addr + offset, size);
879 
880 	/* Unmap and complete */
881 	kunmap_atomic(virt_addr);
882 
883 	if (PageHighMem(pg))
884 		local_irq_restore(flags);
885 }
886 
887 /* FIXME - should return -1 */
888 int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, unsigned long page)
889 {
890 	struct csrow_info **csrows = mci->csrows;
891 	int row, i, j, n;
892 
893 	edac_dbg(1, "MC%d: 0x%lx\n", mci->mc_idx, page);
894 	row = -1;
895 
896 	for (i = 0; i < mci->nr_csrows; i++) {
897 		struct csrow_info *csrow = csrows[i];
898 		n = 0;
899 		for (j = 0; j < csrow->nr_channels; j++) {
900 			struct dimm_info *dimm = csrow->channels[j]->dimm;
901 			n += dimm->nr_pages;
902 		}
903 		if (n == 0)
904 			continue;
905 
906 		edac_dbg(3, "MC%d: first(0x%lx) page(0x%lx) last(0x%lx) mask(0x%lx)\n",
907 			 mci->mc_idx,
908 			 csrow->first_page, page, csrow->last_page,
909 			 csrow->page_mask);
910 
911 		if ((page >= csrow->first_page) &&
912 		    (page <= csrow->last_page) &&
913 		    ((page & csrow->page_mask) ==
914 		     (csrow->first_page & csrow->page_mask))) {
915 			row = i;
916 			break;
917 		}
918 	}
919 
920 	if (row == -1)
921 		edac_mc_printk(mci, KERN_ERR,
922 			"could not look up page error address %lx\n",
923 			(unsigned long)page);
924 
925 	return row;
926 }
927 EXPORT_SYMBOL_GPL(edac_mc_find_csrow_by_page);
928 
929 const char *edac_layer_name[] = {
930 	[EDAC_MC_LAYER_BRANCH] = "branch",
931 	[EDAC_MC_LAYER_CHANNEL] = "channel",
932 	[EDAC_MC_LAYER_SLOT] = "slot",
933 	[EDAC_MC_LAYER_CHIP_SELECT] = "csrow",
934 	[EDAC_MC_LAYER_ALL_MEM] = "memory",
935 };
936 EXPORT_SYMBOL_GPL(edac_layer_name);
937 
938 static void edac_inc_ce_error(struct mem_ctl_info *mci,
939 			      bool enable_per_layer_report,
940 			      const int pos[EDAC_MAX_LAYERS],
941 			      const u16 count)
942 {
943 	int i, index = 0;
944 
945 	mci->ce_mc += count;
946 
947 	if (!enable_per_layer_report) {
948 		mci->ce_noinfo_count += count;
949 		return;
950 	}
951 
952 	for (i = 0; i < mci->n_layers; i++) {
953 		if (pos[i] < 0)
954 			break;
955 		index += pos[i];
956 		mci->ce_per_layer[i][index] += count;
957 
958 		if (i < mci->n_layers - 1)
959 			index *= mci->layers[i + 1].size;
960 	}
961 }
962 
963 static void edac_inc_ue_error(struct mem_ctl_info *mci,
964 				    bool enable_per_layer_report,
965 				    const int pos[EDAC_MAX_LAYERS],
966 				    const u16 count)
967 {
968 	int i, index = 0;
969 
970 	mci->ue_mc += count;
971 
972 	if (!enable_per_layer_report) {
973 		mci->ue_noinfo_count += count;
974 		return;
975 	}
976 
977 	for (i = 0; i < mci->n_layers; i++) {
978 		if (pos[i] < 0)
979 			break;
980 		index += pos[i];
981 		mci->ue_per_layer[i][index] += count;
982 
983 		if (i < mci->n_layers - 1)
984 			index *= mci->layers[i + 1].size;
985 	}
986 }
987 
988 static void edac_ce_error(struct mem_ctl_info *mci,
989 			  const u16 error_count,
990 			  const int pos[EDAC_MAX_LAYERS],
991 			  const char *msg,
992 			  const char *location,
993 			  const char *label,
994 			  const char *detail,
995 			  const char *other_detail,
996 			  const bool enable_per_layer_report,
997 			  const unsigned long page_frame_number,
998 			  const unsigned long offset_in_page,
999 			  long grain)
1000 {
1001 	unsigned long remapped_page;
1002 	char *msg_aux = "";
1003 
1004 	if (*msg)
1005 		msg_aux = " ";
1006 
1007 	if (edac_mc_get_log_ce()) {
1008 		if (other_detail && *other_detail)
1009 			edac_mc_printk(mci, KERN_WARNING,
1010 				       "%d CE %s%son %s (%s %s - %s)\n",
1011 				       error_count, msg, msg_aux, label,
1012 				       location, detail, other_detail);
1013 		else
1014 			edac_mc_printk(mci, KERN_WARNING,
1015 				       "%d CE %s%son %s (%s %s)\n",
1016 				       error_count, msg, msg_aux, label,
1017 				       location, detail);
1018 	}
1019 	edac_inc_ce_error(mci, enable_per_layer_report, pos, error_count);
1020 
1021 	if (mci->scrub_mode & SCRUB_SW_SRC) {
1022 		/*
1023 		 * Some memory controllers (called MCs below) can remap
1024 		 * memory so that it is still available at a different
1025 		 * address when PCI devices map into memory.
1026 		 * MCs that can't do this lose the memory where PCI
1027 		 * devices are mapped. This mapping is MC-dependent
1028 		 * and so we call back into the MC driver for it to
1029 		 * map the MC page to a physical (CPU) page which can
1030 		 * then be mapped to a virtual page - which can then
1031 		 * be scrubbed.
1032 		 */
1033 		remapped_page = mci->ctl_page_to_phys ?
1034 			mci->ctl_page_to_phys(mci, page_frame_number) :
1035 			page_frame_number;
1036 
1037 		edac_mc_scrub_block(remapped_page,
1038 					offset_in_page, grain);
1039 	}
1040 }
1041 
1042 static void edac_ue_error(struct mem_ctl_info *mci,
1043 			  const u16 error_count,
1044 			  const int pos[EDAC_MAX_LAYERS],
1045 			  const char *msg,
1046 			  const char *location,
1047 			  const char *label,
1048 			  const char *detail,
1049 			  const char *other_detail,
1050 			  const bool enable_per_layer_report)
1051 {
1052 	char *msg_aux = "";
1053 
1054 	if (*msg)
1055 		msg_aux = " ";
1056 
1057 	if (edac_mc_get_log_ue()) {
1058 		if (other_detail && *other_detail)
1059 			edac_mc_printk(mci, KERN_WARNING,
1060 				       "%d UE %s%son %s (%s %s - %s)\n",
1061 				       error_count, msg, msg_aux, label,
1062 				       location, detail, other_detail);
1063 		else
1064 			edac_mc_printk(mci, KERN_WARNING,
1065 				       "%d UE %s%son %s (%s %s)\n",
1066 				       error_count, msg, msg_aux, label,
1067 				       location, detail);
1068 	}
1069 
1070 	if (edac_mc_get_panic_on_ue()) {
1071 		if (other_detail && *other_detail)
1072 			panic("UE %s%son %s (%s%s - %s)\n",
1073 			      msg, msg_aux, label, location, detail, other_detail);
1074 		else
1075 			panic("UE %s%son %s (%s%s)\n",
1076 			      msg, msg_aux, label, location, detail);
1077 	}
1078 
1079 	edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count);
1080 }
1081 
1082 /**
1083  * edac_raw_mc_handle_error - reports a memory event to userspace without doing
1084  *			      anything to discover the error location
1085  *
1086  * @type:		severity of the error (CE/UE/Fatal)
1087  * @mci:		a struct mem_ctl_info pointer
1088  * @e:			error description
1089  *
1090  * This raw function is used internally by edac_mc_handle_error(). It should
1091  * only be called directly when the hardware error comes directly from the
1092  * BIOS, as in the case of the APEI GHES driver.
1093  */
1094 void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
1095 			      struct mem_ctl_info *mci,
1096 			      struct edac_raw_error_desc *e)
1097 {
1098 	char detail[80];
1099 	int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer };
1100 
1101 	/* Memory type dependent details about the error */
1102 	if (type == HW_EVENT_ERR_CORRECTED) {
1103 		snprintf(detail, sizeof(detail),
1104 			"page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
1105 			e->page_frame_number, e->offset_in_page,
1106 			e->grain, e->syndrome);
1107 		edac_ce_error(mci, e->error_count, pos, e->msg, e->location, e->label,
1108 			      detail, e->other_detail, e->enable_per_layer_report,
1109 			      e->page_frame_number, e->offset_in_page, e->grain);
1110 	} else {
1111 		snprintf(detail, sizeof(detail),
1112 			"page:0x%lx offset:0x%lx grain:%ld",
1113 			e->page_frame_number, e->offset_in_page, e->grain);
1114 
1115 		edac_ue_error(mci, e->error_count, pos, e->msg, e->location, e->label,
1116 			      detail, e->other_detail, e->enable_per_layer_report);
1117 	}
1118 
1119 
1120 }
1121 EXPORT_SYMBOL_GPL(edac_raw_mc_handle_error);
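/*
 * Note (illustrative): the expected direct caller is a firmware-first
 * driver such as ghes_edac, which fills &mci->error_desc (error_count,
 * page_frame_number, grain, label, location, ...) on its own and then
 * calls:
 *
 *	edac_raw_mc_handle_error(type, mci, &mci->error_desc);
 *
 * Everything else should go through edac_mc_handle_error() below.
 */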
1122 
1123 /**
1124  * edac_mc_handle_error - reports a memory event to userspace
1125  *
1126  * @type:		severity of the error (CE/UE/Fatal)
1127  * @mci:		a struct mem_ctl_info pointer
1128  * @error_count:	Number of errors of the same type
1129  * @page_frame_number:	mem page where the error occurred
1130  * @offset_in_page:	offset of the error inside the page
1131  * @syndrome:		ECC syndrome
1132  * @top_layer:		Memory layer[0] position
1133  * @mid_layer:		Memory layer[1] position
1134  * @low_layer:		Memory layer[2] position
1135  * @msg:		Message meaningful to the end users that
1136  *			explains the event
1137  * @other_detail:	Technical details about the event that
1138  *			may help hardware manufacturers and
1139  *			EDAC developers to analyse the event
1140  */
1141 void edac_mc_handle_error(const enum hw_event_mc_err_type type,
1142 			  struct mem_ctl_info *mci,
1143 			  const u16 error_count,
1144 			  const unsigned long page_frame_number,
1145 			  const unsigned long offset_in_page,
1146 			  const unsigned long syndrome,
1147 			  const int top_layer,
1148 			  const int mid_layer,
1149 			  const int low_layer,
1150 			  const char *msg,
1151 			  const char *other_detail)
1152 {
1153 	char *p;
1154 	int row = -1, chan = -1;
1155 	int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
1156 	int i, n_labels = 0;
1157 	u8 grain_bits;
1158 	struct edac_raw_error_desc *e = &mci->error_desc;
1159 
1160 	edac_dbg(3, "MC%d\n", mci->mc_idx);
1161 
1162 	/* Fills the error report buffer */
1163 	memset(e, 0, sizeof (*e));
1164 	e->error_count = error_count;
1165 	e->top_layer = top_layer;
1166 	e->mid_layer = mid_layer;
1167 	e->low_layer = low_layer;
1168 	e->page_frame_number = page_frame_number;
1169 	e->offset_in_page = offset_in_page;
1170 	e->syndrome = syndrome;
1171 	e->msg = msg;
1172 	e->other_detail = other_detail;
1173 
1174 	/*
1175 	 * Check if the event report is consistent and if the memory
1176 	 * location is known. If it is known, enable_per_layer_report will be
1177 	 * true, the DIMM(s) label info will be filled and the per-layer
1178 	 * error counters will be incremented.
1179 	 */
1180 	for (i = 0; i < mci->n_layers; i++) {
1181 		if (pos[i] >= (int)mci->layers[i].size) {
1182 
1183 			edac_mc_printk(mci, KERN_ERR,
1184 				       "INTERNAL ERROR: %s value is out of range (%d >= %d)\n",
1185 				       edac_layer_name[mci->layers[i].type],
1186 				       pos[i], mci->layers[i].size);
1187 			/*
1188 			 * Instead of just returning it, let's use what's
1189 			 * known about the error. The increment routines and
1190 			 * the DIMM filter logic will do the right thing by
1191 			 * pointing to the likely damaged DIMMs.
1192 			 */
1193 			pos[i] = -1;
1194 		}
1195 		if (pos[i] >= 0)
1196 			e->enable_per_layer_report = true;
1197 	}
1198 
1199 	/*
1200 	 * Get the dimm label/grain that applies to the match criteria.
1201 	 * As the error algorithm may not be able to point to just one memory
1202 	 * stick, the logic here will get all possible labels that could
1203 	 * potentially be affected by the error.
1204 	 * On FB-DIMM memory controllers, for uncorrected errors, it is common
1205 	 * to have only the MC channel and the MC dimm (also called "branch")
1206 	 * but the channel is not known, as the memory is arranged in pairs,
1207 	 * where each memory stick belongs to a separate channel within the same
1208 	 * branch.
1209 	 */
1210 	p = e->label;
1211 	*p = '\0';
1212 
1213 	for (i = 0; i < mci->tot_dimms; i++) {
1214 		struct dimm_info *dimm = mci->dimms[i];
1215 
1216 		if (top_layer >= 0 && top_layer != dimm->location[0])
1217 			continue;
1218 		if (mid_layer >= 0 && mid_layer != dimm->location[1])
1219 			continue;
1220 		if (low_layer >= 0 && low_layer != dimm->location[2])
1221 			continue;
1222 
1223 		/* get the max grain, over the error match range */
1224 		if (dimm->grain > e->grain)
1225 			e->grain = dimm->grain;
1226 
1227 		/*
1228 		 * If the error is memory-controller wide, there's no need to
1229 		 * look for the affected DIMMs because the whole
1230 		 * channel/memory controller/...  may be affected.
1231 		 * Also, don't show errors for empty DIMM slots.
1232 		 */
1233 		if (e->enable_per_layer_report && dimm->nr_pages) {
1234 			if (n_labels >= EDAC_MAX_LABELS) {
1235 				e->enable_per_layer_report = false;
1236 				break;
1237 			}
1238 			n_labels++;
1239 			if (p != e->label) {
1240 				strcpy(p, OTHER_LABEL);
1241 				p += strlen(OTHER_LABEL);
1242 			}
1243 			strcpy(p, dimm->label);
1244 			p += strlen(p);
1245 			*p = '\0';
1246 
1247 			/*
1248 			 * get csrow/channel of the DIMM, in order to allow
1249 			 * incrementing the compat API counters
1250 			 */
1251 			edac_dbg(4, "%s csrows map: (%d,%d)\n",
1252 				 mci->csbased ? "rank" : "dimm",
1253 				 dimm->csrow, dimm->cschannel);
1254 			if (row == -1)
1255 				row = dimm->csrow;
1256 			else if (row >= 0 && row != dimm->csrow)
1257 				row = -2;
1258 
1259 			if (chan == -1)
1260 				chan = dimm->cschannel;
1261 			else if (chan >= 0 && chan != dimm->cschannel)
1262 				chan = -2;
1263 		}
1264 	}
1265 
1266 	if (!e->enable_per_layer_report) {
1267 		strcpy(e->label, "any memory");
1268 	} else {
1269 		edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
1270 		if (p == e->label)
1271 			strcpy(e->label, "unknown memory");
1272 		if (type == HW_EVENT_ERR_CORRECTED) {
1273 			if (row >= 0) {
1274 				mci->csrows[row]->ce_count += error_count;
1275 				if (chan >= 0)
1276 					mci->csrows[row]->channels[chan]->ce_count += error_count;
1277 			}
1278 		} else
1279 			if (row >= 0)
1280 				mci->csrows[row]->ue_count += error_count;
1281 	}
1282 
1283 	/* Fill the RAM location data */
1284 	p = e->location;
1285 
1286 	for (i = 0; i < mci->n_layers; i++) {
1287 		if (pos[i] < 0)
1288 			continue;
1289 
1290 		p += sprintf(p, "%s:%d ",
1291 			     edac_layer_name[mci->layers[i].type],
1292 			     pos[i]);
1293 	}
1294 	if (p > e->location)
1295 		*(p - 1) = '\0';
1296 
1297 	/* Report the error via the trace interface */
1298 	grain_bits = fls_long(e->grain) + 1;
1299 	trace_mc_event(type, e->msg, e->label, e->error_count,
1300 		       mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
1301 		       PAGES_TO_MiB(e->page_frame_number) | e->offset_in_page,
1302 		       grain_bits, e->syndrome, e->other_detail);
1303 
1304 	edac_raw_mc_handle_error(type, mci, e);
1305 }
1306 EXPORT_SYMBOL_GPL(edac_mc_handle_error);
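/*
 * Illustrative call (not from this file): a driver's check or interrupt
 * routine that decoded one corrected error at csrow 1, channel 0, with a
 * known page, offset and syndrome, could report it as:
 *
 *	edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
 *			     page, offset, syndrome,
 *			     1, 0, -1,
 *			     "read error", "");
 *
 * Unknown layer positions are passed as -1; 'page', 'offset' and
 * 'syndrome' stand for values decoded by the hypothetical driver.
 */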
1307