xref: /openbmc/linux/drivers/edac/edac_mc.c (revision 3932b9ca)
/*
 * edac_mc kernel module
 * (C) 2005, 2006 Linux Networx (http://lnxi.com)
 * This file may be distributed under the terms of the
 * GNU General Public License.
 *
 * Written by Thayne Harbaugh
 * Based on work by Dan Hollis <goemon at anime dot net> and others.
 *	http://www.anime.net/~goemon/linux-ecc/
 *
 * Modified by Dave Peterson and Doug Thompson
 *
 */

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/timer.h>
#include <linux/slab.h>
#include <linux/jiffies.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/ctype.h>
#include <linux/edac.h>
#include <linux/bitops.h>
#include <asm/uaccess.h>
#include <asm/page.h>
#include <asm/edac.h>
#include "edac_core.h"
#include "edac_module.h"
#include <ras/ras_event.h>

/* lock to memory controller's control array */
static DEFINE_MUTEX(mem_ctls_mutex);
static LIST_HEAD(mc_devices);

/*
 * Used to lock EDAC MC to just one module, preventing two drivers (e.g.
 * apei/ghes and i7core_edac) from being used at the same time.
 */
static void const *edac_mc_owner;

static struct bus_type mc_bus[EDAC_MAX_MCS];

unsigned edac_dimm_info_location(struct dimm_info *dimm, char *buf,
				 unsigned len)
{
	struct mem_ctl_info *mci = dimm->mci;
	int i, n, count = 0;
	char *p = buf;

	for (i = 0; i < mci->n_layers; i++) {
		n = snprintf(p, len, "%s %d ",
			     edac_layer_name[mci->layers[i].type],
			     dimm->location[i]);
		p += n;
		len -= n;
		count += n;
		if (!len)
			break;
	}

	return count;
}

#ifdef CONFIG_EDAC_DEBUG

static void edac_mc_dump_channel(struct rank_info *chan)
{
	edac_dbg(4, "  channel->chan_idx = %d\n", chan->chan_idx);
	edac_dbg(4, "    channel = %p\n", chan);
	edac_dbg(4, "    channel->csrow = %p\n", chan->csrow);
	edac_dbg(4, "    channel->dimm = %p\n", chan->dimm);
}

static void edac_mc_dump_dimm(struct dimm_info *dimm, int number)
{
	char location[80];

	edac_dimm_info_location(dimm, location, sizeof(location));

	edac_dbg(4, "%s%i: %smapped as virtual row %d, chan %d\n",
		 dimm->mci->csbased ? "rank" : "dimm",
		 number, location, dimm->csrow, dimm->cschannel);
	edac_dbg(4, "  dimm = %p\n", dimm);
	edac_dbg(4, "  dimm->label = '%s'\n", dimm->label);
	edac_dbg(4, "  dimm->nr_pages = 0x%x\n", dimm->nr_pages);
	edac_dbg(4, "  dimm->grain = %d\n", dimm->grain);
}

static void edac_mc_dump_csrow(struct csrow_info *csrow)
{
	edac_dbg(4, "csrow->csrow_idx = %d\n", csrow->csrow_idx);
	edac_dbg(4, "  csrow = %p\n", csrow);
	edac_dbg(4, "  csrow->first_page = 0x%lx\n", csrow->first_page);
	edac_dbg(4, "  csrow->last_page = 0x%lx\n", csrow->last_page);
	edac_dbg(4, "  csrow->page_mask = 0x%lx\n", csrow->page_mask);
	edac_dbg(4, "  csrow->nr_channels = %d\n", csrow->nr_channels);
	edac_dbg(4, "  csrow->channels = %p\n", csrow->channels);
	edac_dbg(4, "  csrow->mci = %p\n", csrow->mci);
}

static void edac_mc_dump_mci(struct mem_ctl_info *mci)
{
	edac_dbg(3, "\tmci = %p\n", mci);
	edac_dbg(3, "\tmci->mtype_cap = %lx\n", mci->mtype_cap);
	edac_dbg(3, "\tmci->edac_ctl_cap = %lx\n", mci->edac_ctl_cap);
	edac_dbg(3, "\tmci->edac_cap = %lx\n", mci->edac_cap);
	edac_dbg(4, "\tmci->edac_check = %p\n", mci->edac_check);
	edac_dbg(3, "\tmci->nr_csrows = %d, csrows = %p\n",
		 mci->nr_csrows, mci->csrows);
	edac_dbg(3, "\tmci->nr_dimms = %d, dimms = %p\n",
		 mci->tot_dimms, mci->dimms);
	edac_dbg(3, "\tdev = %p\n", mci->pdev);
	edac_dbg(3, "\tmod_name:ctl_name = %s:%s\n",
		 mci->mod_name, mci->ctl_name);
	edac_dbg(3, "\tpvt_info = %p\n\n", mci->pvt_info);
}

#endif				/* CONFIG_EDAC_DEBUG */

/*
 * keep those in sync with the enum mem_type
 */
const char *edac_mem_types[] = {
	"Empty csrow",
	"Reserved csrow type",
	"Unknown csrow type",
	"Fast page mode RAM",
	"Extended data out RAM",
	"Burst Extended data out RAM",
	"Single data rate SDRAM",
	"Registered single data rate SDRAM",
	"Double data rate SDRAM",
	"Registered Double data rate SDRAM",
	"Rambus DRAM",
	"Unbuffered DDR2 RAM",
	"Fully buffered DDR2",
	"Registered DDR2 RAM",
	"Rambus XDR",
	"Unbuffered DDR3 RAM",
	"Registered DDR3 RAM",
};
EXPORT_SYMBOL_GPL(edac_mem_types);
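
/*
 * Usage sketch (not from this file): a driver that has decoded its DRAM
 * type can log the human-readable name by indexing this table with an
 * enum mem_type value; MEM_DDR3 below is an illustrative choice:
 *
 *	enum mem_type mt = MEM_DDR3;
 *
 *	edac_printk(KERN_INFO, EDAC_MC, "DIMM type: %s\n",
 *		    edac_mem_types[mt]);
 */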

/**
 * edac_align_ptr - Prepares the pointer offsets for a single-shot allocation
 * @p:		pointer to a pointer with the memory offset to be used. At
 *		return, this will be incremented to point to the next offset
 * @size:	Size of the data structure to be reserved
 * @n_elems:	Number of elements that should be reserved
 *
 * If 'size' is a constant, the compiler will optimize this whole function
 * down to either a no-op or the addition of a constant to the value of '*p'.
 *
 * The 'p' pointer is absolutely needed to keep advancing to the proper
 * offsets in memory when allocating a struct along with its embedded
 * structs, as edac_device_alloc_ctl_info() does, for example.
 *
 * At return, the pointer 'p' will be incremented to be used on a next call
 * to this function.
 */
void *edac_align_ptr(void **p, unsigned size, int n_elems)
{
	unsigned align, r;
	void *ptr = *p;

	*p += size * n_elems;

	/*
	 * 'p' can possibly be an unaligned item X such that sizeof(X) is
	 * 'size'.  Adjust 'p' so that its alignment is at least as
	 * stringent as what the compiler would provide for X and return
	 * the aligned result.
	 * Here we assume that the alignment of a "long long" is the most
	 * stringent alignment that the compiler will ever provide by default.
	 * As far as I know, this is a reasonable assumption.
	 */
	if (size > sizeof(long))
		align = sizeof(long long);
	else if (size > sizeof(int))
		align = sizeof(long);
	else if (size > sizeof(short))
		align = sizeof(int);
	else if (size > sizeof(char))
		align = sizeof(short);
	else
		return (char *)ptr;

	/* Misalignment of the running offset itself, not of the address of 'p' */
	r = (unsigned long)ptr % align;

	if (r == 0)
		return (char *)ptr;

	*p += align - r;

	return (void *)(((unsigned long)ptr) + align - r);
}
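
/*
 * A minimal sketch of the intended two-pass pattern (struct foo and the
 * variable names are illustrative, not part of this file): pass 1 walks a
 * NULL base pointer so every returned value is an offset, then a single
 * kzalloc() covers the whole layout and each offset is rebased into it,
 * mirroring what edac_mc_alloc() below does with its embedded arrays:
 *
 *	void *ptr = NULL;
 *	struct foo *f;
 *	u32 *counters;
 *	unsigned long size;
 *
 *	f = edac_align_ptr(&ptr, sizeof(*f), 1);
 *	counters = edac_align_ptr(&ptr, sizeof(u32), n);
 *	size = (unsigned long)ptr;	(total bytes needed for both)
 *
 *	f = kzalloc(size, GFP_KERNEL);
 *	if (!f)
 *		return NULL;
 *	counters = (u32 *)((char *)f + (unsigned long)counters);
 */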

static void _edac_mc_free(struct mem_ctl_info *mci)
{
	int i, chn, row;
	struct csrow_info *csr;
	const unsigned int tot_dimms = mci->tot_dimms;
	const unsigned int tot_channels = mci->num_cschannel;
	const unsigned int tot_csrows = mci->nr_csrows;

	if (mci->dimms) {
		for (i = 0; i < tot_dimms; i++)
			kfree(mci->dimms[i]);
		kfree(mci->dimms);
	}
	if (mci->csrows) {
		for (row = 0; row < tot_csrows; row++) {
			csr = mci->csrows[row];
			if (csr) {
				if (csr->channels) {
					for (chn = 0; chn < tot_channels; chn++)
						kfree(csr->channels[chn]);
					kfree(csr->channels);
				}
				kfree(csr);
			}
		}
		kfree(mci->csrows);
	}
	kfree(mci);
}

/**
 * edac_mc_alloc: Allocate and partially fill a struct mem_ctl_info structure
 * @mc_num:		Memory controller number
 * @n_layers:		Number of MC hierarchy layers
 * @layers:		Describes each layer as seen by the Memory Controller
 * @sz_pvt:		size of private storage needed
 *
 * Everything is kmalloc'ed as one big chunk - more efficient.
 * Only can be used if all structures have the same lifetime - otherwise
 * you have to allocate and initialize your own structures.
 *
 * Use edac_mc_free() to free mc structures allocated by this function.
 *
 * NOTE: drivers handle multi-rank memories in different ways: in some
 * drivers, one multi-rank memory stick is mapped as one entry, while, in
 * others, a single multi-rank memory stick would be mapped into several
 * entries. Currently, this function will allocate multiple struct dimm_info
 * on such scenarios, as grouping the multiple ranks would require changes
 * in the drivers.
 *
 * Returns:
 *	On failure: NULL
 *	On success: struct mem_ctl_info pointer
 */
struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
				   unsigned n_layers,
				   struct edac_mc_layer *layers,
				   unsigned sz_pvt)
{
	struct mem_ctl_info *mci;
	struct edac_mc_layer *layer;
	struct csrow_info *csr;
	struct rank_info *chan;
	struct dimm_info *dimm;
	u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
	unsigned pos[EDAC_MAX_LAYERS];
	unsigned size, tot_dimms = 1, count = 1;
	unsigned tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
	void *pvt, *p, *ptr = NULL;
	int i, j, row, chn, n, len, off;
	bool per_rank = false;

	BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0);
	/*
	 * Calculate the total amount of dimms and csrows/cschannels while
	 * in the old API emulation mode
	 */
	for (i = 0; i < n_layers; i++) {
		tot_dimms *= layers[i].size;
		if (layers[i].is_virt_csrow)
			tot_csrows *= layers[i].size;
		else
			tot_channels *= layers[i].size;

		if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
			per_rank = true;
	}

	/* Figure out the offsets of the various items from the start of an mc
	 * structure.  We want the alignment of each item to be at least as
	 * stringent as what the compiler would provide if we could simply
	 * hardcode everything into a single struct.
	 */
	mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
	layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
	for (i = 0; i < n_layers; i++) {
		count *= layers[i].size;
		edac_dbg(4, "errcount layer %d size %d\n", i, count);
		ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
		ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
		tot_errcount += 2 * count;
	}

	edac_dbg(4, "allocating %d error counters\n", tot_errcount);
	pvt = edac_align_ptr(&ptr, sz_pvt, 1);
	size = ((unsigned long)pvt) + sz_pvt;

	edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
		 size,
		 tot_dimms,
		 per_rank ? "ranks" : "dimms",
		 tot_csrows * tot_channels);

	mci = kzalloc(size, GFP_KERNEL);
	if (mci == NULL)
		return NULL;

	/* Adjust pointers so they point within the memory we just allocated
	 * rather than an imaginary chunk of memory located at address 0.
	 */
	layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
	for (i = 0; i < n_layers; i++) {
		mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i]));
		mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i]));
	}
	pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;

	/* setup index and various internal pointers */
	mci->mc_idx = mc_num;
	mci->tot_dimms = tot_dimms;
	mci->pvt_info = pvt;
	mci->n_layers = n_layers;
	mci->layers = layer;
	memcpy(mci->layers, layers, sizeof(*layer) * n_layers);
	mci->nr_csrows = tot_csrows;
	mci->num_cschannel = tot_channels;
	mci->csbased = per_rank;

	/*
	 * Allocate and fill the csrow/channels structs
	 */
	mci->csrows = kcalloc(tot_csrows, sizeof(*mci->csrows), GFP_KERNEL);
	if (!mci->csrows)
		goto error;
	for (row = 0; row < tot_csrows; row++) {
		csr = kzalloc(sizeof(**mci->csrows), GFP_KERNEL);
		if (!csr)
			goto error;
		mci->csrows[row] = csr;
		csr->csrow_idx = row;
		csr->mci = mci;
		csr->nr_channels = tot_channels;
		csr->channels = kcalloc(tot_channels, sizeof(*csr->channels),
					GFP_KERNEL);
		if (!csr->channels)
			goto error;

		for (chn = 0; chn < tot_channels; chn++) {
			chan = kzalloc(sizeof(**csr->channels), GFP_KERNEL);
			if (!chan)
				goto error;
			csr->channels[chn] = chan;
			chan->chan_idx = chn;
			chan->csrow = csr;
		}
	}

	/*
	 * Allocate and fill the dimm structs
	 */
	mci->dimms = kcalloc(tot_dimms, sizeof(*mci->dimms), GFP_KERNEL);
	if (!mci->dimms)
		goto error;

	memset(&pos, 0, sizeof(pos));
	row = 0;
	chn = 0;
	for (i = 0; i < tot_dimms; i++) {
		chan = mci->csrows[row]->channels[chn];
		off = EDAC_DIMM_OFF(layer, n_layers, pos[0], pos[1], pos[2]);
		if (off < 0 || off >= tot_dimms) {
			edac_mc_printk(mci, KERN_ERR, "EDAC core bug: EDAC_DIMM_OFF is trying to do an illegal data access\n");
			goto error;
		}

		dimm = kzalloc(sizeof(**mci->dimms), GFP_KERNEL);
		if (!dimm)
			goto error;
		mci->dimms[off] = dimm;
		dimm->mci = mci;

		/*
		 * Copy DIMM location and initialize it.
		 */
		len = sizeof(dimm->label);
		p = dimm->label;
		n = snprintf(p, len, "mc#%u", mc_num);
		p += n;
		len -= n;
		for (j = 0; j < n_layers; j++) {
			n = snprintf(p, len, "%s#%u",
				     edac_layer_name[layers[j].type],
				     pos[j]);
			p += n;
			len -= n;
			dimm->location[j] = pos[j];

			if (len <= 0)
				break;
		}

		/* Link it to the csrows old API data */
		chan->dimm = dimm;
		dimm->csrow = row;
		dimm->cschannel = chn;

		/* Increment csrow location */
		if (layers[0].is_virt_csrow) {
			chn++;
			if (chn == tot_channels) {
				chn = 0;
				row++;
			}
		} else {
			row++;
			if (row == tot_csrows) {
				row = 0;
				chn++;
			}
		}

		/* Increment dimm location */
		for (j = n_layers - 1; j >= 0; j--) {
			pos[j]++;
			if (pos[j] < layers[j].size)
				break;
			pos[j] = 0;
		}
	}

	mci->op_state = OP_ALLOC;

	return mci;

error:
	_edac_mc_free(mci);

	return NULL;
}
EXPORT_SYMBOL_GPL(edac_mc_alloc);
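
/*
 * Allocation sketch for a hypothetical controller with four chip-select
 * rows of two channels each (the sizes and the my_pvt type are
 * assumptions, not from this file):
 *
 *	struct edac_mc_layer layers[2];
 *	struct mem_ctl_info *mci;
 *
 *	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
 *	layers[0].size = 4;
 *	layers[0].is_virt_csrow = true;
 *	layers[1].type = EDAC_MC_LAYER_CHANNEL;
 *	layers[1].size = 2;
 *	layers[1].is_virt_csrow = false;
 *
 *	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers,
 *			    sizeof(struct my_pvt));
 *	if (!mci)
 *		return -ENOMEM;
 */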

/**
 * edac_mc_free
 *	'Free' a previously allocated 'mci' structure
 * @mci: pointer to a struct mem_ctl_info structure
 */
void edac_mc_free(struct mem_ctl_info *mci)
{
	edac_dbg(1, "\n");

	/* If we're not yet registered with sysfs, free only what was
	 * allocated in edac_mc_alloc().
	 */
	if (!device_is_registered(&mci->dev)) {
		_edac_mc_free(mci);
		return;
	}

	/* the mci instance is freed here, when the sysfs object is dropped */
	edac_unregister_sysfs(mci);
}
EXPORT_SYMBOL_GPL(edac_mc_free);

/**
 * find_mci_by_dev
 *
 *	scan the list of controllers looking for the one that manages
 *	the 'dev' device
 * @dev: pointer to a struct device related to the MCI
 */
struct mem_ctl_info *find_mci_by_dev(struct device *dev)
{
	struct mem_ctl_info *mci;
	struct list_head *item;

	edac_dbg(3, "\n");

	list_for_each(item, &mc_devices) {
		mci = list_entry(item, struct mem_ctl_info, link);

		if (mci->pdev == dev)
			return mci;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(find_mci_by_dev);

/*
 * handler for EDAC to check if NMI type handler has asserted interrupt
 */
static int edac_mc_assert_error_check_and_clear(void)
{
	int old_state;

	if (edac_op_state == EDAC_OPSTATE_POLL)
		return 1;

	old_state = edac_err_assert;
	edac_err_assert = 0;

	return old_state;
}

/*
 * edac_mc_workq_function
 *	performs the operation scheduled by a workq request
 */
static void edac_mc_workq_function(struct work_struct *work_req)
{
	struct delayed_work *d_work = to_delayed_work(work_req);
	struct mem_ctl_info *mci = to_edac_mem_ctl_work(d_work);

	mutex_lock(&mem_ctls_mutex);

	/* if this control struct has moved to offline state, we are done */
	if (mci->op_state == OP_OFFLINE) {
		mutex_unlock(&mem_ctls_mutex);
		return;
	}

	/* Only poll controllers that are running polled and have a check */
	if (edac_mc_assert_error_check_and_clear() && (mci->edac_check != NULL))
		mci->edac_check(mci);

	mutex_unlock(&mem_ctls_mutex);

	/* Reschedule */
	queue_delayed_work(edac_workqueue, &mci->work,
			msecs_to_jiffies(edac_mc_get_poll_msec()));
}

/*
 * edac_mc_workq_setup
 *	initialize a workq item for this mci
 *	passing in the new delay period in msec
 *
 *	locking model:
 *
 *		called with the mem_ctls_mutex held
 */
static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec,
				bool init)
{
	edac_dbg(0, "\n");

	/* if this instance is not in the POLL state, then simply return */
	if (mci->op_state != OP_RUNNING_POLL)
		return;

	if (init)
		INIT_DELAYED_WORK(&mci->work, edac_mc_workq_function);

	mod_delayed_work(edac_workqueue, &mci->work, msecs_to_jiffies(msec));
}

/*
 * edac_mc_workq_teardown
 *	stop the workq processing on this mci
 *
 *	locking model:
 *
 *		called WITHOUT lock held
 */
static void edac_mc_workq_teardown(struct mem_ctl_info *mci)
{
	int status;

	if (mci->op_state != OP_RUNNING_POLL)
		return;

	status = cancel_delayed_work(&mci->work);
	if (status == 0) {
		edac_dbg(0, "not canceled, flush the queue\n");

		/* workq instance might be running, wait for it */
		flush_workqueue(edac_workqueue);
	}
}

/*
 * edac_mc_reset_delay_period(unsigned long value)
 *
 *	user space has updated our poll period value, need to
 *	reset our workq delays
 */
void edac_mc_reset_delay_period(unsigned long value)
{
	struct mem_ctl_info *mci;
	struct list_head *item;

	mutex_lock(&mem_ctls_mutex);

	list_for_each(item, &mc_devices) {
		mci = list_entry(item, struct mem_ctl_info, link);

		edac_mc_workq_setup(mci, value, false);
	}

	mutex_unlock(&mem_ctls_mutex);
}

/* Return 0 on success, 1 on failure.
 * Before calling this function, caller must
 * assign a unique value to mci->mc_idx.
 *
 *	locking model:
 *
 *		called with the mem_ctls_mutex lock held
 */
static int add_mc_to_global_list(struct mem_ctl_info *mci)
{
	struct list_head *item, *insert_before;
	struct mem_ctl_info *p;

	insert_before = &mc_devices;

	p = find_mci_by_dev(mci->pdev);
	if (unlikely(p != NULL))
		goto fail0;

	list_for_each(item, &mc_devices) {
		p = list_entry(item, struct mem_ctl_info, link);

		if (p->mc_idx >= mci->mc_idx) {
			if (unlikely(p->mc_idx == mci->mc_idx))
				goto fail1;

			insert_before = item;
			break;
		}
	}

	list_add_tail_rcu(&mci->link, insert_before);
	atomic_inc(&edac_handlers);
	return 0;

fail0:
	edac_printk(KERN_WARNING, EDAC_MC,
		"%s (%s) %s %s already assigned %d\n", dev_name(p->pdev),
		edac_dev_name(mci), p->mod_name, p->ctl_name, p->mc_idx);
	return 1;

fail1:
	edac_printk(KERN_WARNING, EDAC_MC,
		"bug in low-level driver: attempt to assign\n"
		"    duplicate mc_idx %d in %s()\n", p->mc_idx, __func__);
	return 1;
}

static int del_mc_from_global_list(struct mem_ctl_info *mci)
{
	int handlers = atomic_dec_return(&edac_handlers);

	list_del_rcu(&mci->link);

	/* these are for safe removal of devices from global list while
	 * NMI handlers may be traversing list
	 */
	synchronize_rcu();
	INIT_LIST_HEAD(&mci->link);

	return handlers;
}

/**
 * edac_mc_find: Search for a mem_ctl_info structure whose index is 'idx'.
 *
 * If found, return a pointer to the structure.
 * Else return NULL.
 *
 * Caller must hold mem_ctls_mutex.
 */
struct mem_ctl_info *edac_mc_find(int idx)
{
	struct list_head *item;
	struct mem_ctl_info *mci;

	list_for_each(item, &mc_devices) {
		mci = list_entry(item, struct mem_ctl_info, link);

		if (mci->mc_idx >= idx) {
			if (mci->mc_idx == idx)
				return mci;

			break;
		}
	}

	return NULL;
}
EXPORT_SYMBOL(edac_mc_find);

/**
 * edac_mc_add_mc: Insert the 'mci' structure into the mci global list and
 *                 create sysfs entries associated with mci structure
 * @mci: pointer to the mci structure to be added to the list
 *
 * Return:
 *	0	Success
 *	!0	Failure
 */

/* FIXME - should a warning be printed if no error detection? correction? */
int edac_mc_add_mc(struct mem_ctl_info *mci)
{
	int ret = -EINVAL;

	edac_dbg(0, "\n");

	if (mci->mc_idx >= EDAC_MAX_MCS) {
		pr_warn_once("Too many memory controllers: %d\n", mci->mc_idx);
		return -ENODEV;
	}

#ifdef CONFIG_EDAC_DEBUG
	if (edac_debug_level >= 3)
		edac_mc_dump_mci(mci);

	if (edac_debug_level >= 4) {
		int i;

		for (i = 0; i < mci->nr_csrows; i++) {
			struct csrow_info *csrow = mci->csrows[i];
			u32 nr_pages = 0;
			int j;

			for (j = 0; j < csrow->nr_channels; j++)
				nr_pages += csrow->channels[j]->dimm->nr_pages;
			if (!nr_pages)
				continue;
			edac_mc_dump_csrow(csrow);
			for (j = 0; j < csrow->nr_channels; j++)
				if (csrow->channels[j]->dimm->nr_pages)
					edac_mc_dump_channel(csrow->channels[j]);
		}
		for (i = 0; i < mci->tot_dimms; i++)
			if (mci->dimms[i]->nr_pages)
				edac_mc_dump_dimm(mci->dimms[i], i);
	}
#endif
	mutex_lock(&mem_ctls_mutex);

	if (edac_mc_owner && edac_mc_owner != mci->mod_name) {
		ret = -EPERM;
		goto fail0;
	}

	if (add_mc_to_global_list(mci))
		goto fail0;

	/* set load time so that error rate can be tracked */
	mci->start_time = jiffies;

	mci->bus = &mc_bus[mci->mc_idx];

	if (edac_create_sysfs_mci_device(mci)) {
		edac_mc_printk(mci, KERN_WARNING,
			"failed to create sysfs device\n");
		goto fail1;
	}

	/* If there IS a check routine, then we are running POLLED */
	if (mci->edac_check != NULL) {
		/* This instance is NOW RUNNING */
		mci->op_state = OP_RUNNING_POLL;

		edac_mc_workq_setup(mci, edac_mc_get_poll_msec(), true);
	} else {
		mci->op_state = OP_RUNNING_INTERRUPT;
	}

	/* Report action taken */
	edac_mc_printk(mci, KERN_INFO,
		"Giving out device to module %s controller %s: DEV %s (%s)\n",
		mci->mod_name, mci->ctl_name, mci->dev_name,
		edac_op_state_to_string(mci->op_state));

	edac_mc_owner = mci->mod_name;

	mutex_unlock(&mem_ctls_mutex);
	return 0;

fail1:
	del_mc_from_global_list(mci);

fail0:
	mutex_unlock(&mem_ctls_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(edac_mc_add_mc);
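
/*
 * Registration sketch for a driver's probe routine; the capability flags
 * and my_edac_check() are illustrative assumptions, not from this file.
 * Setting ->edac_check selects OP_RUNNING_POLL above; leaving it NULL
 * selects OP_RUNNING_INTERRUPT:
 *
 *	mci->pdev = &pdev->dev;
 *	mci->mtype_cap = MEM_FLAG_DDR3;
 *	mci->edac_ctl_cap = EDAC_FLAG_SECDED;
 *	mci->mod_name = "my_edac";
 *	mci->ctl_name = "my_controller";
 *	mci->edac_check = my_edac_check;
 *
 *	if (edac_mc_add_mc(mci)) {
 *		edac_mc_free(mci);
 *		return -ENODEV;
 *	}
 */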

/**
 * edac_mc_del_mc: Remove sysfs entries for specified mci structure and
 *                 remove mci structure from global list
 * @dev: Pointer to 'struct device' representing mci structure to remove.
 *
 * Return pointer to removed mci structure, or NULL if device not found.
 */
struct mem_ctl_info *edac_mc_del_mc(struct device *dev)
{
	struct mem_ctl_info *mci;

	edac_dbg(0, "\n");

	mutex_lock(&mem_ctls_mutex);

	/* find the requested mci struct in the global list */
	mci = find_mci_by_dev(dev);
	if (mci == NULL) {
		mutex_unlock(&mem_ctls_mutex);
		return NULL;
	}

	if (!del_mc_from_global_list(mci))
		edac_mc_owner = NULL;
	mutex_unlock(&mem_ctls_mutex);

	/* flush workq processes */
	edac_mc_workq_teardown(mci);

	/* marking MCI offline */
	mci->op_state = OP_OFFLINE;

	/* remove from sysfs */
	edac_remove_sysfs_mci_device(mci);

	edac_printk(KERN_INFO, EDAC_MC,
		"Removed device %d for %s %s: DEV %s\n", mci->mc_idx,
		mci->mod_name, mci->ctl_name, edac_dev_name(mci));

	return mci;
}
EXPORT_SYMBOL_GPL(edac_mc_del_mc);
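
/*
 * Matching teardown sketch for a driver's remove routine, using the same
 * device that was assigned to mci->pdev at probe time (an assumption for
 * this example):
 *
 *	struct mem_ctl_info *mci = edac_mc_del_mc(&pdev->dev);
 *
 *	if (mci)
 *		edac_mc_free(mci);
 */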

static void edac_mc_scrub_block(unsigned long page, unsigned long offset,
				u32 size)
{
	struct page *pg;
	void *virt_addr;
	unsigned long flags = 0;

	edac_dbg(3, "\n");

	/* ECC error page was not in our memory. Ignore it. */
	if (!pfn_valid(page))
		return;

	/* Find the actual page structure then map it and fix */
	pg = pfn_to_page(page);

	if (PageHighMem(pg))
		local_irq_save(flags);

	virt_addr = kmap_atomic(pg);

	/* Perform architecture specific atomic scrub operation */
	atomic_scrub(virt_addr + offset, size);

	/* Unmap and complete */
	kunmap_atomic(virt_addr);

	if (PageHighMem(pg))
		local_irq_restore(flags);
}

/* FIXME - should return -1 */
int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, unsigned long page)
{
	struct csrow_info **csrows = mci->csrows;
	int row, i, j, n;

	edac_dbg(1, "MC%d: 0x%lx\n", mci->mc_idx, page);
	row = -1;

	for (i = 0; i < mci->nr_csrows; i++) {
		struct csrow_info *csrow = csrows[i];
		n = 0;
		for (j = 0; j < csrow->nr_channels; j++) {
			struct dimm_info *dimm = csrow->channels[j]->dimm;
			n += dimm->nr_pages;
		}
		if (n == 0)
			continue;

		edac_dbg(3, "MC%d: first(0x%lx) page(0x%lx) last(0x%lx) mask(0x%lx)\n",
			 mci->mc_idx,
			 csrow->first_page, page, csrow->last_page,
			 csrow->page_mask);

		if ((page >= csrow->first_page) &&
		    (page <= csrow->last_page) &&
		    ((page & csrow->page_mask) ==
		     (csrow->first_page & csrow->page_mask))) {
			row = i;
			break;
		}
	}

	if (row == -1)
		edac_mc_printk(mci, KERN_ERR,
			"could not look up page error address %lx\n",
			(unsigned long)page);

	return row;
}
EXPORT_SYMBOL_GPL(edac_mc_find_csrow_by_page);

const char *edac_layer_name[] = {
	[EDAC_MC_LAYER_BRANCH] = "branch",
	[EDAC_MC_LAYER_CHANNEL] = "channel",
	[EDAC_MC_LAYER_SLOT] = "slot",
	[EDAC_MC_LAYER_CHIP_SELECT] = "csrow",
	[EDAC_MC_LAYER_ALL_MEM] = "memory",
};
EXPORT_SYMBOL_GPL(edac_layer_name);

static void edac_inc_ce_error(struct mem_ctl_info *mci,
			      bool enable_per_layer_report,
			      const int pos[EDAC_MAX_LAYERS],
			      const u16 count)
{
	int i, index = 0;

	mci->ce_mc += count;

	if (!enable_per_layer_report) {
		mci->ce_noinfo_count += count;
		return;
	}

	for (i = 0; i < mci->n_layers; i++) {
		if (pos[i] < 0)
			break;
		index += pos[i];
		mci->ce_per_layer[i][index] += count;

		if (i < mci->n_layers - 1)
			index *= mci->layers[i + 1].size;
	}
}

static void edac_inc_ue_error(struct mem_ctl_info *mci,
			      bool enable_per_layer_report,
			      const int pos[EDAC_MAX_LAYERS],
			      const u16 count)
{
	int i, index = 0;

	mci->ue_mc += count;

	if (!enable_per_layer_report) {
		/* count UEs without location info against the UE total */
		mci->ue_noinfo_count += count;
		return;
	}

	for (i = 0; i < mci->n_layers; i++) {
		if (pos[i] < 0)
			break;
		index += pos[i];
		mci->ue_per_layer[i][index] += count;

		if (i < mci->n_layers - 1)
			index *= mci->layers[i + 1].size;
	}
}

static void edac_ce_error(struct mem_ctl_info *mci,
			  const u16 error_count,
			  const int pos[EDAC_MAX_LAYERS],
			  const char *msg,
			  const char *location,
			  const char *label,
			  const char *detail,
			  const char *other_detail,
			  const bool enable_per_layer_report,
			  const unsigned long page_frame_number,
			  const unsigned long offset_in_page,
			  long grain)
{
	unsigned long remapped_page;
	char *msg_aux = "";

	if (*msg)
		msg_aux = " ";

	if (edac_mc_get_log_ce()) {
		if (other_detail && *other_detail)
			edac_mc_printk(mci, KERN_WARNING,
				       "%d CE %s%son %s (%s %s - %s)\n",
				       error_count, msg, msg_aux, label,
				       location, detail, other_detail);
		else
			edac_mc_printk(mci, KERN_WARNING,
				       "%d CE %s%son %s (%s %s)\n",
				       error_count, msg, msg_aux, label,
				       location, detail);
	}
	edac_inc_ce_error(mci, enable_per_layer_report, pos, error_count);

	if (mci->scrub_mode == SCRUB_SW_SRC) {
		/*
		 * Some memory controllers (called MCs below) can remap
		 * memory so that it is still available at a different
		 * address when PCI devices map into memory.
		 * MCs that can't do this lose the memory where PCI
		 * devices are mapped. This mapping is MC-dependent
		 * and so we call back into the MC driver for it to
		 * map the MC page to a physical (CPU) page which can
		 * then be mapped to a virtual page - which can then
		 * be scrubbed.
		 */
		remapped_page = mci->ctl_page_to_phys ?
			mci->ctl_page_to_phys(mci, page_frame_number) :
			page_frame_number;

		edac_mc_scrub_block(remapped_page,
				    offset_in_page, grain);
	}
}

static void edac_ue_error(struct mem_ctl_info *mci,
			  const u16 error_count,
			  const int pos[EDAC_MAX_LAYERS],
			  const char *msg,
			  const char *location,
			  const char *label,
			  const char *detail,
			  const char *other_detail,
			  const bool enable_per_layer_report)
{
	char *msg_aux = "";

	if (*msg)
		msg_aux = " ";

	if (edac_mc_get_log_ue()) {
		if (other_detail && *other_detail)
			edac_mc_printk(mci, KERN_WARNING,
				       "%d UE %s%son %s (%s %s - %s)\n",
				       error_count, msg, msg_aux, label,
				       location, detail, other_detail);
		else
			edac_mc_printk(mci, KERN_WARNING,
				       "%d UE %s%son %s (%s %s)\n",
				       error_count, msg, msg_aux, label,
				       location, detail);
	}

	if (edac_mc_get_panic_on_ue()) {
		if (other_detail && *other_detail)
			panic("UE %s%son %s (%s %s - %s)\n",
			      msg, msg_aux, label, location, detail, other_detail);
		else
			panic("UE %s%son %s (%s %s)\n",
			      msg, msg_aux, label, location, detail);
	}

	edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count);
}

/**
 * edac_raw_mc_handle_error - reports a memory event to userspace without doing
 *			      anything to discover the error location
 *
 * @type:		severity of the error (CE/UE/Fatal)
 * @mci:		a struct mem_ctl_info pointer
 * @e:			error description
 *
 * This raw function is used internally by edac_mc_handle_error(). It should
 * only be called directly when the hardware error comes directly from BIOS,
 * as in the case of the APEI GHES driver.
 */
void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
			      struct mem_ctl_info *mci,
			      struct edac_raw_error_desc *e)
{
	char detail[80];
	int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer };

	/* Memory type dependent details about the error */
	if (type == HW_EVENT_ERR_CORRECTED) {
		snprintf(detail, sizeof(detail),
			"page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
			e->page_frame_number, e->offset_in_page,
			e->grain, e->syndrome);
		edac_ce_error(mci, e->error_count, pos, e->msg, e->location, e->label,
			      detail, e->other_detail, e->enable_per_layer_report,
			      e->page_frame_number, e->offset_in_page, e->grain);
	} else {
		snprintf(detail, sizeof(detail),
			"page:0x%lx offset:0x%lx grain:%ld",
			e->page_frame_number, e->offset_in_page, e->grain);

		edac_ue_error(mci, e->error_count, pos, e->msg, e->location, e->label,
			      detail, e->other_detail, e->enable_per_layer_report);
	}
}
EXPORT_SYMBOL_GPL(edac_raw_mc_handle_error);
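
/*
 * Firmware-first usage sketch (modeled loosely on the GHES driver; the
 * field values shown are assumptions): the caller fills the descriptor
 * embedded in the mci itself, then invokes the raw handler directly:
 *
 *	struct edac_raw_error_desc *e = &mci->error_desc;
 *
 *	memset(e, 0, sizeof(*e));
 *	e->error_count = 1;
 *	e->grain = 64;
 *	e->msg = "memory read error";
 *	strcpy(e->label, "DIMM A1");	(normally from firmware DIMM info)
 *	edac_raw_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, e);
 */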

/**
 * edac_mc_handle_error - reports a memory event to userspace
 *
 * @type:		severity of the error (CE/UE/Fatal)
 * @mci:		a struct mem_ctl_info pointer
 * @error_count:	Number of errors of the same type
 * @page_frame_number:	mem page where the error occurred
 * @offset_in_page:	offset of the error inside the page
 * @syndrome:		ECC syndrome
 * @top_layer:		Memory layer[0] position
 * @mid_layer:		Memory layer[1] position
 * @low_layer:		Memory layer[2] position
 * @msg:		Message meaningful to the end users that
 *			explains the event
 * @other_detail:	Technical details about the event that
 *			may help hardware manufacturers and
 *			EDAC developers to analyse the event
 */
void edac_mc_handle_error(const enum hw_event_mc_err_type type,
			  struct mem_ctl_info *mci,
			  const u16 error_count,
			  const unsigned long page_frame_number,
			  const unsigned long offset_in_page,
			  const unsigned long syndrome,
			  const int top_layer,
			  const int mid_layer,
			  const int low_layer,
			  const char *msg,
			  const char *other_detail)
{
	char *p;
	int row = -1, chan = -1;
	int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
	int i, n_labels = 0;
	u8 grain_bits;
	struct edac_raw_error_desc *e = &mci->error_desc;

	edac_dbg(3, "MC%d\n", mci->mc_idx);

	/* Fills the error report buffer */
	memset(e, 0, sizeof(*e));
	e->error_count = error_count;
	e->top_layer = top_layer;
	e->mid_layer = mid_layer;
	e->low_layer = low_layer;
	e->page_frame_number = page_frame_number;
	e->offset_in_page = offset_in_page;
	e->syndrome = syndrome;
	e->msg = msg;
	e->other_detail = other_detail;

	/*
	 * Check if the event report is consistent and if the memory
	 * location is known. If it is known, enable_per_layer_report will be
	 * true, the DIMM(s) label info will be filled and the per-layer
	 * error counters will be incremented.
	 */
	for (i = 0; i < mci->n_layers; i++) {
		if (pos[i] >= (int)mci->layers[i].size) {
			edac_mc_printk(mci, KERN_ERR,
				       "INTERNAL ERROR: %s value is out of range (%d >= %d)\n",
				       edac_layer_name[mci->layers[i].type],
				       pos[i], mci->layers[i].size);
			/*
			 * Instead of just returning it, let's use what's
			 * known about the error. The increment routines and
			 * the DIMM filter logic will do the right thing by
			 * pointing at the likely damaged DIMMs.
			 */
			pos[i] = -1;
		}
		if (pos[i] >= 0)
			e->enable_per_layer_report = true;
	}

	/*
	 * Get the dimm label/grain that applies to the match criteria.
	 * As the error algorithm may not be able to point to just one memory
	 * stick, the logic here will get all possible labels that could
	 * potentially be affected by the error.
	 * On FB-DIMM memory controllers, for uncorrected errors, it is common
	 * to have only the MC channel and the MC dimm (also called "branch")
	 * but the channel is not known, as the memory is arranged in pairs,
	 * where each memory belongs to a separate channel within the same
	 * branch.
	 */
	p = e->label;
	*p = '\0';

	for (i = 0; i < mci->tot_dimms; i++) {
		struct dimm_info *dimm = mci->dimms[i];

		if (top_layer >= 0 && top_layer != dimm->location[0])
			continue;
		if (mid_layer >= 0 && mid_layer != dimm->location[1])
			continue;
		if (low_layer >= 0 && low_layer != dimm->location[2])
			continue;

		/* get the max grain, over the error match range */
		if (dimm->grain > e->grain)
			e->grain = dimm->grain;

		/*
		 * If the error is memory-controller wide, there's no need to
		 * seek for the affected DIMMs because the whole
		 * channel/memory controller/...  may be affected.
		 * Also, don't show errors for empty DIMM slots.
		 */
		if (e->enable_per_layer_report && dimm->nr_pages) {
			if (n_labels >= EDAC_MAX_LABELS) {
				e->enable_per_layer_report = false;
				break;
			}
			n_labels++;
			if (p != e->label) {
				strcpy(p, OTHER_LABEL);
				p += strlen(OTHER_LABEL);
			}
			strcpy(p, dimm->label);
			p += strlen(p);
			*p = '\0';

			/*
			 * get csrow/channel of the DIMM, in order to allow
			 * incrementing the compat API counters
			 */
			edac_dbg(4, "%s csrows map: (%d,%d)\n",
				 mci->csbased ? "rank" : "dimm",
				 dimm->csrow, dimm->cschannel);
			if (row == -1)
				row = dimm->csrow;
			else if (row >= 0 && row != dimm->csrow)
				row = -2;

			if (chan == -1)
				chan = dimm->cschannel;
			else if (chan >= 0 && chan != dimm->cschannel)
				chan = -2;
		}
	}

	if (!e->enable_per_layer_report) {
		strcpy(e->label, "any memory");
	} else {
		edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
		if (p == e->label)
			strcpy(e->label, "unknown memory");
		if (type == HW_EVENT_ERR_CORRECTED) {
			if (row >= 0) {
				mci->csrows[row]->ce_count += error_count;
				if (chan >= 0)
					mci->csrows[row]->channels[chan]->ce_count += error_count;
			}
		} else
			if (row >= 0)
				mci->csrows[row]->ue_count += error_count;
	}

	/* Fill the RAM location data */
	p = e->location;

	for (i = 0; i < mci->n_layers; i++) {
		if (pos[i] < 0)
			continue;

		p += sprintf(p, "%s:%d ",
			     edac_layer_name[mci->layers[i].type],
			     pos[i]);
	}
	if (p > e->location)
		*(p - 1) = '\0';

	/* Report the error via the trace interface */
	grain_bits = fls_long(e->grain) + 1;
	trace_mc_event(type, e->msg, e->label, e->error_count,
		       mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
		       PAGES_TO_MiB(e->page_frame_number) | e->offset_in_page,
		       grain_bits, e->syndrome, e->other_detail);

	edac_raw_mc_handle_error(type, mci, e);
}
EXPORT_SYMBOL_GPL(edac_mc_handle_error);
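
/*
 * Reporting sketch: a polling driver that decoded one corrected error to
 * csrow 1, channel 0 might call the handler as below (the err_* values
 * are assumptions standing in for driver-decoded data):
 *
 *	edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
 *			     err_pfn, err_offset, err_syndrome,
 *			     1, 0, -1,
 *			     "single-bit read error", "");
 */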
1304