xref: /openbmc/linux/drivers/misc/genwqe/card_base.c (revision 275876e2)
1 /**
2  * IBM Accelerator Family 'GenWQE'
3  *
4  * (C) Copyright IBM Corp. 2013
5  *
6  * Author: Frank Haverkamp <haver@linux.vnet.ibm.com>
7  * Author: Joerg-Stephan Vogt <jsvogt@de.ibm.com>
8  * Author: Michael Jung <mijung@de.ibm.com>
9  * Author: Michael Ruettger <michael@ibmra.de>
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License (version 2 only)
13  * as published by the Free Software Foundation.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18  * GNU General Public License for more details.
19  */
20 
21 /*
22  * Module initialization and PCIe setup. Card health monitoring and
23  * recovery functionality. Character device creation and deletion are
24  * controlled from here.
25  */
26 
27 #include <linux/module.h>
28 #include <linux/types.h>
29 #include <linux/pci.h>
30 #include <linux/err.h>
31 #include <linux/aer.h>
32 #include <linux/string.h>
33 #include <linux/sched.h>
34 #include <linux/wait.h>
35 #include <linux/delay.h>
36 #include <linux/dma-mapping.h>
37 #include <linux/module.h>
38 #include <linux/notifier.h>
39 #include <linux/device.h>
40 #include <linux/log2.h>
41 
42 #include "card_base.h"
43 #include "card_ddcb.h"
44 
45 MODULE_AUTHOR("Frank Haverkamp <haver@linux.vnet.ibm.com>");
46 MODULE_AUTHOR("Michael Ruettger <michael@ibmra.de>");
47 MODULE_AUTHOR("Joerg-Stephan Vogt <jsvogt@de.ibm.com>");
48 MODULE_AUTHOR("Michal Jung <mijung@de.ibm.com>");
49 
50 MODULE_DESCRIPTION("GenWQE Card");
51 MODULE_VERSION(DRV_VERS_STRING);
52 MODULE_LICENSE("GPL");
53 
54 static char genwqe_driver_name[] = GENWQE_DEVNAME;
55 static struct class *class_genwqe;
56 static struct dentry *debugfs_genwqe;
57 static struct genwqe_dev *genwqe_devices[GENWQE_CARD_NO_MAX];
58 
59 /* PCI structure for identifying device by PCI vendor and device ID */
60 static const struct pci_device_id genwqe_device_table[] = {
61 	{ .vendor      = PCI_VENDOR_ID_IBM,
62 	  .device      = PCI_DEVICE_GENWQE,
63 	  .subvendor   = PCI_SUBVENDOR_ID_IBM,
64 	  .subdevice   = PCI_SUBSYSTEM_ID_GENWQE5,
65 	  .class       = (PCI_CLASSCODE_GENWQE5 << 8),
66 	  .class_mask  = ~0,
67 	  .driver_data = 0 },
68 
69 	/* Initial SR-IOV bring-up image */
70 	{ .vendor      = PCI_VENDOR_ID_IBM,
71 	  .device      = PCI_DEVICE_GENWQE,
72 	  .subvendor   = PCI_SUBVENDOR_ID_IBM_SRIOV,
73 	  .subdevice   = PCI_SUBSYSTEM_ID_GENWQE5_SRIOV,
74 	  .class       = (PCI_CLASSCODE_GENWQE5_SRIOV << 8),
75 	  .class_mask  = ~0,
76 	  .driver_data = 0 },
77 
78 	{ .vendor      = PCI_VENDOR_ID_IBM,  /* VF Vendor ID */
79 	  .device      = 0x0000,  /* VF Device ID */
80 	  .subvendor   = PCI_SUBVENDOR_ID_IBM_SRIOV,
81 	  .subdevice   = PCI_SUBSYSTEM_ID_GENWQE5_SRIOV,
82 	  .class       = (PCI_CLASSCODE_GENWQE5_SRIOV << 8),
83 	  .class_mask  = ~0,
84 	  .driver_data = 0 },
85 
86 	/* Fixed up image */
87 	{ .vendor      = PCI_VENDOR_ID_IBM,
88 	  .device      = PCI_DEVICE_GENWQE,
89 	  .subvendor   = PCI_SUBVENDOR_ID_IBM_SRIOV,
90 	  .subdevice   = PCI_SUBSYSTEM_ID_GENWQE5,
91 	  .class       = (PCI_CLASSCODE_GENWQE5_SRIOV << 8),
92 	  .class_mask  = ~0,
93 	  .driver_data = 0 },
94 
95 	{ .vendor      = PCI_VENDOR_ID_IBM,  /* VF Vendor ID */
96 	  .device      = 0x0000,  /* VF Device ID */
97 	  .subvendor   = PCI_SUBVENDOR_ID_IBM_SRIOV,
98 	  .subdevice   = PCI_SUBSYSTEM_ID_GENWQE5,
99 	  .class       = (PCI_CLASSCODE_GENWQE5_SRIOV << 8),
100 	  .class_mask  = ~0,
101 	  .driver_data = 0 },
102 
103 	/* Even one more ... */
104 	{ .vendor      = PCI_VENDOR_ID_IBM,
105 	  .device      = PCI_DEVICE_GENWQE,
106 	  .subvendor   = PCI_SUBVENDOR_ID_IBM,
107 	  .subdevice   = PCI_SUBSYSTEM_ID_GENWQE5_NEW,
108 	  .class       = (PCI_CLASSCODE_GENWQE5 << 8),
109 	  .class_mask  = ~0,
110 	  .driver_data = 0 },
111 
112 	{ 0, }			/* 0 terminated list. */
113 };
114 
115 MODULE_DEVICE_TABLE(pci, genwqe_device_table);
116 
117 /**
118  * genwqe_dev_alloc() - Create and prepare a new card descriptor
119  *
120  * Return: Pointer to card descriptor, or ERR_PTR(err) on error
121  */
122 static struct genwqe_dev *genwqe_dev_alloc(void)
123 {
124 	unsigned int i = 0, j;
125 	struct genwqe_dev *cd;
126 
127 	for (i = 0; i < GENWQE_CARD_NO_MAX; i++) {
128 		if (genwqe_devices[i] == NULL)
129 			break;
130 	}
131 	if (i >= GENWQE_CARD_NO_MAX)
132 		return ERR_PTR(-ENODEV);
133 
134 	cd = kzalloc(sizeof(struct genwqe_dev), GFP_KERNEL);
135 	if (!cd)
136 		return ERR_PTR(-ENOMEM);
137 
138 	cd->card_idx = i;
139 	cd->class_genwqe = class_genwqe;
140 	cd->debugfs_genwqe = debugfs_genwqe;
141 
142 	/*
143 	 * This comes from kernel config option and can be overritten via
144 	 * debugfs.
145 	 */
146 	cd->use_platform_recovery = CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY;
147 
148 	init_waitqueue_head(&cd->queue_waitq);
149 
150 	spin_lock_init(&cd->file_lock);
151 	INIT_LIST_HEAD(&cd->file_list);
152 
153 	cd->card_state = GENWQE_CARD_UNUSED;
154 	spin_lock_init(&cd->print_lock);
155 
156 	cd->ddcb_software_timeout = genwqe_ddcb_software_timeout;
157 	cd->kill_timeout = genwqe_kill_timeout;
158 
159 	for (j = 0; j < GENWQE_MAX_VFS; j++)
160 		cd->vf_jobtimeout_msec[j] = genwqe_vf_jobtimeout_msec;
161 
162 	genwqe_devices[i] = cd;
163 	return cd;
164 }
165 
166 static void genwqe_dev_free(struct genwqe_dev *cd)
167 {
168 	if (!cd)
169 		return;
170 
171 	genwqe_devices[cd->card_idx] = NULL;
172 	kfree(cd);
173 }
174 
175 /**
176  * genwqe_bus_reset() - Card recovery
177  *
178  * pci_reset_function() will recover the device and ensure that the
179  * registers are accessible again when it completes with success. If
180  * not, the card will stay dead and registers will be unaccessible
181  * still.
182  */
183 static int genwqe_bus_reset(struct genwqe_dev *cd)
184 {
185 	int bars, rc = 0;
186 	struct pci_dev *pci_dev = cd->pci_dev;
187 	void __iomem *mmio;
188 
189 	if (cd->err_inject & GENWQE_INJECT_BUS_RESET_FAILURE)
190 		return -EIO;
191 
192 	mmio = cd->mmio;
193 	cd->mmio = NULL;
194 	pci_iounmap(pci_dev, mmio);
195 
196 	bars = pci_select_bars(pci_dev, IORESOURCE_MEM);
197 	pci_release_selected_regions(pci_dev, bars);
198 
199 	/*
200 	 * Firmware/BIOS might change memory mapping during bus reset.
201 	 * Settings like enable bus-mastering, ... are backuped and
202 	 * restored by the pci_reset_function().
203 	 */
204 	dev_dbg(&pci_dev->dev, "[%s] pci_reset function ...\n", __func__);
205 	rc = pci_reset_function(pci_dev);
206 	if (rc) {
207 		dev_err(&pci_dev->dev,
208 			"[%s] err: failed reset func (rc %d)\n", __func__, rc);
209 		return rc;
210 	}
211 	dev_dbg(&pci_dev->dev, "[%s] done with rc=%d\n", __func__, rc);
212 
213 	/*
214 	 * Here is the right spot to clear the register read
215 	 * failure. pci_bus_reset() does this job in real systems.
216 	 */
217 	cd->err_inject &= ~(GENWQE_INJECT_HARDWARE_FAILURE |
218 			    GENWQE_INJECT_GFIR_FATAL |
219 			    GENWQE_INJECT_GFIR_INFO);
220 
221 	rc = pci_request_selected_regions(pci_dev, bars, genwqe_driver_name);
222 	if (rc) {
223 		dev_err(&pci_dev->dev,
224 			"[%s] err: request bars failed (%d)\n", __func__, rc);
225 		return -EIO;
226 	}
227 
228 	cd->mmio = pci_iomap(pci_dev, 0, 0);
229 	if (cd->mmio == NULL) {
230 		dev_err(&pci_dev->dev,
231 			"[%s] err: mapping BAR0 failed\n", __func__);
232 		return -ENOMEM;
233 	}
234 	return 0;
235 }
236 
/*
 * Hardware circumvention section. Certain bitstreams in our test-lab
 * had different kinds of problems. Here is where we adjust those
 * bitstreams to function well with this version of our device driver.
 *
 * These circumventions are applied to the physical function only.
 * The magic numbers below identify development/manufacturing
 * versions of the bitstream used on the card.
 *
 * Turn off error reporting for old/manufacturing images.
 */
248 
249 bool genwqe_need_err_masking(struct genwqe_dev *cd)
250 {
251 	return (cd->slu_unitcfg & 0xFFFF0ull) < 0x32170ull;
252 }
253 
254 static void genwqe_tweak_hardware(struct genwqe_dev *cd)
255 {
256 	struct pci_dev *pci_dev = cd->pci_dev;
257 
258 	/* Mask FIRs for development images */
259 	if (((cd->slu_unitcfg & 0xFFFF0ull) >= 0x32000ull) &&
260 	    ((cd->slu_unitcfg & 0xFFFF0ull) <= 0x33250ull)) {
261 		dev_warn(&pci_dev->dev,
262 			 "FIRs masked due to bitstream %016llx.%016llx\n",
263 			 cd->slu_unitcfg, cd->app_unitcfg);
264 
265 		__genwqe_writeq(cd, IO_APP_SEC_LEM_DEBUG_OVR,
266 				0xFFFFFFFFFFFFFFFFull);
267 
268 		__genwqe_writeq(cd, IO_APP_ERR_ACT_MASK,
269 				0x0000000000000000ull);
270 	}
271 }
272 
273 /**
274  * genwqe_recovery_on_fatal_gfir_required() - Version depended actions
275  *
276  * Bitstreams older than 2013-02-17 have a bug where fatal GFIRs must
277  * be ignored. This is e.g. true for the bitstream we gave to the card
278  * manufacturer, but also for some old bitstreams we released to our
279  * test-lab.
280  */
281 int genwqe_recovery_on_fatal_gfir_required(struct genwqe_dev *cd)
282 {
283 	return (cd->slu_unitcfg & 0xFFFF0ull) >= 0x32170ull;
284 }
285 
286 int genwqe_flash_readback_fails(struct genwqe_dev *cd)
287 {
288 	return (cd->slu_unitcfg & 0xFFFF0ull) < 0x32170ull;
289 }
290 
291 /**
292  * genwqe_T_psec() - Calculate PF/VF timeout register content
293  *
294  * Note: From a design perspective it turned out to be a bad idea to
295  * use codes here to specifiy the frequency/speed values. An old
296  * driver cannot understand new codes and is therefore always a
297  * problem. Better is to measure out the value or put the
298  * speed/frequency directly into a register which is always a valid
299  * value for old as well as for new software.
300  */
301 /* T = 1/f */
302 static int genwqe_T_psec(struct genwqe_dev *cd)
303 {
304 	u16 speed;	/* 1/f -> 250,  200,  166,  175 */
305 	static const int T[] = { 4000, 5000, 6000, 5714 };
306 
307 	speed = (u16)((cd->slu_unitcfg >> 28) & 0x0full);
308 	if (speed >= ARRAY_SIZE(T))
309 		return -1;	/* illegal value */
310 
311 	return T[speed];
312 }
313 
314 /**
315  * genwqe_setup_pf_jtimer() - Setup PF hardware timeouts for DDCB execution
316  *
317  * Do this _after_ card_reset() is called. Otherwise the values will
318  * vanish. The settings need to be done when the queues are inactive.
319  *
320  * The max. timeout value is 2^(10+x) * T (6ns for 166MHz) * 15/16.
321  * The min. timeout value is 2^(10+x) * T (6ns for 166MHz) * 14/16.
322  */
323 static bool genwqe_setup_pf_jtimer(struct genwqe_dev *cd)
324 {
325 	u32 T = genwqe_T_psec(cd);
326 	u64 x;
327 
328 	if (genwqe_pf_jobtimeout_msec == 0)
329 		return false;
330 
331 	/* PF: large value needed, flash update 2sec per block */
332 	x = ilog2(genwqe_pf_jobtimeout_msec *
333 		  16000000000uL/(T * 15)) - 10;
334 
335 	genwqe_write_vreg(cd, IO_SLC_VF_APPJOB_TIMEOUT,
336 			  0xff00 | (x & 0xff), 0);
337 	return true;
338 }
339 
340 /**
341  * genwqe_setup_vf_jtimer() - Setup VF hardware timeouts for DDCB execution
342  */
343 static bool genwqe_setup_vf_jtimer(struct genwqe_dev *cd)
344 {
345 	struct pci_dev *pci_dev = cd->pci_dev;
346 	unsigned int vf;
347 	u32 T = genwqe_T_psec(cd);
348 	u64 x;
349 
350 	for (vf = 0; vf < pci_sriov_get_totalvfs(pci_dev); vf++) {
351 
352 		if (cd->vf_jobtimeout_msec[vf] == 0)
353 			continue;
354 
355 		x = ilog2(cd->vf_jobtimeout_msec[vf] *
356 			  16000000000uL/(T * 15)) - 10;
357 
358 		genwqe_write_vreg(cd, IO_SLC_VF_APPJOB_TIMEOUT,
359 				  0xff00 | (x & 0xff), vf + 1);
360 	}
361 	return true;
362 }
363 
364 static int genwqe_ffdc_buffs_alloc(struct genwqe_dev *cd)
365 {
366 	unsigned int type, e = 0;
367 
368 	for (type = 0; type < GENWQE_DBG_UNITS; type++) {
369 		switch (type) {
370 		case GENWQE_DBG_UNIT0:
371 			e = genwqe_ffdc_buff_size(cd, 0);
372 			break;
373 		case GENWQE_DBG_UNIT1:
374 			e = genwqe_ffdc_buff_size(cd, 1);
375 			break;
376 		case GENWQE_DBG_UNIT2:
377 			e = genwqe_ffdc_buff_size(cd, 2);
378 			break;
379 		case GENWQE_DBG_REGS:
380 			e = GENWQE_FFDC_REGS;
381 			break;
382 		}
383 
384 		/* currently support only the debug units mentioned here */
385 		cd->ffdc[type].entries = e;
386 		cd->ffdc[type].regs = kmalloc(e * sizeof(struct genwqe_reg),
387 					      GFP_KERNEL);
388 		/*
389 		 * regs == NULL is ok, the using code treats this as no regs,
390 		 * Printing warning is ok in this case.
391 		 */
392 	}
393 	return 0;
394 }
395 
396 static void genwqe_ffdc_buffs_free(struct genwqe_dev *cd)
397 {
398 	unsigned int type;
399 
400 	for (type = 0; type < GENWQE_DBG_UNITS; type++) {
401 		kfree(cd->ffdc[type].regs);
402 		cd->ffdc[type].regs = NULL;
403 	}
404 }
405 
406 static int genwqe_read_ids(struct genwqe_dev *cd)
407 {
408 	int err = 0;
409 	int slu_id;
410 	struct pci_dev *pci_dev = cd->pci_dev;
411 
412 	cd->slu_unitcfg = __genwqe_readq(cd, IO_SLU_UNITCFG);
413 	if (cd->slu_unitcfg == IO_ILLEGAL_VALUE) {
414 		dev_err(&pci_dev->dev,
415 			"err: SLUID=%016llx\n", cd->slu_unitcfg);
416 		err = -EIO;
417 		goto out_err;
418 	}
419 
420 	slu_id = genwqe_get_slu_id(cd);
421 	if (slu_id < GENWQE_SLU_ARCH_REQ || slu_id == 0xff) {
422 		dev_err(&pci_dev->dev,
423 			"err: incompatible SLU Architecture %u\n", slu_id);
424 		err = -ENOENT;
425 		goto out_err;
426 	}
427 
428 	cd->app_unitcfg = __genwqe_readq(cd, IO_APP_UNITCFG);
429 	if (cd->app_unitcfg == IO_ILLEGAL_VALUE) {
430 		dev_err(&pci_dev->dev,
431 			"err: APPID=%016llx\n", cd->app_unitcfg);
432 		err = -EIO;
433 		goto out_err;
434 	}
435 	genwqe_read_app_id(cd, cd->app_name, sizeof(cd->app_name));
436 
437 	/*
438 	 * Is access to all registers possible? If we are a VF the
439 	 * answer is obvious. If we run fully virtualized, we need to
440 	 * check if we can access all registers. If we do not have
441 	 * full access we will cause an UR and some informational FIRs
442 	 * in the PF, but that should not harm.
443 	 */
444 	if (pci_dev->is_virtfn)
445 		cd->is_privileged = 0;
446 	else
447 		cd->is_privileged = (__genwqe_readq(cd, IO_SLU_BITSTREAM)
448 				     != IO_ILLEGAL_VALUE);
449 
450  out_err:
451 	return err;
452 }
453 
454 static int genwqe_start(struct genwqe_dev *cd)
455 {
456 	int err;
457 	struct pci_dev *pci_dev = cd->pci_dev;
458 
459 	err = genwqe_read_ids(cd);
460 	if (err)
461 		return err;
462 
463 	if (genwqe_is_privileged(cd)) {
464 		/* do this after the tweaks. alloc fail is acceptable */
465 		genwqe_ffdc_buffs_alloc(cd);
466 		genwqe_stop_traps(cd);
467 
468 		/* Collect registers e.g. FIRs, UNITIDs, traces ... */
469 		genwqe_read_ffdc_regs(cd, cd->ffdc[GENWQE_DBG_REGS].regs,
470 				      cd->ffdc[GENWQE_DBG_REGS].entries, 0);
471 
472 		genwqe_ffdc_buff_read(cd, GENWQE_DBG_UNIT0,
473 				      cd->ffdc[GENWQE_DBG_UNIT0].regs,
474 				      cd->ffdc[GENWQE_DBG_UNIT0].entries);
475 
476 		genwqe_ffdc_buff_read(cd, GENWQE_DBG_UNIT1,
477 				      cd->ffdc[GENWQE_DBG_UNIT1].regs,
478 				      cd->ffdc[GENWQE_DBG_UNIT1].entries);
479 
480 		genwqe_ffdc_buff_read(cd, GENWQE_DBG_UNIT2,
481 				      cd->ffdc[GENWQE_DBG_UNIT2].regs,
482 				      cd->ffdc[GENWQE_DBG_UNIT2].entries);
483 
484 		genwqe_start_traps(cd);
485 
486 		if (cd->card_state == GENWQE_CARD_FATAL_ERROR) {
487 			dev_warn(&pci_dev->dev,
488 				 "[%s] chip reload/recovery!\n", __func__);
489 
490 			/*
491 			 * Stealth Mode: Reload chip on either hot
492 			 * reset or PERST.
493 			 */
494 			cd->softreset = 0x7Cull;
495 			__genwqe_writeq(cd, IO_SLC_CFGREG_SOFTRESET,
496 				       cd->softreset);
497 
498 			err = genwqe_bus_reset(cd);
499 			if (err != 0) {
500 				dev_err(&pci_dev->dev,
501 					"[%s] err: bus reset failed!\n",
502 					__func__);
503 				goto out;
504 			}
505 
506 			/*
507 			 * Re-read the IDs because
508 			 * it could happen that the bitstream load
509 			 * failed!
510 			 */
511 			err = genwqe_read_ids(cd);
512 			if (err)
513 				goto out;
514 		}
515 	}
516 
517 	err = genwqe_setup_service_layer(cd);  /* does a reset to the card */
518 	if (err != 0) {
519 		dev_err(&pci_dev->dev,
520 			"[%s] err: could not setup servicelayer!\n", __func__);
521 		err = -ENODEV;
522 		goto out;
523 	}
524 
525 	if (genwqe_is_privileged(cd)) {	 /* code is running _after_ reset */
526 		genwqe_tweak_hardware(cd);
527 
528 		genwqe_setup_pf_jtimer(cd);
529 		genwqe_setup_vf_jtimer(cd);
530 	}
531 
532 	err = genwqe_device_create(cd);
533 	if (err < 0) {
534 		dev_err(&pci_dev->dev,
535 			"err: chdev init failed! (err=%d)\n", err);
536 		goto out_release_service_layer;
537 	}
538 	return 0;
539 
540  out_release_service_layer:
541 	genwqe_release_service_layer(cd);
542  out:
543 	if (genwqe_is_privileged(cd))
544 		genwqe_ffdc_buffs_free(cd);
545 	return -EIO;
546 }
547 
548 /**
549  * genwqe_stop() - Stop card operation
550  *
551  * Recovery notes:
552  *   As long as genwqe_thread runs we might access registers during
553  *   error data capture. Same is with the genwqe_health_thread.
554  *   When genwqe_bus_reset() fails this function might called two times:
555  *   first by the genwqe_health_thread() and later by genwqe_remove() to
556  *   unbind the device. We must be able to survive that.
557  *
558  * This function must be robust enough to be called twice.
559  */
560 static int genwqe_stop(struct genwqe_dev *cd)
561 {
562 	genwqe_finish_queue(cd);	    /* no register access */
563 	genwqe_device_remove(cd);	    /* device removed, procs killed */
564 	genwqe_release_service_layer(cd);   /* here genwqe_thread is stopped */
565 
566 	if (genwqe_is_privileged(cd)) {
567 		pci_disable_sriov(cd->pci_dev);	/* access pci config space */
568 		genwqe_ffdc_buffs_free(cd);
569 	}
570 
571 	return 0;
572 }
573 
574 /**
575  * genwqe_recover_card() - Try to recover the card if it is possible
576  *
577  * If fatal_err is set no register access is possible anymore. It is
578  * likely that genwqe_start fails in that situation. Proper error
579  * handling is required in this case.
580  *
581  * genwqe_bus_reset() will cause the pci code to call genwqe_remove()
582  * and later genwqe_probe() for all virtual functions.
583  */
584 static int genwqe_recover_card(struct genwqe_dev *cd, int fatal_err)
585 {
586 	int rc;
587 	struct pci_dev *pci_dev = cd->pci_dev;
588 
589 	genwqe_stop(cd);
590 
591 	/*
592 	 * Make sure chip is not reloaded to maintain FFDC. Write SLU
593 	 * Reset Register, CPLDReset field to 0.
594 	 */
595 	if (!fatal_err) {
596 		cd->softreset = 0x70ull;
597 		__genwqe_writeq(cd, IO_SLC_CFGREG_SOFTRESET, cd->softreset);
598 	}
599 
600 	rc = genwqe_bus_reset(cd);
601 	if (rc != 0) {
602 		dev_err(&pci_dev->dev,
603 			"[%s] err: card recovery impossible!\n", __func__);
604 		return rc;
605 	}
606 
607 	rc = genwqe_start(cd);
608 	if (rc < 0) {
609 		dev_err(&pci_dev->dev,
610 			"[%s] err: failed to launch device!\n", __func__);
611 		return rc;
612 	}
613 	return 0;
614 }
615 
616 static int genwqe_health_check_cond(struct genwqe_dev *cd, u64 *gfir)
617 {
618 	*gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
619 	return (*gfir & GFIR_ERR_TRIGGER) &&
620 		genwqe_recovery_on_fatal_gfir_required(cd);
621 }
622 
/**
 * genwqe_fir_checking() - Check the fault isolation registers of the card
 *
 * Reads the GFIR and, if it is non-zero, walks the primary FIR/FEC of
 * every unit and the secondary FIR/FEC behind each set primary FIR
 * bit, printing every register found. If no error trigger bit was set
 * in the GFIR when the pass started, the logged secondary and primary
 * FIR bits are cleared. The pass is restarted whenever an error
 * trigger bit shows up while it is running, at most 16 times.
 *
 * If this code works ok, can be tried out with help of the genwqe_poke tool:
 *   sudo ./tools/genwqe_poke 0x8 0xfefefefefef
 *
 * Now the relevant FIRs/sFIRs should be printed out and the driver should
 * invoke recovery (devices are removed and readded).
 *
 * Return: 0 if the GFIR is clear, otherwise the GFIR error trigger
 *         bits seen at the start of the final pass, or
 *         IO_ILLEGAL_VALUE if a register read returned
 *         IO_ILLEGAL_VALUE or the pass limit was exceeded
 */
static u64 genwqe_fir_checking(struct genwqe_dev *cd)
{
	int j, iterations = 0;
	u64 mask, fir, fec, uid, gfir, gfir_masked, sfir, sfec;
	u32 fir_addr, fir_clr_addr, fec_addr, sfir_addr, sfec_addr;
	struct pci_dev *pci_dev = cd->pci_dev;

 healthMonitor:
	/* every (re)start of the pass counts; give up after 16 of them */
	iterations++;
	if (iterations > 16) {
		dev_err(&pci_dev->dev, "* exit looping after %d times\n",
			iterations);
		goto fatal_error;
	}

	gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
	if (gfir != 0x0)
		dev_err(&pci_dev->dev, "* 0x%08x 0x%016llx\n",
				    IO_SLC_CFGREG_GFIR, gfir);
	if (gfir == IO_ILLEGAL_VALUE)
		goto fatal_error;

	/*
	 * Leaving early when no GFIR bit is on prevents continuous
	 * printouts, e.g. for the following bug:
	 *   FIR set without a 2ndary FIR/FIR cannot be cleared
	 * Comment out the following if to get the prints:
	 */
	if (gfir == 0)
		return 0;

	/* remember whether this pass started with a fatal GFIR */
	gfir_masked = gfir & GFIR_ERR_TRIGGER;  /* fatal errors */

	for (uid = 0; uid < GENWQE_MAX_UNITS; uid++) { /* 0..2 in zEDC */

		/* read the primary FIR (pfir) */
		fir_addr = (uid << 24) + 0x08;
		fir = __genwqe_readq(cd, fir_addr);
		if (fir == 0x0)
			continue;  /* no error in this unit */

		dev_err(&pci_dev->dev, "* 0x%08x 0x%016llx\n", fir_addr, fir);
		if (fir == IO_ILLEGAL_VALUE)
			goto fatal_error;

		/* read primary FEC */
		fec_addr = (uid << 24) + 0x18;
		fec = __genwqe_readq(cd, fec_addr);

		dev_err(&pci_dev->dev, "* 0x%08x 0x%016llx\n", fec_addr, fec);
		if (fec == IO_ILLEGAL_VALUE)
			goto fatal_error;

		/* visit the secondary FIR/FEC of every set primary FIR bit */
		for (j = 0, mask = 1ULL; j < 64; j++, mask <<= 1) {

			/* secondary fir empty, skip it */
			if ((fir & mask) == 0x0)
				continue;

			sfir_addr = (uid << 24) + 0x100 + 0x08 * j;
			sfir = __genwqe_readq(cd, sfir_addr);

			if (sfir == IO_ILLEGAL_VALUE)
				goto fatal_error;
			dev_err(&pci_dev->dev,
				"* 0x%08x 0x%016llx\n", sfir_addr, sfir);

			sfec_addr = (uid << 24) + 0x300 + 0x08 * j;
			sfec = __genwqe_readq(cd, sfec_addr);

			if (sfec == IO_ILLEGAL_VALUE)
				goto fatal_error;
			dev_err(&pci_dev->dev,
				"* 0x%08x 0x%016llx\n", sfec_addr, sfec);

			/* re-read the GFIR before anything is cleared */
			gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
			if (gfir == IO_ILLEGAL_VALUE)
				goto fatal_error;

			/* gfir turned on during routine! get out and
			   start over. */
			if ((gfir_masked == 0x0) &&
			    (gfir & GFIR_ERR_TRIGGER)) {
				goto healthMonitor;
			}

			/* do not clear if we entered with a fatal gfir */
			if (gfir_masked == 0x0) {

				/* NEW clear by mask the logged bits */
				sfir_addr = (uid << 24) + 0x100 + 0x08 * j;
				__genwqe_writeq(cd, sfir_addr, sfir);

				dev_dbg(&pci_dev->dev,
					"[HM] Clearing  2ndary FIR 0x%08x "
					"with 0x%016llx\n", sfir_addr, sfir);

				/*
				 * note, these cannot be error-Firs
				 * since gfir_masked is 0 after sfir
				 * was read. Also, it is safe to do
				 * this write if sfir=0. Still need to
				 * clear the primary. This just means
				 * there is no secondary FIR.
				 */

				/* clear by mask the logged bit. */
				fir_clr_addr = (uid << 24) + 0x10;
				__genwqe_writeq(cd, fir_clr_addr, mask);

				dev_dbg(&pci_dev->dev,
					"[HM] Clearing primary FIR 0x%08x "
					"with 0x%016llx\n", fir_clr_addr,
					mask);
			}
		}
	}
	gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
	if (gfir == IO_ILLEGAL_VALUE)
		goto fatal_error;

	if ((gfir_masked == 0x0) && (gfir & GFIR_ERR_TRIGGER)) {
		/*
		 * Check once more that it didn't go on after all the
		 * FIRS were cleared.
		 */
		dev_dbg(&pci_dev->dev, "ACK! Another FIR! Recursing %d!\n",
			iterations);
		goto healthMonitor;
	}
	return gfir_masked;

 fatal_error:
	/* unreadable register or too many passes */
	return IO_ILLEGAL_VALUE;
}
767 
768 /**
769  * genwqe_pci_fundamental_reset() - trigger a PCIe fundamental reset on the slot
770  *
771  * Note: pci_set_pcie_reset_state() is not implemented on all archs, so this
772  * reset method will not work in all cases.
773  *
774  * Return: 0 on success or error code from pci_set_pcie_reset_state()
775  */
776 static int genwqe_pci_fundamental_reset(struct pci_dev *pci_dev)
777 {
778 	int rc;
779 
780 	/*
781 	 * lock pci config space access from userspace,
782 	 * save state and issue PCIe fundamental reset
783 	 */
784 	pci_cfg_access_lock(pci_dev);
785 	pci_save_state(pci_dev);
786 	rc = pci_set_pcie_reset_state(pci_dev, pcie_warm_reset);
787 	if (!rc) {
788 		/* keep PCIe reset asserted for 250ms */
789 		msleep(250);
790 		pci_set_pcie_reset_state(pci_dev, pcie_deassert_reset);
791 		/* Wait for 2s to reload flash and train the link */
792 		msleep(2000);
793 	}
794 	pci_restore_state(pci_dev);
795 	pci_cfg_access_unlock(pci_dev);
796 	return rc;
797 }
798 
799 
800 static int genwqe_platform_recovery(struct genwqe_dev *cd)
801 {
802 	struct pci_dev *pci_dev = cd->pci_dev;
803 	int rc;
804 
805 	dev_info(&pci_dev->dev,
806 		 "[%s] resetting card for error recovery\n", __func__);
807 
808 	/* Clear out error injection flags */
809 	cd->err_inject &= ~(GENWQE_INJECT_HARDWARE_FAILURE |
810 			    GENWQE_INJECT_GFIR_FATAL |
811 			    GENWQE_INJECT_GFIR_INFO);
812 
813 	genwqe_stop(cd);
814 
815 	/* Try recoverying the card with fundamental reset */
816 	rc = genwqe_pci_fundamental_reset(pci_dev);
817 	if (!rc) {
818 		rc = genwqe_start(cd);
819 		if (!rc)
820 			dev_info(&pci_dev->dev,
821 				 "[%s] card recovered\n", __func__);
822 		else
823 			dev_err(&pci_dev->dev,
824 				"[%s] err: cannot start card services! (err=%d)\n",
825 				__func__, rc);
826 	} else {
827 		dev_err(&pci_dev->dev,
828 			"[%s] card reset failed\n", __func__);
829 	}
830 
831 	return rc;
832 }
833 
834 /*
835  * genwqe_reload_bistream() - reload card bitstream
836  *
837  * Set the appropriate register and call fundamental reset to reaload the card
838  * bitstream.
839  *
840  * Return: 0 on success, error code otherwise
841  */
842 static int genwqe_reload_bistream(struct genwqe_dev *cd)
843 {
844 	struct pci_dev *pci_dev = cd->pci_dev;
845 	int rc;
846 
847 	dev_info(&pci_dev->dev,
848 		 "[%s] resetting card for bitstream reload\n",
849 		 __func__);
850 
851 	genwqe_stop(cd);
852 
853 	/*
854 	 * Cause a CPLD reprogram with the 'next_bitstream'
855 	 * partition on PCIe hot or fundamental reset
856 	 */
857 	__genwqe_writeq(cd, IO_SLC_CFGREG_SOFTRESET,
858 			(cd->softreset & 0xcull) | 0x70ull);
859 
860 	rc = genwqe_pci_fundamental_reset(pci_dev);
861 	if (rc) {
862 		/*
863 		 * A fundamental reset failure can be caused
864 		 * by lack of support on the arch, so we just
865 		 * log the error and try to start the card
866 		 * again.
867 		 */
868 		dev_err(&pci_dev->dev,
869 			"[%s] err: failed to reset card for bitstream reload\n",
870 			__func__);
871 	}
872 
873 	rc = genwqe_start(cd);
874 	if (rc) {
875 		dev_err(&pci_dev->dev,
876 			"[%s] err: cannot start card services! (err=%d)\n",
877 			__func__, rc);
878 		return rc;
879 	}
880 	dev_info(&pci_dev->dev,
881 		 "[%s] card reloaded\n", __func__);
882 	return 0;
883 }
884 
885 
/**
 * genwqe_health_thread() - Health checking thread
 * @data: the card descriptor (struct genwqe_dev *) to monitor
 *
 * This thread is only started for the PF of the card.
 *
 * This thread monitors the health of the card. A critical situation
 * is when we read registers which contain -1 (IO_ILLEGAL_VALUE). In
 * this case we need to be recovered from outside. Writing to
 * registers will very likely not work either.
 *
 * This thread must only exit if kthread_should_stop() becomes true.
 * NOTE(review): with use_platform_recovery set, the fatal error path
 * returns -EIO right away when pci_channel_offline() reports true,
 * without waiting for kthread_should_stop() - confirm this is
 * intended.
 *
 * Condition for the health-thread to trigger:
 *   a) when a kthread_stop() request comes in or
 *   b) a critical GFIR occurred
 *
 * Informational GFIRs are checked and potentially printed every
 * genwqe_health_check_interval seconds.
 *
 * Return: 0 if stopped while monitoring, -EIO after a fatal error
 */
static int genwqe_health_thread(void *data)
{
	int rc, should_stop = 0;
	struct genwqe_dev *cd = data;
	struct pci_dev *pci_dev = cd->pci_dev;
	u64 gfir, gfir_masked, slu_unitcfg, app_unitcfg;

 health_thread_begin:
	while (!kthread_should_stop()) {
		/*
		 * Sleep until a critical GFIR shows up, a stop request
		 * arrives or the check interval has elapsed. The wait
		 * condition stores the GFIR it read in gfir and the
		 * stop request in should_stop. rc is not examined.
		 */
		rc = wait_event_interruptible_timeout(cd->health_waitq,
			 (genwqe_health_check_cond(cd, &gfir) ||
			  (should_stop = kthread_should_stop())),
				genwqe_health_check_interval * HZ);

		if (should_stop)
			break;

		/* any register reading IO_ILLEGAL_VALUE is fatal */
		if (gfir == IO_ILLEGAL_VALUE) {
			dev_err(&pci_dev->dev,
				"[%s] GFIR=%016llx\n", __func__, gfir);
			goto fatal_error;
		}

		slu_unitcfg = __genwqe_readq(cd, IO_SLU_UNITCFG);
		if (slu_unitcfg == IO_ILLEGAL_VALUE) {
			dev_err(&pci_dev->dev,
				"[%s] SLU_UNITCFG=%016llx\n",
				__func__, slu_unitcfg);
			goto fatal_error;
		}

		app_unitcfg = __genwqe_readq(cd, IO_APP_UNITCFG);
		if (app_unitcfg == IO_ILLEGAL_VALUE) {
			dev_err(&pci_dev->dev,
				"[%s] APP_UNITCFG=%016llx\n",
				__func__, app_unitcfg);
			goto fatal_error;
		}

		gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
		if (gfir == IO_ILLEGAL_VALUE) {
			dev_err(&pci_dev->dev,
				"[%s] %s: GFIR=%016llx\n", __func__,
				(gfir & GFIR_ERR_TRIGGER) ? "err" : "info",
				gfir);
			goto fatal_error;
		}

		/* print and, where allowed, clear the FIRs */
		gfir_masked = genwqe_fir_checking(cd);
		if (gfir_masked == IO_ILLEGAL_VALUE)
			goto fatal_error;

		/*
		 * GFIR ErrorTrigger bits set => reset the card!
		 * Never do this for old/manufacturing images!
		 */
		if ((gfir_masked) && !cd->skip_recovery &&
		    genwqe_recovery_on_fatal_gfir_required(cd)) {

			cd->card_state = GENWQE_CARD_FATAL_ERROR;

			rc = genwqe_recover_card(cd, 0);
			if (rc < 0) {
				/* FIXME Card is unusable and needs unbind! */
				goto fatal_error;
			}
		}

		if (cd->card_state == GENWQE_CARD_RELOAD_BITSTREAM) {
			/* Userspace requested card bitstream reload */
			rc = genwqe_reload_bistream(cd);
			if (rc)
				goto fatal_error;
		}

		cd->last_gfir = gfir;
		cond_resched();
	}

	return 0;

 fatal_error:
	if (cd->use_platform_recovery) {
		/*
		 * Since we use raw accessors, EEH errors won't be detected
		 * by the platform until we do a non-raw MMIO or config space
		 * read
		 */
		readq(cd->mmio + IO_SLC_CFGREG_GFIR);

		/* We do nothing if the card is going over PCI recovery */
		if (pci_channel_offline(pci_dev))
			return -EIO;

		/*
		 * If it's supported by the platform, we try a fundamental reset
		 * to recover from a fatal error. Otherwise, we continue to wait
		 * for an external recovery procedure to take care of it.
		 */
		rc = genwqe_platform_recovery(cd);
		if (!rc)
			goto health_thread_begin;	/* resume monitoring */
	}

	dev_err(&pci_dev->dev,
		"[%s] card unusable. Please trigger unbind!\n", __func__);

	/* Bring down logical devices to inform user space via udev remove. */
	cd->card_state = GENWQE_CARD_FATAL_ERROR;
	genwqe_stop(cd);

	/* genwqe_bus_reset failed(). Now wait for genwqe_remove(). */
	while (!kthread_should_stop())
		cond_resched();

	return -EIO;
}
1022 
1023 static int genwqe_health_check_start(struct genwqe_dev *cd)
1024 {
1025 	int rc;
1026 
1027 	if (genwqe_health_check_interval <= 0)
1028 		return 0;	/* valid for disabling the service */
1029 
1030 	/* moved before request_irq() */
1031 	/* init_waitqueue_head(&cd->health_waitq); */
1032 
1033 	cd->health_thread = kthread_run(genwqe_health_thread, cd,
1034 					GENWQE_DEVNAME "%d_health",
1035 					cd->card_idx);
1036 	if (IS_ERR(cd->health_thread)) {
1037 		rc = PTR_ERR(cd->health_thread);
1038 		cd->health_thread = NULL;
1039 		return rc;
1040 	}
1041 	return 0;
1042 }
1043 
1044 static int genwqe_health_thread_running(struct genwqe_dev *cd)
1045 {
1046 	return cd->health_thread != NULL;
1047 }
1048 
1049 static int genwqe_health_check_stop(struct genwqe_dev *cd)
1050 {
1051 	int rc;
1052 
1053 	if (!genwqe_health_thread_running(cd))
1054 		return -EIO;
1055 
1056 	rc = kthread_stop(cd->health_thread);
1057 	cd->health_thread = NULL;
1058 	return 0;
1059 }
1060 
1061 /**
1062  * genwqe_pci_setup() - Allocate PCIe related resources for our card
1063  */
1064 static int genwqe_pci_setup(struct genwqe_dev *cd)
1065 {
1066 	int err, bars;
1067 	struct pci_dev *pci_dev = cd->pci_dev;
1068 
1069 	bars = pci_select_bars(pci_dev, IORESOURCE_MEM);
1070 	err = pci_enable_device_mem(pci_dev);
1071 	if (err) {
1072 		dev_err(&pci_dev->dev,
1073 			"err: failed to enable pci memory (err=%d)\n", err);
1074 		goto err_out;
1075 	}
1076 
1077 	/* Reserve PCI I/O and memory resources */
1078 	err = pci_request_selected_regions(pci_dev, bars, genwqe_driver_name);
1079 	if (err) {
1080 		dev_err(&pci_dev->dev,
1081 			"[%s] err: request bars failed (%d)\n", __func__, err);
1082 		err = -EIO;
1083 		goto err_disable_device;
1084 	}
1085 
1086 	/* check for 64-bit DMA address supported (DAC) */
1087 	if (!pci_set_dma_mask(pci_dev, DMA_BIT_MASK(64))) {
1088 		err = pci_set_consistent_dma_mask(pci_dev, DMA_BIT_MASK(64));
1089 		if (err) {
1090 			dev_err(&pci_dev->dev,
1091 				"err: DMA64 consistent mask error\n");
1092 			err = -EIO;
1093 			goto out_release_resources;
1094 		}
1095 	/* check for 32-bit DMA address supported (SAC) */
1096 	} else if (!pci_set_dma_mask(pci_dev, DMA_BIT_MASK(32))) {
1097 		err = pci_set_consistent_dma_mask(pci_dev, DMA_BIT_MASK(32));
1098 		if (err) {
1099 			dev_err(&pci_dev->dev,
1100 				"err: DMA32 consistent mask error\n");
1101 			err = -EIO;
1102 			goto out_release_resources;
1103 		}
1104 	} else {
1105 		dev_err(&pci_dev->dev,
1106 			"err: neither DMA32 nor DMA64 supported\n");
1107 		err = -EIO;
1108 		goto out_release_resources;
1109 	}
1110 
1111 	pci_set_master(pci_dev);
1112 	pci_enable_pcie_error_reporting(pci_dev);
1113 
1114 	/* EEH recovery requires PCIe fundamental reset */
1115 	pci_dev->needs_freset = 1;
1116 
1117 	/* request complete BAR-0 space (length = 0) */
1118 	cd->mmio_len = pci_resource_len(pci_dev, 0);
1119 	cd->mmio = pci_iomap(pci_dev, 0, 0);
1120 	if (cd->mmio == NULL) {
1121 		dev_err(&pci_dev->dev,
1122 			"[%s] err: mapping BAR0 failed\n", __func__);
1123 		err = -ENOMEM;
1124 		goto out_release_resources;
1125 	}
1126 
1127 	cd->num_vfs = pci_sriov_get_totalvfs(pci_dev);
1128 
1129 	err = genwqe_read_ids(cd);
1130 	if (err)
1131 		goto out_iounmap;
1132 
1133 	return 0;
1134 
1135  out_iounmap:
1136 	pci_iounmap(pci_dev, cd->mmio);
1137  out_release_resources:
1138 	pci_release_selected_regions(pci_dev, bars);
1139  err_disable_device:
1140 	pci_disable_device(pci_dev);
1141  err_out:
1142 	return err;
1143 }
1144 
1145 /**
1146  * genwqe_pci_remove() - Free PCIe related resources for our card
1147  */
1148 static void genwqe_pci_remove(struct genwqe_dev *cd)
1149 {
1150 	int bars;
1151 	struct pci_dev *pci_dev = cd->pci_dev;
1152 
1153 	if (cd->mmio)
1154 		pci_iounmap(pci_dev, cd->mmio);
1155 
1156 	bars = pci_select_bars(pci_dev, IORESOURCE_MEM);
1157 	pci_release_selected_regions(pci_dev, bars);
1158 	pci_disable_device(pci_dev);
1159 }
1160 
1161 /**
1162  * genwqe_probe() - Device initialization
1163  * @pdev:	PCI device information struct
1164  *
1165  * Callable for multiple cards. This function is called on bind.
1166  *
1167  * Return: 0 if succeeded, < 0 when failed
1168  */
1169 static int genwqe_probe(struct pci_dev *pci_dev,
1170 			const struct pci_device_id *id)
1171 {
1172 	int err;
1173 	struct genwqe_dev *cd;
1174 
1175 	genwqe_init_crc32();
1176 
1177 	cd = genwqe_dev_alloc();
1178 	if (IS_ERR(cd)) {
1179 		dev_err(&pci_dev->dev, "err: could not alloc mem (err=%d)!\n",
1180 			(int)PTR_ERR(cd));
1181 		return PTR_ERR(cd);
1182 	}
1183 
1184 	dev_set_drvdata(&pci_dev->dev, cd);
1185 	cd->pci_dev = pci_dev;
1186 
1187 	err = genwqe_pci_setup(cd);
1188 	if (err < 0) {
1189 		dev_err(&pci_dev->dev,
1190 			"err: problems with PCI setup (err=%d)\n", err);
1191 		goto out_free_dev;
1192 	}
1193 
1194 	err = genwqe_start(cd);
1195 	if (err < 0) {
1196 		dev_err(&pci_dev->dev,
1197 			"err: cannot start card services! (err=%d)\n", err);
1198 		goto out_pci_remove;
1199 	}
1200 
1201 	if (genwqe_is_privileged(cd)) {
1202 		err = genwqe_health_check_start(cd);
1203 		if (err < 0) {
1204 			dev_err(&pci_dev->dev,
1205 				"err: cannot start health checking! "
1206 				"(err=%d)\n", err);
1207 			goto out_stop_services;
1208 		}
1209 	}
1210 	return 0;
1211 
1212  out_stop_services:
1213 	genwqe_stop(cd);
1214  out_pci_remove:
1215 	genwqe_pci_remove(cd);
1216  out_free_dev:
1217 	genwqe_dev_free(cd);
1218 	return err;
1219 }
1220 
1221 /**
1222  * genwqe_remove() - Called when device is removed (hot-plugable)
1223  *
1224  * Or when driver is unloaded respecitively when unbind is done.
1225  */
1226 static void genwqe_remove(struct pci_dev *pci_dev)
1227 {
1228 	struct genwqe_dev *cd = dev_get_drvdata(&pci_dev->dev);
1229 
1230 	genwqe_health_check_stop(cd);
1231 
1232 	/*
1233 	 * genwqe_stop() must survive if it is called twice
1234 	 * sequentially. This happens when the health thread calls it
1235 	 * and fails on genwqe_bus_reset().
1236 	 */
1237 	genwqe_stop(cd);
1238 	genwqe_pci_remove(cd);
1239 	genwqe_dev_free(cd);
1240 }
1241 
1242 /*
1243  * genwqe_err_error_detected() - Error detection callback
1244  *
1245  * This callback is called by the PCI subsystem whenever a PCI bus
1246  * error is detected.
1247  */
1248 static pci_ers_result_t genwqe_err_error_detected(struct pci_dev *pci_dev,
1249 						 enum pci_channel_state state)
1250 {
1251 	struct genwqe_dev *cd;
1252 
1253 	dev_err(&pci_dev->dev, "[%s] state=%d\n", __func__, state);
1254 
1255 	cd = dev_get_drvdata(&pci_dev->dev);
1256 	if (cd == NULL)
1257 		return PCI_ERS_RESULT_DISCONNECT;
1258 
1259 	/* Stop the card */
1260 	genwqe_health_check_stop(cd);
1261 	genwqe_stop(cd);
1262 
1263 	/*
1264 	 * On permanent failure, the PCI code will call device remove
1265 	 * after the return of this function.
1266 	 * genwqe_stop() can be called twice.
1267 	 */
1268 	if (state == pci_channel_io_perm_failure) {
1269 		return PCI_ERS_RESULT_DISCONNECT;
1270 	} else {
1271 		genwqe_pci_remove(cd);
1272 		return PCI_ERS_RESULT_NEED_RESET;
1273 	}
1274 }
1275 
1276 static pci_ers_result_t genwqe_err_slot_reset(struct pci_dev *pci_dev)
1277 {
1278 	int rc;
1279 	struct genwqe_dev *cd = dev_get_drvdata(&pci_dev->dev);
1280 
1281 	rc = genwqe_pci_setup(cd);
1282 	if (!rc) {
1283 		return PCI_ERS_RESULT_RECOVERED;
1284 	} else {
1285 		dev_err(&pci_dev->dev,
1286 			"err: problems with PCI setup (err=%d)\n", rc);
1287 		return PCI_ERS_RESULT_DISCONNECT;
1288 	}
1289 }
1290 
1291 static pci_ers_result_t genwqe_err_result_none(struct pci_dev *dev)
1292 {
1293 	return PCI_ERS_RESULT_NONE;
1294 }
1295 
1296 static void genwqe_err_resume(struct pci_dev *pci_dev)
1297 {
1298 	int rc;
1299 	struct genwqe_dev *cd = dev_get_drvdata(&pci_dev->dev);
1300 
1301 	rc = genwqe_start(cd);
1302 	if (!rc) {
1303 		rc = genwqe_health_check_start(cd);
1304 		if (rc)
1305 			dev_err(&pci_dev->dev,
1306 				"err: cannot start health checking! (err=%d)\n",
1307 				rc);
1308 	} else {
1309 		dev_err(&pci_dev->dev,
1310 			"err: cannot start card services! (err=%d)\n", rc);
1311 	}
1312 }
1313 
1314 static int genwqe_sriov_configure(struct pci_dev *dev, int numvfs)
1315 {
1316 	struct genwqe_dev *cd = dev_get_drvdata(&dev->dev);
1317 
1318 	if (numvfs > 0) {
1319 		genwqe_setup_vf_jtimer(cd);
1320 		pci_enable_sriov(dev, numvfs);
1321 		return numvfs;
1322 	}
1323 	if (numvfs == 0) {
1324 		pci_disable_sriov(dev);
1325 		return 0;
1326 	}
1327 	return 0;
1328 }
1329 
1330 static struct pci_error_handlers genwqe_err_handler = {
1331 	.error_detected = genwqe_err_error_detected,
1332 	.mmio_enabled	= genwqe_err_result_none,
1333 	.link_reset	= genwqe_err_result_none,
1334 	.slot_reset	= genwqe_err_slot_reset,
1335 	.resume		= genwqe_err_resume,
1336 };
1337 
1338 static struct pci_driver genwqe_driver = {
1339 	.name	  = genwqe_driver_name,
1340 	.id_table = genwqe_device_table,
1341 	.probe	  = genwqe_probe,
1342 	.remove	  = genwqe_remove,
1343 	.sriov_configure = genwqe_sriov_configure,
1344 	.err_handler = &genwqe_err_handler,
1345 };
1346 
1347 /**
1348  * genwqe_init_module() - Driver registration and initialization
1349  */
1350 static int __init genwqe_init_module(void)
1351 {
1352 	int rc;
1353 
1354 	class_genwqe = class_create(THIS_MODULE, GENWQE_DEVNAME);
1355 	if (IS_ERR(class_genwqe)) {
1356 		pr_err("[%s] create class failed\n", __func__);
1357 		return -ENOMEM;
1358 	}
1359 
1360 	debugfs_genwqe = debugfs_create_dir(GENWQE_DEVNAME, NULL);
1361 	if (!debugfs_genwqe) {
1362 		rc = -ENOMEM;
1363 		goto err_out;
1364 	}
1365 
1366 	rc = pci_register_driver(&genwqe_driver);
1367 	if (rc != 0) {
1368 		pr_err("[%s] pci_reg_driver (rc=%d)\n", __func__, rc);
1369 		goto err_out0;
1370 	}
1371 
1372 	return rc;
1373 
1374  err_out0:
1375 	debugfs_remove(debugfs_genwqe);
1376  err_out:
1377 	class_destroy(class_genwqe);
1378 	return rc;
1379 }
1380 
1381 /**
1382  * genwqe_exit_module() - Driver exit
1383  */
1384 static void __exit genwqe_exit_module(void)
1385 {
1386 	pci_unregister_driver(&genwqe_driver);
1387 	debugfs_remove(debugfs_genwqe);
1388 	class_destroy(class_genwqe);
1389 }
1390 
/* Module entry and exit points */
module_init(genwqe_init_module);
module_exit(genwqe_exit_module);
1393