1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * Copyright 2016-2021 HabanaLabs, Ltd.
5  * All Rights Reserved.
6  *
7  */
8 
9 #define pr_fmt(fmt)		"habanalabs: " fmt
10 
11 #include "habanalabs.h"
12 #include "../include/hw_ip/pci/pci_general.h"
13 
14 #include <linux/pci.h>
15 #include <linux/module.h>
16 #include <linux/vmalloc.h>
17 
18 #define CREATE_TRACE_POINTS
19 #include <trace/events/habanalabs.h>
20 
21 #define HL_DRIVER_AUTHOR	"HabanaLabs Kernel Driver Team"
22 
23 #define HL_DRIVER_DESC		"Driver for HabanaLabs's AI Accelerators"
24 
25 MODULE_AUTHOR(HL_DRIVER_AUTHOR);
26 MODULE_DESCRIPTION(HL_DRIVER_DESC);
27 MODULE_LICENSE("GPL v2");
28 
29 static int hl_major;
30 static struct class *hl_class;
31 static DEFINE_IDR(hl_devs_idr);
32 static DEFINE_MUTEX(hl_devs_idr_lock);
33 
34 #define HL_DEFAULT_TIMEOUT_LOCKED	30	/* 30 seconds */
35 #define GAUDI_DEFAULT_TIMEOUT_LOCKED	600	/* 10 minutes */
36 
37 static int timeout_locked = HL_DEFAULT_TIMEOUT_LOCKED;
38 static int reset_on_lockup = 1;
39 static int memory_scrub;
40 static ulong boot_error_status_mask = ULONG_MAX;
41 
42 module_param(timeout_locked, int, 0444);
43 MODULE_PARM_DESC(timeout_locked,
44 	"Device lockup timeout in seconds (0 = disabled, default 30s)");
45 
46 module_param(reset_on_lockup, int, 0444);
47 MODULE_PARM_DESC(reset_on_lockup,
48 	"Do device reset on lockup (0 = no, 1 = yes, default yes)");
49 
50 module_param(memory_scrub, int, 0444);
51 MODULE_PARM_DESC(memory_scrub,
52 	"Scrub device memory in various states (0 = no, 1 = yes, default no)");
53 
54 module_param(boot_error_status_mask, ulong, 0444);
55 MODULE_PARM_DESC(boot_error_status_mask,
56 	"Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
57 
58 #define PCI_IDS_GOYA			0x0001
59 #define PCI_IDS_GAUDI			0x1000
60 #define PCI_IDS_GAUDI_SEC		0x1010
61 
62 #define PCI_IDS_GAUDI2			0x1020
63 
64 static const struct pci_device_id ids[] = {
65 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
66 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
67 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), },
68 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI2), },
69 	{ 0, }
70 };
71 MODULE_DEVICE_TABLE(pci, ids);
72 
73 /*
74  * get_asic_type - translate device id to asic type
75  *
76  * @hdev: pointer to habanalabs device structure.
77  *
78  * Translate device id and revision id to asic type.
79  * In case of unidentified device, return -1
80  */
get_asic_type(struct hl_device * hdev)81 static enum hl_asic_type get_asic_type(struct hl_device *hdev)
82 {
83 	struct pci_dev *pdev = hdev->pdev;
84 	enum hl_asic_type asic_type = ASIC_INVALID;
85 
86 	switch (pdev->device) {
87 	case PCI_IDS_GOYA:
88 		asic_type = ASIC_GOYA;
89 		break;
90 	case PCI_IDS_GAUDI:
91 		asic_type = ASIC_GAUDI;
92 		break;
93 	case PCI_IDS_GAUDI_SEC:
94 		asic_type = ASIC_GAUDI_SEC;
95 		break;
96 	case PCI_IDS_GAUDI2:
97 		switch (pdev->revision) {
98 		case REV_ID_A:
99 			asic_type = ASIC_GAUDI2;
100 			break;
101 		case REV_ID_B:
102 			asic_type = ASIC_GAUDI2B;
103 			break;
104 		case REV_ID_C:
105 			asic_type = ASIC_GAUDI2C;
106 			break;
107 		default:
108 			break;
109 		}
110 		break;
111 	default:
112 		break;
113 	}
114 
115 	return asic_type;
116 }
117 
is_asic_secured(enum hl_asic_type asic_type)118 static bool is_asic_secured(enum hl_asic_type asic_type)
119 {
120 	switch (asic_type) {
121 	case ASIC_GAUDI_SEC:
122 		return true;
123 	default:
124 		return false;
125 	}
126 }
127 
128 /*
129  * hl_device_open - open function for habanalabs device
130  *
131  * @inode: pointer to inode structure
132  * @filp: pointer to file structure
133  *
134  * Called when process opens an habanalabs device.
135  */
hl_device_open(struct inode * inode,struct file * filp)136 int hl_device_open(struct inode *inode, struct file *filp)
137 {
138 	enum hl_device_status status;
139 	struct hl_device *hdev;
140 	struct hl_fpriv *hpriv;
141 	int rc;
142 
143 	mutex_lock(&hl_devs_idr_lock);
144 	hdev = idr_find(&hl_devs_idr, iminor(inode));
145 	mutex_unlock(&hl_devs_idr_lock);
146 
147 	if (!hdev) {
148 		pr_err("Couldn't find device %d:%d\n",
149 			imajor(inode), iminor(inode));
150 		return -ENXIO;
151 	}
152 
153 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
154 	if (!hpriv)
155 		return -ENOMEM;
156 
157 	hpriv->hdev = hdev;
158 	filp->private_data = hpriv;
159 	hpriv->filp = filp;
160 
161 	mutex_init(&hpriv->notifier_event.lock);
162 	mutex_init(&hpriv->restore_phase_mutex);
163 	mutex_init(&hpriv->ctx_lock);
164 	kref_init(&hpriv->refcount);
165 	nonseekable_open(inode, filp);
166 
167 	hl_ctx_mgr_init(&hpriv->ctx_mgr);
168 	hl_mem_mgr_init(hpriv->hdev->dev, &hpriv->mem_mgr);
169 
170 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
171 
172 	mutex_lock(&hdev->fpriv_list_lock);
173 
174 	if (!hl_device_operational(hdev, &status)) {
175 		dev_dbg_ratelimited(hdev->dev,
176 			"Can't open %s because it is %s\n",
177 			dev_name(hdev->dev), hdev->status[status]);
178 
179 		if (status == HL_DEVICE_STATUS_IN_RESET ||
180 					status == HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE)
181 			rc = -EAGAIN;
182 		else
183 			rc = -EPERM;
184 
185 		goto out_err;
186 	}
187 
188 	if (hdev->is_in_dram_scrub) {
189 		dev_dbg_ratelimited(hdev->dev,
190 			"Can't open %s during dram scrub\n",
191 			dev_name(hdev->dev));
192 		rc = -EAGAIN;
193 		goto out_err;
194 	}
195 
196 	if (hdev->compute_ctx_in_release) {
197 		dev_dbg_ratelimited(hdev->dev,
198 			"Can't open %s because another user is still releasing it\n",
199 			dev_name(hdev->dev));
200 		rc = -EAGAIN;
201 		goto out_err;
202 	}
203 
204 	if (hdev->is_compute_ctx_active) {
205 		dev_dbg_ratelimited(hdev->dev,
206 			"Can't open %s because another user is working on it\n",
207 			dev_name(hdev->dev));
208 		rc = -EBUSY;
209 		goto out_err;
210 	}
211 
212 	rc = hl_ctx_create(hdev, hpriv);
213 	if (rc) {
214 		dev_err(hdev->dev, "Failed to create context %d\n", rc);
215 		goto out_err;
216 	}
217 
218 	list_add(&hpriv->dev_node, &hdev->fpriv_list);
219 	mutex_unlock(&hdev->fpriv_list_lock);
220 
221 	hdev->asic_funcs->send_device_activity(hdev, true);
222 
223 	hl_debugfs_add_file(hpriv);
224 
225 	hl_enable_err_info_capture(&hdev->captured_err_info);
226 
227 	hdev->open_counter++;
228 	hdev->last_successful_open_jif = jiffies;
229 	hdev->last_successful_open_ktime = ktime_get();
230 
231 	return 0;
232 
233 out_err:
234 	mutex_unlock(&hdev->fpriv_list_lock);
235 	hl_mem_mgr_fini(&hpriv->mem_mgr);
236 	hl_mem_mgr_idr_destroy(&hpriv->mem_mgr);
237 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
238 	filp->private_data = NULL;
239 	mutex_destroy(&hpriv->ctx_lock);
240 	mutex_destroy(&hpriv->restore_phase_mutex);
241 	mutex_destroy(&hpriv->notifier_event.lock);
242 	put_pid(hpriv->taskpid);
243 
244 	kfree(hpriv);
245 
246 	return rc;
247 }
248 
hl_device_open_ctrl(struct inode * inode,struct file * filp)249 int hl_device_open_ctrl(struct inode *inode, struct file *filp)
250 {
251 	struct hl_device *hdev;
252 	struct hl_fpriv *hpriv;
253 	int rc;
254 
255 	mutex_lock(&hl_devs_idr_lock);
256 	hdev = idr_find(&hl_devs_idr, iminor(inode));
257 	mutex_unlock(&hl_devs_idr_lock);
258 
259 	if (!hdev) {
260 		pr_err("Couldn't find device %d:%d\n",
261 			imajor(inode), iminor(inode));
262 		return -ENXIO;
263 	}
264 
265 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
266 	if (!hpriv)
267 		return -ENOMEM;
268 
269 	/* Prevent other routines from reading partial hpriv data by
270 	 * initializing hpriv fields before inserting it to the list
271 	 */
272 	hpriv->hdev = hdev;
273 	filp->private_data = hpriv;
274 	hpriv->filp = filp;
275 
276 	mutex_init(&hpriv->notifier_event.lock);
277 	nonseekable_open(inode, filp);
278 
279 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
280 
281 	mutex_lock(&hdev->fpriv_ctrl_list_lock);
282 
283 	if (!hl_ctrl_device_operational(hdev, NULL)) {
284 		dev_dbg_ratelimited(hdev->dev_ctrl,
285 			"Can't open %s because it is disabled\n",
286 			dev_name(hdev->dev_ctrl));
287 		rc = -EPERM;
288 		goto out_err;
289 	}
290 
291 	list_add(&hpriv->dev_node, &hdev->fpriv_ctrl_list);
292 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
293 
294 	return 0;
295 
296 out_err:
297 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
298 	filp->private_data = NULL;
299 	put_pid(hpriv->taskpid);
300 
301 	kfree(hpriv);
302 
303 	return rc;
304 }
305 
set_driver_behavior_per_device(struct hl_device * hdev)306 static void set_driver_behavior_per_device(struct hl_device *hdev)
307 {
308 	hdev->nic_ports_mask = 0;
309 	hdev->fw_components = FW_TYPE_ALL_TYPES;
310 	hdev->cpu_queues_enable = 1;
311 	hdev->pldm = 0;
312 	hdev->hard_reset_on_fw_events = 1;
313 	hdev->bmc_enable = 1;
314 	hdev->reset_on_preboot_fail = 1;
315 	hdev->heartbeat = 1;
316 }
317 
copy_kernel_module_params_to_device(struct hl_device * hdev)318 static void copy_kernel_module_params_to_device(struct hl_device *hdev)
319 {
320 	hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type);
321 
322 	hdev->major = hl_major;
323 	hdev->hclass = hl_class;
324 	hdev->memory_scrub = memory_scrub;
325 	hdev->reset_on_lockup = reset_on_lockup;
326 	hdev->boot_error_status_mask = boot_error_status_mask;
327 }
328 
fixup_device_params_per_asic(struct hl_device * hdev,int timeout)329 static void fixup_device_params_per_asic(struct hl_device *hdev, int timeout)
330 {
331 	switch (hdev->asic_type) {
332 	case ASIC_GAUDI:
333 	case ASIC_GAUDI_SEC:
334 		/* If user didn't request a different timeout than the default one, we have
335 		 * a different default timeout for Gaudi
336 		 */
337 		if (timeout == HL_DEFAULT_TIMEOUT_LOCKED)
338 			hdev->timeout_jiffies = msecs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED *
339 										MSEC_PER_SEC);
340 
341 		hdev->reset_upon_device_release = 0;
342 		break;
343 
344 	case ASIC_GOYA:
345 		hdev->reset_upon_device_release = 0;
346 		break;
347 
348 	default:
349 		hdev->reset_upon_device_release = 1;
350 		break;
351 	}
352 }
353 
fixup_device_params(struct hl_device * hdev)354 static int fixup_device_params(struct hl_device *hdev)
355 {
356 	int tmp_timeout;
357 
358 	tmp_timeout = timeout_locked;
359 
360 	hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
361 	hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
362 
363 	if (tmp_timeout)
364 		hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * MSEC_PER_SEC);
365 	else
366 		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
367 
368 	hdev->stop_on_err = true;
369 	hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
370 	hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
371 
372 	/* Enable only after the initialization of the device */
373 	hdev->disabled = true;
374 
375 	if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU) &&
376 			(hdev->fw_components & ~FW_TYPE_PREBOOT_CPU)) {
377 		pr_err("Preboot must be set along with other components");
378 		return -EINVAL;
379 	}
380 
381 	/* If CPU queues not enabled, no way to do heartbeat */
382 	if (!hdev->cpu_queues_enable)
383 		hdev->heartbeat = 0;
384 	fixup_device_params_per_asic(hdev, tmp_timeout);
385 
386 	return 0;
387 }
388 
389 /**
390  * create_hdev - create habanalabs device instance
391  *
392  * @dev: will hold the pointer to the new habanalabs device structure
393  * @pdev: pointer to the pci device
394  *
395  * Allocate memory for habanalabs device and initialize basic fields
396  * Identify the ASIC type
397  * Allocate ID (minor) for the device (only for real devices)
398  */
create_hdev(struct hl_device ** dev,struct pci_dev * pdev)399 static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
400 {
401 	int main_id, ctrl_id = 0, rc = 0;
402 	struct hl_device *hdev;
403 
404 	*dev = NULL;
405 
406 	hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
407 	if (!hdev)
408 		return -ENOMEM;
409 
410 	/* Will be NULL in case of simulator device */
411 	hdev->pdev = pdev;
412 
413 	/* Assign status description string */
414 	strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX);
415 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX);
416 	strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX);
417 	strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX);
418 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
419 					"in device creation", HL_STR_MAX);
420 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE],
421 					"in reset after device release", HL_STR_MAX);
422 
423 
424 	/* First, we must find out which ASIC are we handling. This is needed
425 	 * to configure the behavior of the driver (kernel parameters)
426 	 */
427 	hdev->asic_type = get_asic_type(hdev);
428 	if (hdev->asic_type == ASIC_INVALID) {
429 		dev_err(&pdev->dev, "Unsupported ASIC\n");
430 		rc = -ENODEV;
431 		goto free_hdev;
432 	}
433 
434 	copy_kernel_module_params_to_device(hdev);
435 
436 	set_driver_behavior_per_device(hdev);
437 
438 	fixup_device_params(hdev);
439 
440 	mutex_lock(&hl_devs_idr_lock);
441 
442 	/* Always save 2 numbers, 1 for main device and 1 for control.
443 	 * They must be consecutive
444 	 */
445 	main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL);
446 
447 	if (main_id >= 0)
448 		ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
449 					main_id + 2, GFP_KERNEL);
450 
451 	mutex_unlock(&hl_devs_idr_lock);
452 
453 	if ((main_id < 0) || (ctrl_id < 0)) {
454 		if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
455 			pr_err("too many devices in the system\n");
456 
457 		if (main_id >= 0) {
458 			mutex_lock(&hl_devs_idr_lock);
459 			idr_remove(&hl_devs_idr, main_id);
460 			mutex_unlock(&hl_devs_idr_lock);
461 		}
462 
463 		rc = -EBUSY;
464 		goto free_hdev;
465 	}
466 
467 	hdev->id = main_id;
468 	hdev->id_control = ctrl_id;
469 
470 	*dev = hdev;
471 
472 	return 0;
473 
474 free_hdev:
475 	kfree(hdev);
476 	return rc;
477 }
478 
479 /*
480  * destroy_hdev - destroy habanalabs device instance
481  *
482  * @dev: pointer to the habanalabs device structure
483  *
484  */
destroy_hdev(struct hl_device * hdev)485 static void destroy_hdev(struct hl_device *hdev)
486 {
487 	/* Remove device from the device list */
488 	mutex_lock(&hl_devs_idr_lock);
489 	idr_remove(&hl_devs_idr, hdev->id);
490 	idr_remove(&hl_devs_idr, hdev->id_control);
491 	mutex_unlock(&hl_devs_idr_lock);
492 
493 	kfree(hdev);
494 }
495 
/*
 * hl_pmops_suspend - power management suspend callback
 *
 * @dev: generic device whose driver data is the habanalabs device
 *
 * Return: the result of hl_device_suspend(), or 0 when no habanalabs device
 * is attached to @dev.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to suspend PCI device\n");

	if (hdev)
		return hl_device_suspend(hdev);

	pr_err("device pointer is NULL in suspend\n");

	return 0;
}
509 
/*
 * hl_pmops_resume - power management resume callback
 *
 * @dev: generic device whose driver data is the habanalabs device
 *
 * Return: the result of hl_device_resume(), or 0 when no habanalabs device
 * is attached to @dev.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to resume PCI device\n");

	if (hdev)
		return hl_device_resume(hdev);

	pr_err("device pointer is NULL in resume\n");

	return 0;
}
523 
524 /**
525  * hl_pci_probe - probe PCI habanalabs devices
526  *
527  * @pdev: pointer to pci device
528  * @id: pointer to pci device id structure
529  *
530  * Standard PCI probe function for habanalabs device.
531  * Create a new habanalabs device and initialize it according to the
532  * device's type
533  */
hl_pci_probe(struct pci_dev * pdev,const struct pci_device_id * id)534 static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
535 {
536 	struct hl_device *hdev;
537 	int rc;
538 
539 	dev_info(&pdev->dev, HL_NAME
540 		 " device found [%04x:%04x] (rev %x)\n",
541 		 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
542 
543 	rc = create_hdev(&hdev, pdev);
544 	if (rc)
545 		return rc;
546 
547 	pci_set_drvdata(pdev, hdev);
548 
549 	rc = hl_device_init(hdev);
550 	if (rc) {
551 		dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
552 		rc = -ENODEV;
553 		goto disable_device;
554 	}
555 
556 	return 0;
557 
558 disable_device:
559 	pci_set_drvdata(pdev, NULL);
560 	destroy_hdev(hdev);
561 
562 	return rc;
563 }
564 
565 /*
566  * hl_pci_remove - remove PCI habanalabs devices
567  *
568  * @pdev: pointer to pci device
569  *
570  * Standard PCI remove function for habanalabs device
571  */
hl_pci_remove(struct pci_dev * pdev)572 static void hl_pci_remove(struct pci_dev *pdev)
573 {
574 	struct hl_device *hdev;
575 
576 	hdev = pci_get_drvdata(pdev);
577 	if (!hdev)
578 		return;
579 
580 	hl_device_fini(hdev);
581 	pci_set_drvdata(pdev, NULL);
582 	destroy_hdev(hdev);
583 }
584 
585 /**
586  * hl_pci_err_detected - a PCI bus error detected on this device
587  *
588  * @pdev: pointer to pci device
589  * @state: PCI error type
590  *
591  * Called by the PCI subsystem whenever a non-correctable
592  * PCI bus error is detected
593  */
594 static pci_ers_result_t
hl_pci_err_detected(struct pci_dev * pdev,pci_channel_state_t state)595 hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
596 {
597 	struct hl_device *hdev = pci_get_drvdata(pdev);
598 	enum pci_ers_result result;
599 
600 	switch (state) {
601 	case pci_channel_io_normal:
602 		dev_warn(hdev->dev, "PCI normal state error detected\n");
603 		return PCI_ERS_RESULT_CAN_RECOVER;
604 
605 	case pci_channel_io_frozen:
606 		dev_warn(hdev->dev, "PCI frozen state error detected\n");
607 		result = PCI_ERS_RESULT_NEED_RESET;
608 		break;
609 
610 	case pci_channel_io_perm_failure:
611 		dev_warn(hdev->dev, "PCI failure state error detected\n");
612 		result = PCI_ERS_RESULT_DISCONNECT;
613 		break;
614 
615 	default:
616 		result = PCI_ERS_RESULT_NONE;
617 	}
618 
619 	hdev->asic_funcs->halt_engines(hdev, true, false);
620 
621 	return result;
622 }
623 
624 /**
625  * hl_pci_err_resume - resume after a PCI slot reset
626  *
627  * @pdev: pointer to pci device
628  *
629  */
hl_pci_err_resume(struct pci_dev * pdev)630 static void hl_pci_err_resume(struct pci_dev *pdev)
631 {
632 	struct hl_device *hdev = pci_get_drvdata(pdev);
633 
634 	dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
635 	hl_device_resume(hdev);
636 }
637 
638 /**
639  * hl_pci_err_slot_reset - a PCI slot reset has just happened
640  *
641  * @pdev: pointer to pci device
642  *
643  * Determine if the driver can recover from the PCI slot reset
644  */
hl_pci_err_slot_reset(struct pci_dev * pdev)645 static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
646 {
647 	struct hl_device *hdev = pci_get_drvdata(pdev);
648 
649 	dev_warn(hdev->dev, "PCI slot reset detected\n");
650 
651 	return PCI_ERS_RESULT_RECOVERED;
652 }
653 
654 static const struct dev_pm_ops hl_pm_ops = {
655 	.suspend = hl_pmops_suspend,
656 	.resume = hl_pmops_resume,
657 };
658 
659 static const struct pci_error_handlers hl_pci_err_handler = {
660 	.error_detected = hl_pci_err_detected,
661 	.slot_reset = hl_pci_err_slot_reset,
662 	.resume = hl_pci_err_resume,
663 };
664 
665 static struct pci_driver hl_pci_driver = {
666 	.name = HL_NAME,
667 	.id_table = ids,
668 	.probe = hl_pci_probe,
669 	.remove = hl_pci_remove,
670 	.shutdown = hl_pci_remove,
671 	.driver = {
672 		.name = HL_NAME,
673 		.pm = &hl_pm_ops,
674 		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
675 	},
676 	.err_handler = &hl_pci_err_handler,
677 };
678 
679 /*
680  * hl_init - Initialize the habanalabs kernel driver
681  */
hl_init(void)682 static int __init hl_init(void)
683 {
684 	int rc;
685 	dev_t dev;
686 
687 	pr_info("loading driver\n");
688 
689 	rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
690 	if (rc < 0) {
691 		pr_err("unable to get major\n");
692 		return rc;
693 	}
694 
695 	hl_major = MAJOR(dev);
696 
697 	hl_class = class_create(HL_NAME);
698 	if (IS_ERR(hl_class)) {
699 		pr_err("failed to allocate class\n");
700 		rc = PTR_ERR(hl_class);
701 		goto remove_major;
702 	}
703 
704 	hl_debugfs_init();
705 
706 	rc = pci_register_driver(&hl_pci_driver);
707 	if (rc) {
708 		pr_err("failed to register pci device\n");
709 		goto remove_debugfs;
710 	}
711 
712 	pr_debug("driver loaded\n");
713 
714 	return 0;
715 
716 remove_debugfs:
717 	hl_debugfs_fini();
718 	class_destroy(hl_class);
719 remove_major:
720 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
721 	return rc;
722 }
723 
724 /*
725  * hl_exit - Release all resources of the habanalabs kernel driver
726  */
hl_exit(void)727 static void __exit hl_exit(void)
728 {
729 	pci_unregister_driver(&hl_pci_driver);
730 
731 	/*
732 	 * Removing debugfs must be after all devices or simulator devices
733 	 * have been removed because otherwise we get a bug in the
734 	 * debugfs module for referencing NULL objects
735 	 */
736 	hl_debugfs_fini();
737 
738 	class_destroy(hl_class);
739 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
740 
741 	idr_destroy(&hl_devs_idr);
742 
743 	pr_debug("driver removed\n");
744 }
745 
746 module_init(hl_init);
747 module_exit(hl_exit);
748