xref: /openbmc/linux/drivers/accel/habanalabs/common/habanalabs_drv.c (revision 7bd571b274fd15e0e7dc3d79d104f32928010eff)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * Copyright 2016-2021 HabanaLabs, Ltd.
5  * All Rights Reserved.
6  *
7  */
8 
9 #define pr_fmt(fmt)		"habanalabs: " fmt
10 
11 #include "habanalabs.h"
12 #include "../include/hw_ip/pci/pci_general.h"
13 
14 #include <linux/pci.h>
15 #include <linux/aer.h>
16 #include <linux/module.h>
17 
18 #define CREATE_TRACE_POINTS
19 #include <trace/events/habanalabs.h>
20 
/* Strings reported through the MODULE_AUTHOR / MODULE_DESCRIPTION tags below */
#define HL_DRIVER_AUTHOR	"HabanaLabs Kernel Driver Team"

#define HL_DRIVER_DESC		"Driver for HabanaLabs's AI Accelerators"

MODULE_AUTHOR(HL_DRIVER_AUTHOR);
MODULE_DESCRIPTION(HL_DRIVER_DESC);
MODULE_LICENSE("GPL v2");

/* Char device major number; assigned in hl_init() */
static int hl_major;
/* Device class; created in hl_init() and handed to hl_device_init() */
static struct class *hl_class;
/*
 * Maps an id (looked up by minor number in the open routines) to its
 * struct hl_device. All accesses are done under hl_devs_idr_lock.
 */
static DEFINE_IDR(hl_devs_idr);
static DEFINE_MUTEX(hl_devs_idr_lock);

/*
 * Default lockup timeouts, in seconds. The Gaudi value replaces the generic
 * one only when the user left timeout_locked at its default
 * (see fixup_device_params_per_asic()).
 */
#define HL_DEFAULT_TIMEOUT_LOCKED	30	/* 30 seconds */
#define GAUDI_DEFAULT_TIMEOUT_LOCKED	600	/* 10 minutes */

/* Backing storage for the module parameters declared below */
static int timeout_locked = HL_DEFAULT_TIMEOUT_LOCKED;
static int reset_on_lockup = 1;
static int memory_scrub;
static ulong boot_error_status_mask = ULONG_MAX;

/* All parameters are registered with 0444 permissions */
module_param(timeout_locked, int, 0444);
MODULE_PARM_DESC(timeout_locked,
	"Device lockup timeout in seconds (0 = disabled, default 30s)");

module_param(reset_on_lockup, int, 0444);
MODULE_PARM_DESC(reset_on_lockup,
	"Do device reset on lockup (0 = no, 1 = yes, default yes)");

module_param(memory_scrub, int, 0444);
MODULE_PARM_DESC(memory_scrub,
	"Scrub device memory in various states (0 = no, 1 = yes, default no)");

module_param(boot_error_status_mask, ulong, 0444);
MODULE_PARM_DESC(boot_error_status_mask,
	"Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
57 
/* PCI vendor id shared by every device this driver binds to */
#define PCI_VENDOR_ID_HABANALABS	0x1da3

/* PCI device ids; get_asic_type() translates them to an ASIC type */
#define PCI_IDS_GOYA			0x0001
#define PCI_IDS_GAUDI			0x1000
#define PCI_IDS_GAUDI_SEC		0x1010

#define PCI_IDS_GAUDI2			0x1020

/* Match table used by hl_pci_driver; terminated by an all-zero entry */
static const struct pci_device_id ids[] = {
	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), },
	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI2), },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, ids);
74 
75 /*
76  * get_asic_type - translate device id to asic type
77  *
78  * @hdev: pointer to habanalabs device structure.
79  *
80  * Translate device id and revision id to asic type.
81  * In case of unidentified device, return -1
82  */
83 static enum hl_asic_type get_asic_type(struct hl_device *hdev)
84 {
85 	struct pci_dev *pdev = hdev->pdev;
86 	enum hl_asic_type asic_type = ASIC_INVALID;
87 
88 	switch (pdev->device) {
89 	case PCI_IDS_GOYA:
90 		asic_type = ASIC_GOYA;
91 		break;
92 	case PCI_IDS_GAUDI:
93 		asic_type = ASIC_GAUDI;
94 		break;
95 	case PCI_IDS_GAUDI_SEC:
96 		asic_type = ASIC_GAUDI_SEC;
97 		break;
98 	case PCI_IDS_GAUDI2:
99 		switch (pdev->revision) {
100 		case REV_ID_A:
101 			asic_type = ASIC_GAUDI2;
102 			break;
103 		case REV_ID_B:
104 			asic_type = ASIC_GAUDI2B;
105 			break;
106 		default:
107 			break;
108 		}
109 		break;
110 	default:
111 		break;
112 	}
113 
114 	return asic_type;
115 }
116 
117 static bool is_asic_secured(enum hl_asic_type asic_type)
118 {
119 	switch (asic_type) {
120 	case ASIC_GAUDI_SEC:
121 		return true;
122 	default:
123 		return false;
124 	}
125 }
126 
127 /*
128  * hl_device_open - open function for habanalabs device
129  *
130  * @inode: pointer to inode structure
131  * @filp: pointer to file structure
132  *
133  * Called when process opens an habanalabs device.
134  */
135 int hl_device_open(struct inode *inode, struct file *filp)
136 {
137 	enum hl_device_status status;
138 	struct hl_device *hdev;
139 	struct hl_fpriv *hpriv;
140 	int rc;
141 
142 	mutex_lock(&hl_devs_idr_lock);
143 	hdev = idr_find(&hl_devs_idr, iminor(inode));
144 	mutex_unlock(&hl_devs_idr_lock);
145 
146 	if (!hdev) {
147 		pr_err("Couldn't find device %d:%d\n",
148 			imajor(inode), iminor(inode));
149 		return -ENXIO;
150 	}
151 
152 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
153 	if (!hpriv)
154 		return -ENOMEM;
155 
156 	hpriv->hdev = hdev;
157 	filp->private_data = hpriv;
158 	hpriv->filp = filp;
159 
160 	mutex_init(&hpriv->notifier_event.lock);
161 	mutex_init(&hpriv->restore_phase_mutex);
162 	mutex_init(&hpriv->ctx_lock);
163 	kref_init(&hpriv->refcount);
164 	nonseekable_open(inode, filp);
165 
166 	hl_ctx_mgr_init(&hpriv->ctx_mgr);
167 	hl_mem_mgr_init(hpriv->hdev->dev, &hpriv->mem_mgr);
168 
169 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
170 
171 	mutex_lock(&hdev->fpriv_list_lock);
172 
173 	if (!hl_device_operational(hdev, &status)) {
174 		dev_dbg_ratelimited(hdev->dev,
175 			"Can't open %s because it is %s\n",
176 			dev_name(hdev->dev), hdev->status[status]);
177 
178 		if (status == HL_DEVICE_STATUS_IN_RESET ||
179 					status == HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE)
180 			rc = -EAGAIN;
181 		else
182 			rc = -EPERM;
183 
184 		goto out_err;
185 	}
186 
187 	if (hdev->is_in_dram_scrub) {
188 		dev_dbg_ratelimited(hdev->dev,
189 			"Can't open %s during dram scrub\n",
190 			dev_name(hdev->dev));
191 		rc = -EAGAIN;
192 		goto out_err;
193 	}
194 
195 	if (hdev->compute_ctx_in_release) {
196 		dev_dbg_ratelimited(hdev->dev,
197 			"Can't open %s because another user is still releasing it\n",
198 			dev_name(hdev->dev));
199 		rc = -EAGAIN;
200 		goto out_err;
201 	}
202 
203 	if (hdev->is_compute_ctx_active) {
204 		dev_dbg_ratelimited(hdev->dev,
205 			"Can't open %s because another user is working on it\n",
206 			dev_name(hdev->dev));
207 		rc = -EBUSY;
208 		goto out_err;
209 	}
210 
211 	rc = hl_ctx_create(hdev, hpriv);
212 	if (rc) {
213 		dev_err(hdev->dev, "Failed to create context %d\n", rc);
214 		goto out_err;
215 	}
216 
217 	list_add(&hpriv->dev_node, &hdev->fpriv_list);
218 	mutex_unlock(&hdev->fpriv_list_lock);
219 
220 	hdev->asic_funcs->send_device_activity(hdev, true);
221 
222 	hl_debugfs_add_file(hpriv);
223 
224 	atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1);
225 	atomic_set(&hdev->captured_err_info.razwi_info.razwi_detected, 0);
226 	atomic_set(&hdev->captured_err_info.page_fault_info.page_fault_detected, 0);
227 	hdev->captured_err_info.undef_opcode.write_enable = true;
228 	hdev->captured_err_info.razwi_info.razwi_info_available = false;
229 	hdev->captured_err_info.page_fault_info.page_fault_info_available = false;
230 
231 	hdev->open_counter++;
232 	hdev->last_successful_open_jif = jiffies;
233 	hdev->last_successful_open_ktime = ktime_get();
234 
235 	return 0;
236 
237 out_err:
238 	mutex_unlock(&hdev->fpriv_list_lock);
239 	hl_mem_mgr_fini(&hpriv->mem_mgr);
240 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
241 	filp->private_data = NULL;
242 	mutex_destroy(&hpriv->ctx_lock);
243 	mutex_destroy(&hpriv->restore_phase_mutex);
244 	mutex_destroy(&hpriv->notifier_event.lock);
245 	put_pid(hpriv->taskpid);
246 
247 	kfree(hpriv);
248 
249 	return rc;
250 }
251 
252 int hl_device_open_ctrl(struct inode *inode, struct file *filp)
253 {
254 	struct hl_device *hdev;
255 	struct hl_fpriv *hpriv;
256 	int rc;
257 
258 	mutex_lock(&hl_devs_idr_lock);
259 	hdev = idr_find(&hl_devs_idr, iminor(inode));
260 	mutex_unlock(&hl_devs_idr_lock);
261 
262 	if (!hdev) {
263 		pr_err("Couldn't find device %d:%d\n",
264 			imajor(inode), iminor(inode));
265 		return -ENXIO;
266 	}
267 
268 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
269 	if (!hpriv)
270 		return -ENOMEM;
271 
272 	/* Prevent other routines from reading partial hpriv data by
273 	 * initializing hpriv fields before inserting it to the list
274 	 */
275 	hpriv->hdev = hdev;
276 	filp->private_data = hpriv;
277 	hpriv->filp = filp;
278 
279 	mutex_init(&hpriv->notifier_event.lock);
280 	nonseekable_open(inode, filp);
281 
282 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
283 
284 	mutex_lock(&hdev->fpriv_ctrl_list_lock);
285 
286 	if (!hl_ctrl_device_operational(hdev, NULL)) {
287 		dev_dbg_ratelimited(hdev->dev_ctrl,
288 			"Can't open %s because it is disabled\n",
289 			dev_name(hdev->dev_ctrl));
290 		rc = -EPERM;
291 		goto out_err;
292 	}
293 
294 	list_add(&hpriv->dev_node, &hdev->fpriv_ctrl_list);
295 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
296 
297 	return 0;
298 
299 out_err:
300 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
301 	filp->private_data = NULL;
302 	put_pid(hpriv->taskpid);
303 
304 	kfree(hpriv);
305 
306 	return rc;
307 }
308 
309 static void set_driver_behavior_per_device(struct hl_device *hdev)
310 {
311 	hdev->nic_ports_mask = 0;
312 	hdev->fw_components = FW_TYPE_ALL_TYPES;
313 	hdev->mmu_enable = MMU_EN_ALL;
314 	hdev->cpu_queues_enable = 1;
315 	hdev->pldm = 0;
316 	hdev->hard_reset_on_fw_events = 1;
317 	hdev->bmc_enable = 1;
318 	hdev->reset_on_preboot_fail = 1;
319 	hdev->heartbeat = 1;
320 }
321 
322 static void copy_kernel_module_params_to_device(struct hl_device *hdev)
323 {
324 	hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type);
325 
326 	hdev->major = hl_major;
327 	hdev->memory_scrub = memory_scrub;
328 	hdev->reset_on_lockup = reset_on_lockup;
329 	hdev->boot_error_status_mask = boot_error_status_mask;
330 }
331 
332 static void fixup_device_params_per_asic(struct hl_device *hdev, int timeout)
333 {
334 	switch (hdev->asic_type) {
335 	case ASIC_GAUDI:
336 	case ASIC_GAUDI_SEC:
337 		/* If user didn't request a different timeout than the default one, we have
338 		 * a different default timeout for Gaudi
339 		 */
340 		if (timeout == HL_DEFAULT_TIMEOUT_LOCKED)
341 			hdev->timeout_jiffies = msecs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED *
342 										MSEC_PER_SEC);
343 
344 		hdev->reset_upon_device_release = 0;
345 		break;
346 
347 	case ASIC_GOYA:
348 		hdev->reset_upon_device_release = 0;
349 		break;
350 
351 	default:
352 		hdev->reset_upon_device_release = 1;
353 		break;
354 	}
355 }
356 
357 static int fixup_device_params(struct hl_device *hdev)
358 {
359 	int tmp_timeout;
360 
361 	tmp_timeout = timeout_locked;
362 
363 	hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
364 	hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
365 
366 	if (tmp_timeout)
367 		hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * MSEC_PER_SEC);
368 	else
369 		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
370 
371 	hdev->stop_on_err = true;
372 	hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
373 	hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
374 
375 	/* Enable only after the initialization of the device */
376 	hdev->disabled = true;
377 
378 	if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU) &&
379 			(hdev->fw_components & ~FW_TYPE_PREBOOT_CPU)) {
380 		pr_err("Preboot must be set along with other components");
381 		return -EINVAL;
382 	}
383 
384 	/* If CPU queues not enabled, no way to do heartbeat */
385 	if (!hdev->cpu_queues_enable)
386 		hdev->heartbeat = 0;
387 
388 	fixup_device_params_per_asic(hdev, tmp_timeout);
389 
390 	return 0;
391 }
392 
393 /**
394  * create_hdev - create habanalabs device instance
395  *
396  * @dev: will hold the pointer to the new habanalabs device structure
397  * @pdev: pointer to the pci device
398  *
399  * Allocate memory for habanalabs device and initialize basic fields
400  * Identify the ASIC type
401  * Allocate ID (minor) for the device (only for real devices)
402  */
403 static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
404 {
405 	int main_id, ctrl_id = 0, rc = 0;
406 	struct hl_device *hdev;
407 
408 	*dev = NULL;
409 
410 	hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
411 	if (!hdev)
412 		return -ENOMEM;
413 
414 	/* Will be NULL in case of simulator device */
415 	hdev->pdev = pdev;
416 
417 	/* Assign status description string */
418 	strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX);
419 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX);
420 	strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX);
421 	strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX);
422 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
423 					"in device creation", HL_STR_MAX);
424 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE],
425 					"in reset after device release", HL_STR_MAX);
426 
427 
428 	/* First, we must find out which ASIC are we handling. This is needed
429 	 * to configure the behavior of the driver (kernel parameters)
430 	 */
431 	hdev->asic_type = get_asic_type(hdev);
432 	if (hdev->asic_type == ASIC_INVALID) {
433 		dev_err(&pdev->dev, "Unsupported ASIC\n");
434 		rc = -ENODEV;
435 		goto free_hdev;
436 	}
437 
438 	copy_kernel_module_params_to_device(hdev);
439 
440 	set_driver_behavior_per_device(hdev);
441 
442 	fixup_device_params(hdev);
443 
444 	mutex_lock(&hl_devs_idr_lock);
445 
446 	/* Always save 2 numbers, 1 for main device and 1 for control.
447 	 * They must be consecutive
448 	 */
449 	main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL);
450 
451 	if (main_id >= 0)
452 		ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
453 					main_id + 2, GFP_KERNEL);
454 
455 	mutex_unlock(&hl_devs_idr_lock);
456 
457 	if ((main_id < 0) || (ctrl_id < 0)) {
458 		if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
459 			pr_err("too many devices in the system\n");
460 
461 		if (main_id >= 0) {
462 			mutex_lock(&hl_devs_idr_lock);
463 			idr_remove(&hl_devs_idr, main_id);
464 			mutex_unlock(&hl_devs_idr_lock);
465 		}
466 
467 		rc = -EBUSY;
468 		goto free_hdev;
469 	}
470 
471 	hdev->id = main_id;
472 	hdev->id_control = ctrl_id;
473 
474 	*dev = hdev;
475 
476 	return 0;
477 
478 free_hdev:
479 	kfree(hdev);
480 	return rc;
481 }
482 
483 /*
484  * destroy_hdev - destroy habanalabs device instance
485  *
486  * @dev: pointer to the habanalabs device structure
487  *
488  */
489 static void destroy_hdev(struct hl_device *hdev)
490 {
491 	/* Remove device from the device list */
492 	mutex_lock(&hl_devs_idr_lock);
493 	idr_remove(&hl_devs_idr, hdev->id);
494 	idr_remove(&hl_devs_idr, hdev->id_control);
495 	mutex_unlock(&hl_devs_idr_lock);
496 
497 	kfree(hdev);
498 }
499 
/*
 * PM suspend callback. Forwards to hl_device_suspend(); a device without
 * driver data is reported in the log and treated as success.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to suspend PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_suspend(hdev);

	pr_err("device pointer is NULL in suspend\n");
	return 0;
}
513 
/*
 * PM resume callback. Forwards to hl_device_resume(); a device without
 * driver data is reported in the log and treated as success.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to resume PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_resume(hdev);

	pr_err("device pointer is NULL in resume\n");
	return 0;
}
527 
528 /**
529  * hl_pci_probe - probe PCI habanalabs devices
530  *
531  * @pdev: pointer to pci device
532  * @id: pointer to pci device id structure
533  *
534  * Standard PCI probe function for habanalabs device.
535  * Create a new habanalabs device and initialize it according to the
536  * device's type
537  */
538 static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
539 {
540 	struct hl_device *hdev;
541 	int rc;
542 
543 	dev_info(&pdev->dev, HL_NAME
544 		 " device found [%04x:%04x] (rev %x)\n",
545 		 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
546 
547 	rc = create_hdev(&hdev, pdev);
548 	if (rc)
549 		return rc;
550 
551 	pci_set_drvdata(pdev, hdev);
552 
553 	pci_enable_pcie_error_reporting(pdev);
554 
555 	rc = hl_device_init(hdev, hl_class);
556 	if (rc) {
557 		dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
558 		rc = -ENODEV;
559 		goto disable_device;
560 	}
561 
562 	return 0;
563 
564 disable_device:
565 	pci_disable_pcie_error_reporting(pdev);
566 	pci_set_drvdata(pdev, NULL);
567 	destroy_hdev(hdev);
568 
569 	return rc;
570 }
571 
572 /*
573  * hl_pci_remove - remove PCI habanalabs devices
574  *
575  * @pdev: pointer to pci device
576  *
577  * Standard PCI remove function for habanalabs device
578  */
579 static void hl_pci_remove(struct pci_dev *pdev)
580 {
581 	struct hl_device *hdev;
582 
583 	hdev = pci_get_drvdata(pdev);
584 	if (!hdev)
585 		return;
586 
587 	hl_device_fini(hdev);
588 	pci_disable_pcie_error_reporting(pdev);
589 	pci_set_drvdata(pdev, NULL);
590 	destroy_hdev(hdev);
591 }
592 
593 /**
594  * hl_pci_err_detected - a PCI bus error detected on this device
595  *
596  * @pdev: pointer to pci device
597  * @state: PCI error type
598  *
599  * Called by the PCI subsystem whenever a non-correctable
600  * PCI bus error is detected
601  */
602 static pci_ers_result_t
603 hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
604 {
605 	struct hl_device *hdev = pci_get_drvdata(pdev);
606 	enum pci_ers_result result;
607 
608 	switch (state) {
609 	case pci_channel_io_normal:
610 		dev_warn(hdev->dev, "PCI normal state error detected\n");
611 		return PCI_ERS_RESULT_CAN_RECOVER;
612 
613 	case pci_channel_io_frozen:
614 		dev_warn(hdev->dev, "PCI frozen state error detected\n");
615 		result = PCI_ERS_RESULT_NEED_RESET;
616 		break;
617 
618 	case pci_channel_io_perm_failure:
619 		dev_warn(hdev->dev, "PCI failure state error detected\n");
620 		result = PCI_ERS_RESULT_DISCONNECT;
621 		break;
622 
623 	default:
624 		result = PCI_ERS_RESULT_NONE;
625 	}
626 
627 	hdev->asic_funcs->halt_engines(hdev, true, false);
628 
629 	return result;
630 }
631 
632 /**
633  * hl_pci_err_resume - resume after a PCI slot reset
634  *
635  * @pdev: pointer to pci device
636  *
637  */
638 static void hl_pci_err_resume(struct pci_dev *pdev)
639 {
640 	struct hl_device *hdev = pci_get_drvdata(pdev);
641 
642 	dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
643 	hl_device_resume(hdev);
644 }
645 
646 /**
647  * hl_pci_err_slot_reset - a PCI slot reset has just happened
648  *
649  * @pdev: pointer to pci device
650  *
651  * Determine if the driver can recover from the PCI slot reset
652  */
653 static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
654 {
655 	struct hl_device *hdev = pci_get_drvdata(pdev);
656 
657 	dev_warn(hdev->dev, "PCI slot reset detected\n");
658 
659 	return PCI_ERS_RESULT_RECOVERED;
660 }
661 
/* Power-management callbacks; both forward to the common device code */
static const struct dev_pm_ops hl_pm_ops = {
	.suspend = hl_pmops_suspend,
	.resume = hl_pmops_resume,
};

/* PCI error-recovery callbacks implemented in this file */
static const struct pci_error_handlers hl_pci_err_handler = {
	.error_detected = hl_pci_err_detected,
	.slot_reset = hl_pci_err_slot_reset,
	.resume = hl_pci_err_resume,
};

/*
 * PCI driver descriptor registered in hl_init().
 * Shutdown reuses the remove routine, so the device is torn down the same
 * way in both cases.
 */
static struct pci_driver hl_pci_driver = {
	.name = HL_NAME,
	.id_table = ids,
	.probe = hl_pci_probe,
	.remove = hl_pci_remove,
	.shutdown = hl_pci_remove,
	.driver = {
		.name = HL_NAME,
		.pm = &hl_pm_ops,
		/* Ask the driver core to probe devices asynchronously */
		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
	},
	.err_handler = &hl_pci_err_handler,
};
686 
687 /*
688  * hl_init - Initialize the habanalabs kernel driver
689  */
690 static int __init hl_init(void)
691 {
692 	int rc;
693 	dev_t dev;
694 
695 	pr_info("loading driver\n");
696 
697 	rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
698 	if (rc < 0) {
699 		pr_err("unable to get major\n");
700 		return rc;
701 	}
702 
703 	hl_major = MAJOR(dev);
704 
705 	hl_class = class_create(THIS_MODULE, HL_NAME);
706 	if (IS_ERR(hl_class)) {
707 		pr_err("failed to allocate class\n");
708 		rc = PTR_ERR(hl_class);
709 		goto remove_major;
710 	}
711 
712 	hl_debugfs_init();
713 
714 	rc = pci_register_driver(&hl_pci_driver);
715 	if (rc) {
716 		pr_err("failed to register pci device\n");
717 		goto remove_debugfs;
718 	}
719 
720 	pr_debug("driver loaded\n");
721 
722 	return 0;
723 
724 remove_debugfs:
725 	hl_debugfs_fini();
726 	class_destroy(hl_class);
727 remove_major:
728 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
729 	return rc;
730 }
731 
732 /*
733  * hl_exit - Release all resources of the habanalabs kernel driver
734  */
735 static void __exit hl_exit(void)
736 {
737 	pci_unregister_driver(&hl_pci_driver);
738 
739 	/*
740 	 * Removing debugfs must be after all devices or simulator devices
741 	 * have been removed because otherwise we get a bug in the
742 	 * debugfs module for referencing NULL objects
743 	 */
744 	hl_debugfs_fini();
745 
746 	class_destroy(hl_class);
747 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
748 
749 	idr_destroy(&hl_devs_idr);
750 
751 	pr_debug("driver removed\n");
752 }
753 
754 module_init(hl_init);
755 module_exit(hl_exit);
756