xref: /openbmc/linux/drivers/accel/habanalabs/common/habanalabs_drv.c (revision 901bdf5ea1a836400ee69aa32b04e9c209271ec7)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * Copyright 2016-2021 HabanaLabs, Ltd.
5  * All Rights Reserved.
6  *
7  */
8 
9 #define pr_fmt(fmt)		"habanalabs: " fmt
10 
11 #include "habanalabs.h"
12 #include "../include/hw_ip/pci/pci_general.h"
13 
14 #include <linux/pci.h>
15 #include <linux/module.h>
16 #include <linux/vmalloc.h>
17 
18 #define CREATE_TRACE_POINTS
19 #include <trace/events/habanalabs.h>
20 
21 #define HL_DRIVER_AUTHOR	"HabanaLabs Kernel Driver Team"
22 
23 #define HL_DRIVER_DESC		"Driver for HabanaLabs's AI Accelerators"
24 
25 MODULE_AUTHOR(HL_DRIVER_AUTHOR);
26 MODULE_DESCRIPTION(HL_DRIVER_DESC);
27 MODULE_LICENSE("GPL v2");
28 
29 static int hl_major;
30 static struct class *hl_class;
31 static DEFINE_IDR(hl_devs_idr);
32 static DEFINE_MUTEX(hl_devs_idr_lock);
33 
34 #define HL_DEFAULT_TIMEOUT_LOCKED	30	/* 30 seconds */
35 #define GAUDI_DEFAULT_TIMEOUT_LOCKED	600	/* 10 minutes */
36 
37 static int timeout_locked = HL_DEFAULT_TIMEOUT_LOCKED;
38 static int reset_on_lockup = 1;
39 static int memory_scrub;
40 static ulong boot_error_status_mask = ULONG_MAX;
41 
42 module_param(timeout_locked, int, 0444);
43 MODULE_PARM_DESC(timeout_locked,
44 	"Device lockup timeout in seconds (0 = disabled, default 30s)");
45 
46 module_param(reset_on_lockup, int, 0444);
47 MODULE_PARM_DESC(reset_on_lockup,
48 	"Do device reset on lockup (0 = no, 1 = yes, default yes)");
49 
50 module_param(memory_scrub, int, 0444);
51 MODULE_PARM_DESC(memory_scrub,
52 	"Scrub device memory in various states (0 = no, 1 = yes, default no)");
53 
54 module_param(boot_error_status_mask, ulong, 0444);
55 MODULE_PARM_DESC(boot_error_status_mask,
56 	"Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
57 
58 #define PCI_IDS_GOYA			0x0001
59 #define PCI_IDS_GAUDI			0x1000
60 #define PCI_IDS_GAUDI_SEC		0x1010
61 
62 #define PCI_IDS_GAUDI2			0x1020
63 
64 static const struct pci_device_id ids[] = {
65 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
66 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
67 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), },
68 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI2), },
69 	{ 0, }
70 };
71 MODULE_DEVICE_TABLE(pci, ids);
72 
73 /*
74  * get_asic_type - translate device id to asic type
75  *
76  * @hdev: pointer to habanalabs device structure.
77  *
78  * Translate device id and revision id to asic type.
79  * In case of unidentified device, return -1
80  */
81 static enum hl_asic_type get_asic_type(struct hl_device *hdev)
82 {
83 	struct pci_dev *pdev = hdev->pdev;
84 	enum hl_asic_type asic_type = ASIC_INVALID;
85 
86 	switch (pdev->device) {
87 	case PCI_IDS_GOYA:
88 		asic_type = ASIC_GOYA;
89 		break;
90 	case PCI_IDS_GAUDI:
91 		asic_type = ASIC_GAUDI;
92 		break;
93 	case PCI_IDS_GAUDI_SEC:
94 		asic_type = ASIC_GAUDI_SEC;
95 		break;
96 	case PCI_IDS_GAUDI2:
97 		switch (pdev->revision) {
98 		case REV_ID_A:
99 			asic_type = ASIC_GAUDI2;
100 			break;
101 		case REV_ID_B:
102 			asic_type = ASIC_GAUDI2B;
103 			break;
104 		default:
105 			break;
106 		}
107 		break;
108 	default:
109 		break;
110 	}
111 
112 	return asic_type;
113 }
114 
115 static bool is_asic_secured(enum hl_asic_type asic_type)
116 {
117 	switch (asic_type) {
118 	case ASIC_GAUDI_SEC:
119 		return true;
120 	default:
121 		return false;
122 	}
123 }
124 
125 /*
126  * hl_device_open - open function for habanalabs device
127  *
128  * @inode: pointer to inode structure
129  * @filp: pointer to file structure
130  *
131  * Called when process opens an habanalabs device.
132  */
133 int hl_device_open(struct inode *inode, struct file *filp)
134 {
135 	enum hl_device_status status;
136 	struct hl_device *hdev;
137 	struct hl_fpriv *hpriv;
138 	int rc;
139 
140 	mutex_lock(&hl_devs_idr_lock);
141 	hdev = idr_find(&hl_devs_idr, iminor(inode));
142 	mutex_unlock(&hl_devs_idr_lock);
143 
144 	if (!hdev) {
145 		pr_err("Couldn't find device %d:%d\n",
146 			imajor(inode), iminor(inode));
147 		return -ENXIO;
148 	}
149 
150 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
151 	if (!hpriv)
152 		return -ENOMEM;
153 
154 	hpriv->hdev = hdev;
155 	filp->private_data = hpriv;
156 	hpriv->filp = filp;
157 
158 	mutex_init(&hpriv->notifier_event.lock);
159 	mutex_init(&hpriv->restore_phase_mutex);
160 	mutex_init(&hpriv->ctx_lock);
161 	kref_init(&hpriv->refcount);
162 	nonseekable_open(inode, filp);
163 
164 	hl_ctx_mgr_init(&hpriv->ctx_mgr);
165 	hl_mem_mgr_init(hpriv->hdev->dev, &hpriv->mem_mgr);
166 
167 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
168 
169 	mutex_lock(&hdev->fpriv_list_lock);
170 
171 	if (!hl_device_operational(hdev, &status)) {
172 		dev_dbg_ratelimited(hdev->dev,
173 			"Can't open %s because it is %s\n",
174 			dev_name(hdev->dev), hdev->status[status]);
175 
176 		if (status == HL_DEVICE_STATUS_IN_RESET ||
177 					status == HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE)
178 			rc = -EAGAIN;
179 		else
180 			rc = -EPERM;
181 
182 		goto out_err;
183 	}
184 
185 	if (hdev->is_in_dram_scrub) {
186 		dev_dbg_ratelimited(hdev->dev,
187 			"Can't open %s during dram scrub\n",
188 			dev_name(hdev->dev));
189 		rc = -EAGAIN;
190 		goto out_err;
191 	}
192 
193 	if (hdev->compute_ctx_in_release) {
194 		dev_dbg_ratelimited(hdev->dev,
195 			"Can't open %s because another user is still releasing it\n",
196 			dev_name(hdev->dev));
197 		rc = -EAGAIN;
198 		goto out_err;
199 	}
200 
201 	if (hdev->is_compute_ctx_active) {
202 		dev_dbg_ratelimited(hdev->dev,
203 			"Can't open %s because another user is working on it\n",
204 			dev_name(hdev->dev));
205 		rc = -EBUSY;
206 		goto out_err;
207 	}
208 
209 	rc = hl_ctx_create(hdev, hpriv);
210 	if (rc) {
211 		dev_err(hdev->dev, "Failed to create context %d\n", rc);
212 		goto out_err;
213 	}
214 
215 	list_add(&hpriv->dev_node, &hdev->fpriv_list);
216 	mutex_unlock(&hdev->fpriv_list_lock);
217 
218 	hdev->asic_funcs->send_device_activity(hdev, true);
219 
220 	hl_debugfs_add_file(hpriv);
221 
222 	hl_enable_err_info_capture(&hdev->captured_err_info);
223 
224 	hdev->open_counter++;
225 	hdev->last_successful_open_jif = jiffies;
226 	hdev->last_successful_open_ktime = ktime_get();
227 
228 	return 0;
229 
230 out_err:
231 	mutex_unlock(&hdev->fpriv_list_lock);
232 	hl_mem_mgr_fini(&hpriv->mem_mgr);
233 	hl_mem_mgr_idr_destroy(&hpriv->mem_mgr);
234 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
235 	filp->private_data = NULL;
236 	mutex_destroy(&hpriv->ctx_lock);
237 	mutex_destroy(&hpriv->restore_phase_mutex);
238 	mutex_destroy(&hpriv->notifier_event.lock);
239 	put_pid(hpriv->taskpid);
240 
241 	kfree(hpriv);
242 
243 	return rc;
244 }
245 
246 int hl_device_open_ctrl(struct inode *inode, struct file *filp)
247 {
248 	struct hl_device *hdev;
249 	struct hl_fpriv *hpriv;
250 	int rc;
251 
252 	mutex_lock(&hl_devs_idr_lock);
253 	hdev = idr_find(&hl_devs_idr, iminor(inode));
254 	mutex_unlock(&hl_devs_idr_lock);
255 
256 	if (!hdev) {
257 		pr_err("Couldn't find device %d:%d\n",
258 			imajor(inode), iminor(inode));
259 		return -ENXIO;
260 	}
261 
262 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
263 	if (!hpriv)
264 		return -ENOMEM;
265 
266 	/* Prevent other routines from reading partial hpriv data by
267 	 * initializing hpriv fields before inserting it to the list
268 	 */
269 	hpriv->hdev = hdev;
270 	filp->private_data = hpriv;
271 	hpriv->filp = filp;
272 
273 	mutex_init(&hpriv->notifier_event.lock);
274 	nonseekable_open(inode, filp);
275 
276 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
277 
278 	mutex_lock(&hdev->fpriv_ctrl_list_lock);
279 
280 	if (!hl_ctrl_device_operational(hdev, NULL)) {
281 		dev_dbg_ratelimited(hdev->dev_ctrl,
282 			"Can't open %s because it is disabled\n",
283 			dev_name(hdev->dev_ctrl));
284 		rc = -EPERM;
285 		goto out_err;
286 	}
287 
288 	list_add(&hpriv->dev_node, &hdev->fpriv_ctrl_list);
289 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
290 
291 	return 0;
292 
293 out_err:
294 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
295 	filp->private_data = NULL;
296 	put_pid(hpriv->taskpid);
297 
298 	kfree(hpriv);
299 
300 	return rc;
301 }
302 
303 static void set_driver_behavior_per_device(struct hl_device *hdev)
304 {
305 	hdev->nic_ports_mask = 0;
306 	hdev->fw_components = FW_TYPE_ALL_TYPES;
307 	hdev->cpu_queues_enable = 1;
308 	hdev->pldm = 0;
309 	hdev->hard_reset_on_fw_events = 1;
310 	hdev->bmc_enable = 1;
311 	hdev->reset_on_preboot_fail = 1;
312 	hdev->heartbeat = 1;
313 }
314 
315 static void copy_kernel_module_params_to_device(struct hl_device *hdev)
316 {
317 	hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type);
318 
319 	hdev->major = hl_major;
320 	hdev->hclass = hl_class;
321 	hdev->memory_scrub = memory_scrub;
322 	hdev->reset_on_lockup = reset_on_lockup;
323 	hdev->boot_error_status_mask = boot_error_status_mask;
324 }
325 
326 static void fixup_device_params_per_asic(struct hl_device *hdev, int timeout)
327 {
328 	switch (hdev->asic_type) {
329 	case ASIC_GAUDI:
330 	case ASIC_GAUDI_SEC:
331 		/* If user didn't request a different timeout than the default one, we have
332 		 * a different default timeout for Gaudi
333 		 */
334 		if (timeout == HL_DEFAULT_TIMEOUT_LOCKED)
335 			hdev->timeout_jiffies = msecs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED *
336 										MSEC_PER_SEC);
337 
338 		hdev->reset_upon_device_release = 0;
339 		break;
340 
341 	case ASIC_GOYA:
342 		hdev->reset_upon_device_release = 0;
343 		break;
344 
345 	default:
346 		hdev->reset_upon_device_release = 1;
347 		break;
348 	}
349 }
350 
351 static int fixup_device_params(struct hl_device *hdev)
352 {
353 	int tmp_timeout;
354 
355 	tmp_timeout = timeout_locked;
356 
357 	hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
358 	hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
359 
360 	if (tmp_timeout)
361 		hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * MSEC_PER_SEC);
362 	else
363 		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
364 
365 	hdev->stop_on_err = true;
366 	hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
367 	hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
368 
369 	/* Enable only after the initialization of the device */
370 	hdev->disabled = true;
371 
372 	if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU) &&
373 			(hdev->fw_components & ~FW_TYPE_PREBOOT_CPU)) {
374 		pr_err("Preboot must be set along with other components");
375 		return -EINVAL;
376 	}
377 
378 	/* If CPU queues not enabled, no way to do heartbeat */
379 	if (!hdev->cpu_queues_enable)
380 		hdev->heartbeat = 0;
381 	fixup_device_params_per_asic(hdev, tmp_timeout);
382 
383 	return 0;
384 }
385 
386 /**
387  * create_hdev - create habanalabs device instance
388  *
389  * @dev: will hold the pointer to the new habanalabs device structure
390  * @pdev: pointer to the pci device
391  *
392  * Allocate memory for habanalabs device and initialize basic fields
393  * Identify the ASIC type
394  * Allocate ID (minor) for the device (only for real devices)
395  */
396 static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
397 {
398 	int main_id, ctrl_id = 0, rc = 0;
399 	struct hl_device *hdev;
400 
401 	*dev = NULL;
402 
403 	hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
404 	if (!hdev)
405 		return -ENOMEM;
406 
407 	/* Will be NULL in case of simulator device */
408 	hdev->pdev = pdev;
409 
410 	/* Assign status description string */
411 	strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX);
412 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX);
413 	strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX);
414 	strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX);
415 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
416 					"in device creation", HL_STR_MAX);
417 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE],
418 					"in reset after device release", HL_STR_MAX);
419 
420 
421 	/* First, we must find out which ASIC are we handling. This is needed
422 	 * to configure the behavior of the driver (kernel parameters)
423 	 */
424 	hdev->asic_type = get_asic_type(hdev);
425 	if (hdev->asic_type == ASIC_INVALID) {
426 		dev_err(&pdev->dev, "Unsupported ASIC\n");
427 		rc = -ENODEV;
428 		goto free_hdev;
429 	}
430 
431 	copy_kernel_module_params_to_device(hdev);
432 
433 	set_driver_behavior_per_device(hdev);
434 
435 	fixup_device_params(hdev);
436 
437 	mutex_lock(&hl_devs_idr_lock);
438 
439 	/* Always save 2 numbers, 1 for main device and 1 for control.
440 	 * They must be consecutive
441 	 */
442 	main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL);
443 
444 	if (main_id >= 0)
445 		ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
446 					main_id + 2, GFP_KERNEL);
447 
448 	mutex_unlock(&hl_devs_idr_lock);
449 
450 	if ((main_id < 0) || (ctrl_id < 0)) {
451 		if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
452 			pr_err("too many devices in the system\n");
453 
454 		if (main_id >= 0) {
455 			mutex_lock(&hl_devs_idr_lock);
456 			idr_remove(&hl_devs_idr, main_id);
457 			mutex_unlock(&hl_devs_idr_lock);
458 		}
459 
460 		rc = -EBUSY;
461 		goto free_hdev;
462 	}
463 
464 	hdev->id = main_id;
465 	hdev->id_control = ctrl_id;
466 
467 	*dev = hdev;
468 
469 	return 0;
470 
471 free_hdev:
472 	kfree(hdev);
473 	return rc;
474 }
475 
476 /*
477  * destroy_hdev - destroy habanalabs device instance
478  *
479  * @dev: pointer to the habanalabs device structure
480  *
481  */
482 static void destroy_hdev(struct hl_device *hdev)
483 {
484 	/* Remove device from the device list */
485 	mutex_lock(&hl_devs_idr_lock);
486 	idr_remove(&hl_devs_idr, hdev->id);
487 	idr_remove(&hl_devs_idr, hdev->id_control);
488 	mutex_unlock(&hl_devs_idr_lock);
489 
490 	kfree(hdev);
491 }
492 
/*
 * Power-management suspend callback: suspend the habanalabs device stored as
 * the driver data of @dev. Return 0 if there is no such device, otherwise the
 * result of hl_device_suspend().
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to suspend PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_suspend(hdev);

	pr_err("device pointer is NULL in suspend\n");
	return 0;
}
506 
/*
 * Power-management resume callback: resume the habanalabs device stored as
 * the driver data of @dev. Return 0 if there is no such device, otherwise the
 * result of hl_device_resume().
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to resume PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_resume(hdev);

	pr_err("device pointer is NULL in resume\n");
	return 0;
}
520 
521 /**
522  * hl_pci_probe - probe PCI habanalabs devices
523  *
524  * @pdev: pointer to pci device
525  * @id: pointer to pci device id structure
526  *
527  * Standard PCI probe function for habanalabs device.
528  * Create a new habanalabs device and initialize it according to the
529  * device's type
530  */
531 static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
532 {
533 	struct hl_device *hdev;
534 	int rc;
535 
536 	dev_info(&pdev->dev, HL_NAME
537 		 " device found [%04x:%04x] (rev %x)\n",
538 		 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
539 
540 	rc = create_hdev(&hdev, pdev);
541 	if (rc)
542 		return rc;
543 
544 	pci_set_drvdata(pdev, hdev);
545 
546 	rc = hl_device_init(hdev);
547 	if (rc) {
548 		dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
549 		rc = -ENODEV;
550 		goto disable_device;
551 	}
552 
553 	return 0;
554 
555 disable_device:
556 	pci_set_drvdata(pdev, NULL);
557 	destroy_hdev(hdev);
558 
559 	return rc;
560 }
561 
562 /*
563  * hl_pci_remove - remove PCI habanalabs devices
564  *
565  * @pdev: pointer to pci device
566  *
567  * Standard PCI remove function for habanalabs device
568  */
569 static void hl_pci_remove(struct pci_dev *pdev)
570 {
571 	struct hl_device *hdev;
572 
573 	hdev = pci_get_drvdata(pdev);
574 	if (!hdev)
575 		return;
576 
577 	hl_device_fini(hdev);
578 	pci_set_drvdata(pdev, NULL);
579 	destroy_hdev(hdev);
580 }
581 
582 /**
583  * hl_pci_err_detected - a PCI bus error detected on this device
584  *
585  * @pdev: pointer to pci device
586  * @state: PCI error type
587  *
588  * Called by the PCI subsystem whenever a non-correctable
589  * PCI bus error is detected
590  */
591 static pci_ers_result_t
592 hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
593 {
594 	struct hl_device *hdev = pci_get_drvdata(pdev);
595 	enum pci_ers_result result;
596 
597 	switch (state) {
598 	case pci_channel_io_normal:
599 		dev_warn(hdev->dev, "PCI normal state error detected\n");
600 		return PCI_ERS_RESULT_CAN_RECOVER;
601 
602 	case pci_channel_io_frozen:
603 		dev_warn(hdev->dev, "PCI frozen state error detected\n");
604 		result = PCI_ERS_RESULT_NEED_RESET;
605 		break;
606 
607 	case pci_channel_io_perm_failure:
608 		dev_warn(hdev->dev, "PCI failure state error detected\n");
609 		result = PCI_ERS_RESULT_DISCONNECT;
610 		break;
611 
612 	default:
613 		result = PCI_ERS_RESULT_NONE;
614 	}
615 
616 	hdev->asic_funcs->halt_engines(hdev, true, false);
617 
618 	return result;
619 }
620 
621 /**
622  * hl_pci_err_resume - resume after a PCI slot reset
623  *
624  * @pdev: pointer to pci device
625  *
626  */
627 static void hl_pci_err_resume(struct pci_dev *pdev)
628 {
629 	struct hl_device *hdev = pci_get_drvdata(pdev);
630 
631 	dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
632 	hl_device_resume(hdev);
633 }
634 
635 /**
636  * hl_pci_err_slot_reset - a PCI slot reset has just happened
637  *
638  * @pdev: pointer to pci device
639  *
640  * Determine if the driver can recover from the PCI slot reset
641  */
642 static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
643 {
644 	struct hl_device *hdev = pci_get_drvdata(pdev);
645 
646 	dev_warn(hdev->dev, "PCI slot reset detected\n");
647 
648 	return PCI_ERS_RESULT_RECOVERED;
649 }
650 
651 static const struct dev_pm_ops hl_pm_ops = {
652 	.suspend = hl_pmops_suspend,
653 	.resume = hl_pmops_resume,
654 };
655 
656 static const struct pci_error_handlers hl_pci_err_handler = {
657 	.error_detected = hl_pci_err_detected,
658 	.slot_reset = hl_pci_err_slot_reset,
659 	.resume = hl_pci_err_resume,
660 };
661 
662 static struct pci_driver hl_pci_driver = {
663 	.name = HL_NAME,
664 	.id_table = ids,
665 	.probe = hl_pci_probe,
666 	.remove = hl_pci_remove,
667 	.shutdown = hl_pci_remove,
668 	.driver = {
669 		.name = HL_NAME,
670 		.pm = &hl_pm_ops,
671 		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
672 	},
673 	.err_handler = &hl_pci_err_handler,
674 };
675 
676 /*
677  * hl_init - Initialize the habanalabs kernel driver
678  */
679 static int __init hl_init(void)
680 {
681 	int rc;
682 	dev_t dev;
683 
684 	pr_info("loading driver\n");
685 
686 	rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
687 	if (rc < 0) {
688 		pr_err("unable to get major\n");
689 		return rc;
690 	}
691 
692 	hl_major = MAJOR(dev);
693 
694 	hl_class = class_create(HL_NAME);
695 	if (IS_ERR(hl_class)) {
696 		pr_err("failed to allocate class\n");
697 		rc = PTR_ERR(hl_class);
698 		goto remove_major;
699 	}
700 
701 	hl_debugfs_init();
702 
703 	rc = pci_register_driver(&hl_pci_driver);
704 	if (rc) {
705 		pr_err("failed to register pci device\n");
706 		goto remove_debugfs;
707 	}
708 
709 	pr_debug("driver loaded\n");
710 
711 	return 0;
712 
713 remove_debugfs:
714 	hl_debugfs_fini();
715 	class_destroy(hl_class);
716 remove_major:
717 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
718 	return rc;
719 }
720 
721 /*
722  * hl_exit - Release all resources of the habanalabs kernel driver
723  */
724 static void __exit hl_exit(void)
725 {
726 	pci_unregister_driver(&hl_pci_driver);
727 
728 	/*
729 	 * Removing debugfs must be after all devices or simulator devices
730 	 * have been removed because otherwise we get a bug in the
731 	 * debugfs module for referencing NULL objects
732 	 */
733 	hl_debugfs_fini();
734 
735 	class_destroy(hl_class);
736 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
737 
738 	idr_destroy(&hl_devs_idr);
739 
740 	pr_debug("driver removed\n");
741 }
742 
743 module_init(hl_init);
744 module_exit(hl_exit);
745