xref: /openbmc/linux/drivers/accel/habanalabs/common/habanalabs_drv.c (revision e6b9d8eddb1772d99a676a906d42865293934edd)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * Copyright 2016-2021 HabanaLabs, Ltd.
5  * All Rights Reserved.
6  *
7  */
8 
9 #define pr_fmt(fmt)		"habanalabs: " fmt
10 
11 #include "habanalabs.h"
12 #include "../include/hw_ip/pci/pci_general.h"
13 
14 #include <linux/pci.h>
15 #include <linux/module.h>
16 
17 #define CREATE_TRACE_POINTS
18 #include <trace/events/habanalabs.h>
19 
20 #define HL_DRIVER_AUTHOR	"HabanaLabs Kernel Driver Team"
21 
22 #define HL_DRIVER_DESC		"Driver for HabanaLabs's AI Accelerators"
23 
24 MODULE_AUTHOR(HL_DRIVER_AUTHOR);
25 MODULE_DESCRIPTION(HL_DRIVER_DESC);
26 MODULE_LICENSE("GPL v2");
27 
/* Char device major number; assigned in hl_init() from alloc_chrdev_region() */
static int hl_major;
/* Device class created in hl_init() and handed to every hl_device */
static struct class *hl_class;
/* Minor number -> hl_device lookup. create_hdev() registers two consecutive
 * ids (main and control) that both point at the same hl_device.
 */
static DEFINE_IDR(hl_devs_idr);
/* Serializes every access to hl_devs_idr */
static DEFINE_MUTEX(hl_devs_idr_lock);

#define HL_DEFAULT_TIMEOUT_LOCKED	30	/* 30 seconds */
#define GAUDI_DEFAULT_TIMEOUT_LOCKED	600	/* 10 minutes */

/* Backing storage for the module parameters declared below */
static int timeout_locked = HL_DEFAULT_TIMEOUT_LOCKED;
static int reset_on_lockup = 1;
static int memory_scrub;
static ulong boot_error_status_mask = ULONG_MAX;

/* All parameters are registered with mode 0444 (read-only permissions) */
module_param(timeout_locked, int, 0444);
MODULE_PARM_DESC(timeout_locked,
	"Device lockup timeout in seconds (0 = disabled, default 30s)");

module_param(reset_on_lockup, int, 0444);
MODULE_PARM_DESC(reset_on_lockup,
	"Do device reset on lockup (0 = no, 1 = yes, default yes)");

module_param(memory_scrub, int, 0444);
MODULE_PARM_DESC(memory_scrub,
	"Scrub device memory in various states (0 = no, 1 = yes, default no)");

module_param(boot_error_status_mask, ulong, 0444);
MODULE_PARM_DESC(boot_error_status_mask,
	"Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
56 
/* PCI vendor id shared by all devices this driver binds to */
#define PCI_VENDOR_ID_HABANALABS	0x1da3

/* PCI device ids; get_asic_type() translates them to enum hl_asic_type */
#define PCI_IDS_GOYA			0x0001
#define PCI_IDS_GAUDI			0x1000
#define PCI_IDS_GAUDI_SEC		0x1010

/* Gaudi2 variants share one device id and are told apart by revision id */
#define PCI_IDS_GAUDI2			0x1020

/* Match table for the PCI core; terminated by an all-zero entry */
static const struct pci_device_id ids[] = {
	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), },
	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI2), },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, ids);
73 
74 /*
75  * get_asic_type - translate device id to asic type
76  *
77  * @hdev: pointer to habanalabs device structure.
78  *
79  * Translate device id and revision id to asic type.
80  * In case of unidentified device, return -1
81  */
82 static enum hl_asic_type get_asic_type(struct hl_device *hdev)
83 {
84 	struct pci_dev *pdev = hdev->pdev;
85 	enum hl_asic_type asic_type = ASIC_INVALID;
86 
87 	switch (pdev->device) {
88 	case PCI_IDS_GOYA:
89 		asic_type = ASIC_GOYA;
90 		break;
91 	case PCI_IDS_GAUDI:
92 		asic_type = ASIC_GAUDI;
93 		break;
94 	case PCI_IDS_GAUDI_SEC:
95 		asic_type = ASIC_GAUDI_SEC;
96 		break;
97 	case PCI_IDS_GAUDI2:
98 		switch (pdev->revision) {
99 		case REV_ID_A:
100 			asic_type = ASIC_GAUDI2;
101 			break;
102 		case REV_ID_B:
103 			asic_type = ASIC_GAUDI2B;
104 			break;
105 		default:
106 			break;
107 		}
108 		break;
109 	default:
110 		break;
111 	}
112 
113 	return asic_type;
114 }
115 
116 static bool is_asic_secured(enum hl_asic_type asic_type)
117 {
118 	switch (asic_type) {
119 	case ASIC_GAUDI_SEC:
120 		return true;
121 	default:
122 		return false;
123 	}
124 }
125 
126 /*
127  * hl_device_open - open function for habanalabs device
128  *
129  * @inode: pointer to inode structure
130  * @filp: pointer to file structure
131  *
132  * Called when process opens an habanalabs device.
133  */
134 int hl_device_open(struct inode *inode, struct file *filp)
135 {
136 	enum hl_device_status status;
137 	struct hl_device *hdev;
138 	struct hl_fpriv *hpriv;
139 	int rc;
140 
141 	mutex_lock(&hl_devs_idr_lock);
142 	hdev = idr_find(&hl_devs_idr, iminor(inode));
143 	mutex_unlock(&hl_devs_idr_lock);
144 
145 	if (!hdev) {
146 		pr_err("Couldn't find device %d:%d\n",
147 			imajor(inode), iminor(inode));
148 		return -ENXIO;
149 	}
150 
151 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
152 	if (!hpriv)
153 		return -ENOMEM;
154 
155 	hpriv->hdev = hdev;
156 	filp->private_data = hpriv;
157 	hpriv->filp = filp;
158 
159 	mutex_init(&hpriv->notifier_event.lock);
160 	mutex_init(&hpriv->restore_phase_mutex);
161 	mutex_init(&hpriv->ctx_lock);
162 	kref_init(&hpriv->refcount);
163 	nonseekable_open(inode, filp);
164 
165 	hl_ctx_mgr_init(&hpriv->ctx_mgr);
166 	hl_mem_mgr_init(hpriv->hdev->dev, &hpriv->mem_mgr);
167 
168 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
169 
170 	mutex_lock(&hdev->fpriv_list_lock);
171 
172 	if (!hl_device_operational(hdev, &status)) {
173 		dev_dbg_ratelimited(hdev->dev,
174 			"Can't open %s because it is %s\n",
175 			dev_name(hdev->dev), hdev->status[status]);
176 
177 		if (status == HL_DEVICE_STATUS_IN_RESET ||
178 					status == HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE)
179 			rc = -EAGAIN;
180 		else
181 			rc = -EPERM;
182 
183 		goto out_err;
184 	}
185 
186 	if (hdev->is_in_dram_scrub) {
187 		dev_dbg_ratelimited(hdev->dev,
188 			"Can't open %s during dram scrub\n",
189 			dev_name(hdev->dev));
190 		rc = -EAGAIN;
191 		goto out_err;
192 	}
193 
194 	if (hdev->compute_ctx_in_release) {
195 		dev_dbg_ratelimited(hdev->dev,
196 			"Can't open %s because another user is still releasing it\n",
197 			dev_name(hdev->dev));
198 		rc = -EAGAIN;
199 		goto out_err;
200 	}
201 
202 	if (hdev->is_compute_ctx_active) {
203 		dev_dbg_ratelimited(hdev->dev,
204 			"Can't open %s because another user is working on it\n",
205 			dev_name(hdev->dev));
206 		rc = -EBUSY;
207 		goto out_err;
208 	}
209 
210 	rc = hl_ctx_create(hdev, hpriv);
211 	if (rc) {
212 		dev_err(hdev->dev, "Failed to create context %d\n", rc);
213 		goto out_err;
214 	}
215 
216 	list_add(&hpriv->dev_node, &hdev->fpriv_list);
217 	mutex_unlock(&hdev->fpriv_list_lock);
218 
219 	hdev->asic_funcs->send_device_activity(hdev, true);
220 
221 	hl_debugfs_add_file(hpriv);
222 
223 	memset(&hdev->captured_err_info, 0, sizeof(hdev->captured_err_info));
224 	atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1);
225 	hdev->captured_err_info.undef_opcode.write_enable = true;
226 
227 	hdev->open_counter++;
228 	hdev->last_successful_open_jif = jiffies;
229 	hdev->last_successful_open_ktime = ktime_get();
230 
231 	return 0;
232 
233 out_err:
234 	mutex_unlock(&hdev->fpriv_list_lock);
235 	hl_mem_mgr_fini(&hpriv->mem_mgr);
236 	hl_mem_mgr_idr_destroy(&hpriv->mem_mgr);
237 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
238 	filp->private_data = NULL;
239 	mutex_destroy(&hpriv->ctx_lock);
240 	mutex_destroy(&hpriv->restore_phase_mutex);
241 	mutex_destroy(&hpriv->notifier_event.lock);
242 	put_pid(hpriv->taskpid);
243 
244 	kfree(hpriv);
245 
246 	return rc;
247 }
248 
249 int hl_device_open_ctrl(struct inode *inode, struct file *filp)
250 {
251 	struct hl_device *hdev;
252 	struct hl_fpriv *hpriv;
253 	int rc;
254 
255 	mutex_lock(&hl_devs_idr_lock);
256 	hdev = idr_find(&hl_devs_idr, iminor(inode));
257 	mutex_unlock(&hl_devs_idr_lock);
258 
259 	if (!hdev) {
260 		pr_err("Couldn't find device %d:%d\n",
261 			imajor(inode), iminor(inode));
262 		return -ENXIO;
263 	}
264 
265 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
266 	if (!hpriv)
267 		return -ENOMEM;
268 
269 	/* Prevent other routines from reading partial hpriv data by
270 	 * initializing hpriv fields before inserting it to the list
271 	 */
272 	hpriv->hdev = hdev;
273 	filp->private_data = hpriv;
274 	hpriv->filp = filp;
275 
276 	mutex_init(&hpriv->notifier_event.lock);
277 	nonseekable_open(inode, filp);
278 
279 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
280 
281 	mutex_lock(&hdev->fpriv_ctrl_list_lock);
282 
283 	if (!hl_ctrl_device_operational(hdev, NULL)) {
284 		dev_dbg_ratelimited(hdev->dev_ctrl,
285 			"Can't open %s because it is disabled\n",
286 			dev_name(hdev->dev_ctrl));
287 		rc = -EPERM;
288 		goto out_err;
289 	}
290 
291 	list_add(&hpriv->dev_node, &hdev->fpriv_ctrl_list);
292 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
293 
294 	return 0;
295 
296 out_err:
297 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
298 	filp->private_data = NULL;
299 	put_pid(hpriv->taskpid);
300 
301 	kfree(hpriv);
302 
303 	return rc;
304 }
305 
306 static void set_driver_behavior_per_device(struct hl_device *hdev)
307 {
308 	hdev->nic_ports_mask = 0;
309 	hdev->fw_components = FW_TYPE_ALL_TYPES;
310 	hdev->mmu_enable = MMU_EN_ALL;
311 	hdev->cpu_queues_enable = 1;
312 	hdev->pldm = 0;
313 	hdev->hard_reset_on_fw_events = 1;
314 	hdev->bmc_enable = 1;
315 	hdev->reset_on_preboot_fail = 1;
316 	hdev->heartbeat = 1;
317 }
318 
319 static void copy_kernel_module_params_to_device(struct hl_device *hdev)
320 {
321 	hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type);
322 
323 	hdev->major = hl_major;
324 	hdev->hclass = hl_class;
325 	hdev->memory_scrub = memory_scrub;
326 	hdev->reset_on_lockup = reset_on_lockup;
327 	hdev->boot_error_status_mask = boot_error_status_mask;
328 }
329 
330 static void fixup_device_params_per_asic(struct hl_device *hdev, int timeout)
331 {
332 	switch (hdev->asic_type) {
333 	case ASIC_GAUDI:
334 	case ASIC_GAUDI_SEC:
335 		/* If user didn't request a different timeout than the default one, we have
336 		 * a different default timeout for Gaudi
337 		 */
338 		if (timeout == HL_DEFAULT_TIMEOUT_LOCKED)
339 			hdev->timeout_jiffies = msecs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED *
340 										MSEC_PER_SEC);
341 
342 		hdev->reset_upon_device_release = 0;
343 		break;
344 
345 	case ASIC_GOYA:
346 		hdev->reset_upon_device_release = 0;
347 		break;
348 
349 	default:
350 		hdev->reset_upon_device_release = 1;
351 		break;
352 	}
353 }
354 
355 static int fixup_device_params(struct hl_device *hdev)
356 {
357 	int tmp_timeout;
358 
359 	tmp_timeout = timeout_locked;
360 
361 	hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
362 	hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
363 
364 	if (tmp_timeout)
365 		hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * MSEC_PER_SEC);
366 	else
367 		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
368 
369 	hdev->stop_on_err = true;
370 	hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
371 	hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
372 
373 	/* Enable only after the initialization of the device */
374 	hdev->disabled = true;
375 
376 	if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU) &&
377 			(hdev->fw_components & ~FW_TYPE_PREBOOT_CPU)) {
378 		pr_err("Preboot must be set along with other components");
379 		return -EINVAL;
380 	}
381 
382 	/* If CPU queues not enabled, no way to do heartbeat */
383 	if (!hdev->cpu_queues_enable)
384 		hdev->heartbeat = 0;
385 
386 	fixup_device_params_per_asic(hdev, tmp_timeout);
387 
388 	return 0;
389 }
390 
391 /**
392  * create_hdev - create habanalabs device instance
393  *
394  * @dev: will hold the pointer to the new habanalabs device structure
395  * @pdev: pointer to the pci device
396  *
397  * Allocate memory for habanalabs device and initialize basic fields
398  * Identify the ASIC type
399  * Allocate ID (minor) for the device (only for real devices)
400  */
401 static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
402 {
403 	int main_id, ctrl_id = 0, rc = 0;
404 	struct hl_device *hdev;
405 
406 	*dev = NULL;
407 
408 	hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
409 	if (!hdev)
410 		return -ENOMEM;
411 
412 	/* Will be NULL in case of simulator device */
413 	hdev->pdev = pdev;
414 
415 	/* Assign status description string */
416 	strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX);
417 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX);
418 	strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX);
419 	strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX);
420 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
421 					"in device creation", HL_STR_MAX);
422 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE],
423 					"in reset after device release", HL_STR_MAX);
424 
425 
426 	/* First, we must find out which ASIC are we handling. This is needed
427 	 * to configure the behavior of the driver (kernel parameters)
428 	 */
429 	hdev->asic_type = get_asic_type(hdev);
430 	if (hdev->asic_type == ASIC_INVALID) {
431 		dev_err(&pdev->dev, "Unsupported ASIC\n");
432 		rc = -ENODEV;
433 		goto free_hdev;
434 	}
435 
436 	copy_kernel_module_params_to_device(hdev);
437 
438 	set_driver_behavior_per_device(hdev);
439 
440 	fixup_device_params(hdev);
441 
442 	mutex_lock(&hl_devs_idr_lock);
443 
444 	/* Always save 2 numbers, 1 for main device and 1 for control.
445 	 * They must be consecutive
446 	 */
447 	main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL);
448 
449 	if (main_id >= 0)
450 		ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
451 					main_id + 2, GFP_KERNEL);
452 
453 	mutex_unlock(&hl_devs_idr_lock);
454 
455 	if ((main_id < 0) || (ctrl_id < 0)) {
456 		if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
457 			pr_err("too many devices in the system\n");
458 
459 		if (main_id >= 0) {
460 			mutex_lock(&hl_devs_idr_lock);
461 			idr_remove(&hl_devs_idr, main_id);
462 			mutex_unlock(&hl_devs_idr_lock);
463 		}
464 
465 		rc = -EBUSY;
466 		goto free_hdev;
467 	}
468 
469 	hdev->id = main_id;
470 	hdev->id_control = ctrl_id;
471 
472 	*dev = hdev;
473 
474 	return 0;
475 
476 free_hdev:
477 	kfree(hdev);
478 	return rc;
479 }
480 
481 /*
482  * destroy_hdev - destroy habanalabs device instance
483  *
484  * @dev: pointer to the habanalabs device structure
485  *
486  */
487 static void destroy_hdev(struct hl_device *hdev)
488 {
489 	/* Remove device from the device list */
490 	mutex_lock(&hl_devs_idr_lock);
491 	idr_remove(&hl_devs_idr, hdev->id);
492 	idr_remove(&hl_devs_idr, hdev->id_control);
493 	mutex_unlock(&hl_devs_idr_lock);
494 
495 	kfree(hdev);
496 }
497 
/*
 * hl_pmops_suspend - power-management suspend callback
 *
 * @dev: generic device whose driver data is the habanalabs device
 *
 * Return: the result of hl_device_suspend(), or 0 when no habanalabs device
 * is attached to @dev.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev;

	hdev = dev_get_drvdata(dev);

	pr_debug("Going to suspend PCI device\n");

	if (hdev)
		return hl_device_suspend(hdev);

	/* Nothing to suspend; report it but do not fail the transition */
	pr_err("device pointer is NULL in suspend\n");
	return 0;
}
511 
/*
 * hl_pmops_resume - power-management resume callback
 *
 * @dev: generic device whose driver data is the habanalabs device
 *
 * Return: the result of hl_device_resume(), or 0 when no habanalabs device
 * is attached to @dev.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev;

	hdev = dev_get_drvdata(dev);

	pr_debug("Going to resume PCI device\n");

	if (hdev)
		return hl_device_resume(hdev);

	/* Nothing to resume; report it but do not fail the transition */
	pr_err("device pointer is NULL in resume\n");
	return 0;
}
525 
526 /**
527  * hl_pci_probe - probe PCI habanalabs devices
528  *
529  * @pdev: pointer to pci device
530  * @id: pointer to pci device id structure
531  *
532  * Standard PCI probe function for habanalabs device.
533  * Create a new habanalabs device and initialize it according to the
534  * device's type
535  */
536 static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
537 {
538 	struct hl_device *hdev;
539 	int rc;
540 
541 	dev_info(&pdev->dev, HL_NAME
542 		 " device found [%04x:%04x] (rev %x)\n",
543 		 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
544 
545 	rc = create_hdev(&hdev, pdev);
546 	if (rc)
547 		return rc;
548 
549 	pci_set_drvdata(pdev, hdev);
550 
551 	rc = hl_device_init(hdev);
552 	if (rc) {
553 		dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
554 		rc = -ENODEV;
555 		goto disable_device;
556 	}
557 
558 	return 0;
559 
560 disable_device:
561 	pci_set_drvdata(pdev, NULL);
562 	destroy_hdev(hdev);
563 
564 	return rc;
565 }
566 
567 /*
568  * hl_pci_remove - remove PCI habanalabs devices
569  *
570  * @pdev: pointer to pci device
571  *
572  * Standard PCI remove function for habanalabs device
573  */
574 static void hl_pci_remove(struct pci_dev *pdev)
575 {
576 	struct hl_device *hdev;
577 
578 	hdev = pci_get_drvdata(pdev);
579 	if (!hdev)
580 		return;
581 
582 	hl_device_fini(hdev);
583 	pci_set_drvdata(pdev, NULL);
584 	destroy_hdev(hdev);
585 }
586 
587 /**
588  * hl_pci_err_detected - a PCI bus error detected on this device
589  *
590  * @pdev: pointer to pci device
591  * @state: PCI error type
592  *
593  * Called by the PCI subsystem whenever a non-correctable
594  * PCI bus error is detected
595  */
596 static pci_ers_result_t
597 hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
598 {
599 	struct hl_device *hdev = pci_get_drvdata(pdev);
600 	enum pci_ers_result result;
601 
602 	switch (state) {
603 	case pci_channel_io_normal:
604 		dev_warn(hdev->dev, "PCI normal state error detected\n");
605 		return PCI_ERS_RESULT_CAN_RECOVER;
606 
607 	case pci_channel_io_frozen:
608 		dev_warn(hdev->dev, "PCI frozen state error detected\n");
609 		result = PCI_ERS_RESULT_NEED_RESET;
610 		break;
611 
612 	case pci_channel_io_perm_failure:
613 		dev_warn(hdev->dev, "PCI failure state error detected\n");
614 		result = PCI_ERS_RESULT_DISCONNECT;
615 		break;
616 
617 	default:
618 		result = PCI_ERS_RESULT_NONE;
619 	}
620 
621 	hdev->asic_funcs->halt_engines(hdev, true, false);
622 
623 	return result;
624 }
625 
626 /**
627  * hl_pci_err_resume - resume after a PCI slot reset
628  *
629  * @pdev: pointer to pci device
630  *
631  */
632 static void hl_pci_err_resume(struct pci_dev *pdev)
633 {
634 	struct hl_device *hdev = pci_get_drvdata(pdev);
635 
636 	dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
637 	hl_device_resume(hdev);
638 }
639 
640 /**
641  * hl_pci_err_slot_reset - a PCI slot reset has just happened
642  *
643  * @pdev: pointer to pci device
644  *
645  * Determine if the driver can recover from the PCI slot reset
646  */
647 static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
648 {
649 	struct hl_device *hdev = pci_get_drvdata(pdev);
650 
651 	dev_warn(hdev->dev, "PCI slot reset detected\n");
652 
653 	return PCI_ERS_RESULT_RECOVERED;
654 }
655 
/* Power-management callbacks; referenced from hl_pci_driver.driver.pm */
static const struct dev_pm_ops hl_pm_ops = {
	.suspend = hl_pmops_suspend,
	.resume = hl_pmops_resume,
};
660 
/* PCI error-recovery callbacks; referenced from hl_pci_driver.err_handler */
static const struct pci_error_handlers hl_pci_err_handler = {
	.error_detected = hl_pci_err_detected,
	.slot_reset = hl_pci_err_slot_reset,
	.resume = hl_pci_err_resume,
};
666 
/* PCI driver descriptor; registered in hl_init(), unregistered in hl_exit() */
static struct pci_driver hl_pci_driver = {
	.name = HL_NAME,
	.id_table = ids,
	.probe = hl_pci_probe,
	.remove = hl_pci_remove,
	/* Shutdown goes through the same teardown path as remove */
	.shutdown = hl_pci_remove,
	.driver = {
		.name = HL_NAME,
		.pm = &hl_pm_ops,
		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
	},
	.err_handler = &hl_pci_err_handler,
};
680 
681 /*
682  * hl_init - Initialize the habanalabs kernel driver
683  */
684 static int __init hl_init(void)
685 {
686 	int rc;
687 	dev_t dev;
688 
689 	pr_info("loading driver\n");
690 
691 	rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
692 	if (rc < 0) {
693 		pr_err("unable to get major\n");
694 		return rc;
695 	}
696 
697 	hl_major = MAJOR(dev);
698 
699 	hl_class = class_create(HL_NAME);
700 	if (IS_ERR(hl_class)) {
701 		pr_err("failed to allocate class\n");
702 		rc = PTR_ERR(hl_class);
703 		goto remove_major;
704 	}
705 
706 	hl_debugfs_init();
707 
708 	rc = pci_register_driver(&hl_pci_driver);
709 	if (rc) {
710 		pr_err("failed to register pci device\n");
711 		goto remove_debugfs;
712 	}
713 
714 	pr_debug("driver loaded\n");
715 
716 	return 0;
717 
718 remove_debugfs:
719 	hl_debugfs_fini();
720 	class_destroy(hl_class);
721 remove_major:
722 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
723 	return rc;
724 }
725 
726 /*
727  * hl_exit - Release all resources of the habanalabs kernel driver
728  */
729 static void __exit hl_exit(void)
730 {
731 	pci_unregister_driver(&hl_pci_driver);
732 
733 	/*
734 	 * Removing debugfs must be after all devices or simulator devices
735 	 * have been removed because otherwise we get a bug in the
736 	 * debugfs module for referencing NULL objects
737 	 */
738 	hl_debugfs_fini();
739 
740 	class_destroy(hl_class);
741 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
742 
743 	idr_destroy(&hl_devs_idr);
744 
745 	pr_debug("driver removed\n");
746 }
747 
/* Module entry and exit points */
module_init(hl_init);
module_exit(hl_exit);
750