xref: /openbmc/linux/drivers/accel/habanalabs/common/habanalabs_drv.c (revision 25ebbc57ca56df3cf9149e9da6b1d3169c8487db)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * Copyright 2016-2021 HabanaLabs, Ltd.
5  * All Rights Reserved.
6  *
7  */
8 
9 #define pr_fmt(fmt)		"habanalabs: " fmt
10 
11 #include "habanalabs.h"
12 #include "../include/hw_ip/pci/pci_general.h"
13 
14 #include <linux/pci.h>
15 #include <linux/aer.h>
16 #include <linux/module.h>
17 
18 #define CREATE_TRACE_POINTS
19 #include <trace/events/habanalabs.h>
20 
21 #define HL_DRIVER_AUTHOR	"HabanaLabs Kernel Driver Team"
22 
23 #define HL_DRIVER_DESC		"Driver for HabanaLabs's AI Accelerators"
24 
25 MODULE_AUTHOR(HL_DRIVER_AUTHOR);
26 MODULE_DESCRIPTION(HL_DRIVER_DESC);
27 MODULE_LICENSE("GPL v2");
28 
29 static int hl_major;
30 static struct class *hl_class;
31 static DEFINE_IDR(hl_devs_idr);
32 static DEFINE_MUTEX(hl_devs_idr_lock);
33 
34 #define HL_DEFAULT_TIMEOUT_LOCKED	30	/* 30 seconds */
35 #define GAUDI_DEFAULT_TIMEOUT_LOCKED	600	/* 10 minutes */
36 
37 static int timeout_locked = HL_DEFAULT_TIMEOUT_LOCKED;
38 static int reset_on_lockup = 1;
39 static int memory_scrub;
40 static ulong boot_error_status_mask = ULONG_MAX;
41 
42 module_param(timeout_locked, int, 0444);
43 MODULE_PARM_DESC(timeout_locked,
44 	"Device lockup timeout in seconds (0 = disabled, default 30s)");
45 
46 module_param(reset_on_lockup, int, 0444);
47 MODULE_PARM_DESC(reset_on_lockup,
48 	"Do device reset on lockup (0 = no, 1 = yes, default yes)");
49 
50 module_param(memory_scrub, int, 0444);
51 MODULE_PARM_DESC(memory_scrub,
52 	"Scrub device memory in various states (0 = no, 1 = yes, default no)");
53 
54 module_param(boot_error_status_mask, ulong, 0444);
55 MODULE_PARM_DESC(boot_error_status_mask,
56 	"Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
57 
58 #define PCI_VENDOR_ID_HABANALABS	0x1da3
59 
60 #define PCI_IDS_GOYA			0x0001
61 #define PCI_IDS_GAUDI			0x1000
62 #define PCI_IDS_GAUDI_SEC		0x1010
63 
64 #define PCI_IDS_GAUDI2			0x1020
65 
66 static const struct pci_device_id ids[] = {
67 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
68 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
69 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), },
70 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI2), },
71 	{ 0, }
72 };
73 MODULE_DEVICE_TABLE(pci, ids);
74 
75 /*
76  * get_asic_type - translate device id to asic type
77  *
78  * @hdev: pointer to habanalabs device structure.
79  *
80  * Translate device id and revision id to asic type.
81  * In case of unidentified device, return -1
82  */
83 static enum hl_asic_type get_asic_type(struct hl_device *hdev)
84 {
85 	struct pci_dev *pdev = hdev->pdev;
86 	enum hl_asic_type asic_type = ASIC_INVALID;
87 
88 	switch (pdev->device) {
89 	case PCI_IDS_GOYA:
90 		asic_type = ASIC_GOYA;
91 		break;
92 	case PCI_IDS_GAUDI:
93 		asic_type = ASIC_GAUDI;
94 		break;
95 	case PCI_IDS_GAUDI_SEC:
96 		asic_type = ASIC_GAUDI_SEC;
97 		break;
98 	case PCI_IDS_GAUDI2:
99 		switch (pdev->revision) {
100 		case REV_ID_A:
101 			asic_type = ASIC_GAUDI2;
102 			break;
103 		case REV_ID_B:
104 			asic_type = ASIC_GAUDI2B;
105 			break;
106 		default:
107 			break;
108 		}
109 		break;
110 	default:
111 		break;
112 	}
113 
114 	return asic_type;
115 }
116 
117 static bool is_asic_secured(enum hl_asic_type asic_type)
118 {
119 	switch (asic_type) {
120 	case ASIC_GAUDI_SEC:
121 		return true;
122 	default:
123 		return false;
124 	}
125 }
126 
127 /*
128  * hl_device_open - open function for habanalabs device
129  *
130  * @inode: pointer to inode structure
131  * @filp: pointer to file structure
132  *
133  * Called when process opens an habanalabs device.
134  */
135 int hl_device_open(struct inode *inode, struct file *filp)
136 {
137 	enum hl_device_status status;
138 	struct hl_device *hdev;
139 	struct hl_fpriv *hpriv;
140 	int rc;
141 
142 	mutex_lock(&hl_devs_idr_lock);
143 	hdev = idr_find(&hl_devs_idr, iminor(inode));
144 	mutex_unlock(&hl_devs_idr_lock);
145 
146 	if (!hdev) {
147 		pr_err("Couldn't find device %d:%d\n",
148 			imajor(inode), iminor(inode));
149 		return -ENXIO;
150 	}
151 
152 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
153 	if (!hpriv)
154 		return -ENOMEM;
155 
156 	hpriv->hdev = hdev;
157 	filp->private_data = hpriv;
158 	hpriv->filp = filp;
159 
160 	mutex_init(&hpriv->notifier_event.lock);
161 	mutex_init(&hpriv->restore_phase_mutex);
162 	mutex_init(&hpriv->ctx_lock);
163 	kref_init(&hpriv->refcount);
164 	nonseekable_open(inode, filp);
165 
166 	hl_ctx_mgr_init(&hpriv->ctx_mgr);
167 	hl_mem_mgr_init(hpriv->hdev->dev, &hpriv->mem_mgr);
168 
169 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
170 
171 	mutex_lock(&hdev->fpriv_list_lock);
172 
173 	if (!hl_device_operational(hdev, &status)) {
174 		dev_dbg_ratelimited(hdev->dev,
175 			"Can't open %s because it is %s\n",
176 			dev_name(hdev->dev), hdev->status[status]);
177 
178 		if (status == HL_DEVICE_STATUS_IN_RESET ||
179 					status == HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE)
180 			rc = -EAGAIN;
181 		else
182 			rc = -EPERM;
183 
184 		goto out_err;
185 	}
186 
187 	if (hdev->is_in_dram_scrub) {
188 		dev_dbg_ratelimited(hdev->dev,
189 			"Can't open %s during dram scrub\n",
190 			dev_name(hdev->dev));
191 		rc = -EAGAIN;
192 		goto out_err;
193 	}
194 
195 	if (hdev->compute_ctx_in_release) {
196 		dev_dbg_ratelimited(hdev->dev,
197 			"Can't open %s because another user is still releasing it\n",
198 			dev_name(hdev->dev));
199 		rc = -EAGAIN;
200 		goto out_err;
201 	}
202 
203 	if (hdev->is_compute_ctx_active) {
204 		dev_dbg_ratelimited(hdev->dev,
205 			"Can't open %s because another user is working on it\n",
206 			dev_name(hdev->dev));
207 		rc = -EBUSY;
208 		goto out_err;
209 	}
210 
211 	rc = hl_ctx_create(hdev, hpriv);
212 	if (rc) {
213 		dev_err(hdev->dev, "Failed to create context %d\n", rc);
214 		goto out_err;
215 	}
216 
217 	list_add(&hpriv->dev_node, &hdev->fpriv_list);
218 	mutex_unlock(&hdev->fpriv_list_lock);
219 
220 	hdev->asic_funcs->send_device_activity(hdev, true);
221 
222 	hl_debugfs_add_file(hpriv);
223 
224 	memset(&hdev->captured_err_info, 0, sizeof(hdev->captured_err_info));
225 	atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1);
226 	hdev->captured_err_info.undef_opcode.write_enable = true;
227 
228 	hdev->open_counter++;
229 	hdev->last_successful_open_jif = jiffies;
230 	hdev->last_successful_open_ktime = ktime_get();
231 
232 	return 0;
233 
234 out_err:
235 	mutex_unlock(&hdev->fpriv_list_lock);
236 	hl_mem_mgr_fini(&hpriv->mem_mgr);
237 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
238 	filp->private_data = NULL;
239 	mutex_destroy(&hpriv->ctx_lock);
240 	mutex_destroy(&hpriv->restore_phase_mutex);
241 	mutex_destroy(&hpriv->notifier_event.lock);
242 	put_pid(hpriv->taskpid);
243 
244 	kfree(hpriv);
245 
246 	return rc;
247 }
248 
249 int hl_device_open_ctrl(struct inode *inode, struct file *filp)
250 {
251 	struct hl_device *hdev;
252 	struct hl_fpriv *hpriv;
253 	int rc;
254 
255 	mutex_lock(&hl_devs_idr_lock);
256 	hdev = idr_find(&hl_devs_idr, iminor(inode));
257 	mutex_unlock(&hl_devs_idr_lock);
258 
259 	if (!hdev) {
260 		pr_err("Couldn't find device %d:%d\n",
261 			imajor(inode), iminor(inode));
262 		return -ENXIO;
263 	}
264 
265 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
266 	if (!hpriv)
267 		return -ENOMEM;
268 
269 	/* Prevent other routines from reading partial hpriv data by
270 	 * initializing hpriv fields before inserting it to the list
271 	 */
272 	hpriv->hdev = hdev;
273 	filp->private_data = hpriv;
274 	hpriv->filp = filp;
275 
276 	mutex_init(&hpriv->notifier_event.lock);
277 	nonseekable_open(inode, filp);
278 
279 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
280 
281 	mutex_lock(&hdev->fpriv_ctrl_list_lock);
282 
283 	if (!hl_ctrl_device_operational(hdev, NULL)) {
284 		dev_dbg_ratelimited(hdev->dev_ctrl,
285 			"Can't open %s because it is disabled\n",
286 			dev_name(hdev->dev_ctrl));
287 		rc = -EPERM;
288 		goto out_err;
289 	}
290 
291 	list_add(&hpriv->dev_node, &hdev->fpriv_ctrl_list);
292 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
293 
294 	return 0;
295 
296 out_err:
297 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
298 	filp->private_data = NULL;
299 	put_pid(hpriv->taskpid);
300 
301 	kfree(hpriv);
302 
303 	return rc;
304 }
305 
306 static void set_driver_behavior_per_device(struct hl_device *hdev)
307 {
308 	hdev->nic_ports_mask = 0;
309 	hdev->fw_components = FW_TYPE_ALL_TYPES;
310 	hdev->mmu_enable = MMU_EN_ALL;
311 	hdev->cpu_queues_enable = 1;
312 	hdev->pldm = 0;
313 	hdev->hard_reset_on_fw_events = 1;
314 	hdev->bmc_enable = 1;
315 	hdev->reset_on_preboot_fail = 1;
316 	hdev->heartbeat = 1;
317 }
318 
319 static void copy_kernel_module_params_to_device(struct hl_device *hdev)
320 {
321 	hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type);
322 
323 	hdev->major = hl_major;
324 	hdev->hclass = hl_class;
325 	hdev->memory_scrub = memory_scrub;
326 	hdev->reset_on_lockup = reset_on_lockup;
327 	hdev->boot_error_status_mask = boot_error_status_mask;
328 }
329 
330 static void fixup_device_params_per_asic(struct hl_device *hdev, int timeout)
331 {
332 	switch (hdev->asic_type) {
333 	case ASIC_GAUDI:
334 	case ASIC_GAUDI_SEC:
335 		/* If user didn't request a different timeout than the default one, we have
336 		 * a different default timeout for Gaudi
337 		 */
338 		if (timeout == HL_DEFAULT_TIMEOUT_LOCKED)
339 			hdev->timeout_jiffies = msecs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED *
340 										MSEC_PER_SEC);
341 
342 		hdev->reset_upon_device_release = 0;
343 		break;
344 
345 	case ASIC_GOYA:
346 		hdev->reset_upon_device_release = 0;
347 		break;
348 
349 	default:
350 		hdev->reset_upon_device_release = 1;
351 		break;
352 	}
353 }
354 
355 static int fixup_device_params(struct hl_device *hdev)
356 {
357 	int tmp_timeout;
358 
359 	tmp_timeout = timeout_locked;
360 
361 	hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
362 	hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
363 
364 	if (tmp_timeout)
365 		hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * MSEC_PER_SEC);
366 	else
367 		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
368 
369 	hdev->stop_on_err = true;
370 	hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
371 	hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
372 
373 	/* Enable only after the initialization of the device */
374 	hdev->disabled = true;
375 
376 	if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU) &&
377 			(hdev->fw_components & ~FW_TYPE_PREBOOT_CPU)) {
378 		pr_err("Preboot must be set along with other components");
379 		return -EINVAL;
380 	}
381 
382 	/* If CPU queues not enabled, no way to do heartbeat */
383 	if (!hdev->cpu_queues_enable)
384 		hdev->heartbeat = 0;
385 
386 	fixup_device_params_per_asic(hdev, tmp_timeout);
387 
388 	return 0;
389 }
390 
391 /**
392  * create_hdev - create habanalabs device instance
393  *
394  * @dev: will hold the pointer to the new habanalabs device structure
395  * @pdev: pointer to the pci device
396  *
397  * Allocate memory for habanalabs device and initialize basic fields
398  * Identify the ASIC type
399  * Allocate ID (minor) for the device (only for real devices)
400  */
401 static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
402 {
403 	int main_id, ctrl_id = 0, rc = 0;
404 	struct hl_device *hdev;
405 
406 	*dev = NULL;
407 
408 	hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
409 	if (!hdev)
410 		return -ENOMEM;
411 
412 	/* Will be NULL in case of simulator device */
413 	hdev->pdev = pdev;
414 
415 	/* Assign status description string */
416 	strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX);
417 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX);
418 	strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX);
419 	strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX);
420 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
421 					"in device creation", HL_STR_MAX);
422 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE],
423 					"in reset after device release", HL_STR_MAX);
424 
425 
426 	/* First, we must find out which ASIC are we handling. This is needed
427 	 * to configure the behavior of the driver (kernel parameters)
428 	 */
429 	hdev->asic_type = get_asic_type(hdev);
430 	if (hdev->asic_type == ASIC_INVALID) {
431 		dev_err(&pdev->dev, "Unsupported ASIC\n");
432 		rc = -ENODEV;
433 		goto free_hdev;
434 	}
435 
436 	copy_kernel_module_params_to_device(hdev);
437 
438 	set_driver_behavior_per_device(hdev);
439 
440 	fixup_device_params(hdev);
441 
442 	mutex_lock(&hl_devs_idr_lock);
443 
444 	/* Always save 2 numbers, 1 for main device and 1 for control.
445 	 * They must be consecutive
446 	 */
447 	main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL);
448 
449 	if (main_id >= 0)
450 		ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
451 					main_id + 2, GFP_KERNEL);
452 
453 	mutex_unlock(&hl_devs_idr_lock);
454 
455 	if ((main_id < 0) || (ctrl_id < 0)) {
456 		if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
457 			pr_err("too many devices in the system\n");
458 
459 		if (main_id >= 0) {
460 			mutex_lock(&hl_devs_idr_lock);
461 			idr_remove(&hl_devs_idr, main_id);
462 			mutex_unlock(&hl_devs_idr_lock);
463 		}
464 
465 		rc = -EBUSY;
466 		goto free_hdev;
467 	}
468 
469 	hdev->id = main_id;
470 	hdev->id_control = ctrl_id;
471 
472 	*dev = hdev;
473 
474 	return 0;
475 
476 free_hdev:
477 	kfree(hdev);
478 	return rc;
479 }
480 
481 /*
482  * destroy_hdev - destroy habanalabs device instance
483  *
484  * @dev: pointer to the habanalabs device structure
485  *
486  */
487 static void destroy_hdev(struct hl_device *hdev)
488 {
489 	/* Remove device from the device list */
490 	mutex_lock(&hl_devs_idr_lock);
491 	idr_remove(&hl_devs_idr, hdev->id);
492 	idr_remove(&hl_devs_idr, hdev->id_control);
493 	mutex_unlock(&hl_devs_idr_lock);
494 
495 	kfree(hdev);
496 }
497 
/*
 * hl_pmops_suspend - suspend callback of the driver's dev_pm_ops
 *
 * @dev: pointer to the generic device structure
 *
 * Return the result of hl_device_suspend(), or 0 if the device has no
 * habanalabs device attached as its driver data.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to suspend PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_suspend(hdev);

	pr_err("device pointer is NULL in suspend\n");
	return 0;
}
511 
/*
 * hl_pmops_resume - resume callback of the driver's dev_pm_ops
 *
 * @dev: pointer to the generic device structure
 *
 * Return the result of hl_device_resume(), or 0 if the device has no
 * habanalabs device attached as its driver data.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to resume PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_resume(hdev);

	pr_err("device pointer is NULL in resume\n");
	return 0;
}
525 
526 /**
527  * hl_pci_probe - probe PCI habanalabs devices
528  *
529  * @pdev: pointer to pci device
530  * @id: pointer to pci device id structure
531  *
532  * Standard PCI probe function for habanalabs device.
533  * Create a new habanalabs device and initialize it according to the
534  * device's type
535  */
536 static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
537 {
538 	struct hl_device *hdev;
539 	int rc;
540 
541 	dev_info(&pdev->dev, HL_NAME
542 		 " device found [%04x:%04x] (rev %x)\n",
543 		 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
544 
545 	rc = create_hdev(&hdev, pdev);
546 	if (rc)
547 		return rc;
548 
549 	pci_set_drvdata(pdev, hdev);
550 
551 	pci_enable_pcie_error_reporting(pdev);
552 
553 	rc = hl_device_init(hdev);
554 	if (rc) {
555 		dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
556 		rc = -ENODEV;
557 		goto disable_device;
558 	}
559 
560 	return 0;
561 
562 disable_device:
563 	pci_disable_pcie_error_reporting(pdev);
564 	pci_set_drvdata(pdev, NULL);
565 	destroy_hdev(hdev);
566 
567 	return rc;
568 }
569 
570 /*
571  * hl_pci_remove - remove PCI habanalabs devices
572  *
573  * @pdev: pointer to pci device
574  *
575  * Standard PCI remove function for habanalabs device
576  */
577 static void hl_pci_remove(struct pci_dev *pdev)
578 {
579 	struct hl_device *hdev;
580 
581 	hdev = pci_get_drvdata(pdev);
582 	if (!hdev)
583 		return;
584 
585 	hl_device_fini(hdev);
586 	pci_disable_pcie_error_reporting(pdev);
587 	pci_set_drvdata(pdev, NULL);
588 	destroy_hdev(hdev);
589 }
590 
591 /**
592  * hl_pci_err_detected - a PCI bus error detected on this device
593  *
594  * @pdev: pointer to pci device
595  * @state: PCI error type
596  *
597  * Called by the PCI subsystem whenever a non-correctable
598  * PCI bus error is detected
599  */
600 static pci_ers_result_t
601 hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
602 {
603 	struct hl_device *hdev = pci_get_drvdata(pdev);
604 	enum pci_ers_result result;
605 
606 	switch (state) {
607 	case pci_channel_io_normal:
608 		dev_warn(hdev->dev, "PCI normal state error detected\n");
609 		return PCI_ERS_RESULT_CAN_RECOVER;
610 
611 	case pci_channel_io_frozen:
612 		dev_warn(hdev->dev, "PCI frozen state error detected\n");
613 		result = PCI_ERS_RESULT_NEED_RESET;
614 		break;
615 
616 	case pci_channel_io_perm_failure:
617 		dev_warn(hdev->dev, "PCI failure state error detected\n");
618 		result = PCI_ERS_RESULT_DISCONNECT;
619 		break;
620 
621 	default:
622 		result = PCI_ERS_RESULT_NONE;
623 	}
624 
625 	hdev->asic_funcs->halt_engines(hdev, true, false);
626 
627 	return result;
628 }
629 
630 /**
631  * hl_pci_err_resume - resume after a PCI slot reset
632  *
633  * @pdev: pointer to pci device
634  *
635  */
636 static void hl_pci_err_resume(struct pci_dev *pdev)
637 {
638 	struct hl_device *hdev = pci_get_drvdata(pdev);
639 
640 	dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
641 	hl_device_resume(hdev);
642 }
643 
644 /**
645  * hl_pci_err_slot_reset - a PCI slot reset has just happened
646  *
647  * @pdev: pointer to pci device
648  *
649  * Determine if the driver can recover from the PCI slot reset
650  */
651 static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
652 {
653 	struct hl_device *hdev = pci_get_drvdata(pdev);
654 
655 	dev_warn(hdev->dev, "PCI slot reset detected\n");
656 
657 	return PCI_ERS_RESULT_RECOVERED;
658 }
659 
660 static const struct dev_pm_ops hl_pm_ops = {
661 	.suspend = hl_pmops_suspend,
662 	.resume = hl_pmops_resume,
663 };
664 
665 static const struct pci_error_handlers hl_pci_err_handler = {
666 	.error_detected = hl_pci_err_detected,
667 	.slot_reset = hl_pci_err_slot_reset,
668 	.resume = hl_pci_err_resume,
669 };
670 
671 static struct pci_driver hl_pci_driver = {
672 	.name = HL_NAME,
673 	.id_table = ids,
674 	.probe = hl_pci_probe,
675 	.remove = hl_pci_remove,
676 	.shutdown = hl_pci_remove,
677 	.driver = {
678 		.name = HL_NAME,
679 		.pm = &hl_pm_ops,
680 		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
681 	},
682 	.err_handler = &hl_pci_err_handler,
683 };
684 
685 /*
686  * hl_init - Initialize the habanalabs kernel driver
687  */
688 static int __init hl_init(void)
689 {
690 	int rc;
691 	dev_t dev;
692 
693 	pr_info("loading driver\n");
694 
695 	rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
696 	if (rc < 0) {
697 		pr_err("unable to get major\n");
698 		return rc;
699 	}
700 
701 	hl_major = MAJOR(dev);
702 
703 	hl_class = class_create(THIS_MODULE, HL_NAME);
704 	if (IS_ERR(hl_class)) {
705 		pr_err("failed to allocate class\n");
706 		rc = PTR_ERR(hl_class);
707 		goto remove_major;
708 	}
709 
710 	hl_debugfs_init();
711 
712 	rc = pci_register_driver(&hl_pci_driver);
713 	if (rc) {
714 		pr_err("failed to register pci device\n");
715 		goto remove_debugfs;
716 	}
717 
718 	pr_debug("driver loaded\n");
719 
720 	return 0;
721 
722 remove_debugfs:
723 	hl_debugfs_fini();
724 	class_destroy(hl_class);
725 remove_major:
726 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
727 	return rc;
728 }
729 
730 /*
731  * hl_exit - Release all resources of the habanalabs kernel driver
732  */
733 static void __exit hl_exit(void)
734 {
735 	pci_unregister_driver(&hl_pci_driver);
736 
737 	/*
738 	 * Removing debugfs must be after all devices or simulator devices
739 	 * have been removed because otherwise we get a bug in the
740 	 * debugfs module for referencing NULL objects
741 	 */
742 	hl_debugfs_fini();
743 
744 	class_destroy(hl_class);
745 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
746 
747 	idr_destroy(&hl_devs_idr);
748 
749 	pr_debug("driver removed\n");
750 }
751 
752 module_init(hl_init);
753 module_exit(hl_exit);
754