1e65e175bSOded Gabbay // SPDX-License-Identifier: GPL-2.0
2e65e175bSOded Gabbay
3e65e175bSOded Gabbay /*
4e65e175bSOded Gabbay * Copyright 2016-2021 HabanaLabs, Ltd.
5e65e175bSOded Gabbay * All Rights Reserved.
6e65e175bSOded Gabbay *
7e65e175bSOded Gabbay */
8e65e175bSOded Gabbay
9e65e175bSOded Gabbay #define pr_fmt(fmt) "habanalabs: " fmt
10e65e175bSOded Gabbay
11e65e175bSOded Gabbay #include "habanalabs.h"
12e65e175bSOded Gabbay #include "../include/hw_ip/pci/pci_general.h"
13e65e175bSOded Gabbay
14e65e175bSOded Gabbay #include <linux/pci.h>
15e65e175bSOded Gabbay #include <linux/module.h>
16314a7ffdSMoti Haimovski #include <linux/vmalloc.h>
17e65e175bSOded Gabbay
18e65e175bSOded Gabbay #define CREATE_TRACE_POINTS
19e65e175bSOded Gabbay #include <trace/events/habanalabs.h>
20e65e175bSOded Gabbay
21e65e175bSOded Gabbay #define HL_DRIVER_AUTHOR "HabanaLabs Kernel Driver Team"
22e65e175bSOded Gabbay
23e65e175bSOded Gabbay #define HL_DRIVER_DESC "Driver for HabanaLabs's AI Accelerators"
24e65e175bSOded Gabbay
25e65e175bSOded Gabbay MODULE_AUTHOR(HL_DRIVER_AUTHOR);
26e65e175bSOded Gabbay MODULE_DESCRIPTION(HL_DRIVER_DESC);
27e65e175bSOded Gabbay MODULE_LICENSE("GPL v2");
28e65e175bSOded Gabbay
29e65e175bSOded Gabbay static int hl_major;
30e65e175bSOded Gabbay static struct class *hl_class;
31e65e175bSOded Gabbay static DEFINE_IDR(hl_devs_idr);
32e65e175bSOded Gabbay static DEFINE_MUTEX(hl_devs_idr_lock);
33e65e175bSOded Gabbay
34e65e175bSOded Gabbay #define HL_DEFAULT_TIMEOUT_LOCKED 30 /* 30 seconds */
35e65e175bSOded Gabbay #define GAUDI_DEFAULT_TIMEOUT_LOCKED 600 /* 10 minutes */
36e65e175bSOded Gabbay
37e65e175bSOded Gabbay static int timeout_locked = HL_DEFAULT_TIMEOUT_LOCKED;
38e65e175bSOded Gabbay static int reset_on_lockup = 1;
39e65e175bSOded Gabbay static int memory_scrub;
40e65e175bSOded Gabbay static ulong boot_error_status_mask = ULONG_MAX;
41e65e175bSOded Gabbay
42e65e175bSOded Gabbay module_param(timeout_locked, int, 0444);
43e65e175bSOded Gabbay MODULE_PARM_DESC(timeout_locked,
44e65e175bSOded Gabbay "Device lockup timeout in seconds (0 = disabled, default 30s)");
45e65e175bSOded Gabbay
46e65e175bSOded Gabbay module_param(reset_on_lockup, int, 0444);
47e65e175bSOded Gabbay MODULE_PARM_DESC(reset_on_lockup,
48e65e175bSOded Gabbay "Do device reset on lockup (0 = no, 1 = yes, default yes)");
49e65e175bSOded Gabbay
50e65e175bSOded Gabbay module_param(memory_scrub, int, 0444);
51e65e175bSOded Gabbay MODULE_PARM_DESC(memory_scrub,
52e65e175bSOded Gabbay "Scrub device memory in various states (0 = no, 1 = yes, default no)");
53e65e175bSOded Gabbay
54e65e175bSOded Gabbay module_param(boot_error_status_mask, ulong, 0444);
55e65e175bSOded Gabbay MODULE_PARM_DESC(boot_error_status_mask,
56e65e175bSOded Gabbay "Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
57e65e175bSOded Gabbay
58e65e175bSOded Gabbay #define PCI_IDS_GOYA 0x0001
59e65e175bSOded Gabbay #define PCI_IDS_GAUDI 0x1000
60e65e175bSOded Gabbay #define PCI_IDS_GAUDI_SEC 0x1010
61e65e175bSOded Gabbay
62e65e175bSOded Gabbay #define PCI_IDS_GAUDI2 0x1020
63e65e175bSOded Gabbay
64e65e175bSOded Gabbay static const struct pci_device_id ids[] = {
65e65e175bSOded Gabbay { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
66e65e175bSOded Gabbay { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
67e65e175bSOded Gabbay { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), },
68e65e175bSOded Gabbay { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI2), },
69e65e175bSOded Gabbay { 0, }
70e65e175bSOded Gabbay };
71e65e175bSOded Gabbay MODULE_DEVICE_TABLE(pci, ids);
72e65e175bSOded Gabbay
73e65e175bSOded Gabbay /*
74e65e175bSOded Gabbay * get_asic_type - translate device id to asic type
75e65e175bSOded Gabbay *
76e65e175bSOded Gabbay * @hdev: pointer to habanalabs device structure.
77e65e175bSOded Gabbay *
78e65e175bSOded Gabbay * Translate device id and revision id to asic type.
79e65e175bSOded Gabbay * In case of unidentified device, return -1
80e65e175bSOded Gabbay */
get_asic_type(struct hl_device * hdev)81e65e175bSOded Gabbay static enum hl_asic_type get_asic_type(struct hl_device *hdev)
82e65e175bSOded Gabbay {
83e65e175bSOded Gabbay struct pci_dev *pdev = hdev->pdev;
84e65e175bSOded Gabbay enum hl_asic_type asic_type = ASIC_INVALID;
85e65e175bSOded Gabbay
86e65e175bSOded Gabbay switch (pdev->device) {
87e65e175bSOded Gabbay case PCI_IDS_GOYA:
88e65e175bSOded Gabbay asic_type = ASIC_GOYA;
89e65e175bSOded Gabbay break;
90e65e175bSOded Gabbay case PCI_IDS_GAUDI:
91e65e175bSOded Gabbay asic_type = ASIC_GAUDI;
92e65e175bSOded Gabbay break;
93e65e175bSOded Gabbay case PCI_IDS_GAUDI_SEC:
94e65e175bSOded Gabbay asic_type = ASIC_GAUDI_SEC;
95e65e175bSOded Gabbay break;
96e65e175bSOded Gabbay case PCI_IDS_GAUDI2:
97e65e175bSOded Gabbay switch (pdev->revision) {
98e65e175bSOded Gabbay case REV_ID_A:
99e65e175bSOded Gabbay asic_type = ASIC_GAUDI2;
100e65e175bSOded Gabbay break;
101e65e175bSOded Gabbay case REV_ID_B:
102e65e175bSOded Gabbay asic_type = ASIC_GAUDI2B;
103e65e175bSOded Gabbay break;
104*9aa2cba7SOded Gabbay case REV_ID_C:
105*9aa2cba7SOded Gabbay asic_type = ASIC_GAUDI2C;
106*9aa2cba7SOded Gabbay break;
107e65e175bSOded Gabbay default:
108e65e175bSOded Gabbay break;
109e65e175bSOded Gabbay }
110e65e175bSOded Gabbay break;
111e65e175bSOded Gabbay default:
112e65e175bSOded Gabbay break;
113e65e175bSOded Gabbay }
114e65e175bSOded Gabbay
115e65e175bSOded Gabbay return asic_type;
116e65e175bSOded Gabbay }
117e65e175bSOded Gabbay
is_asic_secured(enum hl_asic_type asic_type)118e65e175bSOded Gabbay static bool is_asic_secured(enum hl_asic_type asic_type)
119e65e175bSOded Gabbay {
120e65e175bSOded Gabbay switch (asic_type) {
121e65e175bSOded Gabbay case ASIC_GAUDI_SEC:
122e65e175bSOded Gabbay return true;
123e65e175bSOded Gabbay default:
124e65e175bSOded Gabbay return false;
125e65e175bSOded Gabbay }
126e65e175bSOded Gabbay }
127e65e175bSOded Gabbay
128e65e175bSOded Gabbay /*
129e65e175bSOded Gabbay * hl_device_open - open function for habanalabs device
130e65e175bSOded Gabbay *
131e65e175bSOded Gabbay * @inode: pointer to inode structure
132e65e175bSOded Gabbay * @filp: pointer to file structure
133e65e175bSOded Gabbay *
134e65e175bSOded Gabbay * Called when process opens an habanalabs device.
135e65e175bSOded Gabbay */
hl_device_open(struct inode * inode,struct file * filp)136e65e175bSOded Gabbay int hl_device_open(struct inode *inode, struct file *filp)
137e65e175bSOded Gabbay {
138e65e175bSOded Gabbay enum hl_device_status status;
139e65e175bSOded Gabbay struct hl_device *hdev;
140e65e175bSOded Gabbay struct hl_fpriv *hpriv;
141e65e175bSOded Gabbay int rc;
142e65e175bSOded Gabbay
143e65e175bSOded Gabbay mutex_lock(&hl_devs_idr_lock);
144e65e175bSOded Gabbay hdev = idr_find(&hl_devs_idr, iminor(inode));
145e65e175bSOded Gabbay mutex_unlock(&hl_devs_idr_lock);
146e65e175bSOded Gabbay
147e65e175bSOded Gabbay if (!hdev) {
148e65e175bSOded Gabbay pr_err("Couldn't find device %d:%d\n",
149e65e175bSOded Gabbay imajor(inode), iminor(inode));
150e65e175bSOded Gabbay return -ENXIO;
151e65e175bSOded Gabbay }
152e65e175bSOded Gabbay
153e65e175bSOded Gabbay hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
154e65e175bSOded Gabbay if (!hpriv)
155e65e175bSOded Gabbay return -ENOMEM;
156e65e175bSOded Gabbay
157e65e175bSOded Gabbay hpriv->hdev = hdev;
158e65e175bSOded Gabbay filp->private_data = hpriv;
159e65e175bSOded Gabbay hpriv->filp = filp;
160e65e175bSOded Gabbay
161e65e175bSOded Gabbay mutex_init(&hpriv->notifier_event.lock);
162e65e175bSOded Gabbay mutex_init(&hpriv->restore_phase_mutex);
163e65e175bSOded Gabbay mutex_init(&hpriv->ctx_lock);
164e65e175bSOded Gabbay kref_init(&hpriv->refcount);
165e65e175bSOded Gabbay nonseekable_open(inode, filp);
166e65e175bSOded Gabbay
167e65e175bSOded Gabbay hl_ctx_mgr_init(&hpriv->ctx_mgr);
168e2a079a2STomer Tayar hl_mem_mgr_init(hpriv->hdev->dev, &hpriv->mem_mgr);
169e65e175bSOded Gabbay
170e65e175bSOded Gabbay hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
171e65e175bSOded Gabbay
172e65e175bSOded Gabbay mutex_lock(&hdev->fpriv_list_lock);
173e65e175bSOded Gabbay
174e65e175bSOded Gabbay if (!hl_device_operational(hdev, &status)) {
175e65e175bSOded Gabbay dev_dbg_ratelimited(hdev->dev,
176e65e175bSOded Gabbay "Can't open %s because it is %s\n",
177e65e175bSOded Gabbay dev_name(hdev->dev), hdev->status[status]);
178e65e175bSOded Gabbay
179e65e175bSOded Gabbay if (status == HL_DEVICE_STATUS_IN_RESET ||
180e65e175bSOded Gabbay status == HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE)
181e65e175bSOded Gabbay rc = -EAGAIN;
182e65e175bSOded Gabbay else
183e65e175bSOded Gabbay rc = -EPERM;
184e65e175bSOded Gabbay
185e65e175bSOded Gabbay goto out_err;
186e65e175bSOded Gabbay }
187e65e175bSOded Gabbay
188e65e175bSOded Gabbay if (hdev->is_in_dram_scrub) {
189e65e175bSOded Gabbay dev_dbg_ratelimited(hdev->dev,
190e65e175bSOded Gabbay "Can't open %s during dram scrub\n",
191e65e175bSOded Gabbay dev_name(hdev->dev));
192e65e175bSOded Gabbay rc = -EAGAIN;
193e65e175bSOded Gabbay goto out_err;
194e65e175bSOded Gabbay }
195e65e175bSOded Gabbay
196e65e175bSOded Gabbay if (hdev->compute_ctx_in_release) {
197e65e175bSOded Gabbay dev_dbg_ratelimited(hdev->dev,
198e65e175bSOded Gabbay "Can't open %s because another user is still releasing it\n",
199e65e175bSOded Gabbay dev_name(hdev->dev));
200e65e175bSOded Gabbay rc = -EAGAIN;
201e65e175bSOded Gabbay goto out_err;
202e65e175bSOded Gabbay }
203e65e175bSOded Gabbay
204e65e175bSOded Gabbay if (hdev->is_compute_ctx_active) {
205e65e175bSOded Gabbay dev_dbg_ratelimited(hdev->dev,
206e65e175bSOded Gabbay "Can't open %s because another user is working on it\n",
207e65e175bSOded Gabbay dev_name(hdev->dev));
208e65e175bSOded Gabbay rc = -EBUSY;
209e65e175bSOded Gabbay goto out_err;
210e65e175bSOded Gabbay }
211e65e175bSOded Gabbay
212e65e175bSOded Gabbay rc = hl_ctx_create(hdev, hpriv);
213e65e175bSOded Gabbay if (rc) {
214e65e175bSOded Gabbay dev_err(hdev->dev, "Failed to create context %d\n", rc);
215e65e175bSOded Gabbay goto out_err;
216e65e175bSOded Gabbay }
217e65e175bSOded Gabbay
218e65e175bSOded Gabbay list_add(&hpriv->dev_node, &hdev->fpriv_list);
219e65e175bSOded Gabbay mutex_unlock(&hdev->fpriv_list_lock);
220e65e175bSOded Gabbay
221e65e175bSOded Gabbay hdev->asic_funcs->send_device_activity(hdev, true);
222e65e175bSOded Gabbay
223e65e175bSOded Gabbay hl_debugfs_add_file(hpriv);
224e65e175bSOded Gabbay
225e6f49e96SDani Liberman hl_enable_err_info_capture(&hdev->captured_err_info);
226e65e175bSOded Gabbay
227e65e175bSOded Gabbay hdev->open_counter++;
228e65e175bSOded Gabbay hdev->last_successful_open_jif = jiffies;
229e65e175bSOded Gabbay hdev->last_successful_open_ktime = ktime_get();
230e65e175bSOded Gabbay
231e65e175bSOded Gabbay return 0;
232e65e175bSOded Gabbay
233e65e175bSOded Gabbay out_err:
234e65e175bSOded Gabbay mutex_unlock(&hdev->fpriv_list_lock);
235e65e175bSOded Gabbay hl_mem_mgr_fini(&hpriv->mem_mgr);
2362e8e9a89STomer Tayar hl_mem_mgr_idr_destroy(&hpriv->mem_mgr);
237e65e175bSOded Gabbay hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
238e65e175bSOded Gabbay filp->private_data = NULL;
239e65e175bSOded Gabbay mutex_destroy(&hpriv->ctx_lock);
240e65e175bSOded Gabbay mutex_destroy(&hpriv->restore_phase_mutex);
241e65e175bSOded Gabbay mutex_destroy(&hpriv->notifier_event.lock);
242e65e175bSOded Gabbay put_pid(hpriv->taskpid);
243e65e175bSOded Gabbay
244e65e175bSOded Gabbay kfree(hpriv);
245e65e175bSOded Gabbay
246e65e175bSOded Gabbay return rc;
247e65e175bSOded Gabbay }
248e65e175bSOded Gabbay
hl_device_open_ctrl(struct inode * inode,struct file * filp)249e65e175bSOded Gabbay int hl_device_open_ctrl(struct inode *inode, struct file *filp)
250e65e175bSOded Gabbay {
251e65e175bSOded Gabbay struct hl_device *hdev;
252e65e175bSOded Gabbay struct hl_fpriv *hpriv;
253e65e175bSOded Gabbay int rc;
254e65e175bSOded Gabbay
255e65e175bSOded Gabbay mutex_lock(&hl_devs_idr_lock);
256e65e175bSOded Gabbay hdev = idr_find(&hl_devs_idr, iminor(inode));
257e65e175bSOded Gabbay mutex_unlock(&hl_devs_idr_lock);
258e65e175bSOded Gabbay
259e65e175bSOded Gabbay if (!hdev) {
260e65e175bSOded Gabbay pr_err("Couldn't find device %d:%d\n",
261e65e175bSOded Gabbay imajor(inode), iminor(inode));
262e65e175bSOded Gabbay return -ENXIO;
263e65e175bSOded Gabbay }
264e65e175bSOded Gabbay
265e65e175bSOded Gabbay hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
266e65e175bSOded Gabbay if (!hpriv)
267e65e175bSOded Gabbay return -ENOMEM;
268e65e175bSOded Gabbay
269e65e175bSOded Gabbay /* Prevent other routines from reading partial hpriv data by
270e65e175bSOded Gabbay * initializing hpriv fields before inserting it to the list
271e65e175bSOded Gabbay */
272e65e175bSOded Gabbay hpriv->hdev = hdev;
273e65e175bSOded Gabbay filp->private_data = hpriv;
274e65e175bSOded Gabbay hpriv->filp = filp;
275e65e175bSOded Gabbay
276e65e175bSOded Gabbay mutex_init(&hpriv->notifier_event.lock);
277e65e175bSOded Gabbay nonseekable_open(inode, filp);
278e65e175bSOded Gabbay
279e65e175bSOded Gabbay hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
280e65e175bSOded Gabbay
281e65e175bSOded Gabbay mutex_lock(&hdev->fpriv_ctrl_list_lock);
282e65e175bSOded Gabbay
283e65e175bSOded Gabbay if (!hl_ctrl_device_operational(hdev, NULL)) {
284e65e175bSOded Gabbay dev_dbg_ratelimited(hdev->dev_ctrl,
285e65e175bSOded Gabbay "Can't open %s because it is disabled\n",
286e65e175bSOded Gabbay dev_name(hdev->dev_ctrl));
287e65e175bSOded Gabbay rc = -EPERM;
288e65e175bSOded Gabbay goto out_err;
289e65e175bSOded Gabbay }
290e65e175bSOded Gabbay
291e65e175bSOded Gabbay list_add(&hpriv->dev_node, &hdev->fpriv_ctrl_list);
292e65e175bSOded Gabbay mutex_unlock(&hdev->fpriv_ctrl_list_lock);
293e65e175bSOded Gabbay
294e65e175bSOded Gabbay return 0;
295e65e175bSOded Gabbay
296e65e175bSOded Gabbay out_err:
297e65e175bSOded Gabbay mutex_unlock(&hdev->fpriv_ctrl_list_lock);
298e65e175bSOded Gabbay filp->private_data = NULL;
299e65e175bSOded Gabbay put_pid(hpriv->taskpid);
300e65e175bSOded Gabbay
301e65e175bSOded Gabbay kfree(hpriv);
302e65e175bSOded Gabbay
303e65e175bSOded Gabbay return rc;
304e65e175bSOded Gabbay }
305e65e175bSOded Gabbay
set_driver_behavior_per_device(struct hl_device * hdev)306e65e175bSOded Gabbay static void set_driver_behavior_per_device(struct hl_device *hdev)
307e65e175bSOded Gabbay {
308e65e175bSOded Gabbay hdev->nic_ports_mask = 0;
309e65e175bSOded Gabbay hdev->fw_components = FW_TYPE_ALL_TYPES;
310e65e175bSOded Gabbay hdev->cpu_queues_enable = 1;
311e65e175bSOded Gabbay hdev->pldm = 0;
312e65e175bSOded Gabbay hdev->hard_reset_on_fw_events = 1;
313e65e175bSOded Gabbay hdev->bmc_enable = 1;
314e65e175bSOded Gabbay hdev->reset_on_preboot_fail = 1;
315e65e175bSOded Gabbay hdev->heartbeat = 1;
316e65e175bSOded Gabbay }
317e65e175bSOded Gabbay
copy_kernel_module_params_to_device(struct hl_device * hdev)318e65e175bSOded Gabbay static void copy_kernel_module_params_to_device(struct hl_device *hdev)
319e65e175bSOded Gabbay {
320e65e175bSOded Gabbay hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type);
321e65e175bSOded Gabbay
322e65e175bSOded Gabbay hdev->major = hl_major;
323323adae9SOded Gabbay hdev->hclass = hl_class;
324e65e175bSOded Gabbay hdev->memory_scrub = memory_scrub;
325e65e175bSOded Gabbay hdev->reset_on_lockup = reset_on_lockup;
326e65e175bSOded Gabbay hdev->boot_error_status_mask = boot_error_status_mask;
327e65e175bSOded Gabbay }
328e65e175bSOded Gabbay
fixup_device_params_per_asic(struct hl_device * hdev,int timeout)329e65e175bSOded Gabbay static void fixup_device_params_per_asic(struct hl_device *hdev, int timeout)
330e65e175bSOded Gabbay {
331e65e175bSOded Gabbay switch (hdev->asic_type) {
332e65e175bSOded Gabbay case ASIC_GAUDI:
333e65e175bSOded Gabbay case ASIC_GAUDI_SEC:
334e65e175bSOded Gabbay /* If user didn't request a different timeout than the default one, we have
335e65e175bSOded Gabbay * a different default timeout for Gaudi
336e65e175bSOded Gabbay */
337e65e175bSOded Gabbay if (timeout == HL_DEFAULT_TIMEOUT_LOCKED)
338e65e175bSOded Gabbay hdev->timeout_jiffies = msecs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED *
339e65e175bSOded Gabbay MSEC_PER_SEC);
340e65e175bSOded Gabbay
341e65e175bSOded Gabbay hdev->reset_upon_device_release = 0;
342e65e175bSOded Gabbay break;
343e65e175bSOded Gabbay
344e65e175bSOded Gabbay case ASIC_GOYA:
345e65e175bSOded Gabbay hdev->reset_upon_device_release = 0;
346e65e175bSOded Gabbay break;
347e65e175bSOded Gabbay
348e65e175bSOded Gabbay default:
349e65e175bSOded Gabbay hdev->reset_upon_device_release = 1;
350e65e175bSOded Gabbay break;
351e65e175bSOded Gabbay }
352e65e175bSOded Gabbay }
353e65e175bSOded Gabbay
fixup_device_params(struct hl_device * hdev)354e65e175bSOded Gabbay static int fixup_device_params(struct hl_device *hdev)
355e65e175bSOded Gabbay {
356e65e175bSOded Gabbay int tmp_timeout;
357e65e175bSOded Gabbay
358e65e175bSOded Gabbay tmp_timeout = timeout_locked;
359e65e175bSOded Gabbay
360e65e175bSOded Gabbay hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
361e65e175bSOded Gabbay hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
362e65e175bSOded Gabbay
363e65e175bSOded Gabbay if (tmp_timeout)
364e65e175bSOded Gabbay hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * MSEC_PER_SEC);
365e65e175bSOded Gabbay else
366e65e175bSOded Gabbay hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
367e65e175bSOded Gabbay
368e65e175bSOded Gabbay hdev->stop_on_err = true;
369e65e175bSOded Gabbay hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
370e65e175bSOded Gabbay hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
371e65e175bSOded Gabbay
372e65e175bSOded Gabbay /* Enable only after the initialization of the device */
373e65e175bSOded Gabbay hdev->disabled = true;
374e65e175bSOded Gabbay
375e65e175bSOded Gabbay if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU) &&
376e65e175bSOded Gabbay (hdev->fw_components & ~FW_TYPE_PREBOOT_CPU)) {
377e65e175bSOded Gabbay pr_err("Preboot must be set along with other components");
378e65e175bSOded Gabbay return -EINVAL;
379e65e175bSOded Gabbay }
380e65e175bSOded Gabbay
381e65e175bSOded Gabbay /* If CPU queues not enabled, no way to do heartbeat */
382e65e175bSOded Gabbay if (!hdev->cpu_queues_enable)
383e65e175bSOded Gabbay hdev->heartbeat = 0;
384e65e175bSOded Gabbay fixup_device_params_per_asic(hdev, tmp_timeout);
385e65e175bSOded Gabbay
386e65e175bSOded Gabbay return 0;
387e65e175bSOded Gabbay }
388e65e175bSOded Gabbay
389e65e175bSOded Gabbay /**
390e65e175bSOded Gabbay * create_hdev - create habanalabs device instance
391e65e175bSOded Gabbay *
392e65e175bSOded Gabbay * @dev: will hold the pointer to the new habanalabs device structure
393e65e175bSOded Gabbay * @pdev: pointer to the pci device
394e65e175bSOded Gabbay *
395e65e175bSOded Gabbay * Allocate memory for habanalabs device and initialize basic fields
396e65e175bSOded Gabbay * Identify the ASIC type
397e65e175bSOded Gabbay * Allocate ID (minor) for the device (only for real devices)
398e65e175bSOded Gabbay */
create_hdev(struct hl_device ** dev,struct pci_dev * pdev)399e65e175bSOded Gabbay static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
400e65e175bSOded Gabbay {
401e65e175bSOded Gabbay int main_id, ctrl_id = 0, rc = 0;
402e65e175bSOded Gabbay struct hl_device *hdev;
403e65e175bSOded Gabbay
404e65e175bSOded Gabbay *dev = NULL;
405e65e175bSOded Gabbay
406e65e175bSOded Gabbay hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
407e65e175bSOded Gabbay if (!hdev)
408e65e175bSOded Gabbay return -ENOMEM;
409e65e175bSOded Gabbay
410e65e175bSOded Gabbay /* Will be NULL in case of simulator device */
411e65e175bSOded Gabbay hdev->pdev = pdev;
412e65e175bSOded Gabbay
413e65e175bSOded Gabbay /* Assign status description string */
414e65e175bSOded Gabbay strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX);
415e65e175bSOded Gabbay strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX);
416e65e175bSOded Gabbay strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX);
417e65e175bSOded Gabbay strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX);
418e65e175bSOded Gabbay strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
419e65e175bSOded Gabbay "in device creation", HL_STR_MAX);
420e65e175bSOded Gabbay strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE],
421e65e175bSOded Gabbay "in reset after device release", HL_STR_MAX);
422e65e175bSOded Gabbay
423e65e175bSOded Gabbay
424e65e175bSOded Gabbay /* First, we must find out which ASIC are we handling. This is needed
425e65e175bSOded Gabbay * to configure the behavior of the driver (kernel parameters)
426e65e175bSOded Gabbay */
427e65e175bSOded Gabbay hdev->asic_type = get_asic_type(hdev);
428e65e175bSOded Gabbay if (hdev->asic_type == ASIC_INVALID) {
429e65e175bSOded Gabbay dev_err(&pdev->dev, "Unsupported ASIC\n");
430e65e175bSOded Gabbay rc = -ENODEV;
431e65e175bSOded Gabbay goto free_hdev;
432e65e175bSOded Gabbay }
433e65e175bSOded Gabbay
434e65e175bSOded Gabbay copy_kernel_module_params_to_device(hdev);
435e65e175bSOded Gabbay
436e65e175bSOded Gabbay set_driver_behavior_per_device(hdev);
437e65e175bSOded Gabbay
438e65e175bSOded Gabbay fixup_device_params(hdev);
439e65e175bSOded Gabbay
440e65e175bSOded Gabbay mutex_lock(&hl_devs_idr_lock);
441e65e175bSOded Gabbay
442e65e175bSOded Gabbay /* Always save 2 numbers, 1 for main device and 1 for control.
443e65e175bSOded Gabbay * They must be consecutive
444e65e175bSOded Gabbay */
445e65e175bSOded Gabbay main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL);
446e65e175bSOded Gabbay
447e65e175bSOded Gabbay if (main_id >= 0)
448e65e175bSOded Gabbay ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
449e65e175bSOded Gabbay main_id + 2, GFP_KERNEL);
450e65e175bSOded Gabbay
451e65e175bSOded Gabbay mutex_unlock(&hl_devs_idr_lock);
452e65e175bSOded Gabbay
453e65e175bSOded Gabbay if ((main_id < 0) || (ctrl_id < 0)) {
454e65e175bSOded Gabbay if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
455e65e175bSOded Gabbay pr_err("too many devices in the system\n");
456e65e175bSOded Gabbay
457e65e175bSOded Gabbay if (main_id >= 0) {
458e65e175bSOded Gabbay mutex_lock(&hl_devs_idr_lock);
459e65e175bSOded Gabbay idr_remove(&hl_devs_idr, main_id);
460e65e175bSOded Gabbay mutex_unlock(&hl_devs_idr_lock);
461e65e175bSOded Gabbay }
462e65e175bSOded Gabbay
463e65e175bSOded Gabbay rc = -EBUSY;
464e65e175bSOded Gabbay goto free_hdev;
465e65e175bSOded Gabbay }
466e65e175bSOded Gabbay
467e65e175bSOded Gabbay hdev->id = main_id;
468e65e175bSOded Gabbay hdev->id_control = ctrl_id;
469e65e175bSOded Gabbay
470e65e175bSOded Gabbay *dev = hdev;
471e65e175bSOded Gabbay
472e65e175bSOded Gabbay return 0;
473e65e175bSOded Gabbay
474e65e175bSOded Gabbay free_hdev:
475e65e175bSOded Gabbay kfree(hdev);
476e65e175bSOded Gabbay return rc;
477e65e175bSOded Gabbay }
478e65e175bSOded Gabbay
479e65e175bSOded Gabbay /*
480e65e175bSOded Gabbay * destroy_hdev - destroy habanalabs device instance
481e65e175bSOded Gabbay *
482e65e175bSOded Gabbay * @dev: pointer to the habanalabs device structure
483e65e175bSOded Gabbay *
484e65e175bSOded Gabbay */
destroy_hdev(struct hl_device * hdev)485e65e175bSOded Gabbay static void destroy_hdev(struct hl_device *hdev)
486e65e175bSOded Gabbay {
487e65e175bSOded Gabbay /* Remove device from the device list */
488e65e175bSOded Gabbay mutex_lock(&hl_devs_idr_lock);
489e65e175bSOded Gabbay idr_remove(&hl_devs_idr, hdev->id);
490e65e175bSOded Gabbay idr_remove(&hl_devs_idr, hdev->id_control);
491e65e175bSOded Gabbay mutex_unlock(&hl_devs_idr_lock);
492e65e175bSOded Gabbay
493e65e175bSOded Gabbay kfree(hdev);
494e65e175bSOded Gabbay }
495e65e175bSOded Gabbay
/*
 * hl_pmops_suspend - power-management suspend callback
 *
 * @dev: pointer to the generic device structure
 *
 * Forwards to hl_device_suspend(). If no driver data is attached to the
 * device, an error is logged and 0 is returned.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to suspend PCI device\n");

	if (hdev)
		return hl_device_suspend(hdev);

	pr_err("device pointer is NULL in suspend\n");
	return 0;
}
509e65e175bSOded Gabbay
/*
 * hl_pmops_resume - power-management resume callback
 *
 * @dev: pointer to the generic device structure
 *
 * Forwards to hl_device_resume(). If no driver data is attached to the
 * device, an error is logged and 0 is returned.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to resume PCI device\n");

	if (hdev)
		return hl_device_resume(hdev);

	pr_err("device pointer is NULL in resume\n");
	return 0;
}
523e65e175bSOded Gabbay
524e65e175bSOded Gabbay /**
525e65e175bSOded Gabbay * hl_pci_probe - probe PCI habanalabs devices
526e65e175bSOded Gabbay *
527e65e175bSOded Gabbay * @pdev: pointer to pci device
528e65e175bSOded Gabbay * @id: pointer to pci device id structure
529e65e175bSOded Gabbay *
530e65e175bSOded Gabbay * Standard PCI probe function for habanalabs device.
531e65e175bSOded Gabbay * Create a new habanalabs device and initialize it according to the
532e65e175bSOded Gabbay * device's type
533e65e175bSOded Gabbay */
hl_pci_probe(struct pci_dev * pdev,const struct pci_device_id * id)534e65e175bSOded Gabbay static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
535e65e175bSOded Gabbay {
536e65e175bSOded Gabbay struct hl_device *hdev;
537e65e175bSOded Gabbay int rc;
538e65e175bSOded Gabbay
539e65e175bSOded Gabbay dev_info(&pdev->dev, HL_NAME
540e65e175bSOded Gabbay " device found [%04x:%04x] (rev %x)\n",
541e65e175bSOded Gabbay (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
542e65e175bSOded Gabbay
543e65e175bSOded Gabbay rc = create_hdev(&hdev, pdev);
544e65e175bSOded Gabbay if (rc)
545e65e175bSOded Gabbay return rc;
546e65e175bSOded Gabbay
547e65e175bSOded Gabbay pci_set_drvdata(pdev, hdev);
548e65e175bSOded Gabbay
549323adae9SOded Gabbay rc = hl_device_init(hdev);
550e65e175bSOded Gabbay if (rc) {
551e65e175bSOded Gabbay dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
552e65e175bSOded Gabbay rc = -ENODEV;
553e65e175bSOded Gabbay goto disable_device;
554e65e175bSOded Gabbay }
555e65e175bSOded Gabbay
556e65e175bSOded Gabbay return 0;
557e65e175bSOded Gabbay
558e65e175bSOded Gabbay disable_device:
559e65e175bSOded Gabbay pci_set_drvdata(pdev, NULL);
560e65e175bSOded Gabbay destroy_hdev(hdev);
561e65e175bSOded Gabbay
562e65e175bSOded Gabbay return rc;
563e65e175bSOded Gabbay }
564e65e175bSOded Gabbay
565e65e175bSOded Gabbay /*
566e65e175bSOded Gabbay * hl_pci_remove - remove PCI habanalabs devices
567e65e175bSOded Gabbay *
568e65e175bSOded Gabbay * @pdev: pointer to pci device
569e65e175bSOded Gabbay *
570e65e175bSOded Gabbay * Standard PCI remove function for habanalabs device
571e65e175bSOded Gabbay */
hl_pci_remove(struct pci_dev * pdev)572e65e175bSOded Gabbay static void hl_pci_remove(struct pci_dev *pdev)
573e65e175bSOded Gabbay {
574e65e175bSOded Gabbay struct hl_device *hdev;
575e65e175bSOded Gabbay
576e65e175bSOded Gabbay hdev = pci_get_drvdata(pdev);
577e65e175bSOded Gabbay if (!hdev)
578e65e175bSOded Gabbay return;
579e65e175bSOded Gabbay
580e65e175bSOded Gabbay hl_device_fini(hdev);
581e65e175bSOded Gabbay pci_set_drvdata(pdev, NULL);
582e65e175bSOded Gabbay destroy_hdev(hdev);
583e65e175bSOded Gabbay }
584e65e175bSOded Gabbay
585e65e175bSOded Gabbay /**
586e65e175bSOded Gabbay * hl_pci_err_detected - a PCI bus error detected on this device
587e65e175bSOded Gabbay *
588e65e175bSOded Gabbay * @pdev: pointer to pci device
589e65e175bSOded Gabbay * @state: PCI error type
590e65e175bSOded Gabbay *
591e65e175bSOded Gabbay * Called by the PCI subsystem whenever a non-correctable
592e65e175bSOded Gabbay * PCI bus error is detected
593e65e175bSOded Gabbay */
594e65e175bSOded Gabbay static pci_ers_result_t
hl_pci_err_detected(struct pci_dev * pdev,pci_channel_state_t state)595e65e175bSOded Gabbay hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
596e65e175bSOded Gabbay {
597e65e175bSOded Gabbay struct hl_device *hdev = pci_get_drvdata(pdev);
598e65e175bSOded Gabbay enum pci_ers_result result;
599e65e175bSOded Gabbay
600e65e175bSOded Gabbay switch (state) {
601e65e175bSOded Gabbay case pci_channel_io_normal:
602e65e175bSOded Gabbay dev_warn(hdev->dev, "PCI normal state error detected\n");
603e65e175bSOded Gabbay return PCI_ERS_RESULT_CAN_RECOVER;
604e65e175bSOded Gabbay
605e65e175bSOded Gabbay case pci_channel_io_frozen:
606e65e175bSOded Gabbay dev_warn(hdev->dev, "PCI frozen state error detected\n");
607e65e175bSOded Gabbay result = PCI_ERS_RESULT_NEED_RESET;
608e65e175bSOded Gabbay break;
609e65e175bSOded Gabbay
610e65e175bSOded Gabbay case pci_channel_io_perm_failure:
611e65e175bSOded Gabbay dev_warn(hdev->dev, "PCI failure state error detected\n");
612e65e175bSOded Gabbay result = PCI_ERS_RESULT_DISCONNECT;
613e65e175bSOded Gabbay break;
614e65e175bSOded Gabbay
615e65e175bSOded Gabbay default:
616e65e175bSOded Gabbay result = PCI_ERS_RESULT_NONE;
617e65e175bSOded Gabbay }
618e65e175bSOded Gabbay
619e65e175bSOded Gabbay hdev->asic_funcs->halt_engines(hdev, true, false);
620e65e175bSOded Gabbay
621e65e175bSOded Gabbay return result;
622e65e175bSOded Gabbay }
623e65e175bSOded Gabbay
624e65e175bSOded Gabbay /**
625e65e175bSOded Gabbay * hl_pci_err_resume - resume after a PCI slot reset
626e65e175bSOded Gabbay *
627e65e175bSOded Gabbay * @pdev: pointer to pci device
628e65e175bSOded Gabbay *
629e65e175bSOded Gabbay */
hl_pci_err_resume(struct pci_dev * pdev)630e65e175bSOded Gabbay static void hl_pci_err_resume(struct pci_dev *pdev)
631e65e175bSOded Gabbay {
632e65e175bSOded Gabbay struct hl_device *hdev = pci_get_drvdata(pdev);
633e65e175bSOded Gabbay
634e65e175bSOded Gabbay dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
635e65e175bSOded Gabbay hl_device_resume(hdev);
636e65e175bSOded Gabbay }
637e65e175bSOded Gabbay
638e65e175bSOded Gabbay /**
639e65e175bSOded Gabbay * hl_pci_err_slot_reset - a PCI slot reset has just happened
640e65e175bSOded Gabbay *
641e65e175bSOded Gabbay * @pdev: pointer to pci device
642e65e175bSOded Gabbay *
643e65e175bSOded Gabbay * Determine if the driver can recover from the PCI slot reset
644e65e175bSOded Gabbay */
hl_pci_err_slot_reset(struct pci_dev * pdev)645e65e175bSOded Gabbay static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
646e65e175bSOded Gabbay {
647e65e175bSOded Gabbay struct hl_device *hdev = pci_get_drvdata(pdev);
648e65e175bSOded Gabbay
649e65e175bSOded Gabbay dev_warn(hdev->dev, "PCI slot reset detected\n");
650e65e175bSOded Gabbay
651e65e175bSOded Gabbay return PCI_ERS_RESULT_RECOVERED;
652e65e175bSOded Gabbay }
653e65e175bSOded Gabbay
654e65e175bSOded Gabbay static const struct dev_pm_ops hl_pm_ops = {
655e65e175bSOded Gabbay .suspend = hl_pmops_suspend,
656e65e175bSOded Gabbay .resume = hl_pmops_resume,
657e65e175bSOded Gabbay };
658e65e175bSOded Gabbay
659e65e175bSOded Gabbay static const struct pci_error_handlers hl_pci_err_handler = {
660e65e175bSOded Gabbay .error_detected = hl_pci_err_detected,
661e65e175bSOded Gabbay .slot_reset = hl_pci_err_slot_reset,
662e65e175bSOded Gabbay .resume = hl_pci_err_resume,
663e65e175bSOded Gabbay };
664e65e175bSOded Gabbay
665e65e175bSOded Gabbay static struct pci_driver hl_pci_driver = {
666e65e175bSOded Gabbay .name = HL_NAME,
667e65e175bSOded Gabbay .id_table = ids,
668e65e175bSOded Gabbay .probe = hl_pci_probe,
669e65e175bSOded Gabbay .remove = hl_pci_remove,
670e65e175bSOded Gabbay .shutdown = hl_pci_remove,
671e65e175bSOded Gabbay .driver = {
672e65e175bSOded Gabbay .name = HL_NAME,
673e65e175bSOded Gabbay .pm = &hl_pm_ops,
674e65e175bSOded Gabbay .probe_type = PROBE_PREFER_ASYNCHRONOUS,
675e65e175bSOded Gabbay },
676e65e175bSOded Gabbay .err_handler = &hl_pci_err_handler,
677e65e175bSOded Gabbay };
678e65e175bSOded Gabbay
679e65e175bSOded Gabbay /*
680e65e175bSOded Gabbay * hl_init - Initialize the habanalabs kernel driver
681e65e175bSOded Gabbay */
hl_init(void)682e65e175bSOded Gabbay static int __init hl_init(void)
683e65e175bSOded Gabbay {
684e65e175bSOded Gabbay int rc;
685e65e175bSOded Gabbay dev_t dev;
686e65e175bSOded Gabbay
687e65e175bSOded Gabbay pr_info("loading driver\n");
688e65e175bSOded Gabbay
689e65e175bSOded Gabbay rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
690e65e175bSOded Gabbay if (rc < 0) {
691e65e175bSOded Gabbay pr_err("unable to get major\n");
692e65e175bSOded Gabbay return rc;
693e65e175bSOded Gabbay }
694e65e175bSOded Gabbay
695e65e175bSOded Gabbay hl_major = MAJOR(dev);
696e65e175bSOded Gabbay
6971aaba11dSGreg Kroah-Hartman hl_class = class_create(HL_NAME);
698e65e175bSOded Gabbay if (IS_ERR(hl_class)) {
699e65e175bSOded Gabbay pr_err("failed to allocate class\n");
700e65e175bSOded Gabbay rc = PTR_ERR(hl_class);
701e65e175bSOded Gabbay goto remove_major;
702e65e175bSOded Gabbay }
703e65e175bSOded Gabbay
704e65e175bSOded Gabbay hl_debugfs_init();
705e65e175bSOded Gabbay
706e65e175bSOded Gabbay rc = pci_register_driver(&hl_pci_driver);
707e65e175bSOded Gabbay if (rc) {
708e65e175bSOded Gabbay pr_err("failed to register pci device\n");
709e65e175bSOded Gabbay goto remove_debugfs;
710e65e175bSOded Gabbay }
711e65e175bSOded Gabbay
712e65e175bSOded Gabbay pr_debug("driver loaded\n");
713e65e175bSOded Gabbay
714e65e175bSOded Gabbay return 0;
715e65e175bSOded Gabbay
716e65e175bSOded Gabbay remove_debugfs:
717e65e175bSOded Gabbay hl_debugfs_fini();
718e65e175bSOded Gabbay class_destroy(hl_class);
719e65e175bSOded Gabbay remove_major:
720e65e175bSOded Gabbay unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
721e65e175bSOded Gabbay return rc;
722e65e175bSOded Gabbay }
723e65e175bSOded Gabbay
724e65e175bSOded Gabbay /*
725e65e175bSOded Gabbay * hl_exit - Release all resources of the habanalabs kernel driver
726e65e175bSOded Gabbay */
hl_exit(void)727e65e175bSOded Gabbay static void __exit hl_exit(void)
728e65e175bSOded Gabbay {
729e65e175bSOded Gabbay pci_unregister_driver(&hl_pci_driver);
730e65e175bSOded Gabbay
731e65e175bSOded Gabbay /*
732e65e175bSOded Gabbay * Removing debugfs must be after all devices or simulator devices
733e65e175bSOded Gabbay * have been removed because otherwise we get a bug in the
734e65e175bSOded Gabbay * debugfs module for referencing NULL objects
735e65e175bSOded Gabbay */
736e65e175bSOded Gabbay hl_debugfs_fini();
737e65e175bSOded Gabbay
738e65e175bSOded Gabbay class_destroy(hl_class);
739e65e175bSOded Gabbay unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
740e65e175bSOded Gabbay
741e65e175bSOded Gabbay idr_destroy(&hl_devs_idr);
742e65e175bSOded Gabbay
743e65e175bSOded Gabbay pr_debug("driver removed\n");
744e65e175bSOded Gabbay }
745e65e175bSOded Gabbay
/* Register the module load and unload entry points */
module_init(hl_init);
module_exit(hl_exit);
748