// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2016-2022 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#define pr_fmt(fmt)	"habanalabs: " fmt

#include <uapi/drm/habanalabs_accel.h>
#include "habanalabs.h"

#include <linux/pci.h>
#include <linux/hwmon.h>
#include <linux/vmalloc.h>

#include <trace/events/habanalabs.h>

#define HL_RESET_DELAY_USEC			10000	/* 10ms */

#define HL_DEVICE_RELEASE_WATCHDOG_TIMEOUT_SEC	5

enum dma_alloc_type {
	DMA_ALLOC_COHERENT,
	DMA_ALLOC_CPU_ACCESSIBLE,
	DMA_ALLOC_POOL,
};

#define MEM_SCRUB_DEFAULT_VAL 0x1122334455667788

/*
 * hl_set_dram_bar - sets the bar to allow later access to address
 *
 * @hdev: pointer to habanalabs device structure.
 * @addr: the address the caller wants to access.
 * @region: the PCI region.
 * @new_bar_region_base: the new BAR region base address.
 *
 * @return: the old BAR base address on success, U64_MAX for failure.
 *	    The caller should set it back to the old address after use.
 *
 * In case the bar space does not cover the whole address space,
 * the bar base address should be set to allow access to a given address.
 * This function can be called also if the bar doesn't need to be set,
 * in that case it just won't change the base.
 */
static u64 hl_set_dram_bar(struct hl_device *hdev, u64 addr, struct pci_mem_region *region,
				u64 *new_bar_region_base)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	u64 bar_base_addr, old_base;

	if (is_power_of_2(prop->dram_pci_bar_size))
		bar_base_addr = addr & ~(prop->dram_pci_bar_size - 0x1ull);
	else
		bar_base_addr = DIV_ROUND_DOWN_ULL(addr, prop->dram_pci_bar_size) *
				prop->dram_pci_bar_size;

	old_base = hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr);

	/* in case of success we need to update the new BAR base */
	if ((old_base != U64_MAX) && new_bar_region_base)
		*new_bar_region_base = bar_base_addr;

	return old_base;
}

int hl_access_sram_dram_region(struct hl_device *hdev, u64 addr, u64 *val,
	enum debugfs_access_type acc_type, enum pci_region region_type, bool set_dram_bar)
{
	struct pci_mem_region *region = &hdev->pci_mem_region[region_type];
	u64 old_base = 0, rc, bar_region_base = region->region_base;
	void __iomem *acc_addr;

	if (set_dram_bar) {
		old_base = hl_set_dram_bar(hdev, addr, region, &bar_region_base);
		if (old_base == U64_MAX)
			return -EIO;
	}

	acc_addr = hdev->pcie_bar[region->bar_id] + region->offset_in_bar +
			(addr - bar_region_base);

	switch (acc_type) {
	case DEBUGFS_READ8:
		*val = readb(acc_addr);
		break;
	case DEBUGFS_WRITE8:
		writeb(*val, acc_addr);
		break;
	case DEBUGFS_READ32:
		*val = readl(acc_addr);
		break;
	case DEBUGFS_WRITE32:
		writel(*val, acc_addr);
		break;
	case DEBUGFS_READ64:
		*val = readq(acc_addr);
		break;
	case DEBUGFS_WRITE64:
		writeq(*val, acc_addr);
		break;
	}

	if (set_dram_bar) {
		rc = hl_set_dram_bar(hdev, old_base, region, NULL);
		if (rc == U64_MAX)
			return -EIO;
	}

	return 0;
}

static void *hl_dma_alloc_common(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle,
					gfp_t flag, enum dma_alloc_type alloc_type,
					const char *caller)
{
	void *ptr = NULL;

	switch (alloc_type) {
	case DMA_ALLOC_COHERENT:
		ptr = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, size, dma_handle, flag);
		break;
	case DMA_ALLOC_CPU_ACCESSIBLE:
		ptr = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, size, dma_handle);
		break;
	case DMA_ALLOC_POOL:
		ptr = hdev->asic_funcs->asic_dma_pool_zalloc(hdev, size, flag, dma_handle);
		break;
	}

	if (trace_habanalabs_dma_alloc_enabled() && !ZERO_OR_NULL_PTR(ptr))
		trace_habanalabs_dma_alloc(hdev->dev, (u64) (uintptr_t) ptr, *dma_handle, size,
						caller);

	return ptr;
}

static void hl_asic_dma_free_common(struct hl_device *hdev, size_t size, void *cpu_addr,
					dma_addr_t dma_handle, enum dma_alloc_type alloc_type,
					const char *caller)
{
	/* this is needed to avoid warning on using freed pointer */
	u64 store_cpu_addr = (u64) (uintptr_t) cpu_addr;

	switch (alloc_type) {
	case DMA_ALLOC_COHERENT:
		hdev->asic_funcs->asic_dma_free_coherent(hdev, size, cpu_addr, dma_handle);
		break;
	case DMA_ALLOC_CPU_ACCESSIBLE:
		hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, size, cpu_addr);
		break;
	case DMA_ALLOC_POOL:
		hdev->asic_funcs->asic_dma_pool_free(hdev, cpu_addr, dma_handle);
		break;
	}

	trace_habanalabs_dma_free(hdev->dev, store_cpu_addr, dma_handle, size, caller);
}

void *hl_asic_dma_alloc_coherent_caller(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle,
					gfp_t flag, const char *caller)
{
	return hl_dma_alloc_common(hdev, size, dma_handle, flag, DMA_ALLOC_COHERENT, caller);
}

void hl_asic_dma_free_coherent_caller(struct hl_device *hdev, size_t size, void *cpu_addr,
					dma_addr_t dma_handle, const char *caller)
{
	hl_asic_dma_free_common(hdev, size, cpu_addr, dma_handle, DMA_ALLOC_COHERENT, caller);
}

void *hl_cpu_accessible_dma_pool_alloc_caller(struct hl_device *hdev, size_t size,
						dma_addr_t *dma_handle, const char *caller)
{
	return hl_dma_alloc_common(hdev, size, dma_handle, 0, DMA_ALLOC_CPU_ACCESSIBLE, caller);
}

void hl_cpu_accessible_dma_pool_free_caller(struct hl_device *hdev, size_t size, void *vaddr,
						const char *caller)
{
	hl_asic_dma_free_common(hdev, size, vaddr, 0, DMA_ALLOC_CPU_ACCESSIBLE, caller);
}

void *hl_asic_dma_pool_zalloc_caller(struct hl_device *hdev, size_t size, gfp_t mem_flags,
					dma_addr_t *dma_handle, const char *caller)
{
	return hl_dma_alloc_common(hdev, size, dma_handle, mem_flags, DMA_ALLOC_POOL, caller);
}

void hl_asic_dma_pool_free_caller(struct hl_device *hdev, void *vaddr, dma_addr_t dma_addr,
					const char *caller)
{
	hl_asic_dma_free_common(hdev, 0, vaddr, dma_addr, DMA_ALLOC_POOL, caller);
}

int hl_dma_map_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	struct scatterlist *sg;
	int rc, i;

	rc = dma_map_sgtable(&hdev->pdev->dev, sgt, dir, 0);
	if (rc)
		return rc;

	/* Shift to the device's base physical address of host memory if necessary */
	if (prop->device_dma_offset_for_host_access)
		for_each_sgtable_dma_sg(sgt, sg, i)
			sg->dma_address += prop->device_dma_offset_for_host_access;

	return 0;
}

void hl_dma_unmap_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	struct scatterlist *sg;
	int i;

	/* Cancel the device's base physical address of host memory if necessary */
	if (prop->device_dma_offset_for_host_access)
		for_each_sgtable_dma_sg(sgt, sg, i)
			sg->dma_address -= prop->device_dma_offset_for_host_access;

	dma_unmap_sgtable(&hdev->pdev->dev, sgt, dir, 0);
}

/*
 * hl_access_cfg_region - access the config region
 *
 * @hdev: pointer to habanalabs device structure
 * @addr: the address to access
 * @val: the value to write, or where the read value is stored
 * @acc_type: the type of access (read/write 64/32)
 */
int hl_access_cfg_region(struct hl_device *hdev, u64 addr, u64 *val,
	enum debugfs_access_type acc_type)
{
	struct pci_mem_region *cfg_region = &hdev->pci_mem_region[PCI_REGION_CFG];
	u32 val_h, val_l;

	if (!IS_ALIGNED(addr, sizeof(u32))) {
		dev_err(hdev->dev, "address %#llx not a multiple of %zu\n", addr, sizeof(u32));
		return -EINVAL;
	}

	switch (acc_type) {
	case DEBUGFS_READ32:
		*val = RREG32(addr - cfg_region->region_base);
		break;
	case DEBUGFS_WRITE32:
		WREG32(addr - cfg_region->region_base, *val);
		break;
	case DEBUGFS_READ64:
		val_l = RREG32(addr - cfg_region->region_base);
		val_h = RREG32(addr + sizeof(u32) - cfg_region->region_base);

		*val = (((u64) val_h) << 32) | val_l;
		break;
	case DEBUGFS_WRITE64:
		WREG32(addr - cfg_region->region_base, lower_32_bits(*val));
		WREG32(addr + sizeof(u32) - cfg_region->region_base, upper_32_bits(*val));
		break;
	default:
		dev_err(hdev->dev, "access type %d is not supported\n", acc_type);
		return -EOPNOTSUPP;
	}

	return 0;
}

/*
 * hl_access_dev_mem - access device memory
 *
 * @hdev: pointer to habanalabs device structure
 * @region_type: the type of the region the address belongs to
 * @addr: the address to access
 * @val: the value to write, or where the read value is stored
 * @acc_type: the type of access (r/w, 32/64)
 */
int hl_access_dev_mem(struct hl_device *hdev, enum pci_region region_type,
			u64 addr, u64 *val, enum debugfs_access_type acc_type)
{
	switch (region_type) {
	case PCI_REGION_CFG:
		return hl_access_cfg_region(hdev, addr, val, acc_type);
	case PCI_REGION_SRAM:
	case PCI_REGION_DRAM:
		return hl_access_sram_dram_region(hdev, addr, val, acc_type,
				region_type, (region_type == PCI_REGION_DRAM));
	default:
		return -EFAULT;
	}

	return 0;
}

void hl_engine_data_sprintf(struct engines_data *e, const char *fmt, ...)
{
	va_list args;
	int str_size;

	va_start(args, fmt);
	/* Calculate formatted string length.
	 * Assuming each string is null terminated, hence increment result by 1
	 */
	str_size = vsnprintf(NULL, 0, fmt, args) + 1;
	va_end(args);

	if ((e->actual_size + str_size) < e->allocated_buf_size) {
		va_start(args, fmt);
		vsnprintf(e->buf + e->actual_size, str_size, fmt, args);
		va_end(args);
	}

	/* Need to update the size even when not updating destination buffer to get the exact size
	 * of all input strings
	 */
	e->actual_size += str_size;
}

enum hl_device_status hl_device_status(struct hl_device *hdev)
{
	enum hl_device_status status;

	if (hdev->reset_info.in_reset) {
		if (hdev->reset_info.in_compute_reset)
			status = HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE;
		else
			status = HL_DEVICE_STATUS_IN_RESET;
	} else if (hdev->reset_info.needs_reset) {
		status = HL_DEVICE_STATUS_NEEDS_RESET;
	} else if (hdev->disabled) {
		status = HL_DEVICE_STATUS_MALFUNCTION;
	} else if (!hdev->init_done) {
		status = HL_DEVICE_STATUS_IN_DEVICE_CREATION;
	} else {
		status = HL_DEVICE_STATUS_OPERATIONAL;
	}

	return status;
}

bool hl_device_operational(struct hl_device *hdev,
		enum hl_device_status *status)
{
	enum hl_device_status current_status;

	current_status = hl_device_status(hdev);
	if (status)
		*status = current_status;

	switch (current_status) {
	case HL_DEVICE_STATUS_IN_RESET:
	case HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE:
	case HL_DEVICE_STATUS_MALFUNCTION:
	case HL_DEVICE_STATUS_NEEDS_RESET:
		return false;
	case HL_DEVICE_STATUS_OPERATIONAL:
	case HL_DEVICE_STATUS_IN_DEVICE_CREATION:
	default:
		return true;
	}
}

bool hl_ctrl_device_operational(struct hl_device *hdev,
		enum hl_device_status *status)
{
	enum hl_device_status current_status;

	current_status = hl_device_status(hdev);
	if (status)
		*status = current_status;

	switch (current_status) {
	case HL_DEVICE_STATUS_MALFUNCTION:
		return false;
	case HL_DEVICE_STATUS_IN_RESET:
	case HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE:
	case HL_DEVICE_STATUS_NEEDS_RESET:
	case HL_DEVICE_STATUS_OPERATIONAL:
	case HL_DEVICE_STATUS_IN_DEVICE_CREATION:
	default:
		return true;
	}
}

static void print_idle_status_mask(struct hl_device *hdev, const char *message,
					u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE])
{
	u32 pad_width[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {};

	BUILD_BUG_ON(HL_BUSY_ENGINES_MASK_EXT_SIZE != 4);

	pad_width[3] = idle_mask[3] ? 16 : 0;
	pad_width[2] = idle_mask[2] || pad_width[3] ? 16 : 0;
	pad_width[1] = idle_mask[1] || pad_width[2] ? 16 : 0;
	pad_width[0] = idle_mask[0] || pad_width[1] ? 16 : 0;
	dev_err(hdev->dev, "%s (mask %0*llx_%0*llx_%0*llx_%0*llx)\n",
		message, pad_width[3], idle_mask[3], pad_width[2], idle_mask[2],
		pad_width[1], idle_mask[1], pad_width[0], idle_mask[0]);
}

static void hpriv_release(struct kref *ref)
{
	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
	bool reset_device, device_is_idle = true;
	struct hl_fpriv *hpriv;
	struct hl_device *hdev;

	hpriv = container_of(ref, struct hl_fpriv, refcount);

	hdev = hpriv->hdev;

	hdev->asic_funcs->send_device_activity(hdev, false);

	put_pid(hpriv->taskpid);

	hl_debugfs_remove_file(hpriv);

	mutex_destroy(&hpriv->ctx_lock);
	mutex_destroy(&hpriv->restore_phase_mutex);

	/* Device should be reset if reset-upon-device-release is enabled, or if there is a pending
	 * reset that waits for device release.
	 */
	reset_device = hdev->reset_upon_device_release || hdev->reset_info.watchdog_active;

	/* Check the device idle status and reset if not idle.
	 * Skip it if already in reset, or if device is going to be reset in any case.
	 */
	if (!hdev->reset_info.in_reset && !reset_device && hdev->pdev && !hdev->pldm)
		device_is_idle = hdev->asic_funcs->is_device_idle(hdev, idle_mask,
							HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL);
	if (!device_is_idle) {
		print_idle_status_mask(hdev, "device is not idle after user context is closed",
					idle_mask);
		reset_device = true;
	}

	/* We need to remove the user from the list to make sure the reset process won't
	 * try to kill the user process. Because, if we got here, it means there are no
	 * more driver/device resources that the user process is occupying so there is
	 * no need to kill it
	 *
	 * However, we can't set the compute_ctx to NULL at this stage. This is to prevent
	 * a race between the release and opening the device again. We don't want to let
	 * a user open the device while a reset is about to happen.
	 */
	mutex_lock(&hdev->fpriv_list_lock);
	list_del(&hpriv->dev_node);
	mutex_unlock(&hdev->fpriv_list_lock);

	if (reset_device) {
		hl_device_reset(hdev, HL_DRV_RESET_DEV_RELEASE);
	} else {
		/* Scrubbing is handled within hl_device_reset(), so here we need to do it directly */
		int rc = hdev->asic_funcs->scrub_device_mem(hdev);

		if (rc)
			dev_err(hdev->dev, "failed to scrub memory from hpriv release (%d)\n", rc);
	}

	/* Now we can mark the compute_ctx as not active. Even if a reset is running in a different
	 * thread, we don't care because in_reset is marked, so if a user tries to open
	 * the device it will fail on that, even if compute_ctx is false.
	 */
	mutex_lock(&hdev->fpriv_list_lock);
	hdev->is_compute_ctx_active = false;
	mutex_unlock(&hdev->fpriv_list_lock);

	hdev->compute_ctx_in_release = 0;

	/* release the eventfd */
	if (hpriv->notifier_event.eventfd)
		eventfd_ctx_put(hpriv->notifier_event.eventfd);

	mutex_destroy(&hpriv->notifier_event.lock);

	kfree(hpriv);
}

void hl_hpriv_get(struct hl_fpriv *hpriv)
{
	kref_get(&hpriv->refcount);
}

int hl_hpriv_put(struct hl_fpriv *hpriv)
{
	return kref_put(&hpriv->refcount, hpriv_release);
}

/*
 * hl_device_release - release function for habanalabs device
 *
 * @inode: pointer to inode structure
 * @filp: pointer to file structure
 *
 * Called when process closes a habanalabs device
 */
static int hl_device_release(struct inode *inode, struct file *filp)
{
	struct hl_fpriv *hpriv = filp->private_data;
	struct hl_device *hdev = hpriv->hdev;

	filp->private_data = NULL;

	if (!hdev) {
		pr_crit("Closing FD after device was removed. Memory leak will occur and it is advised to reboot.\n");
		put_pid(hpriv->taskpid);
		return 0;
	}

	hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr);
	hl_mem_mgr_fini(&hpriv->mem_mgr);

	hdev->compute_ctx_in_release = 1;

	if (!hl_hpriv_put(hpriv)) {
		dev_notice(hdev->dev, "User process closed FD but device still in use\n");
		hl_device_reset(hdev, HL_DRV_RESET_HARD);
	}

	hdev->last_open_session_duration_jif =
		jiffies - hdev->last_successful_open_jif;

	return 0;
}

static int hl_device_release_ctrl(struct inode *inode, struct file *filp)
{
	struct hl_fpriv *hpriv = filp->private_data;
	struct hl_device *hdev = hpriv->hdev;

	filp->private_data = NULL;

	if (!hdev) {
		pr_err("Closing FD after device was removed\n");
		goto out;
	}

	mutex_lock(&hdev->fpriv_ctrl_list_lock);
	list_del(&hpriv->dev_node);
	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
out:
	/* release the eventfd */
	if (hpriv->notifier_event.eventfd)
		eventfd_ctx_put(hpriv->notifier_event.eventfd);

	mutex_destroy(&hpriv->notifier_event.lock);
	put_pid(hpriv->taskpid);

	kfree(hpriv);

	return 0;
}

/*
 * hl_mmap - mmap function for habanalabs device
 *
 * @*filp: pointer to file structure
 * @*vma: pointer to vm_area_struct of the process
 *
 * Called when process does an mmap on habanalabs device. Call the relevant mmap
 * function at the end of the common code.
 */
static int hl_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct hl_fpriv *hpriv = filp->private_data;
	struct hl_device *hdev = hpriv->hdev;
	unsigned long vm_pgoff;

	if (!hdev) {
		pr_err_ratelimited("Trying to mmap after device was removed! Please close FD\n");
		return -ENODEV;
	}

	vm_pgoff = vma->vm_pgoff;

	switch (vm_pgoff & HL_MMAP_TYPE_MASK) {
	case HL_MMAP_TYPE_BLOCK:
		vma->vm_pgoff = HL_MMAP_OFFSET_VALUE_GET(vm_pgoff);
		return hl_hw_block_mmap(hpriv, vma);

	case HL_MMAP_TYPE_CB:
	case HL_MMAP_TYPE_TS_BUFF:
		return hl_mem_mgr_mmap(&hpriv->mem_mgr, vma, NULL);
	}
	return -EINVAL;
}

static const struct file_operations hl_ops = {
	.owner = THIS_MODULE,
	.open = hl_device_open,
	.release = hl_device_release,
	.mmap = hl_mmap,
	.unlocked_ioctl = hl_ioctl,
	.compat_ioctl = hl_ioctl
};

static const struct file_operations hl_ctrl_ops = {
	.owner = THIS_MODULE,
	.open = hl_device_open_ctrl,
	.release = hl_device_release_ctrl,
	.unlocked_ioctl = hl_ioctl_control,
	.compat_ioctl = hl_ioctl_control
};

static void device_release_func(struct device *dev)
{
	kfree(dev);
}

/*
 * device_init_cdev - Initialize cdev and device for habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 * @hclass: pointer to the class object of the device
 * @minor: minor number of the specific device
 * @fops: file operations to install for this device
 * @name: name of the device as it will appear in the filesystem
 * @cdev: pointer to the char device object that will be initialized
 * @dev: pointer to the device object that will be initialized
 *
 * Initialize a cdev and a Linux device for habanalabs's device.
 */
static int device_init_cdev(struct hl_device *hdev, struct class *hclass,
				int minor, const struct file_operations *fops,
				char *name, struct cdev *cdev,
				struct device **dev)
{
	cdev_init(cdev, fops);
	cdev->owner = THIS_MODULE;

	*dev = kzalloc(sizeof(**dev), GFP_KERNEL);
	if (!*dev)
		return -ENOMEM;

	device_initialize(*dev);
	(*dev)->devt = MKDEV(hdev->major, minor);
	(*dev)->class = hclass;
	(*dev)->release = device_release_func;
	dev_set_drvdata(*dev, hdev);
	dev_set_name(*dev, "%s", name);

	return 0;
}

static int device_cdev_sysfs_add(struct hl_device *hdev)
{
	int rc;

	rc = cdev_device_add(&hdev->cdev, hdev->dev);
	if (rc) {
		dev_err(hdev->dev,
			"failed to add a char device to the system\n");
		return rc;
	}

	rc = cdev_device_add(&hdev->cdev_ctrl, hdev->dev_ctrl);
	if (rc) {
		dev_err(hdev->dev,
			"failed to add a control char device to the system\n");
		goto delete_cdev_device;
	}

	/* hl_sysfs_init() must be done after adding the device to the system */
	rc = hl_sysfs_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize sysfs\n");
		goto delete_ctrl_cdev_device;
	}

	hdev->cdev_sysfs_created = true;

	return 0;

delete_ctrl_cdev_device:
	cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
delete_cdev_device:
	cdev_device_del(&hdev->cdev, hdev->dev);
	return rc;
}

static void device_cdev_sysfs_del(struct hl_device *hdev)
{
	if (!hdev->cdev_sysfs_created)
		goto put_devices;

	hl_sysfs_fini(hdev);
	cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
	cdev_device_del(&hdev->cdev, hdev->dev);

put_devices:
	put_device(hdev->dev);
	put_device(hdev->dev_ctrl);
}

static void device_hard_reset_pending(struct work_struct *work)
{
	struct hl_device_reset_work *device_reset_work =
		container_of(work, struct hl_device_reset_work, reset_work.work);
	struct hl_device *hdev = device_reset_work->hdev;
	u32 flags;
	int rc;

	flags = device_reset_work->flags | HL_DRV_RESET_FROM_RESET_THR;

	rc = hl_device_reset(hdev, flags);

	if ((rc == -EBUSY) && !hdev->device_fini_pending) {
		struct hl_ctx *ctx = hl_get_compute_ctx(hdev);

		if (ctx) {
			/* The read refcount value should be subtracted by one, because the read is
			 * protected with hl_get_compute_ctx().
			 */
			dev_info(hdev->dev,
				"Could not reset device (compute_ctx refcount %u). will try again in %u seconds",
				kref_read(&ctx->refcount) - 1, HL_PENDING_RESET_PER_SEC);
			hl_ctx_put(ctx);
		} else {
			dev_info(hdev->dev, "Could not reset device. will try again in %u seconds",
				HL_PENDING_RESET_PER_SEC);
		}

		queue_delayed_work(hdev->reset_wq, &device_reset_work->reset_work,
					msecs_to_jiffies(HL_PENDING_RESET_PER_SEC * 1000));
	}
}

static void device_release_watchdog_func(struct work_struct *work)
{
	struct hl_device_reset_work *device_release_watchdog_work =
		container_of(work, struct hl_device_reset_work, reset_work.work);
	struct hl_device *hdev = device_release_watchdog_work->hdev;
	u32 flags;

	dev_dbg(hdev->dev, "Device wasn't released in time. Initiate device reset.\n");

	flags = device_release_watchdog_work->flags | HL_DRV_RESET_FROM_WD_THR;

	hl_device_reset(hdev, flags);
}

/*
 * device_early_init - do some early initialization for the habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Install the relevant function pointers and call the early_init function,
 * if such a function exists
 */
static int device_early_init(struct hl_device *hdev)
{
	int i, rc;
	char workq_name[32];

	switch (hdev->asic_type) {
	case ASIC_GOYA:
		goya_set_asic_funcs(hdev);
		strscpy(hdev->asic_name, "GOYA", sizeof(hdev->asic_name));
		break;
	case ASIC_GAUDI:
		gaudi_set_asic_funcs(hdev);
		strscpy(hdev->asic_name, "GAUDI", sizeof(hdev->asic_name));
		break;
	case ASIC_GAUDI_SEC:
		gaudi_set_asic_funcs(hdev);
		strscpy(hdev->asic_name, "GAUDI SEC", sizeof(hdev->asic_name));
		break;
	case ASIC_GAUDI2:
		gaudi2_set_asic_funcs(hdev);
		strscpy(hdev->asic_name, "GAUDI2", sizeof(hdev->asic_name));
		break;
	case ASIC_GAUDI2B:
		gaudi2_set_asic_funcs(hdev);
		strscpy(hdev->asic_name, "GAUDI2B", sizeof(hdev->asic_name));
		break;
	default:
		dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
			hdev->asic_type);
		return -EINVAL;
	}

	rc = hdev->asic_funcs->early_init(hdev);
	if (rc)
		return rc;

	rc = hl_asid_init(hdev);
	if (rc)
		goto early_fini;

	if (hdev->asic_prop.completion_queues_count) {
		hdev->cq_wq = kcalloc(hdev->asic_prop.completion_queues_count,
					sizeof(struct workqueue_struct *),
					GFP_KERNEL);
		if (!hdev->cq_wq) {
			rc = -ENOMEM;
			goto asid_fini;
		}
	}

	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) {
		snprintf(workq_name, 32, "hl-free-jobs-%u", (u32) i);
		hdev->cq_wq[i] = create_singlethread_workqueue(workq_name);
		if (hdev->cq_wq[i] == NULL) {
			dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
			rc = -ENOMEM;
			goto free_cq_wq;
		}
	}

	hdev->eq_wq = create_singlethread_workqueue("hl-events");
	if (hdev->eq_wq == NULL) {
		dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
		rc = -ENOMEM;
		goto free_cq_wq;
	}

	hdev->cs_cmplt_wq = alloc_workqueue("hl-cs-completions", WQ_UNBOUND, 0);
	if (!hdev->cs_cmplt_wq) {
		dev_err(hdev->dev,
			"Failed to allocate CS completions workqueue\n");
		rc = -ENOMEM;
		goto free_eq_wq;
	}

	hdev->ts_free_obj_wq = alloc_workqueue("hl-ts-free-obj", WQ_UNBOUND, 0);
	if (!hdev->ts_free_obj_wq) {
		dev_err(hdev->dev,
			"Failed to allocate Timestamp registration free workqueue\n");
		rc = -ENOMEM;
		goto free_cs_cmplt_wq;
	}

	hdev->prefetch_wq = alloc_workqueue("hl-prefetch", WQ_UNBOUND, 0);
	if (!hdev->prefetch_wq) {
		dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n");
		rc = -ENOMEM;
		goto free_ts_free_wq;
	}

	hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info),
					GFP_KERNEL);
	if (!hdev->hl_chip_info) {
		rc = -ENOMEM;
		goto free_prefetch_wq;
	}

	rc = hl_mmu_if_set_funcs(hdev);
	if (rc)
		goto free_chip_info;

	hl_mem_mgr_init(hdev->dev, &hdev->kernel_mem_mgr);

	hdev->reset_wq = create_singlethread_workqueue("hl_device_reset");
	if (!hdev->reset_wq) {
		rc = -ENOMEM;
		dev_err(hdev->dev, "Failed to create device reset WQ\n");
		goto free_cb_mgr;
	}

	INIT_DELAYED_WORK(&hdev->device_reset_work.reset_work, device_hard_reset_pending);
	hdev->device_reset_work.hdev = hdev;
	hdev->device_fini_pending = 0;

	INIT_DELAYED_WORK(&hdev->device_release_watchdog_work.reset_work,
				device_release_watchdog_func);
	hdev->device_release_watchdog_work.hdev = hdev;

	mutex_init(&hdev->send_cpu_message_lock);
	mutex_init(&hdev->debug_lock);
	INIT_LIST_HEAD(&hdev->cs_mirror_list);
	spin_lock_init(&hdev->cs_mirror_lock);
	spin_lock_init(&hdev->reset_info.lock);
	INIT_LIST_HEAD(&hdev->fpriv_list);
	INIT_LIST_HEAD(&hdev->fpriv_ctrl_list);
	mutex_init(&hdev->fpriv_list_lock);
	mutex_init(&hdev->fpriv_ctrl_list_lock);
	mutex_init(&hdev->clk_throttling.lock);

	return 0;

free_cb_mgr:
	hl_mem_mgr_fini(&hdev->kernel_mem_mgr);
free_chip_info:
	kfree(hdev->hl_chip_info);
free_prefetch_wq:
	destroy_workqueue(hdev->prefetch_wq);
free_ts_free_wq:
	destroy_workqueue(hdev->ts_free_obj_wq);
free_cs_cmplt_wq:
	destroy_workqueue(hdev->cs_cmplt_wq);
free_eq_wq:
	destroy_workqueue(hdev->eq_wq);
free_cq_wq:
	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		if (hdev->cq_wq[i])
			destroy_workqueue(hdev->cq_wq[i]);
	kfree(hdev->cq_wq);
asid_fini:
	hl_asid_fini(hdev);
early_fini:
	if (hdev->asic_funcs->early_fini)
		hdev->asic_funcs->early_fini(hdev);

	return rc;
}

/*
 * device_early_fini - finalize all that was done in device_early_init
 *
 * @hdev: pointer to habanalabs device structure
 *
 */
static void device_early_fini(struct hl_device *hdev)
{
	int i;

	mutex_destroy(&hdev->debug_lock);
	mutex_destroy(&hdev->send_cpu_message_lock);

	mutex_destroy(&hdev->fpriv_list_lock);
	mutex_destroy(&hdev->fpriv_ctrl_list_lock);

	mutex_destroy(&hdev->clk_throttling.lock);

	hl_mem_mgr_fini(&hdev->kernel_mem_mgr);

	kfree(hdev->hl_chip_info);

	destroy_workqueue(hdev->prefetch_wq);
	destroy_workqueue(hdev->ts_free_obj_wq);
	destroy_workqueue(hdev->cs_cmplt_wq);
	destroy_workqueue(hdev->eq_wq);
	destroy_workqueue(hdev->reset_wq);

	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		destroy_workqueue(hdev->cq_wq[i]);
	kfree(hdev->cq_wq);
	hl_asid_fini(hdev);

	if (hdev->asic_funcs->early_fini)
		hdev->asic_funcs->early_fini(hdev);
}

static void hl_device_heartbeat(struct work_struct *work)
{
	struct hl_device *hdev = container_of(work, struct hl_device,
						work_heartbeat.work);

	if (!hl_device_operational(hdev, NULL))
		goto reschedule;

	if (!hdev->asic_funcs->send_heartbeat(hdev))
		goto reschedule;

	if (hl_device_operational(hdev, NULL))
		dev_err(hdev->dev, "Device heartbeat failed!\n");

	hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT);

	return;

reschedule:
	/*
	 * prev_reset_trigger tracks consecutive fatal h/w errors until first
	 * heartbeat immediately post reset.
	 * If control reached here, then at least one heartbeat work has been
	 * scheduled since last reset/init cycle.
	 * So if the device is not already in reset cycle, reset the flag
	 * prev_reset_trigger as no reset occurred with HL_DRV_RESET_FW_FATAL_ERR
	 * status for at least one heartbeat. From this point driver restarts
	 * tracking future consecutive fatal errors.
	 */
	if (!hdev->reset_info.in_reset)
		hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;

	schedule_delayed_work(&hdev->work_heartbeat,
			usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
}

/*
 * device_late_init - do late initialization for the habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Do stuff that either needs the device H/W queues to be active or needs
 * to happen after all the rest of the initialization is finished
 */
static int device_late_init(struct hl_device *hdev)
{
	int rc;

	if (hdev->asic_funcs->late_init) {
		rc = hdev->asic_funcs->late_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"failed late initialization for the H/W\n");
			return rc;
		}
	}

	hdev->high_pll = hdev->asic_prop.high_pll;

	if (hdev->heartbeat) {
		INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat);
		schedule_delayed_work(&hdev->work_heartbeat,
				usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
	}

	hdev->late_init_done = true;

	return 0;
}

/*
 * device_late_fini - finalize all that was done in device_late_init
 *
 * @hdev: pointer to habanalabs device structure
 *
 */
static void device_late_fini(struct hl_device *hdev)
{
	if (!hdev->late_init_done)
		return;

	if (hdev->heartbeat)
		cancel_delayed_work_sync(&hdev->work_heartbeat);

	if (hdev->asic_funcs->late_fini)
		hdev->asic_funcs->late_fini(hdev);

	hdev->late_init_done = false;
}

int hl_device_utilization(struct hl_device *hdev, u32 *utilization)
{
	u64 max_power, curr_power, dc_power, dividend, divisor;
	int rc;

	max_power = hdev->max_power;
	dc_power = hdev->asic_prop.dc_power_default;
	divisor = max_power - dc_power;
	if (!divisor) {
		dev_warn(hdev->dev, "device utilization is not supported\n");
		return -EOPNOTSUPP;
	}
	rc = hl_fw_cpucp_power_get(hdev, &curr_power);

	if (rc)
		return rc;

	curr_power = clamp(curr_power, dc_power, max_power);

	dividend = (curr_power - dc_power) * 100;
	*utilization = (u32) div_u64(dividend, divisor);

	return 0;
}
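
/* Illustrative example for hl_device_utilization() above (the numbers are made
 * up and do not come from any ASIC's properties): with dc_power_default = 100 W,
 * max_power = 300 W and a current power reading of 150 W, the divisor is 200,
 * the dividend is (150 - 100) * 100 = 5000, and the reported utilization is
 * 5000 / 200 = 25 percent.
 */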

int hl_device_set_debug_mode(struct hl_device *hdev, struct hl_ctx *ctx, bool enable)
{
	int rc = 0;

	mutex_lock(&hdev->debug_lock);

	if (!enable) {
		if (!hdev->in_debug) {
			dev_err(hdev->dev,
				"Failed to disable debug mode because device was not in debug mode\n");
			rc = -EFAULT;
			goto out;
		}

		if (!hdev->reset_info.hard_reset_pending)
			hdev->asic_funcs->halt_coresight(hdev, ctx);

		hdev->in_debug = 0;

		goto out;
	}

	if (hdev->in_debug) {
		dev_err(hdev->dev,
			"Failed to enable debug mode because device is already in debug mode\n");
		rc = -EFAULT;
		goto out;
	}

	hdev->in_debug = 1;

out:
	mutex_unlock(&hdev->debug_lock);

	return rc;
}

static void take_release_locks(struct hl_device *hdev)
{
	/* Flush anyone that is inside the critical section of enqueue
	 * jobs to the H/W
	 */
	hdev->asic_funcs->hw_queues_lock(hdev);
	hdev->asic_funcs->hw_queues_unlock(hdev);

	/* Flush processes that are sending message to CPU */
	mutex_lock(&hdev->send_cpu_message_lock);
	mutex_unlock(&hdev->send_cpu_message_lock);

	/* Flush anyone that is inside device open */
	mutex_lock(&hdev->fpriv_list_lock);
	mutex_unlock(&hdev->fpriv_list_lock);
	mutex_lock(&hdev->fpriv_ctrl_list_lock);
	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
}

static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset,
				bool skip_wq_flush)
{
	if (hard_reset)
		device_late_fini(hdev);

	/*
	 * Halt the engines and disable interrupts so we won't get any more
	 * completions from H/W and we won't have any accesses from the
	 * H/W to the host machine
	 */
	hdev->asic_funcs->halt_engines(hdev, hard_reset, fw_reset);

	/* Go over all the queues, release all CS and their jobs */
	hl_cs_rollback_all(hdev, skip_wq_flush);

	/* flush the MMU prefetch workqueue */
	flush_workqueue(hdev->prefetch_wq);

	/* Release all pending user interrupts, each pending user interrupt
	 * holds a reference to user context
	 */
	hl_release_pending_user_interrupts(hdev);
}

/*
 * hl_device_suspend - initiate device suspend
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Puts the hw in the suspend state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver suspend.
 */
int hl_device_suspend(struct hl_device *hdev)
{
	int rc;

	pci_save_state(hdev->pdev);

	/* Block future CS/VM/JOB completion operations */
	spin_lock(&hdev->reset_info.lock);
	if (hdev->reset_info.in_reset) {
		spin_unlock(&hdev->reset_info.lock);
		dev_err(hdev->dev, "Can't suspend while in reset\n");
		return -EIO;
	}
	hdev->reset_info.in_reset = 1;
	spin_unlock(&hdev->reset_info.lock);

	/* This blocks all other stuff that is not blocked by in_reset */
	hdev->disabled = true;

	take_release_locks(hdev);

	rc = hdev->asic_funcs->suspend(hdev);
	if (rc)
		dev_err(hdev->dev,
			"Failed to disable PCI access of device CPU\n");

	/* Shut down the device */
	pci_disable_device(hdev->pdev);
	pci_set_power_state(hdev->pdev, PCI_D3hot);

	return 0;
}

/*
 * hl_device_resume - initiate device resume
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Bring the hw back to operating state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver resume.
 */
int hl_device_resume(struct hl_device *hdev)
{
	int rc;

	pci_set_power_state(hdev->pdev, PCI_D0);
	pci_restore_state(hdev->pdev);
	rc = pci_enable_device_mem(hdev->pdev);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to enable PCI device in resume\n");
		return rc;
	}

	pci_set_master(hdev->pdev);

	rc = hdev->asic_funcs->resume(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to resume device after suspend\n");
		goto disable_device;
	}

	/* 'in_reset' was set to true during suspend, now we must clear it in order
	 * for hard reset to be performed
	 */
	spin_lock(&hdev->reset_info.lock);
	hdev->reset_info.in_reset = 0;
	spin_unlock(&hdev->reset_info.lock);

	rc = hl_device_reset(hdev, HL_DRV_RESET_HARD);
	if (rc) {
		dev_err(hdev->dev, "Failed to reset device during resume\n");
		goto disable_device;
	}

	return 0;

disable_device:
	pci_clear_master(hdev->pdev);
	pci_disable_device(hdev->pdev);

	return rc;
}

static int device_kill_open_processes(struct hl_device *hdev, u32 timeout, bool control_dev)
{
	struct task_struct *task = NULL;
	struct list_head *fd_list;
	struct hl_fpriv *hpriv;
	struct mutex *fd_lock;
	u32 pending_cnt;

	fd_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock;
	fd_list = control_dev ? &hdev->fpriv_ctrl_list : &hdev->fpriv_list;

	/* Giving time for user to close FD, and for processes that are inside
	 * hl_device_open to finish
	 */
	if (!list_empty(fd_list))
		ssleep(1);

	if (timeout) {
		pending_cnt = timeout;
	} else {
		if (hdev->process_kill_trial_cnt) {
			/* Processes have been already killed */
			pending_cnt = 1;
			goto wait_for_processes;
		} else {
			/* Wait a small period after process kill */
			pending_cnt = HL_PENDING_RESET_PER_SEC;
		}
	}

	mutex_lock(fd_lock);

	/* This section must be protected because we are dereferencing
	 * pointers that are freed if the process exits
	 */
	list_for_each_entry(hpriv, fd_list, dev_node) {
		task = get_pid_task(hpriv->taskpid, PIDTYPE_PID);
		if (task) {
			dev_info(hdev->dev, "Killing user process pid=%d\n",
				task_pid_nr(task));
			send_sig(SIGKILL, task, 1);
			usleep_range(1000, 10000);

			put_task_struct(task);
		} else {
			/*
			 * If we got here, it means that process was killed from outside the driver
			 * right after it started looping on fd_list and before get_pid_task, thus
			 * we don't need to kill it.
			 */
			dev_dbg(hdev->dev,
				"Can't get task struct for user process, assuming process was killed from outside the driver\n");
		}
	}

	mutex_unlock(fd_lock);

	/*
	 * We killed the open users, but that doesn't mean they are closed.
	 * It could be that they are running a long cleanup phase in the driver
	 * e.g. MMU unmappings, or running other long teardown flow even before
	 * our cleanup.
	 * Therefore we need to wait again to make sure they are closed before
	 * continuing with the reset.
	 */

wait_for_processes:
	while ((!list_empty(fd_list)) && (pending_cnt)) {
		dev_dbg(hdev->dev,
			"Waiting for all unmap operations to finish before hard reset\n");

		pending_cnt--;

		ssleep(1);
	}

	/* All processes exited successfully */
	if (list_empty(fd_list))
		return 0;

	/* Give up waiting for processes to exit */
	if (hdev->process_kill_trial_cnt == HL_PENDING_RESET_MAX_TRIALS)
		return -ETIME;

	hdev->process_kill_trial_cnt++;

	return -EBUSY;
}

static void device_disable_open_processes(struct hl_device *hdev, bool control_dev)
{
	struct list_head *fd_list;
	struct hl_fpriv *hpriv;
	struct mutex *fd_lock;

	fd_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock;
	fd_list = control_dev ? &hdev->fpriv_ctrl_list : &hdev->fpriv_list;

	mutex_lock(fd_lock);
	list_for_each_entry(hpriv, fd_list, dev_node)
		hpriv->hdev = NULL;
	mutex_unlock(fd_lock);
}

static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
{
	u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT;

	/* No consecutive mechanism when user context exists */
	if (hdev->is_compute_ctx_active)
		return;

	/*
	 * 'reset cause' is being updated here, because getting here
	 * means that it's the 1st time and the last time we're here
	 * ('in_reset' makes sure of it). This makes sure that
	 * 'reset_cause' will continue holding its 1st recorded reason!
	 */
	if (flags & HL_DRV_RESET_HEARTBEAT) {
		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
		cur_reset_trigger = HL_DRV_RESET_HEARTBEAT;
	} else if (flags & HL_DRV_RESET_TDR) {
		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_TDR;
		cur_reset_trigger = HL_DRV_RESET_TDR;
	} else if (flags & HL_DRV_RESET_FW_FATAL_ERR) {
		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
		cur_reset_trigger = HL_DRV_RESET_FW_FATAL_ERR;
	} else {
		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
	}

	/*
	 * If the reset cause is the same twice, then reset_trigger_repeated
	 * is set and, if this reset is due to a fatal FW error,
	 * the device is set to an unstable state.
	 */
	if (hdev->reset_info.prev_reset_trigger != cur_reset_trigger) {
		hdev->reset_info.prev_reset_trigger = cur_reset_trigger;
		hdev->reset_info.reset_trigger_repeated = 0;
	} else {
		hdev->reset_info.reset_trigger_repeated = 1;
	}

	/* If the reset is due to heartbeat, the device CPU is not responsive,
	 * in which case there is no point sending it a PCI disable message.
	 *
	 * If F/W is performing the reset, no need to send it a message to disable
	 * PCI access
	 */
	if ((flags & HL_DRV_RESET_HARD) &&
			!(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
		/* Disable PCI access from the device F/W so it won't send
		 * us additional interrupts. We disable MSI/MSI-X at
		 * the halt_engines function and we can't have the F/W
		 * sending us interrupts after that. We need to disable
		 * the access here because if the device is marked
		 * disabled, the message won't be sent. Also, in case
		 * of heartbeat, the device CPU is marked as disabled
		 * so this message won't be sent
		 */
		if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0))
			dev_warn(hdev->dev,
				"Failed to disable PCI access by F/W\n");
	}
}

/*
 * hl_device_reset - reset the device
 *
 * @hdev: pointer to habanalabs device structure
 * @flags: reset flags.
 *
 * Block future CS and wait for pending CS to be enqueued
 * Call ASIC H/W fini
 * Flush all completions
 * Re-initialize all internal data structures
 * Call ASIC H/W init, late_init
 * Test queues
 * Enable device
 *
 * Returns 0 for success or an error on failure.
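 *
 * @flags is a bitmask of HL_DRV_RESET_* values (e.g. HL_DRV_RESET_HARD,
 * HL_DRV_RESET_DEV_RELEASE, HL_DRV_RESET_HEARTBEAT), as tested at the top of
 * the function below.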
 */
int hl_device_reset(struct hl_device *hdev, u32 flags)
{
	bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false,
			reset_upon_device_release = false, schedule_hard_reset = false, delay_reset,
			from_dev_release, from_watchdog_thread;
	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
	struct hl_ctx *ctx;
	int i, rc;

	if (!hdev->init_done) {
		dev_err(hdev->dev, "Can't reset before initialization is done\n");
		return 0;
	}

	hard_reset = !!(flags & HL_DRV_RESET_HARD);
	from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR);
	fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW);
	from_dev_release = !!(flags & HL_DRV_RESET_DEV_RELEASE);
	delay_reset = !!(flags & HL_DRV_RESET_DELAY);
	from_watchdog_thread = !!(flags & HL_DRV_RESET_FROM_WD_THR);

	if (!hard_reset && !hdev->asic_prop.supports_compute_reset) {
		hard_instead_soft = true;
		hard_reset = true;
	}

	if (hdev->reset_upon_device_release && (flags & HL_DRV_RESET_DEV_RELEASE)) {
		if (hard_reset) {
			dev_crit(hdev->dev,
				"Aborting reset because hard-reset is mutually exclusive with reset-on-device-release\n");
			return -EINVAL;
		}

		reset_upon_device_release = true;

		goto do_reset;
	}

	if (!hard_reset && !hdev->asic_prop.allow_inference_soft_reset) {
		hard_instead_soft = true;
		hard_reset = true;
	}

	if (hard_instead_soft)
		dev_dbg(hdev->dev, "Doing hard-reset instead of compute reset\n");

do_reset:
	/* Re-entry of reset thread */
	if (from_hard_reset_thread && hdev->process_kill_trial_cnt)
		goto kill_processes;

	/*
	 * Prevent concurrency in this function - only one reset should be
	 * done at any given time. Only need to perform this if we didn't
	 * get here from the dedicated hard reset thread.
	 */
	if (!from_hard_reset_thread) {
		/* Block future CS/VM/JOB completion operations */
		spin_lock(&hdev->reset_info.lock);
		if (hdev->reset_info.in_reset) {
			/* We only allow scheduling of a hard reset during compute reset */
			if (hard_reset && hdev->reset_info.in_compute_reset)
				hdev->reset_info.hard_reset_schedule_flags = flags;
			spin_unlock(&hdev->reset_info.lock);
			return 0;
		}

		/* This still allows the completion of some KDMA ops
		 * Update this before in_reset because in_compute_reset implies we are in reset
		 */
		hdev->reset_info.in_compute_reset = !hard_reset;

		hdev->reset_info.in_reset = 1;

		spin_unlock(&hdev->reset_info.lock);

		/* Cancel the device release watchdog work if required.
		 * In case of reset-upon-device-release while the release watchdog work is
		 * scheduled, do hard-reset instead of compute-reset.
		 */
		if ((hard_reset || from_dev_release) && hdev->reset_info.watchdog_active) {
			hdev->reset_info.watchdog_active = 0;
			if (!from_watchdog_thread)
				cancel_delayed_work_sync(
						&hdev->device_release_watchdog_work.reset_work);

			if (from_dev_release) {
				flags |= HL_DRV_RESET_HARD;
				flags &= ~HL_DRV_RESET_DEV_RELEASE;
				hard_reset = true;
			}
		}

		if (delay_reset)
			usleep_range(HL_RESET_DELAY_USEC, HL_RESET_DELAY_USEC << 1);

		handle_reset_trigger(hdev, flags);

		/* This also blocks future CS/VM/JOB completion operations */
		hdev->disabled = true;

		take_release_locks(hdev);

		if (hard_reset)
			dev_info(hdev->dev, "Going to reset device\n");
		else if (reset_upon_device_release)
			dev_dbg(hdev->dev, "Going to reset device after release by user\n");
		else
			dev_dbg(hdev->dev, "Going to reset engines of inference device\n");
	}

again:
	if ((hard_reset) && (!from_hard_reset_thread)) {
		hdev->reset_info.hard_reset_pending = true;

		hdev->process_kill_trial_cnt = 0;

		hdev->device_reset_work.flags = flags;

		/*
		 * Because the reset function can't run from heartbeat work,
		 * we need to call the reset function from a dedicated work.
		 */
		queue_delayed_work(hdev->reset_wq, &hdev->device_reset_work.reset_work, 0);

		return 0;
	}

	cleanup_resources(hdev, hard_reset, fw_reset, from_dev_release);

kill_processes:
	if (hard_reset) {
		/* Kill processes here after CS rollback. This is because the
		 * process can't really exit until all its CSs are done, which
		 * is what we do in cs rollback
		 */
		rc = device_kill_open_processes(hdev, 0, false);

		if (rc == -EBUSY) {
			if (hdev->device_fini_pending) {
				dev_crit(hdev->dev,
					"%s Failed to kill all open processes, stopping hard reset\n",
					dev_name(&(hdev)->pdev->dev));
				goto out_err;
			}

			/* signal reset thread to reschedule */
			return rc;
		}

		if (rc) {
			dev_crit(hdev->dev,
				"%s Failed to kill all open processes, stopping hard reset\n",
				dev_name(&(hdev)->pdev->dev));
			goto out_err;
		}

		/* Flush the Event queue workers to make sure no other thread is
		 * reading or writing to registers during the reset
		 */
		flush_workqueue(hdev->eq_wq);
	}

	/* Reset the H/W. It will be in idle state after this returns */
	hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);

	if (hard_reset) {
		hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;

		/* Release kernel context */
		if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1)
			hdev->kernel_ctx = NULL;

		hl_vm_fini(hdev);
		hl_mmu_fini(hdev);
		hl_eq_reset(hdev, &hdev->event_queue);
	}

	/* Re-initialize PI,CI to 0 in all queues (hw queue, cq) */
	hl_hw_queue_reset(hdev, hard_reset);
	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		hl_cq_reset(hdev, &hdev->completion_queue[i]);

	/* Make sure the context switch phase will run again */
	ctx = hl_get_compute_ctx(hdev);
	if (ctx) {
		atomic_set(&ctx->thread_ctx_switch_token, 1);
		ctx->thread_ctx_switch_wait_token = 0;
		hl_ctx_put(ctx);
	}

	/* Finished tear-down, starting to re-initialize */

	if (hard_reset) {
		hdev->device_cpu_disabled = false;
		hdev->reset_info.hard_reset_pending = false;

		if (hdev->reset_info.reset_trigger_repeated &&
				(hdev->reset_info.prev_reset_trigger ==
						HL_DRV_RESET_FW_FATAL_ERR)) {
			/* if there are 2 back-to-back resets from FW,
			 * ensure the driver puts the device in an unusable state
			 */
			dev_crit(hdev->dev,
				"%s Consecutive FW fatal errors received, stopping hard reset\n",
				dev_name(&(hdev)->pdev->dev));
			rc = -EIO;
			goto out_err;
		}

		if (hdev->kernel_ctx) {
			dev_crit(hdev->dev,
				"%s kernel ctx was alive during hard reset, something is terribly wrong\n",
				dev_name(&(hdev)->pdev->dev));
			rc = -EBUSY;
			goto out_err;
		}

		rc = hl_mmu_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"Failed to initialize MMU S/W after hard reset\n");
			goto out_err;
		}

		/* Allocate the kernel context */
		hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
						GFP_KERNEL);
		if (!hdev->kernel_ctx) {
			rc = -ENOMEM;
			hl_mmu_fini(hdev);
			goto out_err;
		}

		hdev->is_compute_ctx_active = false;

		rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
		if (rc) {
			dev_err(hdev->dev,
				"failed to init kernel ctx in hard reset\n");
			kfree(hdev->kernel_ctx);
			hdev->kernel_ctx = NULL;
			hl_mmu_fini(hdev);
			goto out_err;
		}
	}

	/* Device is now enabled because part of the initialization requires
	 * communication with the device firmware to get information that
	 * is required for the initialization itself
	 */
	hdev->disabled = false;

	/* F/W security enabled indication might be updated after hard-reset */
	if (hard_reset) {
		rc = hl_fw_read_preboot_status(hdev);
		if (rc)
			goto out_err;
	}

	rc = hdev->asic_funcs->hw_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize the H/W after reset\n");
		goto out_err;
	}

	/* If device is not idle fail the reset process */
	if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
			HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
		print_idle_status_mask(hdev, "device is not idle after reset", idle_mask);
		rc = -EIO;
		goto out_err;
	}

	/* Check that the communication with the device is working */
	rc = hdev->asic_funcs->test_queues(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to detect if device is alive after reset\n");
		goto out_err;
	}

	if (hard_reset) {
		rc = device_late_init(hdev);
		if (rc) {
			dev_err(hdev->dev, "Failed late init after hard reset\n");
			goto out_err;
		}

		rc = hl_vm_init(hdev);
		if (rc) {
			dev_err(hdev->dev, "Failed to init memory module after hard reset\n");
			goto out_err;
		}

		if (!hdev->asic_prop.fw_security_enabled)
			hl_fw_set_max_power(hdev);
	} else {
		rc = hdev->asic_funcs->compute_reset_late_init(hdev);
		if (rc) {
			if (reset_upon_device_release)
				dev_err(hdev->dev,
					"Failed late init in reset after device release\n");
			else
				dev_err(hdev->dev, "Failed late init after compute reset\n");
			goto out_err;
		}
	}

	rc = hdev->asic_funcs->scrub_device_mem(hdev);
	if (rc) {
		dev_err(hdev->dev, "scrub mem failed from device reset (%d)\n", rc);
		return rc;
	}

	spin_lock(&hdev->reset_info.lock);
	hdev->reset_info.in_compute_reset = 0;

	/* Schedule hard reset only if requested and if not already in hard reset.
	 * We keep 'in_reset' enabled, so no other reset can go in during the hard
	 * reset schedule
	 */
	if (!hard_reset && hdev->reset_info.hard_reset_schedule_flags)
		schedule_hard_reset = true;
	else
		hdev->reset_info.in_reset = 0;

	spin_unlock(&hdev->reset_info.lock);

	hdev->reset_info.needs_reset = false;

	if (hard_reset)
		dev_info(hdev->dev,
			 "Successfully finished resetting the %s device\n",
			 dev_name(&(hdev)->pdev->dev));
	else
		dev_dbg(hdev->dev,
			"Successfully finished resetting the %s device\n",
			dev_name(&(hdev)->pdev->dev));

	if (hard_reset) {
		hdev->reset_info.hard_reset_cnt++;

		/* After reset is done, we are ready to receive events from
		 * the F/W. We can't do it before because we will ignore events
		 * and if those events are fatal, we won't know about it and
		 * the device will be operational although it shouldn't be
		 */
		hdev->asic_funcs->enable_events_from_fw(hdev);
	} else {
		if (!reset_upon_device_release)
			hdev->reset_info.compute_reset_cnt++;

		if (schedule_hard_reset) {
			dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n");
			flags = hdev->reset_info.hard_reset_schedule_flags;
			hdev->reset_info.hard_reset_schedule_flags = 0;
			hdev->disabled = true;
			hard_reset = true;
			handle_reset_trigger(hdev, flags);
			goto again;
		}
	}

	return 0;

out_err:
	hdev->disabled = true;

	spin_lock(&hdev->reset_info.lock);
	hdev->reset_info.in_compute_reset = 0;

	if (hard_reset) {
		dev_err(hdev->dev,
			"%s Failed to reset! Device is NOT usable\n",
			dev_name(&(hdev)->pdev->dev));
		hdev->reset_info.hard_reset_cnt++;
	} else if (reset_upon_device_release) {
		spin_unlock(&hdev->reset_info.lock);
		dev_err(hdev->dev, "Failed to reset device after user release\n");
		flags |= HL_DRV_RESET_HARD;
		flags &= ~HL_DRV_RESET_DEV_RELEASE;
		hard_reset = true;
		goto again;
	} else {
		spin_unlock(&hdev->reset_info.lock);
		dev_err(hdev->dev, "Failed to do compute reset\n");
		hdev->reset_info.compute_reset_cnt++;
		flags |= HL_DRV_RESET_HARD;
		hard_reset = true;
		goto again;
	}

	hdev->reset_info.in_reset = 0;

	spin_unlock(&hdev->reset_info.lock);

	return rc;
}

/*
 * hl_device_cond_reset() - conditionally reset the device.
 * @hdev: pointer to habanalabs device structure.
 * @flags: reset flags.
 * @event_mask: events to notify user about.
 *
 * Conditionally reset the device, or alternatively schedule a watchdog work to reset the device
 * unless another reset precedes it.
 */
int hl_device_cond_reset(struct hl_device *hdev, u32 flags, u64 event_mask)
{
	struct hl_ctx *ctx = NULL;

	/* Device release watchdog is only for hard reset */
	if (!(flags & HL_DRV_RESET_HARD) && hdev->asic_prop.allow_inference_soft_reset)
		goto device_reset;

	/* F/W reset cannot be postponed */
	if (flags & HL_DRV_RESET_BYPASS_REQ_TO_FW)
		goto device_reset;

	/* Device release watchdog is relevant only if user exists and gets a reset notification */
	if (!(event_mask & HL_NOTIFIER_EVENT_DEVICE_RESET)) {
		dev_err(hdev->dev, "Resetting device without a reset indication to user\n");
		goto device_reset;
	}

	ctx = hl_get_compute_ctx(hdev);
	if (!ctx || !ctx->hpriv->notifier_event.eventfd)
		goto device_reset;

	/* Schedule the device release watchdog work unless reset is already in progress or if the
	 * work is already scheduled.
	 */
	spin_lock(&hdev->reset_info.lock);
	if (hdev->reset_info.in_reset) {
		spin_unlock(&hdev->reset_info.lock);
		goto device_reset;
	}

	if (hdev->reset_info.watchdog_active)
		goto out;

	hdev->device_release_watchdog_work.flags = flags;
	dev_dbg(hdev->dev, "Device is going to be reset in %u sec unless being released\n",
		hdev->device_release_watchdog_timeout_sec);
	schedule_delayed_work(&hdev->device_release_watchdog_work.reset_work,
				msecs_to_jiffies(hdev->device_release_watchdog_timeout_sec * 1000));
	hdev->reset_info.watchdog_active = 1;
out:
	spin_unlock(&hdev->reset_info.lock);

	hl_notifier_event_send_all(hdev, event_mask);

	hl_ctx_put(ctx);

	hl_abort_waitings_for_completion(hdev);

	return 0;

device_reset:
	if (event_mask)
		hl_notifier_event_send_all(hdev, event_mask);
	if (ctx)
		hl_ctx_put(ctx);

	return hl_device_reset(hdev, flags);
}

static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event_mask)
{
	mutex_lock(&notifier_event->lock);
	notifier_event->events_mask |= event_mask;

	if (notifier_event->eventfd)
		eventfd_signal(notifier_event->eventfd, 1);

	mutex_unlock(&notifier_event->lock);
}

/*
 * hl_notifier_event_send_all - notify all user processes via eventfd
 *
 * @hdev: pointer to habanalabs device structure
 * @event_mask: the occurred event/s
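 *
 * Signals every open file on both the compute and control device lists through
 * its registered eventfd, if any.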

/*
 * hl_notifier_event_send_all - notify all user processes via eventfd
 *
 * @hdev: pointer to habanalabs device structure
 * @event_mask: the occurred event/s
 */
void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask)
{
	struct hl_fpriv *hpriv;

	if (!event_mask) {
		dev_warn(hdev->dev, "Skip sending zero event");
		return;
	}

	mutex_lock(&hdev->fpriv_list_lock);

	list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node)
		hl_notifier_event_send(&hpriv->notifier_event, event_mask);

	mutex_unlock(&hdev->fpriv_list_lock);

	/* control device */
	mutex_lock(&hdev->fpriv_ctrl_list_lock);

	list_for_each_entry(hpriv, &hdev->fpriv_ctrl_list, dev_node)
		hl_notifier_event_send(&hpriv->notifier_event, event_mask);

	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
}

/*
 * hl_device_init - main initialization function for habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Allocate an id for the device, do early initialization and then call the
 * ASIC specific initialization functions. Finally, create the cdev and the
 * Linux device to expose it to the user
 */
int hl_device_init(struct hl_device *hdev, struct class *hclass)
{
	int i, rc, cq_cnt, user_interrupt_cnt, cq_ready_cnt;
	char *name;
	bool add_cdev_sysfs_on_err = false;

	hdev->cdev_idx = hdev->id / 2;

	name = kasprintf(GFP_KERNEL, "hl%d", hdev->cdev_idx);
	if (!name) {
		rc = -ENOMEM;
		goto out_disabled;
	}

	/* Initialize cdev and device structures */
	rc = device_init_cdev(hdev, hclass, hdev->id, &hl_ops, name,
				&hdev->cdev, &hdev->dev);

	kfree(name);

	if (rc)
		goto out_disabled;

	name = kasprintf(GFP_KERNEL, "hl_controlD%d", hdev->cdev_idx);
	if (!name) {
		rc = -ENOMEM;
		goto free_dev;
	}

	/* Initialize cdev and device structures for control device */
	rc = device_init_cdev(hdev, hclass, hdev->id_control, &hl_ctrl_ops,
				name, &hdev->cdev_ctrl, &hdev->dev_ctrl);

	kfree(name);

	if (rc)
		goto free_dev;

	/* Initialize ASIC function pointers and perform early init */
	rc = device_early_init(hdev);
	if (rc)
		goto free_dev_ctrl;

	user_interrupt_cnt = hdev->asic_prop.user_dec_intr_count +
				hdev->asic_prop.user_interrupt_count;

	if (user_interrupt_cnt) {
		hdev->user_interrupt = kcalloc(user_interrupt_cnt, sizeof(*hdev->user_interrupt),
						GFP_KERNEL);
		if (!hdev->user_interrupt) {
			rc = -ENOMEM;
			goto early_fini;
		}
	}

	/*
	 * Start calling ASIC initialization. First S/W then H/W and finally
	 * late init
	 */
	rc = hdev->asic_funcs->sw_init(hdev);
	if (rc)
		goto free_usr_intr_mem;

	/* initialize completion structure for multi CS wait */
	hl_multi_cs_completion_init(hdev);

	/*
	 * Initialize the H/W queues. Must be done before hw_init, because
	 * there the addresses of the kernel queue are being written to the
	 * registers of the device
	 */
	rc = hl_hw_queues_create(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize kernel queues\n");
		goto sw_fini;
	}

	cq_cnt = hdev->asic_prop.completion_queues_count;

	/*
	 * Initialize the completion queues. Must be done before hw_init,
	 * because there the addresses of the completion queues are being
	 * passed as arguments to request_irq
	 */
	if (cq_cnt) {
		hdev->completion_queue = kcalloc(cq_cnt,
						 sizeof(*hdev->completion_queue),
						 GFP_KERNEL);

		if (!hdev->completion_queue) {
			dev_err(hdev->dev,
				"failed to allocate completion queues\n");
			rc = -ENOMEM;
			goto hw_queues_destroy;
		}
	}
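
	/*
	 * cq_ready_cnt below counts how many completion queues were
	 * successfully initialized, so the cq_fini error label tears down
	 * exactly that many on failure.
	 */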
	for (i = 0, cq_ready_cnt = 0 ; i < cq_cnt ; i++, cq_ready_cnt++) {
		rc = hl_cq_init(hdev, &hdev->completion_queue[i],
				hdev->asic_funcs->get_queue_id_for_cq(hdev, i));
		if (rc) {
			dev_err(hdev->dev,
				"failed to initialize completion queue\n");
			goto cq_fini;
		}
		hdev->completion_queue[i].cq_idx = i;
	}

	hdev->shadow_cs_queue = kcalloc(hdev->asic_prop.max_pending_cs,
					sizeof(struct hl_cs *), GFP_KERNEL);
	if (!hdev->shadow_cs_queue) {
		rc = -ENOMEM;
		goto cq_fini;
	}

	/*
	 * Initialize the event queue. Must be done before hw_init,
	 * because there the address of the event queue is being
	 * passed as argument to request_irq
	 */
	rc = hl_eq_init(hdev, &hdev->event_queue);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize event queue\n");
		goto free_shadow_cs_queue;
	}

	/* MMU S/W must be initialized before kernel context is created */
	rc = hl_mmu_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to initialize MMU S/W structures\n");
		goto eq_fini;
	}

	/* Allocate the kernel context */
	hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
	if (!hdev->kernel_ctx) {
		rc = -ENOMEM;
		goto mmu_fini;
	}

	hdev->is_compute_ctx_active = false;

	hdev->asic_funcs->state_dump_init(hdev);

	hdev->device_release_watchdog_timeout_sec = HL_DEVICE_RELEASE_WATCHDOG_TIMEOUT_SEC;

	hdev->memory_scrub_val = MEM_SCRUB_DEFAULT_VAL;
	hl_debugfs_add_device(hdev);

	/* debugfs nodes are created in hl_ctx_init so it must be called after
	 * hl_debugfs_add_device.
	 */
	rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize kernel context\n");
		kfree(hdev->kernel_ctx);
		goto remove_device_from_debugfs;
	}

	rc = hl_cb_pool_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize CB pool\n");
		goto release_ctx;
	}

	rc = hl_dec_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to initialize the decoder module\n");
		goto cb_pool_fini;
	}

	/*
	 * From this point, override rc (=0) in case of an error to allow
	 * debugging (by adding char devices and creating sysfs nodes as part
	 * of the error flow).
	 */
	add_cdev_sysfs_on_err = true;

	/* Device is now enabled because part of the initialization requires
	 * communication with the device firmware to get information that
	 * is required for the initialization itself
	 */
	hdev->disabled = false;

	rc = hdev->asic_funcs->hw_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize the H/W\n");
		rc = 0;
		goto out_disabled;
	}

	/* Check that the communication with the device is working */
	rc = hdev->asic_funcs->test_queues(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to detect if device is alive\n");
		rc = 0;
		goto out_disabled;
	}

	rc = device_late_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed late initialization\n");
		rc = 0;
		goto out_disabled;
	}

	dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n",
		 hdev->asic_name,
		 hdev->asic_prop.dram_size / SZ_1G);

	rc = hl_vm_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to initialize memory module\n");
		rc = 0;
		goto out_disabled;
	}

	/*
	 * Expose devices and sysfs nodes to user.
	 * From here there is no need to add char devices and create sysfs nodes
	 * in case of an error.
	 */
	add_cdev_sysfs_on_err = false;
	rc = device_cdev_sysfs_add(hdev);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to add char devices and sysfs nodes\n");
		rc = 0;
		goto out_disabled;
	}

	/* Need to call this again because the max power might change,
	 * depending on card type for certain ASICs
	 */
	if (hdev->asic_prop.set_max_power_on_device_init &&
			!hdev->asic_prop.fw_security_enabled)
		hl_fw_set_max_power(hdev);

	/*
	 * hl_hwmon_init() must be called after device_late_init(), because only
	 * there we get the information from the device about which
	 * hwmon-related sensors the device supports.
	 * Furthermore, it must be done after adding the device to the system.
	 */
	rc = hl_hwmon_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to initialize hwmon\n");
		rc = 0;
		goto out_disabled;
	}

	dev_notice(hdev->dev,
		   "Successfully added device %s to habanalabs driver\n",
		   dev_name(&(hdev)->pdev->dev));

	hdev->init_done = true;

	/* After initialization is done, we are ready to receive events from
	 * the F/W. We can't do it before because we will ignore events and if
	 * those events are fatal, we won't know about it and the device will
	 * be operational although it shouldn't be
	 */
	hdev->asic_funcs->enable_events_from_fw(hdev);

	return 0;
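
	/*
	 * The error labels below unwind the initialization steps above in
	 * reverse order; each label releases only what was successfully set up
	 * before the point that jumped to it.
	 */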
cb_pool_fini:
	hl_cb_pool_fini(hdev);
release_ctx:
	if (hl_ctx_put(hdev->kernel_ctx) != 1)
		dev_err(hdev->dev,
			"kernel ctx is still alive on initialization failure\n");
remove_device_from_debugfs:
	hl_debugfs_remove_device(hdev);
mmu_fini:
	hl_mmu_fini(hdev);
eq_fini:
	hl_eq_fini(hdev, &hdev->event_queue);
free_shadow_cs_queue:
	kfree(hdev->shadow_cs_queue);
cq_fini:
	for (i = 0 ; i < cq_ready_cnt ; i++)
		hl_cq_fini(hdev, &hdev->completion_queue[i]);
	kfree(hdev->completion_queue);
hw_queues_destroy:
	hl_hw_queues_destroy(hdev);
sw_fini:
	hdev->asic_funcs->sw_fini(hdev);
free_usr_intr_mem:
	kfree(hdev->user_interrupt);
early_fini:
	device_early_fini(hdev);
free_dev_ctrl:
	put_device(hdev->dev_ctrl);
free_dev:
	put_device(hdev->dev);
out_disabled:
	hdev->disabled = true;
	if (add_cdev_sysfs_on_err)
		device_cdev_sysfs_add(hdev);
	if (hdev->pdev)
		dev_err(&hdev->pdev->dev,
			"Failed to initialize hl%d. Device %s is NOT usable !\n",
			hdev->cdev_idx, dev_name(&(hdev)->pdev->dev));
	else
		pr_err("Failed to initialize hl%d. Device %s is NOT usable !\n",
			hdev->cdev_idx, dev_name(&(hdev)->pdev->dev));

	return rc;
}

/*
 * hl_device_fini - main tear-down function for habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Destroy the device, call ASIC fini functions and release the id
 */
void hl_device_fini(struct hl_device *hdev)
{
	bool device_in_reset;
	ktime_t timeout;
	u64 reset_sec;
	int i, rc;

	dev_info(hdev->dev, "Removing device\n");

	hdev->device_fini_pending = 1;
	flush_delayed_work(&hdev->device_reset_work.reset_work);

	if (hdev->pldm)
		reset_sec = HL_PLDM_HARD_RESET_MAX_TIMEOUT;
	else
		reset_sec = HL_HARD_RESET_MAX_TIMEOUT;

	/*
	 * This function is competing with the reset function, so try to
	 * take the reset atomic and if we are already in the middle of reset,
	 * wait until the reset function is finished. The reset function is
	 * designed to always finish. However, in Gaudi, because of all the
	 * network ports, the hard reset could take between 10-30 seconds
	 */

	timeout = ktime_add_us(ktime_get(), reset_sec * 1000 * 1000);

	spin_lock(&hdev->reset_info.lock);
	device_in_reset = !!hdev->reset_info.in_reset;
	if (!device_in_reset)
		hdev->reset_info.in_reset = 1;
	spin_unlock(&hdev->reset_info.lock);

	while (device_in_reset) {
		usleep_range(50, 200);

		spin_lock(&hdev->reset_info.lock);
		device_in_reset = !!hdev->reset_info.in_reset;
		if (!device_in_reset)
			hdev->reset_info.in_reset = 1;
		spin_unlock(&hdev->reset_info.lock);

		if (ktime_compare(ktime_get(), timeout) > 0) {
			dev_crit(hdev->dev,
				 "%s Failed to remove device because reset function did not finish\n",
				 dev_name(&(hdev)->pdev->dev));
			return;
		}
	}

	cancel_delayed_work_sync(&hdev->device_release_watchdog_work.reset_work);
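
	/*
	 * At this point 'in_reset' is held by this function, so
	 * hl_device_cond_reset() cannot re-arm the release watchdog, and any
	 * instance that was already queued has been cancelled synchronously.
	 */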

	/* Disable PCI access from device F/W so it won't send us additional
	 * interrupts. We disable MSI/MSI-X at the halt_engines function and we
	 * can't have the F/W sending us interrupts after that. We need to
	 * disable the access here because if the device is marked disabled,
	 * the message won't be sent. Also, in case of heartbeat, the device
	 * CPU is marked as disabled so this message won't be sent
	 */
	hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0);

	/* Mark device as disabled */
	hdev->disabled = true;

	take_release_locks(hdev);

	hdev->reset_info.hard_reset_pending = true;

	hl_hwmon_fini(hdev);

	cleanup_resources(hdev, true, false, false);

	/* Kill processes here after CS rollback. This is because the process
	 * can't really exit until all its CSs are done, which is what we
	 * do in cs rollback
	 */
	dev_info(hdev->dev,
		"Waiting for all processes to exit (timeout of %u seconds)",
		HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI);

	hdev->process_kill_trial_cnt = 0;
	rc = device_kill_open_processes(hdev, HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI, false);
	if (rc) {
		dev_crit(hdev->dev, "Failed to kill all open processes\n");
		device_disable_open_processes(hdev, false);
	}

	hdev->process_kill_trial_cnt = 0;
	rc = device_kill_open_processes(hdev, 0, true);
	if (rc) {
		dev_crit(hdev->dev, "Failed to kill all control device open processes\n");
		device_disable_open_processes(hdev, true);
	}

	hl_cb_pool_fini(hdev);

	/* Reset the H/W. It will be in idle state after this returns */
	hdev->asic_funcs->hw_fini(hdev, true, false);

	hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;

	/* Release kernel context */
	if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))
		dev_err(hdev->dev, "kernel ctx is still alive\n");

	hl_debugfs_remove_device(hdev);

	hl_dec_fini(hdev);

	hl_vm_fini(hdev);

	hl_mmu_fini(hdev);

	vfree(hdev->captured_err_info.page_fault_info.user_mappings);

	hl_eq_fini(hdev, &hdev->event_queue);

	kfree(hdev->shadow_cs_queue);

	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		hl_cq_fini(hdev, &hdev->completion_queue[i]);
	kfree(hdev->completion_queue);
	kfree(hdev->user_interrupt);

	hl_hw_queues_destroy(hdev);

	/* Call ASIC S/W finalize function */
	hdev->asic_funcs->sw_fini(hdev);

	device_early_fini(hdev);

	/* Hide devices and sysfs nodes from user */
	device_cdev_sysfs_del(hdev);

	pr_info("removed device successfully\n");
}

/*
 * MMIO register access helper functions.
 */
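
/*
 * Note: ASIC-specific code normally goes through RREG32()/WREG32()-style
 * wrapper macros rather than calling hl_rreg()/hl_wreg() directly. Those
 * wrappers are defined outside this file, so mentioning them here is an
 * assumption about the surrounding driver code.
 */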

/*
 * hl_rreg - Read an MMIO register
 *
 * @hdev: pointer to habanalabs device structure
 * @reg: MMIO register offset (in bytes)
 *
 * Returns the value of the MMIO register we are asked to read
 *
 */
inline u32 hl_rreg(struct hl_device *hdev, u32 reg)
{
	u32 val = readl(hdev->rmmio + reg);

	if (unlikely(trace_habanalabs_rreg32_enabled()))
		trace_habanalabs_rreg32(hdev->dev, reg, val);

	return val;
}

/*
 * hl_wreg - Write to an MMIO register
 *
 * @hdev: pointer to habanalabs device structure
 * @reg: MMIO register offset (in bytes)
 * @val: 32-bit value
 *
 * Writes the 32-bit value into the MMIO register
 *
 */
inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val)
{
	if (unlikely(trace_habanalabs_wreg32_enabled()))
		trace_habanalabs_wreg32(hdev->dev, reg, val);

	writel(val, hdev->rmmio + reg);
}

void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
			u8 flags)
{
	struct razwi_info *razwi_info = &hdev->captured_err_info.razwi_info;

	if (num_of_engines > HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR) {
		dev_err(hdev->dev,
			"Number of possible razwi initiators (%u) exceeded limit (%u)\n",
			num_of_engines, HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR);
		return;
	}

	/* In case it's the first razwi since the device was opened, capture its parameters */
	if (atomic_cmpxchg(&hdev->captured_err_info.razwi_info.razwi_detected, 0, 1))
		return;

	razwi_info->razwi.timestamp = ktime_to_ns(ktime_get());
	razwi_info->razwi.addr = addr;
	razwi_info->razwi.num_of_possible_engines = num_of_engines;
	memcpy(&razwi_info->razwi.engine_id[0], &engine_id[0],
			num_of_engines * sizeof(u16));
	razwi_info->razwi.flags = flags;

	razwi_info->razwi_info_available = true;
}

void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
			u8 flags, u64 *event_mask)
{
	hl_capture_razwi(hdev, addr, engine_id, num_of_engines, flags);

	if (event_mask)
		*event_mask |= HL_NOTIFIER_EVENT_RAZWI;
}

static void hl_capture_user_mappings(struct hl_device *hdev, bool is_pmmu)
{
	struct page_fault_info *pgf_info = &hdev->captured_err_info.page_fault_info;
	struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
	struct hl_vm_hash_node *hnode;
	struct hl_userptr *userptr;
	enum vm_type *vm_type;
	struct hl_ctx *ctx;
	u32 map_idx = 0;
	int i;

	/* Reset previous session count */
	pgf_info->num_of_user_mappings = 0;

	ctx = hl_get_compute_ctx(hdev);
	if (!ctx) {
		dev_err(hdev->dev, "Can't get user context for user mappings\n");
		return;
	}

	mutex_lock(&ctx->mem_hash_lock);
	hash_for_each(ctx->mem_hash, i, hnode, node) {
		vm_type = hnode->ptr;
		if (((*vm_type == VM_TYPE_USERPTR) && is_pmmu) ||
		    ((*vm_type == VM_TYPE_PHYS_PACK) && !is_pmmu))
			pgf_info->num_of_user_mappings++;
	}

	if (!pgf_info->num_of_user_mappings)
		goto finish;
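
	/*
	 * Two passes are done under mem_hash_lock: the loop above only counts
	 * the mappings relevant to the faulting MMU, and the loop below fills
	 * the snapshot buffer once it has been sized accordingly.
	 */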

	/* If a buffer was allocated in a previous session, release it before
	 * allocating a new one.
	 */
	vfree(pgf_info->user_mappings);
	pgf_info->user_mappings =
			vzalloc(pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping));
	if (!pgf_info->user_mappings) {
		pgf_info->num_of_user_mappings = 0;
		goto finish;
	}

	hash_for_each(ctx->mem_hash, i, hnode, node) {
		vm_type = hnode->ptr;
		if ((*vm_type == VM_TYPE_USERPTR) && (is_pmmu)) {
			userptr = hnode->ptr;
			pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr;
			pgf_info->user_mappings[map_idx].size = userptr->size;
			map_idx++;
		} else if ((*vm_type == VM_TYPE_PHYS_PACK) && (!is_pmmu)) {
			phys_pg_pack = hnode->ptr;
			pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr;
			pgf_info->user_mappings[map_idx].size = phys_pg_pack->total_size;
			map_idx++;
		}
	}
finish:
	mutex_unlock(&ctx->mem_hash_lock);
	hl_ctx_put(ctx);
}

void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu)
{
	struct page_fault_info *pgf_info = &hdev->captured_err_info.page_fault_info;

	/* Capture only the first page fault */
	if (atomic_cmpxchg(&pgf_info->page_fault_detected, 0, 1))
		return;

	pgf_info->page_fault.timestamp = ktime_to_ns(ktime_get());
	pgf_info->page_fault.addr = addr;
	pgf_info->page_fault.engine_id = eng_id;
	hl_capture_user_mappings(hdev, is_pmmu);

	pgf_info->page_fault_info_available = true;
}

void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu,
				u64 *event_mask)
{
	hl_capture_page_fault(hdev, addr, eng_id, is_pmmu);

	if (event_mask)
		*event_mask |= HL_NOTIFIER_EVENT_PAGE_FAULT;
}
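
/*
 * Illustrative sketch (not part of the driver): an ASIC-specific event handler
 * would typically accumulate an event mask while recording an error and then
 * notify user space once, e.g.:
 *
 *	u64 event_mask = 0;
 *
 *	hl_handle_page_fault(hdev, fault_addr, eng_id, true, &event_mask);
 *	if (event_mask)
 *		hl_notifier_event_send_all(hdev, event_mask);
 *
 * fault_addr and eng_id above are placeholder names, not values defined in
 * this file.
 */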