Lines Matching refs:dev

70 u8 mlx5_get_nic_state(struct mlx5_core_dev *dev)  in mlx5_get_nic_state()  argument
72 return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7; in mlx5_get_nic_state()
75 void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state) in mlx5_set_nic_state() argument
79 cur_cmdq_addr_l_sz = ioread32be(&dev->iseg->cmdq_addr_l_sz); in mlx5_set_nic_state()
82 &dev->iseg->cmdq_addr_l_sz); in mlx5_set_nic_state()
85 static bool sensor_pci_not_working(struct mlx5_core_dev *dev) in sensor_pci_not_working() argument
87 struct mlx5_core_health *health = &dev->priv.health; in sensor_pci_not_working()
99 static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev) in sensor_fw_synd_rfr() argument
101 struct mlx5_core_health *health = &dev->priv.health; in sensor_fw_synd_rfr()
109 mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd); in sensor_fw_synd_rfr()
113 u32 mlx5_health_check_fatal_sensors(struct mlx5_core_dev *dev) in mlx5_health_check_fatal_sensors() argument
115 if (sensor_pci_not_working(dev)) in mlx5_health_check_fatal_sensors()
117 if (pci_channel_offline(dev->pdev)) in mlx5_health_check_fatal_sensors()
119 if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) in mlx5_health_check_fatal_sensors()
121 if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET) in mlx5_health_check_fatal_sensors()
123 if (sensor_fw_synd_rfr(dev)) in mlx5_health_check_fatal_sensors()
129 static int lock_sem_sw_reset(struct mlx5_core_dev *dev, bool lock) in lock_sem_sw_reset() argument
134 if (!mlx5_core_is_pf(dev)) in lock_sem_sw_reset()
141 ret = mlx5_vsc_gw_lock(dev); in lock_sem_sw_reset()
152 ret = mlx5_vsc_sem_set_space(dev, MLX5_SEMAPHORE_SW_RESET, state); in lock_sem_sw_reset()
154 mlx5_core_warn(dev, "Failed to lock SW reset semaphore\n"); in lock_sem_sw_reset()
157 mlx5_vsc_gw_unlock(dev); in lock_sem_sw_reset()
162 static bool reset_fw_if_needed(struct mlx5_core_dev *dev) in reset_fw_if_needed() argument
164 bool supported = (ioread32be(&dev->iseg->initializing) >> in reset_fw_if_needed()
176 fatal_error = mlx5_health_check_fatal_sensors(dev); in reset_fw_if_needed()
180 mlx5_core_warn(dev, "Not issuing FW reset. Either it's already done or won't help."); in reset_fw_if_needed()
184 mlx5_core_warn(dev, "Issuing FW Reset\n"); in reset_fw_if_needed()
188 mlx5_set_nic_state(dev, MLX5_NIC_IFC_SW_RESET); in reset_fw_if_needed()
193 static void enter_error_state(struct mlx5_core_dev *dev, bool force) in enter_error_state() argument
195 if (mlx5_health_check_fatal_sensors(dev) || force) { /* protected state setting */ in enter_error_state()
196 dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; in enter_error_state()
197 mlx5_cmd_flush(dev); in enter_error_state()
200 mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1); in enter_error_state()
203 void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) in mlx5_enter_error_state() argument
208 if ((mlx5_health_check_fatal_sensors(dev) || force) && in mlx5_enter_error_state()
209 dev->state == MLX5_DEVICE_STATE_UP) { in mlx5_enter_error_state()
210 dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; in mlx5_enter_error_state()
213 mutex_lock(&dev->intf_state_mutex); in mlx5_enter_error_state()
214 if (!err_detected && dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) in mlx5_enter_error_state()
217 enter_error_state(dev, force); in mlx5_enter_error_state()
219 mutex_unlock(&dev->intf_state_mutex); in mlx5_enter_error_state()
222 void mlx5_error_sw_reset(struct mlx5_core_dev *dev) in mlx5_error_sw_reset() argument
224 unsigned long end, delay_ms = mlx5_tout_ms(dev, PCI_TOGGLE); in mlx5_error_sw_reset()
227 mutex_lock(&dev->intf_state_mutex); in mlx5_error_sw_reset()
228 if (dev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) in mlx5_error_sw_reset()
231 mlx5_core_err(dev, "start\n"); in mlx5_error_sw_reset()
233 if (mlx5_health_check_fatal_sensors(dev) == MLX5_SENSOR_FW_SYND_RFR) { in mlx5_error_sw_reset()
235 lock = lock_sem_sw_reset(dev, true); in mlx5_error_sw_reset()
238 delay_ms = mlx5_tout_ms(dev, FULL_CRDUMP); in mlx5_error_sw_reset()
242 reset_fw_if_needed(dev); in mlx5_error_sw_reset()
249 if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) in mlx5_error_sw_reset()
251 if (pci_channel_offline(dev->pdev)) { in mlx5_error_sw_reset()
252 mlx5_core_err(dev, "PCI channel offline, stop waiting for NIC IFC\n"); in mlx5_error_sw_reset()
259 if (mlx5_get_nic_state(dev) != MLX5_NIC_IFC_DISABLED) { in mlx5_error_sw_reset()
260 dev_err(&dev->pdev->dev, "NIC IFC still %d after %lums.\n", in mlx5_error_sw_reset()
261 mlx5_get_nic_state(dev), delay_ms); in mlx5_error_sw_reset()
266 lock_sem_sw_reset(dev, false); in mlx5_error_sw_reset()
268 mlx5_core_err(dev, "end\n"); in mlx5_error_sw_reset()
271 mutex_unlock(&dev->intf_state_mutex); in mlx5_error_sw_reset()
274 static void mlx5_handle_bad_state(struct mlx5_core_dev *dev) in mlx5_handle_bad_state() argument
276 u8 nic_interface = mlx5_get_nic_state(dev); in mlx5_handle_bad_state()
280 mlx5_core_warn(dev, "Expected to see disabled NIC but it is full driver\n"); in mlx5_handle_bad_state()
284 mlx5_core_warn(dev, "starting teardown\n"); in mlx5_handle_bad_state()
288 mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n"); in mlx5_handle_bad_state()
300 if (dev->priv.health.fatal_error != MLX5_SENSOR_PCI_COMM_ERR) in mlx5_handle_bad_state()
301 mlx5_core_warn(dev, "NIC SW reset in progress\n"); in mlx5_handle_bad_state()
305 mlx5_core_warn(dev, "Expected to see disabled NIC but it is has invalid value %d\n", in mlx5_handle_bad_state()
309 mlx5_disable_device(dev); in mlx5_handle_bad_state()
312 int mlx5_health_wait_pci_up(struct mlx5_core_dev *dev) in mlx5_health_wait_pci_up() argument
316 end = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, FW_RESET)); in mlx5_health_wait_pci_up()
317 while (sensor_pci_not_working(dev)) { in mlx5_health_wait_pci_up()
320 if (test_bit(MLX5_BREAK_FW_WAIT, &dev->intf_state)) { in mlx5_health_wait_pci_up()
321 mlx5_core_warn(dev, "device is being removed, stop waiting for PCI\n"); in mlx5_health_wait_pci_up()
324 if (pci_channel_offline(dev->pdev)) { in mlx5_health_wait_pci_up()
325 mlx5_core_err(dev, "PCI channel offline, stop waiting for PCI\n"); in mlx5_health_wait_pci_up()
333 static int mlx5_health_try_recover(struct mlx5_core_dev *dev) in mlx5_health_try_recover() argument
335 mlx5_core_warn(dev, "handling bad device here\n"); in mlx5_health_try_recover()
336 mlx5_handle_bad_state(dev); in mlx5_health_try_recover()
337 if (mlx5_health_wait_pci_up(dev)) { in mlx5_health_try_recover()
338 mlx5_core_err(dev, "health recovery flow aborted, PCI reads still not working\n"); in mlx5_health_try_recover()
341 mlx5_core_err(dev, "starting health recovery flow\n"); in mlx5_health_try_recover()
342 if (mlx5_recover_device(dev) || mlx5_health_check_fatal_sensors(dev)) { in mlx5_health_try_recover()
343 mlx5_core_err(dev, "health recovery failed\n"); in mlx5_health_try_recover()
347 mlx5_core_info(dev, "health recovery succeeded\n"); in mlx5_health_try_recover()
410 static void print_health_info(struct mlx5_core_dev *dev) in print_health_info() argument
412 struct mlx5_core_health *health = &dev->priv.health; in print_health_info()
423 mlx5_log(dev, LOGLEVEL_ERR, "PCI slot is unavailable\n"); in print_health_info()
429 mlx5_log(dev, severity, "Health issue observed, %s, severity(%d) %s:\n", in print_health_info()
433 mlx5_log(dev, severity, "assert_var[%d] 0x%08x\n", i, in print_health_info()
436 mlx5_log(dev, severity, "assert_exit_ptr 0x%08x\n", ioread32be(&h->assert_exit_ptr)); in print_health_info()
437 mlx5_log(dev, severity, "assert_callra 0x%08x\n", ioread32be(&h->assert_callra)); in print_health_info()
438 mlx5_log(dev, severity, "fw_ver %d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev), in print_health_info()
439 fw_rev_sub(dev)); in print_health_info()
440 mlx5_log(dev, severity, "time %u\n", ioread32be(&h->time)); in print_health_info()
441 mlx5_log(dev, severity, "hw_id 0x%08x\n", ioread32be(&h->hw_id)); in print_health_info()
442 mlx5_log(dev, severity, "rfr %d\n", mlx5_health_get_rfr(rfr_severity)); in print_health_info()
443 mlx5_log(dev, severity, "severity %d (%s)\n", severity, mlx5_loglevel_str(severity)); in print_health_info()
444 mlx5_log(dev, severity, "irisc_index %d\n", ioread8(&h->irisc_index)); in print_health_info()
445 mlx5_log(dev, severity, "synd 0x%x: %s\n", ioread8(&h->synd), in print_health_info()
447 mlx5_log(dev, severity, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd)); in print_health_info()
448 mlx5_log(dev, severity, "raw fw_ver 0x%08x\n", ioread32be(&h->fw_ver)); in print_health_info()
456 struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); in mlx5_fw_reporter_diagnose() local
457 struct mlx5_core_health *health = &dev->priv.health; in mlx5_fw_reporter_diagnose()
492 mlx5_fw_reporter_heath_buffer_data_put(struct mlx5_core_dev *dev, in mlx5_fw_reporter_heath_buffer_data_put() argument
495 struct mlx5_core_health *health = &dev->priv.health; in mlx5_fw_reporter_heath_buffer_data_put()
569 struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); in mlx5_fw_reporter_dump() local
572 err = mlx5_fw_tracer_trigger_core_dump_general(dev); in mlx5_fw_reporter_dump()
584 err = mlx5_fw_reporter_heath_buffer_data_put(dev, fmsg); in mlx5_fw_reporter_dump()
587 return mlx5_fw_tracer_get_saved_traces_objects(dev->tracer, fmsg); in mlx5_fw_reporter_dump()
624 struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); in mlx5_fw_fatal_reporter_recover() local
626 return mlx5_health_try_recover(dev); in mlx5_fw_fatal_reporter_recover()
634 struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); in mlx5_fw_fatal_reporter_dump() local
635 u32 crdump_size = dev->priv.health.crdump_size; in mlx5_fw_fatal_reporter_dump()
639 if (!mlx5_core_is_pf(dev)) in mlx5_fw_fatal_reporter_dump()
645 err = mlx5_crdump_collect(dev, cr_data); in mlx5_fw_fatal_reporter_dump()
668 struct mlx5_core_dev *dev; in mlx5_fw_fatal_reporter_err_work() local
674 dev = container_of(priv, struct mlx5_core_dev, priv); in mlx5_fw_fatal_reporter_err_work()
675 devlink = priv_to_devlink(dev); in mlx5_fw_fatal_reporter_err_work()
677 mutex_lock(&dev->intf_state_mutex); in mlx5_fw_fatal_reporter_err_work()
679 mlx5_core_err(dev, "health works are not permitted at this stage\n"); in mlx5_fw_fatal_reporter_err_work()
680 mutex_unlock(&dev->intf_state_mutex); in mlx5_fw_fatal_reporter_err_work()
683 mutex_unlock(&dev->intf_state_mutex); in mlx5_fw_fatal_reporter_err_work()
684 enter_error_state(dev, false); in mlx5_fw_fatal_reporter_err_work()
687 if (mlx5_health_try_recover(dev)) in mlx5_fw_fatal_reporter_err_work()
688 mlx5_core_err(dev, "health recovery failed\n"); in mlx5_fw_fatal_reporter_err_work()
701 mlx5_core_err(dev, "Driver is in error state. Unloading\n"); in mlx5_fw_fatal_reporter_err_work()
702 mlx5_unload_one(dev, false); in mlx5_fw_fatal_reporter_err_work()
717 void mlx5_fw_reporters_create(struct mlx5_core_dev *dev) in mlx5_fw_reporters_create() argument
719 struct mlx5_core_health *health = &dev->priv.health; in mlx5_fw_reporters_create()
720 struct devlink *devlink = priv_to_devlink(dev); in mlx5_fw_reporters_create()
723 if (mlx5_core_is_ecpf(dev)) { in mlx5_fw_reporters_create()
725 } else if (mlx5_core_is_pf(dev)) { in mlx5_fw_reporters_create()
734 0, dev); in mlx5_fw_reporters_create()
736 mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n", in mlx5_fw_reporters_create()
743 dev); in mlx5_fw_reporters_create()
745 mlx5_core_warn(dev, "Failed to create fw fatal reporter, err = %ld\n", in mlx5_fw_reporters_create()
749 static void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev) in mlx5_fw_reporters_destroy() argument
751 struct mlx5_core_health *health = &dev->priv.health; in mlx5_fw_reporters_destroy()
760 static unsigned long get_next_poll_jiffies(struct mlx5_core_dev *dev) in get_next_poll_jiffies() argument
766 next += jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, HEALTH_POLL_INTERVAL)); in get_next_poll_jiffies()
771 void mlx5_trigger_health_work(struct mlx5_core_dev *dev) in mlx5_trigger_health_work() argument
773 struct mlx5_core_health *health = &dev->priv.health; in mlx5_trigger_health_work()
775 if (!mlx5_dev_is_lightweight(dev)) in mlx5_trigger_health_work()
786 struct mlx5_core_dev *dev; in mlx5_health_log_ts_update() local
792 dev = container_of(priv, struct mlx5_core_dev, priv); in mlx5_health_log_ts_update()
798 mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_MRTC, 0, 1); in mlx5_health_log_ts_update()
806 struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer); in poll_health() local
807 struct mlx5_core_health *health = &dev->priv.health; in poll_health()
813 if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) in poll_health()
816 fatal_error = mlx5_health_check_fatal_sensors(dev); in poll_health()
819 mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error); in poll_health()
820 dev->priv.health.fatal_error = fatal_error; in poll_health()
821 print_health_info(dev); in poll_health()
822 dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; in poll_health()
823 mlx5_trigger_health_work(dev); in poll_health()
835 mlx5_core_err(dev, "device's health compromised - reached miss count\n"); in poll_health()
836 print_health_info(dev); in poll_health()
846 mod_timer(&health->timer, get_next_poll_jiffies(dev)); in poll_health()
849 void mlx5_start_health_poll(struct mlx5_core_dev *dev) in mlx5_start_health_poll() argument
851 u64 poll_interval_ms = mlx5_tout_ms(dev, HEALTH_POLL_INTERVAL); in mlx5_start_health_poll()
852 struct mlx5_core_health *health = &dev->priv.health; in mlx5_start_health_poll()
857 health->health = &dev->iseg->health; in mlx5_start_health_poll()
858 health->health_counter = &dev->iseg->health_counter; in mlx5_start_health_poll()
864 void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health) in mlx5_stop_health_poll() argument
866 struct mlx5_core_health *health = &dev->priv.health; in mlx5_stop_health_poll()
874 void mlx5_start_health_fw_log_up(struct mlx5_core_dev *dev) in mlx5_start_health_fw_log_up() argument
876 struct mlx5_core_health *health = &dev->priv.health; in mlx5_start_health_fw_log_up()
878 if (mlx5_core_is_pf(dev) && MLX5_CAP_MCAM_REG(dev, mrtc)) in mlx5_start_health_fw_log_up()
882 void mlx5_drain_health_wq(struct mlx5_core_dev *dev) in mlx5_drain_health_wq() argument
884 struct mlx5_core_health *health = &dev->priv.health; in mlx5_drain_health_wq()
892 void mlx5_health_cleanup(struct mlx5_core_dev *dev) in mlx5_health_cleanup() argument
894 struct mlx5_core_health *health = &dev->priv.health; in mlx5_health_cleanup()
898 mlx5_reporter_vnic_destroy(dev); in mlx5_health_cleanup()
899 mlx5_fw_reporters_destroy(dev); in mlx5_health_cleanup()
902 int mlx5_health_init(struct mlx5_core_dev *dev) in mlx5_health_init() argument
904 struct devlink *devlink = priv_to_devlink(dev); in mlx5_health_init()
908 if (!mlx5_dev_is_lightweight(dev)) { in mlx5_health_init()
910 mlx5_fw_reporters_create(dev); in mlx5_health_init()
913 mlx5_reporter_vnic_create(dev); in mlx5_health_init()
915 health = &dev->priv.health; in mlx5_health_init()
921 strcat(name, dev_name(dev->device)); in mlx5_health_init()
933 mlx5_reporter_vnic_destroy(dev); in mlx5_health_init()
934 mlx5_fw_reporters_destroy(dev); in mlx5_health_init()