15a2cc190SJeff Kirsher /*
25a2cc190SJeff Kirsher * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
35a2cc190SJeff Kirsher * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
45a2cc190SJeff Kirsher *
55a2cc190SJeff Kirsher * This software is available to you under a choice of one of two
65a2cc190SJeff Kirsher * licenses. You may choose to be licensed under the terms of the GNU
75a2cc190SJeff Kirsher * General Public License (GPL) Version 2, available from the file
85a2cc190SJeff Kirsher * COPYING in the main directory of this source tree, or the
95a2cc190SJeff Kirsher * OpenIB.org BSD license below:
105a2cc190SJeff Kirsher *
115a2cc190SJeff Kirsher * Redistribution and use in source and binary forms, with or
125a2cc190SJeff Kirsher * without modification, are permitted provided that the following
135a2cc190SJeff Kirsher * conditions are met:
145a2cc190SJeff Kirsher *
155a2cc190SJeff Kirsher * - Redistributions of source code must retain the above
165a2cc190SJeff Kirsher * copyright notice, this list of conditions and the following
175a2cc190SJeff Kirsher * disclaimer.
185a2cc190SJeff Kirsher *
195a2cc190SJeff Kirsher * - Redistributions in binary form must reproduce the above
205a2cc190SJeff Kirsher * copyright notice, this list of conditions and the following
215a2cc190SJeff Kirsher * disclaimer in the documentation and/or other materials
225a2cc190SJeff Kirsher * provided with the distribution.
235a2cc190SJeff Kirsher *
245a2cc190SJeff Kirsher * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
255a2cc190SJeff Kirsher * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
265a2cc190SJeff Kirsher * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
275a2cc190SJeff Kirsher * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
285a2cc190SJeff Kirsher * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
295a2cc190SJeff Kirsher * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
305a2cc190SJeff Kirsher * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
315a2cc190SJeff Kirsher * SOFTWARE.
325a2cc190SJeff Kirsher */
335a2cc190SJeff Kirsher
345a2cc190SJeff Kirsher #include <linux/workqueue.h>
359d9779e7SPaul Gortmaker #include <linux/module.h>
365a2cc190SJeff Kirsher
375a2cc190SJeff Kirsher #include "mlx4.h"
385a2cc190SJeff Kirsher
395a2cc190SJeff Kirsher enum {
405a2cc190SJeff Kirsher MLX4_CATAS_POLL_INTERVAL = 5 * HZ,
415a2cc190SJeff Kirsher };
425a2cc190SJeff Kirsher
435a2cc190SJeff Kirsher
445a2cc190SJeff Kirsher
45f5aef5aaSYishai Hadas int mlx4_internal_err_reset = 1;
46f5aef5aaSYishai Hadas module_param_named(internal_err_reset, mlx4_internal_err_reset, int, 0644);
475a2cc190SJeff Kirsher MODULE_PARM_DESC(internal_err_reset,
4855ad3592SYishai Hadas "Reset device on internal errors if non-zero (default 1)");
495a2cc190SJeff Kirsher
read_vendor_id(struct mlx4_dev * dev)50f6bc11e4SYishai Hadas static int read_vendor_id(struct mlx4_dev *dev)
51f6bc11e4SYishai Hadas {
52f6bc11e4SYishai Hadas u16 vendor_id = 0;
53f6bc11e4SYishai Hadas int ret;
54f6bc11e4SYishai Hadas
55f6bc11e4SYishai Hadas ret = pci_read_config_word(dev->persist->pdev, 0, &vendor_id);
56f6bc11e4SYishai Hadas if (ret) {
57f6bc11e4SYishai Hadas mlx4_err(dev, "Failed to read vendor ID, ret=%d\n", ret);
58f6bc11e4SYishai Hadas return ret;
59f6bc11e4SYishai Hadas }
60f6bc11e4SYishai Hadas
61f6bc11e4SYishai Hadas if (vendor_id == 0xffff) {
62f6bc11e4SYishai Hadas mlx4_err(dev, "PCI can't be accessed to read vendor id\n");
63f6bc11e4SYishai Hadas return -EINVAL;
64f6bc11e4SYishai Hadas }
65f6bc11e4SYishai Hadas
66f6bc11e4SYishai Hadas return 0;
67f6bc11e4SYishai Hadas }
68f6bc11e4SYishai Hadas
mlx4_reset_master(struct mlx4_dev * dev)69f6bc11e4SYishai Hadas static int mlx4_reset_master(struct mlx4_dev *dev)
70f6bc11e4SYishai Hadas {
71f6bc11e4SYishai Hadas int err = 0;
72f6bc11e4SYishai Hadas
7355ad3592SYishai Hadas if (mlx4_is_master(dev))
7455ad3592SYishai Hadas mlx4_report_internal_err_comm_event(dev);
7555ad3592SYishai Hadas
76f6bc11e4SYishai Hadas if (!pci_channel_offline(dev->persist->pdev)) {
77f6bc11e4SYishai Hadas err = read_vendor_id(dev);
78f6bc11e4SYishai Hadas /* If PCI can't be accessed to read vendor ID we assume that its
79f6bc11e4SYishai Hadas * link was disabled and chip was already reset.
80f6bc11e4SYishai Hadas */
81f6bc11e4SYishai Hadas if (err)
82f6bc11e4SYishai Hadas return 0;
83f6bc11e4SYishai Hadas
84f6bc11e4SYishai Hadas err = mlx4_reset(dev);
85f6bc11e4SYishai Hadas if (err)
86f6bc11e4SYishai Hadas mlx4_err(dev, "Fail to reset HCA\n");
87f6bc11e4SYishai Hadas }
88f6bc11e4SYishai Hadas
89f6bc11e4SYishai Hadas return err;
90f6bc11e4SYishai Hadas }
91f6bc11e4SYishai Hadas
mlx4_reset_slave(struct mlx4_dev * dev)9255ad3592SYishai Hadas static int mlx4_reset_slave(struct mlx4_dev *dev)
9355ad3592SYishai Hadas {
9455ad3592SYishai Hadas #define COM_CHAN_RST_REQ_OFFSET 0x10
9555ad3592SYishai Hadas #define COM_CHAN_RST_ACK_OFFSET 0x08
9655ad3592SYishai Hadas
9755ad3592SYishai Hadas u32 comm_flags;
9855ad3592SYishai Hadas u32 rst_req;
9955ad3592SYishai Hadas u32 rst_ack;
10055ad3592SYishai Hadas unsigned long end;
10155ad3592SYishai Hadas struct mlx4_priv *priv = mlx4_priv(dev);
10255ad3592SYishai Hadas
10355ad3592SYishai Hadas if (pci_channel_offline(dev->persist->pdev))
10455ad3592SYishai Hadas return 0;
10555ad3592SYishai Hadas
10655ad3592SYishai Hadas comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm +
10755ad3592SYishai Hadas MLX4_COMM_CHAN_FLAGS));
10855ad3592SYishai Hadas if (comm_flags == 0xffffffff) {
10955ad3592SYishai Hadas mlx4_err(dev, "VF reset is not needed\n");
11055ad3592SYishai Hadas return 0;
11155ad3592SYishai Hadas }
11255ad3592SYishai Hadas
11355ad3592SYishai Hadas if (!(dev->caps.vf_caps & MLX4_VF_CAP_FLAG_RESET)) {
11455ad3592SYishai Hadas mlx4_err(dev, "VF reset is not supported\n");
11555ad3592SYishai Hadas return -EOPNOTSUPP;
11655ad3592SYishai Hadas }
11755ad3592SYishai Hadas
11855ad3592SYishai Hadas rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >>
11955ad3592SYishai Hadas COM_CHAN_RST_REQ_OFFSET;
12055ad3592SYishai Hadas rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >>
12155ad3592SYishai Hadas COM_CHAN_RST_ACK_OFFSET;
12255ad3592SYishai Hadas if (rst_req != rst_ack) {
12355ad3592SYishai Hadas mlx4_err(dev, "Communication channel isn't sync, fail to send reset\n");
12455ad3592SYishai Hadas return -EIO;
12555ad3592SYishai Hadas }
12655ad3592SYishai Hadas
12755ad3592SYishai Hadas rst_req ^= 1;
12855ad3592SYishai Hadas mlx4_warn(dev, "VF is sending reset request to Firmware\n");
12955ad3592SYishai Hadas comm_flags = rst_req << COM_CHAN_RST_REQ_OFFSET;
13055ad3592SYishai Hadas __raw_writel((__force u32)cpu_to_be32(comm_flags),
13155ad3592SYishai Hadas (__iomem char *)priv->mfunc.comm + MLX4_COMM_CHAN_FLAGS);
13255ad3592SYishai Hadas
13355ad3592SYishai Hadas end = msecs_to_jiffies(MLX4_COMM_TIME) + jiffies;
13455ad3592SYishai Hadas while (time_before(jiffies, end)) {
13555ad3592SYishai Hadas comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm +
13655ad3592SYishai Hadas MLX4_COMM_CHAN_FLAGS));
13755ad3592SYishai Hadas rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >>
13855ad3592SYishai Hadas COM_CHAN_RST_ACK_OFFSET;
13955ad3592SYishai Hadas
14055ad3592SYishai Hadas /* Reading rst_req again since the communication channel can
14155ad3592SYishai Hadas * be reset at any time by the PF and all its bits will be
14255ad3592SYishai Hadas * set to zero.
14355ad3592SYishai Hadas */
14455ad3592SYishai Hadas rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >>
14555ad3592SYishai Hadas COM_CHAN_RST_REQ_OFFSET;
14655ad3592SYishai Hadas
14755ad3592SYishai Hadas if (rst_ack == rst_req) {
14855ad3592SYishai Hadas mlx4_warn(dev, "VF Reset succeed\n");
14955ad3592SYishai Hadas return 0;
15055ad3592SYishai Hadas }
15155ad3592SYishai Hadas cond_resched();
15255ad3592SYishai Hadas }
15355ad3592SYishai Hadas mlx4_err(dev, "Fail to send reset over the communication channel\n");
15455ad3592SYishai Hadas return -ETIMEDOUT;
15555ad3592SYishai Hadas }
15655ad3592SYishai Hadas
/* Tell whether a slave_read value carries the internal-error event.
 *
 * Returns 1 when every bit of COMM_CHAN_EVENT_INTERNAL_ERR is set in
 * @slave_read, 0 otherwise.
 */
int mlx4_comm_internal_err(u32 slave_read)
{
	const u32 err_mask = (u32)COMM_CHAN_EVENT_INTERNAL_ERR;

	return (slave_read & err_mask) == err_mask;
}
16255ad3592SYishai Hadas
mlx4_enter_error_state(struct mlx4_dev_persistent * persist)163f6bc11e4SYishai Hadas void mlx4_enter_error_state(struct mlx4_dev_persistent *persist)
164f6bc11e4SYishai Hadas {
165f6bc11e4SYishai Hadas int err;
166f6bc11e4SYishai Hadas struct mlx4_dev *dev;
167f6bc11e4SYishai Hadas
168f5aef5aaSYishai Hadas if (!mlx4_internal_err_reset)
169f6bc11e4SYishai Hadas return;
170f6bc11e4SYishai Hadas
171f6bc11e4SYishai Hadas mutex_lock(&persist->device_state_mutex);
172f6bc11e4SYishai Hadas if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
173f6bc11e4SYishai Hadas goto out;
174f6bc11e4SYishai Hadas
175f6bc11e4SYishai Hadas dev = persist->dev;
176f6bc11e4SYishai Hadas mlx4_err(dev, "device is going to be reset\n");
177bedc989bSAlex Vesker if (mlx4_is_slave(dev)) {
17855ad3592SYishai Hadas err = mlx4_reset_slave(dev);
179bedc989bSAlex Vesker } else {
180bedc989bSAlex Vesker mlx4_crdump_collect(dev);
181f6bc11e4SYishai Hadas err = mlx4_reset_master(dev);
182bedc989bSAlex Vesker }
183f6bc11e4SYishai Hadas
18422e3817eSDaniel Jurgens if (!err) {
185f6bc11e4SYishai Hadas mlx4_err(dev, "device was reset successfully\n");
18622e3817eSDaniel Jurgens } else {
18722e3817eSDaniel Jurgens /* EEH could have disabled the PCI channel during reset. That's
18822e3817eSDaniel Jurgens * recoverable and the PCI error flow will handle it.
18922e3817eSDaniel Jurgens */
19022e3817eSDaniel Jurgens if (!pci_channel_offline(dev->persist->pdev))
19122e3817eSDaniel Jurgens BUG_ON(1);
19222e3817eSDaniel Jurgens }
19322e3817eSDaniel Jurgens dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR;
194f6bc11e4SYishai Hadas mutex_unlock(&persist->device_state_mutex);
195f6bc11e4SYishai Hadas
196f6bc11e4SYishai Hadas /* At that step HW was already reset, now notify clients */
197*7ba189acSPetr Pavlu mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, NULL);
198f5aef5aaSYishai Hadas mlx4_cmd_wake_completions(dev);
199f6bc11e4SYishai Hadas return;
200f6bc11e4SYishai Hadas
201f6bc11e4SYishai Hadas out:
202f6bc11e4SYishai Hadas mutex_unlock(&persist->device_state_mutex);
203f6bc11e4SYishai Hadas }
204f6bc11e4SYishai Hadas
mlx4_handle_error_state(struct mlx4_dev_persistent * persist)205f6bc11e4SYishai Hadas static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist)
206f6bc11e4SYishai Hadas {
20760d7ceeaSMoshe Shemesh struct mlx4_dev *dev = persist->dev;
20860d7ceeaSMoshe Shemesh struct devlink *devlink;
209f6bc11e4SYishai Hadas int err = 0;
210f6bc11e4SYishai Hadas
211f6bc11e4SYishai Hadas mlx4_enter_error_state(persist);
21260d7ceeaSMoshe Shemesh devlink = priv_to_devlink(mlx4_priv(dev));
21360d7ceeaSMoshe Shemesh devl_lock(devlink);
214c69453e2SYishai Hadas mutex_lock(&persist->interface_state_mutex);
215c69453e2SYishai Hadas if (persist->interface_state & MLX4_INTERFACE_STATE_UP &&
216c69453e2SYishai Hadas !(persist->interface_state & MLX4_INTERFACE_STATE_DELETION)) {
21735c7ff34SJiri Pirko err = mlx4_restart_one(persist->pdev);
218c69453e2SYishai Hadas mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n",
219c69453e2SYishai Hadas err);
220c69453e2SYishai Hadas }
221c69453e2SYishai Hadas mutex_unlock(&persist->interface_state_mutex);
22260d7ceeaSMoshe Shemesh devl_unlock(devlink);
223f6bc11e4SYishai Hadas }
224f6bc11e4SYishai Hadas
dump_err_buf(struct mlx4_dev * dev)2255a2cc190SJeff Kirsher static void dump_err_buf(struct mlx4_dev *dev)
2265a2cc190SJeff Kirsher {
2275a2cc190SJeff Kirsher struct mlx4_priv *priv = mlx4_priv(dev);
2285a2cc190SJeff Kirsher
2295a2cc190SJeff Kirsher int i;
2305a2cc190SJeff Kirsher
2315a2cc190SJeff Kirsher mlx4_err(dev, "Internal error detected:\n");
2325a2cc190SJeff Kirsher for (i = 0; i < priv->fw.catas_size; ++i)
2335a2cc190SJeff Kirsher mlx4_err(dev, " buf[%02x]: %08x\n",
2345a2cc190SJeff Kirsher i, swab32(readl(priv->catas_err.map + i)));
2355a2cc190SJeff Kirsher }
2365a2cc190SJeff Kirsher
poll_catas(struct timer_list * t)23755c0fcc3SKees Cook static void poll_catas(struct timer_list *t)
2385a2cc190SJeff Kirsher {
23955c0fcc3SKees Cook struct mlx4_priv *priv = from_timer(priv, t, catas_err.timer);
24055c0fcc3SKees Cook struct mlx4_dev *dev = &priv->dev;
24155ad3592SYishai Hadas u32 slave_read;
2425a2cc190SJeff Kirsher
24355ad3592SYishai Hadas if (mlx4_is_slave(dev)) {
24455ad3592SYishai Hadas slave_read = swab32(readl(&priv->mfunc.comm->slave_read));
24555ad3592SYishai Hadas if (mlx4_comm_internal_err(slave_read)) {
24655ad3592SYishai Hadas mlx4_warn(dev, "Internal error detected on the communication channel\n");
24755ad3592SYishai Hadas goto internal_err;
24855ad3592SYishai Hadas }
24955ad3592SYishai Hadas } else if (readl(priv->catas_err.map)) {
2505a2cc190SJeff Kirsher dump_err_buf(dev);
251f6bc11e4SYishai Hadas goto internal_err;
25257dbf29aSKleber Sacilotto de Souza }
253f6bc11e4SYishai Hadas
254f6bc11e4SYishai Hadas if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
255f6bc11e4SYishai Hadas mlx4_warn(dev, "Internal error mark was detected on device\n");
256f6bc11e4SYishai Hadas goto internal_err;
257f6bc11e4SYishai Hadas }
258f6bc11e4SYishai Hadas
2595a2cc190SJeff Kirsher mod_timer(&priv->catas_err.timer,
2605a2cc190SJeff Kirsher round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
261f6bc11e4SYishai Hadas return;
262f6bc11e4SYishai Hadas
263f6bc11e4SYishai Hadas internal_err:
264f5aef5aaSYishai Hadas if (mlx4_internal_err_reset)
265f6bc11e4SYishai Hadas queue_work(dev->persist->catas_wq, &dev->persist->catas_work);
2665a2cc190SJeff Kirsher }
2675a2cc190SJeff Kirsher
catas_reset(struct work_struct * work)2685a2cc190SJeff Kirsher static void catas_reset(struct work_struct *work)
2695a2cc190SJeff Kirsher {
270ad9a0bf0SYishai Hadas struct mlx4_dev_persistent *persist =
271ad9a0bf0SYishai Hadas container_of(work, struct mlx4_dev_persistent,
272ad9a0bf0SYishai Hadas catas_work);
2735a2cc190SJeff Kirsher
274f6bc11e4SYishai Hadas mlx4_handle_error_state(persist);
2755a2cc190SJeff Kirsher }
2765a2cc190SJeff Kirsher
mlx4_start_catas_poll(struct mlx4_dev * dev)2775a2cc190SJeff Kirsher void mlx4_start_catas_poll(struct mlx4_dev *dev)
2785a2cc190SJeff Kirsher {
2795a2cc190SJeff Kirsher struct mlx4_priv *priv = mlx4_priv(dev);
2805a2cc190SJeff Kirsher phys_addr_t addr;
2815a2cc190SJeff Kirsher
2825a2cc190SJeff Kirsher INIT_LIST_HEAD(&priv->catas_err.list);
28355c0fcc3SKees Cook timer_setup(&priv->catas_err.timer, poll_catas, 0);
2845a2cc190SJeff Kirsher priv->catas_err.map = NULL;
2855a2cc190SJeff Kirsher
28655ad3592SYishai Hadas if (!mlx4_is_slave(dev)) {
28755ad3592SYishai Hadas addr = pci_resource_start(dev->persist->pdev,
28855ad3592SYishai Hadas priv->fw.catas_bar) +
2895a2cc190SJeff Kirsher priv->fw.catas_offset;
2905a2cc190SJeff Kirsher
2915a2cc190SJeff Kirsher priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4);
2925a2cc190SJeff Kirsher if (!priv->catas_err.map) {
2935a2cc190SJeff Kirsher mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n",
2945a2cc190SJeff Kirsher (unsigned long long)addr);
2955a2cc190SJeff Kirsher return;
2965a2cc190SJeff Kirsher }
29755ad3592SYishai Hadas }
2985a2cc190SJeff Kirsher
2995a2cc190SJeff Kirsher priv->catas_err.timer.expires =
3005a2cc190SJeff Kirsher round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL);
3015a2cc190SJeff Kirsher add_timer(&priv->catas_err.timer);
3025a2cc190SJeff Kirsher }
3035a2cc190SJeff Kirsher
mlx4_stop_catas_poll(struct mlx4_dev * dev)3045a2cc190SJeff Kirsher void mlx4_stop_catas_poll(struct mlx4_dev *dev)
3055a2cc190SJeff Kirsher {
3065a2cc190SJeff Kirsher struct mlx4_priv *priv = mlx4_priv(dev);
3075a2cc190SJeff Kirsher
3085a2cc190SJeff Kirsher del_timer_sync(&priv->catas_err.timer);
3095a2cc190SJeff Kirsher
310ad9a0bf0SYishai Hadas if (priv->catas_err.map) {
3115a2cc190SJeff Kirsher iounmap(priv->catas_err.map);
312ad9a0bf0SYishai Hadas priv->catas_err.map = NULL;
313ad9a0bf0SYishai Hadas }
314c69453e2SYishai Hadas
315c69453e2SYishai Hadas if (dev->persist->interface_state & MLX4_INTERFACE_STATE_DELETION)
316c69453e2SYishai Hadas flush_workqueue(dev->persist->catas_wq);
3175a2cc190SJeff Kirsher }
3185a2cc190SJeff Kirsher
mlx4_catas_init(struct mlx4_dev * dev)319ad9a0bf0SYishai Hadas int mlx4_catas_init(struct mlx4_dev *dev)
3205a2cc190SJeff Kirsher {
321ad9a0bf0SYishai Hadas INIT_WORK(&dev->persist->catas_work, catas_reset);
322ad9a0bf0SYishai Hadas dev->persist->catas_wq = create_singlethread_workqueue("mlx4_health");
323ad9a0bf0SYishai Hadas if (!dev->persist->catas_wq)
324ad9a0bf0SYishai Hadas return -ENOMEM;
325ad9a0bf0SYishai Hadas
326ad9a0bf0SYishai Hadas return 0;
327ad9a0bf0SYishai Hadas }
328ad9a0bf0SYishai Hadas
mlx4_catas_end(struct mlx4_dev * dev)329ad9a0bf0SYishai Hadas void mlx4_catas_end(struct mlx4_dev *dev)
330ad9a0bf0SYishai Hadas {
331ad9a0bf0SYishai Hadas if (dev->persist->catas_wq) {
332ad9a0bf0SYishai Hadas destroy_workqueue(dev->persist->catas_wq);
333ad9a0bf0SYishai Hadas dev->persist->catas_wq = NULL;
334ad9a0bf0SYishai Hadas }
3355a2cc190SJeff Kirsher }
336