1 /* 2 * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. 3 * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 #include <linux/workqueue.h> 35 #include <linux/module.h> 36 37 #include "mlx4.h" 38 39 enum { 40 MLX4_CATAS_POLL_INTERVAL = 5 * HZ, 41 }; 42 43 44 45 int mlx4_internal_err_reset = 1; 46 module_param_named(internal_err_reset, mlx4_internal_err_reset, int, 0644); 47 MODULE_PARM_DESC(internal_err_reset, 48 "Reset device on internal errors if non-zero (default 1)"); 49 50 static int read_vendor_id(struct mlx4_dev *dev) 51 { 52 u16 vendor_id = 0; 53 int ret; 54 55 ret = pci_read_config_word(dev->persist->pdev, 0, &vendor_id); 56 if (ret) { 57 mlx4_err(dev, "Failed to read vendor ID, ret=%d\n", ret); 58 return ret; 59 } 60 61 if (vendor_id == 0xffff) { 62 mlx4_err(dev, "PCI can't be accessed to read vendor id\n"); 63 return -EINVAL; 64 } 65 66 return 0; 67 } 68 69 static int mlx4_reset_master(struct mlx4_dev *dev) 70 { 71 int err = 0; 72 73 if (mlx4_is_master(dev)) 74 mlx4_report_internal_err_comm_event(dev); 75 76 if (!pci_channel_offline(dev->persist->pdev)) { 77 err = read_vendor_id(dev); 78 /* If PCI can't be accessed to read vendor ID we assume that its 79 * link was disabled and chip was already reset. 80 */ 81 if (err) 82 return 0; 83 84 err = mlx4_reset(dev); 85 if (err) 86 mlx4_err(dev, "Fail to reset HCA\n"); 87 } 88 89 return err; 90 } 91 92 static int mlx4_reset_slave(struct mlx4_dev *dev) 93 { 94 #define COM_CHAN_RST_REQ_OFFSET 0x10 95 #define COM_CHAN_RST_ACK_OFFSET 0x08 96 97 u32 comm_flags; 98 u32 rst_req; 99 u32 rst_ack; 100 unsigned long end; 101 struct mlx4_priv *priv = mlx4_priv(dev); 102 103 if (pci_channel_offline(dev->persist->pdev)) 104 return 0; 105 106 comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + 107 MLX4_COMM_CHAN_FLAGS)); 108 if (comm_flags == 0xffffffff) { 109 mlx4_err(dev, "VF reset is not needed\n"); 110 return 0; 111 } 112 113 if (!(dev->caps.vf_caps & MLX4_VF_CAP_FLAG_RESET)) { 114 mlx4_err(dev, "VF reset is not supported\n"); 115 return -EOPNOTSUPP; 116 } 117 118 rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >> 119 COM_CHAN_RST_REQ_OFFSET; 120 rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >> 121 COM_CHAN_RST_ACK_OFFSET; 122 if (rst_req != rst_ack) { 123 mlx4_err(dev, "Communication channel isn't sync, fail to send reset\n"); 124 return -EIO; 125 } 126 127 rst_req ^= 1; 128 mlx4_warn(dev, "VF is sending reset request to Firmware\n"); 129 comm_flags = rst_req << COM_CHAN_RST_REQ_OFFSET; 130 __raw_writel((__force u32)cpu_to_be32(comm_flags), 131 (__iomem char *)priv->mfunc.comm + MLX4_COMM_CHAN_FLAGS); 132 133 end = msecs_to_jiffies(MLX4_COMM_TIME) + jiffies; 134 while (time_before(jiffies, end)) { 135 comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + 136 MLX4_COMM_CHAN_FLAGS)); 137 rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >> 138 COM_CHAN_RST_ACK_OFFSET; 139 140 /* Reading rst_req again since the communication channel can 141 * be reset at any time by the PF and all its bits will be 142 * set to zero. 143 */ 144 rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >> 145 COM_CHAN_RST_REQ_OFFSET; 146 147 if (rst_ack == rst_req) { 148 mlx4_warn(dev, "VF Reset succeed\n"); 149 return 0; 150 } 151 cond_resched(); 152 } 153 mlx4_err(dev, "Fail to send reset over the communication channel\n"); 154 return -ETIMEDOUT; 155 } 156 157 int mlx4_comm_internal_err(u32 slave_read) 158 { 159 return (u32)COMM_CHAN_EVENT_INTERNAL_ERR == 160 (slave_read & (u32)COMM_CHAN_EVENT_INTERNAL_ERR) ? 1 : 0; 161 } 162 163 void mlx4_enter_error_state(struct mlx4_dev_persistent *persist) 164 { 165 int err; 166 struct mlx4_dev *dev; 167 168 if (!mlx4_internal_err_reset) 169 return; 170 171 mutex_lock(&persist->device_state_mutex); 172 if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) 173 goto out; 174 175 dev = persist->dev; 176 mlx4_err(dev, "device is going to be reset\n"); 177 if (mlx4_is_slave(dev)) { 178 err = mlx4_reset_slave(dev); 179 } else { 180 mlx4_crdump_collect(dev); 181 err = mlx4_reset_master(dev); 182 } 183 184 if (!err) { 185 mlx4_err(dev, "device was reset successfully\n"); 186 } else { 187 /* EEH could have disabled the PCI channel during reset. That's 188 * recoverable and the PCI error flow will handle it. 189 */ 190 if (!pci_channel_offline(dev->persist->pdev)) 191 BUG_ON(1); 192 } 193 dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR; 194 mutex_unlock(&persist->device_state_mutex); 195 196 /* At that step HW was already reset, now notify clients */ 197 mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, NULL); 198 mlx4_cmd_wake_completions(dev); 199 return; 200 201 out: 202 mutex_unlock(&persist->device_state_mutex); 203 } 204 205 static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist) 206 { 207 struct mlx4_dev *dev = persist->dev; 208 struct devlink *devlink; 209 int err = 0; 210 211 mlx4_enter_error_state(persist); 212 devlink = priv_to_devlink(mlx4_priv(dev)); 213 devl_lock(devlink); 214 mutex_lock(&persist->interface_state_mutex); 215 if (persist->interface_state & MLX4_INTERFACE_STATE_UP && 216 !(persist->interface_state & MLX4_INTERFACE_STATE_DELETION)) { 217 err = mlx4_restart_one(persist->pdev); 218 mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n", 219 err); 220 } 221 mutex_unlock(&persist->interface_state_mutex); 222 devl_unlock(devlink); 223 } 224 225 static void dump_err_buf(struct mlx4_dev *dev) 226 { 227 struct mlx4_priv *priv = mlx4_priv(dev); 228 229 int i; 230 231 mlx4_err(dev, "Internal error detected:\n"); 232 for (i = 0; i < priv->fw.catas_size; ++i) 233 mlx4_err(dev, " buf[%02x]: %08x\n", 234 i, swab32(readl(priv->catas_err.map + i))); 235 } 236 237 static void poll_catas(struct timer_list *t) 238 { 239 struct mlx4_priv *priv = from_timer(priv, t, catas_err.timer); 240 struct mlx4_dev *dev = &priv->dev; 241 u32 slave_read; 242 243 if (mlx4_is_slave(dev)) { 244 slave_read = swab32(readl(&priv->mfunc.comm->slave_read)); 245 if (mlx4_comm_internal_err(slave_read)) { 246 mlx4_warn(dev, "Internal error detected on the communication channel\n"); 247 goto internal_err; 248 } 249 } else if (readl(priv->catas_err.map)) { 250 dump_err_buf(dev); 251 goto internal_err; 252 } 253 254 if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { 255 mlx4_warn(dev, "Internal error mark was detected on device\n"); 256 goto internal_err; 257 } 258 259 mod_timer(&priv->catas_err.timer, 260 round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); 261 return; 262 263 internal_err: 264 if (mlx4_internal_err_reset) 265 queue_work(dev->persist->catas_wq, &dev->persist->catas_work); 266 } 267 268 static void catas_reset(struct work_struct *work) 269 { 270 struct mlx4_dev_persistent *persist = 271 container_of(work, struct mlx4_dev_persistent, 272 catas_work); 273 274 mlx4_handle_error_state(persist); 275 } 276 277 void mlx4_start_catas_poll(struct mlx4_dev *dev) 278 { 279 struct mlx4_priv *priv = mlx4_priv(dev); 280 phys_addr_t addr; 281 282 INIT_LIST_HEAD(&priv->catas_err.list); 283 timer_setup(&priv->catas_err.timer, poll_catas, 0); 284 priv->catas_err.map = NULL; 285 286 if (!mlx4_is_slave(dev)) { 287 addr = pci_resource_start(dev->persist->pdev, 288 priv->fw.catas_bar) + 289 priv->fw.catas_offset; 290 291 priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4); 292 if (!priv->catas_err.map) { 293 mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n", 294 (unsigned long long)addr); 295 return; 296 } 297 } 298 299 priv->catas_err.timer.expires = 300 round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL); 301 add_timer(&priv->catas_err.timer); 302 } 303 304 void mlx4_stop_catas_poll(struct mlx4_dev *dev) 305 { 306 struct mlx4_priv *priv = mlx4_priv(dev); 307 308 del_timer_sync(&priv->catas_err.timer); 309 310 if (priv->catas_err.map) { 311 iounmap(priv->catas_err.map); 312 priv->catas_err.map = NULL; 313 } 314 315 if (dev->persist->interface_state & MLX4_INTERFACE_STATE_DELETION) 316 flush_workqueue(dev->persist->catas_wq); 317 } 318 319 int mlx4_catas_init(struct mlx4_dev *dev) 320 { 321 INIT_WORK(&dev->persist->catas_work, catas_reset); 322 dev->persist->catas_wq = create_singlethread_workqueue("mlx4_health"); 323 if (!dev->persist->catas_wq) 324 return -ENOMEM; 325 326 return 0; 327 } 328 329 void mlx4_catas_end(struct mlx4_dev *dev) 330 { 331 if (dev->persist->catas_wq) { 332 destroy_workqueue(dev->persist->catas_wq); 333 dev->persist->catas_wq = NULL; 334 } 335 } 336