1 2 #include <analyzer/plugins/plugin.hpp> 3 #include <hei_main.hpp> 4 #include <util/pdbg.hpp> 5 #include <util/trace.hpp> 6 7 namespace analyzer 8 { 9 10 namespace P10 11 { 12 13 namespace tod 14 { 15 16 /** Each chip is connected to two TOD topologies: active and backup. The values 17 * are important because some registers and documentation simply refer to them 18 * by number instead of name. Also, they can be used as array indexes if 19 * needed. */ 20 enum class Topology 21 { 22 ACTIVE = 0, 23 BACKUP = 1, 24 }; 25 26 /** Each topology can be configured as either primary or secondary. */ 27 enum class Configuration 28 { 29 PRIMARY, 30 SECONDARY, 31 }; 32 33 class Data 34 { 35 public: 36 Data() = default; 37 ~Data() = default; 38 Data(const Data&) = default; 39 Data(Data&&) = default; 40 Data& operator=(const Data&) = default; 41 Data& operator=(Data&&) = default; 42 43 private: 44 /** The MDMT chips at fault (only one per topology). */ 45 std::map<Topology, pdbg_target*> iv_mdmtFaultList; 46 47 /** All chips with internal path faults. */ 48 std::map<Topology, std::vector<pdbg_target*>> iv_internalFaultList; 49 50 /** The chips sourcing the clocks to non-MDMT chips with faults. */ 51 std::map<Topology, std::vector<pdbg_target*>> iv_networkFaultList; 52 53 public: 54 /** 55 * @brief Sets this chip as the MDMT at fault for this topology. 56 * @param i_topology Target topology. 57 * @param i_chipAtFault The chip reporting step check fault. 58 */ 59 void setMdmtFault(Topology i_topology, pdbg_target* i_chipAtFault) 60 { 61 assert(nullptr != i_chipAtFault); 62 iv_mdmtFaultList[i_topology] = i_chipAtFault; 63 } 64 65 /** 66 * @param i_topology Target topology. 67 * @return The MDMT chip for this topology, if at fault. Otherwise, nullptr. 68 */ 69 pdbg_target* getMdmtFault(Topology i_topology) 70 { 71 return iv_mdmtFaultList[i_topology]; 72 } 73 74 /** 75 * @brief Indicates the given chip has an internal fault. 76 * @param i_topology Target topology. 77 * @param i_chipAtFault The chip reporting a step check fault. 78 */ 79 void setInternalFault(Topology i_topology, pdbg_target* i_chipAtFault) 80 { 81 assert(nullptr != i_chipAtFault); 82 iv_internalFaultList[i_topology].push_back(i_chipAtFault); 83 } 84 85 /** 86 * @param i_topology Target topology. 87 * @return The list of all chips with internal faults. 88 */ 89 const std::vector<pdbg_target*>& getInteralFaults(Topology i_topology) 90 { 91 return iv_internalFaultList[i_topology]; 92 } 93 94 /** 95 * @brief Indicates the given non-MDMT chip has seen a fault in the TOD 96 * network. 97 * @param i_topology Target topology. 98 * @param i_chipSourcingClock The chip sourcing the clock for the chip at 99 * fault. This is NOT the chip at fault. 100 */ 101 void setNetworkFault(Topology i_topology, pdbg_target* i_chipSourcingClock) 102 { 103 assert(nullptr != i_chipSourcingClock); 104 iv_networkFaultList[i_topology].push_back(i_chipSourcingClock); 105 } 106 107 /** 108 * @param i_topology Target topology. 109 * @return The list of all chips sourcing the clocks for the non-MDMT chips 110 * with step check faults. 111 */ 112 const std::vector<pdbg_target*>& getNetworkFaults(Topology i_topology) 113 { 114 return iv_networkFaultList[i_topology]; 115 } 116 }; 117 118 enum class Register 119 { 120 TOD_ERROR = 0x00040030, 121 TOD_PSS_MSS_STATUS = 0x00040008, 122 TOD_PRI_PORT_0_CTRL = 0x00040001, 123 TOD_PRI_PORT_1_CTRL = 0x00040002, 124 TOD_SEC_PORT_0_CTRL = 0x00040003, 125 TOD_SEC_PORT_1_CTRL = 0x00040004, 126 }; 127 128 bool readRegister(pdbg_target* i_chip, Register i_addr, 129 libhei::BitStringBuffer& o_val) 130 { 131 assert(64 == o_val.getBitLen()); 132 133 uint64_t scomValue; 134 if (util::pdbg::getScom(i_chip, static_cast<uint64_t>(i_addr), scomValue)) 135 { 136 trace::err("Register read failed: addr=0x%08x chip=%s", 137 static_cast<uint64_t>(i_addr), util::pdbg::getPath(i_chip)); 138 return true; // SCOM failed 139 } 140 141 o_val.setFieldRight(0, 64, scomValue); 142 143 return false; // no failures 144 } 145 146 pdbg_target* getChipSourcingClock(pdbg_target* i_chipReportingError, 147 unsigned int i_iohsPos) 148 { 149 using namespace util::pdbg; 150 151 pdbg_target* chipSourcingClock = nullptr; 152 153 // Given the chip reporting the error and the IOHS position within that 154 // chip, we must get 155 // - The associated IOHS target on this chip. 156 // - Next, the IOHS target on the other side of the bus. 157 // - Finally, the chip containing the IOHS target on the other side of the 158 // bus. 159 160 auto iohsUnit = getChipUnit(i_chipReportingError, TYPE_IOHS, i_iohsPos); 161 if (nullptr != iohsUnit) 162 { 163 auto clockSourceUnit = 164 getConnectedTarget(iohsUnit, callout::BusType::SMP_BUS); 165 if (nullptr != clockSourceUnit) 166 { 167 chipSourcingClock = getParentChip(clockSourceUnit); 168 } 169 } 170 171 return chipSourcingClock; 172 } 173 174 /** 175 * @brief Collects TOD fault data for each processor chip. 176 */ 177 void collectTodFaultData(pdbg_target* i_chip, Data& o_data) 178 { 179 // TODO: We should use a register cache captured by the isolator so that 180 // this code is using the same values the isolator used. However, at 181 // the moment the isolator does not have a register cache. Instead, 182 // we'll have to manually SCOM the registers we need. Fortunately, 183 // for a checkstop attention the hardware should freeze and the 184 // values will never change. Unfortunately, we don't have that same 185 // guarantee for TIs, but at the time of this writing, all TOD errors 186 // will trigger a checkstop attention away. So the TI case is not as 187 // important. 188 189 libhei::BitStringBuffer errorReg{64}; 190 if (readRegister(i_chip, Register::TOD_ERROR, errorReg)) 191 { 192 return; // cannot continue on this chip 193 } 194 195 libhei::BitStringBuffer statusReg{64}; 196 if (readRegister(i_chip, Register::TOD_PSS_MSS_STATUS, statusReg)) 197 { 198 return; // cannot continue on this chip 199 } 200 201 // Determine which topology is configured primary or secondary. 202 std::map<Topology, Configuration> topConfig; 203 204 if (0 == statusReg.getFieldRight(0, 3)) 205 { 206 // TOD_PSS_MSS_STATUS[0:2] == 0b000 means active topology is primary. 207 topConfig[Topology::ACTIVE] = Configuration::PRIMARY; 208 topConfig[Topology::BACKUP] = Configuration::SECONDARY; 209 } 210 else 211 { 212 // TOD_PSS_MSS_STATUS[0:2] == 0b111 means active topology is secondary. 213 topConfig[Topology::ACTIVE] = Configuration::SECONDARY; 214 topConfig[Topology::BACKUP] = Configuration::PRIMARY; 215 } 216 217 for (const auto top : {Topology::ACTIVE, Topology::BACKUP}) 218 { 219 // Bit positions in some registers are dependent on this topology's 220 // configuration. 221 bool isPriTop = (Configuration::PRIMARY == topConfig[top]); 222 223 // Determine if this is the MDMT chip. 224 bool isMasterTod = statusReg.isBitSet(isPriTop ? 13 : 17); 225 bool isMasterDrawer = statusReg.isBitSet(isPriTop ? 14 : 18); 226 227 if (isMasterDrawer && isMasterTod) 228 { 229 // The master path selects are sourced from the oscilator reference 230 // clocks. So, we'll need to determine which one was used at the 231 // time of the failure. 232 auto masterPathSelect = 233 statusReg.getFieldRight(isPriTop ? 12 : 16, 1); 234 235 // Determine if there is a step check fault for this path select. 236 if (errorReg.isBitSet((0 == masterPathSelect) ? 14 : 15)) 237 { 238 trace::inf( 239 "TOD MDMT fault found: top=%u config=%u path=%u chip=%s", 240 static_cast<unsigned int>(top), 241 static_cast<unsigned int>(topConfig[top]), masterPathSelect, 242 util::pdbg::getPath(i_chip)); 243 244 o_data.setMdmtFault(top, i_chip); 245 } 246 } 247 else // not the MDMT on this topology 248 { 249 // The slave path selects are sourced from other processor chips. 250 // So, we'll need to determine which one was used at the time of the 251 // failure. 252 auto slavePathSelect = 253 statusReg.getFieldRight(isPriTop ? 15 : 19, 1); 254 255 // Determine if there is a step check fault for this path select. 256 if (errorReg.isBitSet((0 == slavePathSelect) ? 16 : 21)) 257 { 258 // Get the IOHS unit position on this chip that is connected to 259 // the clock source chip. 260 auto addr = (0 == slavePathSelect) 261 ? (isPriTop ? Register::TOD_PRI_PORT_0_CTRL 262 : Register::TOD_SEC_PORT_0_CTRL) 263 : (isPriTop ? Register::TOD_PRI_PORT_1_CTRL 264 : Register::TOD_SEC_PORT_1_CTRL); 265 266 libhei::BitStringBuffer portCtrl{64}; 267 if (readRegister(i_chip, addr, portCtrl)) 268 { 269 continue; // try the other topology 270 } 271 272 auto iohsPos = portCtrl.getFieldRight(0, 3); 273 auto chipSourcingClock = getChipSourcingClock(i_chip, iohsPos); 274 275 if (nullptr != chipSourcingClock) 276 { 277 trace::inf("TOD network fault found: top=%u config=%u " 278 "path=%u chip=%s iohs=%u clockSrc=%s", 279 static_cast<unsigned int>(top), 280 static_cast<unsigned int>(topConfig[top]), 281 slavePathSelect, util::pdbg::getPath(i_chip), 282 iohsPos, util::pdbg::getPath(chipSourcingClock)); 283 284 o_data.setNetworkFault(top, chipSourcingClock); 285 } 286 } 287 } 288 289 // Check for any internal path errors in the active topology only. 290 if (Topology::ACTIVE == top && errorReg.isBitSet(17)) 291 { 292 trace::inf("TOD internal fault found: top=%u config=%u chip=%s", 293 static_cast<unsigned int>(top), 294 static_cast<unsigned int>(topConfig[top]), 295 util::pdbg::getPath(i_chip)); 296 297 o_data.setInternalFault(top, i_chip); 298 } 299 } 300 } 301 302 } // namespace tod 303 304 /** 305 * @brief Handles TOD step check fault attentions. 306 */ 307 void tod_step_check_fault(unsigned int, const libhei::Chip& i_chip, 308 ServiceData& io_servData) 309 { 310 // Query hardware for TOD fault data from all active processors. 311 tod::Data data{}; 312 std::vector<pdbg_target*> chipList; 313 util::pdbg::getActiveProcessorChips(chipList); 314 for (const auto& chip : chipList) 315 { 316 tod::collectTodFaultData(chip, data); 317 } 318 319 // For each topology: 320 // - First, check if the MDMT chip is reporting a fault. If so, it is 321 // likely that any downstream step check faults are due to the fault in 322 // the MDMT. 323 // - If MDMT is not reporting a fault, look for any network path errors 324 // from the non-MDMT chips. In which case, we will want to call out all 325 // of the chips sourcing those step check errors (not the chips reporting 326 // them). 327 // - If no other errors found, callout any chips reporting internal step 328 // check faults. 329 330 bool calloutsMade = false; // need to keep track for default case. 331 332 for (const auto top : {tod::Topology::ACTIVE, tod::Topology::BACKUP}) 333 { 334 auto mdmtFault = data.getMdmtFault(top); 335 auto internalFaults = data.getInteralFaults(top); 336 auto networkFaults = data.getNetworkFaults(top); 337 338 if (nullptr != mdmtFault) // MDMT fault 339 { 340 calloutsMade = true; 341 342 // Callout the TOD clock (guard). 343 io_servData.calloutClock(callout::ClockType::TOD_CLOCK, 344 callout::Priority::MED, true); 345 346 // Callout the MDMT chip (no guard). 347 io_servData.calloutTarget(mdmtFault, callout::Priority::MED, true); 348 349 // Callout everything in between. 350 // TODO: This isn't necessary for now because the clock callout is 351 // the backplane. However, we may need a procedure callout 352 // for future systems. 353 } 354 else if (!networkFaults.empty()) // network path faults 355 { 356 calloutsMade = true; 357 358 // Callout all chips with network errors (guard). 359 for (const auto& chip : networkFaults) 360 { 361 io_servData.calloutTarget(chip, callout::Priority::MED, true); 362 } 363 } 364 else if (!internalFaults.empty()) // interal path faults 365 { 366 calloutsMade = true; 367 368 // Callout all chips with internal errors (guard). 369 for (const auto& chip : internalFaults) 370 { 371 io_servData.calloutTarget(chip, callout::Priority::MED, true); 372 } 373 } 374 } 375 376 // If no callouts are made, default to calling out the chip that reported 377 // the original attention. 378 if (!calloutsMade) 379 { 380 io_servData.calloutTarget(util::pdbg::getTrgt(i_chip), 381 callout::Priority::MED, true); 382 } 383 } 384 385 } // namespace P10 386 387 PLUGIN_DEFINE_NS(P10_10, P10, tod_step_check_fault); 388 PLUGIN_DEFINE_NS(P10_20, P10, tod_step_check_fault); 389 390 } // namespace analyzer 391