1 2 #include <analyzer/plugins/plugin.hpp> 3 #include <hei_main.hpp> 4 #include <util/pdbg.hpp> 5 #include <util/trace.hpp> 6 7 namespace analyzer 8 { 9 10 namespace P10 11 { 12 13 namespace tod 14 { 15 16 /** Each chip is connected to two TOD topologies: active and backup. The values 17 * are important because some registers and documentation simply refer to them 18 * by number instead of name. Also, they can be used as array indexes if 19 * needed. */ 20 enum class Topology 21 { 22 ACTIVE = 0, 23 BACKUP = 1, 24 }; 25 26 /** Each topology can be configured as either primary or secondary. */ 27 enum class Configuration 28 { 29 PRIMARY, 30 SECONDARY, 31 }; 32 33 class Data 34 { 35 public: 36 Data() = default; 37 ~Data() = default; 38 Data(const Data&) = default; 39 Data(Data&&) = default; 40 Data& operator=(const Data&) = default; 41 Data& operator=(Data&&) = default; 42 43 private: 44 /** The MDMT chips at fault (only one per topology). */ 45 std::map<Topology, pdbg_target*> iv_mdmtFaultList; 46 47 /** All chips with internal path faults. */ 48 std::map<Topology, std::vector<pdbg_target*>> iv_internalFaultList; 49 50 /** The chips sourcing the clocks to non-MDMT chips with faults. */ 51 std::map<Topology, std::vector<pdbg_target*>> iv_networkFaultList; 52 53 public: 54 /** 55 * @brief Sets this chip as the MDMT at fault for this topology. 56 * @param i_topology Target topology. 57 * @param i_chipAtFault The chip reporting step check fault. 58 */ 59 void setMdmtFault(Topology i_topology, pdbg_target* i_chipAtFault) 60 { 61 assert(nullptr != i_chipAtFault); 62 iv_mdmtFaultList[i_topology] = i_chipAtFault; 63 } 64 65 /** 66 * @param i_topology Target topology. 67 * @return The MDMT chip for this topology, if at fault. Otherwise, nullptr. 68 */ 69 pdbg_target* getMdmtFault(Topology i_topology) 70 { 71 return iv_mdmtFaultList[i_topology]; 72 } 73 74 /** 75 * @brief Indicates the given chip has an internal fault. 76 * @param i_topology Target topology. 77 * @param i_chipAtFault The chip reporting a step check fault. 78 */ 79 void setInternalFault(Topology i_topology, pdbg_target* i_chipAtFault) 80 { 81 assert(nullptr != i_chipAtFault); 82 iv_internalFaultList[i_topology].push_back(i_chipAtFault); 83 } 84 85 /** 86 * @param i_topology Target topology. 87 * @return The list of all chips with internal faults. 88 */ 89 const std::vector<pdbg_target*>& getInteralFaults(Topology i_topology) 90 { 91 return iv_internalFaultList[i_topology]; 92 } 93 94 /** 95 * @brief Indicates the given non-MDMT chip has seen a fault in the TOD 96 * network. 97 * @param i_topology Target topology. 98 * @param i_chipSourcingClock The chip sourcing the clock for the chip at 99 * fault. 100 * @param i_chipAtFault The chip reporting the fault. 101 */ 102 void setNetworkFault(Topology i_topology, pdbg_target* i_chipSourcingClock, 103 pdbg_target* i_chipAtFault) 104 { 105 assert(nullptr != i_chipSourcingClock); 106 iv_networkFaultList[i_topology].push_back(i_chipSourcingClock); 107 108 assert(nullptr != i_chipAtFault); 109 iv_networkFaultList[i_topology].push_back(i_chipAtFault); 110 } 111 112 /** 113 * @param i_topology Target topology. 114 * @return The list of all chips sourcing the clocks for the non-MDMT chips 115 * with step check faults. 116 */ 117 const std::vector<pdbg_target*>& getNetworkFaults(Topology i_topology) 118 { 119 return iv_networkFaultList[i_topology]; 120 } 121 }; 122 123 enum class Register 124 { 125 TOD_ERROR = 0x00040030, 126 TOD_PSS_MSS_STATUS = 0x00040008, 127 TOD_PRI_PORT_0_CTRL = 0x00040001, 128 TOD_PRI_PORT_1_CTRL = 0x00040002, 129 TOD_SEC_PORT_0_CTRL = 0x00040003, 130 TOD_SEC_PORT_1_CTRL = 0x00040004, 131 }; 132 133 bool readRegister(pdbg_target* i_chip, Register i_addr, 134 libhei::BitStringBuffer& o_val) 135 { 136 assert(64 == o_val.getBitLen()); 137 138 uint64_t scomValue; 139 if (util::pdbg::getScom(i_chip, static_cast<uint64_t>(i_addr), scomValue)) 140 { 141 trace::err("Register read failed: addr=0x%08x chip=%s", 142 static_cast<uint64_t>(i_addr), util::pdbg::getPath(i_chip)); 143 return true; // SCOM failed 144 } 145 146 o_val.setFieldRight(0, 64, scomValue); 147 148 return false; // no failures 149 } 150 151 pdbg_target* getChipSourcingClock(pdbg_target* i_chipReportingError, 152 unsigned int i_iohsPos) 153 { 154 using namespace util::pdbg; 155 156 pdbg_target* chipSourcingClock = nullptr; 157 158 // Given the chip reporting the error and the IOHS position within that 159 // chip, we must get 160 // - The associated IOHS target on this chip. 161 // - Next, the IOHS target on the other side of the bus. 162 // - Finally, the chip containing the IOHS target on the other side of the 163 // bus. 164 165 auto iohsUnit = getChipUnit(i_chipReportingError, TYPE_IOHS, i_iohsPos); 166 if (nullptr != iohsUnit) 167 { 168 auto clockSourceUnit = 169 getConnectedTarget(iohsUnit, callout::BusType::SMP_BUS); 170 if (nullptr != clockSourceUnit) 171 { 172 chipSourcingClock = getParentChip(clockSourceUnit); 173 } 174 } 175 176 return chipSourcingClock; 177 } 178 179 /** 180 * @brief Collects TOD fault data for each processor chip. 181 */ 182 void collectTodFaultData(pdbg_target* i_chip, Data& o_data) 183 { 184 // TODO: We should use a register cache captured by the isolator so that 185 // this code is using the same values the isolator used. However, at 186 // the moment the isolator does not have a register cache. Instead, 187 // we'll have to manually SCOM the registers we need. Fortunately, 188 // for a checkstop attention the hardware should freeze and the 189 // values will never change. Unfortunately, we don't have that same 190 // guarantee for TIs, but at the time of this writing, all TOD errors 191 // will trigger a checkstop attention away. So the TI case is not as 192 // important. 193 194 libhei::BitStringBuffer errorReg{64}; 195 if (readRegister(i_chip, Register::TOD_ERROR, errorReg)) 196 { 197 return; // cannot continue on this chip 198 } 199 200 libhei::BitStringBuffer statusReg{64}; 201 if (readRegister(i_chip, Register::TOD_PSS_MSS_STATUS, statusReg)) 202 { 203 return; // cannot continue on this chip 204 } 205 206 // Determine which topology is configured primary or secondary. 207 std::map<Topology, Configuration> topConfig; 208 209 if (0 == statusReg.getFieldRight(0, 3)) 210 { 211 // TOD_PSS_MSS_STATUS[0:2] == 0b000 means active topology is primary. 212 topConfig[Topology::ACTIVE] = Configuration::PRIMARY; 213 topConfig[Topology::BACKUP] = Configuration::SECONDARY; 214 } 215 else 216 { 217 // TOD_PSS_MSS_STATUS[0:2] == 0b111 means active topology is secondary. 218 topConfig[Topology::ACTIVE] = Configuration::SECONDARY; 219 topConfig[Topology::BACKUP] = Configuration::PRIMARY; 220 } 221 222 for (const auto top : {Topology::ACTIVE, Topology::BACKUP}) 223 { 224 // Bit positions in some registers are dependent on this topology's 225 // configuration. 226 bool isPriTop = (Configuration::PRIMARY == topConfig[top]); 227 228 // Determine if this is the MDMT chip. 229 bool isMasterTod = statusReg.isBitSet(isPriTop ? 13 : 17); 230 bool isMasterDrawer = statusReg.isBitSet(isPriTop ? 14 : 18); 231 232 if (isMasterDrawer && isMasterTod) 233 { 234 // The master path selects are sourced from the oscilator reference 235 // clocks. So, we'll need to determine which one was used at the 236 // time of the failure. 237 auto masterPathSelect = 238 statusReg.getFieldRight(isPriTop ? 12 : 16, 1); 239 240 // Determine if there is a step check fault for this path select. 241 if (errorReg.isBitSet((0 == masterPathSelect) ? 14 : 15)) 242 { 243 trace::inf( 244 "TOD MDMT fault found: top=%u config=%u path=%u chip=%s", 245 static_cast<unsigned int>(top), 246 static_cast<unsigned int>(topConfig[top]), masterPathSelect, 247 util::pdbg::getPath(i_chip)); 248 249 o_data.setMdmtFault(top, i_chip); 250 } 251 } 252 else // not the MDMT on this topology 253 { 254 // The slave path selects are sourced from other processor chips. 255 // So, we'll need to determine which one was used at the time of the 256 // failure. 257 auto slavePathSelect = 258 statusReg.getFieldRight(isPriTop ? 15 : 19, 1); 259 260 // Determine if there is a step check fault for this path select. 261 if (errorReg.isBitSet((0 == slavePathSelect) ? 16 : 21)) 262 { 263 // Get the IOHS unit position on this chip that is connected to 264 // the clock source chip. 265 auto addr = (0 == slavePathSelect) 266 ? (isPriTop ? Register::TOD_PRI_PORT_0_CTRL 267 : Register::TOD_SEC_PORT_0_CTRL) 268 : (isPriTop ? Register::TOD_PRI_PORT_1_CTRL 269 : Register::TOD_SEC_PORT_1_CTRL); 270 271 libhei::BitStringBuffer portCtrl{64}; 272 if (readRegister(i_chip, addr, portCtrl)) 273 { 274 continue; // try the other topology 275 } 276 277 auto iohsPos = portCtrl.getFieldRight(0, 3); 278 auto chipSourcingClock = getChipSourcingClock(i_chip, iohsPos); 279 280 if (nullptr != chipSourcingClock) 281 { 282 trace::inf("TOD network fault found: top=%u config=%u " 283 "path=%u chip=%s iohs=%u clockSrc=%s", 284 static_cast<unsigned int>(top), 285 static_cast<unsigned int>(topConfig[top]), 286 slavePathSelect, util::pdbg::getPath(i_chip), 287 iohsPos, util::pdbg::getPath(chipSourcingClock)); 288 289 o_data.setNetworkFault(top, chipSourcingClock, i_chip); 290 } 291 } 292 } 293 294 // Check for any internal path errors in the active topology only. 295 if (Topology::ACTIVE == top && errorReg.isBitSet(17)) 296 { 297 trace::inf("TOD internal fault found: top=%u config=%u chip=%s", 298 static_cast<unsigned int>(top), 299 static_cast<unsigned int>(topConfig[top]), 300 util::pdbg::getPath(i_chip)); 301 302 o_data.setInternalFault(top, i_chip); 303 } 304 } 305 } 306 307 } // namespace tod 308 309 /** 310 * @brief Handles TOD step check fault attentions. 311 */ 312 void tod_step_check_fault(unsigned int, const libhei::Chip& i_chip, 313 ServiceData& io_servData) 314 { 315 // Query hardware for TOD fault data from all active processors. 316 tod::Data data{}; 317 std::vector<pdbg_target*> chipList; 318 util::pdbg::getActiveProcessorChips(chipList); 319 for (const auto& chip : chipList) 320 { 321 tod::collectTodFaultData(chip, data); 322 } 323 324 // For each topology: 325 // - First, check if the MDMT chip is reporting a fault. If so, it is 326 // likely that any downstream step check faults are due to the fault in 327 // the MDMT. 328 // - If MDMT is not reporting a fault, look for any network path errors 329 // from the non-MDMT chips. In which case, we will want to call out all 330 // of the chips sourcing those step check errors (not the chips reporting 331 // them). 332 // - If no other errors found, callout any chips reporting internal step 333 // check faults. 334 335 bool calloutsMade = false; // need to keep track for default case. 336 337 for (const auto top : {tod::Topology::ACTIVE, tod::Topology::BACKUP}) 338 { 339 auto mdmtFault = data.getMdmtFault(top); 340 auto internalFaults = data.getInteralFaults(top); 341 auto networkFaults = data.getNetworkFaults(top); 342 343 if (nullptr != mdmtFault) // MDMT fault 344 { 345 calloutsMade = true; 346 347 // Callout the TOD clock (guard). 348 io_servData.calloutClock(callout::ClockType::TOD_CLOCK, 349 callout::Priority::MED, true); 350 351 // Callout the MDMT chip (no guard to avoid fatal guard on primary 352 // processor when the error could be anywhere in between). 353 io_servData.calloutTarget(mdmtFault, callout::Priority::MED, false); 354 355 // Callout everything in between. 356 // TODO: This isn't necessary for now because the clock callout is 357 // the backplane. However, we may need a procedure callout 358 // for future systems. 359 } 360 else if (!networkFaults.empty()) // network path faults 361 { 362 calloutsMade = true; 363 364 // Callout all chips with network errors (no guard to avoid fatal 365 // guard on primary processor when the error could be anywhere in 366 // between). 367 for (const auto& chip : networkFaults) 368 { 369 io_servData.calloutTarget(chip, callout::Priority::MED, false); 370 } 371 } 372 else if (!internalFaults.empty()) // interal path faults 373 { 374 calloutsMade = true; 375 376 // Callout all chips with internal errors (guard because error is 377 // isolated to this processor). 378 for (const auto& chip : internalFaults) 379 { 380 io_servData.calloutTarget(chip, callout::Priority::MED, true); 381 } 382 } 383 } 384 385 // If no callouts are made, default to calling out the chip that reported 386 // the original attention. 387 if (!calloutsMade) 388 { 389 io_servData.calloutTarget(util::pdbg::getTrgt(i_chip), 390 callout::Priority::MED, true); 391 } 392 } 393 394 } // namespace P10 395 396 PLUGIN_DEFINE_NS(P10_10, P10, tod_step_check_fault); 397 PLUGIN_DEFINE_NS(P10_20, P10, tod_step_check_fault); 398 399 } // namespace analyzer 400