1
2 #include <analyzer/plugins/plugin.hpp>
3 #include <hei_main.hpp>
4 #include <util/pdbg.hpp>
5 #include <util/trace.hpp>
6
7 namespace analyzer
8 {
9
10 namespace P10
11 {
12
13 namespace tod
14 {
15
/**
 * Every chip participates in two TOD topologies: an active one and a backup
 * one.
 *
 * The numeric values are significant: some registers and documentation refer
 * to a topology by number rather than by name, and the values may also be used
 * as array indexes.
 */
enum class Topology
{
    ACTIVE = 0, ///< Topology number 0.
    BACKUP = 1  ///< Topology number 1.
};
25
/** The role a topology is configured for: either primary or secondary. */
enum class Configuration
{
    PRIMARY = 0,
    SECONDARY = 1
};
32
33 class Data
34 {
35 public:
36 Data() = default;
37 ~Data() = default;
38 Data(const Data&) = default;
39 Data(Data&&) = default;
40 Data& operator=(const Data&) = default;
41 Data& operator=(Data&&) = default;
42
43 private:
44 /** The MDMT chips at fault (only one per topology). */
45 std::map<Topology, pdbg_target*> iv_mdmtFaultList;
46
47 /** All chips with internal path faults. */
48 std::map<Topology, std::vector<pdbg_target*>> iv_internalFaultList;
49
50 /** The chips sourcing the clocks to non-MDMT chips with faults. */
51 std::map<Topology, std::vector<pdbg_target*>> iv_networkFaultList;
52
53 public:
54 /**
55 * @brief Sets this chip as the MDMT at fault for this topology.
56 * @param i_topology Target topology.
57 * @param i_chipAtFault The chip reporting step check fault.
58 */
setMdmtFault(Topology i_topology,pdbg_target * i_chipAtFault)59 void setMdmtFault(Topology i_topology, pdbg_target* i_chipAtFault)
60 {
61 assert(nullptr != i_chipAtFault);
62 iv_mdmtFaultList[i_topology] = i_chipAtFault;
63 }
64
65 /**
66 * @param i_topology Target topology.
67 * @return The MDMT chip for this topology, if at fault. Otherwise, nullptr.
68 */
getMdmtFault(Topology i_topology)69 pdbg_target* getMdmtFault(Topology i_topology)
70 {
71 return iv_mdmtFaultList[i_topology];
72 }
73
74 /**
75 * @brief Indicates the given chip has an internal fault.
76 * @param i_topology Target topology.
77 * @param i_chipAtFault The chip reporting a step check fault.
78 */
setInternalFault(Topology i_topology,pdbg_target * i_chipAtFault)79 void setInternalFault(Topology i_topology, pdbg_target* i_chipAtFault)
80 {
81 assert(nullptr != i_chipAtFault);
82 iv_internalFaultList[i_topology].push_back(i_chipAtFault);
83 }
84
85 /**
86 * @param i_topology Target topology.
87 * @return The list of all chips with internal faults.
88 */
getInteralFaults(Topology i_topology)89 const std::vector<pdbg_target*>& getInteralFaults(Topology i_topology)
90 {
91 return iv_internalFaultList[i_topology];
92 }
93
94 /**
95 * @brief Indicates the given non-MDMT chip has seen a fault in the TOD
96 * network.
97 * @param i_topology Target topology.
98 * @param i_chipSourcingClock The chip sourcing the clock for the chip at
99 * fault.
100 * @param i_chipAtFault The chip reporting the fault.
101 */
setNetworkFault(Topology i_topology,pdbg_target * i_chipSourcingClock,pdbg_target * i_chipAtFault)102 void setNetworkFault(Topology i_topology, pdbg_target* i_chipSourcingClock,
103 pdbg_target* i_chipAtFault)
104 {
105 assert(nullptr != i_chipSourcingClock);
106 iv_networkFaultList[i_topology].push_back(i_chipSourcingClock);
107
108 assert(nullptr != i_chipAtFault);
109 iv_networkFaultList[i_topology].push_back(i_chipAtFault);
110 }
111
112 /**
113 * @param i_topology Target topology.
114 * @return The list of all chips sourcing the clocks for the non-MDMT chips
115 * with step check faults.
116 */
getNetworkFaults(Topology i_topology)117 const std::vector<pdbg_target*>& getNetworkFaults(Topology i_topology)
118 {
119 return iv_networkFaultList[i_topology];
120 }
121 };
122
/**
 * Addresses of the TOD registers read by this plugin, listed in ascending
 * address order. The enumerator value is the address handed to the SCOM read.
 */
enum class Register
{
    TOD_PRI_PORT_0_CTRL = 0x00040001,
    TOD_PRI_PORT_1_CTRL = 0x00040002,
    TOD_SEC_PORT_0_CTRL = 0x00040003,
    TOD_SEC_PORT_1_CTRL = 0x00040004,
    TOD_PSS_MSS_STATUS = 0x00040008,
    TOD_ERROR = 0x00040030,
};
132
readRegister(pdbg_target * i_chip,Register i_addr,libhei::BitStringBuffer & o_val)133 bool readRegister(pdbg_target* i_chip, Register i_addr,
134 libhei::BitStringBuffer& o_val)
135 {
136 assert(64 == o_val.getBitLen());
137
138 uint64_t scomValue;
139 if (util::pdbg::getScom(i_chip, static_cast<uint64_t>(i_addr), scomValue))
140 {
141 trace::err("Register read failed: addr=0x%08x chip=%s",
142 static_cast<uint64_t>(i_addr), util::pdbg::getPath(i_chip));
143 return true; // SCOM failed
144 }
145
146 o_val.setFieldRight(0, 64, scomValue);
147
148 return false; // no failures
149 }
150
getChipSourcingClock(pdbg_target * i_chipReportingError,unsigned int i_iohsPos)151 pdbg_target* getChipSourcingClock(pdbg_target* i_chipReportingError,
152 unsigned int i_iohsPos)
153 {
154 using namespace util::pdbg;
155
156 pdbg_target* chipSourcingClock = nullptr;
157
158 // Given the chip reporting the error and the IOHS position within that
159 // chip, we must get
160 // - The associated IOHS target on this chip.
161 // - Next, the IOHS target on the other side of the bus.
162 // - Finally, the chip containing the IOHS target on the other side of the
163 // bus.
164
165 auto iohsUnit = getChipUnit(i_chipReportingError, TYPE_IOHS, i_iohsPos);
166 if (nullptr != iohsUnit)
167 {
168 auto clockSourceUnit =
169 getConnectedTarget(iohsUnit, callout::BusType::SMP_BUS);
170 if (nullptr != clockSourceUnit)
171 {
172 chipSourcingClock = getParentChip(clockSourceUnit);
173 }
174 }
175
176 return chipSourcingClock;
177 }
178
179 /**
180 * @brief Collects TOD fault data for each processor chip.
181 */
collectTodFaultData(pdbg_target * i_chip,Data & o_data)182 void collectTodFaultData(pdbg_target* i_chip, Data& o_data)
183 {
184 // TODO: We should use a register cache captured by the isolator so that
185 // this code is using the same values the isolator used. However, at
186 // the moment the isolator does not have a register cache. Instead,
187 // we'll have to manually SCOM the registers we need. Fortunately,
188 // for a checkstop attention the hardware should freeze and the
189 // values will never change. Unfortunately, we don't have that same
190 // guarantee for TIs, but at the time of this writing, all TOD errors
191 // will trigger a checkstop attention away. So the TI case is not as
192 // important.
193
194 libhei::BitStringBuffer errorReg{64};
195 if (readRegister(i_chip, Register::TOD_ERROR, errorReg))
196 {
197 return; // cannot continue on this chip
198 }
199
200 libhei::BitStringBuffer statusReg{64};
201 if (readRegister(i_chip, Register::TOD_PSS_MSS_STATUS, statusReg))
202 {
203 return; // cannot continue on this chip
204 }
205
206 // Determine which topology is configured primary or secondary.
207 std::map<Topology, Configuration> topConfig;
208
209 if (0 == statusReg.getFieldRight(0, 3))
210 {
211 // TOD_PSS_MSS_STATUS[0:2] == 0b000 means active topology is primary.
212 topConfig[Topology::ACTIVE] = Configuration::PRIMARY;
213 topConfig[Topology::BACKUP] = Configuration::SECONDARY;
214 }
215 else
216 {
217 // TOD_PSS_MSS_STATUS[0:2] == 0b111 means active topology is secondary.
218 topConfig[Topology::ACTIVE] = Configuration::SECONDARY;
219 topConfig[Topology::BACKUP] = Configuration::PRIMARY;
220 }
221
222 for (const auto top : {Topology::ACTIVE, Topology::BACKUP})
223 {
224 // Bit positions in some registers are dependent on this topology's
225 // configuration.
226 bool isPriTop = (Configuration::PRIMARY == topConfig[top]);
227
228 // Determine if this is the MDMT chip.
229 bool isMasterTod = statusReg.isBitSet(isPriTop ? 13 : 17);
230 bool isMasterDrawer = statusReg.isBitSet(isPriTop ? 14 : 18);
231
232 if (isMasterDrawer && isMasterTod)
233 {
234 // The master path selects are sourced from the oscilator reference
235 // clocks. So, we'll need to determine which one was used at the
236 // time of the failure.
237 auto masterPathSelect =
238 statusReg.getFieldRight(isPriTop ? 12 : 16, 1);
239
240 // Determine if there is a step check fault for this path select.
241 if (errorReg.isBitSet((0 == masterPathSelect) ? 14 : 15))
242 {
243 trace::inf(
244 "TOD MDMT fault found: top=%u config=%u path=%u chip=%s",
245 static_cast<unsigned int>(top),
246 static_cast<unsigned int>(topConfig[top]), masterPathSelect,
247 util::pdbg::getPath(i_chip));
248
249 o_data.setMdmtFault(top, i_chip);
250 }
251 }
252 else // not the MDMT on this topology
253 {
254 // The slave path selects are sourced from other processor chips.
255 // So, we'll need to determine which one was used at the time of the
256 // failure.
257 auto slavePathSelect =
258 statusReg.getFieldRight(isPriTop ? 15 : 19, 1);
259
260 // Determine if there is a step check fault for this path select.
261 if (errorReg.isBitSet((0 == slavePathSelect) ? 16 : 21))
262 {
263 // Get the IOHS unit position on this chip that is connected to
264 // the clock source chip.
265 auto addr = (0 == slavePathSelect)
266 ? (isPriTop ? Register::TOD_PRI_PORT_0_CTRL
267 : Register::TOD_SEC_PORT_0_CTRL)
268 : (isPriTop ? Register::TOD_PRI_PORT_1_CTRL
269 : Register::TOD_SEC_PORT_1_CTRL);
270
271 libhei::BitStringBuffer portCtrl{64};
272 if (readRegister(i_chip, addr, portCtrl))
273 {
274 continue; // try the other topology
275 }
276
277 auto iohsPos = portCtrl.getFieldRight(0, 3);
278 auto chipSourcingClock = getChipSourcingClock(i_chip, iohsPos);
279
280 if (nullptr != chipSourcingClock)
281 {
282 trace::inf("TOD network fault found: top=%u config=%u "
283 "path=%u chip=%s iohs=%u clockSrc=%s",
284 static_cast<unsigned int>(top),
285 static_cast<unsigned int>(topConfig[top]),
286 slavePathSelect, util::pdbg::getPath(i_chip),
287 iohsPos, util::pdbg::getPath(chipSourcingClock));
288
289 o_data.setNetworkFault(top, chipSourcingClock, i_chip);
290 }
291 }
292 }
293
294 // Check for any internal path errors in the active topology only.
295 if (Topology::ACTIVE == top && errorReg.isBitSet(17))
296 {
297 trace::inf("TOD internal fault found: top=%u config=%u chip=%s",
298 static_cast<unsigned int>(top),
299 static_cast<unsigned int>(topConfig[top]),
300 util::pdbg::getPath(i_chip));
301
302 o_data.setInternalFault(top, i_chip);
303 }
304 }
305 }
306
307 } // namespace tod
308
309 /**
310 * @brief Handles TOD step check fault attentions.
311 */
tod_step_check_fault(unsigned int,const libhei::Chip & i_chip,ServiceData & io_servData)312 void tod_step_check_fault(unsigned int, const libhei::Chip& i_chip,
313 ServiceData& io_servData)
314 {
315 // Query hardware for TOD fault data from all active processors.
316 tod::Data data{};
317 std::vector<pdbg_target*> chipList;
318 util::pdbg::getActiveProcessorChips(chipList);
319 for (const auto& chip : chipList)
320 {
321 tod::collectTodFaultData(chip, data);
322 }
323
324 // For each topology:
325 // - First, check if the MDMT chip is reporting a fault. If so, it is
326 // likely that any downstream step check faults are due to the fault in
327 // the MDMT.
328 // - If MDMT is not reporting a fault, look for any network path errors
329 // from the non-MDMT chips. In which case, we will want to call out all
330 // of the chips sourcing those step check errors (not the chips reporting
331 // them).
332 // - If no other errors found, callout any chips reporting internal step
333 // check faults.
334
335 bool calloutsMade = false; // need to keep track for default case.
336
337 for (const auto top : {tod::Topology::ACTIVE, tod::Topology::BACKUP})
338 {
339 auto mdmtFault = data.getMdmtFault(top);
340 auto internalFaults = data.getInteralFaults(top);
341 auto networkFaults = data.getNetworkFaults(top);
342
343 if (nullptr != mdmtFault) // MDMT fault
344 {
345 calloutsMade = true;
346
347 // Callout the TOD clock (guard).
348 io_servData.calloutClock(callout::ClockType::TOD_CLOCK,
349 callout::Priority::MED, true);
350
351 // Callout the MDMT chip (no guard to avoid fatal guard on primary
352 // processor when the error could be anywhere in between).
353 io_servData.calloutTarget(mdmtFault, callout::Priority::MED, false);
354
355 // Callout everything in between.
356 // TODO: This isn't necessary for now because the clock callout is
357 // the backplane. However, we may need a procedure callout
358 // for future systems.
359 }
360 else if (!networkFaults.empty()) // network path faults
361 {
362 calloutsMade = true;
363
364 // Callout all chips with network errors (no guard to avoid fatal
365 // guard on primary processor when the error could be anywhere in
366 // between).
367 for (const auto& chip : networkFaults)
368 {
369 io_servData.calloutTarget(chip, callout::Priority::MED, false);
370 }
371 }
372 else if (!internalFaults.empty()) // interal path faults
373 {
374 calloutsMade = true;
375
376 // Callout all chips with internal errors (guard because error is
377 // isolated to this processor).
378 for (const auto& chip : internalFaults)
379 {
380 io_servData.calloutTarget(chip, callout::Priority::MED, true);
381 }
382 }
383 }
384
385 // If no callouts are made, default to calling out the chip that reported
386 // the original attention.
387 if (!calloutsMade)
388 {
389 io_servData.calloutTarget(util::pdbg::getTrgt(i_chip),
390 callout::Priority::MED, true);
391 }
392 }
393
394 } // namespace P10
395
// Expose P10::tod_step_check_fault under both the P10_10 and P10_20
// identifiers.
// NOTE(review): presumably PLUGIN_DEFINE_NS registers the function as an
// analyzer plugin for those two chip types - confirm against the macro's
// definition in analyzer/plugins/plugin.hpp.
396 PLUGIN_DEFINE_NS(P10_10, P10, tod_step_check_fault);
397 PLUGIN_DEFINE_NS(P10_20, P10, tod_step_check_fault);
398
399 } // namespace analyzer
400