1 
2 #include <analyzer/plugins/plugin.hpp>
3 #include <hei_main.hpp>
4 #include <util/pdbg.hpp>
5 #include <util/trace.hpp>
6 
7 namespace analyzer
8 {
9 
10 namespace P10
11 {
12 
13 namespace tod
14 {
15 
/**
 * Every chip participates in two TOD topologies: an active one and a backup
 * one. The numeric values are significant because some registers and
 * documentation identify a topology by number rather than by name. They may
 * also be used as array indexes when needed.
 */
enum class Topology
{
    ACTIVE = 0, ///< Topology number 0.
    BACKUP = 1, ///< Topology number 1.
};
25 
/**
 * The role a topology is configured for: primary or secondary. The values are
 * spelled out because they are traced as plain integers.
 */
enum class Configuration
{
    PRIMARY = 0,
    SECONDARY = 1,
};
32 
33 class Data
34 {
35   public:
36     Data() = default;
37     ~Data() = default;
38     Data(const Data&) = default;
39     Data(Data&&) = default;
40     Data& operator=(const Data&) = default;
41     Data& operator=(Data&&) = default;
42 
43   private:
44     /** The MDMT chips at fault (only one per topology). */
45     std::map<Topology, pdbg_target*> iv_mdmtFaultList;
46 
47     /** All chips with internal path faults. */
48     std::map<Topology, std::vector<pdbg_target*>> iv_internalFaultList;
49 
50     /** The chips sourcing the clocks to non-MDMT chips with faults. */
51     std::map<Topology, std::vector<pdbg_target*>> iv_networkFaultList;
52 
53   public:
54     /**
55      * @brief Sets this chip as the MDMT at fault for this topology.
56      * @param i_topology    Target topology.
57      * @param i_chipAtFault The chip reporting step check fault.
58      */
setMdmtFault(Topology i_topology,pdbg_target * i_chipAtFault)59     void setMdmtFault(Topology i_topology, pdbg_target* i_chipAtFault)
60     {
61         assert(nullptr != i_chipAtFault);
62         iv_mdmtFaultList[i_topology] = i_chipAtFault;
63     }
64 
65     /**
66      * @param  i_topology Target topology.
67      * @return The MDMT chip for this topology, if at fault. Otherwise, nullptr.
68      */
getMdmtFault(Topology i_topology)69     pdbg_target* getMdmtFault(Topology i_topology)
70     {
71         return iv_mdmtFaultList[i_topology];
72     }
73 
74     /**
75      * @brief Indicates the given chip has an internal fault.
76      * @param i_topology    Target topology.
77      * @param i_chipAtFault The chip reporting a step check fault.
78      */
setInternalFault(Topology i_topology,pdbg_target * i_chipAtFault)79     void setInternalFault(Topology i_topology, pdbg_target* i_chipAtFault)
80     {
81         assert(nullptr != i_chipAtFault);
82         iv_internalFaultList[i_topology].push_back(i_chipAtFault);
83     }
84 
85     /**
86      * @param  i_topology Target topology.
87      * @return The list of all chips with internal faults.
88      */
getInteralFaults(Topology i_topology)89     const std::vector<pdbg_target*>& getInteralFaults(Topology i_topology)
90     {
91         return iv_internalFaultList[i_topology];
92     }
93 
94     /**
95      * @brief Indicates the given non-MDMT chip has seen a fault in the TOD
96      *        network.
97      * @param i_topology          Target topology.
98      * @param i_chipSourcingClock The chip sourcing the clock for the chip at
99      *                            fault.
100      * @param i_chipAtFault       The chip reporting the fault.
101      */
setNetworkFault(Topology i_topology,pdbg_target * i_chipSourcingClock,pdbg_target * i_chipAtFault)102     void setNetworkFault(Topology i_topology, pdbg_target* i_chipSourcingClock,
103                          pdbg_target* i_chipAtFault)
104     {
105         assert(nullptr != i_chipSourcingClock);
106         iv_networkFaultList[i_topology].push_back(i_chipSourcingClock);
107 
108         assert(nullptr != i_chipAtFault);
109         iv_networkFaultList[i_topology].push_back(i_chipAtFault);
110     }
111 
112     /**
113      * @param  i_topology Target topology.
114      * @return The list of all chips sourcing the clocks for the non-MDMT chips
115      *         with step check faults.
116      */
getNetworkFaults(Topology i_topology)117     const std::vector<pdbg_target*>& getNetworkFaults(Topology i_topology)
118     {
119         return iv_networkFaultList[i_topology];
120     }
121 };
122 
/** Addresses of the TOD registers this plugin reads via SCOM, listed in
 *  ascending address order. */
enum class Register
{
    // Port control registers, one per configuration (primary/secondary) and
    // port (0/1).
    TOD_PRI_PORT_0_CTRL = 0x00040001,
    TOD_PRI_PORT_1_CTRL = 0x00040002,
    TOD_SEC_PORT_0_CTRL = 0x00040003,
    TOD_SEC_PORT_1_CTRL = 0x00040004,

    // Status and error registers.
    TOD_PSS_MSS_STATUS = 0x00040008,
    TOD_ERROR = 0x00040030,
};
132 
readRegister(pdbg_target * i_chip,Register i_addr,libhei::BitStringBuffer & o_val)133 bool readRegister(pdbg_target* i_chip, Register i_addr,
134                   libhei::BitStringBuffer& o_val)
135 {
136     assert(64 == o_val.getBitLen());
137 
138     uint64_t scomValue;
139     if (util::pdbg::getScom(i_chip, static_cast<uint64_t>(i_addr), scomValue))
140     {
141         trace::err("Register read failed: addr=0x%08x chip=%s",
142                    static_cast<uint64_t>(i_addr), util::pdbg::getPath(i_chip));
143         return true; // SCOM failed
144     }
145 
146     o_val.setFieldRight(0, 64, scomValue);
147 
148     return false; // no failures
149 }
150 
getChipSourcingClock(pdbg_target * i_chipReportingError,unsigned int i_iohsPos)151 pdbg_target* getChipSourcingClock(pdbg_target* i_chipReportingError,
152                                   unsigned int i_iohsPos)
153 {
154     using namespace util::pdbg;
155 
156     pdbg_target* chipSourcingClock = nullptr;
157 
158     // Given the chip reporting the error and the IOHS position within that
159     // chip, we must get
160     //  - The associated IOHS target on this chip.
161     //  - Next, the IOHS target on the other side of the bus.
162     //  - Finally, the chip containing the IOHS target on the other side of the
163     //    bus.
164 
165     auto iohsUnit = getChipUnit(i_chipReportingError, TYPE_IOHS, i_iohsPos);
166     if (nullptr != iohsUnit)
167     {
168         auto clockSourceUnit =
169             getConnectedTarget(iohsUnit, callout::BusType::SMP_BUS);
170         if (nullptr != clockSourceUnit)
171         {
172             chipSourcingClock = getParentChip(clockSourceUnit);
173         }
174     }
175 
176     return chipSourcingClock;
177 }
178 
179 /**
180  * @brief Collects TOD fault data for each processor chip.
181  */
collectTodFaultData(pdbg_target * i_chip,Data & o_data)182 void collectTodFaultData(pdbg_target* i_chip, Data& o_data)
183 {
184     // TODO: We should use a register cache captured by the isolator so that
185     //       this code is using the same values the isolator used.  However, at
186     //       the moment the isolator does not have a register cache. Instead,
187     //       we'll have to manually SCOM the registers we need.  Fortunately,
188     //       for a checkstop attention the hardware should freeze and the
189     //       values will never change. Unfortunately, we don't have that same
190     //       guarantee for TIs, but at the time of this writing, all TOD errors
191     //       will trigger a checkstop attention away. So the TI case is not as
192     //       important.
193 
194     libhei::BitStringBuffer errorReg{64};
195     if (readRegister(i_chip, Register::TOD_ERROR, errorReg))
196     {
197         return; // cannot continue on this chip
198     }
199 
200     libhei::BitStringBuffer statusReg{64};
201     if (readRegister(i_chip, Register::TOD_PSS_MSS_STATUS, statusReg))
202     {
203         return; // cannot continue on this chip
204     }
205 
206     // Determine which topology is configured primary or secondary.
207     std::map<Topology, Configuration> topConfig;
208 
209     if (0 == statusReg.getFieldRight(0, 3))
210     {
211         // TOD_PSS_MSS_STATUS[0:2] == 0b000 means active topology is primary.
212         topConfig[Topology::ACTIVE] = Configuration::PRIMARY;
213         topConfig[Topology::BACKUP] = Configuration::SECONDARY;
214     }
215     else
216     {
217         // TOD_PSS_MSS_STATUS[0:2] == 0b111 means active topology is secondary.
218         topConfig[Topology::ACTIVE] = Configuration::SECONDARY;
219         topConfig[Topology::BACKUP] = Configuration::PRIMARY;
220     }
221 
222     for (const auto top : {Topology::ACTIVE, Topology::BACKUP})
223     {
224         // Bit positions in some registers are dependent on this topology's
225         // configuration.
226         bool isPriTop = (Configuration::PRIMARY == topConfig[top]);
227 
228         // Determine if this is the MDMT chip.
229         bool isMasterTod = statusReg.isBitSet(isPriTop ? 13 : 17);
230         bool isMasterDrawer = statusReg.isBitSet(isPriTop ? 14 : 18);
231 
232         if (isMasterDrawer && isMasterTod)
233         {
234             // The master path selects are sourced from the oscilator reference
235             // clocks. So, we'll need to determine which one was used at the
236             // time of the failure.
237             auto masterPathSelect =
238                 statusReg.getFieldRight(isPriTop ? 12 : 16, 1);
239 
240             // Determine if there is a step check fault for this path select.
241             if (errorReg.isBitSet((0 == masterPathSelect) ? 14 : 15))
242             {
243                 trace::inf(
244                     "TOD MDMT fault found: top=%u config=%u path=%u chip=%s",
245                     static_cast<unsigned int>(top),
246                     static_cast<unsigned int>(topConfig[top]), masterPathSelect,
247                     util::pdbg::getPath(i_chip));
248 
249                 o_data.setMdmtFault(top, i_chip);
250             }
251         }
252         else // not the MDMT on this topology
253         {
254             // The slave path selects are sourced from other processor chips.
255             // So, we'll need to determine which one was used at the time of the
256             // failure.
257             auto slavePathSelect =
258                 statusReg.getFieldRight(isPriTop ? 15 : 19, 1);
259 
260             // Determine if there is a step check fault for this path select.
261             if (errorReg.isBitSet((0 == slavePathSelect) ? 16 : 21))
262             {
263                 // Get the IOHS unit position on this chip that is connected to
264                 // the clock source chip.
265                 auto addr = (0 == slavePathSelect)
266                                 ? (isPriTop ? Register::TOD_PRI_PORT_0_CTRL
267                                             : Register::TOD_SEC_PORT_0_CTRL)
268                                 : (isPriTop ? Register::TOD_PRI_PORT_1_CTRL
269                                             : Register::TOD_SEC_PORT_1_CTRL);
270 
271                 libhei::BitStringBuffer portCtrl{64};
272                 if (readRegister(i_chip, addr, portCtrl))
273                 {
274                     continue; // try the other topology
275                 }
276 
277                 auto iohsPos = portCtrl.getFieldRight(0, 3);
278                 auto chipSourcingClock = getChipSourcingClock(i_chip, iohsPos);
279 
280                 if (nullptr != chipSourcingClock)
281                 {
282                     trace::inf("TOD network fault found: top=%u config=%u "
283                                "path=%u chip=%s iohs=%u clockSrc=%s",
284                                static_cast<unsigned int>(top),
285                                static_cast<unsigned int>(topConfig[top]),
286                                slavePathSelect, util::pdbg::getPath(i_chip),
287                                iohsPos, util::pdbg::getPath(chipSourcingClock));
288 
289                     o_data.setNetworkFault(top, chipSourcingClock, i_chip);
290                 }
291             }
292         }
293 
294         // Check for any internal path errors in the active topology only.
295         if (Topology::ACTIVE == top && errorReg.isBitSet(17))
296         {
297             trace::inf("TOD internal fault found: top=%u config=%u chip=%s",
298                        static_cast<unsigned int>(top),
299                        static_cast<unsigned int>(topConfig[top]),
300                        util::pdbg::getPath(i_chip));
301 
302             o_data.setInternalFault(top, i_chip);
303         }
304     }
305 }
306 
307 } // namespace tod
308 
309 /**
310  * @brief Handles TOD step check fault attentions.
311  */
tod_step_check_fault(unsigned int,const libhei::Chip & i_chip,ServiceData & io_servData)312 void tod_step_check_fault(unsigned int, const libhei::Chip& i_chip,
313                           ServiceData& io_servData)
314 {
315     // Query hardware for TOD fault data from all active processors.
316     tod::Data data{};
317     std::vector<pdbg_target*> chipList;
318     util::pdbg::getActiveProcessorChips(chipList);
319     for (const auto& chip : chipList)
320     {
321         tod::collectTodFaultData(chip, data);
322     }
323 
324     // For each topology:
325     //  - First, check if the MDMT chip is reporting a fault. If so, it is
326     //    likely that any downstream step check faults are due to the fault in
327     //    the MDMT.
328     //  - If MDMT is not reporting a fault, look for any network path errors
329     //    from the non-MDMT chips. In which case, we will want to call out all
330     //    of the chips sourcing those step check errors (not the chips reporting
331     //    them).
332     //  - If no other errors found, callout any chips reporting internal step
333     //    check faults.
334 
335     bool calloutsMade = false; // need to keep track for default case.
336 
337     for (const auto top : {tod::Topology::ACTIVE, tod::Topology::BACKUP})
338     {
339         auto mdmtFault = data.getMdmtFault(top);
340         auto internalFaults = data.getInteralFaults(top);
341         auto networkFaults = data.getNetworkFaults(top);
342 
343         if (nullptr != mdmtFault) // MDMT fault
344         {
345             calloutsMade = true;
346 
347             // Callout the TOD clock (guard).
348             io_servData.calloutClock(callout::ClockType::TOD_CLOCK,
349                                      callout::Priority::MED, true);
350 
351             // Callout the MDMT chip (no guard to avoid fatal guard on primary
352             // processor when the error could be anywhere in between).
353             io_servData.calloutTarget(mdmtFault, callout::Priority::MED, false);
354 
355             // Callout everything in between.
356             // TODO: This isn't necessary for now because the clock callout is
357             //       the backplane. However, we may need a procedure callout
358             //       for future systems.
359         }
360         else if (!networkFaults.empty()) // network path faults
361         {
362             calloutsMade = true;
363 
364             // Callout all chips with network errors (no guard to avoid fatal
365             // guard on primary processor when the error could be anywhere in
366             // between).
367             for (const auto& chip : networkFaults)
368             {
369                 io_servData.calloutTarget(chip, callout::Priority::MED, false);
370             }
371         }
372         else if (!internalFaults.empty()) // interal path faults
373         {
374             calloutsMade = true;
375 
376             // Callout all chips with internal errors (guard because error is
377             // isolated to this processor).
378             for (const auto& chip : internalFaults)
379             {
380                 io_servData.calloutTarget(chip, callout::Priority::MED, true);
381             }
382         }
383     }
384 
385     // If no callouts are made, default to calling out the chip that reported
386     // the original attention.
387     if (!calloutsMade)
388     {
389         io_servData.calloutTarget(util::pdbg::getTrgt(i_chip),
390                                   callout::Priority::MED, true);
391     }
392 }
393 
394 } // namespace P10
395 
// Make P10::tod_step_check_fault available as a plugin under both the P10_10
// and P10_20 identifiers.
// NOTE(review): presumably these identifiers select the chip models/revisions
// the plugin applies to; confirm against the PLUGIN_DEFINE_NS definition in
// analyzer/plugins/plugin.hpp.
PLUGIN_DEFINE_NS(P10_10, P10, tod_step_check_fault);
PLUGIN_DEFINE_NS(P10_20, P10, tod_step_check_fault);
398 
399 } // namespace analyzer
400