1 
2 #include <analyzer/plugins/plugin.hpp>
3 #include <hei_main.hpp>
4 #include <util/pdbg.hpp>
5 #include <util/trace.hpp>
6 
7 namespace analyzer
8 {
9 
10 namespace P10
11 {
12 
13 namespace tod
14 {
15 
/**
 * @brief The two TOD topologies every chip is connected to.
 *
 * The numeric values matter: some registers and documentation refer to a
 * topology by number rather than by name, and the values can also serve as
 * array indexes when needed.
 */
enum class Topology : int
{
    ACTIVE = 0,
    BACKUP = 1,
};
25 
/**
 * @brief The role a topology can be configured for: primary or secondary.
 */
enum class Configuration : int
{
    PRIMARY   = 0,
    SECONDARY = 1,
};
32 
33 class Data
34 {
35   public:
36     Data()            = default;
37     ~Data()           = default;
38     Data(const Data&) = default;
39     Data(Data&&)      = default;
40     Data& operator=(const Data&) = default;
41     Data& operator=(Data&&) = default;
42 
43   private:
44     /** The MDMT chips at fault (only one per topology). */
45     std::map<Topology, pdbg_target*> iv_mdmtFaultList;
46 
47     /** All chips with internal path faults. */
48     std::map<Topology, std::vector<pdbg_target*>> iv_internalFaultList;
49 
50     /** The chips sourcing the clocks to non-MDMT chips with faults. */
51     std::map<Topology, std::vector<pdbg_target*>> iv_networkFaultList;
52 
53   public:
54     /**
55      * @brief Sets this chip as the MDMT at fault for this topology.
56      * @param i_topology    Target topology.
57      * @param i_chipAtFault The chip reporting step check fault.
58      */
59     void setMdmtFault(Topology i_topology, pdbg_target* i_chipAtFault)
60     {
61         assert(nullptr != i_chipAtFault);
62         iv_mdmtFaultList[i_topology] = i_chipAtFault;
63     }
64 
65     /**
66      * @param  i_topology Target topology.
67      * @return The MDMT chip for this topology, if at fault. Otherwise, nullptr.
68      */
69     pdbg_target* getMdmtFault(Topology i_topology)
70     {
71         return iv_mdmtFaultList[i_topology];
72     }
73 
74     /**
75      * @brief Indicates the given chip has an internal fault.
76      * @param i_topology    Target topology.
77      * @param i_chipAtFault The chip reporting a step check fault.
78      */
79     void setInternalFault(Topology i_topology, pdbg_target* i_chipAtFault)
80     {
81         assert(nullptr != i_chipAtFault);
82         iv_internalFaultList[i_topology].push_back(i_chipAtFault);
83     }
84 
85     /**
86      * @param  i_topology Target topology.
87      * @return The list of all chips with internal faults.
88      */
89     const std::vector<pdbg_target*>& getInteralFaults(Topology i_topology)
90     {
91         return iv_internalFaultList[i_topology];
92     }
93 
94     /**
95      * @brief Indicates the given non-MDMT chip has seen a fault in the TOD
96      *        network.
97      * @param i_topology          Target topology.
98      * @param i_chipSourcingClock The chip sourcing the clock for the chip at
99      *                            fault. This is NOT the chip at fault.
100      */
101     void setNetworkFault(Topology i_topology, pdbg_target* i_chipSourcingClock)
102     {
103         assert(nullptr != i_chipSourcingClock);
104         iv_networkFaultList[i_topology].push_back(i_chipSourcingClock);
105     }
106 
107     /**
108      * @param  i_topology Target topology.
109      * @return The list of all chips sourcing the clocks for the non-MDMT chips
110      *         with step check faults.
111      */
112     const std::vector<pdbg_target*>& getNetworkFaults(Topology i_topology)
113     {
114         return iv_networkFaultList[i_topology];
115     }
116 };
117 
/**
 * @brief Addresses of the TOD registers accessed in this file.
 *
 * Each enumerator's value is the register address itself, so converting an
 * enumerator with static_cast yields the numeric address. Enumerators are
 * listed in ascending address order.
 */
enum class Register
{
    // Port control registers for the primary and secondary configurations.
    TOD_PRI_PORT_0_CTRL = 0x00040001,
    TOD_PRI_PORT_1_CTRL = 0x00040002,
    TOD_SEC_PORT_0_CTRL = 0x00040003,
    TOD_SEC_PORT_1_CTRL = 0x00040004,

    // Status and error reporting registers.
    TOD_PSS_MSS_STATUS = 0x00040008,
    TOD_ERROR          = 0x00040030,
};
127 
128 bool readRegister(pdbg_target* i_chip, Register i_addr,
129                   libhei::BitStringBuffer& o_val)
130 {
131     assert(64 == o_val.getBitLen());
132 
133     uint64_t scomValue;
134     if (util::pdbg::getScom(i_chip, static_cast<uint64_t>(i_addr), scomValue))
135     {
136         trace::err("Register read failed: addr=0x%08x chip=%s",
137                    static_cast<uint64_t>(i_addr), util::pdbg::getPath(i_chip));
138         return true; // SCOM failed
139     }
140 
141     o_val.setFieldRight(0, 64, scomValue);
142 
143     return false; // no failures
144 }
145 
146 pdbg_target* getChipSourcingClock(pdbg_target* i_chipReportingError,
147                                   unsigned int i_iohsPos)
148 {
149     using namespace util::pdbg;
150 
151     pdbg_target* chipSourcingClock = nullptr;
152 
153     // Given the chip reporting the error and the IOHS position within that
154     // chip, we must get
155     //  - The associated IOHS target on this chip.
156     //  - Next, the IOHS target on the other side of the bus.
157     //  - Finally, the chip containing the IOHS target on the other side of the
158     //    bus.
159 
160     auto iohsUnit = getChipUnit(i_chipReportingError, TYPE_IOHS, i_iohsPos);
161     if (nullptr != iohsUnit)
162     {
163         auto clockSourceUnit =
164             getConnectedTarget(iohsUnit, callout::BusType::SMP_BUS);
165         if (nullptr != clockSourceUnit)
166         {
167             chipSourcingClock = getParentChip(clockSourceUnit);
168         }
169     }
170 
171     return chipSourcingClock;
172 }
173 
174 /**
175  * @brief Collects TOD fault data for each processor chip.
176  */
177 void collectTodFaultData(pdbg_target* i_chip, Data& o_data)
178 {
179     // TODO: We should use a register cache captured by the isolator so that
180     //       this code is using the same values the isolator used.  However, at
181     //       the moment the isolator does not have a register cache. Instead,
182     //       we'll have to manually SCOM the registers we need.  Fortunately,
183     //       for a checkstop attention the hardware should freeze and the
184     //       values will never change. Unfortunately, we don't have that same
185     //       guarantee for TIs, but at the time of this writing, all TOD errors
186     //       will trigger a checkstop attention away. So the TI case is not as
187     //       important.
188 
189     libhei::BitStringBuffer errorReg{64};
190     if (readRegister(i_chip, Register::TOD_ERROR, errorReg))
191     {
192         return; // cannot continue on this chip
193     }
194 
195     libhei::BitStringBuffer statusReg{64};
196     if (readRegister(i_chip, Register::TOD_PSS_MSS_STATUS, statusReg))
197     {
198         return; // cannot continue on this chip
199     }
200 
201     // Determine which topology is configured primary or secondary.
202     std::map<Topology, Configuration> topConfig;
203 
204     if (0 == statusReg.getFieldRight(0, 3))
205     {
206         // TOD_PSS_MSS_STATUS[0:2] == 0b000 means active topology is primary.
207         topConfig[Topology::ACTIVE] = Configuration::PRIMARY;
208         topConfig[Topology::BACKUP] = Configuration::SECONDARY;
209     }
210     else
211     {
212         // TOD_PSS_MSS_STATUS[0:2] == 0b111 means active topology is secondary.
213         topConfig[Topology::ACTIVE] = Configuration::SECONDARY;
214         topConfig[Topology::BACKUP] = Configuration::PRIMARY;
215     }
216 
217     for (const auto top : {Topology::ACTIVE, Topology::BACKUP})
218     {
219         // Bit positions in some registers are dependent on this topology's
220         // configuration.
221         bool isPriTop = (Configuration::PRIMARY == topConfig[top]);
222 
223         // Determine if this is the MDMT chip.
224         bool isMasterTod    = statusReg.isBitSet(isPriTop ? 13 : 17);
225         bool isMasterDrawer = statusReg.isBitSet(isPriTop ? 14 : 18);
226 
227         if (isMasterDrawer && isMasterTod)
228         {
229             // The master path selects are sourced from the oscilator reference
230             // clocks. So, we'll need to determine which one was used at the
231             // time of the failure.
232             auto masterPathSelect =
233                 statusReg.getFieldRight(isPriTop ? 12 : 16, 1);
234 
235             // Determine if there is a step check fault for this path select.
236             if (errorReg.isBitSet((0 == masterPathSelect) ? 14 : 15))
237             {
238                 trace::inf(
239                     "TOD MDMT fault found: top=%u config=%u path=%u chip=%s",
240                     static_cast<unsigned int>(top),
241                     static_cast<unsigned int>(topConfig[top]), masterPathSelect,
242                     util::pdbg::getPath(i_chip));
243 
244                 o_data.setMdmtFault(top, i_chip);
245             }
246         }
247         else // not the MDMT on this topology
248         {
249             // The slave path selects are sourced from other processor chips.
250             // So, we'll need to determine which one was used at the time of the
251             // failure.
252             auto slavePathSelect =
253                 statusReg.getFieldRight(isPriTop ? 15 : 19, 1);
254 
255             // Determine if there is a step check fault for this path select.
256             if (errorReg.isBitSet((0 == slavePathSelect) ? 16 : 21))
257             {
258                 // Get the IOHS unit position on this chip that is connected to
259                 // the clock source chip.
260                 auto addr = (0 == slavePathSelect)
261                                 ? (isPriTop ? Register::TOD_PRI_PORT_0_CTRL
262                                             : Register::TOD_SEC_PORT_0_CTRL)
263                                 : (isPriTop ? Register::TOD_PRI_PORT_1_CTRL
264                                             : Register::TOD_SEC_PORT_1_CTRL);
265 
266                 libhei::BitStringBuffer portCtrl{64};
267                 if (readRegister(i_chip, addr, portCtrl))
268                 {
269                     continue; // try the other topology
270                 }
271 
272                 auto iohsPos           = portCtrl.getFieldRight(0, 3);
273                 auto chipSourcingClock = getChipSourcingClock(i_chip, iohsPos);
274 
275                 if (nullptr != chipSourcingClock)
276                 {
277                     trace::inf("TOD network fault found: top=%u config=%u "
278                                "path=%u chip=%s iohs=%u clockSrc=%s",
279                                static_cast<unsigned int>(top),
280                                static_cast<unsigned int>(topConfig[top]),
281                                slavePathSelect, util::pdbg::getPath(i_chip),
282                                iohsPos, util::pdbg::getPath(chipSourcingClock));
283 
284                     o_data.setNetworkFault(top, chipSourcingClock);
285                 }
286             }
287         }
288 
289         // Check for any internal path errors in the active topology only.
290         if (Topology::ACTIVE == top && errorReg.isBitSet(17))
291         {
292             trace::inf("TOD internal fault found: top=%u config=%u chip=%s",
293                        static_cast<unsigned int>(top),
294                        static_cast<unsigned int>(topConfig[top]),
295                        util::pdbg::getPath(i_chip));
296 
297             o_data.setInternalFault(top, i_chip);
298         }
299     }
300 }
301 
302 } // namespace tod
303 
304 /**
305  * @brief Handles TOD step check fault attentions.
306  */
307 void tod_step_check_fault(unsigned int, const libhei::Chip& i_chip,
308                           ServiceData& io_servData)
309 {
310     // Query hardware for TOD fault data from all active processors.
311     tod::Data data{};
312     std::vector<pdbg_target*> chipList;
313     util::pdbg::getActiveProcessorChips(chipList);
314     for (const auto& chip : chipList)
315     {
316         tod::collectTodFaultData(chip, data);
317     }
318 
319     // For each topology:
320     //  - First, check if the MDMT chip is reporting a fault. If so, it is
321     //    likely that any downstream step check faults are due to the fault in
322     //    the MDMT.
323     //  - If MDMT is not reporting a fault, look for any network path errors
324     //    from the non-MDMT chips. In which case, we will want to call out all
325     //    of the chips sourcing those step check errors (not the chips reporting
326     //    them).
327     //  - If no other errors found, callout any chips reporting internal step
328     //    check faults.
329 
330     bool calloutsMade = false; // need to keep track for default case.
331 
332     for (const auto top : {tod::Topology::ACTIVE, tod::Topology::BACKUP})
333     {
334         auto mdmtFault      = data.getMdmtFault(top);
335         auto internalFaults = data.getInteralFaults(top);
336         auto networkFaults  = data.getNetworkFaults(top);
337 
338         if (nullptr != mdmtFault) // MDMT fault
339         {
340             calloutsMade = true;
341 
342             // Callout the TOD clock (guard).
343             io_servData.calloutClock(callout::ClockType::TOD_CLOCK,
344                                      callout::Priority::MED, true);
345 
346             // Callout the MDMT chip (no guard).
347             io_servData.calloutTarget(mdmtFault, callout::Priority::MED, true);
348 
349             // Callout everything in between.
350             // TODO: This isn't necessary for now because the clock callout is
351             //       the backplane. However, we may need a procedure callout
352             //       for future systems.
353         }
354         else if (!networkFaults.empty()) // network path faults
355         {
356             calloutsMade = true;
357 
358             // Callout all chips with network errors (guard).
359             for (const auto& chip : networkFaults)
360             {
361                 io_servData.calloutTarget(chip, callout::Priority::MED, true);
362             }
363         }
364         else if (!internalFaults.empty()) // interal path faults
365         {
366             calloutsMade = true;
367 
368             // Callout all chips with internal errors (guard).
369             for (const auto& chip : internalFaults)
370             {
371                 io_servData.calloutTarget(chip, callout::Priority::MED, true);
372             }
373         }
374     }
375 
376     // If no callouts are made, default to calling out the chip that reported
377     // the original attention.
378     if (!calloutsMade)
379     {
380         io_servData.calloutTarget(util::pdbg::getTrgt(i_chip),
381                                   callout::Priority::MED, true);
382     }
383 }
384 
385 } // namespace P10
386 
// Register P10::tod_step_check_fault() as a plugin under both the P10_10 and
// the P10_20 identifiers.
// NOTE(review): PLUGIN_DEFINE_NS is defined outside this file; P10_10/P10_20
// presumably identify the supported P10 chip levels -- confirm against
// plugin.hpp.
PLUGIN_DEFINE_NS(P10_10, P10, tod_step_check_fault);
PLUGIN_DEFINE_NS(P10_20, P10, tod_step_check_fault);
389 
390 } // namespace analyzer
391