1 #include <assert.h>
2 #include <unistd.h>
3 
4 #include <analyzer/analyzer_main.hpp>
5 #include <analyzer/ras-data/ras-data-parser.hpp>
6 #include <analyzer/service_data.hpp>
7 #include <attn/attn_dump.hpp>
8 #include <hei_main.hpp>
9 #include <util/pdbg.hpp>
10 #include <util/trace.hpp>
11 
12 namespace analyzer
13 {
14 //------------------------------------------------------------------------------
15 
16 // Forward references for externally defined functions.
17 
18 /**
 * @brief Will get the list of active chips and initialize the isolator.
20  * @param o_chips The returned list of active chips.
21  */
22 void initializeIsolator(std::vector<libhei::Chip>& o_chips);
23 
24 /**
 * @brief  Will filter the data gathered during isolation for the root cause
 *         attention.
26  * @param  i_type      The type of analysis to perform. See enum for details.
27  * @param  i_isoData   The data gathered during isolation (for FFDC).
28  * @param  o_rootCause The returned root cause signature.
29  * @param  i_rasData   The RAS data parser.
30  * @return True, if root cause has been found. False, otherwise.
31  */
32 bool filterRootCause(AnalysisType i_type,
33                      const libhei::IsolationData& i_isoData,
34                      libhei::Signature& o_rootCause,
35                      const RasDataParser& i_rasData);
36 
37 /**
38  * @brief Will create and submit a PEL using the given data.
39  * @param i_servData  Data regarding service actions gathered during analysis.
40  * @return The platform log ID. Will return zero if no PEL is generated.
41  */
42 uint32_t commitPel(const ServiceData& i_servData);
43 
44 //------------------------------------------------------------------------------
45 
46 const char* __attn(libhei::AttentionType_t i_type)
47 {
48     const char* str = "";
49     switch (i_type)
50     {
51         case libhei::ATTN_TYPE_CHECKSTOP:
52             str = "CHECKSTOP";
53             break;
54         case libhei::ATTN_TYPE_UNIT_CS:
55             str = "UNIT_CS";
56             break;
57         case libhei::ATTN_TYPE_RECOVERABLE:
58             str = "RECOVERABLE";
59             break;
60         case libhei::ATTN_TYPE_SP_ATTN:
61             str = "SP_ATTN";
62             break;
63         case libhei::ATTN_TYPE_HOST_ATTN:
64             str = "HOST_ATTN";
65             break;
66         default:
67             trace::err("Unsupported attention type: %u", i_type);
68             assert(0);
69     }
70     return str;
71 }
72 
73 //------------------------------------------------------------------------------
74 
75 const char* __analysisType(AnalysisType i_type)
76 {
77     const char* str = "";
78     switch (i_type)
79     {
80         case AnalysisType::SYSTEM_CHECKSTOP:
81             str = "SYSTEM_CHECKSTOP";
82             break;
83         case AnalysisType::TERMINATE_IMMEDIATE:
84             str = "TERMINATE_IMMEDIATE";
85             break;
86         case AnalysisType::MANUAL:
87             str = "MANUAL";
88             break;
89         default:
90             trace::err("Unsupported analysis type: %u", i_type);
91             assert(0);
92     }
93     return str;
94 }
95 
96 //------------------------------------------------------------------------------
97 
98 uint32_t analyzeHardware(AnalysisType i_type, attn::DumpParameters& o_dump)
99 {
100     uint32_t o_plid = 0; // default, zero indicates PEL was not created
101 
102     if (!util::pdbg::queryHardwareAnalysisSupported())
103     {
104         trace::err("Hardware error analysis is not supported on this system");
105         return o_plid;
106     }
107 
108     trace::inf(">>> enter analyzeHardware(%s)", __analysisType(i_type));
109 
110     // Initialize the isolator and get all of the chips to be analyzed.
111     trace::inf("Initializing the isolator...");
112     std::vector<libhei::Chip> chips;
113     initializeIsolator(chips);
114 
115     // Isolate attentions.
116     trace::inf("Isolating errors: # of chips=%u", chips.size());
117     libhei::IsolationData isoData{};
118     libhei::isolate(chips, isoData);
119 
120     // For debug, trace out the original list of signatures before filtering.
121     for (const auto& sig : isoData.getSignatureList())
122     {
123         trace::inf("Signature: %s 0x%0" PRIx32 " %s",
124                    util::pdbg::getPath(sig.getChip()), sig.toUint32(),
125                    __attn(sig.getAttnType()));
126     }
127 
128     // Filter for root cause attention.
129     libhei::Signature rootCause{};
130     RasDataParser rasData{};
131     bool attnFound = filterRootCause(i_type, isoData, rootCause, rasData);
132 
133     // If a root cause attention was found, or if this was a system checkstop,
134     // generate a PEL.
135     if (attnFound || AnalysisType::SYSTEM_CHECKSTOP == i_type)
136     {
137         if (attnFound)
138         {
139             trace::inf("Root cause attention: %s 0x%0" PRIx32 " %s",
140                        util::pdbg::getPath(rootCause.getChip()),
141                        rootCause.toUint32(), __attn(rootCause.getAttnType()));
142         }
143         else
144         {
145             // This is bad. Analysis should have found a root cause attention
146             // for a system checkstop. Issues could range from code bugs to SCOM
147             // errors. Regardless, generate a PEL with FFDC to assist with
148             // debug.
149             trace::err("System checkstop with no root cause attention");
150             rootCause = libhei::Signature{}; // just in case
151         }
152 
153         // Start building the service data.
154         ServiceData servData{rootCause, i_type, isoData};
155 
156         // Apply any service actions, if needed. Note that there are no
157         // resolutions for manual analysis.
158         if (AnalysisType::MANUAL != i_type)
159         {
160             if (attnFound)
161             {
162                 try
163                 {
164                     // Resolve the root cause attention.
165                     rasData.getResolution(rootCause)->resolve(servData);
166                 }
167                 catch (const std::exception& e)
168                 {
169                     trace::err("Exception caught during root cause analysis");
170                     trace::err(e.what());
171 
172                     // We'll still want to create a PEL for the FFDC, but
173                     // since the analysis failed, we need to callout Level 2
174                     // Support.
175                     servData.calloutProcedure(callout::Procedure::NEXTLVL,
176                                               callout::Priority::HIGH);
177                 }
178             }
179             else
180             {
181                 // Analysis failed so callout the Level 2 Support.
182                 servData.calloutProcedure(callout::Procedure::NEXTLVL,
183                                           callout::Priority::HIGH);
184             }
185         }
186 
187         // Create and commit a PEL.
188         o_plid = commitPel(servData);
189 
190         if (0 == o_plid)
191         {
192             trace::err("Failed to create PEL");
193         }
194         else
195         {
196             trace::inf("PEL created: PLID=0x%0" PRIx32, o_plid);
197 
198             // Gather/return information needed for dump. A hardware dump will
199             // always be used for system checkstop attenions. Software dumps
200             // will be reserved for MP-IPLs during TI analysis.
201             // TODO: Need ID from root cause. At the moment, HUID does not exist
202             //       in devtree. Will need a better ID definition.
203             o_dump.unitId   = 0;
204             o_dump.dumpType = attn::DumpType::Hardware;
205         }
206     }
207     else
208     {
209         // It is possible for TI handling, or manually initiated analysis via
210         // the command line, that there will not be an active attention. In
211         // which case, we will do nothing and let the caller of this function
212         // determine if this is the expected behavior.
213         trace::inf("No active attentions found");
214     }
215 
216     // All done, clean up the isolator.
217     trace::inf("Uninitializing isolator...");
218     libhei::uninitialize();
219 
220     trace::inf("<<< exit analyzeHardware()");
221 
222     return o_plid;
223 }
224 
225 //------------------------------------------------------------------------------
226 
227 } // namespace analyzer
228