1 #include <assert.h>
2 #include <unistd.h>
3 
4 #include <analyzer/analyzer_main.hpp>
5 #include <analyzer/ras-data/ras-data-parser.hpp>
6 #include <analyzer/service_data.hpp>
7 #include <attn/attn_dump.hpp>
8 #include <hei_main.hpp>
9 #include <util/pdbg.hpp>
10 #include <util/trace.hpp>
11 
12 namespace analyzer
13 {
14 
15 //------------------------------------------------------------------------------
16 
17 // Forward references for externally defined functions.
18 
19 /**
 * @brief Will get the list of active chips and initialize the isolator.
21  * @param o_chips The returned list of active chips.
22  */
23 void initializeIsolator(std::vector<libhei::Chip>& o_chips);
24 
25 /**
 * @brief  Will filter the isolation data for the root cause attention.
27  * @param  i_type      The type of analysis to perform. See enum for details.
28  * @param  i_isoData   The data gathered during isolation (for FFDC).
29  * @param  o_rootCause The returned root cause signature.
30  * @return True, if root cause has been found. False, otherwise.
31  */
32 bool filterRootCause(AnalysisType i_type,
33                      const libhei::IsolationData& i_isoData,
34                      libhei::Signature& o_rootCause);
35 
36 /**
37  * @brief Will create and submit a PEL using the given data.
38  * @param i_servData  Data regarding service actions gathered during analysis.
39  * @return The platform log ID. Will return zero if no PEL is generated.
40  */
41 uint32_t createPel(const ServiceData& i_servData);
42 
43 //------------------------------------------------------------------------------
44 
45 const char* __attn(libhei::AttentionType_t i_type)
46 {
47     const char* str = "";
48     switch (i_type)
49     {
50         case libhei::ATTN_TYPE_CHECKSTOP:
51             str = "CHECKSTOP";
52             break;
53         case libhei::ATTN_TYPE_UNIT_CS:
54             str = "UNIT_CS";
55             break;
56         case libhei::ATTN_TYPE_RECOVERABLE:
57             str = "RECOVERABLE";
58             break;
59         case libhei::ATTN_TYPE_SP_ATTN:
60             str = "SP_ATTN";
61             break;
62         case libhei::ATTN_TYPE_HOST_ATTN:
63             str = "HOST_ATTN";
64             break;
65         default:
66             trace::err("Unsupported attention type: %u", i_type);
67             assert(0);
68     }
69     return str;
70 }
71 
72 //------------------------------------------------------------------------------
73 
74 const char* __analysisType(AnalysisType i_type)
75 {
76     const char* str = "";
77     switch (i_type)
78     {
79         case AnalysisType::SYSTEM_CHECKSTOP:
80             str = "SYSTEM_CHECKSTOP";
81             break;
82         case AnalysisType::TERMINATE_IMMEDIATE:
83             str = "TERMINATE_IMMEDIATE";
84             break;
85         case AnalysisType::MANUAL:
86             str = "MANUAL";
87             break;
88         default:
89             trace::err("Unsupported analysis type: %u", i_type);
90             assert(0);
91     }
92     return str;
93 }
94 
95 //------------------------------------------------------------------------------
96 
97 uint32_t analyzeHardware(AnalysisType i_type, attn::DumpParameters& o_dump)
98 {
99     uint32_t o_plid = 0; // default, zero indicates PEL was not created
100 
101     if (!util::pdbg::queryHardwareAnalysisSupported())
102     {
103         trace::err("Hardware error analysis is not supported on this system");
104         return o_plid;
105     }
106 
107     trace::inf(">>> enter analyzeHardware(%s)", __analysisType(i_type));
108 
109     // Initialize the isolator and get all of the chips to be analyzed.
110     trace::inf("Initializing the isolator...");
111     std::vector<libhei::Chip> chips;
112     initializeIsolator(chips);
113 
114     // Isolate attentions.
115     trace::inf("Isolating errors: # of chips=%u", chips.size());
116     libhei::IsolationData isoData{};
117     libhei::isolate(chips, isoData);
118 
119     // For debug, trace out the original list of signatures before filtering.
120     for (const auto& sig : isoData.getSignatureList())
121     {
122         trace::inf("Signature: %s 0x%0" PRIx32 " %s",
123                    util::pdbg::getPath(sig.getChip()), sig.toUint32(),
124                    __attn(sig.getAttnType()));
125     }
126 
127     // Filter for root cause attention.
128     libhei::Signature rootCause{};
129     bool attnFound = filterRootCause(i_type, isoData, rootCause);
130 
131     // If a root cause attention was found, or if this was a system checkstop,
132     // generate a PEL.
133     if (attnFound || AnalysisType::SYSTEM_CHECKSTOP == i_type)
134     {
135         if (attnFound)
136         {
137             trace::inf("Root cause attention: %s 0x%0" PRIx32 " %s",
138                        util::pdbg::getPath(rootCause.getChip()),
139                        rootCause.toUint32(), __attn(rootCause.getAttnType()));
140         }
141         else
142         {
143             // This is bad. Analysis should have found a root cause attention
144             // for a system checkstop. Issues could range from code bugs to SCOM
145             // errors. Regardless, generate a PEL with FFDC to assist with
146             // debug.
147             trace::err("System checkstop with no root cause attention");
148             rootCause = libhei::Signature{}; // just in case
149         }
150 
151         // Start building the service data.
152         ServiceData servData{rootCause, i_type, isoData};
153 
154         // Apply any service actions, if needed. Note that there are no
155         // resolutions for manual analysis.
156         if (AnalysisType::MANUAL != i_type)
157         {
158             if (attnFound)
159             {
160                 // Resolve the root cause attention.
161                 RasDataParser rasData{};
162                 rasData.getResolution(rootCause)->resolve(servData);
163             }
164             else
165             {
166                 // Analysis failed so apply the Level 2 Support resolution.
167                 servData.calloutProcedure(callout::Procedure::NEXTLVL,
168                                           callout::Priority::HIGH);
169             }
170         }
171 
172         // Create and commit a PEL.
173         o_plid = createPel(servData);
174 
175         if (0 == o_plid)
176         {
177             trace::err("Failed to create PEL");
178         }
179         else
180         {
181             trace::inf("PEL created: PLID=0x%0" PRIx32, o_plid);
182 
183             // Gather/return information needed for dump. A hardware dump will
184             // always be used for system checkstop attenions. Software dumps
185             // will be reserved for MP-IPLs during TI analysis.
186             // TODO: Need ID from root cause. At the moment, HUID does not exist
187             //       in devtree. Will need a better ID definition.
188             o_dump.unitId   = 0;
189             o_dump.dumpType = attn::DumpType::Hardware;
190         }
191     }
192     else
193     {
194         // It is possible for TI handling, or manually initiated analysis via
195         // the command line, that there will not be an active attention. In
196         // which case, we will do nothing and let the caller of this function
197         // determine if this is the expected behavior.
198         trace::inf("No active attentions found");
199     }
200 
201     // All done, clean up the isolator.
202     trace::inf("Uninitializing isolator...");
203     libhei::uninitialize();
204 
205     trace::inf("<<< exit analyzeHardware()");
206 
207     return o_plid;
208 }
209 
210 //------------------------------------------------------------------------------
211 
212 /**
213  * @brief Get error isolator build information
214  *
215  * @return Pointer to build information
216  */
217 const char* getBuildInfo()
218 {
219     return libhei::getBuildInfo();
220 }
221 
222 } // namespace analyzer
223