1 #include <assert.h>
2 #include <unistd.h>
3
4 #include <analyzer/analyzer_main.hpp>
5 #include <analyzer/ras-data/ras-data-parser.hpp>
6 #include <analyzer/service_data.hpp>
7 #include <attn/attn_dump.hpp>
8 #include <hei_main.hpp>
9 #include <util/pdbg.hpp>
10 #include <util/trace.hpp>
11
12 namespace analyzer
13 {
14 //------------------------------------------------------------------------------
15
16 // Forward references for externally defined functions.
17
18 /**
 * @brief Will get the list of active chips and initialize the isolator.
20 * @param o_chips The returned list of active chips.
21 */
22 void initializeIsolator(std::vector<libhei::Chip>& o_chips);
23
24 /**
 * @brief Will attempt to find the root cause signature for the given analysis
 *        type using the data gathered during isolation.
26 * @param i_type The type of analysis to perform. See enum for details.
27 * @param i_isoData The data gathered during isolation (for FFDC).
28 * @param o_rootCause The returned root cause signature.
29 * @param i_rasData The RAS data parser.
30 * @return True, if root cause has been found. False, otherwise.
31 */
32 bool filterRootCause(AnalysisType i_type,
33 const libhei::IsolationData& i_isoData,
34 libhei::Signature& o_rootCause,
35 const RasDataParser& i_rasData);
36
37 /**
38 * @brief Will create and submit a PEL using the given data.
39 * @param i_servData Data regarding service actions gathered during analysis.
40 * @return The platform log ID. Will return zero if no PEL is generated.
41 */
42 uint32_t commitPel(const ServiceData& i_servData);
43
44 //------------------------------------------------------------------------------
45
__attn(libhei::AttentionType_t i_type)46 const char* __attn(libhei::AttentionType_t i_type)
47 {
48 const char* str = "";
49 switch (i_type)
50 {
51 case libhei::ATTN_TYPE_CHIP_CS:
52 str = "CHIP_CS";
53 break;
54 case libhei::ATTN_TYPE_UNIT_CS:
55 str = "UNIT_CS";
56 break;
57 case libhei::ATTN_TYPE_RECOVERABLE:
58 str = "RECOVERABLE";
59 break;
60 case libhei::ATTN_TYPE_SP_ATTN:
61 str = "SP_ATTN";
62 break;
63 case libhei::ATTN_TYPE_HOST_ATTN:
64 str = "HOST_ATTN";
65 break;
66 default:
67 trace::err("Unsupported attention type: %u", i_type);
68 assert(0);
69 }
70 return str;
71 }
72
73 //------------------------------------------------------------------------------
74
__analysisType(AnalysisType i_type)75 const char* __analysisType(AnalysisType i_type)
76 {
77 const char* str = "";
78 switch (i_type)
79 {
80 case AnalysisType::SYSTEM_CHECKSTOP:
81 str = "SYSTEM_CHECKSTOP";
82 break;
83 case AnalysisType::TERMINATE_IMMEDIATE:
84 str = "TERMINATE_IMMEDIATE";
85 break;
86 case AnalysisType::MANUAL:
87 str = "MANUAL";
88 break;
89 default:
90 trace::err("Unsupported analysis type: %u", i_type);
91 assert(0);
92 }
93 return str;
94 }
95
96 //------------------------------------------------------------------------------
97
analyzeHardware(AnalysisType i_type,attn::DumpParameters & o_dump)98 uint32_t analyzeHardware(AnalysisType i_type, attn::DumpParameters& o_dump)
99 {
100 uint32_t o_plid = 0; // default, zero indicates PEL was not created
101
102 if (!util::pdbg::queryHardwareAnalysisSupported())
103 {
104 trace::err("Hardware error analysis is not supported on this system");
105 return o_plid;
106 }
107
108 trace::inf(">>> enter analyzeHardware(%s)", __analysisType(i_type));
109
110 // Initialize the isolator and get all of the chips to be analyzed.
111 trace::inf("Initializing the isolator...");
112 std::vector<libhei::Chip> chips;
113 initializeIsolator(chips);
114
115 // Isolate attentions.
116 trace::inf("Isolating errors: # of chips=%u", chips.size());
117 libhei::IsolationData isoData{};
118 libhei::isolate(chips, isoData);
119
120 // For debug, trace out the original list of signatures before filtering.
121 for (const auto& sig : isoData.getSignatureList())
122 {
123 trace::inf("Signature: %s 0x%0" PRIx32 " %s",
124 util::pdbg::getPath(sig.getChip()), sig.toUint32(),
125 __attn(sig.getAttnType()));
126 }
127
128 // Filter for root cause attention.
129 libhei::Signature rootCause{};
130 RasDataParser rasData{};
131 bool attnFound = false;
132 try
133 {
134 attnFound = filterRootCause(i_type, isoData, rootCause, rasData);
135 }
136 catch (const std::exception& e)
137 {
138 trace::err("Exception caught during root cause filtering");
139 trace::err(e.what());
140 attnFound = false; // just in case
141 }
142
143 // If a root cause attention was found, or if this was a system checkstop,
144 // generate a PEL.
145 if (attnFound || AnalysisType::SYSTEM_CHECKSTOP == i_type)
146 {
147 if (attnFound)
148 {
149 trace::inf("Root cause attention: %s 0x%0" PRIx32 " %s",
150 util::pdbg::getPath(rootCause.getChip()),
151 rootCause.toUint32(), __attn(rootCause.getAttnType()));
152 }
153 else
154 {
155 // This is bad. Analysis should have found a root cause attention
156 // for a system checkstop. Issues could range from code bugs to SCOM
157 // errors. Regardless, generate a PEL with FFDC to assist with
158 // debug.
159 trace::err("System checkstop with no root cause attention");
160 rootCause = libhei::Signature{}; // just in case
161 }
162
163 // Start building the service data.
164 ServiceData servData{rootCause, i_type, isoData};
165
166 // Apply any service actions, if needed. Note that there are no
167 // resolutions for manual analysis.
168 if (AnalysisType::MANUAL != i_type)
169 {
170 if (attnFound)
171 {
172 try
173 {
174 // Resolve the root cause attention.
175 rasData.getResolution(rootCause)->resolve(servData);
176 }
177 catch (const std::exception& e)
178 {
179 trace::err("Exception caught during root cause analysis");
180 trace::err(e.what());
181
182 // We'll still want to create a PEL for the FFDC, but
183 // since the analysis failed, we need to callout Level 2
184 // Support.
185 servData.calloutProcedure(callout::Procedure::NEXTLVL,
186 callout::Priority::HIGH);
187 }
188 }
189 else
190 {
191 // Analysis failed so callout the Level 2 Support.
192 servData.calloutProcedure(callout::Procedure::NEXTLVL,
193 callout::Priority::HIGH);
194 }
195 }
196
197 // Create and commit a PEL.
198 o_plid = commitPel(servData);
199
200 if (0 == o_plid)
201 {
202 trace::err("Failed to create PEL");
203 }
204 else
205 {
206 trace::inf("PEL created: PLID=0x%0" PRIx32, o_plid);
207
208 // Gather/return information needed for dump. A hardware dump will
209 // always be used for system checkstop attenions. Software dumps
210 // will be reserved for MP-IPLs during TI analysis.
211 // TODO: Need ID from root cause. At the moment, HUID does not exist
212 // in devtree. Will need a better ID definition.
213 o_dump.unitId = 0;
214 o_dump.dumpType = attn::DumpType::Hardware;
215 }
216 }
217 else
218 {
219 // It is possible for TI handling, or manually initiated analysis via
220 // the command line, that there will not be an active attention. In
221 // which case, we will do nothing and let the caller of this function
222 // determine if this is the expected behavior.
223 trace::inf("No active attentions found");
224 }
225
226 // All done, clean up the isolator.
227 trace::inf("Uninitializing isolator...");
228 libhei::uninitialize();
229
230 trace::inf("<<< exit analyzeHardware()");
231
232 return o_plid;
233 }
234
235 //------------------------------------------------------------------------------
236
237 } // namespace analyzer
238