1 #include <assert.h>
2 #include <libpdbg.h>
3 #include <unistd.h>
4 
5 #include <hei_main.hpp>
6 #include <phosphor-logging/log.hpp>
7 #include <util/pdbg.hpp>
8 #include <util/trace.hpp>
9 
10 #include <algorithm>
11 #include <fstream>
12 #include <iostream>
13 #include <map>
14 #include <string>
15 
16 namespace analyzer
17 {
18 
19 //------------------------------------------------------------------------------
20 
21 // Forward references for externally defined functions.
22 
23 /**
 * @brief Will get the list of active chips and initialize the isolator.
25  * @param o_chips The returned list of active chips.
26  */
27 void initializeIsolator(std::vector<libhei::Chip>& o_chips);
28 
29 /**
30  * @brief Will create and submit a PEL using the given data.
31  * @param i_rootCause A signature defining the attention root cause.
32  * @param i_isoData   The data gathered during isolation (for FFDC).
33  */
34 void createPel(const libhei::Signature& i_rootCause,
35                const libhei::IsolationData& i_isoData);
36 
37 //------------------------------------------------------------------------------
38 
39 const char* __attn(libhei::AttentionType_t i_attnType)
40 {
41     const char* str = "";
42     switch (i_attnType)
43     {
44         case libhei::ATTN_TYPE_CHECKSTOP:
45             str = "CHECKSTOP";
46             break;
47         case libhei::ATTN_TYPE_UNIT_CS:
48             str = "UNIT_CS";
49             break;
50         case libhei::ATTN_TYPE_RECOVERABLE:
51             str = "RECOVERABLE";
52             break;
53         case libhei::ATTN_TYPE_SP_ATTN:
54             str = "SP_ATTN";
55             break;
56         case libhei::ATTN_TYPE_HOST_ATTN:
57             str = "HOST_ATTN";
58             break;
59         default:
60             trace::err("Unsupported attention type: %u", i_attnType);
61             assert(0);
62     }
63     return str;
64 }
65 
66 //------------------------------------------------------------------------------
67 
68 bool __filterRootCause(const libhei::IsolationData& i_isoData,
69                        libhei::Signature& o_signature)
70 {
71     // We'll need to make a copy of the list so that the original list is
72     // maintained for the log.
73     std::vector<libhei::Signature> sigList{i_isoData.getSignatureList()};
74 
75     // For debug, trace out the original list of signatures before filtering.
76     for (const auto& sig : sigList)
77     {
78         trace::inf("Signature: %s 0x%0" PRIx32 " %s",
79                    util::pdbg::getPath(sig.getChip()), sig.toUint32(),
80                    __attn(sig.getAttnType()));
81     }
82 
83     // Special and host attentions are not supported by this user application.
84     auto newEndItr =
85         std::remove_if(sigList.begin(), sigList.end(), [&](const auto& t) {
86             return (libhei::ATTN_TYPE_SP_ATTN == t.getAttnType() ||
87                     libhei::ATTN_TYPE_HOST_ATTN == t.getAttnType());
88         });
89 
90     // Shrink the vector, if needed.
91     sigList.resize(std::distance(sigList.begin(), newEndItr));
92 
93     // START WORKAROUND
94     // TODO: Filtering should be determined by the RAS Data Files provided by
95     //       the host firmware via the PNOR (similar to the Chip Data Files).
96     //       Until that support is available, use a rudimentary filter that
97     //       first looks for any recoverable attention, then any unit checkstop,
98     //       and then any system checkstop. This is built on the premise that
99     //       recoverable errors could be the root cause of an system checkstop
100     //       attentions. Fortunately, we just need to sort the list by the
101     //       greater attention type value.
102     std::sort(sigList.begin(), sigList.end(),
103               [&](const auto& a, const auto& b) {
104                   return a.getAttnType() > b.getAttnType();
105               });
106     // END WORKAROUND
107 
108     // Check if a root cause attention was found.
109     if (!sigList.empty())
110     {
111         // The entry at the front of the list will be the root cause.
112         o_signature = sigList.front();
113         return true;
114     }
115 
116     return false; // default, no active attentions found.
117 }
118 
119 //------------------------------------------------------------------------------
120 
121 bool __analyze(const libhei::IsolationData& i_isoData)
122 {
123     bool attnFound = false;
124 
125     libhei::Signature rootCause{};
126     attnFound = __filterRootCause(i_isoData, rootCause);
127 
128     if (!attnFound)
129     {
130         // NOTE: It is possible for TI handling that there will not be an active
131         //       attention. In which case, we will not do anything and let the
132         //       caller of this function determine if this is the expected
133         //       behavior.
134         trace::inf("No active attentions found");
135     }
136     else
137     {
138         trace::inf("Root cause attention: %s 0x%0" PRIx32 " %s",
139                    util::pdbg::getPath(rootCause.getChip()),
140                    rootCause.toUint32(), __attn(rootCause.getAttnType()));
141 
142         // TODO: Perform service actions based on the root cause.
143 
144         // Create and commit a PEL.
145         createPel(rootCause, i_isoData);
146     }
147 
148     return attnFound;
149 }
150 
151 //------------------------------------------------------------------------------
152 
153 bool analyzeHardware()
154 {
155     bool attnFound = false;
156 
157     trace::inf(">>> enter analyzeHardware()");
158 
159     if (util::pdbg::queryHardwareAnalysisSupported())
160     {
161         // Initialize the isolator and get all of the chips to be analyzed.
162         trace::inf("Initializing the isolator...");
163         std::vector<libhei::Chip> chips;
164         initializeIsolator(chips);
165 
166         // Isolate attentions.
167         trace::inf("Isolating errors: # of chips=%u", chips.size());
168         libhei::IsolationData isoData{};
169         libhei::isolate(chips, isoData);
170 
171         // Analyze the isolation data and perform service actions if needed.
172         attnFound = __analyze(isoData);
173 
174         // All done, clean up the isolator.
175         trace::inf("Uninitializing isolator...");
176         libhei::uninitialize();
177     }
178     else
179     {
180         trace::err("Hardware error analysis is not supported on this system");
181     }
182 
183     trace::inf("<<< exit analyzeHardware()");
184 
185     return attnFound;
186 }
187 
188 } // namespace analyzer
189