1 #include <stdio.h>
2 
3 #include <analyzer/analyzer_main.hpp>
4 #include <analyzer/plugins/plugin.hpp>
5 #include <analyzer/ras-data/ras-data-parser.hpp>
6 #include <hei_util.hpp>
7 #include <util/pdbg.hpp>
8 #include <util/trace.hpp>
9 
10 #include "gtest/gtest.h"
11 
12 namespace analyzer
13 {
14 // Forward reference of filterRootCause
15 bool filterRootCause(AnalysisType i_type,
16                      const libhei::IsolationData& i_isoData,
17                      libhei::Signature& o_rootCause,
18                      const RasDataParser& i_rasData);
19 } // namespace analyzer
20 
21 using namespace analyzer;
22 
23 // Processor side FIRs
24 static const auto eqCoreFir = static_cast<libhei::NodeId_t>(
25     libhei::hash<libhei::NodeId_t>("EQ_CORE_FIR"));
26 
27 static const auto mc_dstl_fir = static_cast<libhei::NodeId_t>(
28     libhei::hash<libhei::NodeId_t>("MC_DSTL_FIR"));
29 
30 // Explorer OCMB FIRs
31 static const auto rdfFir =
32     static_cast<libhei::NodeId_t>(libhei::hash<libhei::NodeId_t>("RDFFIR"));
33 
34 // Odyssey OCMB FIRs
35 static const auto srq_fir =
36     static_cast<libhei::NodeId_t>(libhei::hash<libhei::NodeId_t>("SRQ_FIR"));
37 
38 static const auto rdf_fir =
39     static_cast<libhei::NodeId_t>(libhei::hash<libhei::NodeId_t>("RDF_FIR"));
40 
41 static const auto odp_fir =
42     static_cast<libhei::NodeId_t>(libhei::hash<libhei::NodeId_t>("ODP_FIR"));
43 
TEST(RootCauseFilter,Filter1)44 TEST(RootCauseFilter, Filter1)
45 {
46     pdbg_targets_init(nullptr);
47 
48     RasDataParser rasData{};
49 
50     // Test 1: Test a checkstop with a UE root cause on an OCMB
51 
52     // Checkstop signature on the proc
53     auto proc0 = util::pdbg::getTrgt("/proc0");
54     libhei::Chip procChip0{proc0, P10_20};
55 
56     // EQ_CORE_FIR[14]: ME = 0 checkstop
57     libhei::Signature checkstopSig{procChip0, eqCoreFir, 0, 14,
58                                    libhei::ATTN_TYPE_CHIP_CS};
59 
60     // MC_DSTL_FIR[1]: AFU initiated Recoverable Attn on Subchannel A
61     libhei::Signature reAttnSig{procChip0, mc_dstl_fir, 0, 1,
62                                 libhei::ATTN_TYPE_RECOVERABLE};
63 
64     // Root cause signature on the ocmb
65     auto ocmb0 =
66         util::pdbg::getTrgt("proc0/pib/perv12/mc0/mi0/mcc0/omi0/ocmb0");
67     libhei::Chip ocmbChip0{ocmb0, EXPLORER_20};
68 
69     // RDFFIR[14]: Mainline read UE
70     libhei::Signature ueSig{ocmbChip0, rdfFir, 0, 14,
71                             libhei::ATTN_TYPE_RECOVERABLE};
72 
73     // Add the signatures to the isolation data
74     libhei::IsolationData isoData{};
75     isoData.addSignature(checkstopSig);
76     isoData.addSignature(reAttnSig);
77     isoData.addSignature(ueSig);
78 
79     libhei::Signature rootCause;
80     bool attnFound = filterRootCause(AnalysisType::SYSTEM_CHECKSTOP, isoData,
81                                      rootCause, rasData);
82     EXPECT_TRUE(attnFound);
83     EXPECT_EQ(ueSig.toUint32(), rootCause.toUint32());
84 
85     // Test 2: Test a checkstop with an unknown RE attn on an OCMB
86 
87     // Add the signatures to the isolation data
88     isoData.flush();
89     isoData.addSignature(checkstopSig);
90     isoData.addSignature(reAttnSig);
91 
92     attnFound = filterRootCause(AnalysisType::SYSTEM_CHECKSTOP, isoData,
93                                 rootCause, rasData);
94     EXPECT_TRUE(attnFound);
95     EXPECT_EQ(reAttnSig.toUint32(), rootCause.toUint32());
96 
97     // Test 3: Test a checkstop with an unknown UCS attn on an OCMB
98 
99     // MC_DSTL_FIR[0]: AFU initiated Checkstop on Subchannel A
100     libhei::Signature ucsAttnSig{procChip0, mc_dstl_fir, 0, 0,
101                                  libhei::ATTN_TYPE_UNIT_CS};
102 
103     isoData.flush();
104     isoData.addSignature(checkstopSig);
105     isoData.addSignature(ucsAttnSig);
106 
107     attnFound = filterRootCause(AnalysisType::SYSTEM_CHECKSTOP, isoData,
108                                 rootCause, rasData);
109     EXPECT_TRUE(attnFound);
110     EXPECT_EQ(ucsAttnSig.toUint32(), rootCause.toUint32());
111 
112     // Test 4: Test a checkstop with a non-root cause recoverable from an OCMB
113 
114     // RDFFIR[42]: SCOM recoverable register parity error
115     libhei::Signature reSig{ocmbChip0, rdfFir, 0, 42,
116                             libhei::ATTN_TYPE_RECOVERABLE};
117 
118     isoData.flush();
119     isoData.addSignature(checkstopSig);
120     isoData.addSignature(reAttnSig);
121     isoData.addSignature(reSig);
122 
123     attnFound = filterRootCause(AnalysisType::SYSTEM_CHECKSTOP, isoData,
124                                 rootCause, rasData);
125     EXPECT_TRUE(attnFound);
126     EXPECT_EQ(checkstopSig.toUint32(), rootCause.toUint32());
127 
128     // Test 5: Test a firmware initiated channel fail due to an IUE threshold on
129     // a Odyssey OCMB
130     libhei::Chip odyChip0{ocmb0, ODYSSEY_10};
131 
132     libhei::Signature fwInitChnlFail{odyChip0, srq_fir, 0, 46,
133                                      libhei::ATTN_TYPE_CHIP_CS};
134     libhei::Signature mainlineIue{odyChip0, rdf_fir, 0, 18,
135                                   libhei::ATTN_TYPE_RECOVERABLE};
136 
137     isoData.flush();
138     isoData.addSignature(fwInitChnlFail);
139     isoData.addSignature(mainlineIue);
140 
141     attnFound = filterRootCause(AnalysisType::SYSTEM_CHECKSTOP, isoData,
142                                 rootCause, rasData);
143     EXPECT_TRUE(attnFound);
144     EXPECT_EQ(mainlineIue.toUint32(), rootCause.toUint32());
145 
146     // Test 6: Test a UE that is the side effect of an ODP data corruption error
147     // on an Odyssey OCMB
148     libhei::Signature mainlineUe{odyChip0, rdf_fir, 0, 15,
149                                  libhei::ATTN_TYPE_RECOVERABLE};
150     libhei::Signature odpRootCause{odyChip0, odp_fir, 0, 6,
151                                    libhei::ATTN_TYPE_RECOVERABLE};
152 
153     isoData.flush();
154     isoData.addSignature(mainlineUe);
155     isoData.addSignature(odpRootCause);
156 
157     attnFound = filterRootCause(AnalysisType::SYSTEM_CHECKSTOP, isoData,
158                                 rootCause, rasData);
159 
160     EXPECT_TRUE(attnFound);
161     EXPECT_EQ(odpRootCause.toUint32(), rootCause.toUint32());
162 
163     // Test 7: Test a Terminate Immediate with recoverable attentions, one which
164     // can be blamed as a root cause, and one that can't.
165 
166     // MC_DSTL_FIR[14]: Subchannel A valid cmd timeout error
167     libhei::Signature unrelatedRe{procChip0, mc_dstl_fir, 0, 14,
168                                   libhei::ATTN_TYPE_RECOVERABLE};
169 
170     // MC_DSTL_FIR[16]: Subchannel A buffer overuse error
171     libhei::Signature rootCauseRe{procChip0, mc_dstl_fir, 0, 16,
172                                   libhei::ATTN_TYPE_RECOVERABLE};
173 
174     isoData.flush();
175     isoData.addSignature(unrelatedRe);
176     isoData.addSignature(rootCauseRe);
177 
178     attnFound = filterRootCause(AnalysisType::TERMINATE_IMMEDIATE, isoData,
179                                 rootCause, rasData);
180 
181     EXPECT_TRUE(attnFound);
182     EXPECT_EQ(rootCauseRe.toUint32(), rootCause.toUint32());
183 }
184