// xref: /openbmc/openpower-hw-diags/test/test-root-cause-filter.cpp (revision ac6cde74cb90bd4759070737b05d16ff51ec9ba4)
#include <stdio.h>

#include <analyzer/analyzer_main.hpp>
#include <analyzer/plugins/plugin.hpp>
#include <analyzer/ras-data/ras-data-parser.hpp>
#include <hei_util.hpp>
#include <util/pdbg.hpp>
#include <util/trace.hpp>

#include "gtest/gtest.h"

12  namespace analyzer
13  {
14  // Forward reference of filterRootCause
15  bool filterRootCause(AnalysisType i_type,
16                       const libhei::IsolationData& i_isoData,
17                       libhei::Signature& o_rootCause,
18                       const RasDataParser& i_rasData);
19  } // namespace analyzer
20  
21  using namespace analyzer;
22  
23  // Processor side FIRs
24  static const auto eqCoreFir = static_cast<libhei::NodeId_t>(
25      libhei::hash<libhei::NodeId_t>("EQ_CORE_FIR"));
26  
27  static const auto mc_dstl_fir = static_cast<libhei::NodeId_t>(
28      libhei::hash<libhei::NodeId_t>("MC_DSTL_FIR"));
29  
30  // Explorer OCMB FIRs
31  static const auto rdfFir =
32      static_cast<libhei::NodeId_t>(libhei::hash<libhei::NodeId_t>("RDFFIR"));
33  
34  // Odyssey OCMB FIRs
35  static const auto srq_fir =
36      static_cast<libhei::NodeId_t>(libhei::hash<libhei::NodeId_t>("SRQ_FIR"));
37  
38  static const auto rdf_fir =
39      static_cast<libhei::NodeId_t>(libhei::hash<libhei::NodeId_t>("RDF_FIR"));
40  
41  static const auto odp_fir =
42      static_cast<libhei::NodeId_t>(libhei::hash<libhei::NodeId_t>("ODP_FIR"));
43  
TEST(RootCauseFilter,Filter1)44  TEST(RootCauseFilter, Filter1)
45  {
46      pdbg_targets_init(nullptr);
47  
48      RasDataParser rasData{};
49  
50      // Test 1: Test a checkstop with a UE root cause on an OCMB
51  
52      // Checkstop signature on the proc
53      auto proc0 = util::pdbg::getTrgt("/proc0");
54      libhei::Chip procChip0{proc0, P10_20};
55  
56      // EQ_CORE_FIR[14]: ME = 0 checkstop
57      libhei::Signature checkstopSig{procChip0, eqCoreFir, 0, 14,
58                                     libhei::ATTN_TYPE_CHIP_CS};
59  
60      // MC_DSTL_FIR[1]: AFU initiated Recoverable Attn on Subchannel A
61      libhei::Signature reAttnSig{procChip0, mc_dstl_fir, 0, 1,
62                                  libhei::ATTN_TYPE_RECOVERABLE};
63  
64      // Root cause signature on the ocmb
65      auto ocmb0 =
66          util::pdbg::getTrgt("proc0/pib/perv12/mc0/mi0/mcc0/omi0/ocmb0");
67      libhei::Chip ocmbChip0{ocmb0, EXPLORER_20};
68  
69      // RDFFIR[14]: Mainline read UE
70      libhei::Signature ueSig{ocmbChip0, rdfFir, 0, 14,
71                              libhei::ATTN_TYPE_RECOVERABLE};
72  
73      // Add the signatures to the isolation data
74      libhei::IsolationData isoData{};
75      isoData.addSignature(checkstopSig);
76      isoData.addSignature(reAttnSig);
77      isoData.addSignature(ueSig);
78  
79      libhei::Signature rootCause;
80      bool attnFound = filterRootCause(AnalysisType::SYSTEM_CHECKSTOP, isoData,
81                                       rootCause, rasData);
82      EXPECT_TRUE(attnFound);
83      EXPECT_EQ(ueSig.toUint32(), rootCause.toUint32());
84  
85      // Test 2: Test a checkstop with an unknown RE attn on an OCMB
86  
87      // Add the signatures to the isolation data
88      isoData.flush();
89      isoData.addSignature(checkstopSig);
90      isoData.addSignature(reAttnSig);
91  
92      attnFound = filterRootCause(AnalysisType::SYSTEM_CHECKSTOP, isoData,
93                                  rootCause, rasData);
94      EXPECT_TRUE(attnFound);
95      EXPECT_EQ(reAttnSig.toUint32(), rootCause.toUint32());
96  
97      // Test 3: Test a checkstop with an unknown UCS attn on an OCMB
98  
99      // MC_DSTL_FIR[0]: AFU initiated Checkstop on Subchannel A
100      libhei::Signature ucsAttnSig{procChip0, mc_dstl_fir, 0, 0,
101                                   libhei::ATTN_TYPE_UNIT_CS};
102  
103      isoData.flush();
104      isoData.addSignature(checkstopSig);
105      isoData.addSignature(ucsAttnSig);
106  
107      attnFound = filterRootCause(AnalysisType::SYSTEM_CHECKSTOP, isoData,
108                                  rootCause, rasData);
109      EXPECT_TRUE(attnFound);
110      EXPECT_EQ(ucsAttnSig.toUint32(), rootCause.toUint32());
111  
112      // Test 4: Test a checkstop with a non-root cause recoverable from an OCMB
113  
114      // RDFFIR[42]: SCOM recoverable register parity error
115      libhei::Signature reSig{ocmbChip0, rdfFir, 0, 42,
116                              libhei::ATTN_TYPE_RECOVERABLE};
117  
118      isoData.flush();
119      isoData.addSignature(checkstopSig);
120      isoData.addSignature(reAttnSig);
121      isoData.addSignature(reSig);
122  
123      attnFound = filterRootCause(AnalysisType::SYSTEM_CHECKSTOP, isoData,
124                                  rootCause, rasData);
125      EXPECT_TRUE(attnFound);
126      EXPECT_EQ(checkstopSig.toUint32(), rootCause.toUint32());
127  
128      // Test 5: Test a firmware initiated channel fail due to an IUE threshold on
129      // a Odyssey OCMB
130      libhei::Chip odyChip0{ocmb0, ODYSSEY_10};
131  
132      libhei::Signature fwInitChnlFail{odyChip0, srq_fir, 0, 46,
133                                       libhei::ATTN_TYPE_CHIP_CS};
134      libhei::Signature mainlineIue{odyChip0, rdf_fir, 0, 18,
135                                    libhei::ATTN_TYPE_RECOVERABLE};
136  
137      isoData.flush();
138      isoData.addSignature(fwInitChnlFail);
139      isoData.addSignature(mainlineIue);
140  
141      attnFound = filterRootCause(AnalysisType::SYSTEM_CHECKSTOP, isoData,
142                                  rootCause, rasData);
143      EXPECT_TRUE(attnFound);
144      EXPECT_EQ(mainlineIue.toUint32(), rootCause.toUint32());
145  
146      // Test 6: Test a UE that is the side effect of an ODP data corruption error
147      // on an Odyssey OCMB
148      libhei::Signature mainlineUe{odyChip0, rdf_fir, 0, 15,
149                                   libhei::ATTN_TYPE_RECOVERABLE};
150      libhei::Signature odpRootCause{odyChip0, odp_fir, 0, 6,
151                                     libhei::ATTN_TYPE_RECOVERABLE};
152  
153      isoData.flush();
154      isoData.addSignature(mainlineUe);
155      isoData.addSignature(odpRootCause);
156  
157      attnFound = filterRootCause(AnalysisType::SYSTEM_CHECKSTOP, isoData,
158                                  rootCause, rasData);
159  
160      EXPECT_TRUE(attnFound);
161      EXPECT_EQ(odpRootCause.toUint32(), rootCause.toUint32());
162  
163      // Test 7: Test a Terminate Immediate with recoverable attentions, one which
164      // can be blamed as a root cause, and one that can't.
165  
166      // MC_DSTL_FIR[14]: Subchannel A valid cmd timeout error
167      libhei::Signature unrelatedRe{procChip0, mc_dstl_fir, 0, 14,
168                                    libhei::ATTN_TYPE_RECOVERABLE};
169  
170      // MC_DSTL_FIR[16]: Subchannel A buffer overuse error
171      libhei::Signature rootCauseRe{procChip0, mc_dstl_fir, 0, 16,
172                                    libhei::ATTN_TYPE_RECOVERABLE};
173  
174      isoData.flush();
175      isoData.addSignature(unrelatedRe);
176      isoData.addSignature(rootCauseRe);
177  
178      attnFound = filterRootCause(AnalysisType::TERMINATE_IMMEDIATE, isoData,
179                                  rootCause, rasData);
180  
181      EXPECT_TRUE(attnFound);
182      EXPECT_EQ(rootCauseRe.toUint32(), rootCause.toUint32());
183  }
184