1 #include <assert.h> 2 3 #include <analyzer_main.hpp> 4 #include <hei_main.hpp> 5 #include <util/pdbg.hpp> 6 7 #include <algorithm> 8 #include <limits> 9 #include <string> 10 11 namespace analyzer 12 { 13 14 //------------------------------------------------------------------------------ 15 16 uint64_t __hash(unsigned int i_bytes, const std::string& i_str) 17 { 18 // This hash is a simple "n*s[0] + (n-1)*s[1] + ... + s[n-1]" algorithm, 19 // where s[i] is a chunk from the input string the length of i_bytes. 20 21 // Currently only supporting 1-8 byte hashes. 22 assert(1 <= i_bytes && i_bytes <= sizeof(uint64_t)); 23 24 // Start hashing each chunk. 25 uint64_t sumA = 0; 26 uint64_t sumB = 0; 27 28 // Iterate one chunk at a time. 29 for (unsigned int i = 0; i < i_str.size(); i += i_bytes) 30 { 31 // Combine each chunk into a single integer value. If we reach the end 32 // of the string, pad with null characters. 33 uint64_t chunk = 0; 34 for (unsigned int j = 0; j < i_bytes; j++) 35 { 36 chunk <<= 8; 37 chunk |= (i + j < i_str.size()) ? i_str[i + j] : '\0'; 38 } 39 40 // Apply the simple hash. 41 sumA += chunk; 42 sumB += sumA; 43 } 44 45 // Mask off everything except the target number of bytes. 46 auto mask = std::numeric_limits<uint64_t>::max(); 47 sumB &= mask >> ((sizeof(uint64_t) - i_bytes) * 8); 48 49 return sumB; 50 } 51 52 //------------------------------------------------------------------------------ 53 54 bool __findRcsOscError(const std::vector<libhei::Signature>& i_list, 55 libhei::Signature& o_rootCause) 56 { 57 // TODO: Consider returning all of them instead of one as root cause. 58 auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) { 59 return (__hash(2, "TP_LOCAL_FIR") == t.getId() && 60 (42 == t.getBit() || 43 == t.getBit())); 61 }); 62 63 if (i_list.end() != itr) 64 { 65 o_rootCause = *itr; 66 return true; 67 } 68 69 return false; 70 } 71 72 //------------------------------------------------------------------------------ 73 74 bool __findPllUnlock(const std::vector<libhei::Signature>& i_list, 75 libhei::Signature& o_rootCause) 76 { 77 // TODO: Consider returning all of them instead of one as root cause. 78 auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) { 79 return (__hash(2, "PLL_UNLOCK") == t.getId() && 80 (0 == t.getBit() || 1 == t.getBit())); 81 }); 82 83 if (i_list.end() != itr) 84 { 85 o_rootCause = *itr; 86 return true; 87 } 88 89 return false; 90 } 91 92 //------------------------------------------------------------------------------ 93 94 bool __findMemoryChannelFailure(const std::vector<libhei::Signature>& i_list, 95 libhei::Signature& o_rootCause) 96 { 97 using namespace util::pdbg; 98 99 static const auto mc_dstl_fir = __hash(2, "MC_DSTL_FIR"); 100 static const auto mc_ustl_fir = __hash(2, "MC_USTL_FIR"); 101 static const auto mc_omi_dl_err_rpt = __hash(2, "MC_OMI_DL_ERR_RPT"); 102 103 for (const auto s : i_list) 104 { 105 const auto targetType = getTrgtType(getTrgt(s.getChip())); 106 const auto id = s.getId(); 107 const auto bit = s.getBit(); 108 const auto attnType = s.getAttnType(); 109 110 // Look for any unit checkstop attentions from OCMBs. 111 if (TYPE_OCMB == targetType) 112 { 113 // Any unit checkstop attentions will trigger a channel failure. 114 if (libhei::ATTN_TYPE_UNIT_CS == attnType) 115 { 116 o_rootCause = s; 117 return true; 118 } 119 } 120 // Look for channel failure attentions on processors. 121 else if (TYPE_PROC == targetType) 122 { 123 // TODO: All of these channel failure bits are configurable. 124 // Eventually, we will need some mechanism to check that 125 // config registers for a more accurate analysis. For now, 126 // simply check for all bits that could potentially be 127 // configured to channel failure. 128 129 // Any unit checkstop bit in the MC_DSTL_FIR or MC_USTL_FIR could 130 // be a channel failure. 131 if (libhei::ATTN_TYPE_UNIT_CS == attnType) 132 { 133 // Ignore bits MC_DSTL_FIR[0:7] because they simply indicate 134 // attentions occurred on the attached OCMBs. 135 if ((mc_dstl_fir == id && 8 <= bit) || (mc_ustl_fir == id)) 136 { 137 o_rootCause = s; 138 return true; 139 } 140 } 141 142 // All bits in MC_OMI_DL_ERR_RPT eventually feed into 143 // MC_OMI_DL_FIR[0,20] which are configurable to channel failure. 144 if (mc_omi_dl_err_rpt == id) 145 { 146 o_rootCause = s; 147 return true; 148 } 149 } 150 } 151 152 return false; // default, nothing found 153 } 154 155 //------------------------------------------------------------------------------ 156 157 // Will query if a signature is a potential system checkstop root cause. 158 // attention. Note that this function excludes memory channel failure attentions 159 // and core unit checkstop attentions. 160 bool __findCsRootCause(const libhei::Signature& i_signature) 161 { 162 using namespace util::pdbg; 163 164 // PROC registers 165 static const auto eq_core_fir = __hash(2, "EQ_CORE_FIR"); 166 static const auto eq_l2_fir = __hash(2, "EQ_L2_FIR"); 167 static const auto eq_l3_fir = __hash(2, "EQ_L3_FIR"); 168 static const auto eq_ncu_fir = __hash(2, "EQ_NCU_FIR"); 169 static const auto iohs_dlp_fir_oc = __hash(2, "IOHS_DLP_FIR_OC"); 170 static const auto iohs_dlp_fir_smp = __hash(2, "IOHS_DLP_FIR_SMP"); 171 static const auto nx_cq_fir = __hash(2, "NX_CQ_FIR"); 172 static const auto nx_dma_eng_fir = __hash(2, "NX_DMA_ENG_FIR"); 173 static const auto pau_fir_0 = __hash(2, "PAU_FIR_0"); 174 static const auto pau_fir_1 = __hash(2, "PAU_FIR_1"); 175 static const auto pau_fir_2 = __hash(2, "PAU_FIR_2"); 176 static const auto pau_ptl_fir = __hash(2, "PAU_PTL_FIR"); 177 178 // OCMB registers 179 static const auto rdffir = __hash(2, "RDFFIR"); 180 181 const auto targetType = getTrgtType(getTrgt(i_signature.getChip())); 182 const auto id = i_signature.getId(); 183 const auto bit = i_signature.getBit(); 184 185 if (TYPE_PROC == targetType) 186 { 187 if (eq_core_fir == id && 188 (3 == bit || 5 == bit || 8 == bit || 12 == bit || 22 == bit || 189 25 == bit || 32 == bit || 36 == bit || 38 == bit || 46 == bit || 190 47 == bit || 57 == bit)) 191 { 192 return true; 193 } 194 195 if (eq_l2_fir == id && 196 (1 == bit || 12 == bit || 13 == bit || 17 == bit || 18 == bit || 197 20 == bit || 27 == bit)) 198 { 199 return true; 200 } 201 202 if (eq_l3_fir == id && 203 (2 == bit || 5 == bit || 8 == bit || 11 == bit || 17 == bit)) 204 { 205 return true; 206 } 207 208 if (eq_ncu_fir == id && (3 == bit || 4 == bit || 5 == bit || 7 == bit || 209 8 == bit || 10 == bit || 17 == bit)) 210 { 211 return true; 212 } 213 214 if (iohs_dlp_fir_oc == id && (54 <= bit && bit <= 61)) 215 { 216 return true; 217 } 218 219 if (iohs_dlp_fir_smp == id && (54 <= bit && bit <= 61)) 220 { 221 return true; 222 } 223 224 if (nx_cq_fir == id && (7 == bit || 16 == bit || 21 == bit)) 225 { 226 return true; 227 } 228 229 if (nx_dma_eng_fir == id && (0 == bit)) 230 { 231 return true; 232 } 233 234 if (pau_fir_0 == id && 235 (15 == bit || 18 == bit || 19 == bit || 25 == bit || 26 == bit || 236 29 == bit || 33 == bit || 34 == bit || 35 == bit || 40 == bit || 237 42 == bit || 44 == bit || 45 == bit)) 238 { 239 return true; 240 } 241 242 if (pau_fir_1 == id && 243 (13 == bit || 14 == bit || 15 == bit || 37 == bit || 39 == bit || 244 40 == bit || 41 == bit || 42 == bit)) 245 { 246 return true; 247 } 248 249 if (pau_fir_2 == id && 250 ((4 <= bit && bit <= 18) || (20 <= bit && bit <= 31) || 251 (36 <= bit && bit <= 41) || 45 == bit || 47 == bit || 48 == bit || 252 50 == bit || 51 == bit || 52 == bit)) 253 { 254 return true; 255 } 256 257 if (pau_ptl_fir == id && (4 == bit || 8 == bit)) 258 { 259 return true; 260 } 261 } 262 else if (TYPE_OCMB == targetType) 263 { 264 if (rdffir == id && (14 == bit || 15 == bit || 17 == bit || 37 == bit)) 265 { 266 return true; 267 } 268 } 269 270 return false; // default, nothing found 271 } 272 273 //------------------------------------------------------------------------------ 274 275 bool __findCsRootCause_RE(const std::vector<libhei::Signature>& i_list, 276 libhei::Signature& o_rootCause) 277 { 278 for (const auto s : i_list) 279 { 280 // Only looking for recoverable attentions. 281 if (libhei::ATTN_TYPE_RECOVERABLE != s.getAttnType()) 282 { 283 continue; 284 } 285 286 if (__findCsRootCause(s)) 287 { 288 o_rootCause = s; 289 return true; 290 } 291 } 292 293 return false; // default, nothing found 294 } 295 296 //------------------------------------------------------------------------------ 297 298 bool __findCsRootCause_UCS(const std::vector<libhei::Signature>& i_list, 299 libhei::Signature& o_rootCause) 300 { 301 for (const auto s : i_list) 302 { 303 // Only looking for unit checkstop attentions. 304 if (libhei::ATTN_TYPE_UNIT_CS != s.getAttnType()) 305 { 306 continue; 307 } 308 309 if (__findCsRootCause(s)) 310 { 311 o_rootCause = s; 312 return true; 313 } 314 } 315 316 return false; // default, nothing found 317 } 318 319 //------------------------------------------------------------------------------ 320 321 bool __findNonExternalCs(const std::vector<libhei::Signature>& i_list, 322 libhei::Signature& o_rootCause) 323 { 324 using namespace util::pdbg; 325 326 static const auto pb_ext_fir = __hash(2, "PB_EXT_FIR"); 327 328 for (const auto s : i_list) 329 { 330 const auto targetType = getTrgtType(getTrgt(s.getChip())); 331 const auto id = s.getId(); 332 const auto attnType = s.getAttnType(); 333 334 // Find any processor with system checkstop attention that did not 335 // originate from the PB_EXT_FIR. 336 if ((TYPE_PROC == targetType) && 337 (libhei::ATTN_TYPE_CHECKSTOP == attnType) && (pb_ext_fir != id)) 338 { 339 o_rootCause = s; 340 return true; 341 } 342 } 343 344 return false; // default, nothing found 345 } 346 347 //------------------------------------------------------------------------------ 348 349 bool filterRootCause(AnalysisType i_type, 350 const libhei::IsolationData& i_isoData, 351 libhei::Signature& o_rootCause) 352 { 353 // We'll need to make a copy of the list so that the original list is 354 // maintained for the PEL. 355 std::vector<libhei::Signature> list{i_isoData.getSignatureList()}; 356 357 // START WORKAROUND 358 // TODO: Filtering should be data driven. Until that support is available, 359 // use the following isolation rules. 360 361 // Ensure the list is not empty before continuing. 362 if (list.empty()) 363 { 364 return false; // nothing more to do 365 } 366 367 // First, look for any RCS OSC errors. This must always be first because 368 // they can cause downstream PLL unlock attentions. 369 if (__findRcsOscError(list, o_rootCause)) 370 { 371 return true; 372 } 373 374 // Second, look for any PLL unlock attentions. This must always be second 375 // because PLL unlock attentions can cause any number of downstream 376 // attentions, including a system checkstop. 377 if (__findPllUnlock(list, o_rootCause)) 378 { 379 return true; 380 } 381 382 // Regardless of the analysis type, always look for anything that could be 383 // blamed as the root cause of a system checkstop. 384 385 // Memory channel failure attentions will produce SUEs and likely cause 386 // downstream attentions, including a system checkstop. 387 if (__findMemoryChannelFailure(list, o_rootCause)) 388 { 389 return true; 390 } 391 392 // Look for any recoverable attentions that have been identified as a 393 // potential root cause of a system checkstop attention. These would include 394 // any attention that would generate an SUE. Note that is it possible for 395 // recoverables to generate unit checkstop attentions so we must check them 396 // first. 397 if (__findCsRootCause_RE(list, o_rootCause)) 398 { 399 return true; 400 } 401 402 // Look for any unit checkstop attentions (other than memory channel 403 // failures) that have been identified as a potential root cause of a 404 // system checkstop attention. These would include any attention that would 405 // generate an SUE. 406 if (__findCsRootCause_UCS(list, o_rootCause)) 407 { 408 return true; 409 } 410 411 // Look for any system checkstop attentions that originated from within the 412 // chip that reported the attention. In other words, no external checkstop 413 // attentions. 414 if (__findNonExternalCs(list, o_rootCause)) 415 { 416 return true; 417 } 418 419 if (AnalysisType::SYSTEM_CHECKSTOP != i_type) 420 { 421 // No system checkstop root cause attentions were found. Next, look for 422 // any recoverable or unit checkstop attentions that could be associated 423 // with a TI. 424 425 auto itr = std::find_if(list.begin(), list.end(), [&](const auto& t) { 426 return (libhei::ATTN_TYPE_RECOVERABLE == t.getAttnType() || 427 libhei::ATTN_TYPE_UNIT_CS == t.getAttnType()); 428 }); 429 430 if (list.end() != itr) 431 { 432 o_rootCause = *itr; 433 return true; 434 } 435 436 if (AnalysisType::TERMINATE_IMMEDIATE != i_type) 437 { 438 // No attentions associated with a system checkstop or TI were 439 // found. Simply, return the first entry in the list. 440 o_rootCause = list.front(); 441 return true; 442 } 443 } 444 445 // END WORKAROUND 446 447 return false; // default, no active attentions found. 448 } 449 450 //------------------------------------------------------------------------------ 451 452 } // namespace analyzer 453