1 #include "config.h" 2 3 #include "occ_manager.hpp" 4 5 #include "i2c_occ.hpp" 6 #include "occ_dbus.hpp" 7 #include "occ_errors.hpp" 8 #include "utils.hpp" 9 10 #include <phosphor-logging/elog-errors.hpp> 11 #include <phosphor-logging/lg2.hpp> 12 #include <xyz/openbmc_project/Common/error.hpp> 13 14 #include <chrono> 15 #include <cmath> 16 #include <filesystem> 17 #include <fstream> 18 #include <regex> 19 20 namespace open_power 21 { 22 namespace occ 23 { 24 25 constexpr uint32_t fruTypeNotAvailable = 0xFF; 26 constexpr auto fruTypeSuffix = "fru_type"; 27 constexpr auto faultSuffix = "fault"; 28 constexpr auto inputSuffix = "input"; 29 constexpr auto maxSuffix = "max"; 30 31 const auto HOST_ON_FILE = "/run/openbmc/host@0-on"; 32 33 using namespace phosphor::logging; 34 using namespace std::literals::chrono_literals; 35 36 template <typename T> readFile(const std::string & path)37 T readFile(const std::string& path) 38 { 39 std::ifstream ifs; 40 ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit | 41 std::ifstream::eofbit); 42 T data; 43 44 try 45 { 46 ifs.open(path); 47 ifs >> data; 48 ifs.close(); 49 } 50 catch (const std::exception& e) 51 { 52 auto err = errno; 53 throw std::system_error(err, std::generic_category()); 54 } 55 56 return data; 57 } 58 createPldmHandle()59 void Manager::createPldmHandle() 60 { 61 #ifdef PLDM 62 pldmHandle = std::make_unique<pldm::Interface>( 63 std::bind(std::mem_fn(&Manager::updateOCCActive), this, 64 std::placeholders::_1, std::placeholders::_2), 65 std::bind(std::mem_fn(&Manager::sbeHRESETResult), this, 66 std::placeholders::_1, std::placeholders::_2), 67 std::bind(std::mem_fn(&Manager::updateOccSafeMode), this, 68 std::placeholders::_1), 69 event); 70 #endif 71 } 72 73 // findAndCreateObjects(): 74 // Takes care of getting the required objects created and 75 // finds the available devices/processors. 76 // (function is called everytime the discoverTimer expires) 77 // - create the PowerMode object to control OCC modes 78 // - create statusObjects for each OCC device found 79 // - waits for OCC Active sensors PDRs to become available 80 // - restart discoverTimer if all data is not available yet findAndCreateObjects()81 void Manager::findAndCreateObjects() 82 { 83 #ifndef POWER10 84 for (auto id = 0; id < MAX_CPUS; ++id) 85 { 86 // Create one occ per cpu 87 auto occ = std::string(OCC_NAME) + std::to_string(id); 88 createObjects(occ); 89 } 90 #else 91 if (!pmode) 92 { 93 // Create the power mode object 94 pmode = std::make_unique<powermode::PowerMode>( 95 *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event); 96 } 97 98 if (!fs::exists(HOST_ON_FILE)) 99 { 100 static bool statusObjCreated = false; 101 if (!statusObjCreated) 102 { 103 // Create the OCCs based on on the /dev/occX devices 104 auto occs = findOCCsInDev(); 105 106 if (occs.empty() || (prevOCCSearch.size() != occs.size())) 107 { 108 // Something changed or no OCCs yet, try again in 10s. 109 // Note on the first pass prevOCCSearch will be empty, 110 // so there will be at least one delay to give things 111 // a chance to settle. 112 prevOCCSearch = occs; 113 114 lg2::info( 115 "Manager::findAndCreateObjects(): Waiting for OCCs (currently {QTY})", 116 "QTY", occs.size()); 117 118 discoverTimer->restartOnce(10s); 119 } 120 else 121 { 122 // All OCCs appear to be available, create status objects 123 124 // createObjects requires OCC0 first. 125 std::sort(occs.begin(), occs.end()); 126 127 lg2::info( 128 "Manager::findAndCreateObjects(): Creating {QTY} OCC Status Objects", 129 "QTY", occs.size()); 130 for (auto id : occs) 131 { 132 createObjects(std::string(OCC_NAME) + std::to_string(id)); 133 } 134 statusObjCreated = true; 135 waitingForAllOccActiveSensors = true; 136 137 // Find/update the processor path associated with each OCC 138 for (auto& obj : statusObjects) 139 { 140 obj->updateProcAssociation(); 141 } 142 } 143 } 144 145 if (statusObjCreated && waitingForAllOccActiveSensors) 146 { 147 static bool tracedHostWait = false; 148 if (utils::isHostRunning()) 149 { 150 if (tracedHostWait) 151 { 152 lg2::info( 153 "Manager::findAndCreateObjects(): Host is running"); 154 tracedHostWait = false; 155 } 156 checkAllActiveSensors(); 157 } 158 else 159 { 160 if (!tracedHostWait) 161 { 162 lg2::info( 163 "Manager::findAndCreateObjects(): Waiting for host to start"); 164 tracedHostWait = true; 165 } 166 discoverTimer->restartOnce(30s); 167 #ifdef PLDM 168 if (throttlePldmTraceTimer->isEnabled()) 169 { 170 // Host is no longer running, disable throttle timer and 171 // make sure traces are not throttled 172 lg2::info("findAndCreateObjects(): disabling sensor timer"); 173 throttlePldmTraceTimer->setEnabled(false); 174 pldmHandle->setTraceThrottle(false); 175 } 176 #endif 177 } 178 } 179 } 180 else 181 { 182 lg2::info( 183 "Manager::findAndCreateObjects(): Waiting for {FILE} to complete...", 184 "FILE", HOST_ON_FILE); 185 discoverTimer->restartOnce(10s); 186 } 187 #endif 188 } 189 190 #ifdef POWER10 191 // Check if all occActive sensors are available checkAllActiveSensors()192 void Manager::checkAllActiveSensors() 193 { 194 static bool allActiveSensorAvailable = false; 195 static bool tracedSensorWait = false; 196 static bool waitingForHost = false; 197 198 if (open_power::occ::utils::isHostRunning()) 199 { 200 if (waitingForHost) 201 { 202 waitingForHost = false; 203 lg2::info("checkAllActiveSensors(): Host is now running"); 204 } 205 206 // Start with the assumption that all are available 207 allActiveSensorAvailable = true; 208 for (auto& obj : statusObjects) 209 { 210 if ((!obj->occActive()) && (!obj->getPldmSensorReceived())) 211 { 212 auto instance = obj->getOccInstanceID(); 213 // Check if sensor was queued while waiting for discovery 214 auto match = queuedActiveState.find(instance); 215 if (match != queuedActiveState.end()) 216 { 217 queuedActiveState.erase(match); 218 lg2::info( 219 "checkAllActiveSensors(): OCC{INST} is ACTIVE (queued)", 220 "INST", instance); 221 obj->occActive(true); 222 } 223 else 224 { 225 allActiveSensorAvailable = false; 226 if (!tracedSensorWait) 227 { 228 lg2::info( 229 "checkAllActiveSensors(): Waiting on OCC{INST} Active sensor", 230 "INST", instance); 231 tracedSensorWait = true; 232 #ifdef PLDM 233 // Make sure PLDM traces are not throttled 234 pldmHandle->setTraceThrottle(false); 235 // Start timer to throttle PLDM traces when timer 236 // expires 237 onPldmTimeoutCreatePel = false; 238 throttlePldmTraceTimer->restartOnce(5min); 239 #endif 240 } 241 #ifdef PLDM 242 // Ignore active sensor check if the OCCs are being reset 243 if (!resetInProgress) 244 { 245 pldmHandle->checkActiveSensor(obj->getOccInstanceID()); 246 } 247 #endif 248 break; 249 } 250 } 251 } 252 } 253 else 254 { 255 if (!waitingForHost) 256 { 257 waitingForHost = true; 258 lg2::info("checkAllActiveSensors(): Waiting for host to start"); 259 #ifdef PLDM 260 if (throttlePldmTraceTimer->isEnabled()) 261 { 262 // Host is no longer running, disable throttle timer and 263 // make sure traces are not throttled 264 lg2::info("checkAllActiveSensors(): disabling sensor timer"); 265 throttlePldmTraceTimer->setEnabled(false); 266 pldmHandle->setTraceThrottle(false); 267 } 268 #endif 269 } 270 } 271 272 if (allActiveSensorAvailable) 273 { 274 // All sensors were found, disable the discovery timer 275 if (discoverTimer->isEnabled()) 276 { 277 discoverTimer->setEnabled(false); 278 } 279 #ifdef PLDM 280 if (throttlePldmTraceTimer->isEnabled()) 281 { 282 // Disable throttle timer and make sure traces are not throttled 283 throttlePldmTraceTimer->setEnabled(false); 284 pldmHandle->setTraceThrottle(false); 285 } 286 #endif 287 if (waitingForAllOccActiveSensors) 288 { 289 lg2::info( 290 "checkAllActiveSensors(): OCC Active sensors are available"); 291 waitingForAllOccActiveSensors = false; 292 293 if (resetRequired) 294 { 295 initiateOccRequest(resetInstance); 296 297 if (!waitForAllOccsTimer->isEnabled()) 298 { 299 lg2::warning( 300 "occsNotAllRunning: Restarting waitForAllOccTimer"); 301 // restart occ wait timer to check status after reset 302 // completes 303 waitForAllOccsTimer->restartOnce(60s); 304 } 305 } 306 } 307 queuedActiveState.clear(); 308 tracedSensorWait = false; 309 } 310 else 311 { 312 // Not all sensors were available, so keep waiting 313 if (!tracedSensorWait) 314 { 315 lg2::info( 316 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available"); 317 tracedSensorWait = true; 318 } 319 discoverTimer->restartOnce(10s); 320 } 321 } 322 #endif 323 findOCCsInDev()324 std::vector<int> Manager::findOCCsInDev() 325 { 326 std::vector<int> occs; 327 std::regex expr{R"(occ(\d+)$)"}; 328 329 for (auto& file : fs::directory_iterator("/dev")) 330 { 331 std::smatch match; 332 std::string path{file.path().string()}; 333 if (std::regex_search(path, match, expr)) 334 { 335 auto num = std::stoi(match[1].str()); 336 337 // /dev numbering starts at 1, ours starts at 0. 338 occs.push_back(num - 1); 339 } 340 } 341 342 return occs; 343 } 344 cpuCreated(sdbusplus::message_t & msg)345 int Manager::cpuCreated(sdbusplus::message_t& msg) 346 { 347 namespace fs = std::filesystem; 348 349 sdbusplus::message::object_path o; 350 msg.read(o); 351 fs::path cpuPath(std::string(std::move(o))); 352 353 auto name = cpuPath.filename().string(); 354 auto index = name.find(CPU_NAME); 355 name.replace(index, std::strlen(CPU_NAME), OCC_NAME); 356 357 createObjects(name); 358 359 return 0; 360 } 361 createObjects(const std::string & occ)362 void Manager::createObjects(const std::string& occ) 363 { 364 auto path = fs::path(OCC_CONTROL_ROOT) / occ; 365 366 statusObjects.emplace_back(std::make_unique<Status>( 367 event, path.c_str(), *this, 368 #ifdef POWER10 369 pmode, 370 #endif 371 std::bind(std::mem_fn(&Manager::statusCallBack), this, 372 std::placeholders::_1, std::placeholders::_2) 373 #ifdef PLDM 374 , 375 // Callback will set flag indicating reset needs to be done 376 // instead of immediately issuing a reset via PLDM. 377 std::bind(std::mem_fn(&Manager::resetOccRequest), this, 378 std::placeholders::_1) 379 #endif 380 )); 381 382 // Create the power cap monitor object 383 if (!pcap) 384 { 385 pcap = std::make_unique<open_power::occ::powercap::PowerCap>( 386 *statusObjects.back()); 387 } 388 389 if (statusObjects.back()->isMasterOcc()) 390 { 391 lg2::info("Manager::createObjects(): OCC{INST} is the master", "INST", 392 statusObjects.back()->getOccInstanceID()); 393 _pollTimer->setEnabled(false); 394 395 #ifdef POWER10 396 // Set the master OCC on the PowerMode object 397 pmode->setMasterOcc(path); 398 #endif 399 } 400 401 passThroughObjects.emplace_back(std::make_unique<PassThrough>( 402 path.c_str() 403 #ifdef POWER10 404 , 405 pmode 406 #endif 407 )); 408 } 409 410 // If a reset is not already outstanding, set a flag to indicate that a reset is 411 // needed. resetOccRequest(instanceID instance)412 void Manager::resetOccRequest(instanceID instance) 413 { 414 if (!resetRequired) 415 { 416 resetRequired = true; 417 resetInstance = instance; 418 lg2::error( 419 "resetOccRequest: PM Complex reset was requested due to OCC{INST}", 420 "INST", instance); 421 } 422 else if (instance != resetInstance) 423 { 424 lg2::warning( 425 "resetOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already outstanding for OCC{RINST}", 426 "INST", instance, "RINST", resetInstance); 427 } 428 } 429 430 // If a reset has not been started, initiate an OCC reset via PLDM initiateOccRequest(instanceID instance)431 void Manager::initiateOccRequest(instanceID instance) 432 { 433 if (!resetInProgress) 434 { 435 resetInProgress = true; 436 resetInstance = instance; 437 lg2::error( 438 "initiateOccRequest: Initiating PM Complex reset due to OCC{INST}", 439 "INST", instance); 440 #ifdef PLDM 441 pldmHandle->resetOCC(instance); 442 #endif 443 resetRequired = false; 444 } 445 else 446 { 447 lg2::warning( 448 "initiateOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already in process for OCC{RINST}", 449 "INST", instance, "RINST", resetInstance); 450 } 451 } 452 statusCallBack(instanceID instance,bool status)453 void Manager::statusCallBack(instanceID instance, bool status) 454 { 455 if (status == true) 456 { 457 if (resetInProgress) 458 { 459 lg2::info( 460 "statusCallBack: Ignoring OCC{INST} activate because a reset has been initiated due to OCC{INST}", 461 "INST", instance, "RINST", resetInstance); 462 return; 463 } 464 465 // OCC went active 466 ++activeCount; 467 468 #ifdef POWER10 469 if (activeCount == 1) 470 { 471 // First OCC went active (allow some time for all OCCs to go active) 472 waitForAllOccsTimer->restartOnce(60s); 473 } 474 #endif 475 476 if (activeCount == statusObjects.size()) 477 { 478 #ifdef POWER10 479 // All OCCs are now running 480 if (waitForAllOccsTimer->isEnabled()) 481 { 482 // stop occ wait timer 483 waitForAllOccsTimer->setEnabled(false); 484 } 485 486 // All OCCs have been found, check if we need a reset 487 if (resetRequired) 488 { 489 initiateOccRequest(resetInstance); 490 491 if (!waitForAllOccsTimer->isEnabled()) 492 { 493 lg2::warning( 494 "occsNotAllRunning: Restarting waitForAllOccTimer"); 495 // restart occ wait timer 496 waitForAllOccsTimer->restartOnce(60s); 497 } 498 } 499 else 500 { 501 // Verify master OCC and start presence monitor 502 validateOccMaster(); 503 } 504 #else 505 // Verify master OCC and start presence monitor 506 validateOccMaster(); 507 #endif 508 } 509 510 // Start poll timer if not already started 511 if (!_pollTimer->isEnabled()) 512 { 513 lg2::info("Manager: OCCs will be polled every {TIME} seconds", 514 "TIME", pollInterval); 515 516 // Send poll and start OCC poll timer 517 pollerTimerExpired(); 518 } 519 } 520 else 521 { 522 // OCC went away 523 if (activeCount > 0) 524 { 525 --activeCount; 526 } 527 else 528 { 529 lg2::info("OCC{INST} disabled, but currently no active OCCs", 530 "INST", instance); 531 } 532 533 if (activeCount == 0) 534 { 535 // No OCCs are running 536 537 if (resetInProgress) 538 { 539 // All OCC active sensors are clear (reset should be in 540 // progress) 541 lg2::info( 542 "statusCallBack: Clearing resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})", 543 "COUNT", activeCount, "INST", instance, "STATUS", status); 544 resetInProgress = false; 545 resetInstance = 255; 546 } 547 548 // Stop OCC poll timer 549 if (_pollTimer->isEnabled()) 550 { 551 lg2::info( 552 "Manager::statusCallBack(): OCCs are not running, stopping poll timer"); 553 _pollTimer->setEnabled(false); 554 } 555 556 #ifdef POWER10 557 // stop wait timer 558 if (waitForAllOccsTimer->isEnabled()) 559 { 560 waitForAllOccsTimer->setEnabled(false); 561 } 562 #endif 563 } 564 else if (resetInProgress) 565 { 566 lg2::info( 567 "statusCallBack: Skipping clear of resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})", 568 "COUNT", activeCount, "INST", instance, "STATUS", status); 569 } 570 #ifdef READ_OCC_SENSORS 571 // Clear OCC sensors 572 setSensorValueToNaN(instance); 573 #endif 574 } 575 576 #ifdef POWER10 577 if (waitingForAllOccActiveSensors) 578 { 579 if (utils::isHostRunning()) 580 { 581 checkAllActiveSensors(); 582 } 583 } 584 #endif 585 } 586 587 #ifdef I2C_OCC initStatusObjects()588 void Manager::initStatusObjects() 589 { 590 // Make sure we have a valid path string 591 static_assert(sizeof(DEV_PATH) != 0); 592 593 auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH); 594 for (auto& name : deviceNames) 595 { 596 i2c_occ::i2cToDbus(name); 597 name = std::string(OCC_NAME) + '_' + name; 598 auto path = fs::path(OCC_CONTROL_ROOT) / name; 599 statusObjects.emplace_back( 600 std::make_unique<Status>(event, path.c_str(), *this)); 601 } 602 // The first device is master occ 603 pcap = std::make_unique<open_power::occ::powercap::PowerCap>( 604 *statusObjects.front()); 605 #ifdef POWER10 606 pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH, 607 powermode::PIPS_PATH); 608 // Set the master OCC on the PowerMode object 609 pmode->setMasterOcc(path); 610 #endif 611 } 612 #endif 613 614 #ifdef PLDM sbeTimeout(unsigned int instance)615 void Manager::sbeTimeout(unsigned int instance) 616 { 617 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 618 [instance](const auto& obj) { 619 return instance == obj->getOccInstanceID(); 620 }); 621 622 if (obj != statusObjects.end() && (*obj)->occActive()) 623 { 624 lg2::info("SBE timeout, requesting HRESET (OCC{INST})", "INST", 625 instance); 626 627 #ifdef PHAL_SUPPORT 628 setSBEState(instance, SBE_STATE_NOT_USABLE); 629 #endif 630 631 pldmHandle->sendHRESET(instance); 632 } 633 } 634 updateOCCActive(instanceID instance,bool status)635 bool Manager::updateOCCActive(instanceID instance, bool status) 636 { 637 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 638 [instance](const auto& obj) { 639 return instance == obj->getOccInstanceID(); 640 }); 641 642 const bool hostRunning = open_power::occ::utils::isHostRunning(); 643 if (obj != statusObjects.end()) 644 { 645 if (!hostRunning && (status == true)) 646 { 647 lg2::warning( 648 "updateOCCActive: Host is not running yet (OCC{INST} active={STAT}), clearing sensor received", 649 "INST", instance, "STAT", status); 650 (*obj)->setPldmSensorReceived(false); 651 if (!waitingForAllOccActiveSensors) 652 { 653 lg2::info( 654 "updateOCCActive: Waiting for Host and all OCC Active Sensors"); 655 waitingForAllOccActiveSensors = true; 656 } 657 #ifdef POWER10 658 discoverTimer->restartOnce(30s); 659 #endif 660 return false; 661 } 662 else 663 { 664 (*obj)->setPldmSensorReceived(true); 665 return (*obj)->occActive(status); 666 } 667 } 668 else 669 { 670 if (hostRunning) 671 { 672 lg2::warning( 673 "updateOCCActive: No status object to update for OCC{INST} (active={STAT})", 674 "INST", instance, "STAT", status); 675 } 676 else 677 { 678 if (status == true) 679 { 680 lg2::warning( 681 "updateOCCActive: No status objects and Host is not running yet (OCC{INST} active={STAT})", 682 "INST", instance, "STAT", status); 683 } 684 } 685 if (status == true) 686 { 687 // OCC went active 688 queuedActiveState.insert(instance); 689 } 690 else 691 { 692 auto match = queuedActiveState.find(instance); 693 if (match != queuedActiveState.end()) 694 { 695 // OCC was disabled 696 queuedActiveState.erase(match); 697 } 698 } 699 return false; 700 } 701 } 702 703 // Called upon pldm event To set powermode Safe Mode State for system. updateOccSafeMode(bool safeMode)704 void Manager::updateOccSafeMode(bool safeMode) 705 { 706 #ifdef POWER10 707 pmode->updateDbusSafeMode(safeMode); 708 #endif 709 // Update the processor throttle status on dbus 710 for (auto& obj : statusObjects) 711 { 712 obj->updateThrottle(safeMode, THROTTLED_SAFE); 713 } 714 } 715 sbeHRESETResult(instanceID instance,bool success)716 void Manager::sbeHRESETResult(instanceID instance, bool success) 717 { 718 if (success) 719 { 720 lg2::info("HRESET succeeded (OCC{INST})", "INST", instance); 721 722 #ifdef PHAL_SUPPORT 723 setSBEState(instance, SBE_STATE_BOOTED); 724 #endif 725 726 return; 727 } 728 729 #ifdef PHAL_SUPPORT 730 setSBEState(instance, SBE_STATE_FAILED); 731 732 if (sbeCanDump(instance)) 733 { 734 lg2::info("HRESET failed (OCC{INST}), triggering SBE dump", "INST", 735 instance); 736 737 auto& bus = utils::getBus(); 738 uint32_t src6 = instance << 16; 739 uint32_t logId = 740 FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout", 741 src6, "SBE command timeout"); 742 743 try 744 { 745 constexpr auto interface = "xyz.openbmc_project.Dump.Create"; 746 constexpr auto function = "CreateDump"; 747 748 std::string service = 749 utils::getService(OP_DUMP_OBJ_PATH, interface); 750 auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH, 751 interface, function); 752 753 std::map<std::string, std::variant<std::string, uint64_t>> 754 createParams{ 755 {"com.ibm.Dump.Create.CreateParameters.ErrorLogId", 756 uint64_t(logId)}, 757 {"com.ibm.Dump.Create.CreateParameters.DumpType", 758 "com.ibm.Dump.Create.DumpType.SBE"}, 759 {"com.ibm.Dump.Create.CreateParameters.FailingUnitId", 760 uint64_t(instance)}, 761 }; 762 763 method.append(createParams); 764 765 auto response = bus.call(method); 766 } 767 catch (const sdbusplus::exception_t& e) 768 { 769 constexpr auto ERROR_DUMP_DISABLED = 770 "xyz.openbmc_project.Dump.Create.Error.Disabled"; 771 if (e.name() == ERROR_DUMP_DISABLED) 772 { 773 lg2::info("Dump is disabled, skipping"); 774 } 775 else 776 { 777 lg2::error("Dump failed"); 778 } 779 } 780 } 781 #endif 782 783 // SBE Reset failed, try PM Complex reset 784 lg2::error("sbeHRESETResult: Forcing PM Complex reset"); 785 resetOccRequest(instance); 786 } 787 788 #ifdef PHAL_SUPPORT sbeCanDump(unsigned int instance)789 bool Manager::sbeCanDump(unsigned int instance) 790 { 791 struct pdbg_target* proc = getPdbgTarget(instance); 792 793 if (!proc) 794 { 795 // allow the dump in the error case 796 return true; 797 } 798 799 try 800 { 801 if (!openpower::phal::sbe::isDumpAllowed(proc)) 802 { 803 return false; 804 } 805 806 if (openpower::phal::pdbg::isSbeVitalAttnActive(proc)) 807 { 808 return false; 809 } 810 } 811 catch (openpower::phal::exception::SbeError& e) 812 { 813 lg2::info("Failed to query SBE state"); 814 } 815 816 // allow the dump in the error case 817 return true; 818 } 819 setSBEState(unsigned int instance,enum sbe_state state)820 void Manager::setSBEState(unsigned int instance, enum sbe_state state) 821 { 822 struct pdbg_target* proc = getPdbgTarget(instance); 823 824 if (!proc) 825 { 826 return; 827 } 828 829 try 830 { 831 openpower::phal::sbe::setState(proc, state); 832 } 833 catch (const openpower::phal::exception::SbeError& e) 834 { 835 lg2::error("Failed to set SBE state: {ERROR}", "ERROR", e.what()); 836 } 837 } 838 getPdbgTarget(unsigned int instance)839 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance) 840 { 841 if (!pdbgInitialized) 842 { 843 try 844 { 845 openpower::phal::pdbg::init(); 846 pdbgInitialized = true; 847 } 848 catch (const openpower::phal::exception::PdbgError& e) 849 { 850 lg2::error("pdbg initialization failed"); 851 return nullptr; 852 } 853 } 854 855 struct pdbg_target* proc = nullptr; 856 pdbg_for_each_class_target("proc", proc) 857 { 858 if (pdbg_target_index(proc) == instance) 859 { 860 return proc; 861 } 862 } 863 864 lg2::error("Failed to get pdbg target"); 865 return nullptr; 866 } 867 #endif 868 #endif 869 pollerTimerExpired()870 void Manager::pollerTimerExpired() 871 { 872 if (!_pollTimer) 873 { 874 lg2::error("pollerTimerExpired() ERROR: Timer not defined"); 875 return; 876 } 877 878 #ifdef POWER10 879 if (resetRequired) 880 { 881 lg2::error("pollerTimerExpired() - Initiating PM Complex reset"); 882 initiateOccRequest(resetInstance); 883 884 if (!waitForAllOccsTimer->isEnabled()) 885 { 886 lg2::warning("pollerTimerExpired: Restarting waitForAllOccTimer"); 887 // restart occ wait timer 888 waitForAllOccsTimer->restartOnce(60s); 889 } 890 return; 891 } 892 #endif 893 894 for (auto& obj : statusObjects) 895 { 896 if (!obj->occActive()) 897 { 898 // OCC is not running yet 899 #ifdef READ_OCC_SENSORS 900 auto id = obj->getOccInstanceID(); 901 setSensorValueToNaN(id); 902 #endif 903 continue; 904 } 905 906 // Read sysfs to force kernel to poll OCC 907 obj->readOccState(); 908 909 #ifdef READ_OCC_SENSORS 910 // Read occ sensor values 911 getSensorValues(obj); 912 #endif 913 } 914 915 if (activeCount > 0) 916 { 917 // Restart OCC poll timer 918 _pollTimer->restartOnce(std::chrono::seconds(pollInterval)); 919 } 920 else 921 { 922 // No OCCs running, so poll timer will not be restarted 923 lg2::info( 924 "Manager::pollerTimerExpired: poll timer will not be restarted"); 925 } 926 } 927 928 #ifdef READ_OCC_SENSORS readTempSensors(const fs::path & path,uint32_t occInstance)929 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance) 930 { 931 // There may be more than one sensor with the same FRU type 932 // and label so make two passes: the first to read the temps 933 // from sysfs, and the second to put them on D-Bus after 934 // resolving any conflicts. 935 std::map<std::string, double> sensorData; 936 937 std::regex expr{"temp\\d+_label$"}; // Example: temp5_label 938 for (auto& file : fs::directory_iterator(path)) 939 { 940 if (!std::regex_search(file.path().string(), expr)) 941 { 942 continue; 943 } 944 945 uint32_t labelValue{0}; 946 947 try 948 { 949 labelValue = readFile<uint32_t>(file.path()); 950 } 951 catch (const std::system_error& e) 952 { 953 lg2::debug( 954 "readTempSensors: Failed reading {PATH}, errno = {ERROR}", 955 "PATH", file.path().string(), "ERROR", e.code().value()); 956 continue; 957 } 958 959 const std::string& tempLabel = "label"; 960 const std::string filePathString = file.path().string().substr( 961 0, file.path().string().length() - tempLabel.length()); 962 963 uint32_t fruTypeValue{0}; 964 try 965 { 966 fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix); 967 } 968 catch (const std::system_error& e) 969 { 970 lg2::debug( 971 "readTempSensors: Failed reading {PATH}, errno = {ERROR}", 972 "PATH", filePathString + fruTypeSuffix, "ERROR", 973 e.code().value()); 974 continue; 975 } 976 977 std::string sensorPath = 978 OCC_SENSORS_ROOT + std::string("/temperature/"); 979 980 std::string dvfsTempPath; 981 982 if (fruTypeValue == VRMVdd) 983 { 984 sensorPath.append( 985 "vrm_vdd" + std::to_string(occInstance) + "_temp"); 986 } 987 else if (fruTypeValue == processorIoRing) 988 { 989 sensorPath.append( 990 "proc" + std::to_string(occInstance) + "_ioring_temp"); 991 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" + 992 std::to_string(occInstance) + "_ioring_dvfs_temp"; 993 } 994 else 995 { 996 uint16_t type = (labelValue & 0xFF000000) >> 24; 997 uint16_t instanceID = labelValue & 0x0000FFFF; 998 999 if (type == OCC_DIMM_TEMP_SENSOR_TYPE) 1000 { 1001 if (fruTypeValue == fruTypeNotAvailable) 1002 { 1003 // Not all DIMM related temps are available to read 1004 // (no _input file in this case) 1005 continue; 1006 } 1007 auto iter = dimmTempSensorName.find(fruTypeValue); 1008 if (iter == dimmTempSensorName.end()) 1009 { 1010 lg2::error( 1011 "readTempSensors: Fru type error! fruTypeValue = {FRU}) ", 1012 "FRU", fruTypeValue); 1013 continue; 1014 } 1015 1016 sensorPath.append( 1017 "dimm" + std::to_string(instanceID) + iter->second); 1018 1019 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" + 1020 dimmDVFSSensorName.at(fruTypeValue); 1021 } 1022 else if (type == OCC_CPU_TEMP_SENSOR_TYPE) 1023 { 1024 if (fruTypeValue == processorCore) 1025 { 1026 // The OCC reports small core temps, of which there are 1027 // two per big core. All current P10 systems are in big 1028 // core mode, so use a big core name. 1029 uint16_t coreNum = instanceID / 2; 1030 uint16_t tempNum = instanceID % 2; 1031 sensorPath.append("proc" + std::to_string(occInstance) + 1032 "_core" + std::to_string(coreNum) + "_" + 1033 std::to_string(tempNum) + "_temp"); 1034 1035 dvfsTempPath = 1036 std::string{OCC_SENSORS_ROOT} + "/temperature/proc" + 1037 std::to_string(occInstance) + "_core_dvfs_temp"; 1038 } 1039 else 1040 { 1041 continue; 1042 } 1043 } 1044 else 1045 { 1046 continue; 1047 } 1048 } 1049 1050 // The dvfs temp file only needs to be read once per chip per type. 1051 if (!dvfsTempPath.empty() && 1052 !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath)) 1053 { 1054 try 1055 { 1056 auto dvfsValue = readFile<double>(filePathString + maxSuffix); 1057 1058 dbus::OccDBusSensors::getOccDBus().setDvfsTemp( 1059 dvfsTempPath, dvfsValue * std::pow(10, -3)); 1060 } 1061 catch (const std::system_error& e) 1062 { 1063 lg2::debug( 1064 "readTempSensors: Failed reading {PATH}, errno = {ERROR}", 1065 "PATH", filePathString + maxSuffix, "ERROR", 1066 e.code().value()); 1067 } 1068 } 1069 1070 uint32_t faultValue{0}; 1071 try 1072 { 1073 faultValue = readFile<uint32_t>(filePathString + faultSuffix); 1074 } 1075 catch (const std::system_error& e) 1076 { 1077 lg2::debug( 1078 "readTempSensors: Failed reading {PATH}, errno = {ERROR}", 1079 "PATH", filePathString + faultSuffix, "ERROR", 1080 e.code().value()); 1081 continue; 1082 } 1083 1084 double tempValue{0}; 1085 // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1. 1086 if (faultValue != 0) 1087 { 1088 tempValue = std::numeric_limits<double>::quiet_NaN(); 1089 } 1090 else 1091 { 1092 // Read the temperature 1093 try 1094 { 1095 tempValue = readFile<double>(filePathString + inputSuffix); 1096 } 1097 catch (const std::system_error& e) 1098 { 1099 lg2::debug( 1100 "readTempSensors: Failed reading {PATH}, errno = {ERROR}", 1101 "PATH", filePathString + inputSuffix, "ERROR", 1102 e.code().value()); 1103 1104 // if errno == EAGAIN(Resource temporarily unavailable) then set 1105 // temp to 0, to avoid using old temp, and affecting FAN 1106 // Control. 1107 if (e.code().value() == EAGAIN) 1108 { 1109 tempValue = 0; 1110 } 1111 // else the errno would be something like 1112 // EBADF(Bad file descriptor) 1113 // or ENOENT(No such file or directory) 1114 else 1115 { 1116 continue; 1117 } 1118 } 1119 } 1120 1121 // If this object path already has a value, only overwite 1122 // it if the previous one was an NaN or a smaller value. 1123 auto existing = sensorData.find(sensorPath); 1124 if (existing != sensorData.end()) 1125 { 1126 // Multiple sensors found for this FRU type 1127 if ((std::isnan(existing->second) && (tempValue == 0)) || 1128 ((existing->second == 0) && std::isnan(tempValue))) 1129 { 1130 // One of the redundant sensors has failed (0xFF/nan), and the 1131 // other sensor has no reading (0), so set the FRU to NaN to 1132 // force fan increase 1133 tempValue = std::numeric_limits<double>::quiet_NaN(); 1134 existing->second = tempValue; 1135 } 1136 if (std::isnan(existing->second) || (tempValue > existing->second)) 1137 { 1138 existing->second = tempValue; 1139 } 1140 } 1141 else 1142 { 1143 // First sensor for this FRU type 1144 sensorData[sensorPath] = tempValue; 1145 } 1146 } 1147 1148 // Now publish the values on D-Bus. 1149 for (const auto& [objectPath, value] : sensorData) 1150 { 1151 dbus::OccDBusSensors::getOccDBus().setValue(objectPath, 1152 value * std::pow(10, -3)); 1153 1154 dbus::OccDBusSensors::getOccDBus().setOperationalStatus( 1155 objectPath, !std::isnan(value)); 1156 1157 if (existingSensors.find(objectPath) == existingSensors.end()) 1158 { 1159 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 1160 objectPath, {"all_sensors"}); 1161 } 1162 1163 existingSensors[objectPath] = occInstance; 1164 } 1165 } 1166 1167 std::optional<std::string> getPowerLabelFunctionID(const std::string & value)1168 Manager::getPowerLabelFunctionID(const std::string& value) 1169 { 1170 // If the value is "system", then the FunctionID is "system". 1171 if (value == "system") 1172 { 1173 return value; 1174 } 1175 1176 // If the value is not "system", then the label value have 3 numbers, of 1177 // which we only care about the middle one: 1178 // <sensor id>_<function id>_<apss channel> 1179 // eg: The value is "0_10_5" , then the FunctionID is "10". 1180 if (value.find("_") == std::string::npos) 1181 { 1182 return std::nullopt; 1183 } 1184 1185 auto powerLabelValue = value.substr((value.find("_") + 1)); 1186 1187 if (powerLabelValue.find("_") == std::string::npos) 1188 { 1189 return std::nullopt; 1190 } 1191 1192 return powerLabelValue.substr(0, powerLabelValue.find("_")); 1193 } 1194 readPowerSensors(const fs::path & path,uint32_t id)1195 void Manager::readPowerSensors(const fs::path& path, uint32_t id) 1196 { 1197 std::regex expr{"power\\d+_label$"}; // Example: power5_label 1198 for (auto& file : fs::directory_iterator(path)) 1199 { 1200 if (!std::regex_search(file.path().string(), expr)) 1201 { 1202 continue; 1203 } 1204 1205 std::string labelValue; 1206 try 1207 { 1208 labelValue = readFile<std::string>(file.path()); 1209 } 1210 catch (const std::system_error& e) 1211 { 1212 lg2::debug( 1213 "readPowerSensors: Failed reading {PATH}, errno = {ERROR}", 1214 "PATH", file.path().string(), "ERROR", e.code().value()); 1215 continue; 1216 } 1217 1218 auto functionID = getPowerLabelFunctionID(labelValue); 1219 if (functionID == std::nullopt) 1220 { 1221 continue; 1222 } 1223 1224 const std::string& tempLabel = "label"; 1225 const std::string filePathString = file.path().string().substr( 1226 0, file.path().string().length() - tempLabel.length()); 1227 1228 std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/"); 1229 1230 auto iter = powerSensorName.find(*functionID); 1231 if (iter == powerSensorName.end()) 1232 { 1233 continue; 1234 } 1235 sensorPath.append(iter->second); 1236 1237 double tempValue{0}; 1238 1239 try 1240 { 1241 tempValue = readFile<double>(filePathString + inputSuffix); 1242 } 1243 catch (const std::system_error& e) 1244 { 1245 lg2::debug( 1246 "readPowerSensors: Failed reading {PATH}, errno = {ERROR}", 1247 "PATH", filePathString + inputSuffix, "ERROR", 1248 e.code().value()); 1249 continue; 1250 } 1251 1252 dbus::OccDBusSensors::getOccDBus().setUnit( 1253 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts"); 1254 1255 dbus::OccDBusSensors::getOccDBus().setValue( 1256 sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3)); 1257 1258 dbus::OccDBusSensors::getOccDBus().setOperationalStatus( 1259 sensorPath, true); 1260 1261 if (existingSensors.find(sensorPath) == existingSensors.end()) 1262 { 1263 std::vector<int> occs; 1264 std::vector<std::string> fTypeList = {"all_sensors"}; 1265 if (iter->second == "total_power") 1266 { 1267 // Total system power has its own chassis association 1268 fTypeList.push_back("total_power"); 1269 } 1270 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 1271 sensorPath, fTypeList); 1272 } 1273 1274 existingSensors[sensorPath] = id; 1275 } 1276 return; 1277 } 1278 setSensorValueToNaN(uint32_t id) const1279 void Manager::setSensorValueToNaN(uint32_t id) const 1280 { 1281 for (const auto& [sensorPath, occId] : existingSensors) 1282 { 1283 if (occId == id) 1284 { 1285 dbus::OccDBusSensors::getOccDBus().setValue( 1286 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1287 1288 dbus::OccDBusSensors::getOccDBus().setOperationalStatus( 1289 sensorPath, true); 1290 } 1291 } 1292 return; 1293 } 1294 setSensorValueToNonFunctional(uint32_t id) const1295 void Manager::setSensorValueToNonFunctional(uint32_t id) const 1296 { 1297 for (const auto& [sensorPath, occId] : existingSensors) 1298 { 1299 if (occId == id) 1300 { 1301 dbus::OccDBusSensors::getOccDBus().setValue( 1302 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1303 1304 dbus::OccDBusSensors::getOccDBus().setOperationalStatus( 1305 sensorPath, false); 1306 } 1307 } 1308 return; 1309 } 1310 getSensorValues(std::unique_ptr<Status> & occ)1311 void Manager::getSensorValues(std::unique_ptr<Status>& occ) 1312 { 1313 static bool tracedError[8] = {0}; 1314 const fs::path sensorPath = occ->getHwmonPath(); 1315 const uint32_t id = occ->getOccInstanceID(); 1316 1317 if (fs::exists(sensorPath)) 1318 { 1319 // Read temperature sensors 1320 readTempSensors(sensorPath, id); 1321 1322 if (occ->isMasterOcc()) 1323 { 1324 // Read power sensors 1325 readPowerSensors(sensorPath, id); 1326 } 1327 tracedError[id] = false; 1328 } 1329 else 1330 { 1331 if (!tracedError[id]) 1332 { 1333 lg2::error( 1334 "Manager::getSensorValues: OCC{INST} sensor path missing: {PATH}", 1335 "INST", id, "PATH", sensorPath); 1336 tracedError[id] = true; 1337 } 1338 } 1339 1340 return; 1341 } 1342 #endif 1343 1344 // Read the altitude from DBus readAltitude()1345 void Manager::readAltitude() 1346 { 1347 static bool traceAltitudeErr = true; 1348 1349 utils::PropertyValue altitudeProperty{}; 1350 try 1351 { 1352 altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE, 1353 ALTITUDE_PROP); 1354 auto sensorVal = std::get<double>(altitudeProperty); 1355 if (sensorVal < 0xFFFF) 1356 { 1357 if (sensorVal < 0) 1358 { 1359 altitude = 0; 1360 } 1361 else 1362 { 1363 // Round to nearest meter 1364 altitude = uint16_t(sensorVal + 0.5); 1365 } 1366 lg2::debug("readAltitude: sensor={VALUE} ({ALT}m)", "VALUE", 1367 sensorVal, "ALT", altitude); 1368 traceAltitudeErr = true; 1369 } 1370 else 1371 { 1372 if (traceAltitudeErr) 1373 { 1374 traceAltitudeErr = false; 1375 lg2::debug("Invalid altitude value: {ALT}", "ALT", sensorVal); 1376 } 1377 } 1378 } 1379 catch (const sdbusplus::exception_t& e) 1380 { 1381 if (traceAltitudeErr) 1382 { 1383 traceAltitudeErr = false; 1384 lg2::info("Unable to read Altitude: {ERROR}", "ERROR", e.what()); 1385 } 1386 altitude = 0xFFFF; // not available 1387 } 1388 } 1389 1390 // Callback function when ambient temperature changes ambientCallback(sdbusplus::message_t & msg)1391 void Manager::ambientCallback(sdbusplus::message_t& msg) 1392 { 1393 double currentTemp = 0; 1394 uint8_t truncatedTemp = 0xFF; 1395 std::string msgSensor; 1396 std::map<std::string, std::variant<double>> msgData; 1397 msg.read(msgSensor, msgData); 1398 1399 auto valPropMap = msgData.find(AMBIENT_PROP); 1400 if (valPropMap == msgData.end()) 1401 { 1402 lg2::debug("ambientCallback: Unknown ambient property changed"); 1403 return; 1404 } 1405 currentTemp = std::get<double>(valPropMap->second); 1406 if (std::isnan(currentTemp)) 1407 { 1408 truncatedTemp = 0xFF; 1409 } 1410 else 1411 { 1412 if (currentTemp < 0) 1413 { 1414 truncatedTemp = 0; 1415 } 1416 else 1417 { 1418 // Round to nearest degree C 1419 truncatedTemp = uint8_t(currentTemp + 0.5); 1420 } 1421 } 1422 1423 // If ambient changes, notify OCCs 1424 if (truncatedTemp != ambient) 1425 { 1426 lg2::debug("ambientCallback: Ambient change from {OLD} to {NEW}C", 1427 "OLD", ambient, "NEW", currentTemp); 1428 1429 ambient = truncatedTemp; 1430 if (altitude == 0xFFFF) 1431 { 1432 // No altitude yet, try reading again 1433 readAltitude(); 1434 } 1435 1436 lg2::debug("ambientCallback: Ambient: {TEMP}C, altitude: {ALT}m", 1437 "TEMP", ambient, "ALT", altitude); 1438 #ifdef POWER10 1439 // Send ambient and altitude to all OCCs 1440 for (auto& obj : statusObjects) 1441 { 1442 if (obj->occActive()) 1443 { 1444 obj->sendAmbient(ambient, altitude); 1445 } 1446 } 1447 #endif // POWER10 1448 } 1449 } 1450 1451 // return the current ambient and altitude readings getAmbientData(bool & ambientValid,uint8_t & ambientTemp,uint16_t & altitudeValue) const1452 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp, 1453 uint16_t& altitudeValue) const 1454 { 1455 ambientValid = true; 1456 ambientTemp = ambient; 1457 altitudeValue = altitude; 1458 1459 if (ambient == 0xFF) 1460 { 1461 ambientValid = false; 1462 } 1463 } 1464 1465 #ifdef POWER10 1466 // Called when waitForAllOccsTimer expires 1467 // After the first OCC goes active, this timer will be started (60 seconds) occsNotAllRunning()1468 void Manager::occsNotAllRunning() 1469 { 1470 if (resetInProgress) 1471 { 1472 lg2::warning( 1473 "occsNotAllRunning: Ignoring waitForAllOccsTimer because reset is in progress"); 1474 return; 1475 } 1476 if (activeCount != statusObjects.size()) 1477 { 1478 // Not all OCCs went active 1479 lg2::warning( 1480 "occsNotAllRunning: Active OCC count ({COUNT}) does not match expected count ({EXP})", 1481 "COUNT", activeCount, "EXP", statusObjects.size()); 1482 // Procs may be garded, so may be expected 1483 } 1484 1485 if (resetRequired) 1486 { 1487 initiateOccRequest(resetInstance); 1488 1489 if (!waitForAllOccsTimer->isEnabled()) 1490 { 1491 lg2::warning("occsNotAllRunning: Restarting waitForAllOccTimer"); 1492 // restart occ wait timer 1493 waitForAllOccsTimer->restartOnce(60s); 1494 } 1495 } 1496 else 1497 { 1498 validateOccMaster(); 1499 } 1500 } 1501 1502 #ifdef PLDM 1503 // Called when throttlePldmTraceTimer expires. 1504 // If this timer expires, that indicates there are no OCC active sensor PDRs 1505 // found which will trigger pldm traces to be throttled. 1506 // The second time this timer expires, a PEL will get created. throttlePldmTraceExpired()1507 void Manager::throttlePldmTraceExpired() 1508 { 1509 if (utils::isHostRunning()) 1510 { 1511 if (!onPldmTimeoutCreatePel) 1512 { 1513 // Throttle traces 1514 pldmHandle->setTraceThrottle(true); 1515 // Restart timer to log a PEL when timer expires 1516 onPldmTimeoutCreatePel = true; 1517 throttlePldmTraceTimer->restartOnce(40min); 1518 } 1519 else 1520 { 1521 lg2::error( 1522 "throttlePldmTraceExpired(): OCC active sensors still not available!"); 1523 // Create PEL 1524 createPldmSensorPEL(); 1525 } 1526 } 1527 else 1528 { 1529 // Make sure traces are not throttled 1530 pldmHandle->setTraceThrottle(false); 1531 lg2::info( 1532 "throttlePldmTraceExpired(): host it not running ignoring sensor timer"); 1533 } 1534 } 1535 createPldmSensorPEL()1536 void Manager::createPldmSensorPEL() 1537 { 1538 Error::Descriptor d = Error::Descriptor(MISSING_OCC_SENSORS_PATH); 1539 std::map<std::string, std::string> additionalData; 1540 1541 additionalData.emplace("_PID", std::to_string(getpid())); 1542 1543 lg2::info( 1544 "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs"); 1545 1546 auto& bus = utils::getBus(); 1547 1548 try 1549 { 1550 FFDCFiles ffdc; 1551 // Add occ-control journal traces to PEL FFDC 1552 auto occJournalFile = 1553 FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40); 1554 1555 static constexpr auto loggingObjectPath = 1556 "/xyz/openbmc_project/logging"; 1557 static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL"; 1558 std::string service = 1559 utils::getService(loggingObjectPath, opLoggingInterface); 1560 auto method = 1561 bus.new_method_call(service.c_str(), loggingObjectPath, 1562 opLoggingInterface, "CreatePELWithFFDCFiles"); 1563 1564 // Set level to Warning (Predictive). 1565 auto level = 1566 sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage( 1567 sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level:: 1568 Warning); 1569 1570 method.append(d.path, level, additionalData, ffdc); 1571 bus.call(method); 1572 } 1573 catch (const sdbusplus::exception_t& e) 1574 { 1575 lg2::error("Failed to create MISSING_OCC_SENSORS PEL: {ERROR}", "ERROR", 1576 e.what()); 1577 } 1578 } 1579 #endif // PLDM 1580 #endif // POWER10 1581 1582 // Verify single master OCC and start presence monitor validateOccMaster()1583 void Manager::validateOccMaster() 1584 { 1585 int masterInstance = -1; 1586 for (auto& obj : statusObjects) 1587 { 1588 auto instance = obj->getOccInstanceID(); 1589 #ifdef POWER10 1590 if (!obj->occActive()) 1591 { 1592 if (utils::isHostRunning()) 1593 { 1594 // Check if sensor was queued while waiting for discovery 1595 auto match = queuedActiveState.find(instance); 1596 if (match != queuedActiveState.end()) 1597 { 1598 queuedActiveState.erase(match); 1599 lg2::info("validateOccMaster: OCC{INST} is ACTIVE (queued)", 1600 "INST", instance); 1601 obj->occActive(true); 1602 } 1603 else 1604 { 1605 // OCC does not appear to be active yet, check active sensor 1606 #ifdef PLDM 1607 pldmHandle->checkActiveSensor(instance); 1608 #endif 1609 if (obj->occActive()) 1610 { 1611 lg2::info( 1612 "validateOccMaster: OCC{INST} is ACTIVE after reading sensor", 1613 "INST", instance); 1614 } 1615 } 1616 } 1617 else 1618 { 1619 lg2::warning( 1620 "validateOccMaster: HOST is not running (OCC{INST})", 1621 "INST", instance); 1622 return; 1623 } 1624 } 1625 #endif // POWER10 1626 1627 if (obj->isMasterOcc()) 1628 { 1629 obj->addPresenceWatchMaster(); 1630 1631 if (masterInstance == -1) 1632 { 1633 masterInstance = instance; 1634 } 1635 else 1636 { 1637 lg2::error( 1638 "validateOccMaster: Multiple OCC masters! ({MAST1} and {MAST2})", 1639 "MAST1", masterInstance, "MAST2", instance); 1640 // request reset 1641 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH)); 1642 } 1643 } 1644 } 1645 1646 if (masterInstance < 0) 1647 { 1648 lg2::error("validateOccMaster: Master OCC not found! (of {NUM} OCCs)", 1649 "NUM", statusObjects.size()); 1650 // request reset 1651 statusObjects.front()->deviceError( 1652 Error::Descriptor(PRESENCE_ERROR_PATH)); 1653 } 1654 else 1655 { 1656 lg2::info("validateOccMaster: OCC{INST} is master of {COUNT} OCCs", 1657 "INST", masterInstance, "COUNT", activeCount); 1658 #ifdef POWER10 1659 pmode->updateDbusSafeMode(false); 1660 #endif 1661 } 1662 } 1663 updatePcapBounds() const1664 void Manager::updatePcapBounds() const 1665 { 1666 if (pcap) 1667 { 1668 pcap->updatePcapBounds(); 1669 } 1670 } 1671 1672 } // namespace occ 1673 } // namespace open_power 1674