1 #include "config.h" 2 3 #include "occ_manager.hpp" 4 5 #include "occ_dbus.hpp" 6 #include "occ_errors.hpp" 7 #include "utils.hpp" 8 9 #include <nlohmann/json.hpp> 10 #include <phosphor-logging/elog-errors.hpp> 11 #include <phosphor-logging/lg2.hpp> 12 #include <xyz/openbmc_project/Common/error.hpp> 13 14 #include <chrono> 15 #include <cmath> 16 #include <filesystem> 17 #include <fstream> 18 #include <regex> 19 20 namespace open_power 21 { 22 namespace occ 23 { 24 25 constexpr uint32_t fruTypeNotAvailable = 0xFF; 26 constexpr auto fruTypeSuffix = "fru_type"; 27 constexpr auto faultSuffix = "fault"; 28 constexpr auto inputSuffix = "input"; 29 constexpr auto maxSuffix = "max"; 30 31 const auto HOST_ON_FILE = "/run/openbmc/host@0-on"; 32 const std::string Manager::dumpFile = "/tmp/occ_control_dump.json"; 33 34 using namespace phosphor::logging; 35 using namespace std::literals::chrono_literals; 36 using json = nlohmann::json; 37 38 template <typename T> 39 T readFile(const std::string& path) 40 { 41 std::ifstream ifs; 42 ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit | 43 std::ifstream::eofbit); 44 T data; 45 46 try 47 { 48 ifs.open(path); 49 ifs >> data; 50 ifs.close(); 51 } 52 catch (const std::exception& e) 53 { 54 auto err = errno; 55 throw std::system_error(err, std::generic_category()); 56 } 57 58 return data; 59 } 60 61 void Manager::createPldmHandle() 62 { 63 pldmHandle = std::make_unique<pldm::Interface>( 64 std::bind(std::mem_fn(&Manager::updateOCCActive), this, 65 std::placeholders::_1, std::placeholders::_2), 66 std::bind(std::mem_fn(&Manager::sbeHRESETResult), this, 67 std::placeholders::_1, std::placeholders::_2), 68 std::bind(std::mem_fn(&Manager::updateOccSafeMode), this, 69 std::placeholders::_1), 70 std::bind(std::mem_fn(&Manager::hostPoweredOff), this), event); 71 } 72 73 // findAndCreateObjects(): 74 // Takes care of getting the required objects created and 75 // finds the available devices/processors. 76 // (function is called everytime the discoverTimer expires) 77 // - create the PowerMode object to control OCC modes 78 // - create statusObjects for each OCC device found 79 // - waits for OCC Active sensors PDRs to become available 80 // - restart discoverTimer if all data is not available yet 81 void Manager::findAndCreateObjects() 82 { 83 if (!pmode) 84 { 85 // Create the power mode object 86 pmode = std::make_unique<powermode::PowerMode>( 87 *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event); 88 } 89 90 if (!fs::exists(HOST_ON_FILE)) 91 { 92 static bool statusObjCreated = false; 93 if (!statusObjCreated) 94 { 95 // Create the OCCs based on on the /dev/occX devices 96 auto occs = findOCCsInDev(); 97 98 if (occs.empty() || (prevOCCSearch.size() != occs.size())) 99 { 100 // Something changed or no OCCs yet, try again in 10s. 101 // Note on the first pass prevOCCSearch will be empty, 102 // so there will be at least one delay to give things 103 // a chance to settle. 104 prevOCCSearch = occs; 105 106 lg2::info( 107 "Manager::findAndCreateObjects(): Waiting for OCCs (currently {QTY})", 108 "QTY", occs.size()); 109 110 discoverTimer->restartOnce(10s); 111 } 112 else 113 { 114 // All OCCs appear to be available, create status objects 115 116 // createObjects requires OCC0 first. 117 std::sort(occs.begin(), occs.end()); 118 119 lg2::info( 120 "Manager::findAndCreateObjects(): Creating {QTY} OCC Status Objects", 121 "QTY", occs.size()); 122 for (auto id : occs) 123 { 124 createObjects(std::string(OCC_NAME) + std::to_string(id)); 125 } 126 statusObjCreated = true; 127 waitingForAllOccActiveSensors = true; 128 129 // Find/update the processor path associated with each OCC 130 for (auto& obj : statusObjects) 131 { 132 obj->updateProcAssociation(); 133 } 134 } 135 } 136 137 if (statusObjCreated && waitingForAllOccActiveSensors) 138 { 139 static bool tracedHostWait = false; 140 if (utils::isHostRunning()) 141 { 142 if (tracedHostWait) 143 { 144 lg2::info( 145 "Manager::findAndCreateObjects(): Host is running"); 146 tracedHostWait = false; 147 } 148 checkAllActiveSensors(); 149 } 150 else 151 { 152 if (!tracedHostWait) 153 { 154 lg2::info( 155 "Manager::findAndCreateObjects(): Waiting for host to start"); 156 tracedHostWait = true; 157 } 158 discoverTimer->restartOnce(30s); 159 160 if (throttlePldmTraceTimer->isEnabled()) 161 { 162 // Host is no longer running, disable throttle timer and 163 // make sure traces are not throttled 164 lg2::info("findAndCreateObjects(): disabling sensor timer"); 165 throttlePldmTraceTimer->setEnabled(false); 166 pldmHandle->setTraceThrottle(false); 167 } 168 } 169 } 170 } 171 else 172 { 173 lg2::info( 174 "Manager::findAndCreateObjects(): Waiting for {FILE} to complete...", 175 "FILE", HOST_ON_FILE); 176 discoverTimer->restartOnce(10s); 177 } 178 } 179 180 // Check if all occActive sensors are available 181 void Manager::checkAllActiveSensors() 182 { 183 static bool allActiveSensorAvailable = false; 184 static bool tracedSensorWait = false; 185 static bool waitingForHost = false; 186 187 if (open_power::occ::utils::isHostRunning()) 188 { 189 if (waitingForHost) 190 { 191 waitingForHost = false; 192 lg2::info("checkAllActiveSensors(): Host is now running"); 193 } 194 195 // Start with the assumption that all are available 196 allActiveSensorAvailable = true; 197 for (auto& obj : statusObjects) 198 { 199 if ((!obj->occActive()) && (!obj->getPldmSensorReceived())) 200 { 201 auto instance = obj->getOccInstanceID(); 202 // Check if sensor was queued while waiting for discovery 203 auto match = queuedActiveState.find(instance); 204 if (match != queuedActiveState.end()) 205 { 206 queuedActiveState.erase(match); 207 lg2::info( 208 "checkAllActiveSensors(): OCC{INST} is ACTIVE (queued)", 209 "INST", instance); 210 obj->occActive(true); 211 } 212 else 213 { 214 allActiveSensorAvailable = false; 215 if (!tracedSensorWait) 216 { 217 lg2::info( 218 "checkAllActiveSensors(): Waiting on OCC{INST} Active sensor", 219 "INST", instance); 220 tracedSensorWait = true; 221 // Make sure PLDM traces are not throttled 222 pldmHandle->setTraceThrottle(false); 223 // Start timer to throttle PLDM traces when timer 224 // expires 225 onPldmTimeoutCreatePel = false; 226 throttlePldmTraceTimer->restartOnce(5min); 227 } 228 // Ignore active sensor check if the OCCs are being reset 229 if (!resetInProgress) 230 { 231 pldmHandle->checkActiveSensor(obj->getOccInstanceID()); 232 } 233 break; 234 } 235 } 236 } 237 } 238 else 239 { 240 if (!waitingForHost) 241 { 242 waitingForHost = true; 243 lg2::info("checkAllActiveSensors(): Waiting for host to start"); 244 if (throttlePldmTraceTimer->isEnabled()) 245 { 246 // Host is no longer running, disable throttle timer and 247 // make sure traces are not throttled 248 lg2::info("checkAllActiveSensors(): disabling sensor timer"); 249 throttlePldmTraceTimer->setEnabled(false); 250 pldmHandle->setTraceThrottle(false); 251 } 252 } 253 } 254 255 if (allActiveSensorAvailable) 256 { 257 // All sensors were found, disable the discovery timer 258 if (discoverTimer->isEnabled()) 259 { 260 discoverTimer->setEnabled(false); 261 } 262 if (throttlePldmTraceTimer->isEnabled()) 263 { 264 // Disable throttle timer and make sure traces are not throttled 265 throttlePldmTraceTimer->setEnabled(false); 266 pldmHandle->setTraceThrottle(false); 267 } 268 if (waitingForAllOccActiveSensors) 269 { 270 lg2::info( 271 "checkAllActiveSensors(): OCC Active sensors are available"); 272 waitingForAllOccActiveSensors = false; 273 274 if (resetRequired) 275 { 276 initiateOccRequest(resetInstance); 277 278 if (!waitForAllOccsTimer->isEnabled()) 279 { 280 lg2::warning( 281 "occsNotAllRunning: Restarting waitForAllOccTimer"); 282 // restart occ wait timer to check status after reset 283 // completes 284 waitForAllOccsTimer->restartOnce(60s); 285 } 286 } 287 } 288 queuedActiveState.clear(); 289 tracedSensorWait = false; 290 } 291 else 292 { 293 // Not all sensors were available, so keep waiting 294 if (!tracedSensorWait) 295 { 296 lg2::info( 297 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available"); 298 tracedSensorWait = true; 299 } 300 discoverTimer->restartOnce(10s); 301 } 302 } 303 304 std::vector<int> Manager::findOCCsInDev() 305 { 306 std::vector<int> occs; 307 std::regex expr{R"(occ(\d+)$)"}; 308 309 for (auto& file : fs::directory_iterator("/dev")) 310 { 311 std::smatch match; 312 std::string path{file.path().string()}; 313 if (std::regex_search(path, match, expr)) 314 { 315 auto num = std::stoi(match[1].str()); 316 317 // /dev numbering starts at 1, ours starts at 0. 318 occs.push_back(num - 1); 319 } 320 } 321 322 return occs; 323 } 324 325 int Manager::cpuCreated(sdbusplus::message_t& msg) 326 { 327 namespace fs = std::filesystem; 328 329 sdbusplus::message::object_path o; 330 msg.read(o); 331 fs::path cpuPath(std::string(std::move(o))); 332 333 auto name = cpuPath.filename().string(); 334 auto index = name.find(CPU_NAME); 335 name.replace(index, std::strlen(CPU_NAME), OCC_NAME); 336 337 createObjects(name); 338 339 return 0; 340 } 341 342 void Manager::createObjects(const std::string& occ) 343 { 344 auto path = fs::path(OCC_CONTROL_ROOT) / occ; 345 346 statusObjects.emplace_back(std::make_unique<Status>( 347 event, path.c_str(), *this, pmode, 348 std::bind(std::mem_fn(&Manager::statusCallBack), this, 349 std::placeholders::_1, std::placeholders::_2), 350 // Callback will set flag indicating reset needs to be done 351 // instead of immediately issuing a reset via PLDM. 352 std::bind(std::mem_fn(&Manager::resetOccRequest), this, 353 std::placeholders::_1))); 354 355 // Create the power cap monitor object 356 if (!pcap) 357 { 358 pcap = std::make_unique<open_power::occ::powercap::PowerCap>( 359 *statusObjects.back()); 360 } 361 362 if (statusObjects.back()->isMasterOcc()) 363 { 364 lg2::info("Manager::createObjects(): OCC{INST} is the master", "INST", 365 statusObjects.back()->getOccInstanceID()); 366 _pollTimer->setEnabled(false); 367 368 // Set the master OCC on the PowerMode object 369 pmode->setMasterOcc(path); 370 } 371 372 passThroughObjects.emplace_back( 373 std::make_unique<PassThrough>(path.c_str(), pmode)); 374 } 375 376 // If a reset is not already outstanding, set a flag to indicate that a reset is 377 // needed. 378 void Manager::resetOccRequest(instanceID instance) 379 { 380 if (!resetRequired) 381 { 382 resetRequired = true; 383 resetInstance = instance; 384 lg2::error( 385 "resetOccRequest: PM Complex reset was requested due to OCC{INST}", 386 "INST", instance); 387 } 388 else if (instance != resetInstance) 389 { 390 lg2::warning( 391 "resetOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already outstanding for OCC{RINST}", 392 "INST", instance, "RINST", resetInstance); 393 } 394 } 395 396 // If a reset has not been started, initiate an OCC reset via PLDM 397 void Manager::initiateOccRequest(instanceID instance) 398 { 399 if (!resetInProgress) 400 { 401 resetInProgress = true; 402 resetInstance = instance; 403 lg2::error( 404 "initiateOccRequest: Initiating PM Complex reset due to OCC{INST}", 405 "INST", instance); 406 407 // Make sure ALL OCC comm stops to all OCCs before the reset 408 for (auto& obj : statusObjects) 409 { 410 if (obj->occActive()) 411 { 412 obj->occActive(false); 413 } 414 } 415 416 pldmHandle->resetOCC(instance); 417 resetRequired = false; 418 } 419 else 420 { 421 lg2::warning( 422 "initiateOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already in process for OCC{RINST}", 423 "INST", instance, "RINST", resetInstance); 424 } 425 } 426 427 void Manager::statusCallBack(instanceID instance, bool status) 428 { 429 if (status == true) 430 { 431 if (resetInProgress) 432 { 433 lg2::info( 434 "statusCallBack: Ignoring OCC{INST} activate because a reset has been initiated due to OCC{RINST}", 435 "INST", instance, "RINST", resetInstance); 436 return; 437 } 438 439 // OCC went active 440 ++activeCount; 441 442 if (activeCount == 1) 443 { 444 // First OCC went active (allow some time for all OCCs to go active) 445 waitForAllOccsTimer->restartOnce(60s); 446 } 447 448 if (activeCount == statusObjects.size()) 449 { 450 // All OCCs are now running 451 if (waitForAllOccsTimer->isEnabled()) 452 { 453 // stop occ wait timer 454 waitForAllOccsTimer->setEnabled(false); 455 } 456 457 // All OCCs have been found, check if we need a reset 458 if (resetRequired) 459 { 460 initiateOccRequest(resetInstance); 461 462 if (!waitForAllOccsTimer->isEnabled()) 463 { 464 lg2::warning( 465 "occsNotAllRunning: Restarting waitForAllOccTimer"); 466 // restart occ wait timer 467 waitForAllOccsTimer->restartOnce(60s); 468 } 469 } 470 else 471 { 472 // Verify master OCC and start presence monitor 473 validateOccMaster(); 474 } 475 } 476 477 // Start poll timer if not already started (since at least one OCC is 478 // running) 479 if (!_pollTimer->isEnabled()) 480 { 481 // An OCC just went active, PM Complex is just coming online so 482 // clear any outstanding reset requests 483 if (resetRequired) 484 { 485 resetRequired = false; 486 lg2::error( 487 "statusCallBack: clearing resetRequired (since OCC{INST} went active, resetInProgress={RIP})", 488 "INST", instance, "RIP", resetInProgress); 489 } 490 491 lg2::info("Manager: OCCs will be polled every {TIME} seconds", 492 "TIME", pollInterval); 493 494 // Send poll and start OCC poll timer 495 pollerTimerExpired(); 496 } 497 } 498 else 499 { 500 // OCC went away 501 if (activeCount > 0) 502 { 503 --activeCount; 504 } 505 else 506 { 507 lg2::info("OCC{INST} disabled, and no other OCCs are active", 508 "INST", instance); 509 } 510 511 if (activeCount == 0) 512 { 513 // No OCCs are running 514 515 if (resetInProgress) 516 { 517 // All OCC active sensors are clear (reset should be in 518 // progress) 519 lg2::info( 520 "statusCallBack: Clearing resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})", 521 "COUNT", activeCount, "INST", instance, "STATUS", status); 522 resetInProgress = false; 523 resetInstance = 255; 524 } 525 526 // Stop OCC poll timer 527 if (_pollTimer->isEnabled()) 528 { 529 lg2::info( 530 "Manager::statusCallBack(): OCCs are not running, stopping poll timer"); 531 _pollTimer->setEnabled(false); 532 } 533 534 // stop wait timer 535 if (waitForAllOccsTimer->isEnabled()) 536 { 537 waitForAllOccsTimer->setEnabled(false); 538 } 539 } 540 else if (resetInProgress) 541 { 542 lg2::info( 543 "statusCallBack: Skipping clear of resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})", 544 "COUNT", activeCount, "INST", instance, "STATUS", status); 545 } 546 // Clear OCC sensors 547 setSensorValueToNaN(instance); 548 } 549 550 if (waitingForAllOccActiveSensors) 551 { 552 if (utils::isHostRunning()) 553 { 554 checkAllActiveSensors(); 555 } 556 } 557 } 558 559 void Manager::sbeTimeout(unsigned int instance) 560 { 561 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 562 [instance](const auto& obj) { 563 return instance == obj->getOccInstanceID(); 564 }); 565 566 if (obj != statusObjects.end() && (*obj)->occActive()) 567 { 568 lg2::info("SBE timeout, requesting HRESET (OCC{INST})", "INST", 569 instance); 570 571 #ifdef PHAL_SUPPORT 572 setSBEState(instance, SBE_STATE_NOT_USABLE); 573 #endif 574 575 // Stop communication with this OCC 576 (*obj)->occActive(false); 577 578 pldmHandle->sendHRESET(instance); 579 } 580 } 581 582 bool Manager::updateOCCActive(instanceID instance, bool status) 583 { 584 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 585 [instance](const auto& obj) { 586 return instance == obj->getOccInstanceID(); 587 }); 588 589 const bool hostRunning = open_power::occ::utils::isHostRunning(); 590 if (obj != statusObjects.end()) 591 { 592 if (!hostRunning && (status == true)) 593 { 594 lg2::warning( 595 "updateOCCActive: Host is not running yet (OCC{INST} active={STAT}), clearing sensor received", 596 "INST", instance, "STAT", status); 597 (*obj)->setPldmSensorReceived(false); 598 if (!waitingForAllOccActiveSensors) 599 { 600 lg2::info( 601 "updateOCCActive: Waiting for Host and all OCC Active Sensors"); 602 waitingForAllOccActiveSensors = true; 603 } 604 discoverTimer->restartOnce(30s); 605 return false; 606 } 607 else 608 { 609 (*obj)->setPldmSensorReceived(true); 610 return (*obj)->occActive(status); 611 } 612 } 613 else 614 { 615 if (hostRunning) 616 { 617 lg2::warning( 618 "updateOCCActive: No status object to update for OCC{INST} (active={STAT})", 619 "INST", instance, "STAT", status); 620 } 621 else 622 { 623 if (status == true) 624 { 625 lg2::warning( 626 "updateOCCActive: No status objects and Host is not running yet (OCC{INST} active={STAT})", 627 "INST", instance, "STAT", status); 628 } 629 } 630 if (status == true) 631 { 632 // OCC went active 633 queuedActiveState.insert(instance); 634 } 635 else 636 { 637 auto match = queuedActiveState.find(instance); 638 if (match != queuedActiveState.end()) 639 { 640 // OCC was disabled 641 queuedActiveState.erase(match); 642 } 643 } 644 return false; 645 } 646 } 647 648 // Called upon pldm event To set powermode Safe Mode State for system. 649 void Manager::updateOccSafeMode(bool safeMode) 650 { 651 pmode->updateDbusSafeMode(safeMode); 652 // Update the processor throttle status on dbus 653 for (auto& obj : statusObjects) 654 { 655 obj->updateThrottle(safeMode, THROTTLED_SAFE); 656 } 657 } 658 659 void Manager::sbeHRESETResult(instanceID instance, bool success) 660 { 661 if (success) 662 { 663 lg2::info("HRESET succeeded (OCC{INST})", "INST", instance); 664 665 #ifdef PHAL_SUPPORT 666 setSBEState(instance, SBE_STATE_BOOTED); 667 #endif 668 669 // Re-enable communication with this OCC 670 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 671 [instance](const auto& obj) { 672 return instance == obj->getOccInstanceID(); 673 }); 674 if (obj != statusObjects.end() && (!(*obj)->occActive())) 675 { 676 (*obj)->occActive(true); 677 } 678 679 return; 680 } 681 682 #ifdef PHAL_SUPPORT 683 setSBEState(instance, SBE_STATE_FAILED); 684 685 if (sbeCanDump(instance)) 686 { 687 lg2::info("HRESET failed (OCC{INST}), triggering SBE dump", "INST", 688 instance); 689 690 auto& bus = utils::getBus(); 691 uint32_t src6 = instance << 16; 692 uint32_t logId = 693 FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout", 694 src6, "SBE command timeout"); 695 696 try 697 { 698 constexpr auto interface = "xyz.openbmc_project.Dump.Create"; 699 constexpr auto function = "CreateDump"; 700 701 std::string service = 702 utils::getService(OP_DUMP_OBJ_PATH, interface); 703 auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH, 704 interface, function); 705 706 std::map<std::string, std::variant<std::string, uint64_t>> 707 createParams{ 708 {"com.ibm.Dump.Create.CreateParameters.ErrorLogId", 709 uint64_t(logId)}, 710 {"com.ibm.Dump.Create.CreateParameters.DumpType", 711 "com.ibm.Dump.Create.DumpType.SBE"}, 712 {"com.ibm.Dump.Create.CreateParameters.FailingUnitId", 713 uint64_t(instance)}, 714 }; 715 716 method.append(createParams); 717 718 auto response = bus.call(method); 719 } 720 catch (const sdbusplus::exception_t& e) 721 { 722 constexpr auto ERROR_DUMP_DISABLED = 723 "xyz.openbmc_project.Dump.Create.Error.Disabled"; 724 if (e.name() == ERROR_DUMP_DISABLED) 725 { 726 lg2::info("Dump is disabled, skipping"); 727 } 728 else 729 { 730 lg2::error("Dump failed"); 731 } 732 } 733 } 734 #endif 735 736 // SBE Reset failed, try PM Complex reset 737 lg2::error("sbeHRESETResult: Forcing PM Complex reset"); 738 resetOccRequest(instance); 739 } 740 741 #ifdef PHAL_SUPPORT 742 bool Manager::sbeCanDump(unsigned int instance) 743 { 744 struct pdbg_target* proc = getPdbgTarget(instance); 745 746 if (!proc) 747 { 748 // allow the dump in the error case 749 return true; 750 } 751 752 try 753 { 754 if (!openpower::phal::sbe::isDumpAllowed(proc)) 755 { 756 return false; 757 } 758 759 if (openpower::phal::pdbg::isSbeVitalAttnActive(proc)) 760 { 761 return false; 762 } 763 } 764 catch (openpower::phal::exception::SbeError& e) 765 { 766 lg2::info("Failed to query SBE state"); 767 } 768 769 // allow the dump in the error case 770 return true; 771 } 772 773 void Manager::setSBEState(unsigned int instance, enum sbe_state state) 774 { 775 struct pdbg_target* proc = getPdbgTarget(instance); 776 777 if (!proc) 778 { 779 return; 780 } 781 782 try 783 { 784 openpower::phal::sbe::setState(proc, state); 785 } 786 catch (const openpower::phal::exception::SbeError& e) 787 { 788 lg2::error("Failed to set SBE state: {ERROR}", "ERROR", e.what()); 789 } 790 } 791 792 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance) 793 { 794 if (!pdbgInitialized) 795 { 796 try 797 { 798 openpower::phal::pdbg::init(); 799 pdbgInitialized = true; 800 } 801 catch (const openpower::phal::exception::PdbgError& e) 802 { 803 lg2::error("pdbg initialization failed"); 804 return nullptr; 805 } 806 } 807 808 struct pdbg_target* proc = nullptr; 809 pdbg_for_each_class_target("proc", proc) 810 { 811 if (pdbg_target_index(proc) == instance) 812 { 813 return proc; 814 } 815 } 816 817 lg2::error("Failed to get pdbg target"); 818 return nullptr; 819 } 820 #endif 821 822 void Manager::pollerTimerExpired() 823 { 824 if (!_pollTimer) 825 { 826 lg2::error("pollerTimerExpired() ERROR: Timer not defined"); 827 return; 828 } 829 830 if (resetRequired) 831 { 832 lg2::error("pollerTimerExpired() - Initiating PM Complex reset"); 833 initiateOccRequest(resetInstance); 834 835 if (!waitForAllOccsTimer->isEnabled()) 836 { 837 lg2::warning("pollerTimerExpired: Restarting waitForAllOccTimer"); 838 // restart occ wait timer 839 waitForAllOccsTimer->restartOnce(60s); 840 } 841 return; 842 } 843 844 for (auto& obj : statusObjects) 845 { 846 if (!obj->occActive()) 847 { 848 // OCC is not running yet 849 auto id = obj->getOccInstanceID(); 850 setSensorValueToNaN(id); 851 continue; 852 } 853 854 // Read sysfs to force kernel to poll OCC 855 obj->readOccState(); 856 857 // Read occ sensor values 858 getSensorValues(obj); 859 } 860 861 if (activeCount > 0) 862 { 863 // Restart OCC poll timer 864 _pollTimer->restartOnce(std::chrono::seconds(pollInterval)); 865 } 866 else 867 { 868 // No OCCs running, so poll timer will not be restarted 869 lg2::info( 870 "Manager::pollerTimerExpired: poll timer will not be restarted"); 871 } 872 } 873 874 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance) 875 { 876 // There may be more than one sensor with the same FRU type 877 // and label so make two passes: the first to read the temps 878 // from sysfs, and the second to put them on D-Bus after 879 // resolving any conflicts. 880 std::map<std::string, double> sensorData; 881 882 std::regex expr{"temp\\d+_label$"}; // Example: temp5_label 883 for (auto& file : fs::directory_iterator(path)) 884 { 885 if (!std::regex_search(file.path().string(), expr)) 886 { 887 continue; 888 } 889 890 uint32_t labelValue{0}; 891 892 try 893 { 894 labelValue = readFile<uint32_t>(file.path()); 895 } 896 catch (const std::system_error& e) 897 { 898 lg2::debug( 899 "readTempSensors: Failed reading {PATH}, errno = {ERROR}", 900 "PATH", file.path().string(), "ERROR", e.code().value()); 901 continue; 902 } 903 904 const std::string& tempLabel = "label"; 905 const std::string filePathString = file.path().string().substr( 906 0, file.path().string().length() - tempLabel.length()); 907 908 uint32_t fruTypeValue{0}; 909 try 910 { 911 fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix); 912 } 913 catch (const std::system_error& e) 914 { 915 lg2::debug( 916 "readTempSensors: Failed reading {PATH}, errno = {ERROR}", 917 "PATH", filePathString + fruTypeSuffix, "ERROR", 918 e.code().value()); 919 continue; 920 } 921 922 std::string sensorPath = 923 OCC_SENSORS_ROOT + std::string("/temperature/"); 924 925 std::string dvfsTempPath; 926 927 if (fruTypeValue == VRMVdd) 928 { 929 sensorPath.append( 930 "vrm_vdd" + std::to_string(occInstance) + "_temp"); 931 } 932 else if (fruTypeValue == processorIoRing) 933 { 934 sensorPath.append( 935 "proc" + std::to_string(occInstance) + "_ioring_temp"); 936 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" + 937 std::to_string(occInstance) + "_ioring_dvfs_temp"; 938 } 939 else 940 { 941 uint16_t type = (labelValue & 0xFF000000) >> 24; 942 uint16_t instanceID = labelValue & 0x0000FFFF; 943 944 if (type == OCC_DIMM_TEMP_SENSOR_TYPE) 945 { 946 if (fruTypeValue == fruTypeNotAvailable) 947 { 948 // Not all DIMM related temps are available to read 949 // (no _input file in this case) 950 continue; 951 } 952 auto iter = dimmTempSensorName.find(fruTypeValue); 953 if (iter == dimmTempSensorName.end()) 954 { 955 lg2::error( 956 "readTempSensors: Fru type error! fruTypeValue = {FRU}) ", 957 "FRU", fruTypeValue); 958 continue; 959 } 960 961 sensorPath.append( 962 "dimm" + std::to_string(instanceID) + iter->second); 963 964 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" + 965 dimmDVFSSensorName.at(fruTypeValue); 966 } 967 else if (type == OCC_CPU_TEMP_SENSOR_TYPE) 968 { 969 if (fruTypeValue == processorCore) 970 { 971 // The OCC reports small core temps, of which there are 972 // two per big core. All current P10 systems are in big 973 // core mode, so use a big core name. 974 uint16_t coreNum = instanceID / 2; 975 uint16_t tempNum = instanceID % 2; 976 sensorPath.append("proc" + std::to_string(occInstance) + 977 "_core" + std::to_string(coreNum) + "_" + 978 std::to_string(tempNum) + "_temp"); 979 980 dvfsTempPath = 981 std::string{OCC_SENSORS_ROOT} + "/temperature/proc" + 982 std::to_string(occInstance) + "_core_dvfs_temp"; 983 } 984 else 985 { 986 continue; 987 } 988 } 989 else 990 { 991 continue; 992 } 993 } 994 995 // The dvfs temp file only needs to be read once per chip per type. 996 if (!dvfsTempPath.empty() && 997 !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath)) 998 { 999 try 1000 { 1001 auto dvfsValue = readFile<double>(filePathString + maxSuffix); 1002 1003 dbus::OccDBusSensors::getOccDBus().setDvfsTemp( 1004 dvfsTempPath, dvfsValue * std::pow(10, -3)); 1005 } 1006 catch (const std::system_error& e) 1007 { 1008 lg2::debug( 1009 "readTempSensors: Failed reading {PATH}, errno = {ERROR}", 1010 "PATH", filePathString + maxSuffix, "ERROR", 1011 e.code().value()); 1012 } 1013 } 1014 1015 uint32_t faultValue{0}; 1016 try 1017 { 1018 faultValue = readFile<uint32_t>(filePathString + faultSuffix); 1019 } 1020 catch (const std::system_error& e) 1021 { 1022 lg2::debug( 1023 "readTempSensors: Failed reading {PATH}, errno = {ERROR}", 1024 "PATH", filePathString + faultSuffix, "ERROR", 1025 e.code().value()); 1026 continue; 1027 } 1028 1029 double tempValue{0}; 1030 // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1. 1031 if (faultValue != 0) 1032 { 1033 tempValue = std::numeric_limits<double>::quiet_NaN(); 1034 } 1035 else 1036 { 1037 // Read the temperature 1038 try 1039 { 1040 tempValue = readFile<double>(filePathString + inputSuffix); 1041 } 1042 catch (const std::system_error& e) 1043 { 1044 lg2::debug( 1045 "readTempSensors: Failed reading {PATH}, errno = {ERROR}", 1046 "PATH", filePathString + inputSuffix, "ERROR", 1047 e.code().value()); 1048 1049 // if errno == EAGAIN(Resource temporarily unavailable) then set 1050 // temp to 0, to avoid using old temp, and affecting FAN 1051 // Control. 1052 if (e.code().value() == EAGAIN) 1053 { 1054 tempValue = 0; 1055 } 1056 // else the errno would be something like 1057 // EBADF(Bad file descriptor) 1058 // or ENOENT(No such file or directory) 1059 else 1060 { 1061 continue; 1062 } 1063 } 1064 } 1065 1066 // If this object path already has a value, only overwite 1067 // it if the previous one was an NaN or a smaller value. 1068 auto existing = sensorData.find(sensorPath); 1069 if (existing != sensorData.end()) 1070 { 1071 // Multiple sensors found for this FRU type 1072 if ((std::isnan(existing->second) && (tempValue == 0)) || 1073 ((existing->second == 0) && std::isnan(tempValue))) 1074 { 1075 // One of the redundant sensors has failed (0xFF/nan), and the 1076 // other sensor has no reading (0), so set the FRU to NaN to 1077 // force fan increase 1078 tempValue = std::numeric_limits<double>::quiet_NaN(); 1079 existing->second = tempValue; 1080 } 1081 if (std::isnan(existing->second) || (tempValue > existing->second)) 1082 { 1083 existing->second = tempValue; 1084 } 1085 } 1086 else 1087 { 1088 // First sensor for this FRU type 1089 sensorData[sensorPath] = tempValue; 1090 } 1091 } 1092 1093 // Now publish the values on D-Bus. 1094 for (const auto& [objectPath, value] : sensorData) 1095 { 1096 dbus::OccDBusSensors::getOccDBus().setValue(objectPath, 1097 value * std::pow(10, -3)); 1098 1099 dbus::OccDBusSensors::getOccDBus().setOperationalStatus( 1100 objectPath, !std::isnan(value)); 1101 1102 if (existingSensors.find(objectPath) == existingSensors.end()) 1103 { 1104 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 1105 objectPath, {"all_sensors"}); 1106 } 1107 existingSensors[objectPath] = occInstance; 1108 } 1109 } 1110 1111 std::optional<std::string> Manager::getPowerLabelFunctionID( 1112 const std::string& value) 1113 { 1114 // If the value is "system", then the FunctionID is "system". 1115 if (value == "system") 1116 { 1117 return value; 1118 } 1119 1120 // If the value is not "system", then the label value have 3 numbers, of 1121 // which we only care about the middle one: 1122 // <sensor id>_<function id>_<apss channel> 1123 // eg: The value is "0_10_5" , then the FunctionID is "10". 1124 if (value.find("_") == std::string::npos) 1125 { 1126 return std::nullopt; 1127 } 1128 1129 auto powerLabelValue = value.substr((value.find("_") + 1)); 1130 1131 if (powerLabelValue.find("_") == std::string::npos) 1132 { 1133 return std::nullopt; 1134 } 1135 1136 return powerLabelValue.substr(0, powerLabelValue.find("_")); 1137 } 1138 1139 void Manager::readPowerSensors(const fs::path& path, uint32_t id) 1140 { 1141 std::regex expr{"power\\d+_label$"}; // Example: power5_label 1142 for (auto& file : fs::directory_iterator(path)) 1143 { 1144 if (!std::regex_search(file.path().string(), expr)) 1145 { 1146 continue; 1147 } 1148 1149 std::string labelValue; 1150 try 1151 { 1152 labelValue = readFile<std::string>(file.path()); 1153 } 1154 catch (const std::system_error& e) 1155 { 1156 lg2::debug( 1157 "readPowerSensors: Failed reading {PATH}, errno = {ERROR}", 1158 "PATH", file.path().string(), "ERROR", e.code().value()); 1159 continue; 1160 } 1161 1162 auto functionID = getPowerLabelFunctionID(labelValue); 1163 if (functionID == std::nullopt) 1164 { 1165 continue; 1166 } 1167 1168 const std::string& tempLabel = "label"; 1169 const std::string filePathString = file.path().string().substr( 1170 0, file.path().string().length() - tempLabel.length()); 1171 1172 std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/"); 1173 1174 auto iter = powerSensorName.find(*functionID); 1175 if (iter == powerSensorName.end()) 1176 { 1177 continue; 1178 } 1179 sensorPath.append(iter->second); 1180 1181 double tempValue{0}; 1182 1183 try 1184 { 1185 tempValue = readFile<double>(filePathString + inputSuffix); 1186 } 1187 catch (const std::system_error& e) 1188 { 1189 lg2::debug( 1190 "readPowerSensors: Failed reading {PATH}, errno = {ERROR}", 1191 "PATH", filePathString + inputSuffix, "ERROR", 1192 e.code().value()); 1193 continue; 1194 } 1195 1196 dbus::OccDBusSensors::getOccDBus().setUnit( 1197 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts"); 1198 1199 dbus::OccDBusSensors::getOccDBus().setValue( 1200 sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3)); 1201 1202 dbus::OccDBusSensors::getOccDBus().setOperationalStatus( 1203 sensorPath, true); 1204 1205 if (existingSensors.find(sensorPath) == existingSensors.end()) 1206 { 1207 std::vector<std::string> fTypeList = {"all_sensors"}; 1208 if (iter->second == "total_power") 1209 { 1210 // Set sensor purpose as TotalPower 1211 dbus::OccDBusSensors::getOccDBus().setPurpose( 1212 sensorPath, 1213 "xyz.openbmc_project.Sensor.Purpose.SensorPurpose.TotalPower"); 1214 } 1215 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 1216 sensorPath, fTypeList); 1217 } 1218 existingSensors[sensorPath] = id; 1219 } 1220 return; 1221 } 1222 1223 void Manager::readExtnSensors(const fs::path& path, uint32_t id) 1224 { 1225 std::regex expr{"extn\\d+_label$"}; // Example: extn5_label 1226 for (auto& file : fs::directory_iterator(path)) 1227 { 1228 if (!std::regex_search(file.path().string(), expr)) 1229 { 1230 continue; 1231 } 1232 1233 // Read in Label value of the sensor from file. 1234 std::string labelValue; 1235 try 1236 { 1237 labelValue = readFile<std::string>(file.path()); 1238 } 1239 catch (const std::system_error& e) 1240 { 1241 lg2::debug( 1242 "readExtnSensors:label Failed reading {PATH}, errno = {ERROR}", 1243 "PATH", file.path().string(), "ERROR", e.code().value()); 1244 continue; 1245 } 1246 const std::string& tempLabel = "label"; 1247 const std::string filePathString = file.path().string().substr( 1248 0, file.path().string().length() - tempLabel.length()); 1249 1250 std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/"); 1251 1252 // Labels of EXTN sections from OCC interface Document 1253 // have different formats. 1254 // 0x464d494e : FMIN 0x46444953 : FDIS 1255 // 0x46424153 : FBAS 0x46555400 : FUT 1256 // 0x464d4158 : FMAX 0x434c4950 : CLIP 1257 // 0x4d4f4445 : MODE 0x574f4643 : WOFC 1258 // 0x574f4649 : WOFI 0x5057524d : PWRM 1259 // 0x50575250 : PWRP 0x45525248 : ERRH 1260 // Label indicating byte 5 and 6 is the current (mem,proc) power in 1261 // Watts. 1262 if ((labelValue == EXTN_LABEL_PWRM_MEMORY_POWER) || 1263 (labelValue == EXTN_LABEL_PWRP_PROCESSOR_POWER)) 1264 { 1265 // Build the dbus String for this chiplet power asset. 1266 if (labelValue == EXTN_LABEL_PWRP_PROCESSOR_POWER) 1267 { 1268 labelValue = "_power"; 1269 } 1270 else // else EXTN_LABEL_PWRM_MEMORY_POWER 1271 { 1272 labelValue = "_mem_power"; 1273 } 1274 sensorPath.append("chiplet" + std::to_string(id) + labelValue); 1275 1276 // Read in data value of the sensor from file. 1277 // Read in as string due to different format of data in sensors. 1278 std::string extnValue; 1279 try 1280 { 1281 extnValue = readFile<std::string>(filePathString + inputSuffix); 1282 } 1283 catch (const std::system_error& e) 1284 { 1285 lg2::debug( 1286 "readExtnSensors:value Failed reading {PATH}, errno = {ERROR}", 1287 "PATH", filePathString + inputSuffix, "ERROR", 1288 e.code().value()); 1289 continue; 1290 } 1291 1292 // For Power field, Convert last 4 bytes of hex string into number 1293 // value. 1294 std::stringstream ssData; 1295 ssData << std::hex << extnValue.substr(extnValue.length() - 4); 1296 uint16_t MyHexNumber; 1297 ssData >> MyHexNumber; 1298 1299 // Convert output/DC power to input/AC power in Watts (round up) 1300 MyHexNumber = 1301 std::round(((MyHexNumber / (PS_DERATING_FACTOR / 100.0)))); 1302 1303 dbus::OccDBusSensors::getOccDBus().setUnit( 1304 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts"); 1305 1306 dbus::OccDBusSensors::getOccDBus().setValue(sensorPath, 1307 MyHexNumber); 1308 1309 dbus::OccDBusSensors::getOccDBus().setOperationalStatus( 1310 sensorPath, true); 1311 1312 if (existingSensors.find(sensorPath) == existingSensors.end()) 1313 { 1314 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 1315 sensorPath, {"all_sensors"}); 1316 } 1317 1318 existingSensors[sensorPath] = id; 1319 } // End Extended Power Sensors. 1320 } // End For loop on files for Extended Sensors. 1321 return; 1322 } 1323 1324 void Manager::setSensorValueToNaN(uint32_t id) const 1325 { 1326 for (const auto& [sensorPath, occId] : existingSensors) 1327 { 1328 if (occId == id) 1329 { 1330 dbus::OccDBusSensors::getOccDBus().setValue( 1331 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1332 1333 dbus::OccDBusSensors::getOccDBus().setOperationalStatus( 1334 sensorPath, true); 1335 } 1336 } 1337 return; 1338 } 1339 1340 void Manager::setSensorValueToNonFunctional(uint32_t id) const 1341 { 1342 for (const auto& [sensorPath, occId] : existingSensors) 1343 { 1344 if (occId == id) 1345 { 1346 dbus::OccDBusSensors::getOccDBus().setValue( 1347 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1348 1349 dbus::OccDBusSensors::getOccDBus().setOperationalStatus( 1350 sensorPath, false); 1351 } 1352 } 1353 return; 1354 } 1355 1356 void Manager::getSensorValues(std::unique_ptr<Status>& occ) 1357 { 1358 static bool tracedError[8] = {0}; 1359 const fs::path sensorPath = occ->getHwmonPath(); 1360 const uint32_t id = occ->getOccInstanceID(); 1361 1362 if (fs::exists(sensorPath)) 1363 { 1364 // Read temperature sensors 1365 readTempSensors(sensorPath, id); 1366 // Read Extended sensors 1367 readExtnSensors(sensorPath, id); 1368 1369 if (occ->isMasterOcc()) 1370 { 1371 // Read power sensors 1372 readPowerSensors(sensorPath, id); 1373 } 1374 tracedError[id] = false; 1375 } 1376 else 1377 { 1378 if (!tracedError[id]) 1379 { 1380 lg2::error( 1381 "Manager::getSensorValues: OCC{INST} sensor path missing: {PATH}", 1382 "INST", id, "PATH", sensorPath); 1383 tracedError[id] = true; 1384 } 1385 } 1386 1387 return; 1388 } 1389 1390 // Read the altitude from DBus 1391 void Manager::readAltitude() 1392 { 1393 static bool traceAltitudeErr = true; 1394 1395 utils::PropertyValue altitudeProperty{}; 1396 try 1397 { 1398 altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE, 1399 ALTITUDE_PROP); 1400 auto sensorVal = std::get<double>(altitudeProperty); 1401 if (sensorVal < 0xFFFF) 1402 { 1403 if (sensorVal < 0) 1404 { 1405 altitude = 0; 1406 } 1407 else 1408 { 1409 // Round to nearest meter 1410 altitude = uint16_t(sensorVal + 0.5); 1411 } 1412 lg2::debug("readAltitude: sensor={VALUE} ({ALT}m)", "VALUE", 1413 sensorVal, "ALT", altitude); 1414 traceAltitudeErr = true; 1415 } 1416 else 1417 { 1418 if (traceAltitudeErr) 1419 { 1420 traceAltitudeErr = false; 1421 lg2::debug("Invalid altitude value: {ALT}", "ALT", sensorVal); 1422 } 1423 } 1424 } 1425 catch (const sdbusplus::exception_t& e) 1426 { 1427 if (traceAltitudeErr) 1428 { 1429 traceAltitudeErr = false; 1430 lg2::info("Unable to read Altitude: {ERROR}", "ERROR", e.what()); 1431 } 1432 altitude = 0xFFFF; // not available 1433 } 1434 } 1435 1436 // Callback function when ambient temperature changes 1437 void Manager::ambientCallback(sdbusplus::message_t& msg) 1438 { 1439 double currentTemp = 0; 1440 uint8_t truncatedTemp = 0xFF; 1441 std::string msgSensor; 1442 std::map<std::string, std::variant<double>> msgData; 1443 msg.read(msgSensor, msgData); 1444 1445 auto valPropMap = msgData.find(AMBIENT_PROP); 1446 if (valPropMap == msgData.end()) 1447 { 1448 lg2::debug("ambientCallback: Unknown ambient property changed"); 1449 return; 1450 } 1451 currentTemp = std::get<double>(valPropMap->second); 1452 if (std::isnan(currentTemp)) 1453 { 1454 truncatedTemp = 0xFF; 1455 } 1456 else 1457 { 1458 if (currentTemp < 0) 1459 { 1460 truncatedTemp = 0; 1461 } 1462 else 1463 { 1464 // Round to nearest degree C 1465 truncatedTemp = uint8_t(currentTemp + 0.5); 1466 } 1467 } 1468 1469 // If ambient changes, notify OCCs 1470 if (truncatedTemp != ambient) 1471 { 1472 lg2::debug("ambientCallback: Ambient change from {OLD} to {NEW}C", 1473 "OLD", ambient, "NEW", currentTemp); 1474 1475 ambient = truncatedTemp; 1476 if (altitude == 0xFFFF) 1477 { 1478 // No altitude yet, try reading again 1479 readAltitude(); 1480 } 1481 1482 lg2::debug("ambientCallback: Ambient: {TEMP}C, altitude: {ALT}m", 1483 "TEMP", ambient, "ALT", altitude); 1484 1485 // Send ambient and altitude to all OCCs 1486 for (auto& obj : statusObjects) 1487 { 1488 if (obj->occActive()) 1489 { 1490 obj->sendAmbient(ambient, altitude); 1491 } 1492 } 1493 } 1494 } 1495 1496 // return the current ambient and altitude readings 1497 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp, 1498 uint16_t& altitudeValue) const 1499 { 1500 ambientValid = true; 1501 ambientTemp = ambient; 1502 altitudeValue = altitude; 1503 1504 if (ambient == 0xFF) 1505 { 1506 ambientValid = false; 1507 } 1508 } 1509 1510 // Called when waitForAllOccsTimer expires 1511 // After the first OCC goes active, this timer will be started (60 seconds) 1512 void Manager::occsNotAllRunning() 1513 { 1514 if (resetInProgress) 1515 { 1516 lg2::warning( 1517 "occsNotAllRunning: Ignoring waitForAllOccsTimer because reset is in progress"); 1518 return; 1519 } 1520 if (activeCount != statusObjects.size()) 1521 { 1522 // Not all OCCs went active 1523 lg2::warning( 1524 "occsNotAllRunning: Active OCC count ({COUNT}) does not match expected count ({EXP})", 1525 "COUNT", activeCount, "EXP", statusObjects.size()); 1526 // Procs may be garded, so may be expected 1527 } 1528 1529 if (resetRequired) 1530 { 1531 initiateOccRequest(resetInstance); 1532 1533 if (!waitForAllOccsTimer->isEnabled()) 1534 { 1535 lg2::warning("occsNotAllRunning: Restarting waitForAllOccTimer"); 1536 // restart occ wait timer 1537 waitForAllOccsTimer->restartOnce(60s); 1538 } 1539 } 1540 else 1541 { 1542 validateOccMaster(); 1543 } 1544 } 1545 1546 // Called when throttlePldmTraceTimer expires. 1547 // If this timer expires, that indicates there are no OCC active sensor PDRs 1548 // found which will trigger pldm traces to be throttled. 1549 // The second time this timer expires, a PEL will get created. 1550 void Manager::throttlePldmTraceExpired() 1551 { 1552 if (utils::isHostRunning()) 1553 { 1554 if (!onPldmTimeoutCreatePel) 1555 { 1556 // Throttle traces 1557 pldmHandle->setTraceThrottle(true); 1558 // Restart timer to log a PEL when timer expires 1559 onPldmTimeoutCreatePel = true; 1560 throttlePldmTraceTimer->restartOnce(40min); 1561 } 1562 else 1563 { 1564 lg2::error( 1565 "throttlePldmTraceExpired(): OCC active sensors still not available!"); 1566 // Create PEL 1567 createPldmSensorPEL(); 1568 } 1569 } 1570 else 1571 { 1572 // Make sure traces are not throttled 1573 pldmHandle->setTraceThrottle(false); 1574 lg2::info( 1575 "throttlePldmTraceExpired(): host it not running ignoring sensor timer"); 1576 } 1577 } 1578 1579 void Manager::createPldmSensorPEL() 1580 { 1581 Error::Descriptor d = Error::Descriptor(MISSING_OCC_SENSORS_PATH); 1582 std::map<std::string, std::string> additionalData; 1583 1584 additionalData.emplace("_PID", std::to_string(getpid())); 1585 1586 lg2::info( 1587 "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs"); 1588 1589 auto& bus = utils::getBus(); 1590 1591 try 1592 { 1593 FFDCFiles ffdc; 1594 // Add occ-control journal traces to PEL FFDC 1595 auto occJournalFile = 1596 FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40); 1597 1598 static constexpr auto loggingObjectPath = 1599 "/xyz/openbmc_project/logging"; 1600 static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL"; 1601 std::string service = 1602 utils::getService(loggingObjectPath, opLoggingInterface); 1603 auto method = 1604 bus.new_method_call(service.c_str(), loggingObjectPath, 1605 opLoggingInterface, "CreatePELWithFFDCFiles"); 1606 1607 // Set level to Warning (Predictive). 1608 auto level = 1609 sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage( 1610 sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level:: 1611 Warning); 1612 1613 method.append(d.path, level, additionalData, ffdc); 1614 bus.call(method); 1615 } 1616 catch (const sdbusplus::exception_t& e) 1617 { 1618 lg2::error("Failed to create MISSING_OCC_SENSORS PEL: {ERROR}", "ERROR", 1619 e.what()); 1620 } 1621 } 1622 1623 // Verify single master OCC and start presence monitor 1624 void Manager::validateOccMaster() 1625 { 1626 int masterInstance = -1; 1627 for (auto& obj : statusObjects) 1628 { 1629 auto instance = obj->getOccInstanceID(); 1630 1631 if (!obj->occActive()) 1632 { 1633 if (utils::isHostRunning()) 1634 { 1635 // Check if sensor was queued while waiting for discovery 1636 auto match = queuedActiveState.find(instance); 1637 if (match != queuedActiveState.end()) 1638 { 1639 queuedActiveState.erase(match); 1640 lg2::info("validateOccMaster: OCC{INST} is ACTIVE (queued)", 1641 "INST", instance); 1642 obj->occActive(true); 1643 } 1644 else 1645 { 1646 // OCC does not appear to be active yet, check active sensor 1647 pldmHandle->checkActiveSensor(instance); 1648 if (obj->occActive()) 1649 { 1650 lg2::info( 1651 "validateOccMaster: OCC{INST} is ACTIVE after reading sensor", 1652 "INST", instance); 1653 } 1654 } 1655 } 1656 else 1657 { 1658 lg2::warning( 1659 "validateOccMaster: HOST is not running (OCC{INST})", 1660 "INST", instance); 1661 return; 1662 } 1663 } 1664 1665 if (obj->isMasterOcc()) 1666 { 1667 obj->addPresenceWatchMaster(); 1668 1669 if (masterInstance == -1) 1670 { 1671 masterInstance = instance; 1672 } 1673 else 1674 { 1675 lg2::error( 1676 "validateOccMaster: Multiple OCC masters! ({MAST1} and {MAST2})", 1677 "MAST1", masterInstance, "MAST2", instance); 1678 // request reset 1679 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH)); 1680 } 1681 } 1682 } 1683 1684 if (masterInstance < 0) 1685 { 1686 lg2::error("validateOccMaster: Master OCC not found! (of {NUM} OCCs)", 1687 "NUM", statusObjects.size()); 1688 // request reset 1689 statusObjects.front()->deviceError( 1690 Error::Descriptor(PRESENCE_ERROR_PATH)); 1691 } 1692 else 1693 { 1694 lg2::info("validateOccMaster: OCC{INST} is master of {COUNT} OCCs", 1695 "INST", masterInstance, "COUNT", activeCount); 1696 1697 pmode->updateDbusSafeMode(false); 1698 } 1699 } 1700 1701 void Manager::updatePcapBounds() const 1702 { 1703 if (pcap) 1704 { 1705 pcap->updatePcapBounds(); 1706 } 1707 } 1708 1709 // Clean up any variables since the OCC is no longer running. 1710 // Called when pldm receives an event indicating host is powered off. 1711 void Manager::hostPoweredOff() 1712 { 1713 if (resetRequired) 1714 { 1715 lg2::info("hostPoweredOff: Clearing resetRequired for OCC{INST}", 1716 "INST", resetInstance); 1717 resetRequired = false; 1718 } 1719 if (resetInProgress) 1720 { 1721 lg2::info("hostPoweredOff: Clearing resetInProgress for OCC{INST}", 1722 "INST", resetInstance); 1723 resetInProgress = false; 1724 } 1725 resetInstance = 255; 1726 } 1727 1728 void Manager::collectDumpData(sdeventplus::source::Signal&, 1729 const struct signalfd_siginfo*) 1730 { 1731 json data; 1732 lg2::info("collectDumpData()"); 1733 data["objectCount"] = std::to_string(statusObjects.size()) + " OCC objects"; 1734 if (statusObjects.size() > 0) 1735 { 1736 try 1737 { 1738 for (auto& occ : statusObjects) 1739 { 1740 json occData; 1741 auto instance = occ->getOccInstanceID(); 1742 std::string occName = "occ" + std::to_string(instance); 1743 1744 if (occ->occActive()) 1745 { 1746 // OCC General Info 1747 occData["occState"] = "ACTIVE"; 1748 occData["occRole"] = 1749 occ->isMasterOcc() ? "MASTER" : "SECONDARY"; 1750 occData["occHwmonPath"] = 1751 occ->getHwmonPath().generic_string(); 1752 1753 // OCC Poll Response 1754 std::vector<std::uint8_t> cmd = {0x00, 0x00, 0x01, 0x20}; 1755 std::vector<std::uint8_t> rsp; 1756 std::vector<std::string> rspHex; 1757 rsp = passThroughObjects[instance]->send(cmd); 1758 if (rsp.size() > 5) 1759 { 1760 rsp.erase(rsp.begin(), 1761 rsp.begin() + 5); // Strip rsp header 1762 rspHex = utils::hex_dump(rsp); 1763 occData["pollResponse"] = rspHex; 1764 } 1765 1766 // Debug Data: WOF Dynamic Data 1767 cmd = {0x40, 0x00, 0x01, 0x01}; 1768 rsp = passThroughObjects[instance]->send(cmd); 1769 if (rsp.size() > 5) 1770 { 1771 rsp.erase(rsp.begin(), 1772 rsp.begin() + 5); // Strip rsp header 1773 rspHex = utils::hex_dump(rsp); 1774 occData["wofDataDynamic"] = rspHex; 1775 } 1776 1777 // Debug Data: WOF Dynamic Data 1778 cmd = {0x40, 0x00, 0x01, 0x0A}; 1779 rsp = passThroughObjects[instance]->send(cmd); 1780 if (rsp.size() > 5) 1781 { 1782 rsp.erase(rsp.begin(), 1783 rsp.begin() + 5); // Strip rsp header 1784 rspHex = utils::hex_dump(rsp); 1785 occData["wofDataStatic"] = rspHex; 1786 } 1787 } 1788 else 1789 { 1790 occData["occState"] = "NOT ACTIVE"; 1791 } 1792 1793 data[occName] = occData; 1794 } 1795 } 1796 catch (const std::exception& e) 1797 { 1798 lg2::error("Failed to collect OCC dump data: {ERR}", "ERR", 1799 e.what()); 1800 } 1801 } 1802 1803 std::ofstream file{Manager::dumpFile}; 1804 if (!file) 1805 { 1806 lg2::error("Failed to open {FILE} for occ-control data", "FILE", 1807 Manager::dumpFile); 1808 return; 1809 } 1810 1811 file << std::setw(4) << data; 1812 } 1813 1814 } // namespace occ 1815 } // namespace open_power 1816