#include "config.h"

#include "occ_manager.hpp"

#include "i2c_occ.hpp"
#include "occ_dbus.hpp"
#include "utils.hpp"

#include <phosphor-logging/elog-errors.hpp>
#include <phosphor-logging/log.hpp>
#include <xyz/openbmc_project/Common/error.hpp>

#include <algorithm>
#include <cerrno>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <filesystem>
#include <format>
#include <fstream>
#include <functional>
#include <limits>
#include <map>
#include <memory>
#include <optional>
#include <regex>
#include <string>
#include <system_error>
#include <variant>
#include <vector>

namespace open_power
{
namespace occ
{

// fru_type value reported when no FRU type is available for a sensor
constexpr uint32_t fruTypeNotAvailable = 0xFF;

// Suffixes appended to a hwmon sensor file prefix (e.g. "temp5_")
constexpr auto fruTypeSuffix = "fru_type";
constexpr auto faultSuffix = "fault";
constexpr auto inputSuffix = "input";
constexpr auto maxSuffix = "max";

// While this file exists, OCC discovery is postponed
// (see Manager::findAndCreateObjects)
const auto HOST_ON_FILE = "/run/openbmc/host@0-on";

using namespace phosphor::logging;
using namespace std::literals::chrono_literals;

/**
 * @brief Read one value of type T from a file using stream extraction.
 *
 * The stream is configured to throw on failbit, badbit and eofbit, so any
 * open, read or parse problem (including hitting end-of-file during the
 * extraction) ends up in the catch block.
 *
 * @param[in] path - file to read
 *
 * @return the extracted value
 *
 * @throws std::system_error carrying the errno value observed after the
 *         stream failure (generic category)
 */
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit |
                   std::ifstream::eofbit);
    T data;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception& e)
    {
        // Callers inspect the error code (e.g. EAGAIN), so convert the
        // stream exception into a system_error built from errno.
        auto err = errno;
        throw std::system_error(err, std::generic_category());
    }

    return data;
}

/**
 * @brief Create the per-OCC objects.
 *
 * Without POWER10: creates one object set per CPU index [0, MAX_CPUS).
 *
 * With POWER10: creates the PowerMode object if needed, then (as long as
 * HOST_ON_FILE does not exist) looks for OCC devices in /dev.  Status
 * objects are only created once two consecutive searches return the same,
 * non-zero number of OCCs; otherwise the discover timer is re-armed.
 * Once the objects exist, the OCC active sensors are checked when the host
 * is running.
 */
void Manager::findAndCreateObjects()
{
#ifndef POWER10
    for (auto id = 0; id < MAX_CPUS; ++id)
    {
        // Create one occ per cpu
        auto occ = std::string(OCC_NAME) + std::to_string(id);
        createObjects(occ);
    }
#else
    if (!pmode)
    {
        // Create the power mode object
        pmode = std::make_unique<powermode::PowerMode>(
            *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
    }

    if (!fs::exists(HOST_ON_FILE))
    {
        static bool statusObjCreated = false;
        if (!statusObjCreated)
        {
            // Create the OCCs based on the /dev/occX devices
            auto occs = findOCCsInDev();

            if (occs.empty() || (prevOCCSearch.size() != occs.size()))
            {
                // Something changed or no OCCs yet, try again in 10s.
                // Note on the first pass prevOCCSearch will be empty,
                // so there will be at least one delay to give things
                // a chance to settle.
                prevOCCSearch = occs;

                log<level::INFO>(
                    std::format(
                        "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})",
                        occs.size())
                        .c_str());

                discoverTimer->restartOnce(10s);
            }
            else
            {
                // All OCCs appear to be available, create status objects

                // createObjects requires OCC0 first.
                std::sort(occs.begin(), occs.end());

                log<level::INFO>(
                    std::format(
                        "Manager::findAndCreateObjects(): Creating {} OCC Status Objects",
                        occs.size())
                        .c_str());
                for (auto id : occs)
                {
                    createObjects(std::string(OCC_NAME) + std::to_string(id));
                }
                statusObjCreated = true;
                waitingForAllOccActiveSensors = true;

                // Find/update the processor path associated with each OCC
                for (auto& obj : statusObjects)
                {
                    obj->updateProcAssociation();
                }
            }
        }

        if (statusObjCreated && waitingForAllOccActiveSensors)
        {
            // Only trace the "waiting" / "running" transition once
            static bool tracedHostWait = false;
            if (utils::isHostRunning())
            {
                if (tracedHostWait)
                {
                    log<level::INFO>(
                        "Manager::findAndCreateObjects(): Host is running");
                    tracedHostWait = false;
                }
                checkAllActiveSensors();
            }
            else
            {
                if (!tracedHostWait)
                {
                    log<level::INFO>(
                        "Manager::findAndCreateObjects(): Waiting for host to start");
                    tracedHostWait = true;
                }
                discoverTimer->restartOnce(30s);
            }
        }
    }
    else
    {
        log<level::INFO>(
            std::format(
                "Manager::findAndCreateObjects(): Waiting for {} to complete...",
                HOST_ON_FILE)
                .c_str());
        discoverTimer->restartOnce(10s);
    }
#endif
}

#ifdef POWER10
/**
 * @brief Check if all occActive sensors are available.
 *
 * When the host is running, every status object that is neither active nor
 * has received its PLDM sensor is examined: a queued active state is
 * applied immediately, otherwise the sensor is requested via PLDM and the
 * scan stops.  When all sensors are available the discover timer is
 * disabled; otherwise it is re-armed for 10 seconds.
 */
void Manager::checkAllActiveSensors()
{
    static bool allActiveSensorAvailable = false;
    static bool tracedSensorWait = false;
    static bool waitingForHost = false;

    if (open_power::occ::utils::isHostRunning())
    {
        if (waitingForHost)
        {
            waitingForHost = false;
            log<level::INFO>("checkAllActiveSensors(): Host is now running");
        }

        // Start with the assumption that all are available
        allActiveSensorAvailable = true;
        for (auto& obj : statusObjects)
        {
            if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
            {
                auto instance = obj->getOccInstanceID();
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "checkAllActiveSensors(): OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    allActiveSensorAvailable = false;
                    if (!tracedSensorWait)
                    {
                        log<level::INFO>(
                            std::format(
                                "checkAllActiveSensors(): Waiting on OCC{} Active sensor",
                                instance)
                                .c_str());
                        tracedSensorWait = true;
                    }
                    pldmHandle->checkActiveSensor(obj->getOccInstanceID());
                    // Only one sensor is requested per pass
                    break;
                }
            }
        }
    }
    else
    {
        if (!waitingForHost)
        {
            waitingForHost = true;
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for host to start");
        }
    }

    if (allActiveSensorAvailable)
    {
        // All sensors were found, disable the discovery timer
        if (discoverTimer->isEnabled())
        {
            discoverTimer->setEnabled(false);
        }

        if (waitingForAllOccActiveSensors)
        {
            log<level::INFO>(
                "checkAllActiveSensors(): OCC Active sensors are available");
            waitingForAllOccActiveSensors = false;
        }
        queuedActiveState.clear();
        tracedSensorWait = false;
    }
    else
    {
        // Not all sensors were available, so keep waiting
        if (!tracedSensorWait)
        {
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
            tracedSensorWait = true;
        }
        discoverTimer->restartOnce(10s);
    }
}
#endif

/**
 * @brief Scan /dev for entries whose path ends in "occ<N>".
 *
 * @return zero-based OCC instance numbers (N - 1) in directory iteration
 *         order; the result is not sorted
 */
std::vector<int> Manager::findOCCsInDev()
{
    std::vector<int> occs;
    std::regex expr{R"(occ(\d+)$)"};

    for (auto& file : fs::directory_iterator("/dev"))
    {
        std::smatch match;
        std::string path{file.path().string()};
        if (std::regex_search(path, match, expr))
        {
            auto num = std::stoi(match[1].str());

            // /dev numbering starts at 1, ours starts at 0.
            occs.push_back(num - 1);
        }
    }

    return occs;
}

/**
 * @brief Handle a message announcing a new CPU object.
 *
 * Reads the object path from the message, replaces CPU_NAME in its last
 * path component with OCC_NAME and creates the OCC objects for that name.
 * If the last component does not contain CPU_NAME, the message is logged
 * and ignored (previously std::string::replace would have thrown
 * std::out_of_range in that case).
 *
 * @param[in] msg - message whose first element is an object path
 *
 * @return always 0
 */
int Manager::cpuCreated(sdbusplus::message_t& msg)
{
    namespace fs = std::filesystem;

    sdbusplus::message::object_path o;
    msg.read(o);
    fs::path cpuPath(std::string(std::move(o)));

    auto name = cpuPath.filename().string();
    auto index = name.find(CPU_NAME);
    if (index == std::string::npos)
    {
        log<level::ERR>(
            std::format(
                "Manager::cpuCreated(): {} not found in object name {}",
                CPU_NAME, name)
                .c_str());
        return 0;
    }
    name.replace(index, std::strlen(CPU_NAME), OCC_NAME);

    createObjects(name);

    return 0;
}

/**
 * @brief Create the Status and PassThrough objects for one OCC.
 *
 * The power cap monitor is created from the first status object that gets
 * created.  If the new status object reports that it is the master OCC,
 * the poll timer is disabled and (POWER10) the master path is handed to the
 * PowerMode object.
 *
 * @param[in] occ - OCC object name, appended to OCC_CONTROL_ROOT
 */
void Manager::createObjects(const std::string& occ)
{
    auto path = fs::path(OCC_CONTROL_ROOT) / occ;

    statusObjects.emplace_back(std::make_unique<Status>(
        event, path.c_str(), *this,
#ifdef POWER10
        pmode,
#endif
        std::bind(std::mem_fn(&Manager::statusCallBack), this,
                  std::placeholders::_1, std::placeholders::_2)
#ifdef PLDM
            ,
        std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(),
                  std::placeholders::_1)
#endif
            ));

    // Create the power cap monitor object
    if (!pcap)
    {
        pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
            *statusObjects.back());
    }

    if (statusObjects.back()->isMasterOcc())
    {
        log<level::INFO>(
            std::format("Manager::createObjects(): OCC{} is the master",
                        statusObjects.back()->getOccInstanceID())
                .c_str());
        _pollTimer->setEnabled(false);

#ifdef POWER10
        // Set the master OCC on the PowerMode object
        pmode->setMasterOcc(path);
#endif
    }

    passThroughObjects.emplace_back(std::make_unique<PassThrough>(path.c_str()
#ifdef POWER10
                                                                      ,
                                                                  pmode
#endif
                                                                  ));
}

/**
 * @brief Callback bound into each Status object (see createObjects).
 *
 * Maintains activeCount and starts/stops the poll timer and (POWER10) the
 * wait-for-all-OCCs timer accordingly.
 *
 * @param[in] instance - OCC instance whose state changed
 * @param[in] status   - true if the OCC went active, false if it went away
 */
void Manager::statusCallBack(instanceID instance, bool status)
{
    if (status == true)
    {
        // OCC went active
        ++activeCount;

#ifdef POWER10
        if (activeCount == 1)
        {
            // First OCC went active (allow some time for all OCCs to go active)
            waitForAllOccsTimer->restartOnce(60s);
        }
#endif

        if (activeCount == statusObjects.size())
        {
#ifdef POWER10
            // All OCCs are now running
            if (waitForAllOccsTimer->isEnabled())
            {
                // stop occ wait timer
                waitForAllOccsTimer->setEnabled(false);
            }
#endif

            // Verify master OCC and start presence monitor
            validateOccMaster();
        }

        // Start poll timer if not already started
        if (!_pollTimer->isEnabled())
        {
            log<level::INFO>(
                std::format("Manager: OCCs will be polled every {} seconds",
                            pollInterval)
                    .c_str());

            // Send poll and start OCC poll timer
            pollerTimerExpired();
        }
    }
    else
    {
        // OCC went away
        if (activeCount > 0)
        {
            --activeCount;
        }
        else
        {
            log<level::ERR>(
                std::format("OCC{} disabled, but currently no active OCCs",
                            instance)
                    .c_str());
        }

        if (activeCount == 0)
        {
            // No OCCs are running

            // Stop OCC poll timer
            if (_pollTimer->isEnabled())
            {
                log<level::INFO>(
                    "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
                _pollTimer->setEnabled(false);
            }

#ifdef POWER10
            // stop wait timer
            if (waitForAllOccsTimer->isEnabled())
            {
                waitForAllOccsTimer->setEnabled(false);
            }
#endif
        }
#ifdef READ_OCC_SENSORS
        // Clear OCC sensors
        setSensorValueToNaN(instance);
#endif
    }

#ifdef POWER10
    if (waitingForAllOccActiveSensors)
    {
        if (utils::isHostRunning())
        {
            checkAllActiveSensors();
        }
    }
#endif
}

#ifdef I2C_OCC
/**
 * @brief Create a Status object for every OCC hwmon device under DEV_PATH.
 *
 * The first device found is treated as the master OCC: it backs the power
 * cap monitor and (POWER10) its path is given to the PowerMode object.
 * If no device is found, neither of those master-dependent steps is done.
 */
void Manager::initStatusObjects()
{
    // Make sure we have a valid path string
    static_assert(sizeof(DEV_PATH) != 0);

#ifdef POWER10
    // Path of the first device created below (the master OCC).  It has to
    // be remembered here because the per-device path is local to the loop.
    fs::path masterPath;
#endif

    auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
    for (auto& name : deviceNames)
    {
        i2c_occ::i2cToDbus(name);
        name = std::string(OCC_NAME) + '_' + name;
        auto path = fs::path(OCC_CONTROL_ROOT) / name;
#ifdef POWER10
        if (masterPath.empty())
        {
            masterPath = path;
        }
#endif
        statusObjects.emplace_back(
            std::make_unique<Status>(event, path.c_str(), *this));
    }

    // The first device is master occ
    if (!statusObjects.empty())
    {
        pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
            *statusObjects.front());
    }
    else
    {
        // front() on an empty container would be undefined behaviour
        log<level::ERR>(
            "Manager::initStatusObjects(): No OCC hwmon devices found");
    }
#ifdef POWER10
    pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
                                                   powermode::PIPS_PATH);
    if (!masterPath.empty())
    {
        // Set the master OCC on the PowerMode object
        pmode->setMasterOcc(masterPath);
    }
#endif
}
#endif

#ifdef PLDM
/**
 * @brief Handle an SBE timeout for one OCC instance.
 *
 * If a status object exists for the instance and the OCC is active, the SBE
 * state is set to SBE_STATE_NOT_USABLE and an HRESET is requested via PLDM.
 *
 * @param[in] instance - OCC instance that timed out
 */
void Manager::sbeTimeout(unsigned int instance)
{
    auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
                            [instance](const auto& obj) {
                                return instance == obj->getOccInstanceID();
                            });

    if (obj != statusObjects.end() && (*obj)->occActive())
    {
        log<level::INFO>(
            std::format("SBE timeout, requesting HRESET (OCC{})", instance)
                .c_str());

        setSBEState(instance, SBE_STATE_NOT_USABLE);

        pldmHandle->sendHRESET(instance);
    }
}

/**
 * @brief Update the active state of one OCC.
 *
 * If no status object exists yet for the instance, an "active" report is
 * queued in queuedActiveState (and an "inactive" report removes a queued
 * entry).  If the object exists but the host is not running, an "active"
 * report is rejected and the discover timer is re-armed.
 *
 * @param[in] instance - OCC instance
 * @param[in] status   - true when the OCC is reported active
 *
 * @return result of Status::occActive(status) when the state was applied,
 *         false otherwise
 */
bool Manager::updateOCCActive(instanceID instance, bool status)
{
    auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
                            [instance](const auto& obj) {
                                return instance == obj->getOccInstanceID();
                            });

    const bool hostRunning = open_power::occ::utils::isHostRunning();
    if (obj != statusObjects.end())
    {
        if (!hostRunning && (status == true))
        {
            log<level::WARNING>(
                std::format(
                    "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received",
                    instance, status)
                    .c_str());
            (*obj)->setPldmSensorReceived(false);
            if (!waitingForAllOccActiveSensors)
            {
                log<level::INFO>(
                    "updateOCCActive: Waiting for Host and all OCC Active Sensors");
                waitingForAllOccActiveSensors = true;
            }
            discoverTimer->restartOnce(30s);
            return false;
        }
        else
        {
            log<level::INFO>(std::format("updateOCCActive: OCC{} active={}",
                                         instance, status)
                                 .c_str());
            (*obj)->setPldmSensorReceived(true);
            return (*obj)->occActive(status);
        }
    }
    else
    {
        if (hostRunning)
        {
            log<level::WARNING>(
                std::format(
                    "updateOCCActive: No status object to update for OCC{} (active={})",
                    instance, status)
                    .c_str());
        }
        else
        {
            if (status == true)
            {
                log<level::WARNING>(
                    std::format(
                        "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})",
                        instance, status)
                        .c_str());
            }
        }
        if (status == true)
        {
            // OCC went active
            queuedActiveState.insert(instance);
        }
        else
        {
            auto match = queuedActiveState.find(instance);
            if (match != queuedActiveState.end())
            {
                // OCC was disabled
                queuedActiveState.erase(match);
            }
        }
        return false;
    }
}

// Called upon a PLDM event to set the PowerMode safe mode state for the system.
void Manager::updateOccSafeMode(bool safeMode)
{
#ifdef POWER10
    pmode->updateDbusSafeMode(safeMode);
#endif
    // Update the processor throttle status on dbus
    for (auto& obj : statusObjects)
    {
        obj->updateThrottle(safeMode, THROTTLED_SAFE);
    }
}

/**
 * @brief Handle the result of an SBE HRESET request.
 *
 * On success the SBE state is set to SBE_STATE_BOOTED.  On failure the state
 * is set to SBE_STATE_FAILED and, if a dump is allowed, a PEL is created and
 * an SBE dump is requested over D-Bus.
 *
 * @param[in] instance - OCC instance the HRESET was issued for
 * @param[in] success  - true if the HRESET succeeded
 */
void Manager::sbeHRESETResult(instanceID instance, bool success)
{
    if (success)
    {
        log<level::INFO>(
            std::format("HRESET succeeded (OCC{})", instance).c_str());

        setSBEState(instance, SBE_STATE_BOOTED);

        return;
    }

    setSBEState(instance, SBE_STATE_FAILED);

    if (sbeCanDump(instance))
    {
        log<level::INFO>(
            std::format("HRESET failed (OCC{}), triggering SBE dump", instance)
                .c_str());

        auto& bus = utils::getBus();
        uint32_t src6 = instance << 16;
        uint32_t logId =
            FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
                            src6, "SBE command timeout");

        try
        {
            constexpr auto path = "/org/openpower/dump";
            constexpr auto interface = "xyz.openbmc_project.Dump.Create";
            constexpr auto function = "CreateDump";

            std::string service = utils::getService(path, interface);
            auto method = bus.new_method_call(service.c_str(), path, interface,
                                              function);

            std::map<std::string, std::variant<std::string, uint64_t>>
                createParams{
                    {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
                     uint64_t(logId)},
                    {"com.ibm.Dump.Create.CreateParameters.DumpType",
                     "com.ibm.Dump.Create.DumpType.SBE"},
                    {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
                     uint64_t(instance)},
                };

            method.append(createParams);

            auto response = bus.call(method);
        }
        catch (const sdbusplus::exception_t& e)
        {
            constexpr auto ERROR_DUMP_DISABLED =
                "xyz.openbmc_project.Dump.Create.Error.Disabled";
            // Compare the error name by content.  Both operands are
            // C strings, so operator== on them would only compare the
            // pointer values and never match.
            const char* errorName = e.name();
            if ((errorName != nullptr) &&
                (std::string(errorName) == ERROR_DUMP_DISABLED))
            {
                log<level::INFO>("Dump is disabled, skipping");
            }
            else
            {
                log<level::ERR>("Dump failed");
            }
        }
    }
}

/**
 * @brief Decide whether an SBE dump may be collected for an OCC instance.
 *
 * @param[in] instance - OCC instance
 *
 * @return false if the SBE reports that a dump is not allowed or that a
 *         vital attention is active; true otherwise, including when the
 *         pdbg target cannot be found or the SBE query fails
 */
bool Manager::sbeCanDump(unsigned int instance)
{
    struct pdbg_target* proc = getPdbgTarget(instance);

    if (!proc)
    {
        // allow the dump in the error case
        return true;
    }

    try
    {
        if (!openpower::phal::sbe::isDumpAllowed(proc))
        {
            return false;
        }

        if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
        {
            return false;
        }
    }
    catch (const openpower::phal::exception::SbeError& e)
    {
        log<level::INFO>("Failed to query SBE state");
    }

    // allow the dump in the error case
    return true;
}

/**
 * @brief Set the SBE state for the processor matching an OCC instance.
 *
 * Does nothing if the pdbg target cannot be found; a failure to set the
 * state is logged and otherwise ignored.
 *
 * @param[in] instance - OCC instance
 * @param[in] state    - SBE state to set
 */
void Manager::setSBEState(unsigned int instance, enum sbe_state state)
{
    struct pdbg_target* proc = getPdbgTarget(instance);

    if (!proc)
    {
        return;
    }

    try
    {
        openpower::phal::sbe::setState(proc, state);
    }
    catch (const openpower::phal::exception::SbeError& e)
    {
        log<level::ERR>("Failed to set SBE state");
    }
}

/**
 * @brief Find the pdbg "proc" target whose index equals the OCC instance.
 *
 * pdbg is initialized on first use.
 *
 * @param[in] instance - OCC instance
 *
 * @return the matching target, or nullptr if pdbg initialization failed or
 *         no target has that index
 */
struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
{
    if (!pdbgInitialized)
    {
        try
        {
            openpower::phal::pdbg::init();
            pdbgInitialized = true;
        }
        catch (const openpower::phal::exception::PdbgError& e)
        {
            log<level::ERR>("pdbg initialization failed");
            return nullptr;
        }
    }

    struct pdbg_target* proc = nullptr;
    pdbg_for_each_class_target("proc", proc)
    {
        if (pdbg_target_index(proc) == instance)
        {
            return proc;
        }
    }

    log<level::ERR>("Failed to get pdbg target");
    return nullptr;
}
#endif

/**
 * @brief Poll every active OCC and re-arm the poll timer.
 *
 * Inactive OCCs are skipped (their sensors are set to NaN when
 * READ_OCC_SENSORS is defined).  The timer is only restarted while at least
 * one OCC is active.
 */
void Manager::pollerTimerExpired()
{
    if (!_pollTimer)
    {
        log<level::ERR>(
            "Manager::pollerTimerExpired() ERROR: Timer not defined");
        return;
    }

    for (auto& obj : statusObjects)
    {
        if (!obj->occActive())
        {
            // OCC is not running yet
#ifdef READ_OCC_SENSORS
            auto id = obj->getOccInstanceID();
            setSensorValueToNaN(id);
#endif
            continue;
        }

        // Read sysfs to force kernel to poll OCC
        obj->readOccState();

#ifdef READ_OCC_SENSORS
        // Read occ sensor values
        getSensorValues(obj);
#endif
    }

    if (activeCount > 0)
    {
        // Restart OCC poll timer
        _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
    }
    else
    {
        // No OCCs running, so poll timer will not be restarted
        log<level::INFO>(
            std::format(
                "Manager::pollerTimerExpired: poll timer will not be restarted")
                .c_str());
    }
}

#ifdef READ_OCC_SENSORS
/**
 * @brief Read the temperature sensors of one OCC and publish them on D-Bus.
 *
 * For every "temp<N>_label" file in the directory, the matching fru_type,
 * fault, input and (for DVFS) max files are read.  Values are collected per
 * D-Bus object path first, because several sensors can map to the same
 * path, and are published afterwards.
 *
 * @param[in] path        - directory holding the sensor files
 * @param[in] occInstance - OCC instance the sensors belong to
 */
void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
{
    // There may be more than one sensor with the same FRU type
    // and label so make two passes: the first to read the temps
    // from sysfs, and the second to put them on D-Bus after
    // resolving any conflicts.
    std::map<std::string, double> sensorData;

    std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
    for (auto& file : fs::directory_iterator(path))
    {
        if (!std::regex_search(file.path().string(), expr))
        {
            continue;
        }

        uint32_t labelValue{0};

        try
        {
            labelValue = readFile<uint32_t>(file.path());
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            file.path().string(), e.code().value())
                    .c_str());
            continue;
        }

        // Strip the trailing "label" to get the common prefix
        // (e.g. ".../temp5_") shared by this sensor's files
        const std::string& tempLabel = "label";
        const std::string filePathString = file.path().string().substr(
            0, file.path().string().length() - tempLabel.length());

        uint32_t fruTypeValue{0};
        try
        {
            fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            filePathString + fruTypeSuffix, e.code().value())
                    .c_str());
            continue;
        }

        std::string sensorPath = OCC_SENSORS_ROOT +
                                 std::string("/temperature/");

        std::string dvfsTempPath;

        if (fruTypeValue == VRMVdd)
        {
            sensorPath.append("vrm_vdd" + std::to_string(occInstance) +
                              "_temp");
        }
        else if (fruTypeValue == processorIoRing)
        {
            sensorPath.append("proc" + std::to_string(occInstance) +
                              "_ioring_temp");
            dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
                           std::to_string(occInstance) + "_ioring_dvfs_temp";
        }
        else
        {
            // The label's top byte is the sensor type and its low 16 bits
            // are the sensor instance
            uint16_t type = (labelValue & 0xFF000000) >> 24;
            uint16_t instanceID = labelValue & 0x0000FFFF;

            if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
            {
                if (fruTypeValue == fruTypeNotAvailable)
                {
                    // Not all DIMM related temps are available to read
                    // (no _input file in this case)
                    continue;
                }
                auto iter = dimmTempSensorName.find(fruTypeValue);
                if (iter == dimmTempSensorName.end())
                {
                    log<level::ERR>(
                        std::format(
                            "readTempSensors: Fru type error! fruTypeValue = {}) ",
                            fruTypeValue)
                            .c_str());
                    continue;
                }

                sensorPath.append("dimm" + std::to_string(instanceID) +
                                  iter->second);

                // Use find() rather than at(): a FRU type without a DVFS
                // name must not throw out of the poll path.  In that case
                // the DVFS temp is skipped and the sensor is still read.
                auto dvfsIter = dimmDVFSSensorName.find(fruTypeValue);
                if (dvfsIter != dimmDVFSSensorName.end())
                {
                    dvfsTempPath = std::string{OCC_SENSORS_ROOT} +
                                   "/temperature/" + dvfsIter->second;
                }
            }
            else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
            {
                if (fruTypeValue == processorCore)
                {
                    // The OCC reports small core temps, of which there are
                    // two per big core.  All current P10 systems are in big
                    // core mode, so use a big core name.
                    uint16_t coreNum = instanceID / 2;
                    uint16_t tempNum = instanceID % 2;
                    sensorPath.append("proc" + std::to_string(occInstance) +
                                      "_core" + std::to_string(coreNum) + "_" +
                                      std::to_string(tempNum) + "_temp");

                    dvfsTempPath =
                        std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
                        std::to_string(occInstance) + "_core_dvfs_temp";
                }
                else
                {
                    continue;
                }
            }
            else
            {
                continue;
            }
        }

        // The dvfs temp file only needs to be read once per chip per type.
        if (!dvfsTempPath.empty() &&
            !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
        {
            try
            {
                auto dvfsValue = readFile<double>(filePathString + maxSuffix);

                dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
                    dvfsTempPath, dvfsValue * std::pow(10, -3));
            }
            catch (const std::system_error& e)
            {
                log<level::DEBUG>(
                    std::format(
                        "readTempSensors: Failed reading {}, errno = {}",
                        filePathString + maxSuffix, e.code().value())
                        .c_str());
            }
        }

        uint32_t faultValue{0};
        try
        {
            faultValue = readFile<uint32_t>(filePathString + faultSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            filePathString + faultSuffix, e.code().value())
                    .c_str());
            continue;
        }

        double tempValue{0};
        // NOTE: if OCC sends back 0xFF, kernel sets this fault value to 1.
        if (faultValue != 0)
        {
            tempValue = std::numeric_limits<double>::quiet_NaN();
        }
        else
        {
            // Read the temperature
            try
            {
                tempValue = readFile<double>(filePathString + inputSuffix);
            }
            catch (const std::system_error& e)
            {
                log<level::DEBUG>(
                    std::format(
                        "readTempSensors: Failed reading {}, errno = {}",
                        filePathString + inputSuffix, e.code().value())
                        .c_str());

                // if errno == EAGAIN(Resource temporarily unavailable) then set
                // temp to 0, to avoid using old temp, and affecting FAN
                // Control.
                if (e.code().value() == EAGAIN)
                {
                    tempValue = 0;
                }
                // else the errno would be something like
                //     EBADF(Bad file descriptor)
                //  or ENOENT(No such file or directory)
                else
                {
                    continue;
                }
            }
        }

        // If this object path already has a value, only overwrite
        // it if the previous one was an NaN or a smaller value.
        auto existing = sensorData.find(sensorPath);
        if (existing != sensorData.end())
        {
            // Multiple sensors found for this FRU type
            if ((std::isnan(existing->second) && (tempValue == 0)) ||
                ((existing->second == 0) && std::isnan(tempValue)))
            {
                // One of the redundant sensors has failed (0xFF/nan), and the
                // other sensor has no reading (0), so set the FRU to NaN to
                // force fan increase
                tempValue = std::numeric_limits<double>::quiet_NaN();
                existing->second = tempValue;
            }
            if (std::isnan(existing->second) || (tempValue > existing->second))
            {
                existing->second = tempValue;
            }
        }
        else
        {
            // First sensor for this FRU type
            sensorData[sensorPath] = tempValue;
        }
    }

    // Now publish the values on D-Bus.
    for (const auto& [objectPath, value] : sensorData)
    {
        // Published value is the raw reading scaled by 10^-3
        dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
                                                    value * std::pow(10, -3));

        dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
            objectPath, !std::isnan(value));

        // Only add the chassis association the first time a sensor is seen
        if (existingSensors.find(objectPath) == existingSensors.end())
        {
            dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
                objectPath);
        }

        existingSensors[objectPath] = occInstance;
    }
}

/**
 * @brief Extract the function ID from a power sensor label.
 *
 * @param[in] value - label contents, either "system" or
 *                    "<sensor id>_<function id>_<apss channel>"
 *
 * @return "system" for the system label, the middle field for a
 *         three-field label, or std::nullopt if the label contains fewer
 *         than two underscores
 */
std::optional<std::string>
    Manager::getPowerLabelFunctionID(const std::string& value)
{
    // If the value is "system", then the FunctionID is "system".
    if (value == "system")
    {
        return value;
    }

    // If the value is not "system", then the label value have 3 numbers, of
    // which we only care about the middle one:
    // <sensor id>_<function id>_<apss channel>
    // eg: The value is "0_10_5" , then the FunctionID is "10".
    if (value.find("_") == std::string::npos)
    {
        return std::nullopt;
    }

    auto powerLabelValue = value.substr((value.find("_") + 1));

    if (powerLabelValue.find("_") == std::string::npos)
    {
        return std::nullopt;
    }

    return powerLabelValue.substr(0, powerLabelValue.find("_"));
}

/**
 * @brief Read the power sensors of one OCC and publish them on D-Bus.
 *
 * For every "power<N>_label" file whose function ID has an entry in
 * powerSensorName, the matching input file is read and published with
 * Watts as the unit.
 *
 * @param[in] path - directory holding the sensor files
 * @param[in] id   - OCC instance the sensors belong to
 */
void Manager::readPowerSensors(const fs::path& path, uint32_t id)
{
    std::regex expr{"power\\d+_label$"}; // Example: power5_label
    for (auto& file : fs::directory_iterator(path))
    {
        if (!std::regex_search(file.path().string(), expr))
        {
            continue;
        }

        std::string labelValue;
        try
        {
            labelValue = readFile<std::string>(file.path());
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            file.path().string(), e.code().value())
                    .c_str());
            continue;
        }

        auto functionID = getPowerLabelFunctionID(labelValue);
        if (functionID == std::nullopt)
        {
            continue;
        }

        // Strip the trailing "label" to get the common prefix
        // (e.g. ".../power5_") shared by this sensor's files
        const std::string& tempLabel = "label";
        const std::string filePathString = file.path().string().substr(
            0, file.path().string().length() - tempLabel.length());

        std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");

        auto iter = powerSensorName.find(*functionID);
        if (iter == powerSensorName.end())
        {
            continue;
        }
        sensorPath.append(iter->second);

        double tempValue{0};

        try
        {
            tempValue = readFile<double>(filePathString + inputSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            filePathString + inputSuffix, e.code().value())
                    .c_str());
            continue;
        }

        dbus::OccDBusSensors::getOccDBus().setUnit(
            sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");

        // Raw reading scaled by 10^-6 (presumably microwatts to watts --
        // TODO confirm against the hwmon driver)
        dbus::OccDBusSensors::getOccDBus().setValue(
            sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3));

        dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
                                                                true);

        // Only add the chassis association the first time a sensor is seen
        if (existingSensors.find(sensorPath) == existingSensors.end())
        {
            dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
                sensorPath);
        }

        existingSensors[sensorPath] = id;
    }
    return;
}

/**
 * @brief Set every known sensor of one OCC to NaN while leaving its
 *        operational status set to true.
 *
 * @param[in] id - OCC instance whose sensors are updated
 */
void Manager::setSensorValueToNaN(uint32_t id) const
{
    for (const auto& [sensorPath, occId] : existingSensors)
    {
        if (occId == id)
        {
            dbus::OccDBusSensors::getOccDBus().setValue(
                sensorPath, std::numeric_limits<double>::quiet_NaN());

            dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
                                                                    true);
        }
    }
    return;
}

/**
 * @brief Set every known sensor of one OCC to NaN and set its operational
 *        status to false.
 *
 * @param[in] id - OCC instance whose sensors are updated
 */
void Manager::setSensorValueToNonFunctional(uint32_t id) const
{
    for (const auto& [sensorPath, occId] : existingSensors)
    {
        if (occId == id)
        {
            dbus::OccDBusSensors::getOccDBus().setValue(
                sensorPath, std::numeric_limits<double>::quiet_NaN());

            dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
                                                                    false);
        }
    }
    return;
}

/**
 * @brief Read the sensors of one OCC from its hwmon directory.
 *
 * Temperature sensors are read for every OCC; power sensors only for the
 * master OCC.  A missing hwmon path is logged once per OCC until the path
 * shows up again.
 *
 * @param[in] occ - status object of the OCC to read
 */
void Manager::getSensorValues(std::unique_ptr<Status>& occ)
{
    // Keyed by OCC instance so that any instance id is safe to use (a
    // fixed-size array indexed by id could be overrun).
    static std::map<uint32_t, bool> tracedError;
    const fs::path sensorPath = occ->getHwmonPath();
    const uint32_t id = occ->getOccInstanceID();

    if (fs::exists(sensorPath))
    {
        // Read temperature sensors
        readTempSensors(sensorPath, id);

        if (occ->isMasterOcc())
        {
            // Read power sensors
            readPowerSensors(sensorPath, id);
        }
        tracedError[id] = false;
    }
    else
    {
        if (!tracedError[id])
        {
            log<level::ERR>(
                std::format(
                    "Manager::getSensorValues: OCC{} sensor path missing: {}",
                    id, sensorPath.c_str())
                    .c_str());
            tracedError[id] = true;
        }
    }

    return;
}
#endif

/**
 * @brief Read the altitude from D-Bus.
 *
 * Negative readings are stored as 0, readings below 0xFFFF are rounded to
 * the nearest meter, and readings of 0xFFFF or more are rejected (altitude
 * is left unchanged).  If the property cannot be read, altitude is set to
 * 0xFFFF (not available).
 */
void Manager::readAltitude()
{
    // Used to trace a failure only once until a good reading is seen
    static bool traceAltitudeErr = true;

    utils::PropertyValue altitudeProperty{};
    try
    {
        altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
                                              ALTITUDE_PROP);
        auto sensorVal = std::get<double>(altitudeProperty);
        if (sensorVal < 0xFFFF)
        {
            if (sensorVal < 0)
            {
                altitude = 0;
            }
            else
            {
                // Round to nearest meter
                altitude = uint16_t(sensorVal + 0.5);
            }
            log<level::DEBUG>(std::format("readAltitude: sensor={} ({}m)",
                                          sensorVal, altitude)
                                  .c_str());
            traceAltitudeErr = true;
        }
        else
        {
            if (traceAltitudeErr)
            {
                traceAltitudeErr = false;
                log<level::DEBUG>(
                    std::format("Invalid altitude value: {}", sensorVal)
                        .c_str());
            }
        }
    }
    catch (const sdbusplus::exception_t& e)
    {
        if (traceAltitudeErr)
        {
            traceAltitudeErr = false;
            log<level::INFO>(
                std::format("Unable to read Altitude: {}", e.what()).c_str());
        }
        altitude = 0xFFFF; // not available
    }
}

/**
 * @brief Callback function when ambient temperature changes.
 *
 * The new reading is rounded to whole degrees C (NaN becomes 0xFF, negative
 * values become 0).  If the rounded value differs from the stored ambient,
 * it is stored and (POWER10) sent with the altitude to all active OCCs.
 *
 * @param[in] msg - properties-changed message for the ambient sensor
 */
void Manager::ambientCallback(sdbusplus::message_t& msg)
{
    double currentTemp = 0;
    uint8_t truncatedTemp = 0xFF;
    std::string msgSensor;
    std::map<std::string, std::variant<double>> msgData;
    msg.read(msgSensor, msgData);

    auto valPropMap = msgData.find(AMBIENT_PROP);
    if (valPropMap == msgData.end())
    {
        log<level::DEBUG>("ambientCallback: Unknown ambient property changed");
        return;
    }
    currentTemp = std::get<double>(valPropMap->second);
    if (std::isnan(currentTemp))
    {
        truncatedTemp = 0xFF;
    }
    else
    {
        if (currentTemp < 0)
        {
            truncatedTemp = 0;
        }
        else
        {
            // Round to nearest degree C
            truncatedTemp = uint8_t(currentTemp + 0.5);
        }
    }

    // If ambient changes, notify OCCs
    if (truncatedTemp != ambient)
    {
        log<level::DEBUG>(
            std::format("ambientCallback: Ambient change from {} to {}C",
                        ambient, currentTemp)
                .c_str());

        ambient = truncatedTemp;
        if (altitude == 0xFFFF)
        {
            // No altitude yet, try reading again
            readAltitude();
        }

        log<level::DEBUG>(
            std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient,
                        altitude)
                .c_str());
#ifdef POWER10
        // Send ambient and altitude to all OCCs
        for (auto& obj : statusObjects)
        {
            if (obj->occActive())
            {
                obj->sendAmbient(ambient, altitude);
            }
        }
#endif // POWER10
    }
}

/**
 * @brief Return the current ambient and altitude readings.
 *
 * @param[out] ambientValid  - false when the stored ambient is 0xFF
 * @param[out] ambientTemp   - stored ambient temperature
 * @param[out] altitudeValue - stored altitude
 */
void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
                             uint16_t& altitudeValue) const
{
    ambientValid = true;
    ambientTemp = ambient;
    altitudeValue = altitude;

    if (ambient == 0xFF)
    {
        ambientValid = false;
    }
}

#ifdef POWER10
// Called when waitForAllOccsTimer expires
// After the first OCC goes active, this timer will be started (60 seconds)
void Manager::occsNotAllRunning()
{
    if (activeCount != statusObjects.size())
    {
        // Not all OCCs went active
        log<level::WARNING>(
            std::format(
                "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
                activeCount, statusObjects.size())
                .c_str());
        // Procs may be garded, so may be expected
    }

    validateOccMaster();
}
#endif // POWER10

/**
 * @brief Verify single master OCC and start presence monitor.
 *
 * (POWER10) Inactive OCCs are first given a chance to go active, either
 * from a queued state or by requesting the active sensor; the check is
 * abandoned if the host is not running.  A device error is raised when
 * more than one master, or no master, is found.
 */
void Manager::validateOccMaster()
{
    int masterInstance = -1;
    for (auto& obj : statusObjects)
    {
        auto instance = obj->getOccInstanceID();
#ifdef POWER10
        if (!obj->occActive())
        {
            if (utils::isHostRunning())
            {
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "validateOccMaster: OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    // OCC does not appear to be active yet, check active sensor
                    pldmHandle->checkActiveSensor(instance);
                    if (obj->occActive())
                    {
                        log<level::INFO>(
                            std::format(
                                "validateOccMaster: OCC{} is ACTIVE after reading sensor",
                                instance)
                                .c_str());
                    }
                }
            }
            else
            {
                log<level::WARNING>(
                    std::format(
                        "validateOccMaster: HOST is not running (OCC{})",
                        instance)
                        .c_str());
                return;
            }
        }
#endif // POWER10

        if (obj->isMasterOcc())
        {
            obj->addPresenceWatchMaster();

            if (masterInstance == -1)
            {
                masterInstance = instance;
            }
            else
            {
                log<level::ERR>(
                    std::format(
                        "validateOccMaster: Multiple OCC masters! ({} and {})",
                        masterInstance, instance)
                        .c_str());
                // request reset
                obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
            }
        }
    }

    if (masterInstance < 0)
    {
        log<level::ERR>(
            std::format("validateOccMaster: Master OCC not found! (of {} OCCs)",
                        statusObjects.size())
                .c_str());
        // request reset (front() must not be called on an empty container)
        if (!statusObjects.empty())
        {
            statusObjects.front()->deviceError(
                Error::Descriptor(PRESENCE_ERROR_PATH));
        }
    }
    else
    {
        log<level::INFO>(
            std::format("validateOccMaster: OCC{} is master of {} OCCs",
                        masterInstance, activeCount)
                .c_str());
#ifdef POWER10
        pmode->updateDbusSafeMode(false);
#endif
    }
}

/**
 * @brief Forward a power cap bounds update to the power cap monitor, if
 *        one has been created.
 */
void Manager::updatePcapBounds() const
{
    if (pcap)
    {
        pcap->updatePcapBounds();
    }
}

} // namespace occ
} // namespace open_power