1 #include "config.h" 2 3 #include "occ_manager.hpp" 4 5 #include "i2c_occ.hpp" 6 #include "occ_dbus.hpp" 7 #include "utils.hpp" 8 9 #include <phosphor-logging/elog-errors.hpp> 10 #include <phosphor-logging/log.hpp> 11 #include <xyz/openbmc_project/Common/error.hpp> 12 13 #include <chrono> 14 #include <cmath> 15 #include <filesystem> 16 #include <fstream> 17 #include <regex> 18 19 namespace open_power 20 { 21 namespace occ 22 { 23 24 constexpr uint32_t fruTypeNotAvailable = 0xFF; 25 constexpr auto fruTypeSuffix = "fru_type"; 26 constexpr auto faultSuffix = "fault"; 27 constexpr auto inputSuffix = "input"; 28 constexpr auto maxSuffix = "max"; 29 30 const auto HOST_ON_FILE = "/run/openbmc/host@0-on"; 31 32 using namespace phosphor::logging; 33 using namespace std::literals::chrono_literals; 34 35 template <typename T> 36 T readFile(const std::string& path) 37 { 38 std::ifstream ifs; 39 ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit | 40 std::ifstream::eofbit); 41 T data; 42 43 try 44 { 45 ifs.open(path); 46 ifs >> data; 47 ifs.close(); 48 } 49 catch (const std::exception& e) 50 { 51 auto err = errno; 52 throw std::system_error(err, std::generic_category()); 53 } 54 55 return data; 56 } 57 58 void Manager::findAndCreateObjects() 59 { 60 #ifndef POWER10 61 for (auto id = 0; id < MAX_CPUS; ++id) 62 { 63 // Create one occ per cpu 64 auto occ = std::string(OCC_NAME) + std::to_string(id); 65 createObjects(occ); 66 } 67 #else 68 if (!pmode) 69 { 70 // Create the power mode object 71 pmode = std::make_unique<powermode::PowerMode>( 72 *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event); 73 } 74 75 if (!fs::exists(HOST_ON_FILE)) 76 { 77 static bool statusObjCreated = false; 78 if (!statusObjCreated) 79 { 80 // Create the OCCs based on on the /dev/occX devices 81 auto occs = findOCCsInDev(); 82 83 if (occs.empty() || (prevOCCSearch.size() != occs.size())) 84 { 85 // Something changed or no OCCs yet, try again in 10s. 86 // Note on the first pass prevOCCSearch will be empty, 87 // so there will be at least one delay to give things 88 // a chance to settle. 89 prevOCCSearch = occs; 90 91 log<level::INFO>( 92 fmt::format( 93 "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})", 94 occs.size()) 95 .c_str()); 96 97 discoverTimer->restartOnce(10s); 98 } 99 else 100 { 101 // All OCCs appear to be available, create status objects 102 103 // createObjects requires OCC0 first. 104 std::sort(occs.begin(), occs.end()); 105 106 log<level::INFO>( 107 fmt::format( 108 "Manager::findAndCreateObjects(): Creating {} OCC Status Objects", 109 occs.size()) 110 .c_str()); 111 for (auto id : occs) 112 { 113 createObjects(std::string(OCC_NAME) + std::to_string(id)); 114 } 115 statusObjCreated = true; 116 waitingForAllOccActiveSensors = true; 117 } 118 } 119 120 if (statusObjCreated && waitingForAllOccActiveSensors) 121 { 122 static bool tracedHostWait = false; 123 if (utils::isHostRunning()) 124 { 125 if (tracedHostWait) 126 { 127 log<level::INFO>( 128 "Manager::findAndCreateObjects(): Host is running"); 129 tracedHostWait = false; 130 } 131 checkAllActiveSensors(); 132 } 133 else 134 { 135 if (!tracedHostWait) 136 { 137 log<level::INFO>( 138 "Manager::findAndCreateObjects(): Waiting for host to start"); 139 tracedHostWait = true; 140 } 141 discoverTimer->restartOnce(30s); 142 } 143 } 144 } 145 else 146 { 147 log<level::INFO>( 148 fmt::format( 149 "Manager::findAndCreateObjects(): Waiting for {} to complete...", 150 HOST_ON_FILE) 151 .c_str()); 152 discoverTimer->restartOnce(10s); 153 } 154 #endif 155 } 156 157 #ifdef POWER10 158 // Check if all occActive sensors are available 159 void Manager::checkAllActiveSensors() 160 { 161 static bool allActiveSensorAvailable = false; 162 static bool tracedSensorWait = false; 163 164 // Start with the assumption that all are available 165 allActiveSensorAvailable = true; 166 for (auto& obj : statusObjects) 167 { 168 if (!obj->occActive()) 169 { 170 if (!obj->getPldmSensorReceived()) 171 { 172 auto instance = obj->getOccInstanceID(); 173 // Check if sensor was queued while waiting for discovery 174 auto match = queuedActiveState.find(instance); 175 if (match != queuedActiveState.end()) 176 { 177 queuedActiveState.erase(match); 178 log<level::INFO>( 179 fmt::format( 180 "checkAllActiveSensors(): OCC{} is ACTIVE (queued)", 181 instance) 182 .c_str()); 183 obj->occActive(true); 184 } 185 else 186 { 187 allActiveSensorAvailable = false; 188 if (!tracedSensorWait) 189 { 190 log<level::INFO>( 191 fmt::format( 192 "checkAllActiveSensors(): Waiting on OCC{} Active sensor", 193 instance) 194 .c_str()); 195 tracedSensorWait = true; 196 } 197 pldmHandle->checkActiveSensor(obj->getOccInstanceID()); 198 break; 199 } 200 } 201 } 202 } 203 204 if (allActiveSensorAvailable) 205 { 206 // All sensors were found, disable the discovery timer 207 if (discoverTimer->isEnabled()) 208 { 209 discoverTimer->setEnabled(false); 210 } 211 212 if (waitingForAllOccActiveSensors) 213 { 214 log<level::INFO>( 215 "checkAllActiveSensors(): OCC Active sensors are available"); 216 waitingForAllOccActiveSensors = false; 217 } 218 queuedActiveState.clear(); 219 tracedSensorWait = false; 220 } 221 else 222 { 223 // Not all sensors were available, so keep waiting 224 if (!tracedSensorWait) 225 { 226 log<level::INFO>( 227 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available"); 228 tracedSensorWait = true; 229 } 230 discoverTimer->restartOnce(10s); 231 } 232 } 233 #endif 234 235 std::vector<int> Manager::findOCCsInDev() 236 { 237 std::vector<int> occs; 238 std::regex expr{R"(occ(\d+)$)"}; 239 240 for (auto& file : fs::directory_iterator("/dev")) 241 { 242 std::smatch match; 243 std::string path{file.path().string()}; 244 if (std::regex_search(path, match, expr)) 245 { 246 auto num = std::stoi(match[1].str()); 247 248 // /dev numbering starts at 1, ours starts at 0. 249 occs.push_back(num - 1); 250 } 251 } 252 253 return occs; 254 } 255 256 int Manager::cpuCreated(sdbusplus::message_t& msg) 257 { 258 namespace fs = std::filesystem; 259 260 sdbusplus::message::object_path o; 261 msg.read(o); 262 fs::path cpuPath(std::string(std::move(o))); 263 264 auto name = cpuPath.filename().string(); 265 auto index = name.find(CPU_NAME); 266 name.replace(index, std::strlen(CPU_NAME), OCC_NAME); 267 268 createObjects(name); 269 270 return 0; 271 } 272 273 void Manager::createObjects(const std::string& occ) 274 { 275 auto path = fs::path(OCC_CONTROL_ROOT) / occ; 276 277 statusObjects.emplace_back(std::make_unique<Status>( 278 event, path.c_str(), *this, 279 #ifdef POWER10 280 pmode, 281 #endif 282 std::bind(std::mem_fn(&Manager::statusCallBack), this, 283 std::placeholders::_1, std::placeholders::_2) 284 #ifdef PLDM 285 , 286 std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(), 287 std::placeholders::_1) 288 #endif 289 )); 290 291 // Create the power cap monitor object 292 if (!pcap) 293 { 294 pcap = std::make_unique<open_power::occ::powercap::PowerCap>( 295 *statusObjects.back()); 296 } 297 298 if (statusObjects.back()->isMasterOcc()) 299 { 300 log<level::INFO>( 301 fmt::format("Manager::createObjects(): OCC{} is the master", 302 statusObjects.back()->getOccInstanceID()) 303 .c_str()); 304 _pollTimer->setEnabled(false); 305 306 #ifdef POWER10 307 // Set the master OCC on the PowerMode object 308 pmode->setMasterOcc(path); 309 #endif 310 } 311 312 passThroughObjects.emplace_back(std::make_unique<PassThrough>(path.c_str() 313 #ifdef POWER10 314 , 315 pmode 316 #endif 317 )); 318 } 319 320 void Manager::statusCallBack(instanceID instance, bool status) 321 { 322 using InternalFailure = 323 sdbusplus::xyz::openbmc_project::Common::Error::InternalFailure; 324 325 // At this time, it won't happen but keeping it 326 // here just in case something changes in the future 327 if ((activeCount == 0) && (!status)) 328 { 329 log<level::ERR>( 330 fmt::format("Invalid update on OCCActive with OCC{}", instance) 331 .c_str()); 332 333 elog<InternalFailure>(); 334 } 335 336 if (status == true) 337 { 338 // OCC went active 339 ++activeCount; 340 341 #ifdef POWER10 342 if (activeCount == 1) 343 { 344 // First OCC went active (allow some time for all OCCs to go active) 345 waitForAllOccsTimer->restartOnce(60s); 346 } 347 #endif 348 349 if (activeCount == statusObjects.size()) 350 { 351 #ifdef POWER10 352 // All OCCs are now running 353 if (waitForAllOccsTimer->isEnabled()) 354 { 355 // stop occ wait timer 356 waitForAllOccsTimer->setEnabled(false); 357 } 358 #endif 359 360 // Verify master OCC and start presence monitor 361 validateOccMaster(); 362 } 363 364 // Start poll timer if not already started 365 if (!_pollTimer->isEnabled()) 366 { 367 log<level::INFO>( 368 fmt::format("Manager: OCCs will be polled every {} seconds", 369 pollInterval) 370 .c_str()); 371 372 // Send poll and start OCC poll timer 373 pollerTimerExpired(); 374 } 375 } 376 else 377 { 378 // OCC went away 379 --activeCount; 380 381 if (activeCount == 0) 382 { 383 // No OCCs are running 384 385 // Stop OCC poll timer 386 if (_pollTimer->isEnabled()) 387 { 388 log<level::INFO>( 389 "Manager::statusCallBack(): OCCs are not running, stopping poll timer"); 390 _pollTimer->setEnabled(false); 391 } 392 393 #ifdef POWER10 394 // stop wait timer 395 if (waitForAllOccsTimer->isEnabled()) 396 { 397 waitForAllOccsTimer->setEnabled(false); 398 } 399 #endif 400 } 401 #ifdef READ_OCC_SENSORS 402 // Clear OCC sensors 403 setSensorValueToNaN(instance); 404 #endif 405 } 406 407 #ifdef POWER10 408 if (waitingForAllOccActiveSensors) 409 { 410 if (utils::isHostRunning()) 411 { 412 checkAllActiveSensors(); 413 } 414 } 415 #endif 416 } 417 418 #ifdef I2C_OCC 419 void Manager::initStatusObjects() 420 { 421 // Make sure we have a valid path string 422 static_assert(sizeof(DEV_PATH) != 0); 423 424 auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH); 425 for (auto& name : deviceNames) 426 { 427 i2c_occ::i2cToDbus(name); 428 name = std::string(OCC_NAME) + '_' + name; 429 auto path = fs::path(OCC_CONTROL_ROOT) / name; 430 statusObjects.emplace_back( 431 std::make_unique<Status>(event, path.c_str(), *this)); 432 } 433 // The first device is master occ 434 pcap = std::make_unique<open_power::occ::powercap::PowerCap>( 435 *statusObjects.front()); 436 #ifdef POWER10 437 pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH, 438 powermode::PIPS_PATH); 439 // Set the master OCC on the PowerMode object 440 pmode->setMasterOcc(path); 441 #endif 442 } 443 #endif 444 445 #ifdef PLDM 446 void Manager::sbeTimeout(unsigned int instance) 447 { 448 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 449 [instance](const auto& obj) { 450 return instance == obj->getOccInstanceID(); 451 }); 452 453 if (obj != statusObjects.end() && (*obj)->occActive()) 454 { 455 log<level::INFO>( 456 fmt::format("SBE timeout, requesting HRESET (OCC{})", instance) 457 .c_str()); 458 459 setSBEState(instance, SBE_STATE_NOT_USABLE); 460 461 pldmHandle->sendHRESET(instance); 462 } 463 } 464 465 bool Manager::updateOCCActive(instanceID instance, bool status) 466 { 467 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 468 [instance](const auto& obj) { 469 return instance == obj->getOccInstanceID(); 470 }); 471 472 if (obj != statusObjects.end()) 473 { 474 (*obj)->setPldmSensorReceived(true); 475 return (*obj)->occActive(status); 476 } 477 else 478 { 479 log<level::WARNING>( 480 fmt::format( 481 "Manager::updateOCCActive: No status object to update for OCC{} (active={})", 482 instance, status) 483 .c_str()); 484 if (status == true) 485 { 486 // OCC went active 487 queuedActiveState.insert(instance); 488 } 489 else 490 { 491 auto match = queuedActiveState.find(instance); 492 if (match != queuedActiveState.end()) 493 { 494 // OCC was disabled 495 queuedActiveState.erase(match); 496 } 497 } 498 return false; 499 } 500 } 501 502 // Called upon pldm event To set powermode Safe Mode State for system. 503 void Manager::updateOccSafeMode(bool safeMode) 504 { 505 #ifdef POWER10 506 pmode->updateDbusSafeMode(safeMode); 507 #endif 508 } 509 510 void Manager::sbeHRESETResult(instanceID instance, bool success) 511 { 512 if (success) 513 { 514 log<level::INFO>( 515 fmt::format("HRESET succeeded (OCC{})", instance).c_str()); 516 517 setSBEState(instance, SBE_STATE_BOOTED); 518 519 return; 520 } 521 522 setSBEState(instance, SBE_STATE_FAILED); 523 524 if (sbeCanDump(instance)) 525 { 526 log<level::INFO>( 527 fmt::format("HRESET failed (OCC{}), triggering SBE dump", instance) 528 .c_str()); 529 530 auto& bus = utils::getBus(); 531 uint32_t src6 = instance << 16; 532 uint32_t logId = 533 FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout", 534 src6, "SBE command timeout"); 535 536 try 537 { 538 constexpr auto path = "/org/openpower/dump"; 539 constexpr auto interface = "xyz.openbmc_project.Dump.Create"; 540 constexpr auto function = "CreateDump"; 541 542 std::string service = utils::getService(path, interface); 543 auto method = 544 bus.new_method_call(service.c_str(), path, interface, function); 545 546 std::map<std::string, std::variant<std::string, uint64_t>> 547 createParams{ 548 {"com.ibm.Dump.Create.CreateParameters.ErrorLogId", 549 uint64_t(logId)}, 550 {"com.ibm.Dump.Create.CreateParameters.DumpType", 551 "com.ibm.Dump.Create.DumpType.SBE"}, 552 {"com.ibm.Dump.Create.CreateParameters.FailingUnitId", 553 uint64_t(instance)}, 554 }; 555 556 method.append(createParams); 557 558 auto response = bus.call(method); 559 } 560 catch (const sdbusplus::exception_t& e) 561 { 562 constexpr auto ERROR_DUMP_DISABLED = 563 "xyz.openbmc_project.Dump.Create.Error.Disabled"; 564 if (e.name() == ERROR_DUMP_DISABLED) 565 { 566 log<level::INFO>("Dump is disabled, skipping"); 567 } 568 else 569 { 570 log<level::ERR>("Dump failed"); 571 } 572 } 573 } 574 } 575 576 bool Manager::sbeCanDump(unsigned int instance) 577 { 578 struct pdbg_target* proc = getPdbgTarget(instance); 579 580 if (!proc) 581 { 582 // allow the dump in the error case 583 return true; 584 } 585 586 try 587 { 588 if (!openpower::phal::sbe::isDumpAllowed(proc)) 589 { 590 return false; 591 } 592 593 if (openpower::phal::pdbg::isSbeVitalAttnActive(proc)) 594 { 595 return false; 596 } 597 } 598 catch (openpower::phal::exception::SbeError& e) 599 { 600 log<level::INFO>("Failed to query SBE state"); 601 } 602 603 // allow the dump in the error case 604 return true; 605 } 606 607 void Manager::setSBEState(unsigned int instance, enum sbe_state state) 608 { 609 struct pdbg_target* proc = getPdbgTarget(instance); 610 611 if (!proc) 612 { 613 return; 614 } 615 616 try 617 { 618 openpower::phal::sbe::setState(proc, state); 619 } 620 catch (const openpower::phal::exception::SbeError& e) 621 { 622 log<level::ERR>("Failed to set SBE state"); 623 } 624 } 625 626 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance) 627 { 628 if (!pdbgInitialized) 629 { 630 try 631 { 632 openpower::phal::pdbg::init(); 633 pdbgInitialized = true; 634 } 635 catch (const openpower::phal::exception::PdbgError& e) 636 { 637 log<level::ERR>("pdbg initialization failed"); 638 return nullptr; 639 } 640 } 641 642 struct pdbg_target* proc = nullptr; 643 pdbg_for_each_class_target("proc", proc) 644 { 645 if (pdbg_target_index(proc) == instance) 646 { 647 return proc; 648 } 649 } 650 651 log<level::ERR>("Failed to get pdbg target"); 652 return nullptr; 653 } 654 #endif 655 656 void Manager::pollerTimerExpired() 657 { 658 if (!_pollTimer) 659 { 660 log<level::ERR>( 661 "Manager::pollerTimerExpired() ERROR: Timer not defined"); 662 return; 663 } 664 665 for (auto& obj : statusObjects) 666 { 667 if (!obj->occActive()) 668 { 669 // OCC is not running yet 670 #ifdef READ_OCC_SENSORS 671 auto id = obj->getOccInstanceID(); 672 setSensorValueToNaN(id); 673 #endif 674 continue; 675 } 676 677 // Read sysfs to force kernel to poll OCC 678 obj->readOccState(); 679 680 #ifdef READ_OCC_SENSORS 681 // Read occ sensor values 682 getSensorValues(obj); 683 #endif 684 } 685 686 if (activeCount > 0) 687 { 688 // Restart OCC poll timer 689 _pollTimer->restartOnce(std::chrono::seconds(pollInterval)); 690 } 691 else 692 { 693 // No OCCs running, so poll timer will not be restarted 694 log<level::INFO>( 695 fmt::format( 696 "Manager::pollerTimerExpired: poll timer will not be restarted") 697 .c_str()); 698 } 699 } 700 701 #ifdef READ_OCC_SENSORS 702 void Manager::readTempSensors(const fs::path& path, uint32_t id) 703 { 704 std::regex expr{"temp\\d+_label$"}; // Example: temp5_label 705 for (auto& file : fs::directory_iterator(path)) 706 { 707 if (!std::regex_search(file.path().string(), expr)) 708 { 709 continue; 710 } 711 712 uint32_t labelValue{0}; 713 714 try 715 { 716 labelValue = readFile<uint32_t>(file.path()); 717 } 718 catch (const std::system_error& e) 719 { 720 log<level::DEBUG>( 721 fmt::format("readTempSensors: Failed reading {}, errno = {}", 722 file.path().string(), e.code().value()) 723 .c_str()); 724 continue; 725 } 726 727 const std::string& tempLabel = "label"; 728 const std::string filePathString = file.path().string().substr( 729 0, file.path().string().length() - tempLabel.length()); 730 731 uint32_t fruTypeValue{0}; 732 try 733 { 734 fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix); 735 } 736 catch (const std::system_error& e) 737 { 738 log<level::DEBUG>( 739 fmt::format("readTempSensors: Failed reading {}, errno = {}", 740 filePathString + fruTypeSuffix, e.code().value()) 741 .c_str()); 742 continue; 743 } 744 745 std::string sensorPath = 746 OCC_SENSORS_ROOT + std::string("/temperature/"); 747 748 std::string dvfsTempPath; 749 750 if (fruTypeValue == VRMVdd) 751 { 752 sensorPath.append("vrm_vdd" + std::to_string(id) + "_temp"); 753 } 754 else if (fruTypeValue == processorIoRing) 755 { 756 sensorPath.append("proc" + std::to_string(id) + "_ioring_temp"); 757 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" + 758 std::to_string(id) + "_ioring_dvfs_temp"; 759 } 760 else 761 { 762 uint16_t type = (labelValue & 0xFF000000) >> 24; 763 uint16_t instanceID = labelValue & 0x0000FFFF; 764 765 if (type == OCC_DIMM_TEMP_SENSOR_TYPE) 766 { 767 if (fruTypeValue == fruTypeNotAvailable) 768 { 769 // Not all DIMM related temps are available to read 770 // (no _input file in this case) 771 continue; 772 } 773 auto iter = dimmTempSensorName.find(fruTypeValue); 774 if (iter == dimmTempSensorName.end()) 775 { 776 log<level::ERR>( 777 fmt::format( 778 "readTempSensors: Fru type error! fruTypeValue = {}) ", 779 fruTypeValue) 780 .c_str()); 781 continue; 782 } 783 784 sensorPath.append("dimm" + std::to_string(instanceID) + 785 iter->second); 786 } 787 else if (type == OCC_CPU_TEMP_SENSOR_TYPE) 788 { 789 if (fruTypeValue == processorCore) 790 { 791 // The OCC reports small core temps, of which there are 792 // two per big core. All current P10 systems are in big 793 // core mode, so use a big core name. 794 uint16_t coreNum = instanceID / 2; 795 uint16_t tempNum = instanceID % 2; 796 sensorPath.append("proc" + std::to_string(id) + "_core" + 797 std::to_string(coreNum) + "_" + 798 std::to_string(tempNum) + "_temp"); 799 800 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + 801 "/temperature/proc" + std::to_string(id) + 802 "_core_dvfs_temp"; 803 } 804 else 805 { 806 continue; 807 } 808 } 809 else 810 { 811 continue; 812 } 813 } 814 815 // The dvfs temp file only needs to be read once per chip per type. 816 if (!dvfsTempPath.empty() && 817 !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath)) 818 { 819 try 820 { 821 auto dvfsValue = readFile<double>(filePathString + maxSuffix); 822 823 dbus::OccDBusSensors::getOccDBus().setDvfsTemp( 824 dvfsTempPath, dvfsValue * std::pow(10, -3)); 825 } 826 catch (const std::system_error& e) 827 { 828 log<level::DEBUG>( 829 fmt::format( 830 "readTempSensors: Failed reading {}, errno = {}", 831 filePathString + maxSuffix, e.code().value()) 832 .c_str()); 833 } 834 } 835 836 uint32_t faultValue{0}; 837 try 838 { 839 faultValue = readFile<uint32_t>(filePathString + faultSuffix); 840 } 841 catch (const std::system_error& e) 842 { 843 log<level::DEBUG>( 844 fmt::format("readTempSensors: Failed reading {}, errno = {}", 845 filePathString + faultSuffix, e.code().value()) 846 .c_str()); 847 continue; 848 } 849 850 // NOTE: if OCC sends back 0xFF kernal sets this fault value to 1. 851 if (faultValue != 0) 852 { 853 dbus::OccDBusSensors::getOccDBus().setValue( 854 sensorPath, std::numeric_limits<double>::quiet_NaN()); 855 856 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 857 false); 858 859 continue; 860 } 861 862 double tempValue{0}; 863 864 try 865 { 866 tempValue = readFile<double>(filePathString + inputSuffix); 867 } 868 catch (const std::system_error& e) 869 { 870 log<level::DEBUG>( 871 fmt::format("readTempSensors: Failed reading {}, errno = {}", 872 filePathString + inputSuffix, e.code().value()) 873 .c_str()); 874 875 // if errno == EAGAIN(Resource temporarily unavailable) then set 876 // temp to 0, to avoid using old temp, and affecting FAN Control. 877 if (e.code().value() == EAGAIN) 878 { 879 tempValue = 0; 880 } 881 // else the errno would be something like 882 // EBADF(Bad file descriptor) 883 // or ENOENT(No such file or directory) 884 else 885 { 886 continue; 887 } 888 } 889 890 dbus::OccDBusSensors::getOccDBus().setValue( 891 sensorPath, tempValue * std::pow(10, -3)); 892 893 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 894 true); 895 896 // At this point, the sensor will be created for sure. 897 if (existingSensors.find(sensorPath) == existingSensors.end()) 898 { 899 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 900 sensorPath); 901 } 902 903 existingSensors[sensorPath] = id; 904 } 905 return; 906 } 907 908 std::optional<std::string> 909 Manager::getPowerLabelFunctionID(const std::string& value) 910 { 911 // If the value is "system", then the FunctionID is "system". 912 if (value == "system") 913 { 914 return value; 915 } 916 917 // If the value is not "system", then the label value have 3 numbers, of 918 // which we only care about the middle one: 919 // <sensor id>_<function id>_<apss channel> 920 // eg: The value is "0_10_5" , then the FunctionID is "10". 921 if (value.find("_") == std::string::npos) 922 { 923 return std::nullopt; 924 } 925 926 auto powerLabelValue = value.substr((value.find("_") + 1)); 927 928 if (powerLabelValue.find("_") == std::string::npos) 929 { 930 return std::nullopt; 931 } 932 933 return powerLabelValue.substr(0, powerLabelValue.find("_")); 934 } 935 936 void Manager::readPowerSensors(const fs::path& path, uint32_t id) 937 { 938 std::regex expr{"power\\d+_label$"}; // Example: power5_label 939 for (auto& file : fs::directory_iterator(path)) 940 { 941 if (!std::regex_search(file.path().string(), expr)) 942 { 943 continue; 944 } 945 946 std::string labelValue; 947 try 948 { 949 labelValue = readFile<std::string>(file.path()); 950 } 951 catch (const std::system_error& e) 952 { 953 log<level::DEBUG>( 954 fmt::format("readPowerSensors: Failed reading {}, errno = {}", 955 file.path().string(), e.code().value()) 956 .c_str()); 957 continue; 958 } 959 960 auto functionID = getPowerLabelFunctionID(labelValue); 961 if (functionID == std::nullopt) 962 { 963 continue; 964 } 965 966 const std::string& tempLabel = "label"; 967 const std::string filePathString = file.path().string().substr( 968 0, file.path().string().length() - tempLabel.length()); 969 970 std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/"); 971 972 auto iter = powerSensorName.find(*functionID); 973 if (iter == powerSensorName.end()) 974 { 975 continue; 976 } 977 sensorPath.append(iter->second); 978 979 double tempValue{0}; 980 981 try 982 { 983 tempValue = readFile<double>(filePathString + inputSuffix); 984 } 985 catch (const std::system_error& e) 986 { 987 log<level::DEBUG>( 988 fmt::format("readPowerSensors: Failed reading {}, errno = {}", 989 filePathString + inputSuffix, e.code().value()) 990 .c_str()); 991 continue; 992 } 993 994 dbus::OccDBusSensors::getOccDBus().setUnit( 995 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts"); 996 997 dbus::OccDBusSensors::getOccDBus().setValue( 998 sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3)); 999 1000 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1001 true); 1002 1003 if (existingSensors.find(sensorPath) == existingSensors.end()) 1004 { 1005 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 1006 sensorPath); 1007 } 1008 1009 existingSensors[sensorPath] = id; 1010 } 1011 return; 1012 } 1013 1014 void Manager::setSensorValueToNaN(uint32_t id) const 1015 { 1016 for (const auto& [sensorPath, occId] : existingSensors) 1017 { 1018 if (occId == id) 1019 { 1020 dbus::OccDBusSensors::getOccDBus().setValue( 1021 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1022 1023 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1024 true); 1025 } 1026 } 1027 return; 1028 } 1029 1030 void Manager::setSensorValueToNonFunctional(uint32_t id) const 1031 { 1032 for (const auto& [sensorPath, occId] : existingSensors) 1033 { 1034 if (occId == id) 1035 { 1036 dbus::OccDBusSensors::getOccDBus().setValue( 1037 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1038 1039 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1040 false); 1041 } 1042 } 1043 return; 1044 } 1045 1046 void Manager::getSensorValues(std::unique_ptr<Status>& occ) 1047 { 1048 static bool tracedError[8] = {0}; 1049 const fs::path sensorPath = occ->getHwmonPath(); 1050 const uint32_t id = occ->getOccInstanceID(); 1051 1052 if (fs::exists(sensorPath)) 1053 { 1054 // Read temperature sensors 1055 readTempSensors(sensorPath, id); 1056 1057 if (occ->isMasterOcc()) 1058 { 1059 // Read power sensors 1060 readPowerSensors(sensorPath, id); 1061 } 1062 tracedError[id] = false; 1063 } 1064 else 1065 { 1066 if (!tracedError[id]) 1067 { 1068 log<level::ERR>( 1069 fmt::format( 1070 "Manager::getSensorValues: OCC{} sensor path missing: {}", 1071 id, sensorPath.c_str()) 1072 .c_str()); 1073 tracedError[id] = true; 1074 } 1075 } 1076 1077 return; 1078 } 1079 #endif 1080 1081 // Read the altitude from DBus 1082 void Manager::readAltitude() 1083 { 1084 static bool traceAltitudeErr = true; 1085 1086 utils::PropertyValue altitudeProperty{}; 1087 try 1088 { 1089 altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE, 1090 ALTITUDE_PROP); 1091 auto sensorVal = std::get<double>(altitudeProperty); 1092 if (sensorVal < 0xFFFF) 1093 { 1094 if (sensorVal < 0) 1095 { 1096 altitude = 0; 1097 } 1098 else 1099 { 1100 // Round to nearest meter 1101 altitude = uint16_t(sensorVal + 0.5); 1102 } 1103 log<level::DEBUG>(fmt::format("readAltitude: sensor={} ({}m)", 1104 sensorVal, altitude) 1105 .c_str()); 1106 traceAltitudeErr = true; 1107 } 1108 else 1109 { 1110 if (traceAltitudeErr) 1111 { 1112 traceAltitudeErr = false; 1113 log<level::DEBUG>( 1114 fmt::format("Invalid altitude value: {}", sensorVal) 1115 .c_str()); 1116 } 1117 } 1118 } 1119 catch (const sdbusplus::exception_t& e) 1120 { 1121 if (traceAltitudeErr) 1122 { 1123 traceAltitudeErr = false; 1124 log<level::INFO>( 1125 fmt::format("Unable to read Altitude: {}", e.what()).c_str()); 1126 } 1127 altitude = 0xFFFF; // not available 1128 } 1129 } 1130 1131 // Callback function when ambient temperature changes 1132 void Manager::ambientCallback(sdbusplus::message_t& msg) 1133 { 1134 double currentTemp = 0; 1135 uint8_t truncatedTemp = 0xFF; 1136 std::string msgSensor; 1137 std::map<std::string, std::variant<double>> msgData; 1138 msg.read(msgSensor, msgData); 1139 1140 auto valPropMap = msgData.find(AMBIENT_PROP); 1141 if (valPropMap == msgData.end()) 1142 { 1143 log<level::DEBUG>("ambientCallback: Unknown ambient property changed"); 1144 return; 1145 } 1146 currentTemp = std::get<double>(valPropMap->second); 1147 if (std::isnan(currentTemp)) 1148 { 1149 truncatedTemp = 0xFF; 1150 } 1151 else 1152 { 1153 if (currentTemp < 0) 1154 { 1155 truncatedTemp = 0; 1156 } 1157 else 1158 { 1159 // Round to nearest degree C 1160 truncatedTemp = uint8_t(currentTemp + 0.5); 1161 } 1162 } 1163 1164 // If ambient changes, notify OCCs 1165 if (truncatedTemp != ambient) 1166 { 1167 log<level::DEBUG>( 1168 fmt::format("ambientCallback: Ambient change from {} to {}C", 1169 ambient, currentTemp) 1170 .c_str()); 1171 1172 ambient = truncatedTemp; 1173 if (altitude == 0xFFFF) 1174 { 1175 // No altitude yet, try reading again 1176 readAltitude(); 1177 } 1178 1179 log<level::DEBUG>( 1180 fmt::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient, 1181 altitude) 1182 .c_str()); 1183 #ifdef POWER10 1184 // Send ambient and altitude to all OCCs 1185 for (auto& obj : statusObjects) 1186 { 1187 if (obj->occActive()) 1188 { 1189 obj->sendAmbient(ambient, altitude); 1190 } 1191 } 1192 #endif // POWER10 1193 } 1194 } 1195 1196 // return the current ambient and altitude readings 1197 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp, 1198 uint16_t& altitudeValue) const 1199 { 1200 ambientValid = true; 1201 ambientTemp = ambient; 1202 altitudeValue = altitude; 1203 1204 if (ambient == 0xFF) 1205 { 1206 ambientValid = false; 1207 } 1208 } 1209 1210 #ifdef POWER10 1211 // Called when waitForAllOccsTimer expires 1212 // After the first OCC goes active, this timer will be started (60 seconds) 1213 void Manager::occsNotAllRunning() 1214 { 1215 if (activeCount != statusObjects.size()) 1216 { 1217 // Not all OCCs went active 1218 log<level::WARNING>( 1219 fmt::format( 1220 "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})", 1221 activeCount, statusObjects.size()) 1222 .c_str()); 1223 // Procs may be garded, so may be expected 1224 } 1225 1226 validateOccMaster(); 1227 } 1228 #endif // POWER10 1229 1230 // Verify single master OCC and start presence monitor 1231 void Manager::validateOccMaster() 1232 { 1233 int masterInstance = -1; 1234 for (auto& obj : statusObjects) 1235 { 1236 auto instance = obj->getOccInstanceID(); 1237 #ifdef POWER10 1238 if (!obj->occActive()) 1239 { 1240 if (utils::isHostRunning()) 1241 { 1242 // Check if sensor was queued while waiting for discovery 1243 auto match = queuedActiveState.find(instance); 1244 if (match != queuedActiveState.end()) 1245 { 1246 queuedActiveState.erase(match); 1247 log<level::INFO>( 1248 fmt::format( 1249 "validateOccMaster: OCC{} is ACTIVE (queued)", 1250 instance) 1251 .c_str()); 1252 obj->occActive(true); 1253 } 1254 else 1255 { 1256 // OCC does not appear to be active yet, check active sensor 1257 pldmHandle->checkActiveSensor(instance); 1258 if (obj->occActive()) 1259 { 1260 log<level::INFO>( 1261 fmt::format( 1262 "validateOccMaster: OCC{} is ACTIVE after reading sensor", 1263 instance) 1264 .c_str()); 1265 } 1266 } 1267 } 1268 else 1269 { 1270 log<level::WARNING>( 1271 fmt::format( 1272 "validateOccMaster: HOST is not running (OCC{})", 1273 instance) 1274 .c_str()); 1275 return; 1276 } 1277 } 1278 #endif // POWER10 1279 1280 if (obj->isMasterOcc()) 1281 { 1282 obj->addPresenceWatchMaster(); 1283 1284 if (masterInstance == -1) 1285 { 1286 masterInstance = instance; 1287 } 1288 else 1289 { 1290 log<level::ERR>( 1291 fmt::format( 1292 "validateOccMaster: Multiple OCC masters! ({} and {})", 1293 masterInstance, instance) 1294 .c_str()); 1295 // request reset 1296 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH)); 1297 } 1298 } 1299 } 1300 1301 if (masterInstance < 0) 1302 { 1303 log<level::ERR>( 1304 fmt::format("validateOccMaster: Master OCC not found! (of {} OCCs)", 1305 statusObjects.size()) 1306 .c_str()); 1307 // request reset 1308 statusObjects.front()->deviceError( 1309 Error::Descriptor(PRESENCE_ERROR_PATH)); 1310 } 1311 else 1312 { 1313 log<level::INFO>( 1314 fmt::format("validateOccMaster: OCC{} is master of {} OCCs", 1315 masterInstance, activeCount) 1316 .c_str()); 1317 #ifdef POWER10 1318 pmode->updateDbusSafeMode(false); 1319 #endif 1320 } 1321 } 1322 1323 void Manager::updatePcapBounds() const 1324 { 1325 if (pcap) 1326 { 1327 pcap->updatePcapBounds(); 1328 } 1329 } 1330 1331 } // namespace occ 1332 } // namespace open_power 1333