1 #include "config.h" 2 3 #include "occ_manager.hpp" 4 5 #include "i2c_occ.hpp" 6 #include "occ_dbus.hpp" 7 #include "utils.hpp" 8 9 #include <phosphor-logging/elog-errors.hpp> 10 #include <phosphor-logging/log.hpp> 11 #include <xyz/openbmc_project/Common/error.hpp> 12 13 #include <chrono> 14 #include <cmath> 15 #include <filesystem> 16 #include <fstream> 17 #include <regex> 18 19 namespace open_power 20 { 21 namespace occ 22 { 23 24 constexpr uint32_t fruTypeNotAvailable = 0xFF; 25 constexpr auto fruTypeSuffix = "fru_type"; 26 constexpr auto faultSuffix = "fault"; 27 constexpr auto inputSuffix = "input"; 28 constexpr auto maxSuffix = "max"; 29 30 const auto HOST_ON_FILE = "/run/openbmc/host@0-on"; 31 32 using namespace phosphor::logging; 33 using namespace std::literals::chrono_literals; 34 35 template <typename T> 36 T readFile(const std::string& path) 37 { 38 std::ifstream ifs; 39 ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit | 40 std::ifstream::eofbit); 41 T data; 42 43 try 44 { 45 ifs.open(path); 46 ifs >> data; 47 ifs.close(); 48 } 49 catch (const std::exception& e) 50 { 51 auto err = errno; 52 throw std::system_error(err, std::generic_category()); 53 } 54 55 return data; 56 } 57 58 void Manager::findAndCreateObjects() 59 { 60 #ifndef POWER10 61 for (auto id = 0; id < MAX_CPUS; ++id) 62 { 63 // Create one occ per cpu 64 auto occ = std::string(OCC_NAME) + std::to_string(id); 65 createObjects(occ); 66 } 67 #else 68 if (!pmode) 69 { 70 // Create the power mode object 71 pmode = std::make_unique<powermode::PowerMode>( 72 *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event); 73 } 74 75 if (!fs::exists(HOST_ON_FILE)) 76 { 77 static bool statusObjCreated = false; 78 if (!statusObjCreated) 79 { 80 // Create the OCCs based on on the /dev/occX devices 81 auto occs = findOCCsInDev(); 82 83 if (occs.empty() || (prevOCCSearch.size() != occs.size())) 84 { 85 // Something changed or no OCCs yet, try again in 10s. 86 // Note on the first pass prevOCCSearch will be empty, 87 // so there will be at least one delay to give things 88 // a chance to settle. 89 prevOCCSearch = occs; 90 91 log<level::INFO>( 92 fmt::format( 93 "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})", 94 occs.size()) 95 .c_str()); 96 97 discoverTimer->restartOnce(10s); 98 } 99 else 100 { 101 // All OCCs appear to be available, create status objects 102 103 // createObjects requires OCC0 first. 104 std::sort(occs.begin(), occs.end()); 105 106 log<level::INFO>( 107 fmt::format( 108 "Manager::findAndCreateObjects(): Creating {} OCC Status Objects", 109 occs.size()) 110 .c_str()); 111 for (auto id : occs) 112 { 113 createObjects(std::string(OCC_NAME) + std::to_string(id)); 114 } 115 statusObjCreated = true; 116 waitingForAllOccActiveSensors = true; 117 } 118 } 119 120 if (statusObjCreated && waitingForAllOccActiveSensors) 121 { 122 static bool tracedHostWait = false; 123 if (utils::isHostRunning()) 124 { 125 if (tracedHostWait) 126 { 127 log<level::INFO>( 128 "Manager::findAndCreateObjects(): Host is running"); 129 tracedHostWait = false; 130 } 131 checkAllActiveSensors(); 132 } 133 else 134 { 135 if (!tracedHostWait) 136 { 137 log<level::INFO>( 138 "Manager::findAndCreateObjects(): Waiting for host to start"); 139 tracedHostWait = true; 140 } 141 discoverTimer->restartOnce(30s); 142 } 143 } 144 } 145 else 146 { 147 log<level::INFO>( 148 fmt::format( 149 "Manager::findAndCreateObjects(): Waiting for {} to complete...", 150 HOST_ON_FILE) 151 .c_str()); 152 discoverTimer->restartOnce(10s); 153 } 154 #endif 155 } 156 157 #ifdef POWER10 158 // Check if all occActive sensors are available 159 void Manager::checkAllActiveSensors() 160 { 161 static bool allActiveSensorAvailable = false; 162 static bool tracedSensorWait = false; 163 static bool waitingForHost = false; 164 165 if (open_power::occ::utils::isHostRunning()) 166 { 167 if (waitingForHost) 168 { 169 waitingForHost = false; 170 log<level::INFO>("checkAllActiveSensors(): Host is now running"); 171 } 172 173 // Start with the assumption that all are available 174 allActiveSensorAvailable = true; 175 for (auto& obj : statusObjects) 176 { 177 if ((!obj->occActive()) && (!obj->getPldmSensorReceived())) 178 { 179 auto instance = obj->getOccInstanceID(); 180 // Check if sensor was queued while waiting for discovery 181 auto match = queuedActiveState.find(instance); 182 if (match != queuedActiveState.end()) 183 { 184 queuedActiveState.erase(match); 185 log<level::INFO>( 186 fmt::format( 187 "checkAllActiveSensors(): OCC{} is ACTIVE (queued)", 188 instance) 189 .c_str()); 190 obj->occActive(true); 191 } 192 else 193 { 194 allActiveSensorAvailable = false; 195 if (!tracedSensorWait) 196 { 197 log<level::INFO>( 198 fmt::format( 199 "checkAllActiveSensors(): Waiting on OCC{} Active sensor", 200 instance) 201 .c_str()); 202 tracedSensorWait = true; 203 } 204 pldmHandle->checkActiveSensor(obj->getOccInstanceID()); 205 break; 206 } 207 } 208 } 209 } 210 else 211 { 212 if (!waitingForHost) 213 { 214 waitingForHost = true; 215 log<level::INFO>( 216 "checkAllActiveSensors(): Waiting for host to start"); 217 } 218 } 219 220 if (allActiveSensorAvailable) 221 { 222 // All sensors were found, disable the discovery timer 223 if (discoverTimer->isEnabled()) 224 { 225 discoverTimer->setEnabled(false); 226 } 227 228 if (waitingForAllOccActiveSensors) 229 { 230 log<level::INFO>( 231 "checkAllActiveSensors(): OCC Active sensors are available"); 232 waitingForAllOccActiveSensors = false; 233 } 234 queuedActiveState.clear(); 235 tracedSensorWait = false; 236 } 237 else 238 { 239 // Not all sensors were available, so keep waiting 240 if (!tracedSensorWait) 241 { 242 log<level::INFO>( 243 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available"); 244 tracedSensorWait = true; 245 } 246 discoverTimer->restartOnce(10s); 247 } 248 } 249 #endif 250 251 std::vector<int> Manager::findOCCsInDev() 252 { 253 std::vector<int> occs; 254 std::regex expr{R"(occ(\d+)$)"}; 255 256 for (auto& file : fs::directory_iterator("/dev")) 257 { 258 std::smatch match; 259 std::string path{file.path().string()}; 260 if (std::regex_search(path, match, expr)) 261 { 262 auto num = std::stoi(match[1].str()); 263 264 // /dev numbering starts at 1, ours starts at 0. 265 occs.push_back(num - 1); 266 } 267 } 268 269 return occs; 270 } 271 272 int Manager::cpuCreated(sdbusplus::message_t& msg) 273 { 274 namespace fs = std::filesystem; 275 276 sdbusplus::message::object_path o; 277 msg.read(o); 278 fs::path cpuPath(std::string(std::move(o))); 279 280 auto name = cpuPath.filename().string(); 281 auto index = name.find(CPU_NAME); 282 name.replace(index, std::strlen(CPU_NAME), OCC_NAME); 283 284 createObjects(name); 285 286 return 0; 287 } 288 289 void Manager::createObjects(const std::string& occ) 290 { 291 auto path = fs::path(OCC_CONTROL_ROOT) / occ; 292 293 statusObjects.emplace_back(std::make_unique<Status>( 294 event, path.c_str(), *this, 295 #ifdef POWER10 296 pmode, 297 #endif 298 std::bind(std::mem_fn(&Manager::statusCallBack), this, 299 std::placeholders::_1, std::placeholders::_2) 300 #ifdef PLDM 301 , 302 std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(), 303 std::placeholders::_1) 304 #endif 305 )); 306 307 // Create the power cap monitor object 308 if (!pcap) 309 { 310 pcap = std::make_unique<open_power::occ::powercap::PowerCap>( 311 *statusObjects.back()); 312 } 313 314 if (statusObjects.back()->isMasterOcc()) 315 { 316 log<level::INFO>( 317 fmt::format("Manager::createObjects(): OCC{} is the master", 318 statusObjects.back()->getOccInstanceID()) 319 .c_str()); 320 _pollTimer->setEnabled(false); 321 322 #ifdef POWER10 323 // Set the master OCC on the PowerMode object 324 pmode->setMasterOcc(path); 325 #endif 326 } 327 328 passThroughObjects.emplace_back(std::make_unique<PassThrough>(path.c_str() 329 #ifdef POWER10 330 , 331 pmode 332 #endif 333 )); 334 } 335 336 void Manager::statusCallBack(instanceID instance, bool status) 337 { 338 if (status == true) 339 { 340 // OCC went active 341 ++activeCount; 342 343 #ifdef POWER10 344 if (activeCount == 1) 345 { 346 // First OCC went active (allow some time for all OCCs to go active) 347 waitForAllOccsTimer->restartOnce(60s); 348 } 349 #endif 350 351 if (activeCount == statusObjects.size()) 352 { 353 #ifdef POWER10 354 // All OCCs are now running 355 if (waitForAllOccsTimer->isEnabled()) 356 { 357 // stop occ wait timer 358 waitForAllOccsTimer->setEnabled(false); 359 } 360 #endif 361 362 // Verify master OCC and start presence monitor 363 validateOccMaster(); 364 } 365 366 // Start poll timer if not already started 367 if (!_pollTimer->isEnabled()) 368 { 369 log<level::INFO>( 370 fmt::format("Manager: OCCs will be polled every {} seconds", 371 pollInterval) 372 .c_str()); 373 374 // Send poll and start OCC poll timer 375 pollerTimerExpired(); 376 } 377 } 378 else 379 { 380 // OCC went away 381 if (activeCount > 0) 382 { 383 --activeCount; 384 } 385 else 386 { 387 log<level::ERR>( 388 fmt::format("OCC{} disabled, but currently no active OCCs", 389 instance) 390 .c_str()); 391 } 392 393 if (activeCount == 0) 394 { 395 // No OCCs are running 396 397 // Stop OCC poll timer 398 if (_pollTimer->isEnabled()) 399 { 400 log<level::INFO>( 401 "Manager::statusCallBack(): OCCs are not running, stopping poll timer"); 402 _pollTimer->setEnabled(false); 403 } 404 405 #ifdef POWER10 406 // stop wait timer 407 if (waitForAllOccsTimer->isEnabled()) 408 { 409 waitForAllOccsTimer->setEnabled(false); 410 } 411 #endif 412 } 413 #ifdef READ_OCC_SENSORS 414 // Clear OCC sensors 415 setSensorValueToNaN(instance); 416 #endif 417 } 418 419 #ifdef POWER10 420 if (waitingForAllOccActiveSensors) 421 { 422 if (utils::isHostRunning()) 423 { 424 checkAllActiveSensors(); 425 } 426 } 427 #endif 428 } 429 430 #ifdef I2C_OCC 431 void Manager::initStatusObjects() 432 { 433 // Make sure we have a valid path string 434 static_assert(sizeof(DEV_PATH) != 0); 435 436 auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH); 437 for (auto& name : deviceNames) 438 { 439 i2c_occ::i2cToDbus(name); 440 name = std::string(OCC_NAME) + '_' + name; 441 auto path = fs::path(OCC_CONTROL_ROOT) / name; 442 statusObjects.emplace_back( 443 std::make_unique<Status>(event, path.c_str(), *this)); 444 } 445 // The first device is master occ 446 pcap = std::make_unique<open_power::occ::powercap::PowerCap>( 447 *statusObjects.front()); 448 #ifdef POWER10 449 pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH, 450 powermode::PIPS_PATH); 451 // Set the master OCC on the PowerMode object 452 pmode->setMasterOcc(path); 453 #endif 454 } 455 #endif 456 457 #ifdef PLDM 458 void Manager::sbeTimeout(unsigned int instance) 459 { 460 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 461 [instance](const auto& obj) { 462 return instance == obj->getOccInstanceID(); 463 }); 464 465 if (obj != statusObjects.end() && (*obj)->occActive()) 466 { 467 log<level::INFO>( 468 fmt::format("SBE timeout, requesting HRESET (OCC{})", instance) 469 .c_str()); 470 471 setSBEState(instance, SBE_STATE_NOT_USABLE); 472 473 pldmHandle->sendHRESET(instance); 474 } 475 } 476 477 bool Manager::updateOCCActive(instanceID instance, bool status) 478 { 479 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 480 [instance](const auto& obj) { 481 return instance == obj->getOccInstanceID(); 482 }); 483 484 const bool hostRunning = open_power::occ::utils::isHostRunning(); 485 if (obj != statusObjects.end()) 486 { 487 if (!hostRunning && (status == true)) 488 { 489 log<level::WARNING>( 490 fmt::format( 491 "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received", 492 instance, status) 493 .c_str()); 494 (*obj)->setPldmSensorReceived(false); 495 if (!waitingForAllOccActiveSensors) 496 { 497 log<level::INFO>( 498 "updateOCCActive: Waiting for Host and all OCC Active Sensors"); 499 waitingForAllOccActiveSensors = true; 500 } 501 discoverTimer->restartOnce(30s); 502 return false; 503 } 504 else 505 { 506 log<level::INFO>(fmt::format("updateOCCActive: OCC{} active={}", 507 instance, status) 508 .c_str()); 509 (*obj)->setPldmSensorReceived(true); 510 return (*obj)->occActive(status); 511 } 512 } 513 else 514 { 515 if (hostRunning) 516 { 517 log<level::WARNING>( 518 fmt::format( 519 "updateOCCActive: No status object to update for OCC{} (active={})", 520 instance, status) 521 .c_str()); 522 } 523 else 524 { 525 if (status == true) 526 { 527 log<level::WARNING>( 528 fmt::format( 529 "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})", 530 instance, status) 531 .c_str()); 532 } 533 } 534 if (status == true) 535 { 536 // OCC went active 537 queuedActiveState.insert(instance); 538 } 539 else 540 { 541 auto match = queuedActiveState.find(instance); 542 if (match != queuedActiveState.end()) 543 { 544 // OCC was disabled 545 queuedActiveState.erase(match); 546 } 547 } 548 return false; 549 } 550 } 551 552 // Called upon pldm event To set powermode Safe Mode State for system. 553 void Manager::updateOccSafeMode(bool safeMode) 554 { 555 #ifdef POWER10 556 pmode->updateDbusSafeMode(safeMode); 557 #endif 558 } 559 560 void Manager::sbeHRESETResult(instanceID instance, bool success) 561 { 562 if (success) 563 { 564 log<level::INFO>( 565 fmt::format("HRESET succeeded (OCC{})", instance).c_str()); 566 567 setSBEState(instance, SBE_STATE_BOOTED); 568 569 return; 570 } 571 572 setSBEState(instance, SBE_STATE_FAILED); 573 574 if (sbeCanDump(instance)) 575 { 576 log<level::INFO>( 577 fmt::format("HRESET failed (OCC{}), triggering SBE dump", instance) 578 .c_str()); 579 580 auto& bus = utils::getBus(); 581 uint32_t src6 = instance << 16; 582 uint32_t logId = 583 FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout", 584 src6, "SBE command timeout"); 585 586 try 587 { 588 constexpr auto path = "/org/openpower/dump"; 589 constexpr auto interface = "xyz.openbmc_project.Dump.Create"; 590 constexpr auto function = "CreateDump"; 591 592 std::string service = utils::getService(path, interface); 593 auto method = bus.new_method_call(service.c_str(), path, interface, 594 function); 595 596 std::map<std::string, std::variant<std::string, uint64_t>> 597 createParams{ 598 {"com.ibm.Dump.Create.CreateParameters.ErrorLogId", 599 uint64_t(logId)}, 600 {"com.ibm.Dump.Create.CreateParameters.DumpType", 601 "com.ibm.Dump.Create.DumpType.SBE"}, 602 {"com.ibm.Dump.Create.CreateParameters.FailingUnitId", 603 uint64_t(instance)}, 604 }; 605 606 method.append(createParams); 607 608 auto response = bus.call(method); 609 } 610 catch (const sdbusplus::exception_t& e) 611 { 612 constexpr auto ERROR_DUMP_DISABLED = 613 "xyz.openbmc_project.Dump.Create.Error.Disabled"; 614 if (e.name() == ERROR_DUMP_DISABLED) 615 { 616 log<level::INFO>("Dump is disabled, skipping"); 617 } 618 else 619 { 620 log<level::ERR>("Dump failed"); 621 } 622 } 623 } 624 } 625 626 bool Manager::sbeCanDump(unsigned int instance) 627 { 628 struct pdbg_target* proc = getPdbgTarget(instance); 629 630 if (!proc) 631 { 632 // allow the dump in the error case 633 return true; 634 } 635 636 try 637 { 638 if (!openpower::phal::sbe::isDumpAllowed(proc)) 639 { 640 return false; 641 } 642 643 if (openpower::phal::pdbg::isSbeVitalAttnActive(proc)) 644 { 645 return false; 646 } 647 } 648 catch (openpower::phal::exception::SbeError& e) 649 { 650 log<level::INFO>("Failed to query SBE state"); 651 } 652 653 // allow the dump in the error case 654 return true; 655 } 656 657 void Manager::setSBEState(unsigned int instance, enum sbe_state state) 658 { 659 struct pdbg_target* proc = getPdbgTarget(instance); 660 661 if (!proc) 662 { 663 return; 664 } 665 666 try 667 { 668 openpower::phal::sbe::setState(proc, state); 669 } 670 catch (const openpower::phal::exception::SbeError& e) 671 { 672 log<level::ERR>("Failed to set SBE state"); 673 } 674 } 675 676 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance) 677 { 678 if (!pdbgInitialized) 679 { 680 try 681 { 682 openpower::phal::pdbg::init(); 683 pdbgInitialized = true; 684 } 685 catch (const openpower::phal::exception::PdbgError& e) 686 { 687 log<level::ERR>("pdbg initialization failed"); 688 return nullptr; 689 } 690 } 691 692 struct pdbg_target* proc = nullptr; 693 pdbg_for_each_class_target("proc", proc) 694 { 695 if (pdbg_target_index(proc) == instance) 696 { 697 return proc; 698 } 699 } 700 701 log<level::ERR>("Failed to get pdbg target"); 702 return nullptr; 703 } 704 #endif 705 706 void Manager::pollerTimerExpired() 707 { 708 if (!_pollTimer) 709 { 710 log<level::ERR>( 711 "Manager::pollerTimerExpired() ERROR: Timer not defined"); 712 return; 713 } 714 715 for (auto& obj : statusObjects) 716 { 717 if (!obj->occActive()) 718 { 719 // OCC is not running yet 720 #ifdef READ_OCC_SENSORS 721 auto id = obj->getOccInstanceID(); 722 setSensorValueToNaN(id); 723 #endif 724 continue; 725 } 726 727 // Read sysfs to force kernel to poll OCC 728 obj->readOccState(); 729 730 #ifdef READ_OCC_SENSORS 731 // Read occ sensor values 732 getSensorValues(obj); 733 #endif 734 } 735 736 if (activeCount > 0) 737 { 738 // Restart OCC poll timer 739 _pollTimer->restartOnce(std::chrono::seconds(pollInterval)); 740 } 741 else 742 { 743 // No OCCs running, so poll timer will not be restarted 744 log<level::INFO>( 745 fmt::format( 746 "Manager::pollerTimerExpired: poll timer will not be restarted") 747 .c_str()); 748 } 749 } 750 751 #ifdef READ_OCC_SENSORS 752 void Manager::readTempSensors(const fs::path& path, uint32_t id) 753 { 754 std::regex expr{"temp\\d+_label$"}; // Example: temp5_label 755 for (auto& file : fs::directory_iterator(path)) 756 { 757 if (!std::regex_search(file.path().string(), expr)) 758 { 759 continue; 760 } 761 762 uint32_t labelValue{0}; 763 764 try 765 { 766 labelValue = readFile<uint32_t>(file.path()); 767 } 768 catch (const std::system_error& e) 769 { 770 log<level::DEBUG>( 771 fmt::format("readTempSensors: Failed reading {}, errno = {}", 772 file.path().string(), e.code().value()) 773 .c_str()); 774 continue; 775 } 776 777 const std::string& tempLabel = "label"; 778 const std::string filePathString = file.path().string().substr( 779 0, file.path().string().length() - tempLabel.length()); 780 781 uint32_t fruTypeValue{0}; 782 try 783 { 784 fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix); 785 } 786 catch (const std::system_error& e) 787 { 788 log<level::DEBUG>( 789 fmt::format("readTempSensors: Failed reading {}, errno = {}", 790 filePathString + fruTypeSuffix, e.code().value()) 791 .c_str()); 792 continue; 793 } 794 795 std::string sensorPath = OCC_SENSORS_ROOT + 796 std::string("/temperature/"); 797 798 std::string dvfsTempPath; 799 800 if (fruTypeValue == VRMVdd) 801 { 802 sensorPath.append("vrm_vdd" + std::to_string(id) + "_temp"); 803 } 804 else if (fruTypeValue == processorIoRing) 805 { 806 sensorPath.append("proc" + std::to_string(id) + "_ioring_temp"); 807 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" + 808 std::to_string(id) + "_ioring_dvfs_temp"; 809 } 810 else 811 { 812 uint16_t type = (labelValue & 0xFF000000) >> 24; 813 uint16_t instanceID = labelValue & 0x0000FFFF; 814 815 if (type == OCC_DIMM_TEMP_SENSOR_TYPE) 816 { 817 if (fruTypeValue == fruTypeNotAvailable) 818 { 819 // Not all DIMM related temps are available to read 820 // (no _input file in this case) 821 continue; 822 } 823 auto iter = dimmTempSensorName.find(fruTypeValue); 824 if (iter == dimmTempSensorName.end()) 825 { 826 log<level::ERR>( 827 fmt::format( 828 "readTempSensors: Fru type error! fruTypeValue = {}) ", 829 fruTypeValue) 830 .c_str()); 831 continue; 832 } 833 834 sensorPath.append("dimm" + std::to_string(instanceID) + 835 iter->second); 836 } 837 else if (type == OCC_CPU_TEMP_SENSOR_TYPE) 838 { 839 if (fruTypeValue == processorCore) 840 { 841 // The OCC reports small core temps, of which there are 842 // two per big core. All current P10 systems are in big 843 // core mode, so use a big core name. 844 uint16_t coreNum = instanceID / 2; 845 uint16_t tempNum = instanceID % 2; 846 sensorPath.append("proc" + std::to_string(id) + "_core" + 847 std::to_string(coreNum) + "_" + 848 std::to_string(tempNum) + "_temp"); 849 850 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + 851 "/temperature/proc" + std::to_string(id) + 852 "_core_dvfs_temp"; 853 } 854 else 855 { 856 continue; 857 } 858 } 859 else 860 { 861 continue; 862 } 863 } 864 865 // The dvfs temp file only needs to be read once per chip per type. 866 if (!dvfsTempPath.empty() && 867 !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath)) 868 { 869 try 870 { 871 auto dvfsValue = readFile<double>(filePathString + maxSuffix); 872 873 dbus::OccDBusSensors::getOccDBus().setDvfsTemp( 874 dvfsTempPath, dvfsValue * std::pow(10, -3)); 875 } 876 catch (const std::system_error& e) 877 { 878 log<level::DEBUG>( 879 fmt::format( 880 "readTempSensors: Failed reading {}, errno = {}", 881 filePathString + maxSuffix, e.code().value()) 882 .c_str()); 883 } 884 } 885 886 uint32_t faultValue{0}; 887 try 888 { 889 faultValue = readFile<uint32_t>(filePathString + faultSuffix); 890 } 891 catch (const std::system_error& e) 892 { 893 log<level::DEBUG>( 894 fmt::format("readTempSensors: Failed reading {}, errno = {}", 895 filePathString + faultSuffix, e.code().value()) 896 .c_str()); 897 continue; 898 } 899 900 // NOTE: if OCC sends back 0xFF kernal sets this fault value to 1. 901 if (faultValue != 0) 902 { 903 dbus::OccDBusSensors::getOccDBus().setValue( 904 sensorPath, std::numeric_limits<double>::quiet_NaN()); 905 906 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 907 false); 908 909 continue; 910 } 911 912 double tempValue{0}; 913 914 try 915 { 916 tempValue = readFile<double>(filePathString + inputSuffix); 917 } 918 catch (const std::system_error& e) 919 { 920 log<level::DEBUG>( 921 fmt::format("readTempSensors: Failed reading {}, errno = {}", 922 filePathString + inputSuffix, e.code().value()) 923 .c_str()); 924 925 // if errno == EAGAIN(Resource temporarily unavailable) then set 926 // temp to 0, to avoid using old temp, and affecting FAN Control. 927 if (e.code().value() == EAGAIN) 928 { 929 tempValue = 0; 930 } 931 // else the errno would be something like 932 // EBADF(Bad file descriptor) 933 // or ENOENT(No such file or directory) 934 else 935 { 936 continue; 937 } 938 } 939 940 dbus::OccDBusSensors::getOccDBus().setValue( 941 sensorPath, tempValue * std::pow(10, -3)); 942 943 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 944 true); 945 946 // At this point, the sensor will be created for sure. 947 if (existingSensors.find(sensorPath) == existingSensors.end()) 948 { 949 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 950 sensorPath); 951 } 952 953 existingSensors[sensorPath] = id; 954 } 955 return; 956 } 957 958 std::optional<std::string> 959 Manager::getPowerLabelFunctionID(const std::string& value) 960 { 961 // If the value is "system", then the FunctionID is "system". 962 if (value == "system") 963 { 964 return value; 965 } 966 967 // If the value is not "system", then the label value have 3 numbers, of 968 // which we only care about the middle one: 969 // <sensor id>_<function id>_<apss channel> 970 // eg: The value is "0_10_5" , then the FunctionID is "10". 971 if (value.find("_") == std::string::npos) 972 { 973 return std::nullopt; 974 } 975 976 auto powerLabelValue = value.substr((value.find("_") + 1)); 977 978 if (powerLabelValue.find("_") == std::string::npos) 979 { 980 return std::nullopt; 981 } 982 983 return powerLabelValue.substr(0, powerLabelValue.find("_")); 984 } 985 986 void Manager::readPowerSensors(const fs::path& path, uint32_t id) 987 { 988 std::regex expr{"power\\d+_label$"}; // Example: power5_label 989 for (auto& file : fs::directory_iterator(path)) 990 { 991 if (!std::regex_search(file.path().string(), expr)) 992 { 993 continue; 994 } 995 996 std::string labelValue; 997 try 998 { 999 labelValue = readFile<std::string>(file.path()); 1000 } 1001 catch (const std::system_error& e) 1002 { 1003 log<level::DEBUG>( 1004 fmt::format("readPowerSensors: Failed reading {}, errno = {}", 1005 file.path().string(), e.code().value()) 1006 .c_str()); 1007 continue; 1008 } 1009 1010 auto functionID = getPowerLabelFunctionID(labelValue); 1011 if (functionID == std::nullopt) 1012 { 1013 continue; 1014 } 1015 1016 const std::string& tempLabel = "label"; 1017 const std::string filePathString = file.path().string().substr( 1018 0, file.path().string().length() - tempLabel.length()); 1019 1020 std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/"); 1021 1022 auto iter = powerSensorName.find(*functionID); 1023 if (iter == powerSensorName.end()) 1024 { 1025 continue; 1026 } 1027 sensorPath.append(iter->second); 1028 1029 double tempValue{0}; 1030 1031 try 1032 { 1033 tempValue = readFile<double>(filePathString + inputSuffix); 1034 } 1035 catch (const std::system_error& e) 1036 { 1037 log<level::DEBUG>( 1038 fmt::format("readPowerSensors: Failed reading {}, errno = {}", 1039 filePathString + inputSuffix, e.code().value()) 1040 .c_str()); 1041 continue; 1042 } 1043 1044 dbus::OccDBusSensors::getOccDBus().setUnit( 1045 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts"); 1046 1047 dbus::OccDBusSensors::getOccDBus().setValue( 1048 sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3)); 1049 1050 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1051 true); 1052 1053 if (existingSensors.find(sensorPath) == existingSensors.end()) 1054 { 1055 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 1056 sensorPath); 1057 } 1058 1059 existingSensors[sensorPath] = id; 1060 } 1061 return; 1062 } 1063 1064 void Manager::setSensorValueToNaN(uint32_t id) const 1065 { 1066 for (const auto& [sensorPath, occId] : existingSensors) 1067 { 1068 if (occId == id) 1069 { 1070 dbus::OccDBusSensors::getOccDBus().setValue( 1071 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1072 1073 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1074 true); 1075 } 1076 } 1077 return; 1078 } 1079 1080 void Manager::setSensorValueToNonFunctional(uint32_t id) const 1081 { 1082 for (const auto& [sensorPath, occId] : existingSensors) 1083 { 1084 if (occId == id) 1085 { 1086 dbus::OccDBusSensors::getOccDBus().setValue( 1087 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1088 1089 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1090 false); 1091 } 1092 } 1093 return; 1094 } 1095 1096 void Manager::getSensorValues(std::unique_ptr<Status>& occ) 1097 { 1098 static bool tracedError[8] = {0}; 1099 const fs::path sensorPath = occ->getHwmonPath(); 1100 const uint32_t id = occ->getOccInstanceID(); 1101 1102 if (fs::exists(sensorPath)) 1103 { 1104 // Read temperature sensors 1105 readTempSensors(sensorPath, id); 1106 1107 if (occ->isMasterOcc()) 1108 { 1109 // Read power sensors 1110 readPowerSensors(sensorPath, id); 1111 } 1112 tracedError[id] = false; 1113 } 1114 else 1115 { 1116 if (!tracedError[id]) 1117 { 1118 log<level::ERR>( 1119 fmt::format( 1120 "Manager::getSensorValues: OCC{} sensor path missing: {}", 1121 id, sensorPath.c_str()) 1122 .c_str()); 1123 tracedError[id] = true; 1124 } 1125 } 1126 1127 return; 1128 } 1129 #endif 1130 1131 // Read the altitude from DBus 1132 void Manager::readAltitude() 1133 { 1134 static bool traceAltitudeErr = true; 1135 1136 utils::PropertyValue altitudeProperty{}; 1137 try 1138 { 1139 altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE, 1140 ALTITUDE_PROP); 1141 auto sensorVal = std::get<double>(altitudeProperty); 1142 if (sensorVal < 0xFFFF) 1143 { 1144 if (sensorVal < 0) 1145 { 1146 altitude = 0; 1147 } 1148 else 1149 { 1150 // Round to nearest meter 1151 altitude = uint16_t(sensorVal + 0.5); 1152 } 1153 log<level::DEBUG>(fmt::format("readAltitude: sensor={} ({}m)", 1154 sensorVal, altitude) 1155 .c_str()); 1156 traceAltitudeErr = true; 1157 } 1158 else 1159 { 1160 if (traceAltitudeErr) 1161 { 1162 traceAltitudeErr = false; 1163 log<level::DEBUG>( 1164 fmt::format("Invalid altitude value: {}", sensorVal) 1165 .c_str()); 1166 } 1167 } 1168 } 1169 catch (const sdbusplus::exception_t& e) 1170 { 1171 if (traceAltitudeErr) 1172 { 1173 traceAltitudeErr = false; 1174 log<level::INFO>( 1175 fmt::format("Unable to read Altitude: {}", e.what()).c_str()); 1176 } 1177 altitude = 0xFFFF; // not available 1178 } 1179 } 1180 1181 // Callback function when ambient temperature changes 1182 void Manager::ambientCallback(sdbusplus::message_t& msg) 1183 { 1184 double currentTemp = 0; 1185 uint8_t truncatedTemp = 0xFF; 1186 std::string msgSensor; 1187 std::map<std::string, std::variant<double>> msgData; 1188 msg.read(msgSensor, msgData); 1189 1190 auto valPropMap = msgData.find(AMBIENT_PROP); 1191 if (valPropMap == msgData.end()) 1192 { 1193 log<level::DEBUG>("ambientCallback: Unknown ambient property changed"); 1194 return; 1195 } 1196 currentTemp = std::get<double>(valPropMap->second); 1197 if (std::isnan(currentTemp)) 1198 { 1199 truncatedTemp = 0xFF; 1200 } 1201 else 1202 { 1203 if (currentTemp < 0) 1204 { 1205 truncatedTemp = 0; 1206 } 1207 else 1208 { 1209 // Round to nearest degree C 1210 truncatedTemp = uint8_t(currentTemp + 0.5); 1211 } 1212 } 1213 1214 // If ambient changes, notify OCCs 1215 if (truncatedTemp != ambient) 1216 { 1217 log<level::DEBUG>( 1218 fmt::format("ambientCallback: Ambient change from {} to {}C", 1219 ambient, currentTemp) 1220 .c_str()); 1221 1222 ambient = truncatedTemp; 1223 if (altitude == 0xFFFF) 1224 { 1225 // No altitude yet, try reading again 1226 readAltitude(); 1227 } 1228 1229 log<level::DEBUG>( 1230 fmt::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient, 1231 altitude) 1232 .c_str()); 1233 #ifdef POWER10 1234 // Send ambient and altitude to all OCCs 1235 for (auto& obj : statusObjects) 1236 { 1237 if (obj->occActive()) 1238 { 1239 obj->sendAmbient(ambient, altitude); 1240 } 1241 } 1242 #endif // POWER10 1243 } 1244 } 1245 1246 // return the current ambient and altitude readings 1247 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp, 1248 uint16_t& altitudeValue) const 1249 { 1250 ambientValid = true; 1251 ambientTemp = ambient; 1252 altitudeValue = altitude; 1253 1254 if (ambient == 0xFF) 1255 { 1256 ambientValid = false; 1257 } 1258 } 1259 1260 #ifdef POWER10 1261 // Called when waitForAllOccsTimer expires 1262 // After the first OCC goes active, this timer will be started (60 seconds) 1263 void Manager::occsNotAllRunning() 1264 { 1265 if (activeCount != statusObjects.size()) 1266 { 1267 // Not all OCCs went active 1268 log<level::WARNING>( 1269 fmt::format( 1270 "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})", 1271 activeCount, statusObjects.size()) 1272 .c_str()); 1273 // Procs may be garded, so may be expected 1274 } 1275 1276 validateOccMaster(); 1277 } 1278 #endif // POWER10 1279 1280 // Verify single master OCC and start presence monitor 1281 void Manager::validateOccMaster() 1282 { 1283 int masterInstance = -1; 1284 for (auto& obj : statusObjects) 1285 { 1286 auto instance = obj->getOccInstanceID(); 1287 #ifdef POWER10 1288 if (!obj->occActive()) 1289 { 1290 if (utils::isHostRunning()) 1291 { 1292 // Check if sensor was queued while waiting for discovery 1293 auto match = queuedActiveState.find(instance); 1294 if (match != queuedActiveState.end()) 1295 { 1296 queuedActiveState.erase(match); 1297 log<level::INFO>( 1298 fmt::format( 1299 "validateOccMaster: OCC{} is ACTIVE (queued)", 1300 instance) 1301 .c_str()); 1302 obj->occActive(true); 1303 } 1304 else 1305 { 1306 // OCC does not appear to be active yet, check active sensor 1307 pldmHandle->checkActiveSensor(instance); 1308 if (obj->occActive()) 1309 { 1310 log<level::INFO>( 1311 fmt::format( 1312 "validateOccMaster: OCC{} is ACTIVE after reading sensor", 1313 instance) 1314 .c_str()); 1315 } 1316 } 1317 } 1318 else 1319 { 1320 log<level::WARNING>( 1321 fmt::format( 1322 "validateOccMaster: HOST is not running (OCC{})", 1323 instance) 1324 .c_str()); 1325 return; 1326 } 1327 } 1328 #endif // POWER10 1329 1330 if (obj->isMasterOcc()) 1331 { 1332 obj->addPresenceWatchMaster(); 1333 1334 if (masterInstance == -1) 1335 { 1336 masterInstance = instance; 1337 } 1338 else 1339 { 1340 log<level::ERR>( 1341 fmt::format( 1342 "validateOccMaster: Multiple OCC masters! ({} and {})", 1343 masterInstance, instance) 1344 .c_str()); 1345 // request reset 1346 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH)); 1347 } 1348 } 1349 } 1350 1351 if (masterInstance < 0) 1352 { 1353 log<level::ERR>( 1354 fmt::format("validateOccMaster: Master OCC not found! (of {} OCCs)", 1355 statusObjects.size()) 1356 .c_str()); 1357 // request reset 1358 statusObjects.front()->deviceError( 1359 Error::Descriptor(PRESENCE_ERROR_PATH)); 1360 } 1361 else 1362 { 1363 log<level::INFO>( 1364 fmt::format("validateOccMaster: OCC{} is master of {} OCCs", 1365 masterInstance, activeCount) 1366 .c_str()); 1367 #ifdef POWER10 1368 pmode->updateDbusSafeMode(false); 1369 #endif 1370 } 1371 } 1372 1373 void Manager::updatePcapBounds() const 1374 { 1375 if (pcap) 1376 { 1377 pcap->updatePcapBounds(); 1378 } 1379 } 1380 1381 } // namespace occ 1382 } // namespace open_power 1383