1 #include "config.h" 2 3 #include "occ_manager.hpp" 4 5 #include "i2c_occ.hpp" 6 #include "occ_dbus.hpp" 7 #include "utils.hpp" 8 9 #include <phosphor-logging/elog-errors.hpp> 10 #include <phosphor-logging/log.hpp> 11 #include <xyz/openbmc_project/Common/error.hpp> 12 13 #include <chrono> 14 #include <cmath> 15 #include <filesystem> 16 #include <fstream> 17 #include <regex> 18 19 namespace open_power 20 { 21 namespace occ 22 { 23 24 constexpr uint32_t fruTypeNotAvailable = 0xFF; 25 constexpr auto fruTypeSuffix = "fru_type"; 26 constexpr auto faultSuffix = "fault"; 27 constexpr auto inputSuffix = "input"; 28 constexpr auto maxSuffix = "max"; 29 30 const auto HOST_ON_FILE = "/run/openbmc/host@0-on"; 31 32 using namespace phosphor::logging; 33 using namespace std::literals::chrono_literals; 34 35 template <typename T> 36 T readFile(const std::string& path) 37 { 38 std::ifstream ifs; 39 ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit | 40 std::ifstream::eofbit); 41 T data; 42 43 try 44 { 45 ifs.open(path); 46 ifs >> data; 47 ifs.close(); 48 } 49 catch (const std::exception& e) 50 { 51 auto err = errno; 52 throw std::system_error(err, std::generic_category()); 53 } 54 55 return data; 56 } 57 58 void Manager::findAndCreateObjects() 59 { 60 #ifndef POWER10 61 for (auto id = 0; id < MAX_CPUS; ++id) 62 { 63 // Create one occ per cpu 64 auto occ = std::string(OCC_NAME) + std::to_string(id); 65 createObjects(occ); 66 } 67 #else 68 if (!pmode) 69 { 70 // Create the power mode object 71 pmode = std::make_unique<powermode::PowerMode>( 72 *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event); 73 } 74 75 if (!fs::exists(HOST_ON_FILE)) 76 { 77 static bool statusObjCreated = false; 78 if (!statusObjCreated) 79 { 80 // Create the OCCs based on on the /dev/occX devices 81 auto occs = findOCCsInDev(); 82 83 if (occs.empty() || (prevOCCSearch.size() != occs.size())) 84 { 85 // Something changed or no OCCs yet, try again in 10s. 86 // Note on the first pass prevOCCSearch will be empty, 87 // so there will be at least one delay to give things 88 // a chance to settle. 89 prevOCCSearch = occs; 90 91 log<level::INFO>( 92 fmt::format( 93 "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})", 94 occs.size()) 95 .c_str()); 96 97 discoverTimer->restartOnce(10s); 98 } 99 else 100 { 101 // All OCCs appear to be available, create status objects 102 103 // createObjects requires OCC0 first. 104 std::sort(occs.begin(), occs.end()); 105 106 log<level::INFO>( 107 fmt::format( 108 "Manager::findAndCreateObjects(): Creating {} OCC Status Objects", 109 occs.size()) 110 .c_str()); 111 for (auto id : occs) 112 { 113 createObjects(std::string(OCC_NAME) + std::to_string(id)); 114 } 115 statusObjCreated = true; 116 waitingForAllOccActiveSensors = true; 117 118 // Find/update the processor path associated with each OCC 119 for (auto& obj : statusObjects) 120 { 121 obj->updateProcAssociation(); 122 } 123 } 124 } 125 126 if (statusObjCreated && waitingForAllOccActiveSensors) 127 { 128 static bool tracedHostWait = false; 129 if (utils::isHostRunning()) 130 { 131 if (tracedHostWait) 132 { 133 log<level::INFO>( 134 "Manager::findAndCreateObjects(): Host is running"); 135 tracedHostWait = false; 136 } 137 checkAllActiveSensors(); 138 } 139 else 140 { 141 if (!tracedHostWait) 142 { 143 log<level::INFO>( 144 "Manager::findAndCreateObjects(): Waiting for host to start"); 145 tracedHostWait = true; 146 } 147 discoverTimer->restartOnce(30s); 148 } 149 } 150 } 151 else 152 { 153 log<level::INFO>( 154 fmt::format( 155 "Manager::findAndCreateObjects(): Waiting for {} to complete...", 156 HOST_ON_FILE) 157 .c_str()); 158 discoverTimer->restartOnce(10s); 159 } 160 #endif 161 } 162 163 #ifdef POWER10 164 // Check if all occActive sensors are available 165 void Manager::checkAllActiveSensors() 166 { 167 static bool allActiveSensorAvailable = false; 168 static bool tracedSensorWait = false; 169 static bool waitingForHost = false; 170 171 if (open_power::occ::utils::isHostRunning()) 172 { 173 if (waitingForHost) 174 { 175 waitingForHost = false; 176 log<level::INFO>("checkAllActiveSensors(): Host is now running"); 177 } 178 179 // Start with the assumption that all are available 180 allActiveSensorAvailable = true; 181 for (auto& obj : statusObjects) 182 { 183 if ((!obj->occActive()) && (!obj->getPldmSensorReceived())) 184 { 185 auto instance = obj->getOccInstanceID(); 186 // Check if sensor was queued while waiting for discovery 187 auto match = queuedActiveState.find(instance); 188 if (match != queuedActiveState.end()) 189 { 190 queuedActiveState.erase(match); 191 log<level::INFO>( 192 fmt::format( 193 "checkAllActiveSensors(): OCC{} is ACTIVE (queued)", 194 instance) 195 .c_str()); 196 obj->occActive(true); 197 } 198 else 199 { 200 allActiveSensorAvailable = false; 201 if (!tracedSensorWait) 202 { 203 log<level::INFO>( 204 fmt::format( 205 "checkAllActiveSensors(): Waiting on OCC{} Active sensor", 206 instance) 207 .c_str()); 208 tracedSensorWait = true; 209 } 210 pldmHandle->checkActiveSensor(obj->getOccInstanceID()); 211 break; 212 } 213 } 214 } 215 } 216 else 217 { 218 if (!waitingForHost) 219 { 220 waitingForHost = true; 221 log<level::INFO>( 222 "checkAllActiveSensors(): Waiting for host to start"); 223 } 224 } 225 226 if (allActiveSensorAvailable) 227 { 228 // All sensors were found, disable the discovery timer 229 if (discoverTimer->isEnabled()) 230 { 231 discoverTimer->setEnabled(false); 232 } 233 234 if (waitingForAllOccActiveSensors) 235 { 236 log<level::INFO>( 237 "checkAllActiveSensors(): OCC Active sensors are available"); 238 waitingForAllOccActiveSensors = false; 239 } 240 queuedActiveState.clear(); 241 tracedSensorWait = false; 242 } 243 else 244 { 245 // Not all sensors were available, so keep waiting 246 if (!tracedSensorWait) 247 { 248 log<level::INFO>( 249 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available"); 250 tracedSensorWait = true; 251 } 252 discoverTimer->restartOnce(10s); 253 } 254 } 255 #endif 256 257 std::vector<int> Manager::findOCCsInDev() 258 { 259 std::vector<int> occs; 260 std::regex expr{R"(occ(\d+)$)"}; 261 262 for (auto& file : fs::directory_iterator("/dev")) 263 { 264 std::smatch match; 265 std::string path{file.path().string()}; 266 if (std::regex_search(path, match, expr)) 267 { 268 auto num = std::stoi(match[1].str()); 269 270 // /dev numbering starts at 1, ours starts at 0. 271 occs.push_back(num - 1); 272 } 273 } 274 275 return occs; 276 } 277 278 int Manager::cpuCreated(sdbusplus::message_t& msg) 279 { 280 namespace fs = std::filesystem; 281 282 sdbusplus::message::object_path o; 283 msg.read(o); 284 fs::path cpuPath(std::string(std::move(o))); 285 286 auto name = cpuPath.filename().string(); 287 auto index = name.find(CPU_NAME); 288 name.replace(index, std::strlen(CPU_NAME), OCC_NAME); 289 290 createObjects(name); 291 292 return 0; 293 } 294 295 void Manager::createObjects(const std::string& occ) 296 { 297 auto path = fs::path(OCC_CONTROL_ROOT) / occ; 298 299 statusObjects.emplace_back(std::make_unique<Status>( 300 event, path.c_str(), *this, 301 #ifdef POWER10 302 pmode, 303 #endif 304 std::bind(std::mem_fn(&Manager::statusCallBack), this, 305 std::placeholders::_1, std::placeholders::_2) 306 #ifdef PLDM 307 , 308 std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(), 309 std::placeholders::_1) 310 #endif 311 )); 312 313 // Create the power cap monitor object 314 if (!pcap) 315 { 316 pcap = std::make_unique<open_power::occ::powercap::PowerCap>( 317 *statusObjects.back()); 318 } 319 320 if (statusObjects.back()->isMasterOcc()) 321 { 322 log<level::INFO>( 323 fmt::format("Manager::createObjects(): OCC{} is the master", 324 statusObjects.back()->getOccInstanceID()) 325 .c_str()); 326 _pollTimer->setEnabled(false); 327 328 #ifdef POWER10 329 // Set the master OCC on the PowerMode object 330 pmode->setMasterOcc(path); 331 #endif 332 } 333 334 passThroughObjects.emplace_back(std::make_unique<PassThrough>(path.c_str() 335 #ifdef POWER10 336 , 337 pmode 338 #endif 339 )); 340 } 341 342 void Manager::statusCallBack(instanceID instance, bool status) 343 { 344 if (status == true) 345 { 346 // OCC went active 347 ++activeCount; 348 349 #ifdef POWER10 350 if (activeCount == 1) 351 { 352 // First OCC went active (allow some time for all OCCs to go active) 353 waitForAllOccsTimer->restartOnce(60s); 354 } 355 #endif 356 357 if (activeCount == statusObjects.size()) 358 { 359 #ifdef POWER10 360 // All OCCs are now running 361 if (waitForAllOccsTimer->isEnabled()) 362 { 363 // stop occ wait timer 364 waitForAllOccsTimer->setEnabled(false); 365 } 366 #endif 367 368 // Verify master OCC and start presence monitor 369 validateOccMaster(); 370 } 371 372 // Start poll timer if not already started 373 if (!_pollTimer->isEnabled()) 374 { 375 log<level::INFO>( 376 fmt::format("Manager: OCCs will be polled every {} seconds", 377 pollInterval) 378 .c_str()); 379 380 // Send poll and start OCC poll timer 381 pollerTimerExpired(); 382 } 383 } 384 else 385 { 386 // OCC went away 387 if (activeCount > 0) 388 { 389 --activeCount; 390 } 391 else 392 { 393 log<level::ERR>( 394 fmt::format("OCC{} disabled, but currently no active OCCs", 395 instance) 396 .c_str()); 397 } 398 399 if (activeCount == 0) 400 { 401 // No OCCs are running 402 403 // Stop OCC poll timer 404 if (_pollTimer->isEnabled()) 405 { 406 log<level::INFO>( 407 "Manager::statusCallBack(): OCCs are not running, stopping poll timer"); 408 _pollTimer->setEnabled(false); 409 } 410 411 #ifdef POWER10 412 // stop wait timer 413 if (waitForAllOccsTimer->isEnabled()) 414 { 415 waitForAllOccsTimer->setEnabled(false); 416 } 417 #endif 418 } 419 #ifdef READ_OCC_SENSORS 420 // Clear OCC sensors 421 setSensorValueToNaN(instance); 422 #endif 423 } 424 425 #ifdef POWER10 426 if (waitingForAllOccActiveSensors) 427 { 428 if (utils::isHostRunning()) 429 { 430 checkAllActiveSensors(); 431 } 432 } 433 #endif 434 } 435 436 #ifdef I2C_OCC 437 void Manager::initStatusObjects() 438 { 439 // Make sure we have a valid path string 440 static_assert(sizeof(DEV_PATH) != 0); 441 442 auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH); 443 for (auto& name : deviceNames) 444 { 445 i2c_occ::i2cToDbus(name); 446 name = std::string(OCC_NAME) + '_' + name; 447 auto path = fs::path(OCC_CONTROL_ROOT) / name; 448 statusObjects.emplace_back( 449 std::make_unique<Status>(event, path.c_str(), *this)); 450 } 451 // The first device is master occ 452 pcap = std::make_unique<open_power::occ::powercap::PowerCap>( 453 *statusObjects.front()); 454 #ifdef POWER10 455 pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH, 456 powermode::PIPS_PATH); 457 // Set the master OCC on the PowerMode object 458 pmode->setMasterOcc(path); 459 #endif 460 } 461 #endif 462 463 #ifdef PLDM 464 void Manager::sbeTimeout(unsigned int instance) 465 { 466 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 467 [instance](const auto& obj) { 468 return instance == obj->getOccInstanceID(); 469 }); 470 471 if (obj != statusObjects.end() && (*obj)->occActive()) 472 { 473 log<level::INFO>( 474 fmt::format("SBE timeout, requesting HRESET (OCC{})", instance) 475 .c_str()); 476 477 setSBEState(instance, SBE_STATE_NOT_USABLE); 478 479 pldmHandle->sendHRESET(instance); 480 } 481 } 482 483 bool Manager::updateOCCActive(instanceID instance, bool status) 484 { 485 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 486 [instance](const auto& obj) { 487 return instance == obj->getOccInstanceID(); 488 }); 489 490 const bool hostRunning = open_power::occ::utils::isHostRunning(); 491 if (obj != statusObjects.end()) 492 { 493 if (!hostRunning && (status == true)) 494 { 495 log<level::WARNING>( 496 fmt::format( 497 "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received", 498 instance, status) 499 .c_str()); 500 (*obj)->setPldmSensorReceived(false); 501 if (!waitingForAllOccActiveSensors) 502 { 503 log<level::INFO>( 504 "updateOCCActive: Waiting for Host and all OCC Active Sensors"); 505 waitingForAllOccActiveSensors = true; 506 } 507 discoverTimer->restartOnce(30s); 508 return false; 509 } 510 else 511 { 512 log<level::INFO>(fmt::format("updateOCCActive: OCC{} active={}", 513 instance, status) 514 .c_str()); 515 (*obj)->setPldmSensorReceived(true); 516 return (*obj)->occActive(status); 517 } 518 } 519 else 520 { 521 if (hostRunning) 522 { 523 log<level::WARNING>( 524 fmt::format( 525 "updateOCCActive: No status object to update for OCC{} (active={})", 526 instance, status) 527 .c_str()); 528 } 529 else 530 { 531 if (status == true) 532 { 533 log<level::WARNING>( 534 fmt::format( 535 "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})", 536 instance, status) 537 .c_str()); 538 } 539 } 540 if (status == true) 541 { 542 // OCC went active 543 queuedActiveState.insert(instance); 544 } 545 else 546 { 547 auto match = queuedActiveState.find(instance); 548 if (match != queuedActiveState.end()) 549 { 550 // OCC was disabled 551 queuedActiveState.erase(match); 552 } 553 } 554 return false; 555 } 556 } 557 558 // Called upon pldm event To set powermode Safe Mode State for system. 559 void Manager::updateOccSafeMode(bool safeMode) 560 { 561 #ifdef POWER10 562 pmode->updateDbusSafeMode(safeMode); 563 #endif 564 // Update the processor throttle status on dbus 565 for (auto& obj : statusObjects) 566 { 567 obj->updateThrottle(safeMode, THROTTLED_SAFE); 568 } 569 } 570 571 void Manager::sbeHRESETResult(instanceID instance, bool success) 572 { 573 if (success) 574 { 575 log<level::INFO>( 576 fmt::format("HRESET succeeded (OCC{})", instance).c_str()); 577 578 setSBEState(instance, SBE_STATE_BOOTED); 579 580 return; 581 } 582 583 setSBEState(instance, SBE_STATE_FAILED); 584 585 if (sbeCanDump(instance)) 586 { 587 log<level::INFO>( 588 fmt::format("HRESET failed (OCC{}), triggering SBE dump", instance) 589 .c_str()); 590 591 auto& bus = utils::getBus(); 592 uint32_t src6 = instance << 16; 593 uint32_t logId = 594 FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout", 595 src6, "SBE command timeout"); 596 597 try 598 { 599 constexpr auto path = "/org/openpower/dump"; 600 constexpr auto interface = "xyz.openbmc_project.Dump.Create"; 601 constexpr auto function = "CreateDump"; 602 603 std::string service = utils::getService(path, interface); 604 auto method = bus.new_method_call(service.c_str(), path, interface, 605 function); 606 607 std::map<std::string, std::variant<std::string, uint64_t>> 608 createParams{ 609 {"com.ibm.Dump.Create.CreateParameters.ErrorLogId", 610 uint64_t(logId)}, 611 {"com.ibm.Dump.Create.CreateParameters.DumpType", 612 "com.ibm.Dump.Create.DumpType.SBE"}, 613 {"com.ibm.Dump.Create.CreateParameters.FailingUnitId", 614 uint64_t(instance)}, 615 }; 616 617 method.append(createParams); 618 619 auto response = bus.call(method); 620 } 621 catch (const sdbusplus::exception_t& e) 622 { 623 constexpr auto ERROR_DUMP_DISABLED = 624 "xyz.openbmc_project.Dump.Create.Error.Disabled"; 625 if (e.name() == ERROR_DUMP_DISABLED) 626 { 627 log<level::INFO>("Dump is disabled, skipping"); 628 } 629 else 630 { 631 log<level::ERR>("Dump failed"); 632 } 633 } 634 } 635 } 636 637 bool Manager::sbeCanDump(unsigned int instance) 638 { 639 struct pdbg_target* proc = getPdbgTarget(instance); 640 641 if (!proc) 642 { 643 // allow the dump in the error case 644 return true; 645 } 646 647 try 648 { 649 if (!openpower::phal::sbe::isDumpAllowed(proc)) 650 { 651 return false; 652 } 653 654 if (openpower::phal::pdbg::isSbeVitalAttnActive(proc)) 655 { 656 return false; 657 } 658 } 659 catch (openpower::phal::exception::SbeError& e) 660 { 661 log<level::INFO>("Failed to query SBE state"); 662 } 663 664 // allow the dump in the error case 665 return true; 666 } 667 668 void Manager::setSBEState(unsigned int instance, enum sbe_state state) 669 { 670 struct pdbg_target* proc = getPdbgTarget(instance); 671 672 if (!proc) 673 { 674 return; 675 } 676 677 try 678 { 679 openpower::phal::sbe::setState(proc, state); 680 } 681 catch (const openpower::phal::exception::SbeError& e) 682 { 683 log<level::ERR>("Failed to set SBE state"); 684 } 685 } 686 687 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance) 688 { 689 if (!pdbgInitialized) 690 { 691 try 692 { 693 openpower::phal::pdbg::init(); 694 pdbgInitialized = true; 695 } 696 catch (const openpower::phal::exception::PdbgError& e) 697 { 698 log<level::ERR>("pdbg initialization failed"); 699 return nullptr; 700 } 701 } 702 703 struct pdbg_target* proc = nullptr; 704 pdbg_for_each_class_target("proc", proc) 705 { 706 if (pdbg_target_index(proc) == instance) 707 { 708 return proc; 709 } 710 } 711 712 log<level::ERR>("Failed to get pdbg target"); 713 return nullptr; 714 } 715 #endif 716 717 void Manager::pollerTimerExpired() 718 { 719 if (!_pollTimer) 720 { 721 log<level::ERR>( 722 "Manager::pollerTimerExpired() ERROR: Timer not defined"); 723 return; 724 } 725 726 for (auto& obj : statusObjects) 727 { 728 if (!obj->occActive()) 729 { 730 // OCC is not running yet 731 #ifdef READ_OCC_SENSORS 732 auto id = obj->getOccInstanceID(); 733 setSensorValueToNaN(id); 734 #endif 735 continue; 736 } 737 738 // Read sysfs to force kernel to poll OCC 739 obj->readOccState(); 740 741 #ifdef READ_OCC_SENSORS 742 // Read occ sensor values 743 getSensorValues(obj); 744 #endif 745 } 746 747 if (activeCount > 0) 748 { 749 // Restart OCC poll timer 750 _pollTimer->restartOnce(std::chrono::seconds(pollInterval)); 751 } 752 else 753 { 754 // No OCCs running, so poll timer will not be restarted 755 log<level::INFO>( 756 fmt::format( 757 "Manager::pollerTimerExpired: poll timer will not be restarted") 758 .c_str()); 759 } 760 } 761 762 #ifdef READ_OCC_SENSORS 763 void Manager::readTempSensors(const fs::path& path, uint32_t id) 764 { 765 std::regex expr{"temp\\d+_label$"}; // Example: temp5_label 766 for (auto& file : fs::directory_iterator(path)) 767 { 768 if (!std::regex_search(file.path().string(), expr)) 769 { 770 continue; 771 } 772 773 uint32_t labelValue{0}; 774 775 try 776 { 777 labelValue = readFile<uint32_t>(file.path()); 778 } 779 catch (const std::system_error& e) 780 { 781 log<level::DEBUG>( 782 fmt::format("readTempSensors: Failed reading {}, errno = {}", 783 file.path().string(), e.code().value()) 784 .c_str()); 785 continue; 786 } 787 788 const std::string& tempLabel = "label"; 789 const std::string filePathString = file.path().string().substr( 790 0, file.path().string().length() - tempLabel.length()); 791 792 uint32_t fruTypeValue{0}; 793 try 794 { 795 fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix); 796 } 797 catch (const std::system_error& e) 798 { 799 log<level::DEBUG>( 800 fmt::format("readTempSensors: Failed reading {}, errno = {}", 801 filePathString + fruTypeSuffix, e.code().value()) 802 .c_str()); 803 continue; 804 } 805 806 std::string sensorPath = OCC_SENSORS_ROOT + 807 std::string("/temperature/"); 808 809 std::string dvfsTempPath; 810 811 if (fruTypeValue == VRMVdd) 812 { 813 sensorPath.append("vrm_vdd" + std::to_string(id) + "_temp"); 814 } 815 else if (fruTypeValue == processorIoRing) 816 { 817 sensorPath.append("proc" + std::to_string(id) + "_ioring_temp"); 818 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" + 819 std::to_string(id) + "_ioring_dvfs_temp"; 820 } 821 else 822 { 823 uint16_t type = (labelValue & 0xFF000000) >> 24; 824 uint16_t instanceID = labelValue & 0x0000FFFF; 825 826 if (type == OCC_DIMM_TEMP_SENSOR_TYPE) 827 { 828 if (fruTypeValue == fruTypeNotAvailable) 829 { 830 // Not all DIMM related temps are available to read 831 // (no _input file in this case) 832 continue; 833 } 834 auto iter = dimmTempSensorName.find(fruTypeValue); 835 if (iter == dimmTempSensorName.end()) 836 { 837 log<level::ERR>( 838 fmt::format( 839 "readTempSensors: Fru type error! fruTypeValue = {}) ", 840 fruTypeValue) 841 .c_str()); 842 continue; 843 } 844 845 sensorPath.append("dimm" + std::to_string(instanceID) + 846 iter->second); 847 } 848 else if (type == OCC_CPU_TEMP_SENSOR_TYPE) 849 { 850 if (fruTypeValue == processorCore) 851 { 852 // The OCC reports small core temps, of which there are 853 // two per big core. All current P10 systems are in big 854 // core mode, so use a big core name. 855 uint16_t coreNum = instanceID / 2; 856 uint16_t tempNum = instanceID % 2; 857 sensorPath.append("proc" + std::to_string(id) + "_core" + 858 std::to_string(coreNum) + "_" + 859 std::to_string(tempNum) + "_temp"); 860 861 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + 862 "/temperature/proc" + std::to_string(id) + 863 "_core_dvfs_temp"; 864 } 865 else 866 { 867 continue; 868 } 869 } 870 else 871 { 872 continue; 873 } 874 } 875 876 // The dvfs temp file only needs to be read once per chip per type. 877 if (!dvfsTempPath.empty() && 878 !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath)) 879 { 880 try 881 { 882 auto dvfsValue = readFile<double>(filePathString + maxSuffix); 883 884 dbus::OccDBusSensors::getOccDBus().setDvfsTemp( 885 dvfsTempPath, dvfsValue * std::pow(10, -3)); 886 } 887 catch (const std::system_error& e) 888 { 889 log<level::DEBUG>( 890 fmt::format( 891 "readTempSensors: Failed reading {}, errno = {}", 892 filePathString + maxSuffix, e.code().value()) 893 .c_str()); 894 } 895 } 896 897 uint32_t faultValue{0}; 898 try 899 { 900 faultValue = readFile<uint32_t>(filePathString + faultSuffix); 901 } 902 catch (const std::system_error& e) 903 { 904 log<level::DEBUG>( 905 fmt::format("readTempSensors: Failed reading {}, errno = {}", 906 filePathString + faultSuffix, e.code().value()) 907 .c_str()); 908 continue; 909 } 910 911 // NOTE: if OCC sends back 0xFF kernal sets this fault value to 1. 912 if (faultValue != 0) 913 { 914 dbus::OccDBusSensors::getOccDBus().setValue( 915 sensorPath, std::numeric_limits<double>::quiet_NaN()); 916 917 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 918 false); 919 920 continue; 921 } 922 923 double tempValue{0}; 924 925 try 926 { 927 tempValue = readFile<double>(filePathString + inputSuffix); 928 } 929 catch (const std::system_error& e) 930 { 931 log<level::DEBUG>( 932 fmt::format("readTempSensors: Failed reading {}, errno = {}", 933 filePathString + inputSuffix, e.code().value()) 934 .c_str()); 935 936 // if errno == EAGAIN(Resource temporarily unavailable) then set 937 // temp to 0, to avoid using old temp, and affecting FAN Control. 938 if (e.code().value() == EAGAIN) 939 { 940 tempValue = 0; 941 } 942 // else the errno would be something like 943 // EBADF(Bad file descriptor) 944 // or ENOENT(No such file or directory) 945 else 946 { 947 continue; 948 } 949 } 950 951 dbus::OccDBusSensors::getOccDBus().setValue( 952 sensorPath, tempValue * std::pow(10, -3)); 953 954 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 955 true); 956 957 // At this point, the sensor will be created for sure. 958 if (existingSensors.find(sensorPath) == existingSensors.end()) 959 { 960 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 961 sensorPath); 962 } 963 964 existingSensors[sensorPath] = id; 965 } 966 return; 967 } 968 969 std::optional<std::string> 970 Manager::getPowerLabelFunctionID(const std::string& value) 971 { 972 // If the value is "system", then the FunctionID is "system". 973 if (value == "system") 974 { 975 return value; 976 } 977 978 // If the value is not "system", then the label value have 3 numbers, of 979 // which we only care about the middle one: 980 // <sensor id>_<function id>_<apss channel> 981 // eg: The value is "0_10_5" , then the FunctionID is "10". 982 if (value.find("_") == std::string::npos) 983 { 984 return std::nullopt; 985 } 986 987 auto powerLabelValue = value.substr((value.find("_") + 1)); 988 989 if (powerLabelValue.find("_") == std::string::npos) 990 { 991 return std::nullopt; 992 } 993 994 return powerLabelValue.substr(0, powerLabelValue.find("_")); 995 } 996 997 void Manager::readPowerSensors(const fs::path& path, uint32_t id) 998 { 999 std::regex expr{"power\\d+_label$"}; // Example: power5_label 1000 for (auto& file : fs::directory_iterator(path)) 1001 { 1002 if (!std::regex_search(file.path().string(), expr)) 1003 { 1004 continue; 1005 } 1006 1007 std::string labelValue; 1008 try 1009 { 1010 labelValue = readFile<std::string>(file.path()); 1011 } 1012 catch (const std::system_error& e) 1013 { 1014 log<level::DEBUG>( 1015 fmt::format("readPowerSensors: Failed reading {}, errno = {}", 1016 file.path().string(), e.code().value()) 1017 .c_str()); 1018 continue; 1019 } 1020 1021 auto functionID = getPowerLabelFunctionID(labelValue); 1022 if (functionID == std::nullopt) 1023 { 1024 continue; 1025 } 1026 1027 const std::string& tempLabel = "label"; 1028 const std::string filePathString = file.path().string().substr( 1029 0, file.path().string().length() - tempLabel.length()); 1030 1031 std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/"); 1032 1033 auto iter = powerSensorName.find(*functionID); 1034 if (iter == powerSensorName.end()) 1035 { 1036 continue; 1037 } 1038 sensorPath.append(iter->second); 1039 1040 double tempValue{0}; 1041 1042 try 1043 { 1044 tempValue = readFile<double>(filePathString + inputSuffix); 1045 } 1046 catch (const std::system_error& e) 1047 { 1048 log<level::DEBUG>( 1049 fmt::format("readPowerSensors: Failed reading {}, errno = {}", 1050 filePathString + inputSuffix, e.code().value()) 1051 .c_str()); 1052 continue; 1053 } 1054 1055 dbus::OccDBusSensors::getOccDBus().setUnit( 1056 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts"); 1057 1058 dbus::OccDBusSensors::getOccDBus().setValue( 1059 sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3)); 1060 1061 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1062 true); 1063 1064 if (existingSensors.find(sensorPath) == existingSensors.end()) 1065 { 1066 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 1067 sensorPath); 1068 } 1069 1070 existingSensors[sensorPath] = id; 1071 } 1072 return; 1073 } 1074 1075 void Manager::setSensorValueToNaN(uint32_t id) const 1076 { 1077 for (const auto& [sensorPath, occId] : existingSensors) 1078 { 1079 if (occId == id) 1080 { 1081 dbus::OccDBusSensors::getOccDBus().setValue( 1082 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1083 1084 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1085 true); 1086 } 1087 } 1088 return; 1089 } 1090 1091 void Manager::setSensorValueToNonFunctional(uint32_t id) const 1092 { 1093 for (const auto& [sensorPath, occId] : existingSensors) 1094 { 1095 if (occId == id) 1096 { 1097 dbus::OccDBusSensors::getOccDBus().setValue( 1098 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1099 1100 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1101 false); 1102 } 1103 } 1104 return; 1105 } 1106 1107 void Manager::getSensorValues(std::unique_ptr<Status>& occ) 1108 { 1109 static bool tracedError[8] = {0}; 1110 const fs::path sensorPath = occ->getHwmonPath(); 1111 const uint32_t id = occ->getOccInstanceID(); 1112 1113 if (fs::exists(sensorPath)) 1114 { 1115 // Read temperature sensors 1116 readTempSensors(sensorPath, id); 1117 1118 if (occ->isMasterOcc()) 1119 { 1120 // Read power sensors 1121 readPowerSensors(sensorPath, id); 1122 } 1123 tracedError[id] = false; 1124 } 1125 else 1126 { 1127 if (!tracedError[id]) 1128 { 1129 log<level::ERR>( 1130 fmt::format( 1131 "Manager::getSensorValues: OCC{} sensor path missing: {}", 1132 id, sensorPath.c_str()) 1133 .c_str()); 1134 tracedError[id] = true; 1135 } 1136 } 1137 1138 return; 1139 } 1140 #endif 1141 1142 // Read the altitude from DBus 1143 void Manager::readAltitude() 1144 { 1145 static bool traceAltitudeErr = true; 1146 1147 utils::PropertyValue altitudeProperty{}; 1148 try 1149 { 1150 altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE, 1151 ALTITUDE_PROP); 1152 auto sensorVal = std::get<double>(altitudeProperty); 1153 if (sensorVal < 0xFFFF) 1154 { 1155 if (sensorVal < 0) 1156 { 1157 altitude = 0; 1158 } 1159 else 1160 { 1161 // Round to nearest meter 1162 altitude = uint16_t(sensorVal + 0.5); 1163 } 1164 log<level::DEBUG>(fmt::format("readAltitude: sensor={} ({}m)", 1165 sensorVal, altitude) 1166 .c_str()); 1167 traceAltitudeErr = true; 1168 } 1169 else 1170 { 1171 if (traceAltitudeErr) 1172 { 1173 traceAltitudeErr = false; 1174 log<level::DEBUG>( 1175 fmt::format("Invalid altitude value: {}", sensorVal) 1176 .c_str()); 1177 } 1178 } 1179 } 1180 catch (const sdbusplus::exception_t& e) 1181 { 1182 if (traceAltitudeErr) 1183 { 1184 traceAltitudeErr = false; 1185 log<level::INFO>( 1186 fmt::format("Unable to read Altitude: {}", e.what()).c_str()); 1187 } 1188 altitude = 0xFFFF; // not available 1189 } 1190 } 1191 1192 // Callback function when ambient temperature changes 1193 void Manager::ambientCallback(sdbusplus::message_t& msg) 1194 { 1195 double currentTemp = 0; 1196 uint8_t truncatedTemp = 0xFF; 1197 std::string msgSensor; 1198 std::map<std::string, std::variant<double>> msgData; 1199 msg.read(msgSensor, msgData); 1200 1201 auto valPropMap = msgData.find(AMBIENT_PROP); 1202 if (valPropMap == msgData.end()) 1203 { 1204 log<level::DEBUG>("ambientCallback: Unknown ambient property changed"); 1205 return; 1206 } 1207 currentTemp = std::get<double>(valPropMap->second); 1208 if (std::isnan(currentTemp)) 1209 { 1210 truncatedTemp = 0xFF; 1211 } 1212 else 1213 { 1214 if (currentTemp < 0) 1215 { 1216 truncatedTemp = 0; 1217 } 1218 else 1219 { 1220 // Round to nearest degree C 1221 truncatedTemp = uint8_t(currentTemp + 0.5); 1222 } 1223 } 1224 1225 // If ambient changes, notify OCCs 1226 if (truncatedTemp != ambient) 1227 { 1228 log<level::DEBUG>( 1229 fmt::format("ambientCallback: Ambient change from {} to {}C", 1230 ambient, currentTemp) 1231 .c_str()); 1232 1233 ambient = truncatedTemp; 1234 if (altitude == 0xFFFF) 1235 { 1236 // No altitude yet, try reading again 1237 readAltitude(); 1238 } 1239 1240 log<level::DEBUG>( 1241 fmt::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient, 1242 altitude) 1243 .c_str()); 1244 #ifdef POWER10 1245 // Send ambient and altitude to all OCCs 1246 for (auto& obj : statusObjects) 1247 { 1248 if (obj->occActive()) 1249 { 1250 obj->sendAmbient(ambient, altitude); 1251 } 1252 } 1253 #endif // POWER10 1254 } 1255 } 1256 1257 // return the current ambient and altitude readings 1258 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp, 1259 uint16_t& altitudeValue) const 1260 { 1261 ambientValid = true; 1262 ambientTemp = ambient; 1263 altitudeValue = altitude; 1264 1265 if (ambient == 0xFF) 1266 { 1267 ambientValid = false; 1268 } 1269 } 1270 1271 #ifdef POWER10 1272 // Called when waitForAllOccsTimer expires 1273 // After the first OCC goes active, this timer will be started (60 seconds) 1274 void Manager::occsNotAllRunning() 1275 { 1276 if (activeCount != statusObjects.size()) 1277 { 1278 // Not all OCCs went active 1279 log<level::WARNING>( 1280 fmt::format( 1281 "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})", 1282 activeCount, statusObjects.size()) 1283 .c_str()); 1284 // Procs may be garded, so may be expected 1285 } 1286 1287 validateOccMaster(); 1288 } 1289 #endif // POWER10 1290 1291 // Verify single master OCC and start presence monitor 1292 void Manager::validateOccMaster() 1293 { 1294 int masterInstance = -1; 1295 for (auto& obj : statusObjects) 1296 { 1297 auto instance = obj->getOccInstanceID(); 1298 #ifdef POWER10 1299 if (!obj->occActive()) 1300 { 1301 if (utils::isHostRunning()) 1302 { 1303 // Check if sensor was queued while waiting for discovery 1304 auto match = queuedActiveState.find(instance); 1305 if (match != queuedActiveState.end()) 1306 { 1307 queuedActiveState.erase(match); 1308 log<level::INFO>( 1309 fmt::format( 1310 "validateOccMaster: OCC{} is ACTIVE (queued)", 1311 instance) 1312 .c_str()); 1313 obj->occActive(true); 1314 } 1315 else 1316 { 1317 // OCC does not appear to be active yet, check active sensor 1318 pldmHandle->checkActiveSensor(instance); 1319 if (obj->occActive()) 1320 { 1321 log<level::INFO>( 1322 fmt::format( 1323 "validateOccMaster: OCC{} is ACTIVE after reading sensor", 1324 instance) 1325 .c_str()); 1326 } 1327 } 1328 } 1329 else 1330 { 1331 log<level::WARNING>( 1332 fmt::format( 1333 "validateOccMaster: HOST is not running (OCC{})", 1334 instance) 1335 .c_str()); 1336 return; 1337 } 1338 } 1339 #endif // POWER10 1340 1341 if (obj->isMasterOcc()) 1342 { 1343 obj->addPresenceWatchMaster(); 1344 1345 if (masterInstance == -1) 1346 { 1347 masterInstance = instance; 1348 } 1349 else 1350 { 1351 log<level::ERR>( 1352 fmt::format( 1353 "validateOccMaster: Multiple OCC masters! ({} and {})", 1354 masterInstance, instance) 1355 .c_str()); 1356 // request reset 1357 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH)); 1358 } 1359 } 1360 } 1361 1362 if (masterInstance < 0) 1363 { 1364 log<level::ERR>( 1365 fmt::format("validateOccMaster: Master OCC not found! (of {} OCCs)", 1366 statusObjects.size()) 1367 .c_str()); 1368 // request reset 1369 statusObjects.front()->deviceError( 1370 Error::Descriptor(PRESENCE_ERROR_PATH)); 1371 } 1372 else 1373 { 1374 log<level::INFO>( 1375 fmt::format("validateOccMaster: OCC{} is master of {} OCCs", 1376 masterInstance, activeCount) 1377 .c_str()); 1378 #ifdef POWER10 1379 pmode->updateDbusSafeMode(false); 1380 #endif 1381 } 1382 } 1383 1384 void Manager::updatePcapBounds() const 1385 { 1386 if (pcap) 1387 { 1388 pcap->updatePcapBounds(); 1389 } 1390 } 1391 1392 } // namespace occ 1393 } // namespace open_power 1394