#include "config.h"

#include "occ_manager.hpp"

#include "i2c_occ.hpp"
#include "occ_dbus.hpp"
#include "utils.hpp"

#include <phosphor-logging/elog-errors.hpp>
#include <phosphor-logging/log.hpp>
#include <xyz/openbmc_project/Common/error.hpp>

#include <algorithm>
#include <cerrno>
#include <chrono>
#include <cmath>
#include <cstring>
#include <filesystem>
#include <format>
#include <fstream>
#include <limits>
#include <regex>

namespace open_power
{
namespace occ
{

// fru_type value reported when a temperature sensor has no FRU type
constexpr uint32_t fruTypeNotAvailable = 0xFF;

// Suffixes appended to the "<sensor>_" prefix of a hwmon file name
constexpr auto fruTypeSuffix = "fru_type";
constexpr auto faultSuffix = "fault";
constexpr auto inputSuffix = "input";
constexpr auto maxSuffix = "max";

// While this file exists, OCC discovery is deferred (see
// findAndCreateObjects())
constexpr auto HOST_ON_FILE = "/run/openbmc/host@0-on";

using namespace phosphor::logging;
using namespace std::literals::chrono_literals;

/**
 * @brief Read one whitespace-delimited value of type T from a file.
 *
 * The stream is configured to throw on failbit, badbit and eofbit, so an
 * open failure, a parse failure, or hitting end-of-file during extraction
 * are all reported as errors.
 *
 * @param[in] path - file to read
 * @return the value extracted with operator>>
 * @throws std::system_error carrying the errno observed at failure (EIO when
 *         the stream failed without setting errno, e.g. a parse error)
 */
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit |
                   std::ifstream::eofbit);
    T data;

    // Start from a clean errno so that a stale value left behind by an
    // unrelated call (e.g. EAGAIN) is never reported as this read's error.
    errno = 0;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception& e)
    {
        auto err = errno;
        if (err == 0)
        {
            // Stream failure that did not set errno (e.g. malformed data)
            err = EIO;
        }
        throw std::system_error(err, std::generic_category());
    }

    return data;
}

/**
 * @brief Discover the OCCs and create their Status/PassThrough objects.
 *
 * Without POWER10, one object set is created per possible CPU. With POWER10,
 * the /dev/occX devices are scanned and the discovery timer is re-armed until
 * two consecutive scans return the same, non-zero number of devices; after
 * the objects exist, the OCC active sensors are checked once the host runs.
 */
void Manager::findAndCreateObjects()
{
#ifndef POWER10
    for (auto id = 0; id < MAX_CPUS; ++id)
    {
        // Create one occ per cpu
        auto occ = std::string(OCC_NAME) + std::to_string(id);
        createObjects(occ);
    }
#else
    if (!pmode)
    {
        // Create the power mode object
        pmode = std::make_unique<powermode::PowerMode>(
            *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
    }

    if (!fs::exists(HOST_ON_FILE))
    {
        static bool statusObjCreated = false;
        if (!statusObjCreated)
        {
            // Create the OCCs based on the /dev/occX devices
            auto occs = findOCCsInDev();

            if (occs.empty() || (prevOCCSearch.size() != occs.size()))
            {
                // Something changed or no OCCs yet, try again in 10s.
                // Note on the first pass prevOCCSearch will be empty,
                // so there will be at least one delay to give things
                // a chance to settle.
                prevOCCSearch = occs;

                log<level::INFO>(
                    std::format(
                        "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})",
                        occs.size())
                        .c_str());

                discoverTimer->restartOnce(10s);
            }
            else
            {
                // All OCCs appear to be available, create status objects

                // createObjects requires OCC0 first.
                std::sort(occs.begin(), occs.end());

                log<level::INFO>(
                    std::format(
                        "Manager::findAndCreateObjects(): Creating {} OCC Status Objects",
                        occs.size())
                        .c_str());
                for (auto id : occs)
                {
                    createObjects(std::string(OCC_NAME) + std::to_string(id));
                }
                statusObjCreated = true;
                waitingForAllOccActiveSensors = true;

                // Find/update the processor path associated with each OCC
                for (auto& obj : statusObjects)
                {
                    obj->updateProcAssociation();
                }
            }
        }

        if (statusObjCreated && waitingForAllOccActiveSensors)
        {
            // Only trace the "waiting"/"running" transition once each way
            static bool tracedHostWait = false;
            if (utils::isHostRunning())
            {
                if (tracedHostWait)
                {
                    log<level::INFO>(
                        "Manager::findAndCreateObjects(): Host is running");
                    tracedHostWait = false;
                }
                checkAllActiveSensors();
            }
            else
            {
                if (!tracedHostWait)
                {
                    log<level::INFO>(
                        "Manager::findAndCreateObjects(): Waiting for host to start");
                    tracedHostWait = true;
                }
                discoverTimer->restartOnce(30s);
            }
        }
    }
    else
    {
        log<level::INFO>(
            std::format(
                "Manager::findAndCreateObjects(): Waiting for {} to complete...",
                HOST_ON_FILE)
                .c_str());
        discoverTimer->restartOnce(10s);
    }
#endif
}

#ifdef POWER10
/**
 * @brief Check if all occActive sensors are available.
 *
 * Walks the status objects; an OCC that is neither active nor has had its
 * PLDM sensor received is marked active if its instance was queued in
 * queuedActiveState, otherwise the scan stops at that OCC and the discovery
 * timer is re-armed. When every OCC is accounted for, the discovery timer is
 * disabled and the queue is cleared.
 */
void Manager::checkAllActiveSensors()
{
    static bool allActiveSensorAvailable = false;
    static bool tracedSensorWait = false;
    static bool waitingForHost = false;

    if (open_power::occ::utils::isHostRunning())
    {
        if (waitingForHost)
        {
            waitingForHost = false;
            log<level::INFO>("checkAllActiveSensors(): Host is now running");
        }

        // Start with the assumption that all are available
        allActiveSensorAvailable = true;
        for (auto& obj : statusObjects)
        {
            if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
            {
                auto instance = obj->getOccInstanceID();
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "checkAllActiveSensors(): OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    allActiveSensorAvailable = false;
                    if (!tracedSensorWait)
                    {
                        log<level::INFO>(
                            std::format(
                                "checkAllActiveSensors(): Waiting on OCC{} Active sensor",
                                instance)
                                .c_str());
                        tracedSensorWait = true;
                    }
#ifdef PLDM
                    pldmHandle->checkActiveSensor(obj->getOccInstanceID());
#endif
                    break;
                }
            }
        }
    }
    else
    {
        if (!waitingForHost)
        {
            waitingForHost = true;
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for host to start");
        }
    }

    if (allActiveSensorAvailable)
    {
        // All sensors were found, disable the discovery timer
        if (discoverTimer->isEnabled())
        {
            discoverTimer->setEnabled(false);
        }

        if (waitingForAllOccActiveSensors)
        {
            log<level::INFO>(
                "checkAllActiveSensors(): OCC Active sensors are available");
            waitingForAllOccActiveSensors = false;
        }
        queuedActiveState.clear();
        tracedSensorWait = false;
    }
    else
    {
        // Not all sensors were available, so keep waiting
        if (!tracedSensorWait)
        {
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
            tracedSensorWait = true;
        }
        discoverTimer->restartOnce(10s);
    }
}
#endif

/**
 * @brief Scan /dev for entries whose name ends in "occ<N>".
 *
 * @return the zero-based OCC instance numbers found (N - 1), in directory
 *         iteration order (i.e. unsorted)
 */
std::vector<int> Manager::findOCCsInDev()
{
    std::vector<int> occs;
    std::regex expr{R"(occ(\d+)$)"};

    for (auto& file : fs::directory_iterator("/dev"))
    {
        std::smatch match;
        std::string path{file.path().string()};
        if (std::regex_search(path, match, expr))
        {
            auto num = std::stoi(match[1].str());

            // /dev numbering starts at 1, ours starts at 0.
            occs.push_back(num - 1);
        }
    }

    return occs;
}

/**
 * @brief Callback that creates the OCC objects for a newly reported CPU.
 *
 * The object path is read from the message, and CPU_NAME in its last
 * component is replaced with OCC_NAME to form the OCC object name.
 *
 * @param[in] msg - message carrying the CPU object path
 * @return 0 always
 */
int Manager::cpuCreated(sdbusplus::message_t& msg)
{
    namespace fs = std::filesystem;

    sdbusplus::message::object_path o;
    msg.read(o);
    fs::path cpuPath(std::string(std::move(o)));

    auto name = cpuPath.filename().string();
    auto index = name.find(CPU_NAME);
    if (index == std::string::npos)
    {
        // replace() with npos would throw std::out_of_range; a path that
        // does not name a CPU cannot be mapped to an OCC, so ignore it.
        log<level::ERR>(
            std::format(
                "Manager::cpuCreated(): CPU name not found in object path component: {}",
                name)
                .c_str());
        return 0;
    }
    name.replace(index, std::strlen(CPU_NAME), OCC_NAME);

    createObjects(name);

    return 0;
}

/**
 * @brief Create the Status and PassThrough objects for one OCC.
 *
 * Also creates the PowerCap object on the first call, and when the new
 * status object reports itself as the master OCC, disables the poll timer
 * and (POWER10) records the master path on the PowerMode object.
 *
 * @param[in] occ - OCC object name, appended to OCC_CONTROL_ROOT
 */
void Manager::createObjects(const std::string& occ)
{
    auto path = fs::path(OCC_CONTROL_ROOT) / occ;

    statusObjects.emplace_back(std::make_unique<Status>(
        event, path.c_str(), *this,
#ifdef POWER10
        pmode,
#endif
        std::bind(std::mem_fn(&Manager::statusCallBack), this,
                  std::placeholders::_1, std::placeholders::_2)
#ifdef PLDM
            ,
        std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(),
                  std::placeholders::_1)
#endif
            ));

    // Create the power cap monitor object
    if (!pcap)
    {
        pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
            *statusObjects.back());
    }

    if (statusObjects.back()->isMasterOcc())
    {
        log<level::INFO>(
            std::format("Manager::createObjects(): OCC{} is the master",
                        statusObjects.back()->getOccInstanceID())
                .c_str());
        _pollTimer->setEnabled(false);

#ifdef POWER10
        // Set the master OCC on the PowerMode object
        pmode->setMasterOcc(path);
#endif
    }

    passThroughObjects.emplace_back(std::make_unique<PassThrough>(path.c_str()
#ifdef POWER10
                                                                      ,
                                                                  pmode
#endif
                                                                  ));
}

/**
 * @brief Callback invoked when an OCC changes between active and inactive.
 *
 * Maintains activeCount, starts/stops the poll timer and (POWER10) the
 * wait-for-all-OCCs timer, and validates the master OCC once every status
 * object is active.
 *
 * @param[in] instance - OCC instance that changed
 * @param[in] status   - true if the OCC went active, false if it went away
 */
void Manager::statusCallBack(instanceID instance, bool status)
{
    if (status == true)
    {
        // OCC went active
        ++activeCount;

#ifdef POWER10
        if (activeCount == 1)
        {
            // First OCC went active (allow some time for all OCCs to go active)
            waitForAllOccsTimer->restartOnce(60s);
        }
#endif

        if (activeCount == statusObjects.size())
        {
#ifdef POWER10
            // All OCCs are now running
            if (waitForAllOccsTimer->isEnabled())
            {
                // stop occ wait timer
                waitForAllOccsTimer->setEnabled(false);
            }
#endif

            // Verify master OCC and start presence monitor
            validateOccMaster();
        }

        // Start poll timer if not already started
        if (!_pollTimer->isEnabled())
        {
            log<level::INFO>(
                std::format("Manager: OCCs will be polled every {} seconds",
                            pollInterval)
                    .c_str());

            // Send poll and start OCC poll timer
            pollerTimerExpired();
        }
    }
    else
    {
        // OCC went away
        if (activeCount > 0)
        {
            --activeCount;
        }
        else
        {
            log<level::ERR>(
                std::format("OCC{} disabled, but currently no active OCCs",
                            instance)
                    .c_str());
        }

        if (activeCount == 0)
        {
            // No OCCs are running

            // Stop OCC poll timer
            if (_pollTimer->isEnabled())
            {
                log<level::INFO>(
                    "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
                _pollTimer->setEnabled(false);
            }

#ifdef POWER10
            // stop wait timer
            if (waitForAllOccsTimer->isEnabled())
            {
                waitForAllOccsTimer->setEnabled(false);
            }
#endif
        }
#ifdef READ_OCC_SENSORS
        // Clear OCC sensors
        setSensorValueToNaN(instance);
#endif
    }

#ifdef POWER10
    if (waitingForAllOccActiveSensors)
    {
        if (utils::isHostRunning())
        {
            checkAllActiveSensors();
        }
    }
#endif
}

#ifdef I2C_OCC
/**
 * @brief Create one Status object per OCC hwmon device found under DEV_PATH.
 *
 * The first status object is treated as the master OCC for the PowerCap
 * (and, with POWER10, PowerMode) objects.
 */
void Manager::initStatusObjects()
{
    // Make sure we have a valid path string
    static_assert(sizeof(DEV_PATH) != 0);

#ifdef POWER10
    // D-Bus path of the first device created below (the master OCC)
    fs::path masterPath;
#endif

    auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
    for (auto& name : deviceNames)
    {
        i2c_occ::i2cToDbus(name);
        name = std::string(OCC_NAME) + '_' + name;
        auto path = fs::path(OCC_CONTROL_ROOT) / name;
        statusObjects.emplace_back(
            std::make_unique<Status>(event, path.c_str(), *this));
#ifdef POWER10
        if (masterPath.empty())
        {
            masterPath = path;
        }
#endif
    }

    if (statusObjects.empty())
    {
        // Nothing was created, so there is no master OCC to dereference
        log<level::ERR>(
            "Manager::initStatusObjects(): No OCC hwmon devices found");
        return;
    }

    // The first device is master occ
    pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
        *statusObjects.front());
#ifdef POWER10
    // NOTE(review): this configuration (I2C_OCC with POWER10) previously
    // referenced the loop-local 'path' here; confirm the PowerMode and Status
    // constructor arguments are still valid for it.
    pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
                                                   powermode::PIPS_PATH);
    // Set the master OCC on the PowerMode object
    if (!masterPath.empty())
    {
        pmode->setMasterOcc(masterPath);
    }
#endif
}
#endif

#ifdef PLDM
/**
 * @brief Handle an SBE timeout by requesting an HRESET for an active OCC.
 *
 * Nothing is done when no status object matches the instance or the matching
 * OCC is not active.
 *
 * @param[in] instance - OCC instance whose SBE timed out
 */
void Manager::sbeTimeout(unsigned int instance)
{
    auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
                            [instance](const auto& obj) {
                                return instance == obj->getOccInstanceID();
                            });

    if (obj != statusObjects.end() && (*obj)->occActive())
    {
        log<level::INFO>(
            std::format("SBE timeout, requesting HRESET (OCC{})", instance)
                .c_str());

        setSBEState(instance, SBE_STATE_NOT_USABLE);

        pldmHandle->sendHRESET(instance);
    }
}

/**
 * @brief Apply an OCC active/inactive notification to its status object.
 *
 * If no status object exists yet for the instance, an "active" notification
 * is queued in queuedActiveState (and an "inactive" one removes it) so it can
 * be applied after discovery. An "active" notification that arrives while
 * the host is not running is rejected and discovery is re-armed.
 *
 * @param[in] instance - OCC instance the notification is for
 * @param[in] status   - true if the OCC is reported active
 * @return the result of Status::occActive(status) when applied, otherwise
 *         false
 */
bool Manager::updateOCCActive(instanceID instance, bool status)
{
    auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
                            [instance](const auto& obj) {
                                return instance == obj->getOccInstanceID();
                            });

    const bool hostRunning = open_power::occ::utils::isHostRunning();
    if (obj != statusObjects.end())
    {
        if (!hostRunning && (status == true))
        {
            log<level::WARNING>(
                std::format(
                    "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received",
                    instance, status)
                    .c_str());
            (*obj)->setPldmSensorReceived(false);
            if (!waitingForAllOccActiveSensors)
            {
                log<level::INFO>(
                    "updateOCCActive: Waiting for Host and all OCC Active Sensors");
                waitingForAllOccActiveSensors = true;
            }
            discoverTimer->restartOnce(30s);
            return false;
        }
        else
        {
            log<level::INFO>(std::format("updateOCCActive: OCC{} active={}",
                                         instance, status)
                                 .c_str());
            (*obj)->setPldmSensorReceived(true);
            return (*obj)->occActive(status);
        }
    }
    else
    {
        if (hostRunning)
        {
            log<level::WARNING>(
                std::format(
                    "updateOCCActive: No status object to update for OCC{} (active={})",
                    instance, status)
                    .c_str());
        }
        else
        {
            if (status == true)
            {
                log<level::WARNING>(
                    std::format(
                        "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})",
                        instance, status)
                        .c_str());
            }
        }
        if (status == true)
        {
            // OCC went active
            queuedActiveState.insert(instance);
        }
        else
        {
            auto match = queuedActiveState.find(instance);
            if (match != queuedActiveState.end())
            {
                // OCC was disabled
                queuedActiveState.erase(match);
            }
        }
        return false;
    }
}

/**
 * @brief Called upon pldm event to set the powermode Safe Mode state for the
 *        system, and to update the throttle status of every processor.
 *
 * @param[in] safeMode - true if the system is in safe mode
 */
void Manager::updateOccSafeMode(bool safeMode)
{
#ifdef POWER10
    pmode->updateDbusSafeMode(safeMode);
#endif
    // Update the processor throttle status on dbus
    for (auto& obj : statusObjects)
    {
        obj->updateThrottle(safeMode, THROTTLED_SAFE);
    }
}

/**
 * @brief Handle the result of an HRESET request.
 *
 * On success the SBE state is set to booted. On failure the SBE state is set
 * to failed and, if a dump is allowed, a PEL is created and an SBE dump is
 * requested over D-Bus.
 *
 * @param[in] instance - OCC instance the HRESET was issued for
 * @param[in] success  - true if the HRESET succeeded
 */
void Manager::sbeHRESETResult(instanceID instance, bool success)
{
    if (success)
    {
        log<level::INFO>(
            std::format("HRESET succeeded (OCC{})", instance).c_str());

        setSBEState(instance, SBE_STATE_BOOTED);

        return;
    }

    setSBEState(instance, SBE_STATE_FAILED);

    if (sbeCanDump(instance))
    {
        log<level::INFO>(
            std::format("HRESET failed (OCC{}), triggering SBE dump", instance)
                .c_str());

        auto& bus = utils::getBus();
        uint32_t src6 = instance << 16;
        uint32_t logId =
            FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
                            src6, "SBE command timeout");

        try
        {
            constexpr auto path = "/org/openpower/dump";
            constexpr auto interface = "xyz.openbmc_project.Dump.Create";
            constexpr auto function = "CreateDump";

            std::string service = utils::getService(path, interface);
            auto method = bus.new_method_call(service.c_str(), path, interface,
                                              function);

            std::map<std::string, std::variant<std::string, uint64_t>>
                createParams{
                    {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
                     uint64_t(logId)},
                    {"com.ibm.Dump.Create.CreateParameters.DumpType",
                     "com.ibm.Dump.Create.DumpType.SBE"},
                    {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
                     uint64_t(instance)},
                };

            method.append(createParams);

            auto response = bus.call(method);
        }
        catch (const sdbusplus::exception_t& e)
        {
            constexpr auto ERROR_DUMP_DISABLED =
                "xyz.openbmc_project.Dump.Create.Error.Disabled";
            // name() yields a C string: compare the characters, not the
            // pointers (a pointer compare would never match).
            const char* errName = e.name();
            if ((errName != nullptr) &&
                (std::strcmp(errName, ERROR_DUMP_DISABLED) == 0))
            {
                log<level::INFO>("Dump is disabled, skipping");
            }
            else
            {
                log<level::ERR>("Dump failed");
            }
        }
    }
}

/**
 * @brief Determine whether an SBE dump may be collected for an instance.
 *
 * @param[in] instance - OCC/processor instance
 * @return false only when the SBE reports that a dump is not allowed or that
 *         a vital attention is active; true otherwise, including when the
 *         pdbg target is missing or the query fails
 */
bool Manager::sbeCanDump(unsigned int instance)
{
    struct pdbg_target* proc = getPdbgTarget(instance);

    if (!proc)
    {
        // allow the dump in the error case
        return true;
    }

    try
    {
        if (!openpower::phal::sbe::isDumpAllowed(proc))
        {
            return false;
        }

        if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
        {
            return false;
        }
    }
    catch (const openpower::phal::exception::SbeError& e)
    {
        log<level::INFO>("Failed to query SBE state");
    }

    // allow the dump in the error case
    return true;
}

/**
 * @brief Set the SBE state of the processor matching an instance.
 *
 * Does nothing when no pdbg target is found; a failure to set the state is
 * logged and otherwise ignored.
 *
 * @param[in] instance - OCC/processor instance
 * @param[in] state    - SBE state to set
 */
void Manager::setSBEState(unsigned int instance, enum sbe_state state)
{
    struct pdbg_target* proc = getPdbgTarget(instance);

    if (!proc)
    {
        return;
    }

    try
    {
        openpower::phal::sbe::setState(proc, state);
    }
    catch (const openpower::phal::exception::SbeError& e)
    {
        log<level::ERR>("Failed to set SBE state");
    }
}

/**
 * @brief Look up the pdbg "proc" target whose index equals the instance.
 *
 * pdbg is initialized on first use.
 *
 * @param[in] instance - processor index to look for
 * @return the matching target, or nullptr if pdbg initialization failed or no
 *         target matched
 */
struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
{
    if (!pdbgInitialized)
    {
        try
        {
            openpower::phal::pdbg::init();
            pdbgInitialized = true;
        }
        catch (const openpower::phal::exception::PdbgError& e)
        {
            log<level::ERR>("pdbg initialization failed");
            return nullptr;
        }
    }

    struct pdbg_target* proc = nullptr;
    pdbg_for_each_class_target("proc", proc)
    {
        if (pdbg_target_index(proc) == instance)
        {
            return proc;
        }
    }

    log<level::ERR>("Failed to get pdbg target");
    return nullptr;
}
#endif

/**
 * @brief Poll every active OCC and re-arm the poll timer.
 *
 * Inactive OCCs are skipped (their sensors are set to NaN when sensor
 * reading is enabled). The timer is only restarted while at least one OCC is
 * active.
 */
void Manager::pollerTimerExpired()
{
    if (!_pollTimer)
    {
        log<level::ERR>(
            "Manager::pollerTimerExpired() ERROR: Timer not defined");
        return;
    }

    for (auto& obj : statusObjects)
    {
        if (!obj->occActive())
        {
            // OCC is not running yet
#ifdef READ_OCC_SENSORS
            auto id = obj->getOccInstanceID();
            setSensorValueToNaN(id);
#endif
            continue;
        }

        // Read sysfs to force kernel to poll OCC
        obj->readOccState();

#ifdef READ_OCC_SENSORS
        // Read occ sensor values
        getSensorValues(obj);
#endif
    }

    if (activeCount > 0)
    {
        // Restart OCC poll timer
        _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
    }
    else
    {
        // No OCCs running, so poll timer will not be restarted
        log<level::INFO>(
            std::format(
                "Manager::pollerTimerExpired: poll timer will not be restarted")
                .c_str());
    }
}

#ifdef READ_OCC_SENSORS
/**
 * @brief Read the temperature sensors of one OCC and publish them on D-Bus.
 *
 * @param[in] path        - hwmon directory holding the tempN_* files
 * @param[in] occInstance - OCC instance the sensors belong to
 */
void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
{
    // There may be more than one sensor with the same FRU type
    // and label so make two passes: the first to read the temps
    // from sysfs, and the second to put them on D-Bus after
    // resolving any conflicts.
    std::map<std::string, double> sensorData;

    std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
    for (auto& file : fs::directory_iterator(path))
    {
        if (!std::regex_search(file.path().string(), expr))
        {
            continue;
        }

        uint32_t labelValue{0};

        try
        {
            labelValue = readFile<uint32_t>(file.path());
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            file.path().string(), e.code().value())
                    .c_str());
            continue;
        }

        // Strip the trailing "label" to get the "<dir>/tempN_" prefix shared
        // by this sensor's other files.
        const std::string tempLabel{"label"};
        const std::string filePathString = file.path().string().substr(
            0, file.path().string().length() - tempLabel.length());

        uint32_t fruTypeValue{0};
        try
        {
            fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            filePathString + fruTypeSuffix, e.code().value())
                    .c_str());
            continue;
        }

        std::string sensorPath = OCC_SENSORS_ROOT +
                                 std::string("/temperature/");

        std::string dvfsTempPath;

        if (fruTypeValue == VRMVdd)
        {
            sensorPath.append("vrm_vdd" + std::to_string(occInstance) +
                              "_temp");
        }
        else if (fruTypeValue == processorIoRing)
        {
            sensorPath.append("proc" + std::to_string(occInstance) +
                              "_ioring_temp");
            dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
                           std::to_string(occInstance) + "_ioring_dvfs_temp";
        }
        else
        {
            // Label layout: top byte is the sensor type, low 16 bits are the
            // sensor instance.
            uint16_t type = (labelValue & 0xFF000000) >> 24;
            uint16_t instanceID = labelValue & 0x0000FFFF;

            if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
            {
                if (fruTypeValue == fruTypeNotAvailable)
                {
                    // Not all DIMM related temps are available to read
                    // (no _input file in this case)
                    continue;
                }
                auto iter = dimmTempSensorName.find(fruTypeValue);
                if (iter == dimmTempSensorName.end())
                {
                    log<level::ERR>(
                        std::format(
                            "readTempSensors: Fru type error! fruTypeValue = {}) ",
                            fruTypeValue)
                            .c_str());
                    continue;
                }

                sensorPath.append("dimm" + std::to_string(instanceID) +
                                  iter->second);

                dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
                               dimmDVFSSensorName.at(fruTypeValue);
            }
            else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
            {
                if (fruTypeValue == processorCore)
                {
                    // The OCC reports small core temps, of which there are
                    // two per big core.  All current P10 systems are in big
                    // core mode, so use a big core name.
                    uint16_t coreNum = instanceID / 2;
                    uint16_t tempNum = instanceID % 2;
                    sensorPath.append("proc" + std::to_string(occInstance) +
                                      "_core" + std::to_string(coreNum) + "_" +
                                      std::to_string(tempNum) + "_temp");

                    dvfsTempPath =
                        std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
                        std::to_string(occInstance) + "_core_dvfs_temp";
                }
                else
                {
                    continue;
                }
            }
            else
            {
                continue;
            }
        }

        // The dvfs temp file only needs to be read once per chip per type.
        if (!dvfsTempPath.empty() &&
            !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
        {
            try
            {
                auto dvfsValue = readFile<double>(filePathString + maxSuffix);

                dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
                    dvfsTempPath, dvfsValue * std::pow(10, -3));
            }
            catch (const std::system_error& e)
            {
                log<level::DEBUG>(
                    std::format(
                        "readTempSensors: Failed reading {}, errno = {}",
                        filePathString + maxSuffix, e.code().value())
                        .c_str());
            }
        }

        uint32_t faultValue{0};
        try
        {
            faultValue = readFile<uint32_t>(filePathString + faultSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            filePathString + faultSuffix, e.code().value())
                    .c_str());
            continue;
        }

        double tempValue{0};
        // NOTE: if OCC sends back 0xFF, kernel sets this fault value to 1.
        if (faultValue != 0)
        {
            tempValue = std::numeric_limits<double>::quiet_NaN();
        }
        else
        {
            // Read the temperature
            try
            {
                tempValue = readFile<double>(filePathString + inputSuffix);
            }
            catch (const std::system_error& e)
            {
                log<level::DEBUG>(
                    std::format(
                        "readTempSensors: Failed reading {}, errno = {}",
                        filePathString + inputSuffix, e.code().value())
                        .c_str());

                // if errno == EAGAIN(Resource temporarily unavailable) then set
                // temp to 0, to avoid using old temp, and affecting FAN
                // Control.
                if (e.code().value() == EAGAIN)
                {
                    tempValue = 0;
                }
                // else the errno would be something like
                //     EBADF(Bad file descriptor)
                // or ENOENT(No such file or directory)
                else
                {
                    continue;
                }
            }
        }

        // If this object path already has a value, only overwrite
        // it if the previous one was an NaN or a smaller value.
        auto existing = sensorData.find(sensorPath);
        if (existing != sensorData.end())
        {
            // Multiple sensors found for this FRU type
            if ((std::isnan(existing->second) && (tempValue == 0)) ||
                ((existing->second == 0) && std::isnan(tempValue)))
            {
                // One of the redundant sensors has failed (0xFF/nan), and the
                // other sensor has no reading (0), so set the FRU to NaN to
                // force fan increase
                tempValue = std::numeric_limits<double>::quiet_NaN();
                existing->second = tempValue;
            }
            if (std::isnan(existing->second) || (tempValue > existing->second))
            {
                existing->second = tempValue;
            }
        }
        else
        {
            // First sensor for this FRU type
            sensorData[sensorPath] = tempValue;
        }
    }

    // Now publish the values on D-Bus.
    for (const auto& [objectPath, value] : sensorData)
    {
        dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
                                                    value * std::pow(10, -3));

        dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
            objectPath, !std::isnan(value));

        if (existingSensors.find(objectPath) == existingSensors.end())
        {
            dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
                objectPath);
        }

        existingSensors[objectPath] = occInstance;
    }
}

/**
 * @brief Extract the function ID from a power sensor label.
 *
 * @param[in] value - label text, either "system" or
 *                    "<sensor id>_<function id>_<apss channel>"
 * @return "system", the middle field of the label, or std::nullopt when the
 *         label does not contain two underscores
 */
std::optional<std::string>
    Manager::getPowerLabelFunctionID(const std::string& value)
{
    // If the value is "system", then the FunctionID is "system".
    if (value == "system")
    {
        return value;
    }

    // If the value is not "system", then the label value have 3 numbers, of
    // which we only care about the middle one:
    // <sensor id>_<function id>_<apss channel>
    // eg: The value is "0_10_5" , then the FunctionID is "10".
    const auto firstSep = value.find("_");
    if (firstSep == std::string::npos)
    {
        return std::nullopt;
    }

    auto powerLabelValue = value.substr(firstSep + 1);

    const auto secondSep = powerLabelValue.find("_");
    if (secondSep == std::string::npos)
    {
        return std::nullopt;
    }

    return powerLabelValue.substr(0, secondSep);
}

/**
 * @brief Read the power sensors of one OCC and publish them on D-Bus.
 *
 * Only labels whose function ID is present in powerSensorName are published.
 *
 * @param[in] path - hwmon directory holding the powerN_* files
 * @param[in] id   - OCC instance the sensors belong to
 */
void Manager::readPowerSensors(const fs::path& path, uint32_t id)
{
    std::regex expr{"power\\d+_label$"}; // Example: power5_label
    for (auto& file : fs::directory_iterator(path))
    {
        if (!std::regex_search(file.path().string(), expr))
        {
            continue;
        }

        std::string labelValue;
        try
        {
            labelValue = readFile<std::string>(file.path());
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            file.path().string(), e.code().value())
                    .c_str());
            continue;
        }

        auto functionID = getPowerLabelFunctionID(labelValue);
        if (functionID == std::nullopt)
        {
            continue;
        }

        // Strip the trailing "label" to get the "<dir>/powerN_" prefix
        const std::string tempLabel{"label"};
        const std::string filePathString = file.path().string().substr(
            0, file.path().string().length() - tempLabel.length());

        std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");

        auto iter = powerSensorName.find(*functionID);
        if (iter == powerSensorName.end())
        {
            continue;
        }
        sensorPath.append(iter->second);

        double tempValue{0};

        try
        {
            tempValue = readFile<double>(filePathString + inputSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            filePathString + inputSuffix, e.code().value())
                    .c_str());
            continue;
        }

        dbus::OccDBusSensors::getOccDBus().setUnit(
            sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");

        // The raw reading is scaled by 10^-6 before being published as Watts
        dbus::OccDBusSensors::getOccDBus().setValue(
            sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3));

        dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
                                                                true);

        if (existingSensors.find(sensorPath) == existingSensors.end())
        {
            dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
                sensorPath);
        }

        existingSensors[sensorPath] = id;
    }
    return;
}

/**
 * @brief Set every known sensor of an OCC to NaN, leaving it marked
 *        operational.
 *
 * @param[in] id - OCC instance whose sensors are cleared
 */
void Manager::setSensorValueToNaN(uint32_t id) const
{
    for (const auto& [sensorPath, occId] : existingSensors)
    {
        if (occId == id)
        {
            dbus::OccDBusSensors::getOccDBus().setValue(
                sensorPath, std::numeric_limits<double>::quiet_NaN());

            dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
                                                                    true);
        }
    }
    return;
}

/**
 * @brief Set every known sensor of an OCC to NaN and mark it non-operational.
 *
 * @param[in] id - OCC instance whose sensors are marked non-functional
 */
void Manager::setSensorValueToNonFunctional(uint32_t id) const
{
    for (const auto& [sensorPath, occId] : existingSensors)
    {
        if (occId == id)
        {
            dbus::OccDBusSensors::getOccDBus().setValue(
                sensorPath, std::numeric_limits<double>::quiet_NaN());

            dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
                                                                    false);
        }
    }
    return;
}

/**
 * @brief Read the temperature (and, for the master OCC, power) sensors of
 *        one OCC.
 *
 * A missing hwmon path is logged once per OCC until the path reappears.
 *
 * @param[in] occ - status object of the OCC to read
 */
void Manager::getSensorValues(std::unique_ptr<Status>& occ)
{
    // One "already traced" flag per OCC instance
    constexpr size_t maxTracedOccs = 8;
    static bool tracedError[maxTracedOccs] = {0};
    const fs::path sensorPath = occ->getHwmonPath();
    const uint32_t id = occ->getOccInstanceID();

    // Never index past the flag table; an instance that does not fit is
    // simply traced every time instead of once.
    const bool trackable = (id < maxTracedOccs);

    if (fs::exists(sensorPath))
    {
        // Read temperature sensors
        readTempSensors(sensorPath, id);

        if (occ->isMasterOcc())
        {
            // Read power sensors
            readPowerSensors(sensorPath, id);
        }
        if (trackable)
        {
            tracedError[id] = false;
        }
    }
    else
    {
        if (!trackable || !tracedError[id])
        {
            log<level::ERR>(
                std::format(
                    "Manager::getSensorValues: OCC{} sensor path missing: {}",
                    id, sensorPath.c_str())
                    .c_str());
            if (trackable)
            {
                tracedError[id] = true;
            }
        }
    }

    return;
}
#endif

/**
 * @brief Read the altitude from DBus.
 *
 * A reading below 0xFFFF is clamped at 0 and rounded to the nearest meter;
 * any other reading leaves the stored altitude unchanged. A D-Bus failure
 * sets the altitude to 0xFFFF (not available). Errors are traced once until
 * a valid reading is seen again.
 */
void Manager::readAltitude()
{
    static bool traceAltitudeErr = true;

    utils::PropertyValue altitudeProperty{};
    try
    {
        altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
                                              ALTITUDE_PROP);
        auto sensorVal = std::get<double>(altitudeProperty);
        if (sensorVal < 0xFFFF)
        {
            if (sensorVal < 0)
            {
                altitude = 0;
            }
            else
            {
                // Round to nearest meter
                altitude = uint16_t(sensorVal + 0.5);
            }
            log<level::DEBUG>(std::format("readAltitude: sensor={} ({}m)",
                                          sensorVal, altitude)
                                  .c_str());
            traceAltitudeErr = true;
        }
        else
        {
            if (traceAltitudeErr)
            {
                traceAltitudeErr = false;
                log<level::DEBUG>(
                    std::format("Invalid altitude value: {}", sensorVal)
                        .c_str());
            }
        }
    }
    catch (const sdbusplus::exception_t& e)
    {
        if (traceAltitudeErr)
        {
            traceAltitudeErr = false;
            log<level::INFO>(
                std::format("Unable to read Altitude: {}", e.what()).c_str());
        }
        altitude = 0xFFFF; // not available
    }
}

/**
 * @brief Callback function when ambient temperature changes.
 *
 * The new temperature is clamped at 0 and rounded to the nearest degree C
 * (0xFF when the reading is NaN). When the rounded value differs from the
 * stored one, it is saved and (POWER10) sent with the altitude to every
 * active OCC.
 *
 * @param[in] msg - properties-changed message for the ambient sensor
 */
void Manager::ambientCallback(sdbusplus::message_t& msg)
{
    double currentTemp = 0;
    uint8_t truncatedTemp = 0xFF;
    std::string msgSensor;
    std::map<std::string, std::variant<double>> msgData;
    msg.read(msgSensor, msgData);

    auto valPropMap = msgData.find(AMBIENT_PROP);
    if (valPropMap == msgData.end())
    {
        log<level::DEBUG>("ambientCallback: Unknown ambient property changed");
        return;
    }
    currentTemp = std::get<double>(valPropMap->second);
    if (std::isnan(currentTemp))
    {
        truncatedTemp = 0xFF;
    }
    else
    {
        if (currentTemp < 0)
        {
            truncatedTemp = 0;
        }
        else
        {
            // Round to nearest degree C
            truncatedTemp = uint8_t(currentTemp + 0.5);
        }
    }

    // If ambient changes, notify OCCs
    if (truncatedTemp != ambient)
    {
        log<level::DEBUG>(
            std::format("ambientCallback: Ambient change from {} to {}C",
                        ambient, currentTemp)
                .c_str());

        ambient = truncatedTemp;
        if (altitude == 0xFFFF)
        {
            // No altitude yet, try reading again
            readAltitude();
        }

        log<level::DEBUG>(
            std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient,
                        altitude)
                .c_str());
#ifdef POWER10
        // Send ambient and altitude to all OCCs
        for (auto& obj : statusObjects)
        {
            if (obj->occActive())
            {
                obj->sendAmbient(ambient, altitude);
            }
        }
#endif // POWER10
    }
}

/**
 * @brief Return the current ambient and altitude readings.
 *
 * @param[out] ambientValid  - false when the stored ambient is 0xFF
 * @param[out] ambientTemp   - stored ambient temperature
 * @param[out] altitudeValue - stored altitude
 */
void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
                             uint16_t& altitudeValue) const
{
    ambientValid = true;
    ambientTemp = ambient;
    altitudeValue = altitude;

    if (ambient == 0xFF)
    {
        ambientValid = false;
    }
}

#ifdef POWER10
/**
 * @brief Called when waitForAllOccsTimer expires.
 *
 * After the first OCC goes active, this timer will be started (60 seconds).
 * A mismatch between active and expected OCCs is only logged; the master OCC
 * is validated either way.
 */
void Manager::occsNotAllRunning()
{
    if (activeCount != statusObjects.size())
    {
        // Not all OCCs went active
        log<level::WARNING>(
            std::format(
                "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
                activeCount, statusObjects.size())
                .c_str());
        // Procs may be garded, so may be expected
    }

    validateOccMaster();
}
#endif // POWER10

/**
 * @brief Verify single master OCC and start presence monitor.
 *
 * With POWER10, inactive OCCs are first given a chance to become active
 * (queued notification or a sensor re-check); validation is abandoned if the
 * host is not running. A second master, or no master at all, triggers a
 * device error (reset request).
 */
void Manager::validateOccMaster()
{
    int masterInstance = -1;
    for (auto& obj : statusObjects)
    {
        auto instance = obj->getOccInstanceID();
#ifdef POWER10
        if (!obj->occActive())
        {
            if (utils::isHostRunning())
            {
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "validateOccMaster: OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    // OCC does not appear to be active yet, check active sensor
#ifdef PLDM
                    pldmHandle->checkActiveSensor(instance);
#endif
                    if (obj->occActive())
                    {
                        log<level::INFO>(
                            std::format(
                                "validateOccMaster: OCC{} is ACTIVE after reading sensor",
                                instance)
                                .c_str());
                    }
                }
            }
            else
            {
                log<level::WARNING>(
                    std::format(
                        "validateOccMaster: HOST is not running (OCC{})",
                        instance)
                        .c_str());
                return;
            }
        }
#endif // POWER10

        if (obj->isMasterOcc())
        {
            obj->addPresenceWatchMaster();

            if (masterInstance == -1)
            {
                masterInstance = instance;
            }
            else
            {
                log<level::ERR>(
                    std::format(
                        "validateOccMaster: Multiple OCC masters! ({} and {})",
                        masterInstance, instance)
                        .c_str());
                // request reset
                obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
            }
        }
    }

    if (masterInstance < 0)
    {
        log<level::ERR>(
            std::format("validateOccMaster: Master OCC not found! (of {} OCCs)",
                        statusObjects.size())
                .c_str());
        // request reset (there is no object to report against when no
        // status objects exist)
        if (!statusObjects.empty())
        {
            statusObjects.front()->deviceError(
                Error::Descriptor(PRESENCE_ERROR_PATH));
        }
    }
    else
    {
        log<level::INFO>(
            std::format("validateOccMaster: OCC{} is master of {} OCCs",
                        masterInstance, activeCount)
                .c_str());
#ifdef POWER10
        pmode->updateDbusSafeMode(false);
#endif
    }
}

/**
 * @brief Refresh the power cap bounds, if the PowerCap object exists.
 */
void Manager::updatePcapBounds() const
{
    if (pcap)
    {
        pcap->updatePcapBounds();
    }
}

} // namespace occ
} // namespace open_power