1 #include "config.h" 2 3 #include "occ_manager.hpp" 4 5 #include "i2c_occ.hpp" 6 #include "occ_dbus.hpp" 7 #include "utils.hpp" 8 9 #include <phosphor-logging/elog-errors.hpp> 10 #include <phosphor-logging/log.hpp> 11 #include <xyz/openbmc_project/Common/error.hpp> 12 13 #include <chrono> 14 #include <cmath> 15 #include <filesystem> 16 #include <fstream> 17 #include <regex> 18 19 namespace open_power 20 { 21 namespace occ 22 { 23 24 constexpr uint32_t fruTypeNotAvailable = 0xFF; 25 constexpr auto fruTypeSuffix = "fru_type"; 26 constexpr auto faultSuffix = "fault"; 27 constexpr auto inputSuffix = "input"; 28 constexpr auto maxSuffix = "max"; 29 30 const auto HOST_ON_FILE = "/run/openbmc/host@0-on"; 31 32 using namespace phosphor::logging; 33 using namespace std::literals::chrono_literals; 34 35 template <typename T> 36 T readFile(const std::string& path) 37 { 38 std::ifstream ifs; 39 ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit | 40 std::ifstream::eofbit); 41 T data; 42 43 try 44 { 45 ifs.open(path); 46 ifs >> data; 47 ifs.close(); 48 } 49 catch (const std::exception& e) 50 { 51 auto err = errno; 52 throw std::system_error(err, std::generic_category()); 53 } 54 55 return data; 56 } 57 58 void Manager::findAndCreateObjects() 59 { 60 #ifndef POWER10 61 for (auto id = 0; id < MAX_CPUS; ++id) 62 { 63 // Create one occ per cpu 64 auto occ = std::string(OCC_NAME) + std::to_string(id); 65 createObjects(occ); 66 } 67 #else 68 if (!pmode) 69 { 70 // Create the power mode object 71 pmode = std::make_unique<powermode::PowerMode>( 72 *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event); 73 } 74 75 if (!fs::exists(HOST_ON_FILE)) 76 { 77 static bool statusObjCreated = false; 78 if (!statusObjCreated) 79 { 80 // Create the OCCs based on on the /dev/occX devices 81 auto occs = findOCCsInDev(); 82 83 if (occs.empty() || (prevOCCSearch.size() != occs.size())) 84 { 85 // Something changed or no OCCs yet, try again in 10s. 86 // Note on the first pass prevOCCSearch will be empty, 87 // so there will be at least one delay to give things 88 // a chance to settle. 89 prevOCCSearch = occs; 90 91 log<level::INFO>( 92 fmt::format( 93 "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})", 94 occs.size()) 95 .c_str()); 96 97 discoverTimer->restartOnce(10s); 98 } 99 else 100 { 101 // All OCCs appear to be available, create status objects 102 103 // createObjects requires OCC0 first. 104 std::sort(occs.begin(), occs.end()); 105 106 log<level::INFO>( 107 fmt::format( 108 "Manager::findAndCreateObjects(): Creating {} OCC Status Objects", 109 occs.size()) 110 .c_str()); 111 for (auto id : occs) 112 { 113 createObjects(std::string(OCC_NAME) + std::to_string(id)); 114 } 115 statusObjCreated = true; 116 waitingForAllOccActiveSensors = true; 117 118 // Find/update the processor path associated with each OCC 119 for (auto& obj : statusObjects) 120 { 121 obj->updateProcAssociation(); 122 } 123 } 124 } 125 126 if (statusObjCreated && waitingForAllOccActiveSensors) 127 { 128 static bool tracedHostWait = false; 129 if (utils::isHostRunning()) 130 { 131 if (tracedHostWait) 132 { 133 log<level::INFO>( 134 "Manager::findAndCreateObjects(): Host is running"); 135 tracedHostWait = false; 136 } 137 checkAllActiveSensors(); 138 } 139 else 140 { 141 if (!tracedHostWait) 142 { 143 log<level::INFO>( 144 "Manager::findAndCreateObjects(): Waiting for host to start"); 145 tracedHostWait = true; 146 } 147 discoverTimer->restartOnce(30s); 148 } 149 } 150 } 151 else 152 { 153 log<level::INFO>( 154 fmt::format( 155 "Manager::findAndCreateObjects(): Waiting for {} to complete...", 156 HOST_ON_FILE) 157 .c_str()); 158 discoverTimer->restartOnce(10s); 159 } 160 #endif 161 } 162 163 #ifdef POWER10 164 // Check if all occActive sensors are available 165 void Manager::checkAllActiveSensors() 166 { 167 static bool allActiveSensorAvailable = false; 168 static bool tracedSensorWait = false; 169 static bool waitingForHost = false; 170 171 if (open_power::occ::utils::isHostRunning()) 172 { 173 if (waitingForHost) 174 { 175 waitingForHost = false; 176 log<level::INFO>("checkAllActiveSensors(): Host is now running"); 177 } 178 179 // Start with the assumption that all are available 180 allActiveSensorAvailable = true; 181 for (auto& obj : statusObjects) 182 { 183 if ((!obj->occActive()) && (!obj->getPldmSensorReceived())) 184 { 185 auto instance = obj->getOccInstanceID(); 186 // Check if sensor was queued while waiting for discovery 187 auto match = queuedActiveState.find(instance); 188 if (match != queuedActiveState.end()) 189 { 190 queuedActiveState.erase(match); 191 log<level::INFO>( 192 fmt::format( 193 "checkAllActiveSensors(): OCC{} is ACTIVE (queued)", 194 instance) 195 .c_str()); 196 obj->occActive(true); 197 } 198 else 199 { 200 allActiveSensorAvailable = false; 201 if (!tracedSensorWait) 202 { 203 log<level::INFO>( 204 fmt::format( 205 "checkAllActiveSensors(): Waiting on OCC{} Active sensor", 206 instance) 207 .c_str()); 208 tracedSensorWait = true; 209 } 210 pldmHandle->checkActiveSensor(obj->getOccInstanceID()); 211 break; 212 } 213 } 214 } 215 } 216 else 217 { 218 if (!waitingForHost) 219 { 220 waitingForHost = true; 221 log<level::INFO>( 222 "checkAllActiveSensors(): Waiting for host to start"); 223 } 224 } 225 226 if (allActiveSensorAvailable) 227 { 228 // All sensors were found, disable the discovery timer 229 if (discoverTimer->isEnabled()) 230 { 231 discoverTimer->setEnabled(false); 232 } 233 234 if (waitingForAllOccActiveSensors) 235 { 236 log<level::INFO>( 237 "checkAllActiveSensors(): OCC Active sensors are available"); 238 waitingForAllOccActiveSensors = false; 239 } 240 queuedActiveState.clear(); 241 tracedSensorWait = false; 242 } 243 else 244 { 245 // Not all sensors were available, so keep waiting 246 if (!tracedSensorWait) 247 { 248 log<level::INFO>( 249 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available"); 250 tracedSensorWait = true; 251 } 252 discoverTimer->restartOnce(10s); 253 } 254 } 255 #endif 256 257 std::vector<int> Manager::findOCCsInDev() 258 { 259 std::vector<int> occs; 260 std::regex expr{R"(occ(\d+)$)"}; 261 262 for (auto& file : fs::directory_iterator("/dev")) 263 { 264 std::smatch match; 265 std::string path{file.path().string()}; 266 if (std::regex_search(path, match, expr)) 267 { 268 auto num = std::stoi(match[1].str()); 269 270 // /dev numbering starts at 1, ours starts at 0. 271 occs.push_back(num - 1); 272 } 273 } 274 275 return occs; 276 } 277 278 int Manager::cpuCreated(sdbusplus::message_t& msg) 279 { 280 namespace fs = std::filesystem; 281 282 sdbusplus::message::object_path o; 283 msg.read(o); 284 fs::path cpuPath(std::string(std::move(o))); 285 286 auto name = cpuPath.filename().string(); 287 auto index = name.find(CPU_NAME); 288 name.replace(index, std::strlen(CPU_NAME), OCC_NAME); 289 290 createObjects(name); 291 292 return 0; 293 } 294 295 void Manager::createObjects(const std::string& occ) 296 { 297 auto path = fs::path(OCC_CONTROL_ROOT) / occ; 298 299 statusObjects.emplace_back(std::make_unique<Status>( 300 event, path.c_str(), *this, 301 #ifdef POWER10 302 pmode, 303 #endif 304 std::bind(std::mem_fn(&Manager::statusCallBack), this, 305 std::placeholders::_1, std::placeholders::_2) 306 #ifdef PLDM 307 , 308 std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(), 309 std::placeholders::_1) 310 #endif 311 )); 312 313 // Create the power cap monitor object 314 if (!pcap) 315 { 316 pcap = std::make_unique<open_power::occ::powercap::PowerCap>( 317 *statusObjects.back()); 318 } 319 320 if (statusObjects.back()->isMasterOcc()) 321 { 322 log<level::INFO>( 323 fmt::format("Manager::createObjects(): OCC{} is the master", 324 statusObjects.back()->getOccInstanceID()) 325 .c_str()); 326 _pollTimer->setEnabled(false); 327 328 #ifdef POWER10 329 // Set the master OCC on the PowerMode object 330 pmode->setMasterOcc(path); 331 #endif 332 } 333 334 passThroughObjects.emplace_back(std::make_unique<PassThrough>(path.c_str() 335 #ifdef POWER10 336 , 337 pmode 338 #endif 339 )); 340 } 341 342 void Manager::statusCallBack(instanceID instance, bool status) 343 { 344 if (status == true) 345 { 346 // OCC went active 347 ++activeCount; 348 349 #ifdef POWER10 350 if (activeCount == 1) 351 { 352 // First OCC went active (allow some time for all OCCs to go active) 353 waitForAllOccsTimer->restartOnce(60s); 354 } 355 #endif 356 357 if (activeCount == statusObjects.size()) 358 { 359 #ifdef POWER10 360 // All OCCs are now running 361 if (waitForAllOccsTimer->isEnabled()) 362 { 363 // stop occ wait timer 364 waitForAllOccsTimer->setEnabled(false); 365 } 366 #endif 367 368 // Verify master OCC and start presence monitor 369 validateOccMaster(); 370 } 371 372 // Start poll timer if not already started 373 if (!_pollTimer->isEnabled()) 374 { 375 log<level::INFO>( 376 fmt::format("Manager: OCCs will be polled every {} seconds", 377 pollInterval) 378 .c_str()); 379 380 // Send poll and start OCC poll timer 381 pollerTimerExpired(); 382 } 383 } 384 else 385 { 386 // OCC went away 387 if (activeCount > 0) 388 { 389 --activeCount; 390 } 391 else 392 { 393 log<level::ERR>( 394 fmt::format("OCC{} disabled, but currently no active OCCs", 395 instance) 396 .c_str()); 397 } 398 399 if (activeCount == 0) 400 { 401 // No OCCs are running 402 403 // Stop OCC poll timer 404 if (_pollTimer->isEnabled()) 405 { 406 log<level::INFO>( 407 "Manager::statusCallBack(): OCCs are not running, stopping poll timer"); 408 _pollTimer->setEnabled(false); 409 } 410 411 #ifdef POWER10 412 // stop wait timer 413 if (waitForAllOccsTimer->isEnabled()) 414 { 415 waitForAllOccsTimer->setEnabled(false); 416 } 417 #endif 418 } 419 #ifdef READ_OCC_SENSORS 420 // Clear OCC sensors 421 setSensorValueToNaN(instance); 422 #endif 423 } 424 425 #ifdef POWER10 426 if (waitingForAllOccActiveSensors) 427 { 428 if (utils::isHostRunning()) 429 { 430 checkAllActiveSensors(); 431 } 432 } 433 #endif 434 } 435 436 #ifdef I2C_OCC 437 void Manager::initStatusObjects() 438 { 439 // Make sure we have a valid path string 440 static_assert(sizeof(DEV_PATH) != 0); 441 442 auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH); 443 for (auto& name : deviceNames) 444 { 445 i2c_occ::i2cToDbus(name); 446 name = std::string(OCC_NAME) + '_' + name; 447 auto path = fs::path(OCC_CONTROL_ROOT) / name; 448 statusObjects.emplace_back( 449 std::make_unique<Status>(event, path.c_str(), *this)); 450 } 451 // The first device is master occ 452 pcap = std::make_unique<open_power::occ::powercap::PowerCap>( 453 *statusObjects.front()); 454 #ifdef POWER10 455 pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH, 456 powermode::PIPS_PATH); 457 // Set the master OCC on the PowerMode object 458 pmode->setMasterOcc(path); 459 #endif 460 } 461 #endif 462 463 #ifdef PLDM 464 void Manager::sbeTimeout(unsigned int instance) 465 { 466 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 467 [instance](const auto& obj) { 468 return instance == obj->getOccInstanceID(); 469 }); 470 471 if (obj != statusObjects.end() && (*obj)->occActive()) 472 { 473 log<level::INFO>( 474 fmt::format("SBE timeout, requesting HRESET (OCC{})", instance) 475 .c_str()); 476 477 setSBEState(instance, SBE_STATE_NOT_USABLE); 478 479 pldmHandle->sendHRESET(instance); 480 } 481 } 482 483 bool Manager::updateOCCActive(instanceID instance, bool status) 484 { 485 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 486 [instance](const auto& obj) { 487 return instance == obj->getOccInstanceID(); 488 }); 489 490 const bool hostRunning = open_power::occ::utils::isHostRunning(); 491 if (obj != statusObjects.end()) 492 { 493 if (!hostRunning && (status == true)) 494 { 495 log<level::WARNING>( 496 fmt::format( 497 "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received", 498 instance, status) 499 .c_str()); 500 (*obj)->setPldmSensorReceived(false); 501 if (!waitingForAllOccActiveSensors) 502 { 503 log<level::INFO>( 504 "updateOCCActive: Waiting for Host and all OCC Active Sensors"); 505 waitingForAllOccActiveSensors = true; 506 } 507 discoverTimer->restartOnce(30s); 508 return false; 509 } 510 else 511 { 512 log<level::INFO>(fmt::format("updateOCCActive: OCC{} active={}", 513 instance, status) 514 .c_str()); 515 (*obj)->setPldmSensorReceived(true); 516 return (*obj)->occActive(status); 517 } 518 } 519 else 520 { 521 if (hostRunning) 522 { 523 log<level::WARNING>( 524 fmt::format( 525 "updateOCCActive: No status object to update for OCC{} (active={})", 526 instance, status) 527 .c_str()); 528 } 529 else 530 { 531 if (status == true) 532 { 533 log<level::WARNING>( 534 fmt::format( 535 "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})", 536 instance, status) 537 .c_str()); 538 } 539 } 540 if (status == true) 541 { 542 // OCC went active 543 queuedActiveState.insert(instance); 544 } 545 else 546 { 547 auto match = queuedActiveState.find(instance); 548 if (match != queuedActiveState.end()) 549 { 550 // OCC was disabled 551 queuedActiveState.erase(match); 552 } 553 } 554 return false; 555 } 556 } 557 558 // Called upon pldm event To set powermode Safe Mode State for system. 559 void Manager::updateOccSafeMode(bool safeMode) 560 { 561 #ifdef POWER10 562 pmode->updateDbusSafeMode(safeMode); 563 #endif 564 // Update the processor throttle status on dbus 565 for (auto& obj : statusObjects) 566 { 567 obj->updateThrottle(safeMode, THROTTLED_SAFE); 568 } 569 } 570 571 void Manager::sbeHRESETResult(instanceID instance, bool success) 572 { 573 if (success) 574 { 575 log<level::INFO>( 576 fmt::format("HRESET succeeded (OCC{})", instance).c_str()); 577 578 setSBEState(instance, SBE_STATE_BOOTED); 579 580 return; 581 } 582 583 setSBEState(instance, SBE_STATE_FAILED); 584 585 if (sbeCanDump(instance)) 586 { 587 log<level::INFO>( 588 fmt::format("HRESET failed (OCC{}), triggering SBE dump", instance) 589 .c_str()); 590 591 auto& bus = utils::getBus(); 592 uint32_t src6 = instance << 16; 593 uint32_t logId = 594 FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout", 595 src6, "SBE command timeout"); 596 597 try 598 { 599 constexpr auto path = "/org/openpower/dump"; 600 constexpr auto interface = "xyz.openbmc_project.Dump.Create"; 601 constexpr auto function = "CreateDump"; 602 603 std::string service = utils::getService(path, interface); 604 auto method = bus.new_method_call(service.c_str(), path, interface, 605 function); 606 607 std::map<std::string, std::variant<std::string, uint64_t>> 608 createParams{ 609 {"com.ibm.Dump.Create.CreateParameters.ErrorLogId", 610 uint64_t(logId)}, 611 {"com.ibm.Dump.Create.CreateParameters.DumpType", 612 "com.ibm.Dump.Create.DumpType.SBE"}, 613 {"com.ibm.Dump.Create.CreateParameters.FailingUnitId", 614 uint64_t(instance)}, 615 }; 616 617 method.append(createParams); 618 619 auto response = bus.call(method); 620 } 621 catch (const sdbusplus::exception_t& e) 622 { 623 constexpr auto ERROR_DUMP_DISABLED = 624 "xyz.openbmc_project.Dump.Create.Error.Disabled"; 625 if (e.name() == ERROR_DUMP_DISABLED) 626 { 627 log<level::INFO>("Dump is disabled, skipping"); 628 } 629 else 630 { 631 log<level::ERR>("Dump failed"); 632 } 633 } 634 } 635 } 636 637 bool Manager::sbeCanDump(unsigned int instance) 638 { 639 struct pdbg_target* proc = getPdbgTarget(instance); 640 641 if (!proc) 642 { 643 // allow the dump in the error case 644 return true; 645 } 646 647 try 648 { 649 if (!openpower::phal::sbe::isDumpAllowed(proc)) 650 { 651 return false; 652 } 653 654 if (openpower::phal::pdbg::isSbeVitalAttnActive(proc)) 655 { 656 return false; 657 } 658 } 659 catch (openpower::phal::exception::SbeError& e) 660 { 661 log<level::INFO>("Failed to query SBE state"); 662 } 663 664 // allow the dump in the error case 665 return true; 666 } 667 668 void Manager::setSBEState(unsigned int instance, enum sbe_state state) 669 { 670 struct pdbg_target* proc = getPdbgTarget(instance); 671 672 if (!proc) 673 { 674 return; 675 } 676 677 try 678 { 679 openpower::phal::sbe::setState(proc, state); 680 } 681 catch (const openpower::phal::exception::SbeError& e) 682 { 683 log<level::ERR>("Failed to set SBE state"); 684 } 685 } 686 687 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance) 688 { 689 if (!pdbgInitialized) 690 { 691 try 692 { 693 openpower::phal::pdbg::init(); 694 pdbgInitialized = true; 695 } 696 catch (const openpower::phal::exception::PdbgError& e) 697 { 698 log<level::ERR>("pdbg initialization failed"); 699 return nullptr; 700 } 701 } 702 703 struct pdbg_target* proc = nullptr; 704 pdbg_for_each_class_target("proc", proc) 705 { 706 if (pdbg_target_index(proc) == instance) 707 { 708 return proc; 709 } 710 } 711 712 log<level::ERR>("Failed to get pdbg target"); 713 return nullptr; 714 } 715 #endif 716 717 void Manager::pollerTimerExpired() 718 { 719 if (!_pollTimer) 720 { 721 log<level::ERR>( 722 "Manager::pollerTimerExpired() ERROR: Timer not defined"); 723 return; 724 } 725 726 for (auto& obj : statusObjects) 727 { 728 if (!obj->occActive()) 729 { 730 // OCC is not running yet 731 #ifdef READ_OCC_SENSORS 732 auto id = obj->getOccInstanceID(); 733 setSensorValueToNaN(id); 734 #endif 735 continue; 736 } 737 738 // Read sysfs to force kernel to poll OCC 739 obj->readOccState(); 740 741 #ifdef READ_OCC_SENSORS 742 // Read occ sensor values 743 getSensorValues(obj); 744 #endif 745 } 746 747 if (activeCount > 0) 748 { 749 // Restart OCC poll timer 750 _pollTimer->restartOnce(std::chrono::seconds(pollInterval)); 751 } 752 else 753 { 754 // No OCCs running, so poll timer will not be restarted 755 log<level::INFO>( 756 fmt::format( 757 "Manager::pollerTimerExpired: poll timer will not be restarted") 758 .c_str()); 759 } 760 } 761 762 #ifdef READ_OCC_SENSORS 763 void Manager::readTempSensors(const fs::path& path, uint32_t id) 764 { 765 // There may be more than one sensor with the same FRU type 766 // and label so make two passes: the first to read the temps 767 // from sysfs, and the second to put them on D-Bus after 768 // resolving any conflicts. 769 std::map<std::string, double> sensorData; 770 771 std::regex expr{"temp\\d+_label$"}; // Example: temp5_label 772 for (auto& file : fs::directory_iterator(path)) 773 { 774 if (!std::regex_search(file.path().string(), expr)) 775 { 776 continue; 777 } 778 779 uint32_t labelValue{0}; 780 781 try 782 { 783 labelValue = readFile<uint32_t>(file.path()); 784 } 785 catch (const std::system_error& e) 786 { 787 log<level::DEBUG>( 788 fmt::format("readTempSensors: Failed reading {}, errno = {}", 789 file.path().string(), e.code().value()) 790 .c_str()); 791 continue; 792 } 793 794 const std::string& tempLabel = "label"; 795 const std::string filePathString = file.path().string().substr( 796 0, file.path().string().length() - tempLabel.length()); 797 798 uint32_t fruTypeValue{0}; 799 try 800 { 801 fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix); 802 } 803 catch (const std::system_error& e) 804 { 805 log<level::DEBUG>( 806 fmt::format("readTempSensors: Failed reading {}, errno = {}", 807 filePathString + fruTypeSuffix, e.code().value()) 808 .c_str()); 809 continue; 810 } 811 812 std::string sensorPath = OCC_SENSORS_ROOT + 813 std::string("/temperature/"); 814 815 std::string dvfsTempPath; 816 817 if (fruTypeValue == VRMVdd) 818 { 819 sensorPath.append("vrm_vdd" + std::to_string(id) + "_temp"); 820 } 821 else if (fruTypeValue == processorIoRing) 822 { 823 sensorPath.append("proc" + std::to_string(id) + "_ioring_temp"); 824 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" + 825 std::to_string(id) + "_ioring_dvfs_temp"; 826 } 827 else 828 { 829 uint16_t type = (labelValue & 0xFF000000) >> 24; 830 uint16_t instanceID = labelValue & 0x0000FFFF; 831 832 if (type == OCC_DIMM_TEMP_SENSOR_TYPE) 833 { 834 if (fruTypeValue == fruTypeNotAvailable) 835 { 836 // Not all DIMM related temps are available to read 837 // (no _input file in this case) 838 continue; 839 } 840 auto iter = dimmTempSensorName.find(fruTypeValue); 841 if (iter == dimmTempSensorName.end()) 842 { 843 log<level::ERR>( 844 fmt::format( 845 "readTempSensors: Fru type error! fruTypeValue = {}) ", 846 fruTypeValue) 847 .c_str()); 848 continue; 849 } 850 851 sensorPath.append("dimm" + std::to_string(instanceID) + 852 iter->second); 853 854 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" + 855 dimmDVFSSensorName.at(fruTypeValue); 856 } 857 else if (type == OCC_CPU_TEMP_SENSOR_TYPE) 858 { 859 if (fruTypeValue == processorCore) 860 { 861 // The OCC reports small core temps, of which there are 862 // two per big core. All current P10 systems are in big 863 // core mode, so use a big core name. 864 uint16_t coreNum = instanceID / 2; 865 uint16_t tempNum = instanceID % 2; 866 sensorPath.append("proc" + std::to_string(id) + "_core" + 867 std::to_string(coreNum) + "_" + 868 std::to_string(tempNum) + "_temp"); 869 870 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + 871 "/temperature/proc" + std::to_string(id) + 872 "_core_dvfs_temp"; 873 } 874 else 875 { 876 continue; 877 } 878 } 879 else 880 { 881 continue; 882 } 883 } 884 885 // The dvfs temp file only needs to be read once per chip per type. 886 if (!dvfsTempPath.empty() && 887 !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath)) 888 { 889 try 890 { 891 auto dvfsValue = readFile<double>(filePathString + maxSuffix); 892 893 dbus::OccDBusSensors::getOccDBus().setDvfsTemp( 894 dvfsTempPath, dvfsValue * std::pow(10, -3)); 895 } 896 catch (const std::system_error& e) 897 { 898 log<level::DEBUG>( 899 fmt::format( 900 "readTempSensors: Failed reading {}, errno = {}", 901 filePathString + maxSuffix, e.code().value()) 902 .c_str()); 903 } 904 } 905 906 uint32_t faultValue{0}; 907 try 908 { 909 faultValue = readFile<uint32_t>(filePathString + faultSuffix); 910 } 911 catch (const std::system_error& e) 912 { 913 log<level::DEBUG>( 914 fmt::format("readTempSensors: Failed reading {}, errno = {}", 915 filePathString + faultSuffix, e.code().value()) 916 .c_str()); 917 continue; 918 } 919 920 // NOTE: if OCC sends back 0xFF kernal sets this fault value to 1. 921 if (faultValue != 0) 922 { 923 // For cases when there are multiple readings per fru type/label, 924 // don't overwrite a good value with an NaN. 925 if (!sensorData.contains(sensorPath)) 926 { 927 sensorData[sensorPath] = 928 std::numeric_limits<double>::quiet_NaN(); 929 } 930 continue; 931 } 932 933 double tempValue{0}; 934 935 try 936 { 937 tempValue = readFile<double>(filePathString + inputSuffix); 938 } 939 catch (const std::system_error& e) 940 { 941 log<level::DEBUG>( 942 fmt::format("readTempSensors: Failed reading {}, errno = {}", 943 filePathString + inputSuffix, e.code().value()) 944 .c_str()); 945 946 // if errno == EAGAIN(Resource temporarily unavailable) then set 947 // temp to 0, to avoid using old temp, and affecting FAN Control. 948 if (e.code().value() == EAGAIN) 949 { 950 tempValue = 0; 951 } 952 // else the errno would be something like 953 // EBADF(Bad file descriptor) 954 // or ENOENT(No such file or directory) 955 else 956 { 957 continue; 958 } 959 } 960 961 // If this object path already has a value, only overwite 962 // it if the previous one was an NaN or a smaller value. 963 auto existing = sensorData.find(sensorPath); 964 if (existing != sensorData.end()) 965 { 966 if (std::isnan(existing->second) || (tempValue > existing->second)) 967 { 968 existing->second = tempValue; 969 } 970 } 971 else 972 { 973 sensorData[sensorPath] = tempValue; 974 } 975 } 976 977 // Now publish the values on D-Bus. 978 for (const auto& [objectPath, value] : sensorData) 979 { 980 dbus::OccDBusSensors::getOccDBus().setValue(objectPath, 981 value * std::pow(10, -3)); 982 983 dbus::OccDBusSensors::getOccDBus().setOperationalStatus( 984 objectPath, !std::isnan(value)); 985 986 if (existingSensors.find(objectPath) == existingSensors.end()) 987 { 988 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 989 objectPath); 990 } 991 992 existingSensors[objectPath] = id; 993 } 994 } 995 996 std::optional<std::string> 997 Manager::getPowerLabelFunctionID(const std::string& value) 998 { 999 // If the value is "system", then the FunctionID is "system". 1000 if (value == "system") 1001 { 1002 return value; 1003 } 1004 1005 // If the value is not "system", then the label value have 3 numbers, of 1006 // which we only care about the middle one: 1007 // <sensor id>_<function id>_<apss channel> 1008 // eg: The value is "0_10_5" , then the FunctionID is "10". 1009 if (value.find("_") == std::string::npos) 1010 { 1011 return std::nullopt; 1012 } 1013 1014 auto powerLabelValue = value.substr((value.find("_") + 1)); 1015 1016 if (powerLabelValue.find("_") == std::string::npos) 1017 { 1018 return std::nullopt; 1019 } 1020 1021 return powerLabelValue.substr(0, powerLabelValue.find("_")); 1022 } 1023 1024 void Manager::readPowerSensors(const fs::path& path, uint32_t id) 1025 { 1026 std::regex expr{"power\\d+_label$"}; // Example: power5_label 1027 for (auto& file : fs::directory_iterator(path)) 1028 { 1029 if (!std::regex_search(file.path().string(), expr)) 1030 { 1031 continue; 1032 } 1033 1034 std::string labelValue; 1035 try 1036 { 1037 labelValue = readFile<std::string>(file.path()); 1038 } 1039 catch (const std::system_error& e) 1040 { 1041 log<level::DEBUG>( 1042 fmt::format("readPowerSensors: Failed reading {}, errno = {}", 1043 file.path().string(), e.code().value()) 1044 .c_str()); 1045 continue; 1046 } 1047 1048 auto functionID = getPowerLabelFunctionID(labelValue); 1049 if (functionID == std::nullopt) 1050 { 1051 continue; 1052 } 1053 1054 const std::string& tempLabel = "label"; 1055 const std::string filePathString = file.path().string().substr( 1056 0, file.path().string().length() - tempLabel.length()); 1057 1058 std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/"); 1059 1060 auto iter = powerSensorName.find(*functionID); 1061 if (iter == powerSensorName.end()) 1062 { 1063 continue; 1064 } 1065 sensorPath.append(iter->second); 1066 1067 double tempValue{0}; 1068 1069 try 1070 { 1071 tempValue = readFile<double>(filePathString + inputSuffix); 1072 } 1073 catch (const std::system_error& e) 1074 { 1075 log<level::DEBUG>( 1076 fmt::format("readPowerSensors: Failed reading {}, errno = {}", 1077 filePathString + inputSuffix, e.code().value()) 1078 .c_str()); 1079 continue; 1080 } 1081 1082 dbus::OccDBusSensors::getOccDBus().setUnit( 1083 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts"); 1084 1085 dbus::OccDBusSensors::getOccDBus().setValue( 1086 sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3)); 1087 1088 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1089 true); 1090 1091 if (existingSensors.find(sensorPath) == existingSensors.end()) 1092 { 1093 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 1094 sensorPath); 1095 } 1096 1097 existingSensors[sensorPath] = id; 1098 } 1099 return; 1100 } 1101 1102 void Manager::setSensorValueToNaN(uint32_t id) const 1103 { 1104 for (const auto& [sensorPath, occId] : existingSensors) 1105 { 1106 if (occId == id) 1107 { 1108 dbus::OccDBusSensors::getOccDBus().setValue( 1109 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1110 1111 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1112 true); 1113 } 1114 } 1115 return; 1116 } 1117 1118 void Manager::setSensorValueToNonFunctional(uint32_t id) const 1119 { 1120 for (const auto& [sensorPath, occId] : existingSensors) 1121 { 1122 if (occId == id) 1123 { 1124 dbus::OccDBusSensors::getOccDBus().setValue( 1125 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1126 1127 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1128 false); 1129 } 1130 } 1131 return; 1132 } 1133 1134 void Manager::getSensorValues(std::unique_ptr<Status>& occ) 1135 { 1136 static bool tracedError[8] = {0}; 1137 const fs::path sensorPath = occ->getHwmonPath(); 1138 const uint32_t id = occ->getOccInstanceID(); 1139 1140 if (fs::exists(sensorPath)) 1141 { 1142 // Read temperature sensors 1143 readTempSensors(sensorPath, id); 1144 1145 if (occ->isMasterOcc()) 1146 { 1147 // Read power sensors 1148 readPowerSensors(sensorPath, id); 1149 } 1150 tracedError[id] = false; 1151 } 1152 else 1153 { 1154 if (!tracedError[id]) 1155 { 1156 log<level::ERR>( 1157 fmt::format( 1158 "Manager::getSensorValues: OCC{} sensor path missing: {}", 1159 id, sensorPath.c_str()) 1160 .c_str()); 1161 tracedError[id] = true; 1162 } 1163 } 1164 1165 return; 1166 } 1167 #endif 1168 1169 // Read the altitude from DBus 1170 void Manager::readAltitude() 1171 { 1172 static bool traceAltitudeErr = true; 1173 1174 utils::PropertyValue altitudeProperty{}; 1175 try 1176 { 1177 altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE, 1178 ALTITUDE_PROP); 1179 auto sensorVal = std::get<double>(altitudeProperty); 1180 if (sensorVal < 0xFFFF) 1181 { 1182 if (sensorVal < 0) 1183 { 1184 altitude = 0; 1185 } 1186 else 1187 { 1188 // Round to nearest meter 1189 altitude = uint16_t(sensorVal + 0.5); 1190 } 1191 log<level::DEBUG>(fmt::format("readAltitude: sensor={} ({}m)", 1192 sensorVal, altitude) 1193 .c_str()); 1194 traceAltitudeErr = true; 1195 } 1196 else 1197 { 1198 if (traceAltitudeErr) 1199 { 1200 traceAltitudeErr = false; 1201 log<level::DEBUG>( 1202 fmt::format("Invalid altitude value: {}", sensorVal) 1203 .c_str()); 1204 } 1205 } 1206 } 1207 catch (const sdbusplus::exception_t& e) 1208 { 1209 if (traceAltitudeErr) 1210 { 1211 traceAltitudeErr = false; 1212 log<level::INFO>( 1213 fmt::format("Unable to read Altitude: {}", e.what()).c_str()); 1214 } 1215 altitude = 0xFFFF; // not available 1216 } 1217 } 1218 1219 // Callback function when ambient temperature changes 1220 void Manager::ambientCallback(sdbusplus::message_t& msg) 1221 { 1222 double currentTemp = 0; 1223 uint8_t truncatedTemp = 0xFF; 1224 std::string msgSensor; 1225 std::map<std::string, std::variant<double>> msgData; 1226 msg.read(msgSensor, msgData); 1227 1228 auto valPropMap = msgData.find(AMBIENT_PROP); 1229 if (valPropMap == msgData.end()) 1230 { 1231 log<level::DEBUG>("ambientCallback: Unknown ambient property changed"); 1232 return; 1233 } 1234 currentTemp = std::get<double>(valPropMap->second); 1235 if (std::isnan(currentTemp)) 1236 { 1237 truncatedTemp = 0xFF; 1238 } 1239 else 1240 { 1241 if (currentTemp < 0) 1242 { 1243 truncatedTemp = 0; 1244 } 1245 else 1246 { 1247 // Round to nearest degree C 1248 truncatedTemp = uint8_t(currentTemp + 0.5); 1249 } 1250 } 1251 1252 // If ambient changes, notify OCCs 1253 if (truncatedTemp != ambient) 1254 { 1255 log<level::DEBUG>( 1256 fmt::format("ambientCallback: Ambient change from {} to {}C", 1257 ambient, currentTemp) 1258 .c_str()); 1259 1260 ambient = truncatedTemp; 1261 if (altitude == 0xFFFF) 1262 { 1263 // No altitude yet, try reading again 1264 readAltitude(); 1265 } 1266 1267 log<level::DEBUG>( 1268 fmt::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient, 1269 altitude) 1270 .c_str()); 1271 #ifdef POWER10 1272 // Send ambient and altitude to all OCCs 1273 for (auto& obj : statusObjects) 1274 { 1275 if (obj->occActive()) 1276 { 1277 obj->sendAmbient(ambient, altitude); 1278 } 1279 } 1280 #endif // POWER10 1281 } 1282 } 1283 1284 // return the current ambient and altitude readings 1285 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp, 1286 uint16_t& altitudeValue) const 1287 { 1288 ambientValid = true; 1289 ambientTemp = ambient; 1290 altitudeValue = altitude; 1291 1292 if (ambient == 0xFF) 1293 { 1294 ambientValid = false; 1295 } 1296 } 1297 1298 #ifdef POWER10 1299 // Called when waitForAllOccsTimer expires 1300 // After the first OCC goes active, this timer will be started (60 seconds) 1301 void Manager::occsNotAllRunning() 1302 { 1303 if (activeCount != statusObjects.size()) 1304 { 1305 // Not all OCCs went active 1306 log<level::WARNING>( 1307 fmt::format( 1308 "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})", 1309 activeCount, statusObjects.size()) 1310 .c_str()); 1311 // Procs may be garded, so may be expected 1312 } 1313 1314 validateOccMaster(); 1315 } 1316 #endif // POWER10 1317 1318 // Verify single master OCC and start presence monitor 1319 void Manager::validateOccMaster() 1320 { 1321 int masterInstance = -1; 1322 for (auto& obj : statusObjects) 1323 { 1324 auto instance = obj->getOccInstanceID(); 1325 #ifdef POWER10 1326 if (!obj->occActive()) 1327 { 1328 if (utils::isHostRunning()) 1329 { 1330 // Check if sensor was queued while waiting for discovery 1331 auto match = queuedActiveState.find(instance); 1332 if (match != queuedActiveState.end()) 1333 { 1334 queuedActiveState.erase(match); 1335 log<level::INFO>( 1336 fmt::format( 1337 "validateOccMaster: OCC{} is ACTIVE (queued)", 1338 instance) 1339 .c_str()); 1340 obj->occActive(true); 1341 } 1342 else 1343 { 1344 // OCC does not appear to be active yet, check active sensor 1345 pldmHandle->checkActiveSensor(instance); 1346 if (obj->occActive()) 1347 { 1348 log<level::INFO>( 1349 fmt::format( 1350 "validateOccMaster: OCC{} is ACTIVE after reading sensor", 1351 instance) 1352 .c_str()); 1353 } 1354 } 1355 } 1356 else 1357 { 1358 log<level::WARNING>( 1359 fmt::format( 1360 "validateOccMaster: HOST is not running (OCC{})", 1361 instance) 1362 .c_str()); 1363 return; 1364 } 1365 } 1366 #endif // POWER10 1367 1368 if (obj->isMasterOcc()) 1369 { 1370 obj->addPresenceWatchMaster(); 1371 1372 if (masterInstance == -1) 1373 { 1374 masterInstance = instance; 1375 } 1376 else 1377 { 1378 log<level::ERR>( 1379 fmt::format( 1380 "validateOccMaster: Multiple OCC masters! ({} and {})", 1381 masterInstance, instance) 1382 .c_str()); 1383 // request reset 1384 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH)); 1385 } 1386 } 1387 } 1388 1389 if (masterInstance < 0) 1390 { 1391 log<level::ERR>( 1392 fmt::format("validateOccMaster: Master OCC not found! (of {} OCCs)", 1393 statusObjects.size()) 1394 .c_str()); 1395 // request reset 1396 statusObjects.front()->deviceError( 1397 Error::Descriptor(PRESENCE_ERROR_PATH)); 1398 } 1399 else 1400 { 1401 log<level::INFO>( 1402 fmt::format("validateOccMaster: OCC{} is master of {} OCCs", 1403 masterInstance, activeCount) 1404 .c_str()); 1405 #ifdef POWER10 1406 pmode->updateDbusSafeMode(false); 1407 #endif 1408 } 1409 } 1410 1411 void Manager::updatePcapBounds() const 1412 { 1413 if (pcap) 1414 { 1415 pcap->updatePcapBounds(); 1416 } 1417 } 1418 1419 } // namespace occ 1420 } // namespace open_power 1421