#include "config.h"

#include "occ_manager.hpp"

#include "i2c_occ.hpp"
#include "occ_dbus.hpp"
#include "utils.hpp"

#include <phosphor-logging/elog-errors.hpp>
#include <phosphor-logging/log.hpp>
#include <xyz/openbmc_project/Common/error.hpp>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstring>
#include <filesystem>
#include <format>
#include <fstream>
#include <limits>
#include <regex>

namespace open_power
{
namespace occ
{

constexpr uint32_t fruTypeNotAvailable = 0xFF;
constexpr auto fruTypeSuffix = "fru_type";
constexpr auto faultSuffix = "fault";
constexpr auto inputSuffix = "input";
constexpr auto maxSuffix = "max";

const auto HOST_ON_FILE = "/run/openbmc/host@0-on";

using namespace phosphor::logging;
using namespace std::literals::chrono_literals;

/**
 * @brief Read one value of type T from a file using stream extraction.
 *
 * The stream is configured to throw on failbit, badbit and eofbit, so any
 * problem opening or parsing the file surfaces as an exception.
 *
 * @param[in] path - File to read
 *
 * @return The value extracted from the file
 *
 * @throws std::system_error carrying the errno observed at failure time
 */
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit |
                   std::ifstream::eofbit);
    T data;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception&)
    {
        // Translate the stream failure into a system_error so callers can
        // inspect the errno value.
        auto err = errno;
        throw std::system_error(err, std::generic_category());
    }

    return data;
}

/**
 * @brief Create the OCC objects.
 *
 * Without POWER10, one OCC object is created per CPU (MAX_CPUS of them).
 * With POWER10, the power mode object is created if needed, the /dev/occX
 * devices are discovered, and the discovery timer is re-armed until the OCC
 * list is stable and the OCC active sensors are available.
 */
void Manager::findAndCreateObjects()
{
#ifndef POWER10
    for (auto id = 0; id < MAX_CPUS; ++id)
    {
        // Create one occ per cpu
        auto occ = std::string(OCC_NAME) + std::to_string(id);
        createObjects(occ);
    }
#else
    if (!pmode)
    {
        // Create the power mode object
        pmode = std::make_unique<powermode::PowerMode>(
            *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
    }

    if (!fs::exists(HOST_ON_FILE))
    {
        static bool statusObjCreated = false;
        if (!statusObjCreated)
        {
            // Create the OCCs based on the /dev/occX devices
            auto occs = findOCCsInDev();

            if (occs.empty() || (prevOCCSearch.size() != occs.size()))
            {
                // Something changed or no OCCs yet, try again in 10s.
                // Note on the first pass prevOCCSearch will be empty,
                // so there will be at least one delay to give things
                // a chance to settle.
                prevOCCSearch = occs;

                log<level::INFO>(
                    std::format(
                        "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})",
                        occs.size())
                        .c_str());

                discoverTimer->restartOnce(10s);
            }
            else
            {
                // All OCCs appear to be available, create status objects

                // createObjects requires OCC0 first.
                std::sort(occs.begin(), occs.end());

                log<level::INFO>(
                    std::format(
                        "Manager::findAndCreateObjects(): Creating {} OCC Status Objects",
                        occs.size())
                        .c_str());
                for (auto id : occs)
                {
                    createObjects(std::string(OCC_NAME) + std::to_string(id));
                }
                statusObjCreated = true;
                waitingForAllOccActiveSensors = true;

                // Find/update the processor path associated with each OCC
                for (auto& obj : statusObjects)
                {
                    obj->updateProcAssociation();
                }
            }
        }

        if (statusObjCreated && waitingForAllOccActiveSensors)
        {
            static bool tracedHostWait = false;
            if (utils::isHostRunning())
            {
                if (tracedHostWait)
                {
                    log<level::INFO>(
                        "Manager::findAndCreateObjects(): Host is running");
                    tracedHostWait = false;
                }
                checkAllActiveSensors();
            }
            else
            {
                // Trace the wait only once until the host comes up
                if (!tracedHostWait)
                {
                    log<level::INFO>(
                        "Manager::findAndCreateObjects(): Waiting for host to start");
                    tracedHostWait = true;
                }
                discoverTimer->restartOnce(30s);
            }
        }
    }
    else
    {
        log<level::INFO>(
            std::format(
                "Manager::findAndCreateObjects(): Waiting for {} to complete...",
                HOST_ON_FILE)
                .c_str());
        discoverTimer->restartOnce(10s);
    }
#endif
}

#ifdef POWER10
/**
 * @brief Check if all occActive sensors are available.
 *
 * Walks the status objects while the host is running. An OCC that is not
 * active and has no PLDM sensor reading is either activated from the queued
 * state or causes the discovery timer to be restarted so the check is
 * repeated later.
 */
void Manager::checkAllActiveSensors()
{
    static bool allActiveSensorAvailable = false;
    static bool tracedSensorWait = false;
    static bool waitingForHost = false;

    if (open_power::occ::utils::isHostRunning())
    {
        if (waitingForHost)
        {
            waitingForHost = false;
            log<level::INFO>("checkAllActiveSensors(): Host is now running");
        }

        // Start with the assumption that all are available
        allActiveSensorAvailable = true;
        for (auto& obj : statusObjects)
        {
            if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
            {
                auto instance = obj->getOccInstanceID();
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "checkAllActiveSensors(): OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    allActiveSensorAvailable = false;
                    if (!tracedSensorWait)
                    {
                        log<level::INFO>(
                            std::format(
                                "checkAllActiveSensors(): Waiting on OCC{} Active sensor",
                                instance)
                                .c_str());
                        tracedSensorWait = true;
                        // Make sure traces are not throttled
#ifdef PLDM
                        pldmHandle->setTraceThrottle(false);
                        // Start timer to throttle pldm traces when timer
                        // expires
                        throttleTraceTimer->restartOnce(5min);
#endif
                    }
#ifdef PLDM
                    pldmHandle->checkActiveSensor(obj->getOccInstanceID());
#endif
                    // Stop at the first OCC that is still missing its sensor
                    break;
                }
            }
        }
    }
    else
    {
        if (!waitingForHost)
        {
            waitingForHost = true;
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for host to start");
        }
    }

    if (allActiveSensorAvailable)
    {
        // All sensors were found, disable the discovery timer
        if (discoverTimer->isEnabled())
        {
            discoverTimer->setEnabled(false);
        }
#ifdef PLDM
        if (throttleTraceTimer->isEnabled())
        {
            // Disable throttle timer and make sure traces are not throttled
            throttleTraceTimer->setEnabled(false);
            pldmHandle->setTraceThrottle(false);
        }
#endif

        if (waitingForAllOccActiveSensors)
        {
            log<level::INFO>(
                "checkAllActiveSensors(): OCC Active sensors are available");
            waitingForAllOccActiveSensors = false;
        }
        queuedActiveState.clear();
        tracedSensorWait = false;
    }
    else
    {
        // Not all sensors were available, so keep waiting
        if (!tracedSensorWait)
        {
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
            tracedSensorWait = true;
        }
        discoverTimer->restartOnce(10s);
    }
}
#endif

/**
 * @brief Find the OCC instances that have a /dev/occN device.
 *
 * @return Zero-based OCC instance numbers, in directory iteration order
 */
std::vector<int> Manager::findOCCsInDev()
{
    std::vector<int> occs;
    std::regex expr{R"(occ(\d+)$)"};

    for (auto& file : fs::directory_iterator("/dev"))
    {
        std::smatch match;
        std::string path{file.path().string()};
        if (std::regex_search(path, match, expr))
        {
            auto num = std::stoi(match[1].str());

            // /dev numbering starts at 1, ours starts at 0.
            occs.push_back(num - 1);
        }
    }

    return occs;
}

/**
 * @brief Handle a CPU object creation message by creating the matching OCC.
 *
 * The OCC name is the last component of the CPU object path with CPU_NAME
 * replaced by OCC_NAME.
 *
 * @param[in] msg - Message holding the created object path
 *
 * @return Always 0
 */
int Manager::cpuCreated(sdbusplus::message_t& msg)
{
    namespace fs = std::filesystem;

    sdbusplus::message::object_path o;
    msg.read(o);
    fs::path cpuPath(std::string(std::move(o)));

    auto name = cpuPath.filename().string();
    auto index = name.find(CPU_NAME);
    if (index == std::string::npos)
    {
        // replace() would throw std::out_of_range for npos, so ignore
        // objects whose name does not contain the CPU name.
        log<level::ERR>(
            std::format("Manager::cpuCreated(): Unexpected object name {}",
                        name)
                .c_str());
        return 0;
    }
    name.replace(index, std::strlen(CPU_NAME), OCC_NAME);

    createObjects(name);

    return 0;
}

/**
 * @brief Create the status and pass-through objects for one OCC.
 *
 * The power cap monitor is created along with the first status object.
 *
 * @param[in] occ - OCC name, appended to OCC_CONTROL_ROOT to form the path
 */
void Manager::createObjects(const std::string& occ)
{
    auto path = fs::path(OCC_CONTROL_ROOT) / occ;

    statusObjects.emplace_back(std::make_unique<Status>(
        event, path.c_str(), *this,
#ifdef POWER10
        pmode,
#endif
        std::bind(std::mem_fn(&Manager::statusCallBack), this,
                  std::placeholders::_1, std::placeholders::_2)
#ifdef PLDM
            ,
        std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(),
                  std::placeholders::_1)
#endif
            ));

    // Create the power cap monitor object
    if (!pcap)
    {
        pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
            *statusObjects.back());
    }

    if (statusObjects.back()->isMasterOcc())
    {
        log<level::INFO>(
            std::format("Manager::createObjects(): OCC{} is the master",
                        statusObjects.back()->getOccInstanceID())
                .c_str());
        _pollTimer->setEnabled(false);

#ifdef POWER10
        // Set the master OCC on the PowerMode object
        pmode->setMasterOcc(path);
#endif
    }

    passThroughObjects.emplace_back(std::make_unique<PassThrough>(path.c_str()
#ifdef POWER10
                                                                      ,
                                                                  pmode
#endif
                                                                  ));
}

/**
 * @brief Callback invoked when an OCC changes its active status.
 *
 * Maintains activeCount and starts or stops the poll timer (and, with
 * POWER10, the wait-for-all-OCCs timer) accordingly.
 *
 * @param[in] instance - OCC instance that changed
 * @param[in] status   - true if the OCC went active, false if it went away
 */
void Manager::statusCallBack(instanceID instance, bool status)
{
    if (status == true)
    {
        // OCC went active
        ++activeCount;

#ifdef POWER10
        if (activeCount == 1)
        {
            // First OCC went active (allow some time for all OCCs to go active)
            waitForAllOccsTimer->restartOnce(60s);
        }
#endif

        if (activeCount == statusObjects.size())
        {
#ifdef POWER10
            // All OCCs are now running
            if (waitForAllOccsTimer->isEnabled())
            {
                // stop occ wait timer
                waitForAllOccsTimer->setEnabled(false);
            }
#endif

            // Verify master OCC and start presence monitor
            validateOccMaster();
        }

        // Start poll timer if not already started
        if (!_pollTimer->isEnabled())
        {
            log<level::INFO>(
                std::format("Manager: OCCs will be polled every {} seconds",
                            pollInterval)
                    .c_str());

            // Send poll and start OCC poll timer
            pollerTimerExpired();
        }
    }
    else
    {
        // OCC went away
        if (activeCount > 0)
        {
            --activeCount;
        }
        else
        {
            log<level::ERR>(
                std::format("OCC{} disabled, but currently no active OCCs",
                            instance)
                    .c_str());
        }

        if (activeCount == 0)
        {
            // No OCCs are running

            // Stop OCC poll timer
            if (_pollTimer->isEnabled())
            {
                log<level::INFO>(
                    "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
                _pollTimer->setEnabled(false);
            }

#ifdef POWER10
            // stop wait timer
            if (waitForAllOccsTimer->isEnabled())
            {
                waitForAllOccsTimer->setEnabled(false);
            }
#endif
        }
#ifdef READ_OCC_SENSORS
        // Clear OCC sensors
        setSensorValueToNaN(instance);
#endif
    }

#ifdef POWER10
    if (waitingForAllOccActiveSensors)
    {
        if (utils::isHostRunning())
        {
            checkAllActiveSensors();
        }
    }
#endif
}

#ifdef I2C_OCC
/**
 * @brief Create a status object for every OCC hwmon device found under
 *        DEV_PATH; the first device is treated as the master OCC.
 */
void Manager::initStatusObjects()
{
    // Make sure we have a valid path string
    static_assert(sizeof(DEV_PATH) != 0);

    auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
    for (auto& name : deviceNames)
    {
        i2c_occ::i2cToDbus(name);
        name = std::string(OCC_NAME) + '_' + name;
        auto path = fs::path(OCC_CONTROL_ROOT) / name;
        statusObjects.emplace_back(
            std::make_unique<Status>(event, path.c_str(), *this));
    }

    if (statusObjects.empty())
    {
        // front() on an empty container is undefined behaviour; there is
        // nothing to monitor without at least one device.
        log<level::ERR>(
            "Manager::initStatusObjects(): No OCC hwmon devices found");
        return;
    }

    // The first device is master occ
    pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
        *statusObjects.front());
#ifdef POWER10
    pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
                                                   powermode::PIPS_PATH);
    // Set the master OCC on the PowerMode object. The per-device path is
    // local to the loop above, so rebuild it for the first (master) device;
    // deviceNames already holds the converted names.
    pmode->setMasterOcc(fs::path(OCC_CONTROL_ROOT) / deviceNames.front());
#endif
}
#endif

#ifdef PLDM
/**
 * @brief Handle an SBE timeout by requesting an HRESET if the OCC is active.
 *
 * @param[in] instance - OCC instance whose SBE timed out
 */
void Manager::sbeTimeout(unsigned int instance)
{
    auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
                            [instance](const auto& obj) {
                                return instance == obj->getOccInstanceID();
                            });

    if (obj != statusObjects.end() && (*obj)->occActive())
    {
        log<level::INFO>(
            std::format("SBE timeout, requesting HRESET (OCC{})", instance)
                .c_str());

        setSBEState(instance, SBE_STATE_NOT_USABLE);

        pldmHandle->sendHRESET(instance);
    }
}

/**
 * @brief Update the active state of an OCC.
 *
 * If no status object exists yet for the instance, the state is queued in
 * queuedActiveState instead. An "active" report received while the host is
 * not running is rejected.
 *
 * @param[in] instance - OCC instance being reported
 * @param[in] status   - true if the OCC is reported active
 *
 * @return Result of Status::occActive(status) when the state was applied,
 *         false otherwise
 */
bool Manager::updateOCCActive(instanceID instance, bool status)
{
    auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
                            [instance](const auto& obj) {
                                return instance == obj->getOccInstanceID();
                            });

    const bool hostRunning = open_power::occ::utils::isHostRunning();
    if (obj != statusObjects.end())
    {
        if (!hostRunning && (status == true))
        {
            log<level::WARNING>(
                std::format(
                    "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received",
                    instance, status)
                    .c_str());
            (*obj)->setPldmSensorReceived(false);
            if (!waitingForAllOccActiveSensors)
            {
                log<level::INFO>(
                    "updateOCCActive: Waiting for Host and all OCC Active Sensors");
                waitingForAllOccActiveSensors = true;
            }
#ifdef POWER10
            discoverTimer->restartOnce(30s);
#endif
            return false;
        }
        else
        {
            log<level::INFO>(std::format("updateOCCActive: OCC{} active={}",
                                         instance, status)
                                 .c_str());
            (*obj)->setPldmSensorReceived(true);
            return (*obj)->occActive(status);
        }
    }
    else
    {
        if (hostRunning)
        {
            log<level::WARNING>(
                std::format(
                    "updateOCCActive: No status object to update for OCC{} (active={})",
                    instance, status)
                    .c_str());
        }
        else
        {
            if (status == true)
            {
                log<level::WARNING>(
                    std::format(
                        "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})",
                        instance, status)
                        .c_str());
            }
        }
        if (status == true)
        {
            // OCC went active
            queuedActiveState.insert(instance);
        }
        else
        {
            auto match = queuedActiveState.find(instance);
            if (match != queuedActiveState.end())
            {
                // OCC was disabled
                queuedActiveState.erase(match);
            }
        }
        return false;
    }
}

// Called upon pldm event to set powermode Safe Mode State for system.
void Manager::updateOccSafeMode(bool safeMode)
{
#ifdef POWER10
    pmode->updateDbusSafeMode(safeMode);
#endif
    // Update the processor throttle status on dbus
    for (auto& obj : statusObjects)
    {
        obj->updateThrottle(safeMode, THROTTLED_SAFE);
    }
}

/**
 * @brief Process the result of an SBE HRESET request.
 *
 * On success the SBE state is set to booted. On failure the state is set to
 * failed and, if a dump is allowed, a PEL is created and an SBE dump is
 * requested over D-Bus.
 *
 * @param[in] instance - OCC instance the HRESET was issued for
 * @param[in] success  - true if the HRESET succeeded
 */
void Manager::sbeHRESETResult(instanceID instance, bool success)
{
    if (success)
    {
        log<level::INFO>(
            std::format("HRESET succeeded (OCC{})", instance).c_str());

        setSBEState(instance, SBE_STATE_BOOTED);

        return;
    }

    setSBEState(instance, SBE_STATE_FAILED);

    if (sbeCanDump(instance))
    {
        log<level::INFO>(
            std::format("HRESET failed (OCC{}), triggering SBE dump", instance)
                .c_str());

        auto& bus = utils::getBus();
        uint32_t src6 = instance << 16;
        uint32_t logId =
            FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
                            src6, "SBE command timeout");

        try
        {
            constexpr auto path = "/org/openpower/dump";
            constexpr auto interface = "xyz.openbmc_project.Dump.Create";
            constexpr auto function = "CreateDump";

            std::string service = utils::getService(path, interface);
            auto method = bus.new_method_call(service.c_str(), path, interface,
                                              function);

            std::map<std::string, std::variant<std::string, uint64_t>>
                createParams{
                    {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
                     uint64_t(logId)},
                    {"com.ibm.Dump.Create.CreateParameters.DumpType",
                     "com.ibm.Dump.Create.DumpType.SBE"},
                    {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
                     uint64_t(instance)},
                };

            method.append(createParams);

            auto response = bus.call(method);
        }
        catch (const sdbusplus::exception_t& e)
        {
            constexpr auto ERROR_DUMP_DISABLED =
                "xyz.openbmc_project.Dump.Create.Error.Disabled";
            // Both sides are C strings: compare the contents, since
            // operator== would only compare the pointer values.
            const char* errName = e.name();
            if ((errName != nullptr) &&
                (std::strcmp(errName, ERROR_DUMP_DISABLED) == 0))
            {
                log<level::INFO>("Dump is disabled, skipping");
            }
            else
            {
                log<level::ERR>("Dump failed");
            }
        }
    }
}

/**
 * @brief Decide whether an SBE dump may be collected for an instance.
 *
 * @param[in] instance - Processor/OCC instance
 *
 * @return false only when the SBE query reports that dumps are not allowed
 *         or that a vital attention is active; true otherwise, including
 *         when the target cannot be found or the query fails
 */
bool Manager::sbeCanDump(unsigned int instance)
{
    struct pdbg_target* proc = getPdbgTarget(instance);

    if (!proc)
    {
        // allow the dump in the error case
        return true;
    }

    try
    {
        if (!openpower::phal::sbe::isDumpAllowed(proc))
        {
            return false;
        }

        if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
        {
            return false;
        }
    }
    catch (const openpower::phal::exception::SbeError& e)
    {
        log<level::INFO>("Failed to query SBE state");
    }

    // allow the dump in the error case
    return true;
}

/**
 * @brief Set the SBE state for an instance; does nothing when the pdbg
 *        target cannot be found.
 *
 * @param[in] instance - Processor/OCC instance
 * @param[in] state    - SBE state to set
 */
void Manager::setSBEState(unsigned int instance, enum sbe_state state)
{
    struct pdbg_target* proc = getPdbgTarget(instance);

    if (!proc)
    {
        return;
    }

    try
    {
        openpower::phal::sbe::setState(proc, state);
    }
    catch (const openpower::phal::exception::SbeError& e)
    {
        log<level::ERR>("Failed to set SBE state");
    }
}

/**
 * @brief Look up the pdbg "proc" target whose index matches the instance,
 *        initializing pdbg on first use.
 *
 * @param[in] instance - Processor/OCC instance
 *
 * @return The matching target, or nullptr if pdbg initialization failed or
 *         no target matched
 */
struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
{
    if (!pdbgInitialized)
    {
        try
        {
            openpower::phal::pdbg::init();
            pdbgInitialized = true;
        }
        catch (const openpower::phal::exception::PdbgError& e)
        {
            log<level::ERR>("pdbg initialization failed");
            return nullptr;
        }
    }

    struct pdbg_target* proc = nullptr;
    pdbg_for_each_class_target("proc", proc)
    {
        if (pdbg_target_index(proc) == instance)
        {
            return proc;
        }
    }

    log<level::ERR>("Failed to get pdbg target");
    return nullptr;
}
#endif

/**
 * @brief Poll every active OCC and re-arm the poll timer while at least one
 *        OCC is active.
 */
void Manager::pollerTimerExpired()
{
    if (!_pollTimer)
    {
        log<level::ERR>(
            "Manager::pollerTimerExpired() ERROR: Timer not defined");
        return;
    }

    for (auto& obj : statusObjects)
    {
        if (!obj->occActive())
        {
            // OCC is not running yet
#ifdef READ_OCC_SENSORS
            auto id = obj->getOccInstanceID();
            setSensorValueToNaN(id);
#endif
            continue;
        }

        // Read sysfs to force kernel to poll OCC
        obj->readOccState();

#ifdef READ_OCC_SENSORS
        // Read occ sensor values
        getSensorValues(obj);
#endif
    }

    if (activeCount > 0)
    {
        // Restart OCC poll timer
        _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
    }
    else
    {
        // No OCCs running, so poll timer will not be restarted
        log<level::INFO>(
            "Manager::pollerTimerExpired: poll timer will not be restarted");
    }
}

#ifdef READ_OCC_SENSORS
/**
 * @brief Read the temperature sensors of one OCC from its hwmon directory
 *        and publish them on D-Bus.
 *
 * @param[in] path        - hwmon directory holding the tempN_* files
 * @param[in] occInstance - OCC instance the sensors belong to
 */
void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
{
    // There may be more than one sensor with the same FRU type
    // and label so make two passes: the first to read the temps
    // from sysfs, and the second to put them on D-Bus after
    // resolving any conflicts.
    std::map<std::string, double> sensorData;

    std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
    for (auto& file : fs::directory_iterator(path))
    {
        if (!std::regex_search(file.path().string(), expr))
        {
            continue;
        }

        uint32_t labelValue{0};

        try
        {
            labelValue = readFile<uint32_t>(file.path());
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            file.path().string(), e.code().value())
                    .c_str());
            continue;
        }

        // Strip the trailing "label" so the other suffixes can be appended
        const std::string& tempLabel = "label";
        const std::string filePathString = file.path().string().substr(
            0, file.path().string().length() - tempLabel.length());

        uint32_t fruTypeValue{0};
        try
        {
            fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            filePathString + fruTypeSuffix, e.code().value())
                    .c_str());
            continue;
        }

        std::string sensorPath = OCC_SENSORS_ROOT +
                                 std::string("/temperature/");

        std::string dvfsTempPath;

        if (fruTypeValue == VRMVdd)
        {
            sensorPath.append("vrm_vdd" + std::to_string(occInstance) +
                              "_temp");
        }
        else if (fruTypeValue == processorIoRing)
        {
            sensorPath.append("proc" + std::to_string(occInstance) +
                              "_ioring_temp");
            dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
                           std::to_string(occInstance) + "_ioring_dvfs_temp";
        }
        else
        {
            // The label encodes the sensor type in the top byte and the
            // instance in the low 16 bits.
            uint16_t type = (labelValue & 0xFF000000) >> 24;
            uint16_t instanceID = labelValue & 0x0000FFFF;

            if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
            {
                if (fruTypeValue == fruTypeNotAvailable)
                {
                    // Not all DIMM related temps are available to read
                    // (no _input file in this case)
                    continue;
                }
                auto iter = dimmTempSensorName.find(fruTypeValue);
                if (iter == dimmTempSensorName.end())
                {
                    log<level::ERR>(
                        std::format(
                            "readTempSensors: Fru type error! fruTypeValue = {}) ",
                            fruTypeValue)
                            .c_str());
                    continue;
                }

                sensorPath.append("dimm" + std::to_string(instanceID) +
                                  iter->second);

                dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
                               dimmDVFSSensorName.at(fruTypeValue);
            }
            else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
            {
                if (fruTypeValue == processorCore)
                {
                    // The OCC reports small core temps, of which there are
                    // two per big core.  All current P10 systems are in big
                    // core mode, so use a big core name.
                    uint16_t coreNum = instanceID / 2;
                    uint16_t tempNum = instanceID % 2;
                    sensorPath.append("proc" + std::to_string(occInstance) +
                                      "_core" + std::to_string(coreNum) + "_" +
                                      std::to_string(tempNum) + "_temp");

                    dvfsTempPath =
                        std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
                        std::to_string(occInstance) + "_core_dvfs_temp";
                }
                else
                {
                    continue;
                }
            }
            else
            {
                continue;
            }
        }

        // The dvfs temp file only needs to be read once per chip per type.
        if (!dvfsTempPath.empty() &&
            !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
        {
            try
            {
                auto dvfsValue = readFile<double>(filePathString + maxSuffix);

                dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
                    dvfsTempPath, dvfsValue * std::pow(10, -3));
            }
            catch (const std::system_error& e)
            {
                log<level::DEBUG>(
                    std::format(
                        "readTempSensors: Failed reading {}, errno = {}",
                        filePathString + maxSuffix, e.code().value())
                        .c_str());
            }
        }

        uint32_t faultValue{0};
        try
        {
            faultValue = readFile<uint32_t>(filePathString + faultSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            filePathString + faultSuffix, e.code().value())
                    .c_str());
            continue;
        }

        double tempValue{0};
        // NOTE: if OCC sends back 0xFF, kernel sets this fault value to 1.
        if (faultValue != 0)
        {
            tempValue = std::numeric_limits<double>::quiet_NaN();
        }
        else
        {
            // Read the temperature
            try
            {
                tempValue = readFile<double>(filePathString + inputSuffix);
            }
            catch (const std::system_error& e)
            {
                log<level::DEBUG>(
                    std::format(
                        "readTempSensors: Failed reading {}, errno = {}",
                        filePathString + inputSuffix, e.code().value())
                        .c_str());

                // if errno == EAGAIN(Resource temporarily unavailable) then set
                // temp to 0, to avoid using old temp, and affecting FAN
                // Control.
                if (e.code().value() == EAGAIN)
                {
                    tempValue = 0;
                }
                // else the errno would be something like
                //     EBADF(Bad file descriptor)
                // or ENOENT(No such file or directory)
                else
                {
                    continue;
                }
            }
        }

        // If this object path already has a value, only overwrite
        // it if the previous one was an NaN or a smaller value.
        auto existing = sensorData.find(sensorPath);
        if (existing != sensorData.end())
        {
            // Multiple sensors found for this FRU type
            if ((std::isnan(existing->second) && (tempValue == 0)) ||
                ((existing->second == 0) && std::isnan(tempValue)))
            {
                // One of the redundant sensors has failed (0xFF/nan), and the
                // other sensor has no reading (0), so set the FRU to NaN to
                // force fan increase
                tempValue = std::numeric_limits<double>::quiet_NaN();
                existing->second = tempValue;
            }
            if (std::isnan(existing->second) || (tempValue > existing->second))
            {
                existing->second = tempValue;
            }
        }
        else
        {
            // First sensor for this FRU type
            sensorData[sensorPath] = tempValue;
        }
    }

    // Now publish the values on D-Bus.
    for (const auto& [objectPath, value] : sensorData)
    {
        dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
                                                    value * std::pow(10, -3));

        dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
            objectPath, !std::isnan(value));

        if (existingSensors.find(objectPath) == existingSensors.end())
        {
            dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
                objectPath);
        }

        existingSensors[objectPath] = occInstance;
    }
}

/**
 * @brief Extract the function ID from a power sensor label.
 *
 * @param[in] value - Label contents, either "system" or
 *                    "<sensor id>_<function id>_<apss channel>"
 *
 * @return "system", the middle field of the label, or std::nullopt when the
 *         label does not contain two underscores
 */
std::optional<std::string>
    Manager::getPowerLabelFunctionID(const std::string& value)
{
    // If the value is "system", then the FunctionID is "system".
    if (value == "system")
    {
        return value;
    }

    // If the value is not "system", then the label value have 3 numbers, of
    // which we only care about the middle one:
    // <sensor id>_<function id>_<apss channel>
    // eg: The value is "0_10_5" , then the FunctionID is "10".
    if (value.find("_") == std::string::npos)
    {
        return std::nullopt;
    }

    auto powerLabelValue = value.substr((value.find("_") + 1));

    if (powerLabelValue.find("_") == std::string::npos)
    {
        return std::nullopt;
    }

    return powerLabelValue.substr(0, powerLabelValue.find("_"));
}

/**
 * @brief Read the power sensors from an OCC hwmon directory and publish them
 *        on D-Bus.
 *
 * @param[in] path - hwmon directory holding the powerN_* files
 * @param[in] id   - OCC instance the sensors belong to
 */
void Manager::readPowerSensors(const fs::path& path, uint32_t id)
{
    std::regex expr{"power\\d+_label$"}; // Example: power5_label
    for (auto& file : fs::directory_iterator(path))
    {
        if (!std::regex_search(file.path().string(), expr))
        {
            continue;
        }

        std::string labelValue;
        try
        {
            labelValue = readFile<std::string>(file.path());
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            file.path().string(), e.code().value())
                    .c_str());
            continue;
        }

        auto functionID = getPowerLabelFunctionID(labelValue);
        if (functionID == std::nullopt)
        {
            continue;
        }

        // Strip the trailing "label" so the input suffix can be appended
        const std::string& tempLabel = "label";
        const std::string filePathString = file.path().string().substr(
            0, file.path().string().length() - tempLabel.length());

        std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");

        auto iter = powerSensorName.find(*functionID);
        if (iter == powerSensorName.end())
        {
            continue;
        }
        sensorPath.append(iter->second);

        double tempValue{0};

        try
        {
            tempValue = readFile<double>(filePathString + inputSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            filePathString + inputSuffix, e.code().value())
                    .c_str());
            continue;
        }

        dbus::OccDBusSensors::getOccDBus().setUnit(
            sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");

        dbus::OccDBusSensors::getOccDBus().setValue(
            sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3));

        dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
                                                                true);

        if (existingSensors.find(sensorPath) == existingSensors.end())
        {
            dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
                sensorPath);
        }

        existingSensors[sensorPath] = id;
    }
    return;
}

/**
 * @brief Set every known sensor of an OCC to NaN while leaving its
 *        operational status true.
 *
 * @param[in] id - OCC instance whose sensors are updated
 */
void Manager::setSensorValueToNaN(uint32_t id) const
{
    for (const auto& [sensorPath, occId] : existingSensors)
    {
        if (occId == id)
        {
            dbus::OccDBusSensors::getOccDBus().setValue(
                sensorPath, std::numeric_limits<double>::quiet_NaN());

            dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
                                                                    true);
        }
    }
    return;
}

/**
 * @brief Set every known sensor of an OCC to NaN and mark it as not
 *        operational.
 *
 * @param[in] id - OCC instance whose sensors are updated
 */
void Manager::setSensorValueToNonFunctional(uint32_t id) const
{
    for (const auto& [sensorPath, occId] : existingSensors)
    {
        if (occId == id)
        {
            dbus::OccDBusSensors::getOccDBus().setValue(
                sensorPath, std::numeric_limits<double>::quiet_NaN());

            dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
                                                                    false);
        }
    }
    return;
}

/**
 * @brief Read the sensors of one OCC: temperatures always, power only for
 *        the master OCC.
 *
 * A missing hwmon path is traced once per OCC until the path shows up again.
 *
 * @param[in] occ - Status object of the OCC to read
 */
void Manager::getSensorValues(std::unique_ptr<Status>& occ)
{
    constexpr uint32_t maxTracedOccs = 8;
    static bool tracedError[maxTracedOccs] = {0};
    const fs::path sensorPath = occ->getHwmonPath();
    const uint32_t id = occ->getOccInstanceID();
    // Never index the trace table with an instance it has no slot for
    const bool canTrack = (id < maxTracedOccs);

    if (fs::exists(sensorPath))
    {
        // Read temperature sensors
        readTempSensors(sensorPath, id);

        if (occ->isMasterOcc())
        {
            // Read power sensors
            readPowerSensors(sensorPath, id);
        }
        if (canTrack)
        {
            tracedError[id] = false;
        }
    }
    else
    {
        if (!canTrack || !tracedError[id])
        {
            log<level::ERR>(
                std::format(
                    "Manager::getSensorValues: OCC{} sensor path missing: {}",
                    id, sensorPath.c_str())
                    .c_str());
            if (canTrack)
            {
                tracedError[id] = true;
            }
        }
    }

    return;
}
#endif

// Read the altitude from DBus
void Manager::readAltitude()
{
    // Cleared after the first failure trace so errors are not repeated
    static bool traceAltitudeErr = true;

    utils::PropertyValue altitudeProperty{};
    try
    {
        altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
                                              ALTITUDE_PROP);
        auto sensorVal = std::get<double>(altitudeProperty);
        if (sensorVal < 0xFFFF)
        {
            if (sensorVal < 0)
            {
                altitude = 0;
            }
            else
            {
                // Round to nearest meter
                altitude = uint16_t(sensorVal + 0.5);
            }
            log<level::DEBUG>(std::format("readAltitude: sensor={} ({}m)",
                                          sensorVal, altitude)
                                  .c_str());
            traceAltitudeErr = true;
        }
        else
        {
            if (traceAltitudeErr)
            {
                traceAltitudeErr = false;
                log<level::DEBUG>(
                    std::format("Invalid altitude value: {}", sensorVal)
                        .c_str());
            }
        }
    }
    catch (const sdbusplus::exception_t& e)
    {
        if (traceAltitudeErr)
        {
            traceAltitudeErr = false;
            log<level::INFO>(
                std::format("Unable to read Altitude: {}", e.what()).c_str());
        }
        altitude = 0xFFFF; // not available
    }
}

// Callback function when ambient temperature changes
void Manager::ambientCallback(sdbusplus::message_t& msg)
{
    double currentTemp = 0;
    uint8_t truncatedTemp = 0xFF;
    std::string msgSensor;
    std::map<std::string, std::variant<double>> msgData;
    msg.read(msgSensor, msgData);

    auto valPropMap = msgData.find(AMBIENT_PROP);
    if (valPropMap == msgData.end())
    {
        log<level::DEBUG>("ambientCallback: Unknown ambient property changed");
        return;
    }
    currentTemp = std::get<double>(valPropMap->second);
    if (std::isnan(currentTemp))
    {
        // 0xFF marks the ambient reading as not valid
        truncatedTemp = 0xFF;
    }
    else
    {
        if (currentTemp < 0)
        {
            truncatedTemp = 0;
        }
        else
        {
            // Round to nearest degree C
            truncatedTemp = uint8_t(currentTemp + 0.5);
        }
    }

    // If ambient changes, notify OCCs
    if (truncatedTemp != ambient)
    {
        log<level::DEBUG>(
            std::format("ambientCallback: Ambient change from {} to {}C",
                        ambient, currentTemp)
                .c_str());

        ambient = truncatedTemp;
        if (altitude == 0xFFFF)
        {
            // No altitude yet, try reading again
            readAltitude();
        }

        log<level::DEBUG>(
            std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient,
                        altitude)
                .c_str());
#ifdef POWER10
        // Send ambient and altitude to all OCCs
        for (auto& obj : statusObjects)
        {
            if (obj->occActive())
            {
                obj->sendAmbient(ambient, altitude);
            }
        }
#endif // POWER10
    }
}

// return the current ambient and altitude readings
void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
                             uint16_t& altitudeValue) const
{
    ambientValid = true;
    ambientTemp = ambient;
    altitudeValue = altitude;

    if (ambient == 0xFF)
    {
        ambientValid = false;
    }
}

#ifdef POWER10
// Called when waitForAllOccsTimer expires
// After the first OCC goes active, this timer will be started (60 seconds)
void Manager::occsNotAllRunning()
{
    if (activeCount != statusObjects.size())
    {
        // Not all OCCs went active
        log<level::WARNING>(
            std::format(
                "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
                activeCount, statusObjects.size())
                .c_str());
        // Procs may be garded, so may be expected
    }

    validateOccMaster();
}

#ifdef PLDM
// Called when throttleTraceTimer expires.
// If this timer expires, that indicates there is still no confirmed OCC status
//  which will trigger pldm traces to be throttled.
void Manager::throttleTraceExpired()
{
    // Throttle traces
    pldmHandle->setTraceThrottle(true);
}
#endif // PLDM
#endif // POWER10

// Verify single master OCC and start presence monitor
void Manager::validateOccMaster()
{
    int masterInstance = -1;
    for (auto& obj : statusObjects)
    {
        auto instance = obj->getOccInstanceID();
#ifdef POWER10
        if (!obj->occActive())
        {
            if (utils::isHostRunning())
            {
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "validateOccMaster: OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    // OCC does not appear to be active yet, check active sensor
#ifdef PLDM
                    pldmHandle->checkActiveSensor(instance);
#endif
                    if (obj->occActive())
                    {
                        log<level::INFO>(
                            std::format(
                                "validateOccMaster: OCC{} is ACTIVE after reading sensor",
                                instance)
                                .c_str());
                    }
                }
            }
            else
            {
                log<level::WARNING>(
                    std::format(
                        "validateOccMaster: HOST is not running (OCC{})",
                        instance)
                        .c_str());
                return;
            }
        }
#endif // POWER10

        if (obj->isMasterOcc())
        {
            obj->addPresenceWatchMaster();

            if (masterInstance == -1)
            {
                masterInstance = instance;
            }
            else
            {
                log<level::ERR>(
                    std::format(
                        "validateOccMaster: Multiple OCC masters! ({} and {})",
                        masterInstance, instance)
                        .c_str());
                // request reset
                obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
            }
        }
    }

    if (masterInstance < 0)
    {
        log<level::ERR>(
            std::format("validateOccMaster: Master OCC not found! (of {} OCCs)",
                        statusObjects.size())
                .c_str());
        // request reset (front() must not be called on an empty container)
        if (!statusObjects.empty())
        {
            statusObjects.front()->deviceError(
                Error::Descriptor(PRESENCE_ERROR_PATH));
        }
    }
    else
    {
        log<level::INFO>(
            std::format("validateOccMaster: OCC{} is master of {} OCCs",
                        masterInstance, activeCount)
                .c_str());
#ifdef POWER10
        pmode->updateDbusSafeMode(false);
#endif
    }
}

// Forward a power cap bounds refresh to the power cap object, if it exists
void Manager::updatePcapBounds() const
{
    if (pcap)
    {
        pcap->updatePcapBounds();
    }
}

} // namespace occ
} // namespace open_power