/**
 * @brief Read the first whitespace-delimited value of type T from a file.
 *
 * @tparam T    Any type that can be extracted with operator>> (integers,
 *              double, std::string, ...).
 * @param path  File to read.
 *
 * @return The value parsed from the start of the file.
 *
 * @throws std::system_error (generic_category) when the file cannot be
 *         opened or the value cannot be read/parsed. The error code is the
 *         errno left by the failing operation; if that operation did not set
 *         errno (for example an empty file or unparsable text), EIO is used
 *         so callers never receive an error code of 0.
 */
template <typename T>
T readFile(const std::string& path)
{
    // Clear errno so that a stale value from an unrelated earlier call is
    // never reported as the reason for this failure.
    errno = 0;
    std::ifstream ifs(path);
    if (!ifs.is_open())
    {
        const auto err = errno;
        throw std::system_error((err != 0) ? err : EIO,
                                std::generic_category());
    }

    T data{};
    errno = 0;
    ifs >> data;

    // fail() covers failbit and badbit. eofbit on its own only means the
    // value was the last thing in the file (no trailing newline); the
    // extraction succeeded, so it must not be treated as an error.
    if (ifs.fail())
    {
        const auto err = errno;
        throw std::system_error((err != 0) ? err : EIO,
                                std::generic_category());
    }

    return data;
}
87 // Note on the first pass prevOCCSearch will be empty, 88 // so there will be at least one delay to give things 89 // a chance to settle. 90 prevOCCSearch = occs; 91 92 log<level::INFO>( 93 std::format( 94 "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})", 95 occs.size()) 96 .c_str()); 97 98 discoverTimer->restartOnce(10s); 99 } 100 else 101 { 102 // All OCCs appear to be available, create status objects 103 104 // createObjects requires OCC0 first. 105 std::sort(occs.begin(), occs.end()); 106 107 log<level::INFO>( 108 std::format( 109 "Manager::findAndCreateObjects(): Creating {} OCC Status Objects", 110 occs.size()) 111 .c_str()); 112 for (auto id : occs) 113 { 114 createObjects(std::string(OCC_NAME) + std::to_string(id)); 115 } 116 statusObjCreated = true; 117 waitingForAllOccActiveSensors = true; 118 119 // Find/update the processor path associated with each OCC 120 for (auto& obj : statusObjects) 121 { 122 obj->updateProcAssociation(); 123 } 124 } 125 } 126 127 if (statusObjCreated && waitingForAllOccActiveSensors) 128 { 129 static bool tracedHostWait = false; 130 if (utils::isHostRunning()) 131 { 132 if (tracedHostWait) 133 { 134 log<level::INFO>( 135 "Manager::findAndCreateObjects(): Host is running"); 136 tracedHostWait = false; 137 } 138 checkAllActiveSensors(); 139 } 140 else 141 { 142 if (!tracedHostWait) 143 { 144 log<level::INFO>( 145 "Manager::findAndCreateObjects(): Waiting for host to start"); 146 tracedHostWait = true; 147 } 148 discoverTimer->restartOnce(30s); 149 } 150 } 151 } 152 else 153 { 154 log<level::INFO>( 155 std::format( 156 "Manager::findAndCreateObjects(): Waiting for {} to complete...", 157 HOST_ON_FILE) 158 .c_str()); 159 discoverTimer->restartOnce(10s); 160 } 161 #endif 162 } 163 164 #ifdef POWER10 165 // Check if all occActive sensors are available 166 void Manager::checkAllActiveSensors() 167 { 168 static bool allActiveSensorAvailable = false; 169 static bool tracedSensorWait = false; 170 static bool 
waitingForHost = false; 171 172 if (open_power::occ::utils::isHostRunning()) 173 { 174 if (waitingForHost) 175 { 176 waitingForHost = false; 177 log<level::INFO>("checkAllActiveSensors(): Host is now running"); 178 } 179 180 // Start with the assumption that all are available 181 allActiveSensorAvailable = true; 182 for (auto& obj : statusObjects) 183 { 184 if ((!obj->occActive()) && (!obj->getPldmSensorReceived())) 185 { 186 auto instance = obj->getOccInstanceID(); 187 // Check if sensor was queued while waiting for discovery 188 auto match = queuedActiveState.find(instance); 189 if (match != queuedActiveState.end()) 190 { 191 queuedActiveState.erase(match); 192 log<level::INFO>( 193 std::format( 194 "checkAllActiveSensors(): OCC{} is ACTIVE (queued)", 195 instance) 196 .c_str()); 197 obj->occActive(true); 198 } 199 else 200 { 201 allActiveSensorAvailable = false; 202 if (!tracedSensorWait) 203 { 204 log<level::INFO>( 205 std::format( 206 "checkAllActiveSensors(): Waiting on OCC{} Active sensor", 207 instance) 208 .c_str()); 209 tracedSensorWait = true; 210 // Make sure traces are not throttled 211 #ifdef PLDM 212 pldmHandle->setTraceThrottle(false); 213 // Start timer to throttle pldm traces when timer 214 // expires 215 throttleTraceTimer->restartOnce(5min); 216 #endif 217 } 218 #ifdef PLDM 219 pldmHandle->checkActiveSensor(obj->getOccInstanceID()); 220 #endif 221 break; 222 } 223 } 224 } 225 } 226 else 227 { 228 if (!waitingForHost) 229 { 230 waitingForHost = true; 231 log<level::INFO>( 232 "checkAllActiveSensors(): Waiting for host to start"); 233 } 234 } 235 236 if (allActiveSensorAvailable) 237 { 238 // All sensors were found, disable the discovery timer 239 if (discoverTimer->isEnabled()) 240 { 241 discoverTimer->setEnabled(false); 242 } 243 #ifdef PLDM 244 if (throttleTraceTimer->isEnabled()) 245 { 246 // Disable throttle timer and make sure traces are not throttled 247 throttleTraceTimer->setEnabled(false); 248 pldmHandle->setTraceThrottle(false); 249 } 
250 #endif 251 252 if (waitingForAllOccActiveSensors) 253 { 254 log<level::INFO>( 255 "checkAllActiveSensors(): OCC Active sensors are available"); 256 waitingForAllOccActiveSensors = false; 257 } 258 queuedActiveState.clear(); 259 tracedSensorWait = false; 260 } 261 else 262 { 263 // Not all sensors were available, so keep waiting 264 if (!tracedSensorWait) 265 { 266 log<level::INFO>( 267 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available"); 268 tracedSensorWait = true; 269 } 270 discoverTimer->restartOnce(10s); 271 } 272 } 273 #endif 274 275 std::vector<int> Manager::findOCCsInDev() 276 { 277 std::vector<int> occs; 278 std::regex expr{R"(occ(\d+)$)"}; 279 280 for (auto& file : fs::directory_iterator("/dev")) 281 { 282 std::smatch match; 283 std::string path{file.path().string()}; 284 if (std::regex_search(path, match, expr)) 285 { 286 auto num = std::stoi(match[1].str()); 287 288 // /dev numbering starts at 1, ours starts at 0. 289 occs.push_back(num - 1); 290 } 291 } 292 293 return occs; 294 } 295 296 int Manager::cpuCreated(sdbusplus::message_t& msg) 297 { 298 namespace fs = std::filesystem; 299 300 sdbusplus::message::object_path o; 301 msg.read(o); 302 fs::path cpuPath(std::string(std::move(o))); 303 304 auto name = cpuPath.filename().string(); 305 auto index = name.find(CPU_NAME); 306 name.replace(index, std::strlen(CPU_NAME), OCC_NAME); 307 308 createObjects(name); 309 310 return 0; 311 } 312 313 void Manager::createObjects(const std::string& occ) 314 { 315 auto path = fs::path(OCC_CONTROL_ROOT) / occ; 316 317 statusObjects.emplace_back(std::make_unique<Status>( 318 event, path.c_str(), *this, 319 #ifdef POWER10 320 pmode, 321 #endif 322 std::bind(std::mem_fn(&Manager::statusCallBack), this, 323 std::placeholders::_1, std::placeholders::_2) 324 #ifdef PLDM 325 , 326 std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(), 327 std::placeholders::_1) 328 #endif 329 )); 330 331 // Create the power cap monitor object 332 if 
(!pcap) 333 { 334 pcap = std::make_unique<open_power::occ::powercap::PowerCap>( 335 *statusObjects.back()); 336 } 337 338 if (statusObjects.back()->isMasterOcc()) 339 { 340 log<level::INFO>( 341 std::format("Manager::createObjects(): OCC{} is the master", 342 statusObjects.back()->getOccInstanceID()) 343 .c_str()); 344 _pollTimer->setEnabled(false); 345 346 #ifdef POWER10 347 // Set the master OCC on the PowerMode object 348 pmode->setMasterOcc(path); 349 #endif 350 } 351 352 passThroughObjects.emplace_back(std::make_unique<PassThrough>(path.c_str() 353 #ifdef POWER10 354 , 355 pmode 356 #endif 357 )); 358 } 359 360 void Manager::statusCallBack(instanceID instance, bool status) 361 { 362 if (status == true) 363 { 364 // OCC went active 365 ++activeCount; 366 367 #ifdef POWER10 368 if (activeCount == 1) 369 { 370 // First OCC went active (allow some time for all OCCs to go active) 371 waitForAllOccsTimer->restartOnce(60s); 372 } 373 #endif 374 375 if (activeCount == statusObjects.size()) 376 { 377 #ifdef POWER10 378 // All OCCs are now running 379 if (waitForAllOccsTimer->isEnabled()) 380 { 381 // stop occ wait timer 382 waitForAllOccsTimer->setEnabled(false); 383 } 384 #endif 385 386 // Verify master OCC and start presence monitor 387 validateOccMaster(); 388 } 389 390 // Start poll timer if not already started 391 if (!_pollTimer->isEnabled()) 392 { 393 log<level::INFO>( 394 std::format("Manager: OCCs will be polled every {} seconds", 395 pollInterval) 396 .c_str()); 397 398 // Send poll and start OCC poll timer 399 pollerTimerExpired(); 400 } 401 } 402 else 403 { 404 // OCC went away 405 if (activeCount > 0) 406 { 407 --activeCount; 408 } 409 else 410 { 411 log<level::ERR>( 412 std::format("OCC{} disabled, but currently no active OCCs", 413 instance) 414 .c_str()); 415 } 416 417 if (activeCount == 0) 418 { 419 // No OCCs are running 420 421 // Stop OCC poll timer 422 if (_pollTimer->isEnabled()) 423 { 424 log<level::INFO>( 425 "Manager::statusCallBack(): OCCs are 
not running, stopping poll timer"); 426 _pollTimer->setEnabled(false); 427 } 428 429 #ifdef POWER10 430 // stop wait timer 431 if (waitForAllOccsTimer->isEnabled()) 432 { 433 waitForAllOccsTimer->setEnabled(false); 434 } 435 #endif 436 } 437 #ifdef READ_OCC_SENSORS 438 // Clear OCC sensors 439 setSensorValueToNaN(instance); 440 #endif 441 } 442 443 #ifdef POWER10 444 if (waitingForAllOccActiveSensors) 445 { 446 if (utils::isHostRunning()) 447 { 448 checkAllActiveSensors(); 449 } 450 } 451 #endif 452 } 453 454 #ifdef I2C_OCC 455 void Manager::initStatusObjects() 456 { 457 // Make sure we have a valid path string 458 static_assert(sizeof(DEV_PATH) != 0); 459 460 auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH); 461 for (auto& name : deviceNames) 462 { 463 i2c_occ::i2cToDbus(name); 464 name = std::string(OCC_NAME) + '_' + name; 465 auto path = fs::path(OCC_CONTROL_ROOT) / name; 466 statusObjects.emplace_back( 467 std::make_unique<Status>(event, path.c_str(), *this)); 468 } 469 // The first device is master occ 470 pcap = std::make_unique<open_power::occ::powercap::PowerCap>( 471 *statusObjects.front()); 472 #ifdef POWER10 473 pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH, 474 powermode::PIPS_PATH); 475 // Set the master OCC on the PowerMode object 476 pmode->setMasterOcc(path); 477 #endif 478 } 479 #endif 480 481 #ifdef PLDM 482 void Manager::sbeTimeout(unsigned int instance) 483 { 484 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 485 [instance](const auto& obj) { 486 return instance == obj->getOccInstanceID(); 487 }); 488 489 if (obj != statusObjects.end() && (*obj)->occActive()) 490 { 491 log<level::INFO>( 492 std::format("SBE timeout, requesting HRESET (OCC{})", instance) 493 .c_str()); 494 495 setSBEState(instance, SBE_STATE_NOT_USABLE); 496 497 pldmHandle->sendHRESET(instance); 498 } 499 } 500 501 bool Manager::updateOCCActive(instanceID instance, bool status) 502 { 503 auto obj = 
std::find_if(statusObjects.begin(), statusObjects.end(), 504 [instance](const auto& obj) { 505 return instance == obj->getOccInstanceID(); 506 }); 507 508 const bool hostRunning = open_power::occ::utils::isHostRunning(); 509 if (obj != statusObjects.end()) 510 { 511 if (!hostRunning && (status == true)) 512 { 513 log<level::WARNING>( 514 std::format( 515 "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received", 516 instance, status) 517 .c_str()); 518 (*obj)->setPldmSensorReceived(false); 519 if (!waitingForAllOccActiveSensors) 520 { 521 log<level::INFO>( 522 "updateOCCActive: Waiting for Host and all OCC Active Sensors"); 523 waitingForAllOccActiveSensors = true; 524 } 525 #ifdef POWER10 526 discoverTimer->restartOnce(30s); 527 #endif 528 return false; 529 } 530 else 531 { 532 log<level::INFO>(std::format("updateOCCActive: OCC{} active={}", 533 instance, status) 534 .c_str()); 535 (*obj)->setPldmSensorReceived(true); 536 return (*obj)->occActive(status); 537 } 538 } 539 else 540 { 541 if (hostRunning) 542 { 543 log<level::WARNING>( 544 std::format( 545 "updateOCCActive: No status object to update for OCC{} (active={})", 546 instance, status) 547 .c_str()); 548 } 549 else 550 { 551 if (status == true) 552 { 553 log<level::WARNING>( 554 std::format( 555 "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})", 556 instance, status) 557 .c_str()); 558 } 559 } 560 if (status == true) 561 { 562 // OCC went active 563 queuedActiveState.insert(instance); 564 } 565 else 566 { 567 auto match = queuedActiveState.find(instance); 568 if (match != queuedActiveState.end()) 569 { 570 // OCC was disabled 571 queuedActiveState.erase(match); 572 } 573 } 574 return false; 575 } 576 } 577 578 // Called upon pldm event To set powermode Safe Mode State for system. 
579 void Manager::updateOccSafeMode(bool safeMode) 580 { 581 #ifdef POWER10 582 pmode->updateDbusSafeMode(safeMode); 583 #endif 584 // Update the processor throttle status on dbus 585 for (auto& obj : statusObjects) 586 { 587 obj->updateThrottle(safeMode, THROTTLED_SAFE); 588 } 589 } 590 591 void Manager::sbeHRESETResult(instanceID instance, bool success) 592 { 593 if (success) 594 { 595 log<level::INFO>( 596 std::format("HRESET succeeded (OCC{})", instance).c_str()); 597 598 setSBEState(instance, SBE_STATE_BOOTED); 599 600 return; 601 } 602 603 setSBEState(instance, SBE_STATE_FAILED); 604 605 if (sbeCanDump(instance)) 606 { 607 log<level::INFO>( 608 std::format("HRESET failed (OCC{}), triggering SBE dump", instance) 609 .c_str()); 610 611 auto& bus = utils::getBus(); 612 uint32_t src6 = instance << 16; 613 uint32_t logId = 614 FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout", 615 src6, "SBE command timeout"); 616 617 try 618 { 619 constexpr auto path = "/org/openpower/dump"; 620 constexpr auto interface = "xyz.openbmc_project.Dump.Create"; 621 constexpr auto function = "CreateDump"; 622 623 std::string service = utils::getService(path, interface); 624 auto method = bus.new_method_call(service.c_str(), path, interface, 625 function); 626 627 std::map<std::string, std::variant<std::string, uint64_t>> 628 createParams{ 629 {"com.ibm.Dump.Create.CreateParameters.ErrorLogId", 630 uint64_t(logId)}, 631 {"com.ibm.Dump.Create.CreateParameters.DumpType", 632 "com.ibm.Dump.Create.DumpType.SBE"}, 633 {"com.ibm.Dump.Create.CreateParameters.FailingUnitId", 634 uint64_t(instance)}, 635 }; 636 637 method.append(createParams); 638 639 auto response = bus.call(method); 640 } 641 catch (const sdbusplus::exception_t& e) 642 { 643 constexpr auto ERROR_DUMP_DISABLED = 644 "xyz.openbmc_project.Dump.Create.Error.Disabled"; 645 if (e.name() == ERROR_DUMP_DISABLED) 646 { 647 log<level::INFO>("Dump is disabled, skipping"); 648 } 649 else 650 { 651 log<level::ERR>("Dump 
failed"); 652 } 653 } 654 } 655 } 656 657 bool Manager::sbeCanDump(unsigned int instance) 658 { 659 struct pdbg_target* proc = getPdbgTarget(instance); 660 661 if (!proc) 662 { 663 // allow the dump in the error case 664 return true; 665 } 666 667 try 668 { 669 if (!openpower::phal::sbe::isDumpAllowed(proc)) 670 { 671 return false; 672 } 673 674 if (openpower::phal::pdbg::isSbeVitalAttnActive(proc)) 675 { 676 return false; 677 } 678 } 679 catch (openpower::phal::exception::SbeError& e) 680 { 681 log<level::INFO>("Failed to query SBE state"); 682 } 683 684 // allow the dump in the error case 685 return true; 686 } 687 688 void Manager::setSBEState(unsigned int instance, enum sbe_state state) 689 { 690 struct pdbg_target* proc = getPdbgTarget(instance); 691 692 if (!proc) 693 { 694 return; 695 } 696 697 try 698 { 699 openpower::phal::sbe::setState(proc, state); 700 } 701 catch (const openpower::phal::exception::SbeError& e) 702 { 703 log<level::ERR>("Failed to set SBE state"); 704 } 705 } 706 707 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance) 708 { 709 if (!pdbgInitialized) 710 { 711 try 712 { 713 openpower::phal::pdbg::init(); 714 pdbgInitialized = true; 715 } 716 catch (const openpower::phal::exception::PdbgError& e) 717 { 718 log<level::ERR>("pdbg initialization failed"); 719 return nullptr; 720 } 721 } 722 723 struct pdbg_target* proc = nullptr; 724 pdbg_for_each_class_target("proc", proc) 725 { 726 if (pdbg_target_index(proc) == instance) 727 { 728 return proc; 729 } 730 } 731 732 log<level::ERR>("Failed to get pdbg target"); 733 return nullptr; 734 } 735 #endif 736 737 void Manager::pollerTimerExpired() 738 { 739 if (!_pollTimer) 740 { 741 log<level::ERR>( 742 "Manager::pollerTimerExpired() ERROR: Timer not defined"); 743 return; 744 } 745 746 for (auto& obj : statusObjects) 747 { 748 if (!obj->occActive()) 749 { 750 // OCC is not running yet 751 #ifdef READ_OCC_SENSORS 752 auto id = obj->getOccInstanceID(); 753 setSensorValueToNaN(id); 754 
#endif 755 continue; 756 } 757 758 // Read sysfs to force kernel to poll OCC 759 obj->readOccState(); 760 761 #ifdef READ_OCC_SENSORS 762 // Read occ sensor values 763 getSensorValues(obj); 764 #endif 765 } 766 767 if (activeCount > 0) 768 { 769 // Restart OCC poll timer 770 _pollTimer->restartOnce(std::chrono::seconds(pollInterval)); 771 } 772 else 773 { 774 // No OCCs running, so poll timer will not be restarted 775 log<level::INFO>( 776 std::format( 777 "Manager::pollerTimerExpired: poll timer will not be restarted") 778 .c_str()); 779 } 780 } 781 782 #ifdef READ_OCC_SENSORS 783 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance) 784 { 785 // There may be more than one sensor with the same FRU type 786 // and label so make two passes: the first to read the temps 787 // from sysfs, and the second to put them on D-Bus after 788 // resolving any conflicts. 789 std::map<std::string, double> sensorData; 790 791 std::regex expr{"temp\\d+_label$"}; // Example: temp5_label 792 for (auto& file : fs::directory_iterator(path)) 793 { 794 if (!std::regex_search(file.path().string(), expr)) 795 { 796 continue; 797 } 798 799 uint32_t labelValue{0}; 800 801 try 802 { 803 labelValue = readFile<uint32_t>(file.path()); 804 } 805 catch (const std::system_error& e) 806 { 807 log<level::DEBUG>( 808 std::format("readTempSensors: Failed reading {}, errno = {}", 809 file.path().string(), e.code().value()) 810 .c_str()); 811 continue; 812 } 813 814 const std::string& tempLabel = "label"; 815 const std::string filePathString = file.path().string().substr( 816 0, file.path().string().length() - tempLabel.length()); 817 818 uint32_t fruTypeValue{0}; 819 try 820 { 821 fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix); 822 } 823 catch (const std::system_error& e) 824 { 825 log<level::DEBUG>( 826 std::format("readTempSensors: Failed reading {}, errno = {}", 827 filePathString + fruTypeSuffix, e.code().value()) 828 .c_str()); 829 continue; 830 } 831 832 
std::string sensorPath = OCC_SENSORS_ROOT + 833 std::string("/temperature/"); 834 835 std::string dvfsTempPath; 836 837 if (fruTypeValue == VRMVdd) 838 { 839 sensorPath.append("vrm_vdd" + std::to_string(occInstance) + 840 "_temp"); 841 } 842 else if (fruTypeValue == processorIoRing) 843 { 844 sensorPath.append("proc" + std::to_string(occInstance) + 845 "_ioring_temp"); 846 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" + 847 std::to_string(occInstance) + "_ioring_dvfs_temp"; 848 } 849 else 850 { 851 uint16_t type = (labelValue & 0xFF000000) >> 24; 852 uint16_t instanceID = labelValue & 0x0000FFFF; 853 854 if (type == OCC_DIMM_TEMP_SENSOR_TYPE) 855 { 856 if (fruTypeValue == fruTypeNotAvailable) 857 { 858 // Not all DIMM related temps are available to read 859 // (no _input file in this case) 860 continue; 861 } 862 auto iter = dimmTempSensorName.find(fruTypeValue); 863 if (iter == dimmTempSensorName.end()) 864 { 865 log<level::ERR>( 866 std::format( 867 "readTempSensors: Fru type error! fruTypeValue = {}) ", 868 fruTypeValue) 869 .c_str()); 870 continue; 871 } 872 873 sensorPath.append("dimm" + std::to_string(instanceID) + 874 iter->second); 875 876 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" + 877 dimmDVFSSensorName.at(fruTypeValue); 878 } 879 else if (type == OCC_CPU_TEMP_SENSOR_TYPE) 880 { 881 if (fruTypeValue == processorCore) 882 { 883 // The OCC reports small core temps, of which there are 884 // two per big core. All current P10 systems are in big 885 // core mode, so use a big core name. 
886 uint16_t coreNum = instanceID / 2; 887 uint16_t tempNum = instanceID % 2; 888 sensorPath.append("proc" + std::to_string(occInstance) + 889 "_core" + std::to_string(coreNum) + "_" + 890 std::to_string(tempNum) + "_temp"); 891 892 dvfsTempPath = 893 std::string{OCC_SENSORS_ROOT} + "/temperature/proc" + 894 std::to_string(occInstance) + "_core_dvfs_temp"; 895 } 896 else 897 { 898 continue; 899 } 900 } 901 else 902 { 903 continue; 904 } 905 } 906 907 // The dvfs temp file only needs to be read once per chip per type. 908 if (!dvfsTempPath.empty() && 909 !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath)) 910 { 911 try 912 { 913 auto dvfsValue = readFile<double>(filePathString + maxSuffix); 914 915 dbus::OccDBusSensors::getOccDBus().setDvfsTemp( 916 dvfsTempPath, dvfsValue * std::pow(10, -3)); 917 } 918 catch (const std::system_error& e) 919 { 920 log<level::DEBUG>( 921 std::format( 922 "readTempSensors: Failed reading {}, errno = {}", 923 filePathString + maxSuffix, e.code().value()) 924 .c_str()); 925 } 926 } 927 928 uint32_t faultValue{0}; 929 try 930 { 931 faultValue = readFile<uint32_t>(filePathString + faultSuffix); 932 } 933 catch (const std::system_error& e) 934 { 935 log<level::DEBUG>( 936 std::format("readTempSensors: Failed reading {}, errno = {}", 937 filePathString + faultSuffix, e.code().value()) 938 .c_str()); 939 continue; 940 } 941 942 double tempValue{0}; 943 // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1. 
944 if (faultValue != 0) 945 { 946 tempValue = std::numeric_limits<double>::quiet_NaN(); 947 } 948 else 949 { 950 // Read the temperature 951 try 952 { 953 tempValue = readFile<double>(filePathString + inputSuffix); 954 } 955 catch (const std::system_error& e) 956 { 957 log<level::DEBUG>( 958 std::format( 959 "readTempSensors: Failed reading {}, errno = {}", 960 filePathString + inputSuffix, e.code().value()) 961 .c_str()); 962 963 // if errno == EAGAIN(Resource temporarily unavailable) then set 964 // temp to 0, to avoid using old temp, and affecting FAN 965 // Control. 966 if (e.code().value() == EAGAIN) 967 { 968 tempValue = 0; 969 } 970 // else the errno would be something like 971 // EBADF(Bad file descriptor) 972 // or ENOENT(No such file or directory) 973 else 974 { 975 continue; 976 } 977 } 978 } 979 980 // If this object path already has a value, only overwite 981 // it if the previous one was an NaN or a smaller value. 982 auto existing = sensorData.find(sensorPath); 983 if (existing != sensorData.end()) 984 { 985 // Multiple sensors found for this FRU type 986 if ((std::isnan(existing->second) && (tempValue == 0)) || 987 ((existing->second == 0) && std::isnan(tempValue))) 988 { 989 // One of the redundant sensors has failed (0xFF/nan), and the 990 // other sensor has no reading (0), so set the FRU to NaN to 991 // force fan increase 992 tempValue = std::numeric_limits<double>::quiet_NaN(); 993 existing->second = tempValue; 994 } 995 if (std::isnan(existing->second) || (tempValue > existing->second)) 996 { 997 existing->second = tempValue; 998 } 999 } 1000 else 1001 { 1002 // First sensor for this FRU type 1003 sensorData[sensorPath] = tempValue; 1004 } 1005 } 1006 1007 // Now publish the values on D-Bus. 
1008 for (const auto& [objectPath, value] : sensorData) 1009 { 1010 dbus::OccDBusSensors::getOccDBus().setValue(objectPath, 1011 value * std::pow(10, -3)); 1012 1013 dbus::OccDBusSensors::getOccDBus().setOperationalStatus( 1014 objectPath, !std::isnan(value)); 1015 1016 if (existingSensors.find(objectPath) == existingSensors.end()) 1017 { 1018 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 1019 objectPath); 1020 } 1021 1022 existingSensors[objectPath] = occInstance; 1023 } 1024 } 1025 1026 std::optional<std::string> 1027 Manager::getPowerLabelFunctionID(const std::string& value) 1028 { 1029 // If the value is "system", then the FunctionID is "system". 1030 if (value == "system") 1031 { 1032 return value; 1033 } 1034 1035 // If the value is not "system", then the label value have 3 numbers, of 1036 // which we only care about the middle one: 1037 // <sensor id>_<function id>_<apss channel> 1038 // eg: The value is "0_10_5" , then the FunctionID is "10". 1039 if (value.find("_") == std::string::npos) 1040 { 1041 return std::nullopt; 1042 } 1043 1044 auto powerLabelValue = value.substr((value.find("_") + 1)); 1045 1046 if (powerLabelValue.find("_") == std::string::npos) 1047 { 1048 return std::nullopt; 1049 } 1050 1051 return powerLabelValue.substr(0, powerLabelValue.find("_")); 1052 } 1053 1054 void Manager::readPowerSensors(const fs::path& path, uint32_t id) 1055 { 1056 std::regex expr{"power\\d+_label$"}; // Example: power5_label 1057 for (auto& file : fs::directory_iterator(path)) 1058 { 1059 if (!std::regex_search(file.path().string(), expr)) 1060 { 1061 continue; 1062 } 1063 1064 std::string labelValue; 1065 try 1066 { 1067 labelValue = readFile<std::string>(file.path()); 1068 } 1069 catch (const std::system_error& e) 1070 { 1071 log<level::DEBUG>( 1072 std::format("readPowerSensors: Failed reading {}, errno = {}", 1073 file.path().string(), e.code().value()) 1074 .c_str()); 1075 continue; 1076 } 1077 1078 auto functionID = 
getPowerLabelFunctionID(labelValue); 1079 if (functionID == std::nullopt) 1080 { 1081 continue; 1082 } 1083 1084 const std::string& tempLabel = "label"; 1085 const std::string filePathString = file.path().string().substr( 1086 0, file.path().string().length() - tempLabel.length()); 1087 1088 std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/"); 1089 1090 auto iter = powerSensorName.find(*functionID); 1091 if (iter == powerSensorName.end()) 1092 { 1093 continue; 1094 } 1095 sensorPath.append(iter->second); 1096 1097 double tempValue{0}; 1098 1099 try 1100 { 1101 tempValue = readFile<double>(filePathString + inputSuffix); 1102 } 1103 catch (const std::system_error& e) 1104 { 1105 log<level::DEBUG>( 1106 std::format("readPowerSensors: Failed reading {}, errno = {}", 1107 filePathString + inputSuffix, e.code().value()) 1108 .c_str()); 1109 continue; 1110 } 1111 1112 dbus::OccDBusSensors::getOccDBus().setUnit( 1113 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts"); 1114 1115 dbus::OccDBusSensors::getOccDBus().setValue( 1116 sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3)); 1117 1118 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1119 true); 1120 1121 if (existingSensors.find(sensorPath) == existingSensors.end()) 1122 { 1123 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 1124 sensorPath); 1125 } 1126 1127 existingSensors[sensorPath] = id; 1128 } 1129 return; 1130 } 1131 1132 void Manager::setSensorValueToNaN(uint32_t id) const 1133 { 1134 for (const auto& [sensorPath, occId] : existingSensors) 1135 { 1136 if (occId == id) 1137 { 1138 dbus::OccDBusSensors::getOccDBus().setValue( 1139 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1140 1141 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1142 true); 1143 } 1144 } 1145 return; 1146 } 1147 1148 void Manager::setSensorValueToNonFunctional(uint32_t id) const 1149 { 1150 for (const auto& [sensorPath, occId] : existingSensors) 1151 { 
1152 if (occId == id) 1153 { 1154 dbus::OccDBusSensors::getOccDBus().setValue( 1155 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1156 1157 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1158 false); 1159 } 1160 } 1161 return; 1162 } 1163 1164 void Manager::getSensorValues(std::unique_ptr<Status>& occ) 1165 { 1166 static bool tracedError[8] = {0}; 1167 const fs::path sensorPath = occ->getHwmonPath(); 1168 const uint32_t id = occ->getOccInstanceID(); 1169 1170 if (fs::exists(sensorPath)) 1171 { 1172 // Read temperature sensors 1173 readTempSensors(sensorPath, id); 1174 1175 if (occ->isMasterOcc()) 1176 { 1177 // Read power sensors 1178 readPowerSensors(sensorPath, id); 1179 } 1180 tracedError[id] = false; 1181 } 1182 else 1183 { 1184 if (!tracedError[id]) 1185 { 1186 log<level::ERR>( 1187 std::format( 1188 "Manager::getSensorValues: OCC{} sensor path missing: {}", 1189 id, sensorPath.c_str()) 1190 .c_str()); 1191 tracedError[id] = true; 1192 } 1193 } 1194 1195 return; 1196 } 1197 #endif 1198 1199 // Read the altitude from DBus 1200 void Manager::readAltitude() 1201 { 1202 static bool traceAltitudeErr = true; 1203 1204 utils::PropertyValue altitudeProperty{}; 1205 try 1206 { 1207 altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE, 1208 ALTITUDE_PROP); 1209 auto sensorVal = std::get<double>(altitudeProperty); 1210 if (sensorVal < 0xFFFF) 1211 { 1212 if (sensorVal < 0) 1213 { 1214 altitude = 0; 1215 } 1216 else 1217 { 1218 // Round to nearest meter 1219 altitude = uint16_t(sensorVal + 0.5); 1220 } 1221 log<level::DEBUG>(std::format("readAltitude: sensor={} ({}m)", 1222 sensorVal, altitude) 1223 .c_str()); 1224 traceAltitudeErr = true; 1225 } 1226 else 1227 { 1228 if (traceAltitudeErr) 1229 { 1230 traceAltitudeErr = false; 1231 log<level::DEBUG>( 1232 std::format("Invalid altitude value: {}", sensorVal) 1233 .c_str()); 1234 } 1235 } 1236 } 1237 catch (const sdbusplus::exception_t& e) 1238 { 1239 if (traceAltitudeErr) 
1240 { 1241 traceAltitudeErr = false; 1242 log<level::INFO>( 1243 std::format("Unable to read Altitude: {}", e.what()).c_str()); 1244 } 1245 altitude = 0xFFFF; // not available 1246 } 1247 } 1248 1249 // Callback function when ambient temperature changes 1250 void Manager::ambientCallback(sdbusplus::message_t& msg) 1251 { 1252 double currentTemp = 0; 1253 uint8_t truncatedTemp = 0xFF; 1254 std::string msgSensor; 1255 std::map<std::string, std::variant<double>> msgData; 1256 msg.read(msgSensor, msgData); 1257 1258 auto valPropMap = msgData.find(AMBIENT_PROP); 1259 if (valPropMap == msgData.end()) 1260 { 1261 log<level::DEBUG>("ambientCallback: Unknown ambient property changed"); 1262 return; 1263 } 1264 currentTemp = std::get<double>(valPropMap->second); 1265 if (std::isnan(currentTemp)) 1266 { 1267 truncatedTemp = 0xFF; 1268 } 1269 else 1270 { 1271 if (currentTemp < 0) 1272 { 1273 truncatedTemp = 0; 1274 } 1275 else 1276 { 1277 // Round to nearest degree C 1278 truncatedTemp = uint8_t(currentTemp + 0.5); 1279 } 1280 } 1281 1282 // If ambient changes, notify OCCs 1283 if (truncatedTemp != ambient) 1284 { 1285 log<level::DEBUG>( 1286 std::format("ambientCallback: Ambient change from {} to {}C", 1287 ambient, currentTemp) 1288 .c_str()); 1289 1290 ambient = truncatedTemp; 1291 if (altitude == 0xFFFF) 1292 { 1293 // No altitude yet, try reading again 1294 readAltitude(); 1295 } 1296 1297 log<level::DEBUG>( 1298 std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient, 1299 altitude) 1300 .c_str()); 1301 #ifdef POWER10 1302 // Send ambient and altitude to all OCCs 1303 for (auto& obj : statusObjects) 1304 { 1305 if (obj->occActive()) 1306 { 1307 obj->sendAmbient(ambient, altitude); 1308 } 1309 } 1310 #endif // POWER10 1311 } 1312 } 1313 1314 // return the current ambient and altitude readings 1315 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp, 1316 uint16_t& altitudeValue) const 1317 { 1318 ambientValid = true; 1319 ambientTemp = 
ambient; 1320 altitudeValue = altitude; 1321 1322 if (ambient == 0xFF) 1323 { 1324 ambientValid = false; 1325 } 1326 } 1327 1328 #ifdef POWER10 1329 // Called when waitForAllOccsTimer expires 1330 // After the first OCC goes active, this timer will be started (60 seconds) 1331 void Manager::occsNotAllRunning() 1332 { 1333 if (activeCount != statusObjects.size()) 1334 { 1335 // Not all OCCs went active 1336 log<level::WARNING>( 1337 std::format( 1338 "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})", 1339 activeCount, statusObjects.size()) 1340 .c_str()); 1341 // Procs may be garded, so may be expected 1342 } 1343 1344 validateOccMaster(); 1345 } 1346 1347 #ifdef PLDM 1348 // Called when throttleTraceTimer expires. 1349 // If this timer expires, that indicates there is still no confirmed OCC status 1350 // which will trigger pldm traces to be throttled. 1351 void Manager::throttleTraceExpired() 1352 { 1353 // Throttle traces 1354 pldmHandle->setTraceThrottle(true); 1355 // Create PEL 1356 createPldmSensorPEL(); 1357 } 1358 1359 void Manager::createPldmSensorPEL() 1360 { 1361 Error::Descriptor d = Error::Descriptor(MISSING_OCC_SENSORS_PATH); 1362 std::map<std::string, std::string> additionalData; 1363 1364 additionalData.emplace("_PID", std::to_string(getpid())); 1365 1366 log<level::INFO>( 1367 std::format( 1368 "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs") 1369 .c_str()); 1370 1371 auto& bus = utils::getBus(); 1372 1373 try 1374 { 1375 FFDCFiles ffdc; 1376 // Add occ-control journal traces to PEL FFDC 1377 auto occJournalFile = 1378 FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40); 1379 1380 static constexpr auto loggingObjectPath = 1381 "/xyz/openbmc_project/logging"; 1382 static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL"; 1383 std::string service = utils::getService(loggingObjectPath, 1384 opLoggingInterface); 1385 auto method = bus.new_method_call(service.c_str(), loggingObjectPath, 
1386 opLoggingInterface, 1387 "CreatePELWithFFDCFiles"); 1388 1389 // Set level to Notice (Informational). 1390 auto level = 1391 sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage( 1392 sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level:: 1393 Notice); 1394 1395 method.append(d.path, level, additionalData, ffdc); 1396 bus.call(method); 1397 } 1398 catch (const sdbusplus::exception_t& e) 1399 { 1400 log<level::ERR>( 1401 std::format("Failed to create MISSING_OCC_SENSORS PEL: {}", 1402 e.what()) 1403 .c_str()); 1404 } 1405 } 1406 #endif // PLDM 1407 #endif // POWER10 1408 1409 // Verify single master OCC and start presence monitor 1410 void Manager::validateOccMaster() 1411 { 1412 int masterInstance = -1; 1413 for (auto& obj : statusObjects) 1414 { 1415 auto instance = obj->getOccInstanceID(); 1416 #ifdef POWER10 1417 if (!obj->occActive()) 1418 { 1419 if (utils::isHostRunning()) 1420 { 1421 // Check if sensor was queued while waiting for discovery 1422 auto match = queuedActiveState.find(instance); 1423 if (match != queuedActiveState.end()) 1424 { 1425 queuedActiveState.erase(match); 1426 log<level::INFO>( 1427 std::format( 1428 "validateOccMaster: OCC{} is ACTIVE (queued)", 1429 instance) 1430 .c_str()); 1431 obj->occActive(true); 1432 } 1433 else 1434 { 1435 // OCC does not appear to be active yet, check active sensor 1436 #ifdef PLDM 1437 pldmHandle->checkActiveSensor(instance); 1438 #endif 1439 if (obj->occActive()) 1440 { 1441 log<level::INFO>( 1442 std::format( 1443 "validateOccMaster: OCC{} is ACTIVE after reading sensor", 1444 instance) 1445 .c_str()); 1446 } 1447 } 1448 } 1449 else 1450 { 1451 log<level::WARNING>( 1452 std::format( 1453 "validateOccMaster: HOST is not running (OCC{})", 1454 instance) 1455 .c_str()); 1456 return; 1457 } 1458 } 1459 #endif // POWER10 1460 1461 if (obj->isMasterOcc()) 1462 { 1463 obj->addPresenceWatchMaster(); 1464 1465 if (masterInstance == -1) 1466 { 1467 masterInstance = instance; 1468 } 1469 
else 1470 { 1471 log<level::ERR>( 1472 std::format( 1473 "validateOccMaster: Multiple OCC masters! ({} and {})", 1474 masterInstance, instance) 1475 .c_str()); 1476 // request reset 1477 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH)); 1478 } 1479 } 1480 } 1481 1482 if (masterInstance < 0) 1483 { 1484 log<level::ERR>( 1485 std::format("validateOccMaster: Master OCC not found! (of {} OCCs)", 1486 statusObjects.size()) 1487 .c_str()); 1488 // request reset 1489 statusObjects.front()->deviceError( 1490 Error::Descriptor(PRESENCE_ERROR_PATH)); 1491 } 1492 else 1493 { 1494 log<level::INFO>( 1495 std::format("validateOccMaster: OCC{} is master of {} OCCs", 1496 masterInstance, activeCount) 1497 .c_str()); 1498 #ifdef POWER10 1499 pmode->updateDbusSafeMode(false); 1500 #endif 1501 } 1502 } 1503 1504 void Manager::updatePcapBounds() const 1505 { 1506 if (pcap) 1507 { 1508 pcap->updatePcapBounds(); 1509 } 1510 } 1511 1512 } // namespace occ 1513 } // namespace open_power 1514