1 #include "config.h" 2 3 #include "occ_manager.hpp" 4 5 #include "i2c_occ.hpp" 6 #include "occ_dbus.hpp" 7 #include "occ_errors.hpp" 8 #include "utils.hpp" 9 10 #include <phosphor-logging/elog-errors.hpp> 11 #include <phosphor-logging/log.hpp> 12 #include <xyz/openbmc_project/Common/error.hpp> 13 14 #include <chrono> 15 #include <cmath> 16 #include <filesystem> 17 #include <fstream> 18 #include <regex> 19 20 namespace open_power 21 { 22 namespace occ 23 { 24 25 constexpr uint32_t fruTypeNotAvailable = 0xFF; 26 constexpr auto fruTypeSuffix = "fru_type"; 27 constexpr auto faultSuffix = "fault"; 28 constexpr auto inputSuffix = "input"; 29 constexpr auto maxSuffix = "max"; 30 31 const auto HOST_ON_FILE = "/run/openbmc/host@0-on"; 32 33 using namespace phosphor::logging; 34 using namespace std::literals::chrono_literals; 35 36 template <typename T> 37 T readFile(const std::string& path) 38 { 39 std::ifstream ifs; 40 ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit | 41 std::ifstream::eofbit); 42 T data; 43 44 try 45 { 46 ifs.open(path); 47 ifs >> data; 48 ifs.close(); 49 } 50 catch (const std::exception& e) 51 { 52 auto err = errno; 53 throw std::system_error(err, std::generic_category()); 54 } 55 56 return data; 57 } 58 59 void Manager::findAndCreateObjects() 60 { 61 #ifndef POWER10 62 for (auto id = 0; id < MAX_CPUS; ++id) 63 { 64 // Create one occ per cpu 65 auto occ = std::string(OCC_NAME) + std::to_string(id); 66 createObjects(occ); 67 } 68 #else 69 if (!pmode) 70 { 71 // Create the power mode object 72 pmode = std::make_unique<powermode::PowerMode>( 73 *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event); 74 } 75 76 if (!fs::exists(HOST_ON_FILE)) 77 { 78 static bool statusObjCreated = false; 79 if (!statusObjCreated) 80 { 81 // Create the OCCs based on on the /dev/occX devices 82 auto occs = findOCCsInDev(); 83 84 if (occs.empty() || (prevOCCSearch.size() != occs.size())) 85 { 86 // Something changed or no OCCs yet, try again in 10s. 87 // Note on the first pass prevOCCSearch will be empty, 88 // so there will be at least one delay to give things 89 // a chance to settle. 90 prevOCCSearch = occs; 91 92 log<level::INFO>( 93 std::format( 94 "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})", 95 occs.size()) 96 .c_str()); 97 98 discoverTimer->restartOnce(10s); 99 } 100 else 101 { 102 // All OCCs appear to be available, create status objects 103 104 // createObjects requires OCC0 first. 105 std::sort(occs.begin(), occs.end()); 106 107 log<level::INFO>( 108 std::format( 109 "Manager::findAndCreateObjects(): Creating {} OCC Status Objects", 110 occs.size()) 111 .c_str()); 112 for (auto id : occs) 113 { 114 createObjects(std::string(OCC_NAME) + std::to_string(id)); 115 } 116 statusObjCreated = true; 117 waitingForAllOccActiveSensors = true; 118 119 // Find/update the processor path associated with each OCC 120 for (auto& obj : statusObjects) 121 { 122 obj->updateProcAssociation(); 123 } 124 } 125 } 126 127 if (statusObjCreated && waitingForAllOccActiveSensors) 128 { 129 static bool tracedHostWait = false; 130 if (utils::isHostRunning()) 131 { 132 if (tracedHostWait) 133 { 134 log<level::INFO>( 135 "Manager::findAndCreateObjects(): Host is running"); 136 tracedHostWait = false; 137 } 138 checkAllActiveSensors(); 139 } 140 else 141 { 142 if (!tracedHostWait) 143 { 144 log<level::INFO>( 145 "Manager::findAndCreateObjects(): Waiting for host to start"); 146 tracedHostWait = true; 147 } 148 discoverTimer->restartOnce(30s); 149 #ifdef PLDM 150 if (throttleTraceTimer->isEnabled()) 151 { 152 // Host is no longer running, disable throttle timer and 153 // make sure traces are not throttled 154 log<level::INFO>( 155 "findAndCreateObjects(): disabling sensor timer"); 156 throttleTraceTimer->setEnabled(false); 157 pldmHandle->setTraceThrottle(false); 158 } 159 #endif 160 } 161 } 162 } 163 else 164 { 165 log<level::INFO>( 166 std::format( 167 "Manager::findAndCreateObjects(): Waiting for {} to complete...", 168 HOST_ON_FILE) 169 .c_str()); 170 discoverTimer->restartOnce(10s); 171 } 172 #endif 173 } 174 175 #ifdef POWER10 176 // Check if all occActive sensors are available 177 void Manager::checkAllActiveSensors() 178 { 179 static bool allActiveSensorAvailable = false; 180 static bool tracedSensorWait = false; 181 static bool waitingForHost = false; 182 183 if (open_power::occ::utils::isHostRunning()) 184 { 185 if (waitingForHost) 186 { 187 waitingForHost = false; 188 log<level::INFO>("checkAllActiveSensors(): Host is now running"); 189 } 190 191 // Start with the assumption that all are available 192 allActiveSensorAvailable = true; 193 for (auto& obj : statusObjects) 194 { 195 if ((!obj->occActive()) && (!obj->getPldmSensorReceived())) 196 { 197 auto instance = obj->getOccInstanceID(); 198 // Check if sensor was queued while waiting for discovery 199 auto match = queuedActiveState.find(instance); 200 if (match != queuedActiveState.end()) 201 { 202 queuedActiveState.erase(match); 203 log<level::INFO>( 204 std::format( 205 "checkAllActiveSensors(): OCC{} is ACTIVE (queued)", 206 instance) 207 .c_str()); 208 obj->occActive(true); 209 } 210 else 211 { 212 allActiveSensorAvailable = false; 213 if (!tracedSensorWait) 214 { 215 log<level::INFO>( 216 std::format( 217 "checkAllActiveSensors(): Waiting on OCC{} Active sensor", 218 instance) 219 .c_str()); 220 tracedSensorWait = true; 221 #ifdef PLDM 222 // Make sure traces are not throttled 223 pldmHandle->setTraceThrottle(false); 224 // Start timer to throttle pldm traces when timer 225 // expires 226 throttleTraceTimer->restartOnce(40min); 227 #endif 228 } 229 #ifdef PLDM 230 pldmHandle->checkActiveSensor(obj->getOccInstanceID()); 231 #endif 232 break; 233 } 234 } 235 } 236 } 237 else 238 { 239 if (!waitingForHost) 240 { 241 waitingForHost = true; 242 log<level::INFO>( 243 "checkAllActiveSensors(): Waiting for host to start"); 244 #ifdef PLDM 245 if (throttleTraceTimer->isEnabled()) 246 { 247 // Host is no longer running, disable throttle timer and 248 // make sure traces are not throttled 249 log<level::INFO>( 250 "checkAllActiveSensors(): disabling sensor timer"); 251 throttleTraceTimer->setEnabled(false); 252 pldmHandle->setTraceThrottle(false); 253 } 254 #endif 255 } 256 } 257 258 if (allActiveSensorAvailable) 259 { 260 // All sensors were found, disable the discovery timer 261 if (discoverTimer->isEnabled()) 262 { 263 discoverTimer->setEnabled(false); 264 } 265 #ifdef PLDM 266 if (throttleTraceTimer->isEnabled()) 267 { 268 // Disable throttle timer and make sure traces are not throttled 269 throttleTraceTimer->setEnabled(false); 270 pldmHandle->setTraceThrottle(false); 271 } 272 #endif 273 274 if (waitingForAllOccActiveSensors) 275 { 276 log<level::INFO>( 277 "checkAllActiveSensors(): OCC Active sensors are available"); 278 waitingForAllOccActiveSensors = false; 279 } 280 queuedActiveState.clear(); 281 tracedSensorWait = false; 282 } 283 else 284 { 285 // Not all sensors were available, so keep waiting 286 if (!tracedSensorWait) 287 { 288 log<level::INFO>( 289 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available"); 290 tracedSensorWait = true; 291 } 292 discoverTimer->restartOnce(10s); 293 } 294 } 295 #endif 296 297 std::vector<int> Manager::findOCCsInDev() 298 { 299 std::vector<int> occs; 300 std::regex expr{R"(occ(\d+)$)"}; 301 302 for (auto& file : fs::directory_iterator("/dev")) 303 { 304 std::smatch match; 305 std::string path{file.path().string()}; 306 if (std::regex_search(path, match, expr)) 307 { 308 auto num = std::stoi(match[1].str()); 309 310 // /dev numbering starts at 1, ours starts at 0. 311 occs.push_back(num - 1); 312 } 313 } 314 315 return occs; 316 } 317 318 int Manager::cpuCreated(sdbusplus::message_t& msg) 319 { 320 namespace fs = std::filesystem; 321 322 sdbusplus::message::object_path o; 323 msg.read(o); 324 fs::path cpuPath(std::string(std::move(o))); 325 326 auto name = cpuPath.filename().string(); 327 auto index = name.find(CPU_NAME); 328 name.replace(index, std::strlen(CPU_NAME), OCC_NAME); 329 330 createObjects(name); 331 332 return 0; 333 } 334 335 void Manager::createObjects(const std::string& occ) 336 { 337 auto path = fs::path(OCC_CONTROL_ROOT) / occ; 338 339 statusObjects.emplace_back(std::make_unique<Status>( 340 event, path.c_str(), *this, 341 #ifdef POWER10 342 pmode, 343 #endif 344 std::bind(std::mem_fn(&Manager::statusCallBack), this, 345 std::placeholders::_1, std::placeholders::_2) 346 #ifdef PLDM 347 , 348 std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(), 349 std::placeholders::_1) 350 #endif 351 )); 352 353 // Create the power cap monitor object 354 if (!pcap) 355 { 356 pcap = std::make_unique<open_power::occ::powercap::PowerCap>( 357 *statusObjects.back()); 358 } 359 360 if (statusObjects.back()->isMasterOcc()) 361 { 362 log<level::INFO>( 363 std::format("Manager::createObjects(): OCC{} is the master", 364 statusObjects.back()->getOccInstanceID()) 365 .c_str()); 366 _pollTimer->setEnabled(false); 367 368 #ifdef POWER10 369 // Set the master OCC on the PowerMode object 370 pmode->setMasterOcc(path); 371 #endif 372 } 373 374 passThroughObjects.emplace_back(std::make_unique<PassThrough>(path.c_str() 375 #ifdef POWER10 376 , 377 pmode 378 #endif 379 )); 380 } 381 382 void Manager::statusCallBack(instanceID instance, bool status) 383 { 384 if (status == true) 385 { 386 // OCC went active 387 ++activeCount; 388 389 #ifdef POWER10 390 if (activeCount == 1) 391 { 392 // First OCC went active (allow some time for all OCCs to go active) 393 waitForAllOccsTimer->restartOnce(60s); 394 } 395 #endif 396 397 if (activeCount == statusObjects.size()) 398 { 399 #ifdef POWER10 400 // All OCCs are now running 401 if (waitForAllOccsTimer->isEnabled()) 402 { 403 // stop occ wait timer 404 waitForAllOccsTimer->setEnabled(false); 405 } 406 #endif 407 408 // Verify master OCC and start presence monitor 409 validateOccMaster(); 410 } 411 412 // Start poll timer if not already started 413 if (!_pollTimer->isEnabled()) 414 { 415 log<level::INFO>( 416 std::format("Manager: OCCs will be polled every {} seconds", 417 pollInterval) 418 .c_str()); 419 420 // Send poll and start OCC poll timer 421 pollerTimerExpired(); 422 } 423 } 424 else 425 { 426 // OCC went away 427 if (activeCount > 0) 428 { 429 --activeCount; 430 } 431 else 432 { 433 log<level::ERR>( 434 std::format("OCC{} disabled, but currently no active OCCs", 435 instance) 436 .c_str()); 437 } 438 439 if (activeCount == 0) 440 { 441 // No OCCs are running 442 443 // Stop OCC poll timer 444 if (_pollTimer->isEnabled()) 445 { 446 log<level::INFO>( 447 "Manager::statusCallBack(): OCCs are not running, stopping poll timer"); 448 _pollTimer->setEnabled(false); 449 } 450 451 #ifdef POWER10 452 // stop wait timer 453 if (waitForAllOccsTimer->isEnabled()) 454 { 455 waitForAllOccsTimer->setEnabled(false); 456 } 457 #endif 458 } 459 #ifdef READ_OCC_SENSORS 460 // Clear OCC sensors 461 setSensorValueToNaN(instance); 462 #endif 463 } 464 465 #ifdef POWER10 466 if (waitingForAllOccActiveSensors) 467 { 468 if (utils::isHostRunning()) 469 { 470 checkAllActiveSensors(); 471 } 472 } 473 #endif 474 } 475 476 #ifdef I2C_OCC 477 void Manager::initStatusObjects() 478 { 479 // Make sure we have a valid path string 480 static_assert(sizeof(DEV_PATH) != 0); 481 482 auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH); 483 for (auto& name : deviceNames) 484 { 485 i2c_occ::i2cToDbus(name); 486 name = std::string(OCC_NAME) + '_' + name; 487 auto path = fs::path(OCC_CONTROL_ROOT) / name; 488 statusObjects.emplace_back( 489 std::make_unique<Status>(event, path.c_str(), *this)); 490 } 491 // The first device is master occ 492 pcap = std::make_unique<open_power::occ::powercap::PowerCap>( 493 *statusObjects.front()); 494 #ifdef POWER10 495 pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH, 496 powermode::PIPS_PATH); 497 // Set the master OCC on the PowerMode object 498 pmode->setMasterOcc(path); 499 #endif 500 } 501 #endif 502 503 #ifdef PLDM 504 void Manager::sbeTimeout(unsigned int instance) 505 { 506 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 507 [instance](const auto& obj) { 508 return instance == obj->getOccInstanceID(); 509 }); 510 511 if (obj != statusObjects.end() && (*obj)->occActive()) 512 { 513 log<level::INFO>( 514 std::format("SBE timeout, requesting HRESET (OCC{})", instance) 515 .c_str()); 516 517 setSBEState(instance, SBE_STATE_NOT_USABLE); 518 519 pldmHandle->sendHRESET(instance); 520 } 521 } 522 523 bool Manager::updateOCCActive(instanceID instance, bool status) 524 { 525 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 526 [instance](const auto& obj) { 527 return instance == obj->getOccInstanceID(); 528 }); 529 530 const bool hostRunning = open_power::occ::utils::isHostRunning(); 531 if (obj != statusObjects.end()) 532 { 533 if (!hostRunning && (status == true)) 534 { 535 log<level::WARNING>( 536 std::format( 537 "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received", 538 instance, status) 539 .c_str()); 540 (*obj)->setPldmSensorReceived(false); 541 if (!waitingForAllOccActiveSensors) 542 { 543 log<level::INFO>( 544 "updateOCCActive: Waiting for Host and all OCC Active Sensors"); 545 waitingForAllOccActiveSensors = true; 546 } 547 #ifdef POWER10 548 discoverTimer->restartOnce(30s); 549 #endif 550 return false; 551 } 552 else 553 { 554 log<level::INFO>(std::format("updateOCCActive: OCC{} active={}", 555 instance, status) 556 .c_str()); 557 (*obj)->setPldmSensorReceived(true); 558 return (*obj)->occActive(status); 559 } 560 } 561 else 562 { 563 if (hostRunning) 564 { 565 log<level::WARNING>( 566 std::format( 567 "updateOCCActive: No status object to update for OCC{} (active={})", 568 instance, status) 569 .c_str()); 570 } 571 else 572 { 573 if (status == true) 574 { 575 log<level::WARNING>( 576 std::format( 577 "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})", 578 instance, status) 579 .c_str()); 580 } 581 } 582 if (status == true) 583 { 584 // OCC went active 585 queuedActiveState.insert(instance); 586 } 587 else 588 { 589 auto match = queuedActiveState.find(instance); 590 if (match != queuedActiveState.end()) 591 { 592 // OCC was disabled 593 queuedActiveState.erase(match); 594 } 595 } 596 return false; 597 } 598 } 599 600 // Called upon pldm event To set powermode Safe Mode State for system. 601 void Manager::updateOccSafeMode(bool safeMode) 602 { 603 #ifdef POWER10 604 pmode->updateDbusSafeMode(safeMode); 605 #endif 606 // Update the processor throttle status on dbus 607 for (auto& obj : statusObjects) 608 { 609 obj->updateThrottle(safeMode, THROTTLED_SAFE); 610 } 611 } 612 613 void Manager::sbeHRESETResult(instanceID instance, bool success) 614 { 615 if (success) 616 { 617 log<level::INFO>( 618 std::format("HRESET succeeded (OCC{})", instance).c_str()); 619 620 setSBEState(instance, SBE_STATE_BOOTED); 621 622 return; 623 } 624 625 setSBEState(instance, SBE_STATE_FAILED); 626 627 if (sbeCanDump(instance)) 628 { 629 log<level::INFO>( 630 std::format("HRESET failed (OCC{}), triggering SBE dump", instance) 631 .c_str()); 632 633 auto& bus = utils::getBus(); 634 uint32_t src6 = instance << 16; 635 uint32_t logId = 636 FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout", 637 src6, "SBE command timeout"); 638 639 try 640 { 641 constexpr auto path = "/org/openpower/dump"; 642 constexpr auto interface = "xyz.openbmc_project.Dump.Create"; 643 constexpr auto function = "CreateDump"; 644 645 std::string service = utils::getService(path, interface); 646 auto method = bus.new_method_call(service.c_str(), path, interface, 647 function); 648 649 std::map<std::string, std::variant<std::string, uint64_t>> 650 createParams{ 651 {"com.ibm.Dump.Create.CreateParameters.ErrorLogId", 652 uint64_t(logId)}, 653 {"com.ibm.Dump.Create.CreateParameters.DumpType", 654 "com.ibm.Dump.Create.DumpType.SBE"}, 655 {"com.ibm.Dump.Create.CreateParameters.FailingUnitId", 656 uint64_t(instance)}, 657 }; 658 659 method.append(createParams); 660 661 auto response = bus.call(method); 662 } 663 catch (const sdbusplus::exception_t& e) 664 { 665 constexpr auto ERROR_DUMP_DISABLED = 666 "xyz.openbmc_project.Dump.Create.Error.Disabled"; 667 if (e.name() == ERROR_DUMP_DISABLED) 668 { 669 log<level::INFO>("Dump is disabled, skipping"); 670 } 671 else 672 { 673 log<level::ERR>("Dump failed"); 674 } 675 } 676 } 677 } 678 679 bool Manager::sbeCanDump(unsigned int instance) 680 { 681 struct pdbg_target* proc = getPdbgTarget(instance); 682 683 if (!proc) 684 { 685 // allow the dump in the error case 686 return true; 687 } 688 689 try 690 { 691 if (!openpower::phal::sbe::isDumpAllowed(proc)) 692 { 693 return false; 694 } 695 696 if (openpower::phal::pdbg::isSbeVitalAttnActive(proc)) 697 { 698 return false; 699 } 700 } 701 catch (openpower::phal::exception::SbeError& e) 702 { 703 log<level::INFO>("Failed to query SBE state"); 704 } 705 706 // allow the dump in the error case 707 return true; 708 } 709 710 void Manager::setSBEState(unsigned int instance, enum sbe_state state) 711 { 712 struct pdbg_target* proc = getPdbgTarget(instance); 713 714 if (!proc) 715 { 716 return; 717 } 718 719 try 720 { 721 openpower::phal::sbe::setState(proc, state); 722 } 723 catch (const openpower::phal::exception::SbeError& e) 724 { 725 log<level::ERR>("Failed to set SBE state"); 726 } 727 } 728 729 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance) 730 { 731 if (!pdbgInitialized) 732 { 733 try 734 { 735 openpower::phal::pdbg::init(); 736 pdbgInitialized = true; 737 } 738 catch (const openpower::phal::exception::PdbgError& e) 739 { 740 log<level::ERR>("pdbg initialization failed"); 741 return nullptr; 742 } 743 } 744 745 struct pdbg_target* proc = nullptr; 746 pdbg_for_each_class_target("proc", proc) 747 { 748 if (pdbg_target_index(proc) == instance) 749 { 750 return proc; 751 } 752 } 753 754 log<level::ERR>("Failed to get pdbg target"); 755 return nullptr; 756 } 757 #endif 758 759 void Manager::pollerTimerExpired() 760 { 761 if (!_pollTimer) 762 { 763 log<level::ERR>( 764 "Manager::pollerTimerExpired() ERROR: Timer not defined"); 765 return; 766 } 767 768 for (auto& obj : statusObjects) 769 { 770 if (!obj->occActive()) 771 { 772 // OCC is not running yet 773 #ifdef READ_OCC_SENSORS 774 auto id = obj->getOccInstanceID(); 775 setSensorValueToNaN(id); 776 #endif 777 continue; 778 } 779 780 // Read sysfs to force kernel to poll OCC 781 obj->readOccState(); 782 783 #ifdef READ_OCC_SENSORS 784 // Read occ sensor values 785 getSensorValues(obj); 786 #endif 787 } 788 789 if (activeCount > 0) 790 { 791 // Restart OCC poll timer 792 _pollTimer->restartOnce(std::chrono::seconds(pollInterval)); 793 } 794 else 795 { 796 // No OCCs running, so poll timer will not be restarted 797 log<level::INFO>( 798 std::format( 799 "Manager::pollerTimerExpired: poll timer will not be restarted") 800 .c_str()); 801 } 802 } 803 804 #ifdef READ_OCC_SENSORS 805 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance) 806 { 807 // There may be more than one sensor with the same FRU type 808 // and label so make two passes: the first to read the temps 809 // from sysfs, and the second to put them on D-Bus after 810 // resolving any conflicts. 811 std::map<std::string, double> sensorData; 812 813 std::regex expr{"temp\\d+_label$"}; // Example: temp5_label 814 for (auto& file : fs::directory_iterator(path)) 815 { 816 if (!std::regex_search(file.path().string(), expr)) 817 { 818 continue; 819 } 820 821 uint32_t labelValue{0}; 822 823 try 824 { 825 labelValue = readFile<uint32_t>(file.path()); 826 } 827 catch (const std::system_error& e) 828 { 829 log<level::DEBUG>( 830 std::format("readTempSensors: Failed reading {}, errno = {}", 831 file.path().string(), e.code().value()) 832 .c_str()); 833 continue; 834 } 835 836 const std::string& tempLabel = "label"; 837 const std::string filePathString = file.path().string().substr( 838 0, file.path().string().length() - tempLabel.length()); 839 840 uint32_t fruTypeValue{0}; 841 try 842 { 843 fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix); 844 } 845 catch (const std::system_error& e) 846 { 847 log<level::DEBUG>( 848 std::format("readTempSensors: Failed reading {}, errno = {}", 849 filePathString + fruTypeSuffix, e.code().value()) 850 .c_str()); 851 continue; 852 } 853 854 std::string sensorPath = OCC_SENSORS_ROOT + 855 std::string("/temperature/"); 856 857 std::string dvfsTempPath; 858 859 if (fruTypeValue == VRMVdd) 860 { 861 sensorPath.append("vrm_vdd" + std::to_string(occInstance) + 862 "_temp"); 863 } 864 else if (fruTypeValue == processorIoRing) 865 { 866 sensorPath.append("proc" + std::to_string(occInstance) + 867 "_ioring_temp"); 868 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" + 869 std::to_string(occInstance) + "_ioring_dvfs_temp"; 870 } 871 else 872 { 873 uint16_t type = (labelValue & 0xFF000000) >> 24; 874 uint16_t instanceID = labelValue & 0x0000FFFF; 875 876 if (type == OCC_DIMM_TEMP_SENSOR_TYPE) 877 { 878 if (fruTypeValue == fruTypeNotAvailable) 879 { 880 // Not all DIMM related temps are available to read 881 // (no _input file in this case) 882 continue; 883 } 884 auto iter = dimmTempSensorName.find(fruTypeValue); 885 if (iter == dimmTempSensorName.end()) 886 { 887 log<level::ERR>( 888 std::format( 889 "readTempSensors: Fru type error! fruTypeValue = {}) ", 890 fruTypeValue) 891 .c_str()); 892 continue; 893 } 894 895 sensorPath.append("dimm" + std::to_string(instanceID) + 896 iter->second); 897 898 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" + 899 dimmDVFSSensorName.at(fruTypeValue); 900 } 901 else if (type == OCC_CPU_TEMP_SENSOR_TYPE) 902 { 903 if (fruTypeValue == processorCore) 904 { 905 // The OCC reports small core temps, of which there are 906 // two per big core. All current P10 systems are in big 907 // core mode, so use a big core name. 908 uint16_t coreNum = instanceID / 2; 909 uint16_t tempNum = instanceID % 2; 910 sensorPath.append("proc" + std::to_string(occInstance) + 911 "_core" + std::to_string(coreNum) + "_" + 912 std::to_string(tempNum) + "_temp"); 913 914 dvfsTempPath = 915 std::string{OCC_SENSORS_ROOT} + "/temperature/proc" + 916 std::to_string(occInstance) + "_core_dvfs_temp"; 917 } 918 else 919 { 920 continue; 921 } 922 } 923 else 924 { 925 continue; 926 } 927 } 928 929 // The dvfs temp file only needs to be read once per chip per type. 930 if (!dvfsTempPath.empty() && 931 !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath)) 932 { 933 try 934 { 935 auto dvfsValue = readFile<double>(filePathString + maxSuffix); 936 937 dbus::OccDBusSensors::getOccDBus().setDvfsTemp( 938 dvfsTempPath, dvfsValue * std::pow(10, -3)); 939 } 940 catch (const std::system_error& e) 941 { 942 log<level::DEBUG>( 943 std::format( 944 "readTempSensors: Failed reading {}, errno = {}", 945 filePathString + maxSuffix, e.code().value()) 946 .c_str()); 947 } 948 } 949 950 uint32_t faultValue{0}; 951 try 952 { 953 faultValue = readFile<uint32_t>(filePathString + faultSuffix); 954 } 955 catch (const std::system_error& e) 956 { 957 log<level::DEBUG>( 958 std::format("readTempSensors: Failed reading {}, errno = {}", 959 filePathString + faultSuffix, e.code().value()) 960 .c_str()); 961 continue; 962 } 963 964 double tempValue{0}; 965 // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1. 966 if (faultValue != 0) 967 { 968 tempValue = std::numeric_limits<double>::quiet_NaN(); 969 } 970 else 971 { 972 // Read the temperature 973 try 974 { 975 tempValue = readFile<double>(filePathString + inputSuffix); 976 } 977 catch (const std::system_error& e) 978 { 979 log<level::DEBUG>( 980 std::format( 981 "readTempSensors: Failed reading {}, errno = {}", 982 filePathString + inputSuffix, e.code().value()) 983 .c_str()); 984 985 // if errno == EAGAIN(Resource temporarily unavailable) then set 986 // temp to 0, to avoid using old temp, and affecting FAN 987 // Control. 988 if (e.code().value() == EAGAIN) 989 { 990 tempValue = 0; 991 } 992 // else the errno would be something like 993 // EBADF(Bad file descriptor) 994 // or ENOENT(No such file or directory) 995 else 996 { 997 continue; 998 } 999 } 1000 } 1001 1002 // If this object path already has a value, only overwite 1003 // it if the previous one was an NaN or a smaller value. 1004 auto existing = sensorData.find(sensorPath); 1005 if (existing != sensorData.end()) 1006 { 1007 // Multiple sensors found for this FRU type 1008 if ((std::isnan(existing->second) && (tempValue == 0)) || 1009 ((existing->second == 0) && std::isnan(tempValue))) 1010 { 1011 // One of the redundant sensors has failed (0xFF/nan), and the 1012 // other sensor has no reading (0), so set the FRU to NaN to 1013 // force fan increase 1014 tempValue = std::numeric_limits<double>::quiet_NaN(); 1015 existing->second = tempValue; 1016 } 1017 if (std::isnan(existing->second) || (tempValue > existing->second)) 1018 { 1019 existing->second = tempValue; 1020 } 1021 } 1022 else 1023 { 1024 // First sensor for this FRU type 1025 sensorData[sensorPath] = tempValue; 1026 } 1027 } 1028 1029 // Now publish the values on D-Bus. 1030 for (const auto& [objectPath, value] : sensorData) 1031 { 1032 dbus::OccDBusSensors::getOccDBus().setValue(objectPath, 1033 value * std::pow(10, -3)); 1034 1035 dbus::OccDBusSensors::getOccDBus().setOperationalStatus( 1036 objectPath, !std::isnan(value)); 1037 1038 if (existingSensors.find(objectPath) == existingSensors.end()) 1039 { 1040 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 1041 objectPath); 1042 } 1043 1044 existingSensors[objectPath] = occInstance; 1045 } 1046 } 1047 1048 std::optional<std::string> 1049 Manager::getPowerLabelFunctionID(const std::string& value) 1050 { 1051 // If the value is "system", then the FunctionID is "system". 1052 if (value == "system") 1053 { 1054 return value; 1055 } 1056 1057 // If the value is not "system", then the label value have 3 numbers, of 1058 // which we only care about the middle one: 1059 // <sensor id>_<function id>_<apss channel> 1060 // eg: The value is "0_10_5" , then the FunctionID is "10". 1061 if (value.find("_") == std::string::npos) 1062 { 1063 return std::nullopt; 1064 } 1065 1066 auto powerLabelValue = value.substr((value.find("_") + 1)); 1067 1068 if (powerLabelValue.find("_") == std::string::npos) 1069 { 1070 return std::nullopt; 1071 } 1072 1073 return powerLabelValue.substr(0, powerLabelValue.find("_")); 1074 } 1075 1076 void Manager::readPowerSensors(const fs::path& path, uint32_t id) 1077 { 1078 std::regex expr{"power\\d+_label$"}; // Example: power5_label 1079 for (auto& file : fs::directory_iterator(path)) 1080 { 1081 if (!std::regex_search(file.path().string(), expr)) 1082 { 1083 continue; 1084 } 1085 1086 std::string labelValue; 1087 try 1088 { 1089 labelValue = readFile<std::string>(file.path()); 1090 } 1091 catch (const std::system_error& e) 1092 { 1093 log<level::DEBUG>( 1094 std::format("readPowerSensors: Failed reading {}, errno = {}", 1095 file.path().string(), e.code().value()) 1096 .c_str()); 1097 continue; 1098 } 1099 1100 auto functionID = getPowerLabelFunctionID(labelValue); 1101 if (functionID == std::nullopt) 1102 { 1103 continue; 1104 } 1105 1106 const std::string& tempLabel = "label"; 1107 const std::string filePathString = file.path().string().substr( 1108 0, file.path().string().length() - tempLabel.length()); 1109 1110 std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/"); 1111 1112 auto iter = powerSensorName.find(*functionID); 1113 if (iter == powerSensorName.end()) 1114 { 1115 continue; 1116 } 1117 sensorPath.append(iter->second); 1118 1119 double tempValue{0}; 1120 1121 try 1122 { 1123 tempValue = readFile<double>(filePathString + inputSuffix); 1124 } 1125 catch (const std::system_error& e) 1126 { 1127 log<level::DEBUG>( 1128 std::format("readPowerSensors: Failed reading {}, errno = {}", 1129 filePathString + inputSuffix, e.code().value()) 1130 .c_str()); 1131 continue; 1132 } 1133 1134 dbus::OccDBusSensors::getOccDBus().setUnit( 1135 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts"); 1136 1137 dbus::OccDBusSensors::getOccDBus().setValue( 1138 sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3)); 1139 1140 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1141 true); 1142 1143 if (existingSensors.find(sensorPath) == existingSensors.end()) 1144 { 1145 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 1146 sensorPath); 1147 } 1148 1149 existingSensors[sensorPath] = id; 1150 } 1151 return; 1152 } 1153 1154 void Manager::setSensorValueToNaN(uint32_t id) const 1155 { 1156 for (const auto& [sensorPath, occId] : existingSensors) 1157 { 1158 if (occId == id) 1159 { 1160 dbus::OccDBusSensors::getOccDBus().setValue( 1161 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1162 1163 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1164 true); 1165 } 1166 } 1167 return; 1168 } 1169 1170 void Manager::setSensorValueToNonFunctional(uint32_t id) const 1171 { 1172 for (const auto& [sensorPath, occId] : existingSensors) 1173 { 1174 if (occId == id) 1175 { 1176 dbus::OccDBusSensors::getOccDBus().setValue( 1177 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1178 1179 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1180 false); 1181 } 1182 } 1183 return; 1184 } 1185 1186 void Manager::getSensorValues(std::unique_ptr<Status>& occ) 1187 { 1188 static bool tracedError[8] = {0}; 1189 const fs::path sensorPath = occ->getHwmonPath(); 1190 const uint32_t id = occ->getOccInstanceID(); 1191 1192 if (fs::exists(sensorPath)) 1193 { 1194 // Read temperature sensors 1195 readTempSensors(sensorPath, id); 1196 1197 if (occ->isMasterOcc()) 1198 { 1199 // Read power sensors 1200 readPowerSensors(sensorPath, id); 1201 } 1202 tracedError[id] = false; 1203 } 1204 else 1205 { 1206 if (!tracedError[id]) 1207 { 1208 log<level::ERR>( 1209 std::format( 1210 "Manager::getSensorValues: OCC{} sensor path missing: {}", 1211 id, sensorPath.c_str()) 1212 .c_str()); 1213 tracedError[id] = true; 1214 } 1215 } 1216 1217 return; 1218 } 1219 #endif 1220 1221 // Read the altitude from DBus 1222 void Manager::readAltitude() 1223 { 1224 static bool traceAltitudeErr = true; 1225 1226 utils::PropertyValue altitudeProperty{}; 1227 try 1228 { 1229 altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE, 1230 ALTITUDE_PROP); 1231 auto sensorVal = std::get<double>(altitudeProperty); 1232 if (sensorVal < 0xFFFF) 1233 { 1234 if (sensorVal < 0) 1235 { 1236 altitude = 0; 1237 } 1238 else 1239 { 1240 // Round to nearest meter 1241 altitude = uint16_t(sensorVal + 0.5); 1242 } 1243 log<level::DEBUG>(std::format("readAltitude: sensor={} ({}m)", 1244 sensorVal, altitude) 1245 .c_str()); 1246 traceAltitudeErr = true; 1247 } 1248 else 1249 { 1250 if (traceAltitudeErr) 1251 { 1252 traceAltitudeErr = false; 1253 log<level::DEBUG>( 1254 std::format("Invalid altitude value: {}", sensorVal) 1255 .c_str()); 1256 } 1257 } 1258 } 1259 catch (const sdbusplus::exception_t& e) 1260 { 1261 if (traceAltitudeErr) 1262 { 1263 traceAltitudeErr = false; 1264 log<level::INFO>( 1265 std::format("Unable to read Altitude: {}", e.what()).c_str()); 1266 } 1267 altitude = 0xFFFF; // not available 1268 } 1269 } 1270 1271 // Callback function when ambient temperature changes 1272 void Manager::ambientCallback(sdbusplus::message_t& msg) 1273 { 1274 double currentTemp = 0; 1275 uint8_t truncatedTemp = 0xFF; 1276 std::string msgSensor; 1277 std::map<std::string, std::variant<double>> msgData; 1278 msg.read(msgSensor, msgData); 1279 1280 auto valPropMap = msgData.find(AMBIENT_PROP); 1281 if (valPropMap == msgData.end()) 1282 { 1283 log<level::DEBUG>("ambientCallback: Unknown ambient property changed"); 1284 return; 1285 } 1286 currentTemp = std::get<double>(valPropMap->second); 1287 if (std::isnan(currentTemp)) 1288 { 1289 truncatedTemp = 0xFF; 1290 } 1291 else 1292 { 1293 if (currentTemp < 0) 1294 { 1295 truncatedTemp = 0; 1296 } 1297 else 1298 { 1299 // Round to nearest degree C 1300 truncatedTemp = uint8_t(currentTemp + 0.5); 1301 } 1302 } 1303 1304 // If ambient changes, notify OCCs 1305 if (truncatedTemp != ambient) 1306 { 1307 log<level::DEBUG>( 1308 std::format("ambientCallback: Ambient change from {} to {}C", 1309 ambient, currentTemp) 1310 .c_str()); 1311 1312 ambient = truncatedTemp; 1313 if (altitude == 0xFFFF) 1314 { 1315 // No altitude yet, try reading again 1316 readAltitude(); 1317 } 1318 1319 log<level::DEBUG>( 1320 std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient, 1321 altitude) 1322 .c_str()); 1323 #ifdef POWER10 1324 // Send ambient and altitude to all OCCs 1325 for (auto& obj : statusObjects) 1326 { 1327 if (obj->occActive()) 1328 { 1329 obj->sendAmbient(ambient, altitude); 1330 } 1331 } 1332 #endif // POWER10 1333 } 1334 } 1335 1336 // return the current ambient and altitude readings 1337 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp, 1338 uint16_t& altitudeValue) const 1339 { 1340 ambientValid = true; 1341 ambientTemp = ambient; 1342 altitudeValue = altitude; 1343 1344 if (ambient == 0xFF) 1345 { 1346 ambientValid = false; 1347 } 1348 } 1349 1350 #ifdef POWER10 1351 // Called when waitForAllOccsTimer expires 1352 // After the first OCC goes active, this timer will be started (60 seconds) 1353 void Manager::occsNotAllRunning() 1354 { 1355 if (activeCount != statusObjects.size()) 1356 { 1357 // Not all OCCs went active 1358 log<level::WARNING>( 1359 std::format( 1360 "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})", 1361 activeCount, statusObjects.size()) 1362 .c_str()); 1363 // Procs may be garded, so may be expected 1364 } 1365 1366 validateOccMaster(); 1367 } 1368 1369 #ifdef PLDM 1370 // Called when throttleTraceTimer expires. 1371 // If this timer expires, that indicates there are no OCC active sensor PDRs 1372 // found which will trigger pldm traces to be throttled and PEL to be created 1373 void Manager::throttleTraceExpired() 1374 { 1375 if (utils::isHostRunning()) 1376 { 1377 // Throttle traces 1378 pldmHandle->setTraceThrottle(true); 1379 // Create PEL 1380 createPldmSensorPEL(); 1381 } 1382 else 1383 { 1384 // Make sure traces are not throttled 1385 pldmHandle->setTraceThrottle(false); 1386 log<level::INFO>( 1387 "throttleTraceExpired(): host it not running ignoring sensor timer"); 1388 } 1389 } 1390 1391 void Manager::createPldmSensorPEL() 1392 { 1393 Error::Descriptor d = Error::Descriptor(MISSING_OCC_SENSORS_PATH); 1394 std::map<std::string, std::string> additionalData; 1395 1396 additionalData.emplace("_PID", std::to_string(getpid())); 1397 1398 log<level::INFO>( 1399 std::format( 1400 "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs") 1401 .c_str()); 1402 1403 auto& bus = utils::getBus(); 1404 1405 try 1406 { 1407 FFDCFiles ffdc; 1408 // Add occ-control journal traces to PEL FFDC 1409 auto occJournalFile = 1410 FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40); 1411 1412 static constexpr auto loggingObjectPath = 1413 "/xyz/openbmc_project/logging"; 1414 static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL"; 1415 std::string service = utils::getService(loggingObjectPath, 1416 opLoggingInterface); 1417 auto method = bus.new_method_call(service.c_str(), loggingObjectPath, 1418 opLoggingInterface, 1419 "CreatePELWithFFDCFiles"); 1420 1421 // Set level to Warning (Predictive). 1422 auto level = 1423 sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage( 1424 sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level:: 1425 Warning); 1426 1427 method.append(d.path, level, additionalData, ffdc); 1428 bus.call(method); 1429 } 1430 catch (const sdbusplus::exception_t& e) 1431 { 1432 log<level::ERR>( 1433 std::format("Failed to create MISSING_OCC_SENSORS PEL: {}", 1434 e.what()) 1435 .c_str()); 1436 } 1437 } 1438 #endif // PLDM 1439 #endif // POWER10 1440 1441 // Verify single master OCC and start presence monitor 1442 void Manager::validateOccMaster() 1443 { 1444 int masterInstance = -1; 1445 for (auto& obj : statusObjects) 1446 { 1447 auto instance = obj->getOccInstanceID(); 1448 #ifdef POWER10 1449 if (!obj->occActive()) 1450 { 1451 if (utils::isHostRunning()) 1452 { 1453 // Check if sensor was queued while waiting for discovery 1454 auto match = queuedActiveState.find(instance); 1455 if (match != queuedActiveState.end()) 1456 { 1457 queuedActiveState.erase(match); 1458 log<level::INFO>( 1459 std::format( 1460 "validateOccMaster: OCC{} is ACTIVE (queued)", 1461 instance) 1462 .c_str()); 1463 obj->occActive(true); 1464 } 1465 else 1466 { 1467 // OCC does not appear to be active yet, check active sensor 1468 #ifdef PLDM 1469 pldmHandle->checkActiveSensor(instance); 1470 #endif 1471 if (obj->occActive()) 1472 { 1473 log<level::INFO>( 1474 std::format( 1475 "validateOccMaster: OCC{} is ACTIVE after reading sensor", 1476 instance) 1477 .c_str()); 1478 } 1479 } 1480 } 1481 else 1482 { 1483 log<level::WARNING>( 1484 std::format( 1485 "validateOccMaster: HOST is not running (OCC{})", 1486 instance) 1487 .c_str()); 1488 return; 1489 } 1490 } 1491 #endif // POWER10 1492 1493 if (obj->isMasterOcc()) 1494 { 1495 obj->addPresenceWatchMaster(); 1496 1497 if (masterInstance == -1) 1498 { 1499 masterInstance = instance; 1500 } 1501 else 1502 { 1503 log<level::ERR>( 1504 std::format( 1505 "validateOccMaster: Multiple OCC masters! ({} and {})", 1506 masterInstance, instance) 1507 .c_str()); 1508 // request reset 1509 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH)); 1510 } 1511 } 1512 } 1513 1514 if (masterInstance < 0) 1515 { 1516 log<level::ERR>( 1517 std::format("validateOccMaster: Master OCC not found! (of {} OCCs)", 1518 statusObjects.size()) 1519 .c_str()); 1520 // request reset 1521 statusObjects.front()->deviceError( 1522 Error::Descriptor(PRESENCE_ERROR_PATH)); 1523 } 1524 else 1525 { 1526 log<level::INFO>( 1527 std::format("validateOccMaster: OCC{} is master of {} OCCs", 1528 masterInstance, activeCount) 1529 .c_str()); 1530 #ifdef POWER10 1531 pmode->updateDbusSafeMode(false); 1532 #endif 1533 } 1534 } 1535 1536 void Manager::updatePcapBounds() const 1537 { 1538 if (pcap) 1539 { 1540 pcap->updatePcapBounds(); 1541 } 1542 } 1543 1544 } // namespace occ 1545 } // namespace open_power 1546