1 #include "config.h" 2 3 #include "occ_manager.hpp" 4 5 #include "i2c_occ.hpp" 6 #include "occ_dbus.hpp" 7 #include "occ_errors.hpp" 8 #include "utils.hpp" 9 10 #include <phosphor-logging/elog-errors.hpp> 11 #include <phosphor-logging/log.hpp> 12 #include <xyz/openbmc_project/Common/error.hpp> 13 14 #include <chrono> 15 #include <cmath> 16 #include <filesystem> 17 #include <fstream> 18 #include <regex> 19 20 namespace open_power 21 { 22 namespace occ 23 { 24 25 constexpr uint32_t fruTypeNotAvailable = 0xFF; 26 constexpr auto fruTypeSuffix = "fru_type"; 27 constexpr auto faultSuffix = "fault"; 28 constexpr auto inputSuffix = "input"; 29 constexpr auto maxSuffix = "max"; 30 31 const auto HOST_ON_FILE = "/run/openbmc/host@0-on"; 32 33 using namespace phosphor::logging; 34 using namespace std::literals::chrono_literals; 35 36 template <typename T> 37 T readFile(const std::string& path) 38 { 39 std::ifstream ifs; 40 ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit | 41 std::ifstream::eofbit); 42 T data; 43 44 try 45 { 46 ifs.open(path); 47 ifs >> data; 48 ifs.close(); 49 } 50 catch (const std::exception& e) 51 { 52 auto err = errno; 53 throw std::system_error(err, std::generic_category()); 54 } 55 56 return data; 57 } 58 59 // findAndCreateObjects(): 60 // Takes care of getting the required objects created and 61 // finds the available devices/processors. 62 // (function is called everytime the discoverTimer expires) 63 // - create the PowerMode object to control OCC modes 64 // - create statusObjects for each OCC device found 65 // - waits for OCC Active sensors PDRs to become available 66 // - restart discoverTimer if all data is not available yet 67 void Manager::findAndCreateObjects() 68 { 69 #ifndef POWER10 70 for (auto id = 0; id < MAX_CPUS; ++id) 71 { 72 // Create one occ per cpu 73 auto occ = std::string(OCC_NAME) + std::to_string(id); 74 createObjects(occ); 75 } 76 #else 77 if (!pmode) 78 { 79 // Create the power mode object 80 pmode = std::make_unique<powermode::PowerMode>( 81 *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event); 82 } 83 84 if (!fs::exists(HOST_ON_FILE)) 85 { 86 static bool statusObjCreated = false; 87 if (!statusObjCreated) 88 { 89 // Create the OCCs based on on the /dev/occX devices 90 auto occs = findOCCsInDev(); 91 92 if (occs.empty() || (prevOCCSearch.size() != occs.size())) 93 { 94 // Something changed or no OCCs yet, try again in 10s. 95 // Note on the first pass prevOCCSearch will be empty, 96 // so there will be at least one delay to give things 97 // a chance to settle. 98 prevOCCSearch = occs; 99 100 log<level::INFO>( 101 std::format( 102 "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})", 103 occs.size()) 104 .c_str()); 105 106 discoverTimer->restartOnce(10s); 107 } 108 else 109 { 110 // All OCCs appear to be available, create status objects 111 112 // createObjects requires OCC0 first. 113 std::sort(occs.begin(), occs.end()); 114 115 log<level::INFO>( 116 std::format( 117 "Manager::findAndCreateObjects(): Creating {} OCC Status Objects", 118 occs.size()) 119 .c_str()); 120 for (auto id : occs) 121 { 122 createObjects(std::string(OCC_NAME) + std::to_string(id)); 123 } 124 statusObjCreated = true; 125 waitingForAllOccActiveSensors = true; 126 127 // Find/update the processor path associated with each OCC 128 for (auto& obj : statusObjects) 129 { 130 obj->updateProcAssociation(); 131 } 132 } 133 } 134 135 if (statusObjCreated && waitingForAllOccActiveSensors) 136 { 137 static bool tracedHostWait = false; 138 if (utils::isHostRunning()) 139 { 140 if (tracedHostWait) 141 { 142 log<level::INFO>( 143 "Manager::findAndCreateObjects(): Host is running"); 144 tracedHostWait = false; 145 } 146 checkAllActiveSensors(); 147 } 148 else 149 { 150 if (!tracedHostWait) 151 { 152 log<level::INFO>( 153 "Manager::findAndCreateObjects(): Waiting for host to start"); 154 tracedHostWait = true; 155 } 156 discoverTimer->restartOnce(30s); 157 #ifdef PLDM 158 if (throttlePldmTraceTimer->isEnabled()) 159 { 160 // Host is no longer running, disable throttle timer and 161 // make sure traces are not throttled 162 log<level::INFO>( 163 "findAndCreateObjects(): disabling sensor timer"); 164 throttlePldmTraceTimer->setEnabled(false); 165 pldmHandle->setTraceThrottle(false); 166 } 167 #endif 168 } 169 } 170 } 171 else 172 { 173 log<level::INFO>( 174 std::format( 175 "Manager::findAndCreateObjects(): Waiting for {} to complete...", 176 HOST_ON_FILE) 177 .c_str()); 178 discoverTimer->restartOnce(10s); 179 } 180 #endif 181 } 182 183 #ifdef POWER10 184 // Check if all occActive sensors are available 185 void Manager::checkAllActiveSensors() 186 { 187 static bool allActiveSensorAvailable = false; 188 static bool tracedSensorWait = false; 189 static bool waitingForHost = false; 190 191 if (open_power::occ::utils::isHostRunning()) 192 { 193 if (waitingForHost) 194 { 195 waitingForHost = false; 196 log<level::INFO>("checkAllActiveSensors(): Host is now running"); 197 } 198 199 // Start with the assumption that all are available 200 allActiveSensorAvailable = true; 201 for (auto& obj : statusObjects) 202 { 203 if ((!obj->occActive()) && (!obj->getPldmSensorReceived())) 204 { 205 auto instance = obj->getOccInstanceID(); 206 // Check if sensor was queued while waiting for discovery 207 auto match = queuedActiveState.find(instance); 208 if (match != queuedActiveState.end()) 209 { 210 queuedActiveState.erase(match); 211 log<level::INFO>( 212 std::format( 213 "checkAllActiveSensors(): OCC{} is ACTIVE (queued)", 214 instance) 215 .c_str()); 216 obj->occActive(true); 217 } 218 else 219 { 220 allActiveSensorAvailable = false; 221 if (!tracedSensorWait) 222 { 223 log<level::INFO>( 224 std::format( 225 "checkAllActiveSensors(): Waiting on OCC{} Active sensor", 226 instance) 227 .c_str()); 228 tracedSensorWait = true; 229 #ifdef PLDM 230 // Make sure PLDM traces are not throttled 231 pldmHandle->setTraceThrottle(false); 232 // Start timer to throttle PLDM traces when timer 233 // expires 234 onPldmTimeoutCreatePel = false; 235 throttlePldmTraceTimer->restartOnce(5min); 236 #endif 237 } 238 #ifdef PLDM 239 pldmHandle->checkActiveSensor(obj->getOccInstanceID()); 240 #endif 241 break; 242 } 243 } 244 } 245 } 246 else 247 { 248 if (!waitingForHost) 249 { 250 waitingForHost = true; 251 log<level::INFO>( 252 "checkAllActiveSensors(): Waiting for host to start"); 253 #ifdef PLDM 254 if (throttlePldmTraceTimer->isEnabled()) 255 { 256 // Host is no longer running, disable throttle timer and 257 // make sure traces are not throttled 258 log<level::INFO>( 259 "checkAllActiveSensors(): disabling sensor timer"); 260 throttlePldmTraceTimer->setEnabled(false); 261 pldmHandle->setTraceThrottle(false); 262 } 263 #endif 264 } 265 } 266 267 if (allActiveSensorAvailable) 268 { 269 // All sensors were found, disable the discovery timer 270 if (discoverTimer->isEnabled()) 271 { 272 discoverTimer->setEnabled(false); 273 } 274 #ifdef PLDM 275 if (throttlePldmTraceTimer->isEnabled()) 276 { 277 // Disable throttle timer and make sure traces are not throttled 278 throttlePldmTraceTimer->setEnabled(false); 279 pldmHandle->setTraceThrottle(false); 280 } 281 #endif 282 if (waitingForAllOccActiveSensors) 283 { 284 log<level::INFO>( 285 "checkAllActiveSensors(): OCC Active sensors are available"); 286 waitingForAllOccActiveSensors = false; 287 } 288 queuedActiveState.clear(); 289 tracedSensorWait = false; 290 } 291 else 292 { 293 // Not all sensors were available, so keep waiting 294 if (!tracedSensorWait) 295 { 296 log<level::INFO>( 297 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available"); 298 tracedSensorWait = true; 299 } 300 discoverTimer->restartOnce(10s); 301 } 302 } 303 #endif 304 305 std::vector<int> Manager::findOCCsInDev() 306 { 307 std::vector<int> occs; 308 std::regex expr{R"(occ(\d+)$)"}; 309 310 for (auto& file : fs::directory_iterator("/dev")) 311 { 312 std::smatch match; 313 std::string path{file.path().string()}; 314 if (std::regex_search(path, match, expr)) 315 { 316 auto num = std::stoi(match[1].str()); 317 318 // /dev numbering starts at 1, ours starts at 0. 319 occs.push_back(num - 1); 320 } 321 } 322 323 return occs; 324 } 325 326 int Manager::cpuCreated(sdbusplus::message_t& msg) 327 { 328 namespace fs = std::filesystem; 329 330 sdbusplus::message::object_path o; 331 msg.read(o); 332 fs::path cpuPath(std::string(std::move(o))); 333 334 auto name = cpuPath.filename().string(); 335 auto index = name.find(CPU_NAME); 336 name.replace(index, std::strlen(CPU_NAME), OCC_NAME); 337 338 createObjects(name); 339 340 return 0; 341 } 342 343 void Manager::createObjects(const std::string& occ) 344 { 345 auto path = fs::path(OCC_CONTROL_ROOT) / occ; 346 347 statusObjects.emplace_back(std::make_unique<Status>( 348 event, path.c_str(), *this, 349 #ifdef POWER10 350 pmode, 351 #endif 352 std::bind(std::mem_fn(&Manager::statusCallBack), this, 353 std::placeholders::_1, std::placeholders::_2) 354 #ifdef PLDM 355 , 356 std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(), 357 std::placeholders::_1) 358 #endif 359 )); 360 361 // Create the power cap monitor object 362 if (!pcap) 363 { 364 pcap = std::make_unique<open_power::occ::powercap::PowerCap>( 365 *statusObjects.back()); 366 } 367 368 if (statusObjects.back()->isMasterOcc()) 369 { 370 log<level::INFO>( 371 std::format("Manager::createObjects(): OCC{} is the master", 372 statusObjects.back()->getOccInstanceID()) 373 .c_str()); 374 _pollTimer->setEnabled(false); 375 376 #ifdef POWER10 377 // Set the master OCC on the PowerMode object 378 pmode->setMasterOcc(path); 379 #endif 380 } 381 382 passThroughObjects.emplace_back(std::make_unique<PassThrough>(path.c_str() 383 #ifdef POWER10 384 , 385 pmode 386 #endif 387 )); 388 } 389 390 void Manager::statusCallBack(instanceID instance, bool status) 391 { 392 if (status == true) 393 { 394 // OCC went active 395 ++activeCount; 396 397 #ifdef POWER10 398 if (activeCount == 1) 399 { 400 // First OCC went active (allow some time for all OCCs to go active) 401 waitForAllOccsTimer->restartOnce(60s); 402 } 403 #endif 404 405 if (activeCount == statusObjects.size()) 406 { 407 #ifdef POWER10 408 // All OCCs are now running 409 if (waitForAllOccsTimer->isEnabled()) 410 { 411 // stop occ wait timer 412 waitForAllOccsTimer->setEnabled(false); 413 } 414 #endif 415 416 // Verify master OCC and start presence monitor 417 validateOccMaster(); 418 } 419 420 // Start poll timer if not already started 421 if (!_pollTimer->isEnabled()) 422 { 423 log<level::INFO>( 424 std::format("Manager: OCCs will be polled every {} seconds", 425 pollInterval) 426 .c_str()); 427 428 // Send poll and start OCC poll timer 429 pollerTimerExpired(); 430 } 431 } 432 else 433 { 434 // OCC went away 435 if (activeCount > 0) 436 { 437 --activeCount; 438 } 439 else 440 { 441 log<level::ERR>( 442 std::format("OCC{} disabled, but currently no active OCCs", 443 instance) 444 .c_str()); 445 } 446 447 if (activeCount == 0) 448 { 449 // No OCCs are running 450 451 // Stop OCC poll timer 452 if (_pollTimer->isEnabled()) 453 { 454 log<level::INFO>( 455 "Manager::statusCallBack(): OCCs are not running, stopping poll timer"); 456 _pollTimer->setEnabled(false); 457 } 458 459 #ifdef POWER10 460 // stop wait timer 461 if (waitForAllOccsTimer->isEnabled()) 462 { 463 waitForAllOccsTimer->setEnabled(false); 464 } 465 #endif 466 } 467 #ifdef READ_OCC_SENSORS 468 // Clear OCC sensors 469 setSensorValueToNaN(instance); 470 #endif 471 } 472 473 #ifdef POWER10 474 if (waitingForAllOccActiveSensors) 475 { 476 if (utils::isHostRunning()) 477 { 478 checkAllActiveSensors(); 479 } 480 } 481 #endif 482 } 483 484 #ifdef I2C_OCC 485 void Manager::initStatusObjects() 486 { 487 // Make sure we have a valid path string 488 static_assert(sizeof(DEV_PATH) != 0); 489 490 auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH); 491 for (auto& name : deviceNames) 492 { 493 i2c_occ::i2cToDbus(name); 494 name = std::string(OCC_NAME) + '_' + name; 495 auto path = fs::path(OCC_CONTROL_ROOT) / name; 496 statusObjects.emplace_back( 497 std::make_unique<Status>(event, path.c_str(), *this)); 498 } 499 // The first device is master occ 500 pcap = std::make_unique<open_power::occ::powercap::PowerCap>( 501 *statusObjects.front()); 502 #ifdef POWER10 503 pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH, 504 powermode::PIPS_PATH); 505 // Set the master OCC on the PowerMode object 506 pmode->setMasterOcc(path); 507 #endif 508 } 509 #endif 510 511 #ifdef PLDM 512 void Manager::sbeTimeout(unsigned int instance) 513 { 514 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 515 [instance](const auto& obj) { 516 return instance == obj->getOccInstanceID(); 517 }); 518 519 if (obj != statusObjects.end() && (*obj)->occActive()) 520 { 521 log<level::INFO>( 522 std::format("SBE timeout, requesting HRESET (OCC{})", instance) 523 .c_str()); 524 525 setSBEState(instance, SBE_STATE_NOT_USABLE); 526 527 pldmHandle->sendHRESET(instance); 528 } 529 } 530 531 bool Manager::updateOCCActive(instanceID instance, bool status) 532 { 533 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 534 [instance](const auto& obj) { 535 return instance == obj->getOccInstanceID(); 536 }); 537 538 const bool hostRunning = open_power::occ::utils::isHostRunning(); 539 if (obj != statusObjects.end()) 540 { 541 if (!hostRunning && (status == true)) 542 { 543 log<level::WARNING>( 544 std::format( 545 "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received", 546 instance, status) 547 .c_str()); 548 (*obj)->setPldmSensorReceived(false); 549 if (!waitingForAllOccActiveSensors) 550 { 551 log<level::INFO>( 552 "updateOCCActive: Waiting for Host and all OCC Active Sensors"); 553 waitingForAllOccActiveSensors = true; 554 } 555 #ifdef POWER10 556 discoverTimer->restartOnce(30s); 557 #endif 558 return false; 559 } 560 else 561 { 562 log<level::INFO>(std::format("updateOCCActive: OCC{} active={}", 563 instance, status) 564 .c_str()); 565 (*obj)->setPldmSensorReceived(true); 566 return (*obj)->occActive(status); 567 } 568 } 569 else 570 { 571 if (hostRunning) 572 { 573 log<level::WARNING>( 574 std::format( 575 "updateOCCActive: No status object to update for OCC{} (active={})", 576 instance, status) 577 .c_str()); 578 } 579 else 580 { 581 if (status == true) 582 { 583 log<level::WARNING>( 584 std::format( 585 "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})", 586 instance, status) 587 .c_str()); 588 } 589 } 590 if (status == true) 591 { 592 // OCC went active 593 queuedActiveState.insert(instance); 594 } 595 else 596 { 597 auto match = queuedActiveState.find(instance); 598 if (match != queuedActiveState.end()) 599 { 600 // OCC was disabled 601 queuedActiveState.erase(match); 602 } 603 } 604 return false; 605 } 606 } 607 608 // Called upon pldm event To set powermode Safe Mode State for system. 609 void Manager::updateOccSafeMode(bool safeMode) 610 { 611 #ifdef POWER10 612 pmode->updateDbusSafeMode(safeMode); 613 #endif 614 // Update the processor throttle status on dbus 615 for (auto& obj : statusObjects) 616 { 617 obj->updateThrottle(safeMode, THROTTLED_SAFE); 618 } 619 } 620 621 void Manager::sbeHRESETResult(instanceID instance, bool success) 622 { 623 if (success) 624 { 625 log<level::INFO>( 626 std::format("HRESET succeeded (OCC{})", instance).c_str()); 627 628 setSBEState(instance, SBE_STATE_BOOTED); 629 630 return; 631 } 632 633 setSBEState(instance, SBE_STATE_FAILED); 634 635 if (sbeCanDump(instance)) 636 { 637 log<level::INFO>( 638 std::format("HRESET failed (OCC{}), triggering SBE dump", instance) 639 .c_str()); 640 641 auto& bus = utils::getBus(); 642 uint32_t src6 = instance << 16; 643 uint32_t logId = 644 FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout", 645 src6, "SBE command timeout"); 646 647 try 648 { 649 constexpr auto interface = "xyz.openbmc_project.Dump.Create"; 650 constexpr auto function = "CreateDump"; 651 652 std::string service = utils::getService(OP_DUMP_OBJ_PATH, 653 interface); 654 auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH, 655 interface, function); 656 657 std::map<std::string, std::variant<std::string, uint64_t>> 658 createParams{ 659 {"com.ibm.Dump.Create.CreateParameters.ErrorLogId", 660 uint64_t(logId)}, 661 {"com.ibm.Dump.Create.CreateParameters.DumpType", 662 "com.ibm.Dump.Create.DumpType.SBE"}, 663 {"com.ibm.Dump.Create.CreateParameters.FailingUnitId", 664 uint64_t(instance)}, 665 }; 666 667 method.append(createParams); 668 669 auto response = bus.call(method); 670 } 671 catch (const sdbusplus::exception_t& e) 672 { 673 constexpr auto ERROR_DUMP_DISABLED = 674 "xyz.openbmc_project.Dump.Create.Error.Disabled"; 675 if (e.name() == ERROR_DUMP_DISABLED) 676 { 677 log<level::INFO>("Dump is disabled, skipping"); 678 } 679 else 680 { 681 log<level::ERR>("Dump failed"); 682 } 683 } 684 } 685 } 686 687 bool Manager::sbeCanDump(unsigned int instance) 688 { 689 struct pdbg_target* proc = getPdbgTarget(instance); 690 691 if (!proc) 692 { 693 // allow the dump in the error case 694 return true; 695 } 696 697 try 698 { 699 if (!openpower::phal::sbe::isDumpAllowed(proc)) 700 { 701 return false; 702 } 703 704 if (openpower::phal::pdbg::isSbeVitalAttnActive(proc)) 705 { 706 return false; 707 } 708 } 709 catch (openpower::phal::exception::SbeError& e) 710 { 711 log<level::INFO>("Failed to query SBE state"); 712 } 713 714 // allow the dump in the error case 715 return true; 716 } 717 718 void Manager::setSBEState(unsigned int instance, enum sbe_state state) 719 { 720 struct pdbg_target* proc = getPdbgTarget(instance); 721 722 if (!proc) 723 { 724 return; 725 } 726 727 try 728 { 729 openpower::phal::sbe::setState(proc, state); 730 } 731 catch (const openpower::phal::exception::SbeError& e) 732 { 733 log<level::ERR>("Failed to set SBE state"); 734 } 735 } 736 737 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance) 738 { 739 if (!pdbgInitialized) 740 { 741 try 742 { 743 openpower::phal::pdbg::init(); 744 pdbgInitialized = true; 745 } 746 catch (const openpower::phal::exception::PdbgError& e) 747 { 748 log<level::ERR>("pdbg initialization failed"); 749 return nullptr; 750 } 751 } 752 753 struct pdbg_target* proc = nullptr; 754 pdbg_for_each_class_target("proc", proc) 755 { 756 if (pdbg_target_index(proc) == instance) 757 { 758 return proc; 759 } 760 } 761 762 log<level::ERR>("Failed to get pdbg target"); 763 return nullptr; 764 } 765 #endif 766 767 void Manager::pollerTimerExpired() 768 { 769 if (!_pollTimer) 770 { 771 log<level::ERR>( 772 "Manager::pollerTimerExpired() ERROR: Timer not defined"); 773 return; 774 } 775 776 for (auto& obj : statusObjects) 777 { 778 if (!obj->occActive()) 779 { 780 // OCC is not running yet 781 #ifdef READ_OCC_SENSORS 782 auto id = obj->getOccInstanceID(); 783 setSensorValueToNaN(id); 784 #endif 785 continue; 786 } 787 788 // Read sysfs to force kernel to poll OCC 789 obj->readOccState(); 790 791 #ifdef READ_OCC_SENSORS 792 // Read occ sensor values 793 getSensorValues(obj); 794 #endif 795 } 796 797 if (activeCount > 0) 798 { 799 // Restart OCC poll timer 800 _pollTimer->restartOnce(std::chrono::seconds(pollInterval)); 801 } 802 else 803 { 804 // No OCCs running, so poll timer will not be restarted 805 log<level::INFO>( 806 std::format( 807 "Manager::pollerTimerExpired: poll timer will not be restarted") 808 .c_str()); 809 } 810 } 811 812 #ifdef READ_OCC_SENSORS 813 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance) 814 { 815 // There may be more than one sensor with the same FRU type 816 // and label so make two passes: the first to read the temps 817 // from sysfs, and the second to put them on D-Bus after 818 // resolving any conflicts. 819 std::map<std::string, double> sensorData; 820 821 std::regex expr{"temp\\d+_label$"}; // Example: temp5_label 822 for (auto& file : fs::directory_iterator(path)) 823 { 824 if (!std::regex_search(file.path().string(), expr)) 825 { 826 continue; 827 } 828 829 uint32_t labelValue{0}; 830 831 try 832 { 833 labelValue = readFile<uint32_t>(file.path()); 834 } 835 catch (const std::system_error& e) 836 { 837 log<level::DEBUG>( 838 std::format("readTempSensors: Failed reading {}, errno = {}", 839 file.path().string(), e.code().value()) 840 .c_str()); 841 continue; 842 } 843 844 const std::string& tempLabel = "label"; 845 const std::string filePathString = file.path().string().substr( 846 0, file.path().string().length() - tempLabel.length()); 847 848 uint32_t fruTypeValue{0}; 849 try 850 { 851 fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix); 852 } 853 catch (const std::system_error& e) 854 { 855 log<level::DEBUG>( 856 std::format("readTempSensors: Failed reading {}, errno = {}", 857 filePathString + fruTypeSuffix, e.code().value()) 858 .c_str()); 859 continue; 860 } 861 862 std::string sensorPath = OCC_SENSORS_ROOT + 863 std::string("/temperature/"); 864 865 std::string dvfsTempPath; 866 867 if (fruTypeValue == VRMVdd) 868 { 869 sensorPath.append("vrm_vdd" + std::to_string(occInstance) + 870 "_temp"); 871 } 872 else if (fruTypeValue == processorIoRing) 873 { 874 sensorPath.append("proc" + std::to_string(occInstance) + 875 "_ioring_temp"); 876 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" + 877 std::to_string(occInstance) + "_ioring_dvfs_temp"; 878 } 879 else 880 { 881 uint16_t type = (labelValue & 0xFF000000) >> 24; 882 uint16_t instanceID = labelValue & 0x0000FFFF; 883 884 if (type == OCC_DIMM_TEMP_SENSOR_TYPE) 885 { 886 if (fruTypeValue == fruTypeNotAvailable) 887 { 888 // Not all DIMM related temps are available to read 889 // (no _input file in this case) 890 continue; 891 } 892 auto iter = dimmTempSensorName.find(fruTypeValue); 893 if (iter == dimmTempSensorName.end()) 894 { 895 log<level::ERR>( 896 std::format( 897 "readTempSensors: Fru type error! fruTypeValue = {}) ", 898 fruTypeValue) 899 .c_str()); 900 continue; 901 } 902 903 sensorPath.append("dimm" + std::to_string(instanceID) + 904 iter->second); 905 906 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" + 907 dimmDVFSSensorName.at(fruTypeValue); 908 } 909 else if (type == OCC_CPU_TEMP_SENSOR_TYPE) 910 { 911 if (fruTypeValue == processorCore) 912 { 913 // The OCC reports small core temps, of which there are 914 // two per big core. All current P10 systems are in big 915 // core mode, so use a big core name. 916 uint16_t coreNum = instanceID / 2; 917 uint16_t tempNum = instanceID % 2; 918 sensorPath.append("proc" + std::to_string(occInstance) + 919 "_core" + std::to_string(coreNum) + "_" + 920 std::to_string(tempNum) + "_temp"); 921 922 dvfsTempPath = 923 std::string{OCC_SENSORS_ROOT} + "/temperature/proc" + 924 std::to_string(occInstance) + "_core_dvfs_temp"; 925 } 926 else 927 { 928 continue; 929 } 930 } 931 else 932 { 933 continue; 934 } 935 } 936 937 // The dvfs temp file only needs to be read once per chip per type. 938 if (!dvfsTempPath.empty() && 939 !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath)) 940 { 941 try 942 { 943 auto dvfsValue = readFile<double>(filePathString + maxSuffix); 944 945 dbus::OccDBusSensors::getOccDBus().setDvfsTemp( 946 dvfsTempPath, dvfsValue * std::pow(10, -3)); 947 } 948 catch (const std::system_error& e) 949 { 950 log<level::DEBUG>( 951 std::format( 952 "readTempSensors: Failed reading {}, errno = {}", 953 filePathString + maxSuffix, e.code().value()) 954 .c_str()); 955 } 956 } 957 958 uint32_t faultValue{0}; 959 try 960 { 961 faultValue = readFile<uint32_t>(filePathString + faultSuffix); 962 } 963 catch (const std::system_error& e) 964 { 965 log<level::DEBUG>( 966 std::format("readTempSensors: Failed reading {}, errno = {}", 967 filePathString + faultSuffix, e.code().value()) 968 .c_str()); 969 continue; 970 } 971 972 double tempValue{0}; 973 // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1. 974 if (faultValue != 0) 975 { 976 tempValue = std::numeric_limits<double>::quiet_NaN(); 977 } 978 else 979 { 980 // Read the temperature 981 try 982 { 983 tempValue = readFile<double>(filePathString + inputSuffix); 984 } 985 catch (const std::system_error& e) 986 { 987 log<level::DEBUG>( 988 std::format( 989 "readTempSensors: Failed reading {}, errno = {}", 990 filePathString + inputSuffix, e.code().value()) 991 .c_str()); 992 993 // if errno == EAGAIN(Resource temporarily unavailable) then set 994 // temp to 0, to avoid using old temp, and affecting FAN 995 // Control. 996 if (e.code().value() == EAGAIN) 997 { 998 tempValue = 0; 999 } 1000 // else the errno would be something like 1001 // EBADF(Bad file descriptor) 1002 // or ENOENT(No such file or directory) 1003 else 1004 { 1005 continue; 1006 } 1007 } 1008 } 1009 1010 // If this object path already has a value, only overwite 1011 // it if the previous one was an NaN or a smaller value. 1012 auto existing = sensorData.find(sensorPath); 1013 if (existing != sensorData.end()) 1014 { 1015 // Multiple sensors found for this FRU type 1016 if ((std::isnan(existing->second) && (tempValue == 0)) || 1017 ((existing->second == 0) && std::isnan(tempValue))) 1018 { 1019 // One of the redundant sensors has failed (0xFF/nan), and the 1020 // other sensor has no reading (0), so set the FRU to NaN to 1021 // force fan increase 1022 tempValue = std::numeric_limits<double>::quiet_NaN(); 1023 existing->second = tempValue; 1024 } 1025 if (std::isnan(existing->second) || (tempValue > existing->second)) 1026 { 1027 existing->second = tempValue; 1028 } 1029 } 1030 else 1031 { 1032 // First sensor for this FRU type 1033 sensorData[sensorPath] = tempValue; 1034 } 1035 } 1036 1037 // Now publish the values on D-Bus. 1038 for (const auto& [objectPath, value] : sensorData) 1039 { 1040 dbus::OccDBusSensors::getOccDBus().setValue(objectPath, 1041 value * std::pow(10, -3)); 1042 1043 dbus::OccDBusSensors::getOccDBus().setOperationalStatus( 1044 objectPath, !std::isnan(value)); 1045 1046 if (existingSensors.find(objectPath) == existingSensors.end()) 1047 { 1048 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 1049 objectPath); 1050 } 1051 1052 existingSensors[objectPath] = occInstance; 1053 } 1054 } 1055 1056 std::optional<std::string> 1057 Manager::getPowerLabelFunctionID(const std::string& value) 1058 { 1059 // If the value is "system", then the FunctionID is "system". 1060 if (value == "system") 1061 { 1062 return value; 1063 } 1064 1065 // If the value is not "system", then the label value have 3 numbers, of 1066 // which we only care about the middle one: 1067 // <sensor id>_<function id>_<apss channel> 1068 // eg: The value is "0_10_5" , then the FunctionID is "10". 1069 if (value.find("_") == std::string::npos) 1070 { 1071 return std::nullopt; 1072 } 1073 1074 auto powerLabelValue = value.substr((value.find("_") + 1)); 1075 1076 if (powerLabelValue.find("_") == std::string::npos) 1077 { 1078 return std::nullopt; 1079 } 1080 1081 return powerLabelValue.substr(0, powerLabelValue.find("_")); 1082 } 1083 1084 void Manager::readPowerSensors(const fs::path& path, uint32_t id) 1085 { 1086 std::regex expr{"power\\d+_label$"}; // Example: power5_label 1087 for (auto& file : fs::directory_iterator(path)) 1088 { 1089 if (!std::regex_search(file.path().string(), expr)) 1090 { 1091 continue; 1092 } 1093 1094 std::string labelValue; 1095 try 1096 { 1097 labelValue = readFile<std::string>(file.path()); 1098 } 1099 catch (const std::system_error& e) 1100 { 1101 log<level::DEBUG>( 1102 std::format("readPowerSensors: Failed reading {}, errno = {}", 1103 file.path().string(), e.code().value()) 1104 .c_str()); 1105 continue; 1106 } 1107 1108 auto functionID = getPowerLabelFunctionID(labelValue); 1109 if (functionID == std::nullopt) 1110 { 1111 continue; 1112 } 1113 1114 const std::string& tempLabel = "label"; 1115 const std::string filePathString = file.path().string().substr( 1116 0, file.path().string().length() - tempLabel.length()); 1117 1118 std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/"); 1119 1120 auto iter = powerSensorName.find(*functionID); 1121 if (iter == powerSensorName.end()) 1122 { 1123 continue; 1124 } 1125 sensorPath.append(iter->second); 1126 1127 double tempValue{0}; 1128 1129 try 1130 { 1131 tempValue = readFile<double>(filePathString + inputSuffix); 1132 } 1133 catch (const std::system_error& e) 1134 { 1135 log<level::DEBUG>( 1136 std::format("readPowerSensors: Failed reading {}, errno = {}", 1137 filePathString + inputSuffix, e.code().value()) 1138 .c_str()); 1139 continue; 1140 } 1141 1142 dbus::OccDBusSensors::getOccDBus().setUnit( 1143 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts"); 1144 1145 dbus::OccDBusSensors::getOccDBus().setValue( 1146 sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3)); 1147 1148 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1149 true); 1150 1151 if (existingSensors.find(sensorPath) == existingSensors.end()) 1152 { 1153 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 1154 sensorPath); 1155 } 1156 1157 existingSensors[sensorPath] = id; 1158 } 1159 return; 1160 } 1161 1162 void Manager::setSensorValueToNaN(uint32_t id) const 1163 { 1164 for (const auto& [sensorPath, occId] : existingSensors) 1165 { 1166 if (occId == id) 1167 { 1168 dbus::OccDBusSensors::getOccDBus().setValue( 1169 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1170 1171 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1172 true); 1173 } 1174 } 1175 return; 1176 } 1177 1178 void Manager::setSensorValueToNonFunctional(uint32_t id) const 1179 { 1180 for (const auto& [sensorPath, occId] : existingSensors) 1181 { 1182 if (occId == id) 1183 { 1184 dbus::OccDBusSensors::getOccDBus().setValue( 1185 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1186 1187 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath, 1188 false); 1189 } 1190 } 1191 return; 1192 } 1193 1194 void Manager::getSensorValues(std::unique_ptr<Status>& occ) 1195 { 1196 static bool tracedError[8] = {0}; 1197 const fs::path sensorPath = occ->getHwmonPath(); 1198 const uint32_t id = occ->getOccInstanceID(); 1199 1200 if (fs::exists(sensorPath)) 1201 { 1202 // Read temperature sensors 1203 readTempSensors(sensorPath, id); 1204 1205 if (occ->isMasterOcc()) 1206 { 1207 // Read power sensors 1208 readPowerSensors(sensorPath, id); 1209 } 1210 tracedError[id] = false; 1211 } 1212 else 1213 { 1214 if (!tracedError[id]) 1215 { 1216 log<level::ERR>( 1217 std::format( 1218 "Manager::getSensorValues: OCC{} sensor path missing: {}", 1219 id, sensorPath.c_str()) 1220 .c_str()); 1221 tracedError[id] = true; 1222 } 1223 } 1224 1225 return; 1226 } 1227 #endif 1228 1229 // Read the altitude from DBus 1230 void Manager::readAltitude() 1231 { 1232 static bool traceAltitudeErr = true; 1233 1234 utils::PropertyValue altitudeProperty{}; 1235 try 1236 { 1237 altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE, 1238 ALTITUDE_PROP); 1239 auto sensorVal = std::get<double>(altitudeProperty); 1240 if (sensorVal < 0xFFFF) 1241 { 1242 if (sensorVal < 0) 1243 { 1244 altitude = 0; 1245 } 1246 else 1247 { 1248 // Round to nearest meter 1249 altitude = uint16_t(sensorVal + 0.5); 1250 } 1251 log<level::DEBUG>(std::format("readAltitude: sensor={} ({}m)", 1252 sensorVal, altitude) 1253 .c_str()); 1254 traceAltitudeErr = true; 1255 } 1256 else 1257 { 1258 if (traceAltitudeErr) 1259 { 1260 traceAltitudeErr = false; 1261 log<level::DEBUG>( 1262 std::format("Invalid altitude value: {}", sensorVal) 1263 .c_str()); 1264 } 1265 } 1266 } 1267 catch (const sdbusplus::exception_t& e) 1268 { 1269 if (traceAltitudeErr) 1270 { 1271 traceAltitudeErr = false; 1272 log<level::INFO>( 1273 std::format("Unable to read Altitude: {}", e.what()).c_str()); 1274 } 1275 altitude = 0xFFFF; // not available 1276 } 1277 } 1278 1279 // Callback function when ambient temperature changes 1280 void Manager::ambientCallback(sdbusplus::message_t& msg) 1281 { 1282 double currentTemp = 0; 1283 uint8_t truncatedTemp = 0xFF; 1284 std::string msgSensor; 1285 std::map<std::string, std::variant<double>> msgData; 1286 msg.read(msgSensor, msgData); 1287 1288 auto valPropMap = msgData.find(AMBIENT_PROP); 1289 if (valPropMap == msgData.end()) 1290 { 1291 log<level::DEBUG>("ambientCallback: Unknown ambient property changed"); 1292 return; 1293 } 1294 currentTemp = std::get<double>(valPropMap->second); 1295 if (std::isnan(currentTemp)) 1296 { 1297 truncatedTemp = 0xFF; 1298 } 1299 else 1300 { 1301 if (currentTemp < 0) 1302 { 1303 truncatedTemp = 0; 1304 } 1305 else 1306 { 1307 // Round to nearest degree C 1308 truncatedTemp = uint8_t(currentTemp + 0.5); 1309 } 1310 } 1311 1312 // If ambient changes, notify OCCs 1313 if (truncatedTemp != ambient) 1314 { 1315 log<level::DEBUG>( 1316 std::format("ambientCallback: Ambient change from {} to {}C", 1317 ambient, currentTemp) 1318 .c_str()); 1319 1320 ambient = truncatedTemp; 1321 if (altitude == 0xFFFF) 1322 { 1323 // No altitude yet, try reading again 1324 readAltitude(); 1325 } 1326 1327 log<level::DEBUG>( 1328 std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient, 1329 altitude) 1330 .c_str()); 1331 #ifdef POWER10 1332 // Send ambient and altitude to all OCCs 1333 for (auto& obj : statusObjects) 1334 { 1335 if (obj->occActive()) 1336 { 1337 obj->sendAmbient(ambient, altitude); 1338 } 1339 } 1340 #endif // POWER10 1341 } 1342 } 1343 1344 // return the current ambient and altitude readings 1345 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp, 1346 uint16_t& altitudeValue) const 1347 { 1348 ambientValid = true; 1349 ambientTemp = ambient; 1350 altitudeValue = altitude; 1351 1352 if (ambient == 0xFF) 1353 { 1354 ambientValid = false; 1355 } 1356 } 1357 1358 #ifdef POWER10 1359 // Called when waitForAllOccsTimer expires 1360 // After the first OCC goes active, this timer will be started (60 seconds) 1361 void Manager::occsNotAllRunning() 1362 { 1363 if (activeCount != statusObjects.size()) 1364 { 1365 // Not all OCCs went active 1366 log<level::WARNING>( 1367 std::format( 1368 "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})", 1369 activeCount, statusObjects.size()) 1370 .c_str()); 1371 // Procs may be garded, so may be expected 1372 } 1373 1374 validateOccMaster(); 1375 } 1376 1377 #ifdef PLDM 1378 // Called when throttlePldmTraceTimer expires. 1379 // If this timer expires, that indicates there are no OCC active sensor PDRs 1380 // found which will trigger pldm traces to be throttled. 1381 // The second time this timer expires, a PEL will get created. 1382 void Manager::throttlePldmTraceExpired() 1383 { 1384 if (utils::isHostRunning()) 1385 { 1386 if (!onPldmTimeoutCreatePel) 1387 { 1388 // Throttle traces 1389 pldmHandle->setTraceThrottle(true); 1390 // Restart timer to log a PEL when timer expires 1391 onPldmTimeoutCreatePel = true; 1392 throttlePldmTraceTimer->restartOnce(40min); 1393 } 1394 else 1395 { 1396 log<level::ERR>( 1397 "throttlePldmTraceExpired(): OCC active sensors still not available!"); 1398 // Create PEL 1399 createPldmSensorPEL(); 1400 } 1401 } 1402 else 1403 { 1404 // Make sure traces are not throttled 1405 pldmHandle->setTraceThrottle(false); 1406 log<level::INFO>( 1407 "throttlePldmTraceExpired(): host it not running ignoring sensor timer"); 1408 } 1409 } 1410 1411 void Manager::createPldmSensorPEL() 1412 { 1413 Error::Descriptor d = Error::Descriptor(MISSING_OCC_SENSORS_PATH); 1414 std::map<std::string, std::string> additionalData; 1415 1416 additionalData.emplace("_PID", std::to_string(getpid())); 1417 1418 log<level::INFO>( 1419 std::format( 1420 "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs") 1421 .c_str()); 1422 1423 auto& bus = utils::getBus(); 1424 1425 try 1426 { 1427 FFDCFiles ffdc; 1428 // Add occ-control journal traces to PEL FFDC 1429 auto occJournalFile = 1430 FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40); 1431 1432 static constexpr auto loggingObjectPath = 1433 "/xyz/openbmc_project/logging"; 1434 static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL"; 1435 std::string service = utils::getService(loggingObjectPath, 1436 opLoggingInterface); 1437 auto method = bus.new_method_call(service.c_str(), loggingObjectPath, 1438 opLoggingInterface, 1439 "CreatePELWithFFDCFiles"); 1440 1441 // Set level to Warning (Predictive). 1442 auto level = 1443 sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage( 1444 sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level:: 1445 Warning); 1446 1447 method.append(d.path, level, additionalData, ffdc); 1448 bus.call(method); 1449 } 1450 catch (const sdbusplus::exception_t& e) 1451 { 1452 log<level::ERR>( 1453 std::format("Failed to create MISSING_OCC_SENSORS PEL: {}", 1454 e.what()) 1455 .c_str()); 1456 } 1457 } 1458 #endif // PLDM 1459 #endif // POWER10 1460 1461 // Verify single master OCC and start presence monitor 1462 void Manager::validateOccMaster() 1463 { 1464 int masterInstance = -1; 1465 for (auto& obj : statusObjects) 1466 { 1467 auto instance = obj->getOccInstanceID(); 1468 #ifdef POWER10 1469 if (!obj->occActive()) 1470 { 1471 if (utils::isHostRunning()) 1472 { 1473 // Check if sensor was queued while waiting for discovery 1474 auto match = queuedActiveState.find(instance); 1475 if (match != queuedActiveState.end()) 1476 { 1477 queuedActiveState.erase(match); 1478 log<level::INFO>( 1479 std::format( 1480 "validateOccMaster: OCC{} is ACTIVE (queued)", 1481 instance) 1482 .c_str()); 1483 obj->occActive(true); 1484 } 1485 else 1486 { 1487 // OCC does not appear to be active yet, check active sensor 1488 #ifdef PLDM 1489 pldmHandle->checkActiveSensor(instance); 1490 #endif 1491 if (obj->occActive()) 1492 { 1493 log<level::INFO>( 1494 std::format( 1495 "validateOccMaster: OCC{} is ACTIVE after reading sensor", 1496 instance) 1497 .c_str()); 1498 } 1499 } 1500 } 1501 else 1502 { 1503 log<level::WARNING>( 1504 std::format( 1505 "validateOccMaster: HOST is not running (OCC{})", 1506 instance) 1507 .c_str()); 1508 return; 1509 } 1510 } 1511 #endif // POWER10 1512 1513 if (obj->isMasterOcc()) 1514 { 1515 obj->addPresenceWatchMaster(); 1516 1517 if (masterInstance == -1) 1518 { 1519 masterInstance = instance; 1520 } 1521 else 1522 { 1523 log<level::ERR>( 1524 std::format( 1525 "validateOccMaster: Multiple OCC masters! ({} and {})", 1526 masterInstance, instance) 1527 .c_str()); 1528 // request reset 1529 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH)); 1530 } 1531 } 1532 } 1533 1534 if (masterInstance < 0) 1535 { 1536 log<level::ERR>( 1537 std::format("validateOccMaster: Master OCC not found! (of {} OCCs)", 1538 statusObjects.size()) 1539 .c_str()); 1540 // request reset 1541 statusObjects.front()->deviceError( 1542 Error::Descriptor(PRESENCE_ERROR_PATH)); 1543 } 1544 else 1545 { 1546 log<level::INFO>( 1547 std::format("validateOccMaster: OCC{} is master of {} OCCs", 1548 masterInstance, activeCount) 1549 .c_str()); 1550 #ifdef POWER10 1551 pmode->updateDbusSafeMode(false); 1552 #endif 1553 } 1554 } 1555 1556 void Manager::updatePcapBounds() const 1557 { 1558 if (pcap) 1559 { 1560 pcap->updatePcapBounds(); 1561 } 1562 } 1563 1564 } // namespace occ 1565 } // namespace open_power 1566