1 #include "config.h" 2 3 #include "occ_manager.hpp" 4 5 #include "i2c_occ.hpp" 6 #include "occ_dbus.hpp" 7 #include "occ_errors.hpp" 8 #include "utils.hpp" 9 10 #include <phosphor-logging/elog-errors.hpp> 11 #include <phosphor-logging/log.hpp> 12 #include <xyz/openbmc_project/Common/error.hpp> 13 14 #include <chrono> 15 #include <cmath> 16 #include <filesystem> 17 #include <fstream> 18 #include <regex> 19 20 namespace open_power 21 { 22 namespace occ 23 { 24 25 constexpr uint32_t fruTypeNotAvailable = 0xFF; 26 constexpr auto fruTypeSuffix = "fru_type"; 27 constexpr auto faultSuffix = "fault"; 28 constexpr auto inputSuffix = "input"; 29 constexpr auto maxSuffix = "max"; 30 31 const auto HOST_ON_FILE = "/run/openbmc/host@0-on"; 32 33 using namespace phosphor::logging; 34 using namespace std::literals::chrono_literals; 35 36 template <typename T> 37 T readFile(const std::string& path) 38 { 39 std::ifstream ifs; 40 ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit | 41 std::ifstream::eofbit); 42 T data; 43 44 try 45 { 46 ifs.open(path); 47 ifs >> data; 48 ifs.close(); 49 } 50 catch (const std::exception& e) 51 { 52 auto err = errno; 53 throw std::system_error(err, std::generic_category()); 54 } 55 56 return data; 57 } 58 59 // findAndCreateObjects(): 60 // Takes care of getting the required objects created and 61 // finds the available devices/processors. 62 // (function is called everytime the discoverTimer expires) 63 // - create the PowerMode object to control OCC modes 64 // - create statusObjects for each OCC device found 65 // - waits for OCC Active sensors PDRs to become available 66 // - restart discoverTimer if all data is not available yet 67 void Manager::findAndCreateObjects() 68 { 69 #ifndef POWER10 70 for (auto id = 0; id < MAX_CPUS; ++id) 71 { 72 // Create one occ per cpu 73 auto occ = std::string(OCC_NAME) + std::to_string(id); 74 createObjects(occ); 75 } 76 #else 77 if (!pmode) 78 { 79 // Create the power mode object 80 pmode = std::make_unique<powermode::PowerMode>( 81 *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event); 82 } 83 84 if (!fs::exists(HOST_ON_FILE)) 85 { 86 static bool statusObjCreated = false; 87 if (!statusObjCreated) 88 { 89 // Create the OCCs based on on the /dev/occX devices 90 auto occs = findOCCsInDev(); 91 92 if (occs.empty() || (prevOCCSearch.size() != occs.size())) 93 { 94 // Something changed or no OCCs yet, try again in 10s. 95 // Note on the first pass prevOCCSearch will be empty, 96 // so there will be at least one delay to give things 97 // a chance to settle. 98 prevOCCSearch = occs; 99 100 log<level::INFO>( 101 std::format( 102 "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})", 103 occs.size()) 104 .c_str()); 105 106 discoverTimer->restartOnce(10s); 107 } 108 else 109 { 110 // All OCCs appear to be available, create status objects 111 112 // createObjects requires OCC0 first. 113 std::sort(occs.begin(), occs.end()); 114 115 log<level::INFO>( 116 std::format( 117 "Manager::findAndCreateObjects(): Creating {} OCC Status Objects", 118 occs.size()) 119 .c_str()); 120 for (auto id : occs) 121 { 122 createObjects(std::string(OCC_NAME) + std::to_string(id)); 123 } 124 statusObjCreated = true; 125 waitingForAllOccActiveSensors = true; 126 127 // Find/update the processor path associated with each OCC 128 for (auto& obj : statusObjects) 129 { 130 obj->updateProcAssociation(); 131 } 132 } 133 } 134 135 if (statusObjCreated && waitingForAllOccActiveSensors) 136 { 137 static bool tracedHostWait = false; 138 if (utils::isHostRunning()) 139 { 140 if (tracedHostWait) 141 { 142 log<level::INFO>( 143 "Manager::findAndCreateObjects(): Host is running"); 144 tracedHostWait = false; 145 } 146 checkAllActiveSensors(); 147 } 148 else 149 { 150 if (!tracedHostWait) 151 { 152 log<level::INFO>( 153 "Manager::findAndCreateObjects(): Waiting for host to start"); 154 tracedHostWait = true; 155 } 156 discoverTimer->restartOnce(30s); 157 #ifdef PLDM 158 if (throttlePldmTraceTimer->isEnabled()) 159 { 160 // Host is no longer running, disable throttle timer and 161 // make sure traces are not throttled 162 log<level::INFO>( 163 "findAndCreateObjects(): disabling sensor timer"); 164 throttlePldmTraceTimer->setEnabled(false); 165 pldmHandle->setTraceThrottle(false); 166 } 167 #endif 168 } 169 } 170 } 171 else 172 { 173 log<level::INFO>( 174 std::format( 175 "Manager::findAndCreateObjects(): Waiting for {} to complete...", 176 HOST_ON_FILE) 177 .c_str()); 178 discoverTimer->restartOnce(10s); 179 } 180 #endif 181 } 182 183 #ifdef POWER10 184 // Check if all occActive sensors are available 185 void Manager::checkAllActiveSensors() 186 { 187 static bool allActiveSensorAvailable = false; 188 static bool tracedSensorWait = false; 189 static bool waitingForHost = false; 190 191 if (open_power::occ::utils::isHostRunning()) 192 { 193 if (waitingForHost) 194 { 195 waitingForHost = false; 196 log<level::INFO>("checkAllActiveSensors(): Host is now running"); 197 } 198 199 // Start with the assumption that all are available 200 allActiveSensorAvailable = true; 201 for (auto& obj : statusObjects) 202 { 203 if ((!obj->occActive()) && (!obj->getPldmSensorReceived())) 204 { 205 auto instance = obj->getOccInstanceID(); 206 // Check if sensor was queued while waiting for discovery 207 auto match = queuedActiveState.find(instance); 208 if (match != queuedActiveState.end()) 209 { 210 queuedActiveState.erase(match); 211 log<level::INFO>( 212 std::format( 213 "checkAllActiveSensors(): OCC{} is ACTIVE (queued)", 214 instance) 215 .c_str()); 216 obj->occActive(true); 217 } 218 else 219 { 220 allActiveSensorAvailable = false; 221 if (!tracedSensorWait) 222 { 223 log<level::INFO>( 224 std::format( 225 "checkAllActiveSensors(): Waiting on OCC{} Active sensor", 226 instance) 227 .c_str()); 228 tracedSensorWait = true; 229 #ifdef PLDM 230 // Make sure PLDM traces are not throttled 231 pldmHandle->setTraceThrottle(false); 232 // Start timer to throttle PLDM traces when timer 233 // expires 234 onPldmTimeoutCreatePel = false; 235 throttlePldmTraceTimer->restartOnce(5min); 236 #endif 237 } 238 #ifdef PLDM 239 pldmHandle->checkActiveSensor(obj->getOccInstanceID()); 240 #endif 241 break; 242 } 243 } 244 } 245 } 246 else 247 { 248 if (!waitingForHost) 249 { 250 waitingForHost = true; 251 log<level::INFO>( 252 "checkAllActiveSensors(): Waiting for host to start"); 253 #ifdef PLDM 254 if (throttlePldmTraceTimer->isEnabled()) 255 { 256 // Host is no longer running, disable throttle timer and 257 // make sure traces are not throttled 258 log<level::INFO>( 259 "checkAllActiveSensors(): disabling sensor timer"); 260 throttlePldmTraceTimer->setEnabled(false); 261 pldmHandle->setTraceThrottle(false); 262 } 263 #endif 264 } 265 } 266 267 if (allActiveSensorAvailable) 268 { 269 // All sensors were found, disable the discovery timer 270 if (discoverTimer->isEnabled()) 271 { 272 discoverTimer->setEnabled(false); 273 } 274 #ifdef PLDM 275 if (throttlePldmTraceTimer->isEnabled()) 276 { 277 // Disable throttle timer and make sure traces are not throttled 278 throttlePldmTraceTimer->setEnabled(false); 279 pldmHandle->setTraceThrottle(false); 280 } 281 #endif 282 if (waitingForAllOccActiveSensors) 283 { 284 log<level::INFO>( 285 "checkAllActiveSensors(): OCC Active sensors are available"); 286 waitingForAllOccActiveSensors = false; 287 } 288 queuedActiveState.clear(); 289 tracedSensorWait = false; 290 } 291 else 292 { 293 // Not all sensors were available, so keep waiting 294 if (!tracedSensorWait) 295 { 296 log<level::INFO>( 297 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available"); 298 tracedSensorWait = true; 299 } 300 discoverTimer->restartOnce(10s); 301 } 302 } 303 #endif 304 305 std::vector<int> Manager::findOCCsInDev() 306 { 307 std::vector<int> occs; 308 std::regex expr{R"(occ(\d+)$)"}; 309 310 for (auto& file : fs::directory_iterator("/dev")) 311 { 312 std::smatch match; 313 std::string path{file.path().string()}; 314 if (std::regex_search(path, match, expr)) 315 { 316 auto num = std::stoi(match[1].str()); 317 318 // /dev numbering starts at 1, ours starts at 0. 319 occs.push_back(num - 1); 320 } 321 } 322 323 return occs; 324 } 325 326 int Manager::cpuCreated(sdbusplus::message_t& msg) 327 { 328 namespace fs = std::filesystem; 329 330 sdbusplus::message::object_path o; 331 msg.read(o); 332 fs::path cpuPath(std::string(std::move(o))); 333 334 auto name = cpuPath.filename().string(); 335 auto index = name.find(CPU_NAME); 336 name.replace(index, std::strlen(CPU_NAME), OCC_NAME); 337 338 createObjects(name); 339 340 return 0; 341 } 342 343 void Manager::createObjects(const std::string& occ) 344 { 345 auto path = fs::path(OCC_CONTROL_ROOT) / occ; 346 347 statusObjects.emplace_back(std::make_unique<Status>( 348 event, path.c_str(), *this, 349 #ifdef POWER10 350 pmode, 351 #endif 352 std::bind(std::mem_fn(&Manager::statusCallBack), this, 353 std::placeholders::_1, std::placeholders::_2) 354 #ifdef PLDM 355 , 356 std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(), 357 std::placeholders::_1) 358 #endif 359 )); 360 361 // Create the power cap monitor object 362 if (!pcap) 363 { 364 pcap = std::make_unique<open_power::occ::powercap::PowerCap>( 365 *statusObjects.back()); 366 } 367 368 if (statusObjects.back()->isMasterOcc()) 369 { 370 log<level::INFO>( 371 std::format("Manager::createObjects(): OCC{} is the master", 372 statusObjects.back()->getOccInstanceID()) 373 .c_str()); 374 _pollTimer->setEnabled(false); 375 376 #ifdef POWER10 377 // Set the master OCC on the PowerMode object 378 pmode->setMasterOcc(path); 379 #endif 380 } 381 382 passThroughObjects.emplace_back(std::make_unique<PassThrough>( 383 path.c_str() 384 #ifdef POWER10 385 , 386 pmode 387 #endif 388 )); 389 } 390 391 void Manager::statusCallBack(instanceID instance, bool status) 392 { 393 if (status == true) 394 { 395 // OCC went active 396 ++activeCount; 397 398 #ifdef POWER10 399 if (activeCount == 1) 400 { 401 // First OCC went active (allow some time for all OCCs to go active) 402 waitForAllOccsTimer->restartOnce(60s); 403 } 404 #endif 405 406 if (activeCount == statusObjects.size()) 407 { 408 #ifdef POWER10 409 // All OCCs are now running 410 if (waitForAllOccsTimer->isEnabled()) 411 { 412 // stop occ wait timer 413 waitForAllOccsTimer->setEnabled(false); 414 } 415 #endif 416 417 // Verify master OCC and start presence monitor 418 validateOccMaster(); 419 } 420 421 // Start poll timer if not already started 422 if (!_pollTimer->isEnabled()) 423 { 424 log<level::INFO>( 425 std::format("Manager: OCCs will be polled every {} seconds", 426 pollInterval) 427 .c_str()); 428 429 // Send poll and start OCC poll timer 430 pollerTimerExpired(); 431 } 432 } 433 else 434 { 435 // OCC went away 436 if (activeCount > 0) 437 { 438 --activeCount; 439 } 440 else 441 { 442 log<level::ERR>( 443 std::format("OCC{} disabled, but currently no active OCCs", 444 instance) 445 .c_str()); 446 } 447 448 if (activeCount == 0) 449 { 450 // No OCCs are running 451 452 // Stop OCC poll timer 453 if (_pollTimer->isEnabled()) 454 { 455 log<level::INFO>( 456 "Manager::statusCallBack(): OCCs are not running, stopping poll timer"); 457 _pollTimer->setEnabled(false); 458 } 459 460 #ifdef POWER10 461 // stop wait timer 462 if (waitForAllOccsTimer->isEnabled()) 463 { 464 waitForAllOccsTimer->setEnabled(false); 465 } 466 #endif 467 } 468 #ifdef READ_OCC_SENSORS 469 // Clear OCC sensors 470 setSensorValueToNaN(instance); 471 #endif 472 } 473 474 #ifdef POWER10 475 if (waitingForAllOccActiveSensors) 476 { 477 if (utils::isHostRunning()) 478 { 479 checkAllActiveSensors(); 480 } 481 } 482 #endif 483 } 484 485 #ifdef I2C_OCC 486 void Manager::initStatusObjects() 487 { 488 // Make sure we have a valid path string 489 static_assert(sizeof(DEV_PATH) != 0); 490 491 auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH); 492 for (auto& name : deviceNames) 493 { 494 i2c_occ::i2cToDbus(name); 495 name = std::string(OCC_NAME) + '_' + name; 496 auto path = fs::path(OCC_CONTROL_ROOT) / name; 497 statusObjects.emplace_back( 498 std::make_unique<Status>(event, path.c_str(), *this)); 499 } 500 // The first device is master occ 501 pcap = std::make_unique<open_power::occ::powercap::PowerCap>( 502 *statusObjects.front()); 503 #ifdef POWER10 504 pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH, 505 powermode::PIPS_PATH); 506 // Set the master OCC on the PowerMode object 507 pmode->setMasterOcc(path); 508 #endif 509 } 510 #endif 511 512 #ifdef PLDM 513 void Manager::sbeTimeout(unsigned int instance) 514 { 515 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 516 [instance](const auto& obj) { 517 return instance == obj->getOccInstanceID(); 518 }); 519 520 if (obj != statusObjects.end() && (*obj)->occActive()) 521 { 522 log<level::INFO>( 523 std::format("SBE timeout, requesting HRESET (OCC{})", instance) 524 .c_str()); 525 526 setSBEState(instance, SBE_STATE_NOT_USABLE); 527 528 pldmHandle->sendHRESET(instance); 529 } 530 } 531 532 bool Manager::updateOCCActive(instanceID instance, bool status) 533 { 534 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(), 535 [instance](const auto& obj) { 536 return instance == obj->getOccInstanceID(); 537 }); 538 539 const bool hostRunning = open_power::occ::utils::isHostRunning(); 540 if (obj != statusObjects.end()) 541 { 542 if (!hostRunning && (status == true)) 543 { 544 log<level::WARNING>( 545 std::format( 546 "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received", 547 instance, status) 548 .c_str()); 549 (*obj)->setPldmSensorReceived(false); 550 if (!waitingForAllOccActiveSensors) 551 { 552 log<level::INFO>( 553 "updateOCCActive: Waiting for Host and all OCC Active Sensors"); 554 waitingForAllOccActiveSensors = true; 555 } 556 #ifdef POWER10 557 discoverTimer->restartOnce(30s); 558 #endif 559 return false; 560 } 561 else 562 { 563 log<level::INFO>(std::format("updateOCCActive: OCC{} active={}", 564 instance, status) 565 .c_str()); 566 (*obj)->setPldmSensorReceived(true); 567 return (*obj)->occActive(status); 568 } 569 } 570 else 571 { 572 if (hostRunning) 573 { 574 log<level::WARNING>( 575 std::format( 576 "updateOCCActive: No status object to update for OCC{} (active={})", 577 instance, status) 578 .c_str()); 579 } 580 else 581 { 582 if (status == true) 583 { 584 log<level::WARNING>( 585 std::format( 586 "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})", 587 instance, status) 588 .c_str()); 589 } 590 } 591 if (status == true) 592 { 593 // OCC went active 594 queuedActiveState.insert(instance); 595 } 596 else 597 { 598 auto match = queuedActiveState.find(instance); 599 if (match != queuedActiveState.end()) 600 { 601 // OCC was disabled 602 queuedActiveState.erase(match); 603 } 604 } 605 return false; 606 } 607 } 608 609 // Called upon pldm event To set powermode Safe Mode State for system. 610 void Manager::updateOccSafeMode(bool safeMode) 611 { 612 #ifdef POWER10 613 pmode->updateDbusSafeMode(safeMode); 614 #endif 615 // Update the processor throttle status on dbus 616 for (auto& obj : statusObjects) 617 { 618 obj->updateThrottle(safeMode, THROTTLED_SAFE); 619 } 620 } 621 622 void Manager::sbeHRESETResult(instanceID instance, bool success) 623 { 624 if (success) 625 { 626 log<level::INFO>( 627 std::format("HRESET succeeded (OCC{})", instance).c_str()); 628 629 setSBEState(instance, SBE_STATE_BOOTED); 630 631 return; 632 } 633 634 setSBEState(instance, SBE_STATE_FAILED); 635 636 if (sbeCanDump(instance)) 637 { 638 log<level::INFO>( 639 std::format("HRESET failed (OCC{}), triggering SBE dump", instance) 640 .c_str()); 641 642 auto& bus = utils::getBus(); 643 uint32_t src6 = instance << 16; 644 uint32_t logId = 645 FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout", 646 src6, "SBE command timeout"); 647 648 try 649 { 650 constexpr auto interface = "xyz.openbmc_project.Dump.Create"; 651 constexpr auto function = "CreateDump"; 652 653 std::string service = 654 utils::getService(OP_DUMP_OBJ_PATH, interface); 655 auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH, 656 interface, function); 657 658 std::map<std::string, std::variant<std::string, uint64_t>> 659 createParams{ 660 {"com.ibm.Dump.Create.CreateParameters.ErrorLogId", 661 uint64_t(logId)}, 662 {"com.ibm.Dump.Create.CreateParameters.DumpType", 663 "com.ibm.Dump.Create.DumpType.SBE"}, 664 {"com.ibm.Dump.Create.CreateParameters.FailingUnitId", 665 uint64_t(instance)}, 666 }; 667 668 method.append(createParams); 669 670 auto response = bus.call(method); 671 } 672 catch (const sdbusplus::exception_t& e) 673 { 674 constexpr auto ERROR_DUMP_DISABLED = 675 "xyz.openbmc_project.Dump.Create.Error.Disabled"; 676 if (e.name() == ERROR_DUMP_DISABLED) 677 { 678 log<level::INFO>("Dump is disabled, skipping"); 679 } 680 else 681 { 682 log<level::ERR>("Dump failed"); 683 } 684 } 685 } 686 } 687 688 bool Manager::sbeCanDump(unsigned int instance) 689 { 690 struct pdbg_target* proc = getPdbgTarget(instance); 691 692 if (!proc) 693 { 694 // allow the dump in the error case 695 return true; 696 } 697 698 try 699 { 700 if (!openpower::phal::sbe::isDumpAllowed(proc)) 701 { 702 return false; 703 } 704 705 if (openpower::phal::pdbg::isSbeVitalAttnActive(proc)) 706 { 707 return false; 708 } 709 } 710 catch (openpower::phal::exception::SbeError& e) 711 { 712 log<level::INFO>("Failed to query SBE state"); 713 } 714 715 // allow the dump in the error case 716 return true; 717 } 718 719 void Manager::setSBEState(unsigned int instance, enum sbe_state state) 720 { 721 struct pdbg_target* proc = getPdbgTarget(instance); 722 723 if (!proc) 724 { 725 return; 726 } 727 728 try 729 { 730 openpower::phal::sbe::setState(proc, state); 731 } 732 catch (const openpower::phal::exception::SbeError& e) 733 { 734 log<level::ERR>("Failed to set SBE state"); 735 } 736 } 737 738 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance) 739 { 740 if (!pdbgInitialized) 741 { 742 try 743 { 744 openpower::phal::pdbg::init(); 745 pdbgInitialized = true; 746 } 747 catch (const openpower::phal::exception::PdbgError& e) 748 { 749 log<level::ERR>("pdbg initialization failed"); 750 return nullptr; 751 } 752 } 753 754 struct pdbg_target* proc = nullptr; 755 pdbg_for_each_class_target("proc", proc) 756 { 757 if (pdbg_target_index(proc) == instance) 758 { 759 return proc; 760 } 761 } 762 763 log<level::ERR>("Failed to get pdbg target"); 764 return nullptr; 765 } 766 #endif 767 768 void Manager::pollerTimerExpired() 769 { 770 if (!_pollTimer) 771 { 772 log<level::ERR>( 773 "Manager::pollerTimerExpired() ERROR: Timer not defined"); 774 return; 775 } 776 777 for (auto& obj : statusObjects) 778 { 779 if (!obj->occActive()) 780 { 781 // OCC is not running yet 782 #ifdef READ_OCC_SENSORS 783 auto id = obj->getOccInstanceID(); 784 setSensorValueToNaN(id); 785 #endif 786 continue; 787 } 788 789 // Read sysfs to force kernel to poll OCC 790 obj->readOccState(); 791 792 #ifdef READ_OCC_SENSORS 793 // Read occ sensor values 794 getSensorValues(obj); 795 #endif 796 } 797 798 if (activeCount > 0) 799 { 800 // Restart OCC poll timer 801 _pollTimer->restartOnce(std::chrono::seconds(pollInterval)); 802 } 803 else 804 { 805 // No OCCs running, so poll timer will not be restarted 806 log<level::INFO>( 807 std::format( 808 "Manager::pollerTimerExpired: poll timer will not be restarted") 809 .c_str()); 810 } 811 } 812 813 #ifdef READ_OCC_SENSORS 814 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance) 815 { 816 // There may be more than one sensor with the same FRU type 817 // and label so make two passes: the first to read the temps 818 // from sysfs, and the second to put them on D-Bus after 819 // resolving any conflicts. 820 std::map<std::string, double> sensorData; 821 822 std::regex expr{"temp\\d+_label$"}; // Example: temp5_label 823 for (auto& file : fs::directory_iterator(path)) 824 { 825 if (!std::regex_search(file.path().string(), expr)) 826 { 827 continue; 828 } 829 830 uint32_t labelValue{0}; 831 832 try 833 { 834 labelValue = readFile<uint32_t>(file.path()); 835 } 836 catch (const std::system_error& e) 837 { 838 log<level::DEBUG>( 839 std::format("readTempSensors: Failed reading {}, errno = {}", 840 file.path().string(), e.code().value()) 841 .c_str()); 842 continue; 843 } 844 845 const std::string& tempLabel = "label"; 846 const std::string filePathString = file.path().string().substr( 847 0, file.path().string().length() - tempLabel.length()); 848 849 uint32_t fruTypeValue{0}; 850 try 851 { 852 fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix); 853 } 854 catch (const std::system_error& e) 855 { 856 log<level::DEBUG>( 857 std::format("readTempSensors: Failed reading {}, errno = {}", 858 filePathString + fruTypeSuffix, e.code().value()) 859 .c_str()); 860 continue; 861 } 862 863 std::string sensorPath = 864 OCC_SENSORS_ROOT + std::string("/temperature/"); 865 866 std::string dvfsTempPath; 867 868 if (fruTypeValue == VRMVdd) 869 { 870 sensorPath.append( 871 "vrm_vdd" + std::to_string(occInstance) + "_temp"); 872 } 873 else if (fruTypeValue == processorIoRing) 874 { 875 sensorPath.append( 876 "proc" + std::to_string(occInstance) + "_ioring_temp"); 877 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" + 878 std::to_string(occInstance) + "_ioring_dvfs_temp"; 879 } 880 else 881 { 882 uint16_t type = (labelValue & 0xFF000000) >> 24; 883 uint16_t instanceID = labelValue & 0x0000FFFF; 884 885 if (type == OCC_DIMM_TEMP_SENSOR_TYPE) 886 { 887 if (fruTypeValue == fruTypeNotAvailable) 888 { 889 // Not all DIMM related temps are available to read 890 // (no _input file in this case) 891 continue; 892 } 893 auto iter = dimmTempSensorName.find(fruTypeValue); 894 if (iter == dimmTempSensorName.end()) 895 { 896 log<level::ERR>( 897 std::format( 898 "readTempSensors: Fru type error! fruTypeValue = {}) ", 899 fruTypeValue) 900 .c_str()); 901 continue; 902 } 903 904 sensorPath.append( 905 "dimm" + std::to_string(instanceID) + iter->second); 906 907 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" + 908 dimmDVFSSensorName.at(fruTypeValue); 909 } 910 else if (type == OCC_CPU_TEMP_SENSOR_TYPE) 911 { 912 if (fruTypeValue == processorCore) 913 { 914 // The OCC reports small core temps, of which there are 915 // two per big core. All current P10 systems are in big 916 // core mode, so use a big core name. 917 uint16_t coreNum = instanceID / 2; 918 uint16_t tempNum = instanceID % 2; 919 sensorPath.append("proc" + std::to_string(occInstance) + 920 "_core" + std::to_string(coreNum) + "_" + 921 std::to_string(tempNum) + "_temp"); 922 923 dvfsTempPath = 924 std::string{OCC_SENSORS_ROOT} + "/temperature/proc" + 925 std::to_string(occInstance) + "_core_dvfs_temp"; 926 } 927 else 928 { 929 continue; 930 } 931 } 932 else 933 { 934 continue; 935 } 936 } 937 938 // The dvfs temp file only needs to be read once per chip per type. 939 if (!dvfsTempPath.empty() && 940 !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath)) 941 { 942 try 943 { 944 auto dvfsValue = readFile<double>(filePathString + maxSuffix); 945 946 dbus::OccDBusSensors::getOccDBus().setDvfsTemp( 947 dvfsTempPath, dvfsValue * std::pow(10, -3)); 948 } 949 catch (const std::system_error& e) 950 { 951 log<level::DEBUG>( 952 std::format( 953 "readTempSensors: Failed reading {}, errno = {}", 954 filePathString + maxSuffix, e.code().value()) 955 .c_str()); 956 } 957 } 958 959 uint32_t faultValue{0}; 960 try 961 { 962 faultValue = readFile<uint32_t>(filePathString + faultSuffix); 963 } 964 catch (const std::system_error& e) 965 { 966 log<level::DEBUG>( 967 std::format("readTempSensors: Failed reading {}, errno = {}", 968 filePathString + faultSuffix, e.code().value()) 969 .c_str()); 970 continue; 971 } 972 973 double tempValue{0}; 974 // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1. 975 if (faultValue != 0) 976 { 977 tempValue = std::numeric_limits<double>::quiet_NaN(); 978 } 979 else 980 { 981 // Read the temperature 982 try 983 { 984 tempValue = readFile<double>(filePathString + inputSuffix); 985 } 986 catch (const std::system_error& e) 987 { 988 log<level::DEBUG>( 989 std::format( 990 "readTempSensors: Failed reading {}, errno = {}", 991 filePathString + inputSuffix, e.code().value()) 992 .c_str()); 993 994 // if errno == EAGAIN(Resource temporarily unavailable) then set 995 // temp to 0, to avoid using old temp, and affecting FAN 996 // Control. 997 if (e.code().value() == EAGAIN) 998 { 999 tempValue = 0; 1000 } 1001 // else the errno would be something like 1002 // EBADF(Bad file descriptor) 1003 // or ENOENT(No such file or directory) 1004 else 1005 { 1006 continue; 1007 } 1008 } 1009 } 1010 1011 // If this object path already has a value, only overwite 1012 // it if the previous one was an NaN or a smaller value. 1013 auto existing = sensorData.find(sensorPath); 1014 if (existing != sensorData.end()) 1015 { 1016 // Multiple sensors found for this FRU type 1017 if ((std::isnan(existing->second) && (tempValue == 0)) || 1018 ((existing->second == 0) && std::isnan(tempValue))) 1019 { 1020 // One of the redundant sensors has failed (0xFF/nan), and the 1021 // other sensor has no reading (0), so set the FRU to NaN to 1022 // force fan increase 1023 tempValue = std::numeric_limits<double>::quiet_NaN(); 1024 existing->second = tempValue; 1025 } 1026 if (std::isnan(existing->second) || (tempValue > existing->second)) 1027 { 1028 existing->second = tempValue; 1029 } 1030 } 1031 else 1032 { 1033 // First sensor for this FRU type 1034 sensorData[sensorPath] = tempValue; 1035 } 1036 } 1037 1038 // Now publish the values on D-Bus. 1039 for (const auto& [objectPath, value] : sensorData) 1040 { 1041 dbus::OccDBusSensors::getOccDBus().setValue(objectPath, 1042 value * std::pow(10, -3)); 1043 1044 dbus::OccDBusSensors::getOccDBus().setOperationalStatus( 1045 objectPath, !std::isnan(value)); 1046 1047 if (existingSensors.find(objectPath) == existingSensors.end()) 1048 { 1049 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 1050 objectPath); 1051 } 1052 1053 existingSensors[objectPath] = occInstance; 1054 } 1055 } 1056 1057 std::optional<std::string> 1058 Manager::getPowerLabelFunctionID(const std::string& value) 1059 { 1060 // If the value is "system", then the FunctionID is "system". 1061 if (value == "system") 1062 { 1063 return value; 1064 } 1065 1066 // If the value is not "system", then the label value have 3 numbers, of 1067 // which we only care about the middle one: 1068 // <sensor id>_<function id>_<apss channel> 1069 // eg: The value is "0_10_5" , then the FunctionID is "10". 1070 if (value.find("_") == std::string::npos) 1071 { 1072 return std::nullopt; 1073 } 1074 1075 auto powerLabelValue = value.substr((value.find("_") + 1)); 1076 1077 if (powerLabelValue.find("_") == std::string::npos) 1078 { 1079 return std::nullopt; 1080 } 1081 1082 return powerLabelValue.substr(0, powerLabelValue.find("_")); 1083 } 1084 1085 void Manager::readPowerSensors(const fs::path& path, uint32_t id) 1086 { 1087 std::regex expr{"power\\d+_label$"}; // Example: power5_label 1088 for (auto& file : fs::directory_iterator(path)) 1089 { 1090 if (!std::regex_search(file.path().string(), expr)) 1091 { 1092 continue; 1093 } 1094 1095 std::string labelValue; 1096 try 1097 { 1098 labelValue = readFile<std::string>(file.path()); 1099 } 1100 catch (const std::system_error& e) 1101 { 1102 log<level::DEBUG>( 1103 std::format("readPowerSensors: Failed reading {}, errno = {}", 1104 file.path().string(), e.code().value()) 1105 .c_str()); 1106 continue; 1107 } 1108 1109 auto functionID = getPowerLabelFunctionID(labelValue); 1110 if (functionID == std::nullopt) 1111 { 1112 continue; 1113 } 1114 1115 const std::string& tempLabel = "label"; 1116 const std::string filePathString = file.path().string().substr( 1117 0, file.path().string().length() - tempLabel.length()); 1118 1119 std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/"); 1120 1121 auto iter = powerSensorName.find(*functionID); 1122 if (iter == powerSensorName.end()) 1123 { 1124 continue; 1125 } 1126 sensorPath.append(iter->second); 1127 1128 double tempValue{0}; 1129 1130 try 1131 { 1132 tempValue = readFile<double>(filePathString + inputSuffix); 1133 } 1134 catch (const std::system_error& e) 1135 { 1136 log<level::DEBUG>( 1137 std::format("readPowerSensors: Failed reading {}, errno = {}", 1138 filePathString + inputSuffix, e.code().value()) 1139 .c_str()); 1140 continue; 1141 } 1142 1143 dbus::OccDBusSensors::getOccDBus().setUnit( 1144 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts"); 1145 1146 dbus::OccDBusSensors::getOccDBus().setValue( 1147 sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3)); 1148 1149 dbus::OccDBusSensors::getOccDBus().setOperationalStatus( 1150 sensorPath, true); 1151 1152 if (existingSensors.find(sensorPath) == existingSensors.end()) 1153 { 1154 dbus::OccDBusSensors::getOccDBus().setChassisAssociation( 1155 sensorPath); 1156 } 1157 1158 existingSensors[sensorPath] = id; 1159 } 1160 return; 1161 } 1162 1163 void Manager::setSensorValueToNaN(uint32_t id) const 1164 { 1165 for (const auto& [sensorPath, occId] : existingSensors) 1166 { 1167 if (occId == id) 1168 { 1169 dbus::OccDBusSensors::getOccDBus().setValue( 1170 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1171 1172 dbus::OccDBusSensors::getOccDBus().setOperationalStatus( 1173 sensorPath, true); 1174 } 1175 } 1176 return; 1177 } 1178 1179 void Manager::setSensorValueToNonFunctional(uint32_t id) const 1180 { 1181 for (const auto& [sensorPath, occId] : existingSensors) 1182 { 1183 if (occId == id) 1184 { 1185 dbus::OccDBusSensors::getOccDBus().setValue( 1186 sensorPath, std::numeric_limits<double>::quiet_NaN()); 1187 1188 dbus::OccDBusSensors::getOccDBus().setOperationalStatus( 1189 sensorPath, false); 1190 } 1191 } 1192 return; 1193 } 1194 1195 void Manager::getSensorValues(std::unique_ptr<Status>& occ) 1196 { 1197 static bool tracedError[8] = {0}; 1198 const fs::path sensorPath = occ->getHwmonPath(); 1199 const uint32_t id = occ->getOccInstanceID(); 1200 1201 if (fs::exists(sensorPath)) 1202 { 1203 // Read temperature sensors 1204 readTempSensors(sensorPath, id); 1205 1206 if (occ->isMasterOcc()) 1207 { 1208 // Read power sensors 1209 readPowerSensors(sensorPath, id); 1210 } 1211 tracedError[id] = false; 1212 } 1213 else 1214 { 1215 if (!tracedError[id]) 1216 { 1217 log<level::ERR>( 1218 std::format( 1219 "Manager::getSensorValues: OCC{} sensor path missing: {}", 1220 id, sensorPath.c_str()) 1221 .c_str()); 1222 tracedError[id] = true; 1223 } 1224 } 1225 1226 return; 1227 } 1228 #endif 1229 1230 // Read the altitude from DBus 1231 void Manager::readAltitude() 1232 { 1233 static bool traceAltitudeErr = true; 1234 1235 utils::PropertyValue altitudeProperty{}; 1236 try 1237 { 1238 altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE, 1239 ALTITUDE_PROP); 1240 auto sensorVal = std::get<double>(altitudeProperty); 1241 if (sensorVal < 0xFFFF) 1242 { 1243 if (sensorVal < 0) 1244 { 1245 altitude = 0; 1246 } 1247 else 1248 { 1249 // Round to nearest meter 1250 altitude = uint16_t(sensorVal + 0.5); 1251 } 1252 log<level::DEBUG>(std::format("readAltitude: sensor={} ({}m)", 1253 sensorVal, altitude) 1254 .c_str()); 1255 traceAltitudeErr = true; 1256 } 1257 else 1258 { 1259 if (traceAltitudeErr) 1260 { 1261 traceAltitudeErr = false; 1262 log<level::DEBUG>( 1263 std::format("Invalid altitude value: {}", sensorVal) 1264 .c_str()); 1265 } 1266 } 1267 } 1268 catch (const sdbusplus::exception_t& e) 1269 { 1270 if (traceAltitudeErr) 1271 { 1272 traceAltitudeErr = false; 1273 log<level::INFO>( 1274 std::format("Unable to read Altitude: {}", e.what()).c_str()); 1275 } 1276 altitude = 0xFFFF; // not available 1277 } 1278 } 1279 1280 // Callback function when ambient temperature changes 1281 void Manager::ambientCallback(sdbusplus::message_t& msg) 1282 { 1283 double currentTemp = 0; 1284 uint8_t truncatedTemp = 0xFF; 1285 std::string msgSensor; 1286 std::map<std::string, std::variant<double>> msgData; 1287 msg.read(msgSensor, msgData); 1288 1289 auto valPropMap = msgData.find(AMBIENT_PROP); 1290 if (valPropMap == msgData.end()) 1291 { 1292 log<level::DEBUG>("ambientCallback: Unknown ambient property changed"); 1293 return; 1294 } 1295 currentTemp = std::get<double>(valPropMap->second); 1296 if (std::isnan(currentTemp)) 1297 { 1298 truncatedTemp = 0xFF; 1299 } 1300 else 1301 { 1302 if (currentTemp < 0) 1303 { 1304 truncatedTemp = 0; 1305 } 1306 else 1307 { 1308 // Round to nearest degree C 1309 truncatedTemp = uint8_t(currentTemp + 0.5); 1310 } 1311 } 1312 1313 // If ambient changes, notify OCCs 1314 if (truncatedTemp != ambient) 1315 { 1316 log<level::DEBUG>( 1317 std::format("ambientCallback: Ambient change from {} to {}C", 1318 ambient, currentTemp) 1319 .c_str()); 1320 1321 ambient = truncatedTemp; 1322 if (altitude == 0xFFFF) 1323 { 1324 // No altitude yet, try reading again 1325 readAltitude(); 1326 } 1327 1328 log<level::DEBUG>( 1329 std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient, 1330 altitude) 1331 .c_str()); 1332 #ifdef POWER10 1333 // Send ambient and altitude to all OCCs 1334 for (auto& obj : statusObjects) 1335 { 1336 if (obj->occActive()) 1337 { 1338 obj->sendAmbient(ambient, altitude); 1339 } 1340 } 1341 #endif // POWER10 1342 } 1343 } 1344 1345 // return the current ambient and altitude readings 1346 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp, 1347 uint16_t& altitudeValue) const 1348 { 1349 ambientValid = true; 1350 ambientTemp = ambient; 1351 altitudeValue = altitude; 1352 1353 if (ambient == 0xFF) 1354 { 1355 ambientValid = false; 1356 } 1357 } 1358 1359 #ifdef POWER10 1360 // Called when waitForAllOccsTimer expires 1361 // After the first OCC goes active, this timer will be started (60 seconds) 1362 void Manager::occsNotAllRunning() 1363 { 1364 if (activeCount != statusObjects.size()) 1365 { 1366 // Not all OCCs went active 1367 log<level::WARNING>( 1368 std::format( 1369 "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})", 1370 activeCount, statusObjects.size()) 1371 .c_str()); 1372 // Procs may be garded, so may be expected 1373 } 1374 1375 validateOccMaster(); 1376 } 1377 1378 #ifdef PLDM 1379 // Called when throttlePldmTraceTimer expires. 1380 // If this timer expires, that indicates there are no OCC active sensor PDRs 1381 // found which will trigger pldm traces to be throttled. 1382 // The second time this timer expires, a PEL will get created. 1383 void Manager::throttlePldmTraceExpired() 1384 { 1385 if (utils::isHostRunning()) 1386 { 1387 if (!onPldmTimeoutCreatePel) 1388 { 1389 // Throttle traces 1390 pldmHandle->setTraceThrottle(true); 1391 // Restart timer to log a PEL when timer expires 1392 onPldmTimeoutCreatePel = true; 1393 throttlePldmTraceTimer->restartOnce(40min); 1394 } 1395 else 1396 { 1397 log<level::ERR>( 1398 "throttlePldmTraceExpired(): OCC active sensors still not available!"); 1399 // Create PEL 1400 createPldmSensorPEL(); 1401 } 1402 } 1403 else 1404 { 1405 // Make sure traces are not throttled 1406 pldmHandle->setTraceThrottle(false); 1407 log<level::INFO>( 1408 "throttlePldmTraceExpired(): host it not running ignoring sensor timer"); 1409 } 1410 } 1411 1412 void Manager::createPldmSensorPEL() 1413 { 1414 Error::Descriptor d = Error::Descriptor(MISSING_OCC_SENSORS_PATH); 1415 std::map<std::string, std::string> additionalData; 1416 1417 additionalData.emplace("_PID", std::to_string(getpid())); 1418 1419 log<level::INFO>( 1420 std::format( 1421 "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs") 1422 .c_str()); 1423 1424 auto& bus = utils::getBus(); 1425 1426 try 1427 { 1428 FFDCFiles ffdc; 1429 // Add occ-control journal traces to PEL FFDC 1430 auto occJournalFile = 1431 FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40); 1432 1433 static constexpr auto loggingObjectPath = 1434 "/xyz/openbmc_project/logging"; 1435 static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL"; 1436 std::string service = 1437 utils::getService(loggingObjectPath, opLoggingInterface); 1438 auto method = 1439 bus.new_method_call(service.c_str(), loggingObjectPath, 1440 opLoggingInterface, "CreatePELWithFFDCFiles"); 1441 1442 // Set level to Warning (Predictive). 1443 auto level = 1444 sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage( 1445 sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level:: 1446 Warning); 1447 1448 method.append(d.path, level, additionalData, ffdc); 1449 bus.call(method); 1450 } 1451 catch (const sdbusplus::exception_t& e) 1452 { 1453 log<level::ERR>( 1454 std::format("Failed to create MISSING_OCC_SENSORS PEL: {}", 1455 e.what()) 1456 .c_str()); 1457 } 1458 } 1459 #endif // PLDM 1460 #endif // POWER10 1461 1462 // Verify single master OCC and start presence monitor 1463 void Manager::validateOccMaster() 1464 { 1465 int masterInstance = -1; 1466 for (auto& obj : statusObjects) 1467 { 1468 auto instance = obj->getOccInstanceID(); 1469 #ifdef POWER10 1470 if (!obj->occActive()) 1471 { 1472 if (utils::isHostRunning()) 1473 { 1474 // Check if sensor was queued while waiting for discovery 1475 auto match = queuedActiveState.find(instance); 1476 if (match != queuedActiveState.end()) 1477 { 1478 queuedActiveState.erase(match); 1479 log<level::INFO>( 1480 std::format( 1481 "validateOccMaster: OCC{} is ACTIVE (queued)", 1482 instance) 1483 .c_str()); 1484 obj->occActive(true); 1485 } 1486 else 1487 { 1488 // OCC does not appear to be active yet, check active sensor 1489 #ifdef PLDM 1490 pldmHandle->checkActiveSensor(instance); 1491 #endif 1492 if (obj->occActive()) 1493 { 1494 log<level::INFO>( 1495 std::format( 1496 "validateOccMaster: OCC{} is ACTIVE after reading sensor", 1497 instance) 1498 .c_str()); 1499 } 1500 } 1501 } 1502 else 1503 { 1504 log<level::WARNING>( 1505 std::format( 1506 "validateOccMaster: HOST is not running (OCC{})", 1507 instance) 1508 .c_str()); 1509 return; 1510 } 1511 } 1512 #endif // POWER10 1513 1514 if (obj->isMasterOcc()) 1515 { 1516 obj->addPresenceWatchMaster(); 1517 1518 if (masterInstance == -1) 1519 { 1520 masterInstance = instance; 1521 } 1522 else 1523 { 1524 log<level::ERR>( 1525 std::format( 1526 "validateOccMaster: Multiple OCC masters! ({} and {})", 1527 masterInstance, instance) 1528 .c_str()); 1529 // request reset 1530 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH)); 1531 } 1532 } 1533 } 1534 1535 if (masterInstance < 0) 1536 { 1537 log<level::ERR>( 1538 std::format("validateOccMaster: Master OCC not found! (of {} OCCs)", 1539 statusObjects.size()) 1540 .c_str()); 1541 // request reset 1542 statusObjects.front()->deviceError( 1543 Error::Descriptor(PRESENCE_ERROR_PATH)); 1544 } 1545 else 1546 { 1547 log<level::INFO>( 1548 std::format("validateOccMaster: OCC{} is master of {} OCCs", 1549 masterInstance, activeCount) 1550 .c_str()); 1551 #ifdef POWER10 1552 pmode->updateDbusSafeMode(false); 1553 #endif 1554 } 1555 } 1556 1557 void Manager::updatePcapBounds() const 1558 { 1559 if (pcap) 1560 { 1561 pcap->updatePcapBounds(); 1562 } 1563 } 1564 1565 } // namespace occ 1566 } // namespace open_power 1567