1 #include "config.h" 2 3 #include "occ_manager.hpp" 4 5 #include "i2c_occ.hpp" 6 #include "occ_dbus.hpp" 7 #include "occ_errors.hpp" 8 #include "utils.hpp" 9 10 #include <phosphor-logging/elog-errors.hpp> 11 #include <phosphor-logging/log.hpp> 12 #include <xyz/openbmc_project/Common/error.hpp> 13 14 #include <chrono> 15 #include <cmath> 16 #include <filesystem> 17 #include <fstream> 18 #include <regex> 19 20 namespace open_power 21 { 22 namespace occ 23 { 24 25 constexpr uint32_t fruTypeNotAvailable = 0xFF; 26 constexpr auto fruTypeSuffix = "fru_type"; 27 constexpr auto faultSuffix = "fault"; 28 constexpr auto inputSuffix = "input"; 29 constexpr auto maxSuffix = "max"; 30 31 const auto HOST_ON_FILE = "/run/openbmc/host@0-on"; 32 33 using namespace phosphor::logging; 34 using namespace std::literals::chrono_literals; 35 36 template <typename T> 37 T readFile(const std::string& path) 38 { 39 std::ifstream ifs; 40 ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit | 41 std::ifstream::eofbit); 42 T data; 43 44 try 45 { 46 ifs.open(path); 47 ifs >> data; 48 ifs.close(); 49 } 50 catch (const std::exception& e) 51 { 52 auto err = errno; 53 throw std::system_error(err, std::generic_category()); 54 } 55 56 return data; 57 } 58 59 // findAndCreateObjects(): 60 // Takes care of getting the required objects created and 61 // finds the available devices/processors. 
// (function is called every time the discoverTimer expires)
// - create the PowerMode object to control OCC modes
// - create statusObjects for each OCC device found
// - waits for OCC Active sensors PDRs to become available
// - restart discoverTimer if all data is not available yet
void Manager::findAndCreateObjects()
{
#ifndef POWER10
    for (auto id = 0; id < MAX_CPUS; ++id)
    {
        // Create one occ per cpu
        auto occ = std::string(OCC_NAME) + std::to_string(id);
        createObjects(occ);
    }
#else
    if (!pmode)
    {
        // Create the power mode object
        pmode = std::make_unique<powermode::PowerMode>(
            *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
    }

    if (!fs::exists(HOST_ON_FILE))
    {
        static bool statusObjCreated = false;
        if (!statusObjCreated)
        {
            // Create the OCCs based on the /dev/occX devices
            auto occs = findOCCsInDev();

            if (occs.empty() || (prevOCCSearch.size() != occs.size()))
            {
                // Something changed or no OCCs yet, try again in 10s.
                // Note on the first pass prevOCCSearch will be empty,
                // so there will be at least one delay to give things
                // a chance to settle.
                prevOCCSearch = occs;

                log<level::INFO>(
                    std::format(
                        "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})",
                        occs.size())
                        .c_str());

                discoverTimer->restartOnce(10s);
            }
            else
            {
                // All OCCs appear to be available, create status objects

                // createObjects requires OCC0 first.
                std::sort(occs.begin(), occs.end());

                log<level::INFO>(
                    std::format(
                        "Manager::findAndCreateObjects(): Creating {} OCC Status Objects",
                        occs.size())
                        .c_str());
                for (auto id : occs)
                {
                    createObjects(std::string(OCC_NAME) + std::to_string(id));
                }
                statusObjCreated = true;
                waitingForAllOccActiveSensors = true;

                // Find/update the processor path associated with each OCC
                for (auto& obj : statusObjects)
                {
                    obj->updateProcAssociation();
                }
            }
        }

        if (statusObjCreated && waitingForAllOccActiveSensors)
        {
            // Only trace the host state transitions once each way
            static bool tracedHostWait = false;
            if (utils::isHostRunning())
            {
                if (tracedHostWait)
                {
                    log<level::INFO>(
                        "Manager::findAndCreateObjects(): Host is running");
                    tracedHostWait = false;
                }
                checkAllActiveSensors();
            }
            else
            {
                if (!tracedHostWait)
                {
                    log<level::INFO>(
                        "Manager::findAndCreateObjects(): Waiting for host to start");
                    tracedHostWait = true;
                }
                discoverTimer->restartOnce(30s);
#ifdef PLDM
                if (throttlePldmTraceTimer->isEnabled())
                {
                    // Host is no longer running, disable throttle timer and
                    // make sure traces are not throttled
                    log<level::INFO>(
                        "findAndCreateObjects(): disabling sensor timer");
                    throttlePldmTraceTimer->setEnabled(false);
                    pldmHandle->setTraceThrottle(false);
                }
#endif
            }
        }
    }
    else
    {
        log<level::INFO>(
            std::format(
                "Manager::findAndCreateObjects(): Waiting for {} to complete...",
                HOST_ON_FILE)
                .c_str());
        discoverTimer->restartOnce(10s);
    }
#endif
}

#ifdef POWER10
// Check if all occActive sensors are available
// - consumes any active states that were queued before the status objects
//   existed
// - stops the discover timer once every OCC has reported, otherwise
//   re-arms it for another 10 seconds
void Manager::checkAllActiveSensors()
{
    static bool allActiveSensorAvailable = false;
    static bool tracedSensorWait = false;
    static bool waitingForHost = false;

    if (open_power::occ::utils::isHostRunning())
    {
        if (waitingForHost)
        {
            waitingForHost = false;
            log<level::INFO>("checkAllActiveSensors(): Host is now running");
        }

        // Start with the assumption that all are available
        allActiveSensorAvailable = true;
        for (auto& obj : statusObjects)
        {
            if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
            {
                auto instance = obj->getOccInstanceID();
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "checkAllActiveSensors(): OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    allActiveSensorAvailable = false;
                    if (!tracedSensorWait)
                    {
                        log<level::INFO>(
                            std::format(
                                "checkAllActiveSensors(): Waiting on OCC{} Active sensor",
                                instance)
                                .c_str());
                        tracedSensorWait = true;
#ifdef PLDM
                        // Make sure PLDM traces are not throttled
                        pldmHandle->setTraceThrottle(false);
                        // Start timer to throttle PLDM traces when timer
                        // expires
                        onPldmTimeoutCreatePel = false;
                        throttlePldmTraceTimer->restartOnce(5min);
#endif
                    }
#ifdef PLDM
                    pldmHandle->checkActiveSensor(obj->getOccInstanceID());
#endif
                    // Stop at the first missing sensor; the remaining OCCs
                    // are checked on the next pass.
                    break;
                }
            }
        }
    }
    else
    {
        if (!waitingForHost)
        {
            waitingForHost = true;
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for host to start");
#ifdef PLDM
            if (throttlePldmTraceTimer->isEnabled())
            {
                // Host is no longer running, disable throttle timer and
                // make sure traces are not throttled
                log<level::INFO>(
                    "checkAllActiveSensors(): disabling sensor timer");
                throttlePldmTraceTimer->setEnabled(false);
                pldmHandle->setTraceThrottle(false);
            }
#endif
        }
    }

    if (allActiveSensorAvailable)
    {
        // All sensors were found, disable the discovery timer
        if (discoverTimer->isEnabled())
        {
            discoverTimer->setEnabled(false);
        }
#ifdef PLDM
        if (throttlePldmTraceTimer->isEnabled())
        {
            // Disable throttle timer and make sure traces are not throttled
            throttlePldmTraceTimer->setEnabled(false);
            pldmHandle->setTraceThrottle(false);
        }
#endif
        if (waitingForAllOccActiveSensors)
        {
            log<level::INFO>(
                "checkAllActiveSensors(): OCC Active sensors are available");
            waitingForAllOccActiveSensors = false;
        }
        queuedActiveState.clear();
        tracedSensorWait = false;
    }
    else
    {
        // Not all sensors were available, so keep waiting
        if (!tracedSensorWait)
        {
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
            tracedSensorWait = true;
        }
        discoverTimer->restartOnce(10s);
    }
}
#endif

// Scan /dev for entries whose path ends in "occ<N>" and return the
// zero-based OCC instance numbers (N - 1) in directory iteration order.
std::vector<int> Manager::findOCCsInDev()
{
    std::vector<int> occs;
    std::regex expr{R"(occ(\d+)$)"};

    for (auto& file : fs::directory_iterator("/dev"))
    {
        std::smatch match;
        std::string path{file.path().string()};
        if (std::regex_search(path, match, expr))
        {
            auto num = std::stoi(match[1].str());

            // /dev numbering starts at 1, ours starts at 0.
            occs.push_back(num - 1);
        }
    }

    return occs;
}

// D-Bus callback invoked with the object path of a newly created CPU.
// Derives the OCC object name by replacing CPU_NAME with OCC_NAME in the
// last path element and creates the matching objects.
// Always returns 0.
int Manager::cpuCreated(sdbusplus::message_t& msg)
{
    namespace fs = std::filesystem;

    sdbusplus::message::object_path o;
    msg.read(o);
    fs::path cpuPath(std::string(std::move(o)));

    auto name = cpuPath.filename().string();
    const auto index = name.find(CPU_NAME);
    if (index == std::string::npos)
    {
        // replace() with npos would throw std::out_of_range out of this
        // callback, so ignore objects that are not named like a CPU.
        log<level::ERR>(
            std::format(
                "Manager::cpuCreated(): unexpected object name {}, ignoring",
                name)
                .c_str());
        return 0;
    }
    name.replace(index, std::strlen(CPU_NAME), OCC_NAME);

    createObjects(name);

    return 0;
}

// Create the Status and PassThrough objects for the OCC named `occ`
// (a path element below OCC_CONTROL_ROOT). The PowerCap object is created
// together with the first Status object.
void Manager::createObjects(const std::string& occ)
{
    auto path = fs::path(OCC_CONTROL_ROOT) / occ;

    statusObjects.emplace_back(std::make_unique<Status>(
        event, path.c_str(), *this,
#ifdef POWER10
        pmode,
#endif
        std::bind(std::mem_fn(&Manager::statusCallBack), this,
                  std::placeholders::_1, std::placeholders::_2)
#ifdef PLDM
            ,
        std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(),
                  std::placeholders::_1)
#endif
            ));

    // Create the power cap monitor object
    if (!pcap)
    {
        pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
            *statusObjects.back());
    }

    if (statusObjects.back()->isMasterOcc())
    {
        log<level::INFO>(
            std::format("Manager::createObjects(): OCC{} is the master",
                        statusObjects.back()->getOccInstanceID())
                .c_str());
        _pollTimer->setEnabled(false);

#ifdef POWER10
        // Set the master OCC on the PowerMode object
        pmode->setMasterOcc(path);
#endif
    }

    passThroughObjects.emplace_back(std::make_unique<PassThrough>(
        path.c_str()
#ifdef POWER10
            ,
        pmode
#endif
        ));
}

// Callback from a Status object when its OCC goes active (status == true)
// or inactive (status == false). Maintains activeCount and starts/stops
// the poll timer and the wait-for-all-OCCs timer accordingly.
void Manager::statusCallBack(instanceID instance, bool status)
{
    if (status == true)
    {
        // OCC went active
        ++activeCount;

#ifdef POWER10
        if (activeCount == 1)
        {
            // First OCC went active (allow some time for all OCCs to go active)
            waitForAllOccsTimer->restartOnce(60s);
        }
#endif

        if (activeCount == statusObjects.size())
        {
#ifdef POWER10
            // All OCCs are now running
            if (waitForAllOccsTimer->isEnabled())
            {
                // stop occ wait timer
                waitForAllOccsTimer->setEnabled(false);
            }
#endif

            // Verify master OCC and start presence monitor
            validateOccMaster();
        }

        // Start poll timer if not already started
        if (!_pollTimer->isEnabled())
        {
            log<level::INFO>(
                std::format("Manager: OCCs will be polled every {} seconds",
                            pollInterval)
                    .c_str());

            // Send poll and start OCC poll timer
            pollerTimerExpired();
        }
    }
    else
    {
        // OCC went away
        if (activeCount > 0)
        {
            --activeCount;
        }
        else
        {
            log<level::ERR>(
                std::format("OCC{} disabled, but currently no active OCCs",
                            instance)
                    .c_str());
        }

        if (activeCount == 0)
        {
            // No OCCs are running

            // Stop OCC poll timer
            if (_pollTimer->isEnabled())
            {
                log<level::INFO>(
                    "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
                _pollTimer->setEnabled(false);
            }

#ifdef POWER10
            // stop wait timer
            if (waitForAllOccsTimer->isEnabled())
            {
                waitForAllOccsTimer->setEnabled(false);
            }
#endif
        }
#ifdef READ_OCC_SENSORS
        // Clear OCC sensors
        setSensorValueToNaN(instance);
#endif
    }

#ifdef POWER10
    if (waitingForAllOccActiveSensors)
    {
        if (utils::isHostRunning())
        {
            checkAllActiveSensors();
        }
    }
#endif
}

#ifdef I2C_OCC
// Create one Status object per OCC hwmon device found under DEV_PATH and
// the PowerCap (and, on POWER10, PowerMode) object for the first device,
// which is treated as the master OCC.
void Manager::initStatusObjects()
{
    // Make sure we have a valid path string
    static_assert(sizeof(DEV_PATH) != 0);

    // D-Bus path of the first device created below; only consumed in the
    // POWER10 build.
    [[maybe_unused]] fs::path masterPath;

    auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
    for (auto& name : deviceNames)
    {
        i2c_occ::i2cToDbus(name);
        name = std::string(OCC_NAME) + '_' + name;
        auto path = fs::path(OCC_CONTROL_ROOT) / name;
        if (masterPath.empty())
        {
            masterPath = path;
        }
        statusObjects.emplace_back(
            std::make_unique<Status>(event, path.c_str(), *this));
    }

    if (statusObjects.empty())
    {
        // front() on an empty container is undefined behaviour
        log<level::ERR>(
            "Manager::initStatusObjects(): No OCC hwmon devices found");
        return;
    }

    // The first device is master occ
    pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
        *statusObjects.front());
#ifdef POWER10
    pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
                                                   powermode::PIPS_PATH);
    // Set the master OCC on the PowerMode object
    pmode->setMasterOcc(masterPath);
#endif
}
#endif

#ifdef PLDM
// Handle an SBE timeout for OCC `instance`: if that OCC is currently
// active, mark its SBE as not usable and request an HRESET over PLDM.
void Manager::sbeTimeout(unsigned int instance)
{
    auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
                            [instance](const auto& obj) {
                                return instance == obj->getOccInstanceID();
                            });

    if (obj != statusObjects.end() && (*obj)->occActive())
    {
        log<level::INFO>(
            std::format("SBE timeout, requesting HRESET (OCC{})", instance)
                .c_str());

        setSBEState(instance, SBE_STATE_NOT_USABLE);

        pldmHandle->sendHRESET(instance);
    }
}

// Apply an OCC active-state update for `instance`.
// - Status object exists: forward the state, unless the host is not
//   running and the OCC claims to be active (then the update is dropped
//   and discovery is re-armed).
// - No status object yet: queue/unqueue the state for later processing.
// Returns the result of Status::occActive(status) when the update was
// applied, otherwise false.
bool Manager::updateOCCActive(instanceID instance, bool status)
{
    auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
                            [instance](const auto& obj) {
                                return instance == obj->getOccInstanceID();
                            });

    const bool hostRunning = open_power::occ::utils::isHostRunning();
    if (obj != statusObjects.end())
    {
        if (!hostRunning && (status == true))
        {
            log<level::WARNING>(
                std::format(
                    "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received",
                    instance, status)
                    .c_str());
            (*obj)->setPldmSensorReceived(false);
            if (!waitingForAllOccActiveSensors)
            {
                log<level::INFO>(
                    "updateOCCActive: Waiting for Host and all OCC Active Sensors");
                waitingForAllOccActiveSensors = true;
            }
#ifdef POWER10
            discoverTimer->restartOnce(30s);
#endif
            return false;
        }
        else
        {
            (*obj)->setPldmSensorReceived(true);
            return (*obj)->occActive(status);
        }
    }
    else
    {
        if (hostRunning)
        {
            log<level::WARNING>(
                std::format(
                    "updateOCCActive: No status object to update for OCC{} (active={})",
                    instance, status)
                    .c_str());
        }
        else
        {
            if (status == true)
            {
                log<level::WARNING>(
                    std::format(
                        "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})",
                        instance, status)
                        .c_str());
            }
        }
        if (status == true)
        {
            // OCC went active
            queuedActiveState.insert(instance);
        }
        else
        {
            auto match = queuedActiveState.find(instance);
            if (match != queuedActiveState.end())
            {
                // OCC was disabled
                queuedActiveState.erase(match);
            }
        }
        return false;
    }
}

// Called upon pldm event To set powermode Safe Mode State for system.
void Manager::updateOccSafeMode(bool safeMode)
{
#ifdef POWER10
    pmode->updateDbusSafeMode(safeMode);
#endif
    // Update the processor throttle status on dbus
    for (auto& obj : statusObjects)
    {
        obj->updateThrottle(safeMode, THROTTLED_SAFE);
    }
}

// Handle the result of an HRESET request for OCC `instance`.
// On success the SBE is marked booted. On failure the SBE is marked
// failed and, if a dump is allowed, a PEL is created and an SBE dump is
// requested over D-Bus.
void Manager::sbeHRESETResult(instanceID instance, bool success)
{
    if (success)
    {
        log<level::INFO>(
            std::format("HRESET succeeded (OCC{})", instance).c_str());

        setSBEState(instance, SBE_STATE_BOOTED);

        return;
    }

    setSBEState(instance, SBE_STATE_FAILED);

    if (sbeCanDump(instance))
    {
        log<level::INFO>(
            std::format("HRESET failed (OCC{}), triggering SBE dump", instance)
                .c_str());

        auto& bus = utils::getBus();
        uint32_t src6 = instance << 16;
        uint32_t logId =
            FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
                            src6, "SBE command timeout");

        try
        {
            constexpr auto interface = "xyz.openbmc_project.Dump.Create";
            constexpr auto function = "CreateDump";

            std::string service =
                utils::getService(OP_DUMP_OBJ_PATH, interface);
            auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH,
                                              interface, function);

            std::map<std::string, std::variant<std::string, uint64_t>>
                createParams{
                    {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
                     uint64_t(logId)},
                    {"com.ibm.Dump.Create.CreateParameters.DumpType",
                     "com.ibm.Dump.Create.DumpType.SBE"},
                    {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
                     uint64_t(instance)},
                };

            method.append(createParams);

            auto response = bus.call(method);
        }
        catch (const sdbusplus::exception_t& e)
        {
            constexpr auto ERROR_DUMP_DISABLED =
                "xyz.openbmc_project.Dump.Create.Error.Disabled";
            // Compare the error name by content. Comparing e.name()
            // directly against the literal would compare pointer values
            // if name() yields a C string (NOTE(review): confirm against
            // the sdbusplus exception API).
            if (std::string{e.name()} == ERROR_DUMP_DISABLED)
            {
                log<level::INFO>("Dump is disabled, skipping");
            }
            else
            {
                log<level::ERR>("Dump failed");
            }
        }
    }
}

// Returns false when the SBE reports that a dump is not allowed or that
// its vital attention is active; returns true otherwise, including when
// the target cannot be found or the SBE state cannot be queried.
bool Manager::sbeCanDump(unsigned int instance)
{
    struct pdbg_target* proc = getPdbgTarget(instance);

    if (!proc)
    {
        // allow the dump in the error case
        return true;
    }

    try
    {
        if (!openpower::phal::sbe::isDumpAllowed(proc))
        {
            return false;
        }

        if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
        {
            return false;
        }
    }
    catch (const openpower::phal::exception::SbeError& e)
    {
        log<level::INFO>("Failed to query SBE state");
    }

    // allow the dump in the error case
    return true;
}

// Set the SBE state of processor `instance`. Does nothing when no pdbg
// target is found; a failure to set the state is logged, not propagated.
void Manager::setSBEState(unsigned int instance, enum sbe_state state)
{
    struct pdbg_target* proc = getPdbgTarget(instance);

    if (!proc)
    {
        return;
    }

    try
    {
        openpower::phal::sbe::setState(proc, state);
    }
    catch (const openpower::phal::exception::SbeError& e)
    {
        log<level::ERR>(
            std::format("Failed to set SBE state: {}", e.what()).c_str());
    }
}

// Return the pdbg "proc" target whose index equals `instance`, or nullptr
// when pdbg cannot be initialized or no such target exists. pdbg is
// initialized on first use.
struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
{
    if (!pdbgInitialized)
    {
        try
        {
            openpower::phal::pdbg::init();
            pdbgInitialized = true;
        }
        catch (const openpower::phal::exception::PdbgError& e)
        {
            log<level::ERR>("pdbg initialization failed");
            return nullptr;
        }
    }

    struct pdbg_target* proc = nullptr;
    pdbg_for_each_class_target("proc", proc)
    {
        if (pdbg_target_index(proc) == instance)
        {
            return proc;
        }
    }

    log<level::ERR>("Failed to get pdbg target");
    return nullptr;
}
#endif

// Poll every active OCC (and read its sensors when READ_OCC_SENSORS is
// defined), then re-arm the poll timer while at least one OCC is active.
void Manager::pollerTimerExpired()
{
    if (!_pollTimer)
    {
        log<level::ERR>(
            "Manager::pollerTimerExpired() ERROR: Timer not defined");
        return;
    }

    for (auto& obj : statusObjects)
    {
        if (!obj->occActive())
        {
            // OCC is not running yet
#ifdef READ_OCC_SENSORS
            auto id = obj->getOccInstanceID();
            setSensorValueToNaN(id);
#endif
            continue;
        }

        // Read sysfs to force kernel to poll OCC
        obj->readOccState();

#ifdef READ_OCC_SENSORS
        // Read occ sensor values
        getSensorValues(obj);
#endif
    }

    if (activeCount > 0)
    {
        // Restart OCC poll timer
        _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
    }
    else
    {
        // No OCCs running, so poll timer will not be restarted
        log<level::INFO>(
            "Manager::pollerTimerExpired: poll timer will not be restarted");
    }
}

#ifdef READ_OCC_SENSORS
// Read every temp<N>_* sensor below the hwmon directory `path` and
// publish the temperatures for OCC `occInstance` on D-Bus.
void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
{
    // There may be more than one sensor with the same FRU type
    // and label so make two passes: the first to read the temps
    // from sysfs, and the second to put them on D-Bus after
    // resolving any conflicts.
    std::map<std::string, double> sensorData;

    std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
    for (auto& file : fs::directory_iterator(path))
    {
        const std::string labelPath = file.path().string();
        if (!std::regex_search(labelPath, expr))
        {
            continue;
        }

        uint32_t labelValue{0};

        try
        {
            labelValue = readFile<uint32_t>(labelPath);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            labelPath, e.code().value())
                    .c_str());
            continue;
        }

        // Strip the trailing "label" to get the common "…/temp<N>_" prefix
        const std::string tempLabel{"label"};
        const std::string filePathString =
            labelPath.substr(0, labelPath.length() - tempLabel.length());

        uint32_t fruTypeValue{0};
        try
        {
            fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            filePathString + fruTypeSuffix, e.code().value())
                    .c_str());
            continue;
        }

        std::string sensorPath =
            OCC_SENSORS_ROOT + std::string("/temperature/");

        std::string dvfsTempPath;

        if (fruTypeValue == VRMVdd)
        {
            sensorPath.append(
                "vrm_vdd" + std::to_string(occInstance) + "_temp");
        }
        else if (fruTypeValue == processorIoRing)
        {
            sensorPath.append(
                "proc" + std::to_string(occInstance) + "_ioring_temp");
            dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
                           std::to_string(occInstance) + "_ioring_dvfs_temp";
        }
        else
        {
            // Top byte of the label is the sensor type, low 16 bits the
            // instance number
            uint16_t type = (labelValue & 0xFF000000) >> 24;
            uint16_t instanceID = labelValue & 0x0000FFFF;

            if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
            {
                if (fruTypeValue == fruTypeNotAvailable)
                {
                    // Not all DIMM related temps are available to read
                    // (no _input file in this case)
                    continue;
                }
                auto iter = dimmTempSensorName.find(fruTypeValue);
                if (iter == dimmTempSensorName.end())
                {
                    log<level::ERR>(
                        std::format(
                            "readTempSensors: Fru type error! fruTypeValue = {}) ",
                            fruTypeValue)
                            .c_str());
                    continue;
                }

                sensorPath.append(
                    "dimm" + std::to_string(instanceID) + iter->second);

                dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
                               dimmDVFSSensorName.at(fruTypeValue);
            }
            else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
            {
                if (fruTypeValue == processorCore)
                {
                    // The OCC reports small core temps, of which there are
                    // two per big core. All current P10 systems are in big
                    // core mode, so use a big core name.
                    uint16_t coreNum = instanceID / 2;
                    uint16_t tempNum = instanceID % 2;
                    sensorPath.append("proc" + std::to_string(occInstance) +
                                      "_core" + std::to_string(coreNum) + "_" +
                                      std::to_string(tempNum) + "_temp");

                    dvfsTempPath =
                        std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
                        std::to_string(occInstance) + "_core_dvfs_temp";
                }
                else
                {
                    continue;
                }
            }
            else
            {
                continue;
            }
        }

        // The dvfs temp file only needs to be read once per chip per type.
        if (!dvfsTempPath.empty() &&
            !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
        {
            try
            {
                auto dvfsValue = readFile<double>(filePathString + maxSuffix);

                dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
                    dvfsTempPath, dvfsValue * std::pow(10, -3));
            }
            catch (const std::system_error& e)
            {
                log<level::DEBUG>(
                    std::format(
                        "readTempSensors: Failed reading {}, errno = {}",
                        filePathString + maxSuffix, e.code().value())
                        .c_str());
            }
        }

        uint32_t faultValue{0};
        try
        {
            faultValue = readFile<uint32_t>(filePathString + faultSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            filePathString + faultSuffix, e.code().value())
                    .c_str());
            continue;
        }

        double tempValue{0};
        // NOTE: if OCC sends back 0xFF, kernel sets this fault value to 1.
        if (faultValue != 0)
        {
            tempValue = std::numeric_limits<double>::quiet_NaN();
        }
        else
        {
            // Read the temperature
            try
            {
                tempValue = readFile<double>(filePathString + inputSuffix);
            }
            catch (const std::system_error& e)
            {
                log<level::DEBUG>(
                    std::format(
                        "readTempSensors: Failed reading {}, errno = {}",
                        filePathString + inputSuffix, e.code().value())
                        .c_str());

                // if errno == EAGAIN(Resource temporarily unavailable) then set
                // temp to 0, to avoid using old temp, and affecting FAN
                // Control.
                if (e.code().value() == EAGAIN)
                {
                    tempValue = 0;
                }
                // else the errno would be something like
                //     EBADF(Bad file descriptor)
                //  or ENOENT(No such file or directory)
                else
                {
                    continue;
                }
            }
        }

        // If this object path already has a value, only overwrite
        // it if the previous one was an NaN or a smaller value.
        auto existing = sensorData.find(sensorPath);
        if (existing != sensorData.end())
        {
            // Multiple sensors found for this FRU type
            if ((std::isnan(existing->second) && (tempValue == 0)) ||
                ((existing->second == 0) && std::isnan(tempValue)))
            {
                // One of the redundant sensors has failed (0xFF/nan), and the
                // other sensor has no reading (0), so set the FRU to NaN to
                // force fan increase
                tempValue = std::numeric_limits<double>::quiet_NaN();
                existing->second = tempValue;
            }
            if (std::isnan(existing->second) || (tempValue > existing->second))
            {
                existing->second = tempValue;
            }
        }
        else
        {
            // First sensor for this FRU type
            sensorData[sensorPath] = tempValue;
        }
    }

    // Now publish the values on D-Bus.
    for (const auto& [objectPath, value] : sensorData)
    {
        dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
                                                    value * std::pow(10, -3));

        dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
            objectPath, !std::isnan(value));

        if (existingSensors.find(objectPath) == existingSensors.end())
        {
            dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
                objectPath);
        }

        existingSensors[objectPath] = occInstance;
    }
}

// Extract the function ID from a power sensor label.
// Returns "system" for the label "system", the middle field for labels of
// the form <sensor id>_<function id>_<apss channel>, and std::nullopt for
// labels with fewer than two underscores.
std::optional<std::string>
    Manager::getPowerLabelFunctionID(const std::string& value)
{
    // If the value is "system", then the FunctionID is "system".
    if (value == "system")
    {
        return value;
    }

    // If the value is not "system", then the label value have 3 numbers, of
    // which we only care about the middle one:
    // <sensor id>_<function id>_<apss channel>
    // eg: The value is "0_10_5" , then the FunctionID is "10".
    const auto first = value.find('_');
    if (first == std::string::npos)
    {
        return std::nullopt;
    }

    const auto second = value.find('_', first + 1);
    if (second == std::string::npos)
    {
        return std::nullopt;
    }

    return value.substr(first + 1, second - first - 1);
}

// Read every power<N>_* sensor below the hwmon directory `path` whose
// label maps to a known power sensor name and publish the value on D-Bus,
// recording OCC `id` as the owner of each sensor.
void Manager::readPowerSensors(const fs::path& path, uint32_t id)
{
    std::regex expr{"power\\d+_label$"}; // Example: power5_label
    for (auto& file : fs::directory_iterator(path))
    {
        const std::string labelPath = file.path().string();
        if (!std::regex_search(labelPath, expr))
        {
            continue;
        }

        std::string labelValue;
        try
        {
            labelValue = readFile<std::string>(labelPath);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            labelPath, e.code().value())
                    .c_str());
            continue;
        }

        auto functionID = getPowerLabelFunctionID(labelValue);
        if (functionID == std::nullopt)
        {
            continue;
        }

        // Strip the trailing "label" to get the common "…/power<N>_" prefix
        const std::string tempLabel{"label"};
        const std::string filePathString =
            labelPath.substr(0, labelPath.length() - tempLabel.length());

        std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");

        auto iter = powerSensorName.find(*functionID);
        if (iter == powerSensorName.end())
        {
            continue;
        }
        sensorPath.append(iter->second);

        double tempValue{0};

        try
        {
            tempValue = readFile<double>(filePathString + inputSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            filePathString + inputSuffix, e.code().value())
                    .c_str());
            continue;
        }

        dbus::OccDBusSensors::getOccDBus().setUnit(
            sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");

        dbus::OccDBusSensors::getOccDBus().setValue(
            sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3));

        dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
            sensorPath, true);

        if (existingSensors.find(sensorPath) == existingSensors.end())
        {
            dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
                sensorPath);
        }

        existingSensors[sensorPath] = id;
    }
}

// Set the value of every sensor owned by OCC `id` to NaN while leaving
// its operational status set to true.
void Manager::setSensorValueToNaN(uint32_t id) const
{
    for (const auto& [sensorPath, occId] : existingSensors)
    {
        if (occId == id)
        {
            dbus::OccDBusSensors::getOccDBus().setValue(
                sensorPath, std::numeric_limits<double>::quiet_NaN());

            dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
                sensorPath, true);
        }
    }
}

// Set the value of every sensor owned by OCC `id` to NaN and mark it as
// not operational.
void Manager::setSensorValueToNonFunctional(uint32_t id) const
{
    for (const auto& [sensorPath, occId] : existingSensors)
    {
        if (occId == id)
        {
            dbus::OccDBusSensors::getOccDBus().setValue(
                sensorPath, std::numeric_limits<double>::quiet_NaN());

            dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
                sensorPath, false);
        }
    }
}

// Read the temperature sensors of `occ` (plus the power sensors when it
// is the master OCC). A missing hwmon path is traced once per OCC until
// the path shows up again.
void Manager::getSensorValues(std::unique_ptr<Status>& occ)
{
    constexpr uint32_t maxTracedOccs = 8;
    static bool tracedError[maxTracedOccs] = {};
    const fs::path sensorPath = occ->getHwmonPath();
    const uint32_t id = occ->getOccInstanceID();

    // Instance IDs beyond the table are not throttled (and never index
    // past the end of tracedError).
    bool untracked = false;
    bool& traced = (id < maxTracedOccs) ? tracedError[id] : untracked;

    if (fs::exists(sensorPath))
    {
        // Read temperature sensors
        readTempSensors(sensorPath, id);

        if (occ->isMasterOcc())
        {
            // Read power sensors
            readPowerSensors(sensorPath, id);
        }
        traced = false;
    }
    else
    {
        if (!traced)
        {
            log<level::ERR>(
                std::format(
                    "Manager::getSensorValues: OCC{} sensor path missing: {}",
                    id, sensorPath.c_str())
                    .c_str());
            traced = true;
        }
    }
}
#endif

// Read the altitude from DBus
// Values below 0 are stored as 0, values below 0xFFFF are rounded to the
// nearest meter; anything else leaves the stored altitude untouched.
// A D-Bus failure sets the altitude to 0xFFFF (not available).
void Manager::readAltitude()
{
    static bool traceAltitudeErr = true;

    utils::PropertyValue altitudeProperty{};
    try
    {
        altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
                                              ALTITUDE_PROP);
        auto sensorVal = std::get<double>(altitudeProperty);
        if (sensorVal < 0xFFFF)
        {
            if (sensorVal < 0)
            {
                altitude = 0;
            }
            else
            {
                // Round to nearest meter
                altitude = uint16_t(sensorVal + 0.5);
            }
            log<level::DEBUG>(std::format("readAltitude: sensor={} ({}m)",
                                          sensorVal, altitude)
                                  .c_str());
            traceAltitudeErr = true;
        }
        else
        {
            if (traceAltitudeErr)
            {
                traceAltitudeErr = false;
                log<level::DEBUG>(
                    std::format("Invalid altitude value: {}", sensorVal)
                        .c_str());
            }
        }
    }
    catch (const sdbusplus::exception_t& e)
    {
        if (traceAltitudeErr)
        {
            traceAltitudeErr = false;
            log<level::INFO>(
                std::format("Unable to read Altitude: {}", e.what()).c_str());
        }
        altitude = 0xFFFF; // not available
    }
}

// Callback function when ambient temperature changes
// The reading is rounded to the nearest degree C and stored in 8 bits,
// where 0xFF means "not available".
void Manager::ambientCallback(sdbusplus::message_t& msg)
{
    double currentTemp = 0;
    uint8_t truncatedTemp = 0xFF;
    std::string msgSensor;
    std::map<std::string, std::variant<double>> msgData;
    msg.read(msgSensor, msgData);

    auto valPropMap = msgData.find(AMBIENT_PROP);
    if (valPropMap == msgData.end())
    {
        log<level::DEBUG>("ambientCallback: Unknown ambient property changed");
        return;
    }
    currentTemp = std::get<double>(valPropMap->second);
    if (std::isnan(currentTemp) || (currentTemp >= 254.5))
    {
        // NaN, or a reading that rounds to 255 or more. Converting a
        // double that does not fit in uint8_t is undefined behaviour, so
        // report it as not available instead.
        truncatedTemp = 0xFF;
    }
    else if (currentTemp < 0)
    {
        truncatedTemp = 0;
    }
    else
    {
        // Round to nearest degree C
        truncatedTemp = static_cast<uint8_t>(currentTemp + 0.5);
    }

    // If ambient changes, notify OCCs
    if (truncatedTemp != ambient)
    {
        log<level::DEBUG>(
            std::format("ambientCallback: Ambient change from {} to {}C",
                        ambient, currentTemp)
                .c_str());

        ambient = truncatedTemp;
        if (altitude == 0xFFFF)
        {
            // No altitude yet, try reading again
            readAltitude();
        }

        log<level::DEBUG>(
            std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient,
                        altitude)
                .c_str());
#ifdef POWER10
        // Send ambient and altitude to all OCCs
        for (auto& obj : statusObjects)
        {
            if (obj->occActive())
            {
                obj->sendAmbient(ambient, altitude);
            }
        }
#endif // POWER10
    }
}

// return the current ambient and altitude readings
// ambientValid is false when the stored ambient is 0xFF (not available)
void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
                             uint16_t& altitudeValue) const
{
    ambientValid = (ambient != 0xFF);
    ambientTemp = ambient;
    altitudeValue = altitude;
}

#ifdef POWER10
// Called when waitForAllOccsTimer expires
// After the first OCC goes active, this timer will be started (60 seconds)
void Manager::occsNotAllRunning()
{
    if (activeCount != statusObjects.size())
    {
        // Not all OCCs went active
        log<level::WARNING>(
            std::format(
                "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
                activeCount, statusObjects.size())
                .c_str());
        // Procs may be garded, so may be expected
    }

    validateOccMaster();
}

#ifdef PLDM
// Called when throttlePldmTraceTimer expires.
// If this timer expires, that indicates there are no OCC active sensor PDRs
// found which will trigger pldm traces to be throttled.
// The second time this timer expires, a PEL will get created.
1381 void Manager::throttlePldmTraceExpired() 1382 { 1383 if (utils::isHostRunning()) 1384 { 1385 if (!onPldmTimeoutCreatePel) 1386 { 1387 // Throttle traces 1388 pldmHandle->setTraceThrottle(true); 1389 // Restart timer to log a PEL when timer expires 1390 onPldmTimeoutCreatePel = true; 1391 throttlePldmTraceTimer->restartOnce(40min); 1392 } 1393 else 1394 { 1395 log<level::ERR>( 1396 "throttlePldmTraceExpired(): OCC active sensors still not available!"); 1397 // Create PEL 1398 createPldmSensorPEL(); 1399 } 1400 } 1401 else 1402 { 1403 // Make sure traces are not throttled 1404 pldmHandle->setTraceThrottle(false); 1405 log<level::INFO>( 1406 "throttlePldmTraceExpired(): host it not running ignoring sensor timer"); 1407 } 1408 } 1409 1410 void Manager::createPldmSensorPEL() 1411 { 1412 Error::Descriptor d = Error::Descriptor(MISSING_OCC_SENSORS_PATH); 1413 std::map<std::string, std::string> additionalData; 1414 1415 additionalData.emplace("_PID", std::to_string(getpid())); 1416 1417 log<level::INFO>( 1418 std::format( 1419 "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs") 1420 .c_str()); 1421 1422 auto& bus = utils::getBus(); 1423 1424 try 1425 { 1426 FFDCFiles ffdc; 1427 // Add occ-control journal traces to PEL FFDC 1428 auto occJournalFile = 1429 FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40); 1430 1431 static constexpr auto loggingObjectPath = 1432 "/xyz/openbmc_project/logging"; 1433 static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL"; 1434 std::string service = 1435 utils::getService(loggingObjectPath, opLoggingInterface); 1436 auto method = 1437 bus.new_method_call(service.c_str(), loggingObjectPath, 1438 opLoggingInterface, "CreatePELWithFFDCFiles"); 1439 1440 // Set level to Warning (Predictive). 
1441 auto level = 1442 sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage( 1443 sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level:: 1444 Warning); 1445 1446 method.append(d.path, level, additionalData, ffdc); 1447 bus.call(method); 1448 } 1449 catch (const sdbusplus::exception_t& e) 1450 { 1451 log<level::ERR>( 1452 std::format("Failed to create MISSING_OCC_SENSORS PEL: {}", 1453 e.what()) 1454 .c_str()); 1455 } 1456 } 1457 #endif // PLDM 1458 #endif // POWER10 1459 1460 // Verify single master OCC and start presence monitor 1461 void Manager::validateOccMaster() 1462 { 1463 int masterInstance = -1; 1464 for (auto& obj : statusObjects) 1465 { 1466 auto instance = obj->getOccInstanceID(); 1467 #ifdef POWER10 1468 if (!obj->occActive()) 1469 { 1470 if (utils::isHostRunning()) 1471 { 1472 // Check if sensor was queued while waiting for discovery 1473 auto match = queuedActiveState.find(instance); 1474 if (match != queuedActiveState.end()) 1475 { 1476 queuedActiveState.erase(match); 1477 log<level::INFO>( 1478 std::format( 1479 "validateOccMaster: OCC{} is ACTIVE (queued)", 1480 instance) 1481 .c_str()); 1482 obj->occActive(true); 1483 } 1484 else 1485 { 1486 // OCC does not appear to be active yet, check active sensor 1487 #ifdef PLDM 1488 pldmHandle->checkActiveSensor(instance); 1489 #endif 1490 if (obj->occActive()) 1491 { 1492 log<level::INFO>( 1493 std::format( 1494 "validateOccMaster: OCC{} is ACTIVE after reading sensor", 1495 instance) 1496 .c_str()); 1497 } 1498 } 1499 } 1500 else 1501 { 1502 log<level::WARNING>( 1503 std::format( 1504 "validateOccMaster: HOST is not running (OCC{})", 1505 instance) 1506 .c_str()); 1507 return; 1508 } 1509 } 1510 #endif // POWER10 1511 1512 if (obj->isMasterOcc()) 1513 { 1514 obj->addPresenceWatchMaster(); 1515 1516 if (masterInstance == -1) 1517 { 1518 masterInstance = instance; 1519 } 1520 else 1521 { 1522 log<level::ERR>( 1523 std::format( 1524 "validateOccMaster: Multiple OCC masters! 
({} and {})", 1525 masterInstance, instance) 1526 .c_str()); 1527 // request reset 1528 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH)); 1529 } 1530 } 1531 } 1532 1533 if (masterInstance < 0) 1534 { 1535 log<level::ERR>( 1536 std::format("validateOccMaster: Master OCC not found! (of {} OCCs)", 1537 statusObjects.size()) 1538 .c_str()); 1539 // request reset 1540 statusObjects.front()->deviceError( 1541 Error::Descriptor(PRESENCE_ERROR_PATH)); 1542 } 1543 else 1544 { 1545 log<level::INFO>( 1546 std::format("validateOccMaster: OCC{} is master of {} OCCs", 1547 masterInstance, activeCount) 1548 .c_str()); 1549 #ifdef POWER10 1550 pmode->updateDbusSafeMode(false); 1551 #endif 1552 } 1553 } 1554 1555 void Manager::updatePcapBounds() const 1556 { 1557 if (pcap) 1558 { 1559 pcap->updatePcapBounds(); 1560 } 1561 } 1562 1563 } // namespace occ 1564 } // namespace open_power 1565