1 #include "occ_status.hpp" 2 3 #include "occ_manager.hpp" 4 #include "occ_sensor.hpp" 5 #include "powermode.hpp" 6 #include "utils.hpp" 7 8 #include <phosphor-logging/log.hpp> 9 10 #include <filesystem> 11 #include <format> 12 13 namespace open_power 14 { 15 namespace occ 16 { 17 18 using namespace phosphor::logging; 19 20 using ThrottleObj = 21 sdbusplus::xyz::openbmc_project::Control::Power::server::Throttle; 22 23 // Handles updates to occActive property 24 bool Status::occActive(bool value) 25 { 26 if (value != this->occActive()) 27 { 28 log<level::INFO>(std::format("Status::occActive OCC{} changed to {}", 29 instance, value) 30 .c_str()); 31 if (value) 32 { 33 // Clear prior throttle reason (before setting device active) 34 updateThrottle(false, THROTTLED_ALL); 35 36 // Set the device active 37 device.setActive(true); 38 39 // Reset last OCC state 40 lastState = 0; 41 42 // Start watching for errors (throttles, etc) 43 try 44 { 45 addErrorWatch(); 46 } 47 catch (const OpenFailure& e) 48 { 49 // Failed to add watch for throttle events, request reset to try 50 // to recover comm 51 log<level::ERR>( 52 std::format( 53 "Status::occActive: Unable to add error watch(s) for OCC{} watch: {}", 54 instance, e.what()) 55 .c_str()); 56 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH)); 57 return Base::Status::occActive(false); 58 } 59 60 // Update the OCC active sensor 61 Base::Status::occActive(value); 62 63 if (device.master()) 64 { 65 // Update powercap bounds from OCC 66 manager.updatePcapBounds(); 67 } 68 69 // Call into Manager to let know that we have bound 70 if (this->managerCallBack) 71 { 72 this->managerCallBack(instance, value); 73 } 74 } 75 else 76 { 77 #ifdef POWER10 78 if (pmode && device.master()) 79 { 80 // Prevent mode changes 81 pmode->setMasterActive(false); 82 } 83 if (safeStateDelayTimer.isEnabled()) 84 { 85 // stop safe delay timer 86 safeStateDelayTimer.setEnabled(false); 87 } 88 #endif 89 // Call into Manager to let know that we will unbind. 90 if (this->managerCallBack) 91 { 92 this->managerCallBack(instance, value); 93 } 94 95 // Stop watching for errors 96 removeErrorWatch(); 97 98 // Set the device inactive 99 device.setActive(false); 100 101 // Clear throttles (OCC not active after disabling device) 102 updateThrottle(false, THROTTLED_ALL); 103 } 104 } 105 else if (value && !device.active()) 106 { 107 // Existing error watch is on a dead file descriptor. 108 removeErrorWatch(); 109 110 /* 111 * In it's constructor, Status checks Device::bound() to see if OCC is 112 * active or not. 113 * Device::bound() checks for occX-dev0 directory. 114 * We will lose occX-dev0 directories during FSI rescan. 115 * So, if we start this application (and construct Status), and then 116 * later do FSI rescan, we will end up with occActive = true and device 117 * NOT bound. Lets correct that situation here. 118 */ 119 device.setActive(true); 120 121 // Add error watch again 122 try 123 { 124 addErrorWatch(); 125 } 126 catch (const OpenFailure& e) 127 { 128 // Failed to add watch for throttle events, request reset to try to 129 // recover comm 130 log<level::ERR>( 131 std::format( 132 "Status::occActive: Unable to add error watch(s) again for OCC{} watch: {}", 133 instance, e.what()) 134 .c_str()); 135 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH)); 136 return Base::Status::occActive(false); 137 } 138 } 139 else if (!value && device.active()) 140 { 141 removeErrorWatch(); 142 143 // In the event that the application never receives the active signal 144 // even though the OCC is active (this can occur if the BMC is rebooted 145 // with the host on, since the initial OCC driver probe will discover 146 // the OCCs), this application needs to be able to unbind the device 147 // when we get the OCC inactive signal. 148 device.setActive(false); 149 } 150 return Base::Status::occActive(value); 151 } 152 153 // Callback handler when a device error is reported. 154 void Status::deviceError(Error::Descriptor d) 155 { 156 #ifdef POWER10 157 if (pmode && device.master()) 158 { 159 // Prevent mode changes 160 pmode->setMasterActive(false); 161 } 162 #endif 163 164 if (d.log) 165 { 166 FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout); 167 } 168 169 // This would deem OCC inactive 170 this->occActive(false); 171 172 // Reset the OCC 173 this->resetOCC(); 174 } 175 176 // Sends message to host control command handler to reset OCC 177 void Status::resetOCC() 178 { 179 log<level::INFO>( 180 std::format(">>Status::resetOCC() - requesting reset for OCC{}", 181 instance) 182 .c_str()); 183 this->occActive(false); 184 #ifdef PLDM 185 if (resetCallBack) 186 { 187 this->resetCallBack(instance); 188 } 189 #else 190 constexpr auto CONTROL_HOST_PATH = "/org/open_power/control/host0"; 191 constexpr auto CONTROL_HOST_INTF = "org.open_power.Control.Host"; 192 193 // This will throw exception on failure 194 auto service = utils::getService(CONTROL_HOST_PATH, CONTROL_HOST_INTF); 195 196 auto& bus = utils::getBus(); 197 auto method = bus.new_method_call(service.c_str(), CONTROL_HOST_PATH, 198 CONTROL_HOST_INTF, "Execute"); 199 // OCC Reset control command 200 method.append(convertForMessage(Control::Host::Command::OCCReset).c_str()); 201 202 // OCC Sensor ID for callout reasons 203 method.append(std::variant<uint8_t>(std::get<0>(sensorMap.at(instance)))); 204 bus.call_noreply(method); 205 return; 206 #endif 207 } 208 209 // Handler called by Host control command handler to convey the 210 // status of the executed command 211 void Status::hostControlEvent(sdbusplus::message_t& msg) 212 { 213 std::string cmdCompleted{}; 214 std::string cmdStatus{}; 215 216 msg.read(cmdCompleted, cmdStatus); 217 218 log<level::DEBUG>("Host control signal values", 219 entry("COMMAND=%s", cmdCompleted.c_str()), 220 entry("STATUS=%s", cmdStatus.c_str())); 221 222 if (Control::Host::convertResultFromString(cmdStatus) != 223 Control::Host::Result::Success) 224 { 225 if (Control::Host::convertCommandFromString(cmdCompleted) == 226 Control::Host::Command::OCCReset) 227 { 228 // Must be a Timeout. Log an Error trace 229 log<level::ERR>( 230 "Error resetting the OCC.", entry("PATH=%s", path.c_str()), 231 entry("SENSORID=0x%X", std::get<0>(sensorMap.at(instance)))); 232 } 233 } 234 return; 235 } 236 237 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC. 238 void Status::readOccState() 239 { 240 if (stateValid) 241 { 242 // Reset retry count (since state is good) 243 currentOccReadRetriesCount = occReadRetries; 244 } 245 occReadStateNow(); 246 } 247 248 #ifdef POWER10 249 // Special processing that needs to happen once the OCCs change to ACTIVE state 250 void Status::occsWentActive() 251 { 252 CmdStatus status = CmdStatus::SUCCESS; 253 254 status = pmode->sendModeChange(); 255 if (status != CmdStatus::SUCCESS) 256 { 257 log<level::ERR>( 258 std::format( 259 "Status::occsWentActive: OCC mode change failed with status {}", 260 status) 261 .c_str()); 262 263 // Disable and reset to try recovering 264 deviceError(); 265 } 266 267 status = pmode->sendIpsData(); 268 if (status != CmdStatus::SUCCESS) 269 { 270 log<level::ERR>( 271 std::format( 272 "Status::occsWentActive: Sending Idle Power Save Config data failed with status {}", 273 status) 274 .c_str()); 275 276 if (status == CmdStatus::COMM_FAILURE) 277 { 278 // Disable and reset to try recovering 279 deviceError(); 280 } 281 } 282 } 283 284 // Send Ambient and Altitude to the OCC 285 CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude) 286 { 287 CmdStatus status = CmdStatus::FAILURE; 288 bool ambientValid = true; 289 uint8_t ambientTemp = inTemp; 290 uint16_t altitude = inAltitude; 291 292 if (ambientTemp == 0xFF) 293 { 294 // Get latest readings from manager 295 manager.getAmbientData(ambientValid, ambientTemp, altitude); 296 log<level::DEBUG>( 297 std::format("sendAmbient: valid: {}, Ambient: {}C, altitude: {}m", 298 ambientValid, ambientTemp, altitude) 299 .c_str()); 300 } 301 302 std::vector<std::uint8_t> cmd, rsp; 303 cmd.reserve(11); 304 cmd.push_back(uint8_t(CmdType::SEND_AMBIENT)); 305 cmd.push_back(0x00); // Data Length (2 bytes) 306 cmd.push_back(0x08); // 307 cmd.push_back(0x00); // Version 308 cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status 309 cmd.push_back(ambientTemp); // Ambient Temperature 310 cmd.push_back(altitude >> 8); // Altitude in meters (2 bytes) 311 cmd.push_back(altitude & 0xFF); // 312 cmd.push_back(0x00); // Reserved (3 bytes) 313 cmd.push_back(0x00); 314 cmd.push_back(0x00); 315 log<level::DEBUG>(std::format("sendAmbient: SEND_AMBIENT " 316 "command to OCC{} ({} bytes)", 317 instance, cmd.size()) 318 .c_str()); 319 status = occCmd.send(cmd, rsp); 320 if (status == CmdStatus::SUCCESS) 321 { 322 if (rsp.size() == 5) 323 { 324 if (RspStatus::SUCCESS != RspStatus(rsp[2])) 325 { 326 log<level::ERR>( 327 std::format( 328 "sendAmbient: SEND_AMBIENT failed with rspStatus 0x{:02X}", 329 rsp[2]) 330 .c_str()); 331 dump_hex(rsp); 332 status = CmdStatus::FAILURE; 333 } 334 } 335 else 336 { 337 log<level::ERR>( 338 std::format( 339 "sendAmbient: INVALID SEND_AMBIENT response length:{}", 340 rsp.size()) 341 .c_str()); 342 dump_hex(rsp); 343 status = CmdStatus::FAILURE; 344 } 345 } 346 else 347 { 348 log<level::ERR>( 349 std::format( 350 "sendAmbient: SEND_AMBIENT FAILED! with status 0x{:02X}", 351 status) 352 .c_str()); 353 354 if (status == CmdStatus::COMM_FAILURE) 355 { 356 // Disable due to OCC comm failure and reset to try recovering 357 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH)); 358 } 359 } 360 361 return status; 362 } 363 364 // Called when safe timer expires to determine if OCCs need to be reset 365 void Status::safeStateDelayExpired() 366 { 367 if (this->occActive()) 368 { 369 log<level::INFO>( 370 std::format( 371 "safeStateDelayExpired: OCC{} state missing or not valid, requesting reset", 372 instance) 373 .c_str()); 374 // Disable and reset to try recovering 375 deviceError(Error::Descriptor(SAFE_ERROR_PATH)); 376 } 377 } 378 #endif // POWER10 379 380 fs::path Status::getHwmonPath() 381 { 382 using namespace std::literals::string_literals; 383 384 if (!fs::exists(hwmonPath)) 385 { 386 static bool tracedFail[8] = {0}; 387 388 if (!hwmonPath.empty()) 389 { 390 log<level::WARNING>( 391 std::format("Status::getHwmonPath(): path no longer exists: {}", 392 hwmonPath.c_str()) 393 .c_str()); 394 hwmonPath.clear(); 395 } 396 397 // Build the base HWMON path 398 fs::path prefixPath = 399 fs::path{OCC_HWMON_PATH + "occ-hwmon."s + 400 std::to_string(instance + 1) + "/hwmon/"s}; 401 402 // Get the hwmonXX directory name 403 try 404 { 405 // there should only be one directory 406 const int numDirs = std::distance( 407 fs::directory_iterator(prefixPath), fs::directory_iterator{}); 408 if (numDirs == 1) 409 { 410 hwmonPath = *fs::directory_iterator(prefixPath); 411 tracedFail[instance] = false; 412 } 413 else 414 { 415 if (!tracedFail[instance]) 416 { 417 log<level::ERR>( 418 std::format( 419 "Status::getHwmonPath(): Found multiple ({}) hwmon paths!", 420 numDirs) 421 .c_str()); 422 tracedFail[instance] = true; 423 } 424 } 425 } 426 catch (const fs::filesystem_error& e) 427 { 428 if (!tracedFail[instance]) 429 { 430 log<level::ERR>( 431 std::format( 432 "Status::getHwmonPath(): error accessing {}: {}", 433 prefixPath.c_str(), e.what()) 434 .c_str()); 435 tracedFail[instance] = true; 436 } 437 } 438 } 439 440 return hwmonPath; 441 } 442 443 // Called to read state and handle any errors 444 void Status::occReadStateNow() 445 { 446 unsigned int state; 447 const fs::path filename = 448 fs::path(DEV_PATH) / 449 fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state"; 450 451 std::ifstream file; 452 bool goodFile = false; 453 454 // open file. 455 file.open(filename, std::ios::in); 456 const int openErrno = errno; 457 458 // File is open and state can be used. 459 if (file.is_open() && file.good()) 460 { 461 goodFile = true; 462 file >> state; 463 // Read the error code (if any) to check status of the read 464 std::ios_base::iostate readState = file.rdstate(); 465 if (readState) 466 { 467 // There was a failure reading the file 468 if (lastOccReadStatus != -1) 469 { 470 // Trace error bits 471 std::string errorBits = ""; 472 if (readState & std::ios_base::eofbit) 473 { 474 errorBits += " EOF"; 475 } 476 if (readState & std::ios_base::failbit) 477 { 478 errorBits += " failbit"; 479 } 480 if (readState & std::ios_base::badbit) 481 { 482 errorBits += " badbit"; 483 } 484 log<level::ERR>( 485 std::format( 486 "readOccState: Failed to read OCC{} state: Read error on I/O operation -{}", 487 instance, errorBits) 488 .c_str()); 489 lastOccReadStatus = -1; 490 } 491 goodFile = false; 492 } 493 494 if (goodFile && (state != lastState)) 495 { 496 // Trace OCC state changes 497 log<level::INFO>( 498 std::format( 499 "Status::readOccState: OCC{} state 0x{:02X} (lastState: 0x{:02X})", 500 instance, state, lastState) 501 .c_str()); 502 lastState = state; 503 #ifdef POWER10 504 if (OccState(state) == OccState::ACTIVE) 505 { 506 if (pmode && device.master()) 507 { 508 // Set the master OCC on the PowerMode object 509 pmode->setMasterOcc(path); 510 // Enable mode changes 511 pmode->setMasterActive(); 512 513 // Special processing by master OCC when it goes active 514 occsWentActive(); 515 } 516 517 CmdStatus status = sendAmbient(); 518 if (status != CmdStatus::SUCCESS) 519 { 520 log<level::ERR>( 521 std::format( 522 "readOccState: Sending Ambient failed with status {}", 523 status) 524 .c_str()); 525 } 526 } 527 528 // If OCC in known Good State. 529 if ((OccState(state) == OccState::ACTIVE) || 530 (OccState(state) == OccState::CHARACTERIZATION) || 531 (OccState(state) == OccState::OBSERVATION)) 532 { 533 // Good OCC State then sensors valid again 534 stateValid = true; 535 536 if (safeStateDelayTimer.isEnabled()) 537 { 538 // stop safe delay timer (no longer in SAFE state) 539 safeStateDelayTimer.setEnabled(false); 540 } 541 } 542 else 543 { 544 // OCC is in SAFE or some other unsupported state 545 if (!safeStateDelayTimer.isEnabled()) 546 { 547 log<level::ERR>( 548 std::format( 549 "readOccState: Invalid OCC{} state of {}, starting safe state delay timer", 550 instance, state) 551 .c_str()); 552 // start safe delay timer (before requesting reset) 553 using namespace std::literals::chrono_literals; 554 safeStateDelayTimer.restartOnce(60s); 555 } 556 // Not a supported state (update sensors to NaN and not 557 // functional) 558 stateValid = false; 559 } 560 #else 561 // Before P10 state not checked, only used good file open. 562 stateValid = true; 563 #endif 564 } 565 } 566 #ifdef POWER10 567 else 568 { 569 // Unable to read state 570 stateValid = false; 571 } 572 #endif 573 file.close(); 574 575 // if failed to Read a state or not a valid state -> Attempt retry 576 // after 1 Second delay if allowed. 577 if ((!goodFile) || (!stateValid)) 578 { 579 if (!goodFile) 580 { 581 // If not able to read, OCC may be offline 582 if (openErrno != lastOccReadStatus) 583 { 584 log<level::ERR>( 585 std::format( 586 "Status::readOccState: open/read failed trying to read OCC{} state (open errno={})", 587 instance, openErrno) 588 .c_str()); 589 lastOccReadStatus = openErrno; 590 } 591 } 592 else 593 { 594 // else this failed due to state not valid. 595 if (state != lastState) 596 { 597 log<level::ERR>( 598 std::format( 599 "Status::readOccState: OCC{} Invalid state 0x{:02X} (last state: 0x{:02X})", 600 instance, state, lastState) 601 .c_str()); 602 } 603 } 604 605 #ifdef READ_OCC_SENSORS 606 manager.setSensorValueToNaN(instance); 607 #endif 608 609 // See occReadRetries for number of retry attempts. 610 if (currentOccReadRetriesCount > 0) 611 { 612 --currentOccReadRetriesCount; 613 } 614 else 615 { 616 log<level::ERR>( 617 std::format("readOccState: failed to read OCC{} state!", 618 instance) 619 .c_str()); 620 621 // State could not be determined, set it to NO State. 622 lastState = 0; 623 624 // Disable the ability to send Failed actions until OCC is 625 // Active again. 626 stateValid = false; 627 628 // Disable due to OCC comm failure and reset to try recovering 629 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH)); 630 631 // Reset retry count (for next attempt after recovery) 632 currentOccReadRetriesCount = occReadRetries; 633 } 634 } 635 else 636 { 637 if (lastOccReadStatus != 0) 638 { 639 log<level::INFO>( 640 std::format( 641 "Status::readOccState: successfully read OCC{} state: {}", 642 instance, state) 643 .c_str()); 644 lastOccReadStatus = 0; // no error 645 } 646 } 647 } 648 649 // Update processor throttle status on dbus 650 void Status::updateThrottle(const bool isThrottled, const uint8_t newReason) 651 { 652 if (!throttleHandle) 653 { 654 return; 655 } 656 657 uint8_t newThrottleCause = throttleCause; 658 659 if (isThrottled) // throttled due to newReason 660 { 661 if ((newReason & throttleCause) == 0) 662 { 663 // set the bit(s) for passed in reason 664 newThrottleCause |= newReason; 665 } 666 // else no change 667 } 668 else // no longer throttled due to newReason 669 { 670 if ((newReason & throttleCause) != 0) 671 { 672 // clear the bit(s) for passed in reason 673 newThrottleCause &= ~newReason; 674 } 675 // else no change 676 } 677 678 if (newThrottleCause != throttleCause) 679 { 680 if (newThrottleCause == THROTTLED_NONE) 681 { 682 log<level::DEBUG>( 683 std::format( 684 "updateThrottle: OCC{} no longer throttled (prior reason: {})", 685 instance, throttleCause) 686 .c_str()); 687 throttleCause = THROTTLED_NONE; 688 throttleHandle->throttled(false); 689 throttleHandle->throttleCauses({}); 690 } 691 else 692 { 693 log<level::DEBUG>( 694 std::format( 695 "updateThrottle: OCC{} is throttled with reason {} (prior reason: {})", 696 instance, newThrottleCause, throttleCause) 697 .c_str()); 698 throttleCause = newThrottleCause; 699 700 std::vector<ThrottleObj::ThrottleReasons> updatedCauses; 701 if (throttleCause & THROTTLED_POWER) 702 { 703 updatedCauses.push_back( 704 throttleHandle->ThrottleReasons::PowerLimit); 705 } 706 if (throttleCause & THROTTLED_THERMAL) 707 { 708 updatedCauses.push_back( 709 throttleHandle->ThrottleReasons::ThermalLimit); 710 } 711 if (throttleCause & THROTTLED_SAFE) 712 { 713 updatedCauses.push_back( 714 throttleHandle->ThrottleReasons::ManagementDetectedFault); 715 } 716 throttleHandle->throttleCauses(updatedCauses); 717 throttleHandle->throttled(true); 718 } 719 } 720 // else no change to throttle status 721 } 722 723 // Get processor path associated with this OCC 724 void Status::readProcAssociation() 725 { 726 std::string managingPath = path + "/power_managing"; 727 log<level::DEBUG>( 728 std::format("readProcAssociation: getting endpoints for {} ({})", 729 managingPath, path) 730 .c_str()); 731 try 732 { 733 utils::PropertyValue procPathProperty{}; 734 procPathProperty = utils::getProperty( 735 managingPath, "xyz.openbmc_project.Association", "endpoints"); 736 auto result = std::get<std::vector<std::string>>(procPathProperty); 737 if (result.size() > 0) 738 { 739 procPath = result[0]; 740 log<level::INFO>( 741 std::format("readProcAssociation: OCC{} has proc={}", instance, 742 procPath.c_str()) 743 .c_str()); 744 } 745 else 746 { 747 log<level::ERR>( 748 std::format( 749 "readProcAssociation: No processor associated with OCC{} / {}", 750 instance, path) 751 .c_str()); 752 } 753 } 754 catch (const sdbusplus::exception_t& e) 755 { 756 log<level::ERR>( 757 std::format( 758 "readProcAssociation: Unable to get proc assocated with {} - {}", 759 path, e.what()) 760 .c_str()); 761 procPath = {}; 762 } 763 } 764 765 } // namespace occ 766 } // namespace open_power 767