1 #include "occ_status.hpp" 2 3 #include "occ_manager.hpp" 4 #include "occ_sensor.hpp" 5 #include "powermode.hpp" 6 #include "utils.hpp" 7 8 #include <phosphor-logging/log.hpp> 9 10 #include <filesystem> 11 #include <format> 12 13 namespace open_power 14 { 15 namespace occ 16 { 17 18 using namespace phosphor::logging; 19 20 using ThrottleObj = 21 sdbusplus::xyz::openbmc_project::Control::Power::server::Throttle; 22 23 // Handles updates to occActive property 24 bool Status::occActive(bool value) 25 { 26 if (value != this->occActive()) 27 { 28 log<level::INFO>(std::format("Status::occActive OCC{} changed to {}", 29 instance, value) 30 .c_str()); 31 if (value) 32 { 33 // Clear prior throttle reason (before setting device active) 34 updateThrottle(false, THROTTLED_ALL); 35 36 // Set the device active 37 device.setActive(true); 38 39 // Update the OCC active sensor 40 Base::Status::occActive(value); 41 42 // Start watching for errors 43 addErrorWatch(); 44 45 // Reset last OCC state 46 lastState = 0; 47 48 if (device.master()) 49 { 50 // Update powercap bounds from OCC 51 manager.updatePcapBounds(); 52 } 53 54 // Call into Manager to let know that we have bound 55 if (this->managerCallBack) 56 { 57 this->managerCallBack(instance, value); 58 } 59 } 60 else 61 { 62 #ifdef POWER10 63 if (pmode && device.master()) 64 { 65 // Prevent mode changes 66 pmode->setMasterActive(false); 67 } 68 if (safeStateDelayTimer.isEnabled()) 69 { 70 // stop safe delay timer 71 safeStateDelayTimer.setEnabled(false); 72 } 73 #endif 74 // Call into Manager to let know that we will unbind. 75 if (this->managerCallBack) 76 { 77 this->managerCallBack(instance, value); 78 } 79 80 // Stop watching for errors 81 removeErrorWatch(); 82 83 // Set the device inactive 84 device.setActive(false); 85 86 // Clear throttles (OCC not active after disabling device) 87 updateThrottle(false, THROTTLED_ALL); 88 } 89 } 90 else if (value && !device.active()) 91 { 92 // Existing error watch is on a dead file descriptor. 93 removeErrorWatch(); 94 95 /* 96 * In it's constructor, Status checks Device::bound() to see if OCC is 97 * active or not. 98 * Device::bound() checks for occX-dev0 directory. 99 * We will lose occX-dev0 directories during FSI rescan. 100 * So, if we start this application (and construct Status), and then 101 * later do FSI rescan, we will end up with occActive = true and device 102 * NOT bound. Lets correct that situation here. 103 */ 104 device.setActive(true); 105 106 // Add error watch again 107 addErrorWatch(); 108 } 109 else if (!value && device.active()) 110 { 111 removeErrorWatch(); 112 113 // In the event that the application never receives the active signal 114 // even though the OCC is active (this can occur if the BMC is rebooted 115 // with the host on, since the initial OCC driver probe will discover 116 // the OCCs), this application needs to be able to unbind the device 117 // when we get the OCC inactive signal. 118 device.setActive(false); 119 } 120 return Base::Status::occActive(value); 121 } 122 123 // Callback handler when a device error is reported. 124 void Status::deviceError(Error::Descriptor d) 125 { 126 #ifdef POWER10 127 if (pmode && device.master()) 128 { 129 // Prevent mode changes 130 pmode->setMasterActive(false); 131 } 132 #endif 133 134 if (d.log) 135 { 136 FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout); 137 } 138 139 // This would deem OCC inactive 140 this->occActive(false); 141 142 // Reset the OCC 143 this->resetOCC(); 144 } 145 146 // Sends message to host control command handler to reset OCC 147 void Status::resetOCC() 148 { 149 log<level::INFO>( 150 std::format(">>Status::resetOCC() - requesting reset for OCC{}", 151 instance) 152 .c_str()); 153 #ifdef PLDM 154 if (resetCallBack) 155 { 156 this->resetCallBack(instance); 157 } 158 #else 159 constexpr auto CONTROL_HOST_PATH = "/org/open_power/control/host0"; 160 constexpr auto CONTROL_HOST_INTF = "org.open_power.Control.Host"; 161 162 // This will throw exception on failure 163 auto service = utils::getService(CONTROL_HOST_PATH, CONTROL_HOST_INTF); 164 165 auto& bus = utils::getBus(); 166 auto method = bus.new_method_call(service.c_str(), CONTROL_HOST_PATH, 167 CONTROL_HOST_INTF, "Execute"); 168 // OCC Reset control command 169 method.append(convertForMessage(Control::Host::Command::OCCReset).c_str()); 170 171 // OCC Sensor ID for callout reasons 172 method.append(std::variant<uint8_t>(std::get<0>(sensorMap.at(instance)))); 173 bus.call_noreply(method); 174 return; 175 #endif 176 } 177 178 // Handler called by Host control command handler to convey the 179 // status of the executed command 180 void Status::hostControlEvent(sdbusplus::message_t& msg) 181 { 182 std::string cmdCompleted{}; 183 std::string cmdStatus{}; 184 185 msg.read(cmdCompleted, cmdStatus); 186 187 log<level::DEBUG>("Host control signal values", 188 entry("COMMAND=%s", cmdCompleted.c_str()), 189 entry("STATUS=%s", cmdStatus.c_str())); 190 191 if (Control::Host::convertResultFromString(cmdStatus) != 192 Control::Host::Result::Success) 193 { 194 if (Control::Host::convertCommandFromString(cmdCompleted) == 195 Control::Host::Command::OCCReset) 196 { 197 // Must be a Timeout. Log an Error trace 198 log<level::ERR>( 199 "Error resetting the OCC.", entry("PATH=%s", path.c_str()), 200 entry("SENSORID=0x%X", std::get<0>(sensorMap.at(instance)))); 201 } 202 } 203 return; 204 } 205 206 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC. 207 void Status::readOccState() 208 { 209 currentOccReadRetriesCount = occReadRetries; 210 occReadStateNow(); 211 } 212 213 #ifdef POWER10 214 // Special processing that needs to happen once the OCCs change to ACTIVE state 215 void Status::occsWentActive() 216 { 217 CmdStatus status = CmdStatus::SUCCESS; 218 219 status = pmode->sendModeChange(); 220 if (status != CmdStatus::SUCCESS) 221 { 222 log<level::ERR>( 223 std::format( 224 "Status::occsWentActive: OCC mode change failed with status {}", 225 status) 226 .c_str()); 227 228 // Disable and reset to try recovering 229 deviceError(); 230 } 231 232 status = pmode->sendIpsData(); 233 if (status != CmdStatus::SUCCESS) 234 { 235 log<level::ERR>( 236 std::format( 237 "Status::occsWentActive: Sending Idle Power Save Config data failed with status {}", 238 status) 239 .c_str()); 240 241 if (status == CmdStatus::COMM_FAILURE) 242 { 243 // Disable and reset to try recovering 244 deviceError(); 245 } 246 } 247 } 248 249 // Send Ambient and Altitude to the OCC 250 CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude) 251 { 252 CmdStatus status = CmdStatus::FAILURE; 253 bool ambientValid = true; 254 uint8_t ambientTemp = inTemp; 255 uint16_t altitude = inAltitude; 256 257 if (ambientTemp == 0xFF) 258 { 259 // Get latest readings from manager 260 manager.getAmbientData(ambientValid, ambientTemp, altitude); 261 log<level::DEBUG>( 262 std::format("sendAmbient: valid: {}, Ambient: {}C, altitude: {}m", 263 ambientValid, ambientTemp, altitude) 264 .c_str()); 265 } 266 267 std::vector<std::uint8_t> cmd, rsp; 268 cmd.reserve(11); 269 cmd.push_back(uint8_t(CmdType::SEND_AMBIENT)); 270 cmd.push_back(0x00); // Data Length (2 bytes) 271 cmd.push_back(0x08); // 272 cmd.push_back(0x00); // Version 273 cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status 274 cmd.push_back(ambientTemp); // Ambient Temperature 275 cmd.push_back(altitude >> 8); // Altitude in meters (2 bytes) 276 cmd.push_back(altitude & 0xFF); // 277 cmd.push_back(0x00); // Reserved (3 bytes) 278 cmd.push_back(0x00); 279 cmd.push_back(0x00); 280 log<level::DEBUG>(std::format("sendAmbient: SEND_AMBIENT " 281 "command to OCC{} ({} bytes)", 282 instance, cmd.size()) 283 .c_str()); 284 status = occCmd.send(cmd, rsp); 285 if (status == CmdStatus::SUCCESS) 286 { 287 if (rsp.size() == 5) 288 { 289 if (RspStatus::SUCCESS != RspStatus(rsp[2])) 290 { 291 log<level::ERR>( 292 std::format( 293 "sendAmbient: SEND_AMBIENT failed with rspStatus 0x{:02X}", 294 rsp[2]) 295 .c_str()); 296 dump_hex(rsp); 297 status = CmdStatus::FAILURE; 298 } 299 } 300 else 301 { 302 log<level::ERR>( 303 std::format( 304 "sendAmbient: INVALID SEND_AMBIENT response length:{}", 305 rsp.size()) 306 .c_str()); 307 dump_hex(rsp); 308 status = CmdStatus::FAILURE; 309 } 310 } 311 else 312 { 313 log<level::ERR>( 314 std::format( 315 "sendAmbient: SEND_AMBIENT FAILED! with status 0x{:02X}", 316 status) 317 .c_str()); 318 319 if (status == CmdStatus::COMM_FAILURE) 320 { 321 // Disable and reset to try recovering 322 deviceError(); 323 } 324 } 325 326 return status; 327 } 328 329 // Called when safe timer expires to determine if OCCs need to be reset 330 void Status::safeStateDelayExpired() 331 { 332 if (this->occActive()) 333 { 334 log<level::INFO>( 335 std::format( 336 "safeStateDelayExpired: OCC{} is in SAFE state, requesting reset", 337 instance) 338 .c_str()); 339 // Disable and reset to try recovering 340 deviceError(Error::Descriptor(SAFE_ERROR_PATH)); 341 } 342 } 343 #endif // POWER10 344 345 fs::path Status::getHwmonPath() 346 { 347 using namespace std::literals::string_literals; 348 349 if (!fs::exists(hwmonPath)) 350 { 351 static bool tracedFail[8] = {0}; 352 353 if (!hwmonPath.empty()) 354 { 355 log<level::ERR>( 356 std::format("Status::getHwmonPath(): path no longer exists: {}", 357 hwmonPath.c_str()) 358 .c_str()); 359 hwmonPath.clear(); 360 } 361 362 // Build the base HWMON path 363 fs::path prefixPath = 364 fs::path{OCC_HWMON_PATH + "occ-hwmon."s + 365 std::to_string(instance + 1) + "/hwmon/"s}; 366 367 // Get the hwmonXX directory name 368 try 369 { 370 // there should only be one directory 371 const int numDirs = std::distance( 372 fs::directory_iterator(prefixPath), fs::directory_iterator{}); 373 if (numDirs == 1) 374 { 375 hwmonPath = *fs::directory_iterator(prefixPath); 376 tracedFail[instance] = false; 377 } 378 else 379 { 380 if (!tracedFail[instance]) 381 { 382 log<level::ERR>( 383 std::format( 384 "Status::getHwmonPath(): Found multiple ({}) hwmon paths!", 385 numDirs) 386 .c_str()); 387 tracedFail[instance] = true; 388 } 389 } 390 } 391 catch (const fs::filesystem_error& e) 392 { 393 if (!tracedFail[instance]) 394 { 395 log<level::ERR>( 396 std::format( 397 "Status::getHwmonPath(): error accessing {}: {}", 398 prefixPath.c_str(), e.what()) 399 .c_str()); 400 tracedFail[instance] = true; 401 } 402 } 403 } 404 405 return hwmonPath; 406 } 407 408 // Called to read state and upon failure to read after occReadStateFailTimer. 409 void Status::occReadStateNow() 410 { 411 unsigned int state; 412 const fs::path filename = 413 fs::path(DEV_PATH) / 414 fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state"; 415 416 std::ifstream file; 417 bool goodFile = false; 418 419 // open file. 420 file.open(filename, std::ios::in); 421 const int openErrno = errno; 422 423 // File is open and state can be used. 424 if (file.is_open() && file.good()) 425 { 426 goodFile = true; 427 file >> state; 428 429 if (state != lastState) 430 { 431 // Trace OCC state changes 432 log<level::INFO>( 433 std::format( 434 "Status::readOccState: OCC{} state 0x{:02X} (lastState: 0x{:02X})", 435 instance, state, lastState) 436 .c_str()); 437 lastState = state; 438 #ifdef POWER10 439 if (OccState(state) == OccState::ACTIVE) 440 { 441 if (pmode && device.master()) 442 { 443 // Set the master OCC on the PowerMode object 444 pmode->setMasterOcc(path); 445 // Enable mode changes 446 pmode->setMasterActive(); 447 448 // Special processing by master OCC when it goes active 449 occsWentActive(); 450 } 451 452 CmdStatus status = sendAmbient(); 453 if (status != CmdStatus::SUCCESS) 454 { 455 log<level::ERR>( 456 std::format( 457 "readOccState: Sending Ambient failed with status {}", 458 status) 459 .c_str()); 460 } 461 } 462 463 // If OCC in known Good State. 464 if ((OccState(state) == OccState::ACTIVE) || 465 (OccState(state) == OccState::CHARACTERIZATION) || 466 (OccState(state) == OccState::OBSERVATION)) 467 { 468 // Good OCC State then sensors valid again 469 stateValid = true; 470 471 if (safeStateDelayTimer.isEnabled()) 472 { 473 // stop safe delay timer (no longer in SAFE state) 474 safeStateDelayTimer.setEnabled(false); 475 } 476 } 477 // Else not Valid state We would be in SAFE mode. 478 // This captures both SAFE mode, and 0x00, or other invalid 479 // state values. 480 else 481 { 482 if (!safeStateDelayTimer.isEnabled()) 483 { 484 // start safe delay timer (before requesting reset) 485 using namespace std::literals::chrono_literals; 486 safeStateDelayTimer.restartOnce(60s); 487 } 488 // Not valid state, update sensors to Nan & not functional. 489 stateValid = false; 490 } 491 #else 492 // Before P10 state not checked, only used good file open. 493 stateValid = true; 494 #endif 495 } 496 } 497 file.close(); 498 499 // if failed to Read a state or not a valid state -> Attempt retry 500 // after 1 Second delay if allowed. 501 if ((!goodFile) || (!stateValid)) 502 { 503 if (!goodFile) 504 { 505 // If not able to read, OCC may be offline 506 log<level::ERR>( 507 std::format("Status::readOccState: open failed (errno={})", 508 openErrno) 509 .c_str()); 510 } 511 else 512 { 513 // else this failed due to state not valid. 514 if (state != lastState) 515 { 516 log<level::ERR>( 517 std::format( 518 "Status::readOccState: OCC{} Invalid state 0x{:02X} (last state: 0x{:02X})", 519 instance, state, lastState) 520 .c_str()); 521 } 522 } 523 524 #ifdef READ_OCC_SENSORS 525 manager.setSensorValueToNaN(instance); 526 #endif 527 528 // See occReadRetries for number of retry attempts. 529 if (currentOccReadRetriesCount > 0) 530 { 531 --currentOccReadRetriesCount; 532 #ifdef POWER10 533 using namespace std::chrono_literals; 534 occReadStateFailTimer.restartOnce(1s); 535 #endif 536 } 537 else 538 { 539 #ifdef POWER10 540 if (!stateValid && occActive()) 541 { 542 if (!safeStateDelayTimer.isEnabled()) 543 { 544 log<level::ERR>( 545 "Starting 60 sec delay timer before requesting a reset"); 546 // start safe delay timer (before requesting reset) 547 using namespace std::literals::chrono_literals; 548 safeStateDelayTimer.restartOnce(60s); 549 } 550 } 551 #else 552 // State could not be determined, set it to NO State. 553 lastState = 0; 554 555 // Disable the ability to send Failed actions until OCC is 556 // Active again. 557 stateValid = false; 558 559 // Disable and reset to try recovering 560 deviceError(); 561 #endif 562 } 563 } 564 } 565 566 // Update processor throttle status on dbus 567 void Status::updateThrottle(const bool isThrottled, const uint8_t newReason) 568 { 569 if (!throttleHandle) 570 { 571 return; 572 } 573 574 uint8_t newThrottleCause = throttleCause; 575 576 if (isThrottled) // throttled due to newReason 577 { 578 if ((newReason & throttleCause) == 0) 579 { 580 // set the bit(s) for passed in reason 581 newThrottleCause |= newReason; 582 } 583 // else no change 584 } 585 else // no longer throttled due to newReason 586 { 587 if ((newReason & throttleCause) != 0) 588 { 589 // clear the bit(s) for passed in reason 590 newThrottleCause &= ~newReason; 591 } 592 // else no change 593 } 594 595 if (newThrottleCause != throttleCause) 596 { 597 if (newThrottleCause == THROTTLED_NONE) 598 { 599 log<level::DEBUG>( 600 std::format( 601 "updateThrottle: OCC{} no longer throttled (prior reason: {})", 602 instance, throttleCause) 603 .c_str()); 604 throttleCause = THROTTLED_NONE; 605 throttleHandle->throttled(false); 606 throttleHandle->throttleCauses({}); 607 } 608 else 609 { 610 log<level::DEBUG>( 611 std::format( 612 "updateThrottle: OCC{} is throttled with reason {} (prior reason: {})", 613 instance, newThrottleCause, throttleCause) 614 .c_str()); 615 throttleCause = newThrottleCause; 616 617 std::vector<ThrottleObj::ThrottleReasons> updatedCauses; 618 if (throttleCause & THROTTLED_POWER) 619 { 620 updatedCauses.push_back( 621 throttleHandle->ThrottleReasons::PowerLimit); 622 } 623 if (throttleCause & THROTTLED_THERMAL) 624 { 625 updatedCauses.push_back( 626 throttleHandle->ThrottleReasons::ThermalLimit); 627 } 628 if (throttleCause & THROTTLED_SAFE) 629 { 630 updatedCauses.push_back( 631 throttleHandle->ThrottleReasons::ManagementDetectedFault); 632 } 633 throttleHandle->throttleCauses(updatedCauses); 634 throttleHandle->throttled(true); 635 } 636 } 637 // else no change to throttle status 638 } 639 640 // Get processor path associated with this OCC 641 void Status::readProcAssociation() 642 { 643 std::string managingPath = path + "/power_managing"; 644 log<level::DEBUG>( 645 std::format("readProcAssociation: getting endpoints for {} ({})", 646 managingPath, path) 647 .c_str()); 648 try 649 { 650 utils::PropertyValue procPathProperty{}; 651 procPathProperty = utils::getProperty( 652 managingPath, "xyz.openbmc_project.Association", "endpoints"); 653 auto result = std::get<std::vector<std::string>>(procPathProperty); 654 if (result.size() > 0) 655 { 656 procPath = result[0]; 657 log<level::INFO>( 658 std::format("readProcAssociation: OCC{} has proc={}", instance, 659 procPath.c_str()) 660 .c_str()); 661 } 662 else 663 { 664 log<level::ERR>( 665 std::format( 666 "readProcAssociation: No processor associated with OCC{} / {}", 667 instance, path) 668 .c_str()); 669 } 670 } 671 catch (const sdbusplus::exception_t& e) 672 { 673 log<level::ERR>( 674 std::format( 675 "readProcAssociation: Unable to get proc assocated with {} - {}", 676 path, e.what()) 677 .c_str()); 678 procPath = {}; 679 } 680 } 681 682 } // namespace occ 683 } // namespace open_power 684