1 #include "occ_status.hpp" 2 3 #include "occ_manager.hpp" 4 #include "occ_sensor.hpp" 5 #include "powermode.hpp" 6 #include "utils.hpp" 7 8 #include <fmt/core.h> 9 10 #include <phosphor-logging/log.hpp> 11 12 #include <filesystem> 13 14 namespace open_power 15 { 16 namespace occ 17 { 18 19 using namespace phosphor::logging; 20 21 using ThrottleObj = 22 sdbusplus::xyz::openbmc_project::Control::Power::server::Throttle; 23 24 // Handles updates to occActive property 25 bool Status::occActive(bool value) 26 { 27 if (value != this->occActive()) 28 { 29 log<level::INFO>(fmt::format("Status::occActive OCC{} changed to {}", 30 instance, value) 31 .c_str()); 32 if (value) 33 { 34 // Clear prior throttle reason (before setting device active) 35 updateThrottle(false, THROTTLED_ALL); 36 37 // Set the device active 38 device.setActive(true); 39 40 // Update the OCC active sensor 41 Base::Status::occActive(value); 42 43 // Start watching for errors 44 addErrorWatch(); 45 46 // Reset last OCC state 47 lastState = 0; 48 49 if (device.master()) 50 { 51 // Update powercap bounds from OCC 52 manager.updatePcapBounds(); 53 } 54 55 // Call into Manager to let know that we have bound 56 if (this->managerCallBack) 57 { 58 this->managerCallBack(instance, value); 59 } 60 } 61 else 62 { 63 #ifdef POWER10 64 if (pmode && device.master()) 65 { 66 // Prevent mode changes 67 pmode->setMasterActive(false); 68 } 69 if (safeStateDelayTimer.isEnabled()) 70 { 71 // stop safe delay timer 72 safeStateDelayTimer.setEnabled(false); 73 } 74 #endif 75 // Call into Manager to let know that we will unbind. 76 if (this->managerCallBack) 77 { 78 this->managerCallBack(instance, value); 79 } 80 81 // Stop watching for errors 82 removeErrorWatch(); 83 84 // Set the device inactive 85 device.setActive(false); 86 87 // Clear throttles (OCC not active after disabling device) 88 updateThrottle(false, THROTTLED_ALL); 89 } 90 } 91 else if (value && !device.active()) 92 { 93 // Existing error watch is on a dead file descriptor. 94 removeErrorWatch(); 95 96 /* 97 * In it's constructor, Status checks Device::bound() to see if OCC is 98 * active or not. 99 * Device::bound() checks for occX-dev0 directory. 100 * We will lose occX-dev0 directories during FSI rescan. 101 * So, if we start this application (and construct Status), and then 102 * later do FSI rescan, we will end up with occActive = true and device 103 * NOT bound. Lets correct that situation here. 104 */ 105 device.setActive(true); 106 107 // Add error watch again 108 addErrorWatch(); 109 } 110 else if (!value && device.active()) 111 { 112 removeErrorWatch(); 113 114 // In the event that the application never receives the active signal 115 // even though the OCC is active (this can occur if the BMC is rebooted 116 // with the host on, since the initial OCC driver probe will discover 117 // the OCCs), this application needs to be able to unbind the device 118 // when we get the OCC inactive signal. 119 device.setActive(false); 120 } 121 return Base::Status::occActive(value); 122 } 123 124 // Callback handler when a device error is reported. 125 void Status::deviceError(Error::Descriptor d) 126 { 127 #ifdef POWER10 128 if (pmode && device.master()) 129 { 130 // Prevent mode changes 131 pmode->setMasterActive(false); 132 } 133 #endif 134 135 if (d.log) 136 { 137 FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout); 138 } 139 140 // This would deem OCC inactive 141 this->occActive(false); 142 143 // Reset the OCC 144 this->resetOCC(); 145 } 146 147 // Sends message to host control command handler to reset OCC 148 void Status::resetOCC() 149 { 150 log<level::INFO>( 151 fmt::format(">>Status::resetOCC() - requesting reset for OCC{}", 152 instance) 153 .c_str()); 154 #ifdef PLDM 155 if (resetCallBack) 156 { 157 this->resetCallBack(instance); 158 } 159 #else 160 constexpr auto CONTROL_HOST_PATH = "/org/open_power/control/host0"; 161 constexpr auto CONTROL_HOST_INTF = "org.open_power.Control.Host"; 162 163 // This will throw exception on failure 164 auto service = utils::getService(CONTROL_HOST_PATH, CONTROL_HOST_INTF); 165 166 auto& bus = utils::getBus(); 167 auto method = bus.new_method_call(service.c_str(), CONTROL_HOST_PATH, 168 CONTROL_HOST_INTF, "Execute"); 169 // OCC Reset control command 170 method.append(convertForMessage(Control::Host::Command::OCCReset).c_str()); 171 172 // OCC Sensor ID for callout reasons 173 method.append(std::variant<uint8_t>(std::get<0>(sensorMap.at(instance)))); 174 bus.call_noreply(method); 175 return; 176 #endif 177 } 178 179 // Handler called by Host control command handler to convey the 180 // status of the executed command 181 void Status::hostControlEvent(sdbusplus::message_t& msg) 182 { 183 std::string cmdCompleted{}; 184 std::string cmdStatus{}; 185 186 msg.read(cmdCompleted, cmdStatus); 187 188 log<level::DEBUG>("Host control signal values", 189 entry("COMMAND=%s", cmdCompleted.c_str()), 190 entry("STATUS=%s", cmdStatus.c_str())); 191 192 if (Control::Host::convertResultFromString(cmdStatus) != 193 Control::Host::Result::Success) 194 { 195 if (Control::Host::convertCommandFromString(cmdCompleted) == 196 Control::Host::Command::OCCReset) 197 { 198 // Must be a Timeout. Log an Error trace 199 log<level::ERR>( 200 "Error resetting the OCC.", entry("PATH=%s", path.c_str()), 201 entry("SENSORID=0x%X", std::get<0>(sensorMap.at(instance)))); 202 } 203 } 204 return; 205 } 206 207 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC. 208 void Status::readOccState() 209 { 210 currentOccReadRetriesCount = occReadRetries; 211 occReadStateNow(); 212 } 213 214 #ifdef POWER10 215 // Special processing that needs to happen once the OCCs change to ACTIVE state 216 void Status::occsWentActive() 217 { 218 CmdStatus status = CmdStatus::SUCCESS; 219 220 status = pmode->sendModeChange(); 221 if (status != CmdStatus::SUCCESS) 222 { 223 log<level::ERR>( 224 fmt::format( 225 "Status::occsWentActive: OCC mode change failed with status {}", 226 status) 227 .c_str()); 228 229 // Disable and reset to try recovering 230 deviceError(); 231 } 232 233 status = pmode->sendIpsData(); 234 if (status != CmdStatus::SUCCESS) 235 { 236 log<level::ERR>( 237 fmt::format( 238 "Status::occsWentActive: Sending Idle Power Save Config data failed with status {}", 239 status) 240 .c_str()); 241 242 if (status == CmdStatus::COMM_FAILURE) 243 { 244 // Disable and reset to try recovering 245 deviceError(); 246 } 247 } 248 } 249 250 // Send Ambient and Altitude to the OCC 251 CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude) 252 { 253 CmdStatus status = CmdStatus::FAILURE; 254 bool ambientValid = true; 255 uint8_t ambientTemp = inTemp; 256 uint16_t altitude = inAltitude; 257 258 if (ambientTemp == 0xFF) 259 { 260 // Get latest readings from manager 261 manager.getAmbientData(ambientValid, ambientTemp, altitude); 262 log<level::DEBUG>( 263 fmt::format("sendAmbient: valid: {}, Ambient: {}C, altitude: {}m", 264 ambientValid, ambientTemp, altitude) 265 .c_str()); 266 } 267 268 std::vector<std::uint8_t> cmd, rsp; 269 cmd.reserve(11); 270 cmd.push_back(uint8_t(CmdType::SEND_AMBIENT)); 271 cmd.push_back(0x00); // Data Length (2 bytes) 272 cmd.push_back(0x08); // 273 cmd.push_back(0x00); // Version 274 cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status 275 cmd.push_back(ambientTemp); // Ambient Temperature 276 cmd.push_back(altitude >> 8); // Altitude in meters (2 bytes) 277 cmd.push_back(altitude & 0xFF); // 278 cmd.push_back(0x00); // Reserved (3 bytes) 279 cmd.push_back(0x00); 280 cmd.push_back(0x00); 281 log<level::DEBUG>(fmt::format("sendAmbient: SEND_AMBIENT " 282 "command to OCC{} ({} bytes)", 283 instance, cmd.size()) 284 .c_str()); 285 status = occCmd.send(cmd, rsp); 286 if (status == CmdStatus::SUCCESS) 287 { 288 if (rsp.size() == 5) 289 { 290 if (RspStatus::SUCCESS != RspStatus(rsp[2])) 291 { 292 log<level::ERR>( 293 fmt::format( 294 "sendAmbient: SEND_AMBIENT failed with rspStatus 0x{:02X}", 295 rsp[2]) 296 .c_str()); 297 dump_hex(rsp); 298 status = CmdStatus::FAILURE; 299 } 300 } 301 else 302 { 303 log<level::ERR>( 304 fmt::format( 305 "sendAmbient: INVALID SEND_AMBIENT response length:{}", 306 rsp.size()) 307 .c_str()); 308 dump_hex(rsp); 309 status = CmdStatus::FAILURE; 310 } 311 } 312 else 313 { 314 log<level::ERR>( 315 fmt::format( 316 "sendAmbient: SEND_AMBIENT FAILED! with status 0x{:02X}", 317 status) 318 .c_str()); 319 320 if (status == CmdStatus::COMM_FAILURE) 321 { 322 // Disable and reset to try recovering 323 deviceError(); 324 } 325 } 326 327 return status; 328 } 329 330 // Called when safe timer expires to determine if OCCs need to be reset 331 void Status::safeStateDelayExpired() 332 { 333 if (this->occActive()) 334 { 335 log<level::INFO>( 336 fmt::format( 337 "safeStateDelayExpired: OCC{} is in SAFE state, requesting reset", 338 instance) 339 .c_str()); 340 // Disable and reset to try recovering 341 deviceError(Error::Descriptor(SAFE_ERROR_PATH)); 342 } 343 } 344 #endif // POWER10 345 346 fs::path Status::getHwmonPath() 347 { 348 using namespace std::literals::string_literals; 349 350 if (!fs::exists(hwmonPath)) 351 { 352 static bool tracedFail[8] = {0}; 353 354 if (!hwmonPath.empty()) 355 { 356 log<level::ERR>( 357 fmt::format("Status::getHwmonPath(): path no longer exists: {}", 358 hwmonPath.c_str()) 359 .c_str()); 360 hwmonPath.clear(); 361 } 362 363 // Build the base HWMON path 364 fs::path prefixPath = fs::path{OCC_HWMON_PATH + "occ-hwmon."s + 365 std::to_string(instance + 1) + 366 "/hwmon/"s}; 367 368 // Get the hwmonXX directory name 369 try 370 { 371 // there should only be one directory 372 const int numDirs = std::distance( 373 fs::directory_iterator(prefixPath), fs::directory_iterator{}); 374 if (numDirs == 1) 375 { 376 hwmonPath = *fs::directory_iterator(prefixPath); 377 tracedFail[instance] = false; 378 } 379 else 380 { 381 if (!tracedFail[instance]) 382 { 383 log<level::ERR>( 384 fmt::format( 385 "Status::getHwmonPath(): Found multiple ({}) hwmon paths!", 386 numDirs) 387 .c_str()); 388 tracedFail[instance] = true; 389 } 390 } 391 } 392 catch (const fs::filesystem_error& e) 393 { 394 if (!tracedFail[instance]) 395 { 396 log<level::ERR>( 397 fmt::format( 398 "Status::getHwmonPath(): error accessing {}: {}", 399 prefixPath.c_str(), e.what()) 400 .c_str()); 401 tracedFail[instance] = true; 402 } 403 } 404 } 405 406 return hwmonPath; 407 } 408 409 // Called to read state and upon failure to read after occReadStateFailTimer. 410 void Status::occReadStateNow() 411 { 412 unsigned int state; 413 const fs::path filename = 414 fs::path(DEV_PATH) / 415 fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state"; 416 417 std::ifstream file; 418 bool goodFile = false; 419 420 // open file. 421 file.open(filename, std::ios::in); 422 const int openErrno = errno; 423 424 // File is open and state can be used. 425 if (file.is_open() && file.good()) 426 { 427 goodFile = true; 428 file >> state; 429 430 if (state != lastState) 431 { 432 // Trace OCC state changes 433 log<level::INFO>( 434 fmt::format( 435 "Status::readOccState: OCC{} state 0x{:02X} (lastState: 0x{:02X})", 436 instance, state, lastState) 437 .c_str()); 438 lastState = state; 439 #ifdef POWER10 440 if (OccState(state) == OccState::ACTIVE) 441 { 442 if (pmode && device.master()) 443 { 444 // Set the master OCC on the PowerMode object 445 pmode->setMasterOcc(path); 446 // Enable mode changes 447 pmode->setMasterActive(); 448 449 // Special processing by master OCC when it goes active 450 occsWentActive(); 451 } 452 453 CmdStatus status = sendAmbient(); 454 if (status != CmdStatus::SUCCESS) 455 { 456 log<level::ERR>( 457 fmt::format( 458 "readOccState: Sending Ambient failed with status {}", 459 status) 460 .c_str()); 461 } 462 } 463 464 // If OCC in known Good State. 465 if ((OccState(state) == OccState::ACTIVE) || 466 (OccState(state) == OccState::CHARACTERIZATION) || 467 (OccState(state) == OccState::OBSERVATION)) 468 { 469 // Good OCC State then sensors valid again 470 stateValid = true; 471 472 if (safeStateDelayTimer.isEnabled()) 473 { 474 // stop safe delay timer (no longer in SAFE state) 475 safeStateDelayTimer.setEnabled(false); 476 } 477 } 478 // Else not Valid state We would be in SAFE mode. 479 // This captures both SAFE mode, and 0x00, or other invalid 480 // state values. 481 else 482 { 483 if (!safeStateDelayTimer.isEnabled()) 484 { 485 // start safe delay timer (before requesting reset) 486 using namespace std::literals::chrono_literals; 487 safeStateDelayTimer.restartOnce(60s); 488 } 489 // Not valid state, update sensors to Nan & not functional. 490 stateValid = false; 491 } 492 #else 493 // Before P10 state not checked, only used good file open. 494 stateValid = true; 495 #endif 496 } 497 } 498 file.close(); 499 500 // if failed to Read a state or not a valid state -> Attempt retry 501 // after 1 Second delay if allowed. 502 if ((!goodFile) || (!stateValid)) 503 { 504 if (!goodFile) 505 { 506 // If not able to read, OCC may be offline 507 log<level::ERR>( 508 fmt::format("Status::readOccState: open failed (errno={})", 509 openErrno) 510 .c_str()); 511 } 512 else 513 { 514 // else this failed due to state not valid. 515 if (state != lastState) 516 { 517 log<level::ERR>( 518 fmt::format( 519 "Status::readOccState: OCC{} Invalid state 0x{:02X} (last state: 0x{:02X})", 520 instance, state, lastState) 521 .c_str()); 522 } 523 } 524 525 #ifdef READ_OCC_SENSORS 526 manager.setSensorValueToNaN(instance); 527 #endif 528 529 // See occReadRetries for number of retry attempts. 530 if (currentOccReadRetriesCount > 0) 531 { 532 --currentOccReadRetriesCount; 533 #ifdef POWER10 534 using namespace std::chrono_literals; 535 occReadStateFailTimer.restartOnce(1s); 536 #endif 537 } 538 else 539 { 540 #ifdef POWER10 541 if (!stateValid && occActive()) 542 { 543 if (!safeStateDelayTimer.isEnabled()) 544 { 545 log<level::ERR>( 546 "Starting 60 sec delay timer before requesting a reset"); 547 // start safe delay timer (before requesting reset) 548 using namespace std::literals::chrono_literals; 549 safeStateDelayTimer.restartOnce(60s); 550 } 551 } 552 #else 553 // State could not be determined, set it to NO State. 554 lastState = 0; 555 556 // Disable the ability to send Failed actions until OCC is 557 // Active again. 558 stateValid = false; 559 560 // Disable and reset to try recovering 561 deviceError(); 562 #endif 563 } 564 } 565 } 566 567 // Update processor throttle status on dbus 568 void Status::updateThrottle(const bool isThrottled, const uint8_t newReason) 569 { 570 if (!throttleHandle) 571 { 572 return; 573 } 574 575 uint8_t newThrottleCause = throttleCause; 576 577 if (isThrottled) // throttled due to newReason 578 { 579 if ((newReason & throttleCause) == 0) 580 { 581 // set the bit(s) for passed in reason 582 newThrottleCause |= newReason; 583 } 584 // else no change 585 } 586 else // no longer throttled due to newReason 587 { 588 if ((newReason & throttleCause) != 0) 589 { 590 // clear the bit(s) for passed in reason 591 newThrottleCause &= ~newReason; 592 } 593 // else no change 594 } 595 596 if (newThrottleCause != throttleCause) 597 { 598 if (newThrottleCause == THROTTLED_NONE) 599 { 600 log<level::DEBUG>( 601 fmt::format( 602 "updateThrottle: OCC{} no longer throttled (prior reason: {})", 603 instance, throttleCause) 604 .c_str()); 605 throttleCause = THROTTLED_NONE; 606 throttleHandle->throttled(false); 607 throttleHandle->throttleCauses({}); 608 } 609 else 610 { 611 log<level::DEBUG>( 612 fmt::format( 613 "updateThrottle: OCC{} is throttled with reason {} (prior reason: {})", 614 instance, newThrottleCause, throttleCause) 615 .c_str()); 616 throttleCause = newThrottleCause; 617 618 std::vector<ThrottleObj::ThrottleReasons> updatedCauses; 619 if (throttleCause & THROTTLED_POWER) 620 { 621 updatedCauses.push_back( 622 throttleHandle->ThrottleReasons::PowerLimit); 623 } 624 if (throttleCause & THROTTLED_THERMAL) 625 { 626 updatedCauses.push_back( 627 throttleHandle->ThrottleReasons::ThermalLimit); 628 } 629 if (throttleCause & THROTTLED_SAFE) 630 { 631 updatedCauses.push_back( 632 throttleHandle->ThrottleReasons::ManagementDetectedFault); 633 } 634 throttleHandle->throttleCauses(updatedCauses); 635 throttleHandle->throttled(true); 636 } 637 } 638 // else no change to throttle status 639 } 640 641 // Get processor path associated with this OCC 642 void Status::readProcAssociation() 643 { 644 std::string managingPath = path + "/power_managing"; 645 log<level::DEBUG>( 646 fmt::format("readProcAssociation: getting endpoints for {} ({})", 647 managingPath, path) 648 .c_str()); 649 try 650 { 651 utils::PropertyValue procPathProperty{}; 652 procPathProperty = utils::getProperty( 653 managingPath, "xyz.openbmc_project.Association", "endpoints"); 654 auto result = std::get<std::vector<std::string>>(procPathProperty); 655 if (result.size() > 0) 656 { 657 procPath = result[0]; 658 log<level::INFO>( 659 fmt::format("readProcAssociation: OCC{} has proc={}", instance, 660 procPath.c_str()) 661 .c_str()); 662 } 663 else 664 { 665 log<level::ERR>( 666 fmt::format( 667 "readProcAssociation: No processor associated with OCC{} / {}", 668 instance, path) 669 .c_str()); 670 } 671 } 672 catch (const sdbusplus::exception_t& e) 673 { 674 log<level::ERR>( 675 fmt::format( 676 "readProcAssociation: Unable to get proc assocated with {} - {}", 677 path, e.what()) 678 .c_str()); 679 procPath = {}; 680 } 681 } 682 683 } // namespace occ 684 } // namespace open_power 685