1 #include "occ_status.hpp" 2 3 #include "occ_manager.hpp" 4 #include "occ_sensor.hpp" 5 #include "powermode.hpp" 6 #include "utils.hpp" 7 8 #include <fmt/core.h> 9 10 #ifdef POWER10 11 #include <com/ibm/Host/Target/server.hpp> 12 #endif 13 #include <phosphor-logging/log.hpp> 14 15 #include <filesystem> 16 17 namespace open_power 18 { 19 namespace occ 20 { 21 22 using namespace phosphor::logging; 23 24 // Handles updates to occActive property 25 bool Status::occActive(bool value) 26 { 27 if (value != this->occActive()) 28 { 29 log<level::INFO>(fmt::format("Status::occActive OCC{} changed to {}", 30 instance, value) 31 .c_str()); 32 if (value) 33 { 34 // Set the device active 35 device.setActive(true); 36 37 // Update the OCC active sensor 38 Base::Status::occActive(value); 39 40 // Start watching for errors 41 addErrorWatch(); 42 43 // Reset last OCC state 44 lastState = 0; 45 46 if (device.master()) 47 { 48 // Update powercap bounds from OCC 49 manager.updatePcapBounds(); 50 } 51 52 // Call into Manager to let know that we have bound 53 if (this->managerCallBack) 54 { 55 this->managerCallBack(instance, value); 56 } 57 } 58 else 59 { 60 #ifdef POWER10 61 if (pmode && device.master()) 62 { 63 // Prevent mode changes 64 pmode->setMasterActive(false); 65 } 66 if (safeStateDelayTimer.isEnabled()) 67 { 68 // stop safe delay timer 69 safeStateDelayTimer.setEnabled(false); 70 } 71 #endif 72 // Call into Manager to let know that we will unbind. 73 if (this->managerCallBack) 74 { 75 this->managerCallBack(instance, value); 76 } 77 78 // Stop watching for errors 79 removeErrorWatch(); 80 81 // Set the device inactive 82 device.setActive(false); 83 } 84 } 85 else if (value && !device.active()) 86 { 87 // Existing error watch is on a dead file descriptor. 88 removeErrorWatch(); 89 90 /* 91 * In it's constructor, Status checks Device::bound() to see if OCC is 92 * active or not. 93 * Device::bound() checks for occX-dev0 directory. 94 * We will lose occX-dev0 directories during FSI rescan. 95 * So, if we start this application (and construct Status), and then 96 * later do FSI rescan, we will end up with occActive = true and device 97 * NOT bound. Lets correct that situation here. 98 */ 99 device.setActive(true); 100 101 // Add error watch again 102 addErrorWatch(); 103 } 104 else if (!value && device.active()) 105 { 106 removeErrorWatch(); 107 108 // In the event that the application never receives the active signal 109 // even though the OCC is active (this can occur if the BMC is rebooted 110 // with the host on, since the initial OCC driver probe will discover 111 // the OCCs), this application needs to be able to unbind the device 112 // when we get the OCC inactive signal. 113 device.setActive(false); 114 } 115 return Base::Status::occActive(value); 116 } 117 118 // Callback handler when a device error is reported. 119 void Status::deviceError(Error::Descriptor d) 120 { 121 #ifdef POWER10 122 if (pmode && device.master()) 123 { 124 // Prevent mode changes 125 pmode->setMasterActive(false); 126 } 127 #endif 128 129 if (d.log) 130 { 131 FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout); 132 } 133 134 // This would deem OCC inactive 135 this->occActive(false); 136 137 // Reset the OCC 138 this->resetOCC(); 139 } 140 141 // Sends message to host control command handler to reset OCC 142 void Status::resetOCC() 143 { 144 log<level::INFO>( 145 fmt::format(">>Status::resetOCC() - requesting reset for OCC{}", 146 instance) 147 .c_str()); 148 #ifdef PLDM 149 if (resetCallBack) 150 { 151 this->resetCallBack(instance); 152 } 153 #else 154 constexpr auto CONTROL_HOST_PATH = "/org/open_power/control/host0"; 155 constexpr auto CONTROL_HOST_INTF = "org.open_power.Control.Host"; 156 157 // This will throw exception on failure 158 auto service = utils::getService(CONTROL_HOST_PATH, CONTROL_HOST_INTF); 159 160 auto& bus = utils::getBus(); 161 auto method = bus.new_method_call(service.c_str(), CONTROL_HOST_PATH, 162 CONTROL_HOST_INTF, "Execute"); 163 // OCC Reset control command 164 method.append(convertForMessage(Control::Host::Command::OCCReset).c_str()); 165 166 // OCC Sensor ID for callout reasons 167 method.append(std::variant<uint8_t>(std::get<0>(sensorMap.at(instance)))); 168 bus.call_noreply(method); 169 return; 170 #endif 171 } 172 173 // Handler called by Host control command handler to convey the 174 // status of the executed command 175 void Status::hostControlEvent(sdbusplus::message_t& msg) 176 { 177 std::string cmdCompleted{}; 178 std::string cmdStatus{}; 179 180 msg.read(cmdCompleted, cmdStatus); 181 182 log<level::DEBUG>("Host control signal values", 183 entry("COMMAND=%s", cmdCompleted.c_str()), 184 entry("STATUS=%s", cmdStatus.c_str())); 185 186 if (Control::Host::convertResultFromString(cmdStatus) != 187 Control::Host::Result::Success) 188 { 189 if (Control::Host::convertCommandFromString(cmdCompleted) == 190 Control::Host::Command::OCCReset) 191 { 192 // Must be a Timeout. Log an Error trace 193 log<level::ERR>( 194 "Error resetting the OCC.", entry("PATH=%s", path.c_str()), 195 entry("SENSORID=0x%X", std::get<0>(sensorMap.at(instance)))); 196 } 197 } 198 return; 199 } 200 201 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC. 202 void Status::readOccState() 203 { 204 currentOccReadRetriesCount = occReadRetries; 205 occReadStateNow(); 206 } 207 208 #ifdef POWER10 209 // Special processing that needs to happen once the OCCs change to ACTIVE state 210 void Status::occsWentActive() 211 { 212 CmdStatus status = CmdStatus::SUCCESS; 213 214 status = pmode->sendModeChange(); 215 if (status != CmdStatus::SUCCESS) 216 { 217 log<level::ERR>( 218 fmt::format( 219 "Status::occsWentActive: OCC mode change failed with status {}", 220 status) 221 .c_str()); 222 223 // Disable and reset to try recovering 224 deviceError(); 225 } 226 227 status = pmode->sendIpsData(); 228 if (status != CmdStatus::SUCCESS) 229 { 230 log<level::ERR>( 231 fmt::format( 232 "Status::occsWentActive: Sending Idle Power Save Config data failed with status {}", 233 status) 234 .c_str()); 235 236 if (status == CmdStatus::COMM_FAILURE) 237 { 238 // Disable and reset to try recovering 239 deviceError(); 240 } 241 } 242 } 243 244 // Send Ambient and Altitude to the OCC 245 CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude) 246 { 247 CmdStatus status = CmdStatus::FAILURE; 248 bool ambientValid = true; 249 uint8_t ambientTemp = inTemp; 250 uint16_t altitude = inAltitude; 251 252 if (ambientTemp == 0xFF) 253 { 254 // Get latest readings from manager 255 manager.getAmbientData(ambientValid, ambientTemp, altitude); 256 log<level::DEBUG>( 257 fmt::format("sendAmbient: valid: {}, Ambient: {}C, altitude: {}m", 258 ambientValid, ambientTemp, altitude) 259 .c_str()); 260 } 261 262 std::vector<std::uint8_t> cmd, rsp; 263 cmd.reserve(11); 264 cmd.push_back(uint8_t(CmdType::SEND_AMBIENT)); 265 cmd.push_back(0x00); // Data Length (2 bytes) 266 cmd.push_back(0x08); // 267 cmd.push_back(0x00); // Version 268 cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status 269 cmd.push_back(ambientTemp); // Ambient Temperature 270 cmd.push_back(altitude >> 8); // Altitude in meters (2 bytes) 271 cmd.push_back(altitude & 0xFF); // 272 cmd.push_back(0x00); // Reserved (3 bytes) 273 cmd.push_back(0x00); 274 cmd.push_back(0x00); 275 log<level::DEBUG>(fmt::format("sendAmbient: SEND_AMBIENT " 276 "command to OCC{} ({} bytes)", 277 instance, cmd.size()) 278 .c_str()); 279 status = occCmd.send(cmd, rsp); 280 if (status == CmdStatus::SUCCESS) 281 { 282 if (rsp.size() == 5) 283 { 284 if (RspStatus::SUCCESS != RspStatus(rsp[2])) 285 { 286 log<level::ERR>( 287 fmt::format( 288 "sendAmbient: SEND_AMBIENT failed with rspStatus 0x{:02X}", 289 rsp[2]) 290 .c_str()); 291 dump_hex(rsp); 292 status = CmdStatus::FAILURE; 293 } 294 } 295 else 296 { 297 log<level::ERR>( 298 fmt::format( 299 "sendAmbient: INVALID SEND_AMBIENT response length:{}", 300 rsp.size()) 301 .c_str()); 302 dump_hex(rsp); 303 status = CmdStatus::FAILURE; 304 } 305 } 306 else 307 { 308 log<level::ERR>( 309 fmt::format( 310 "sendAmbient: SEND_AMBIENT FAILED! with status 0x{:02X}", 311 status) 312 .c_str()); 313 314 if (status == CmdStatus::COMM_FAILURE) 315 { 316 // Disable and reset to try recovering 317 deviceError(); 318 } 319 } 320 321 return status; 322 } 323 324 // Called when safe timer expires to determine if OCCs need to be reset 325 void Status::safeStateDelayExpired() 326 { 327 if (this->occActive()) 328 { 329 log<level::INFO>( 330 fmt::format( 331 "safeStateDelayExpired: OCC{} is in SAFE state, requesting reset", 332 instance) 333 .c_str()); 334 // Disable and reset to try recovering 335 deviceError(Error::Descriptor(SAFE_ERROR_PATH)); 336 } 337 } 338 #endif // POWER10 339 340 fs::path Status::getHwmonPath() 341 { 342 using namespace std::literals::string_literals; 343 344 if (!fs::exists(hwmonPath)) 345 { 346 static bool tracedFail[8] = {0}; 347 348 if (!hwmonPath.empty()) 349 { 350 log<level::ERR>( 351 fmt::format("Status::getHwmonPath(): path no longer exists: {}", 352 hwmonPath.c_str()) 353 .c_str()); 354 hwmonPath.clear(); 355 } 356 357 // Build the base HWMON path 358 fs::path prefixPath = 359 fs::path{OCC_HWMON_PATH + "occ-hwmon."s + 360 std::to_string(instance + 1) + "/hwmon/"s}; 361 362 // Get the hwmonXX directory name 363 try 364 { 365 // there should only be one directory 366 const int numDirs = std::distance( 367 fs::directory_iterator(prefixPath), fs::directory_iterator{}); 368 if (numDirs == 1) 369 { 370 hwmonPath = *fs::directory_iterator(prefixPath); 371 tracedFail[instance] = false; 372 } 373 else 374 { 375 if (!tracedFail[instance]) 376 { 377 log<level::ERR>( 378 fmt::format( 379 "Status::getHwmonPath(): Found multiple ({}) hwmon paths!", 380 numDirs) 381 .c_str()); 382 tracedFail[instance] = true; 383 } 384 } 385 } 386 catch (const fs::filesystem_error& e) 387 { 388 if (!tracedFail[instance]) 389 { 390 log<level::ERR>( 391 fmt::format( 392 "Status::getHwmonPath(): error accessing {}: {}", 393 prefixPath.c_str(), e.what()) 394 .c_str()); 395 tracedFail[instance] = true; 396 } 397 } 398 } 399 400 return hwmonPath; 401 } 402 403 // Called to read state and upon failure to read after occReadStateFailTimer. 404 void Status::occReadStateNow() 405 { 406 unsigned int state; 407 const fs::path filename = 408 fs::path(DEV_PATH) / 409 fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state"; 410 411 std::ifstream file; 412 bool goodFile = false; 413 414 // open file. 415 file.open(filename, std::ios::in); 416 const int openErrno = errno; 417 418 // File is open and state can be used. 419 if (file.is_open() && file.good()) 420 { 421 goodFile = true; 422 file >> state; 423 424 if (state != lastState) 425 { 426 // Trace OCC state changes 427 log<level::INFO>( 428 fmt::format( 429 "Status::readOccState: OCC{} state 0x{:02X} (lastState: 0x{:02X})", 430 instance, state, lastState) 431 .c_str()); 432 lastState = state; 433 #ifdef POWER10 434 if (OccState(state) == OccState::ACTIVE) 435 { 436 if (pmode && device.master()) 437 { 438 // Set the master OCC on the PowerMode object 439 pmode->setMasterOcc(path); 440 // Enable mode changes 441 pmode->setMasterActive(); 442 443 // Special processing by master OCC when it goes active 444 occsWentActive(); 445 } 446 447 CmdStatus status = sendAmbient(); 448 if (status != CmdStatus::SUCCESS) 449 { 450 log<level::ERR>( 451 fmt::format( 452 "readOccState: Sending Ambient failed with status {}", 453 status) 454 .c_str()); 455 } 456 } 457 458 // If OCC in known Good State. 459 if ((OccState(state) == OccState::ACTIVE) || 460 (OccState(state) == OccState::CHARACTERIZATION) || 461 (OccState(state) == OccState::OBSERVATION)) 462 { 463 // Good OCC State then sensors valid again 464 stateValid = true; 465 466 if (safeStateDelayTimer.isEnabled()) 467 { 468 // stop safe delay timer (no longer in SAFE state) 469 safeStateDelayTimer.setEnabled(false); 470 } 471 } 472 // Else not Valid state We would be in SAFE mode. 473 // This captures both SAFE mode, and 0x00, or other invalid 474 // state values. 475 else 476 { 477 if (!safeStateDelayTimer.isEnabled()) 478 { 479 // start safe delay timer (before requesting reset) 480 using namespace std::literals::chrono_literals; 481 safeStateDelayTimer.restartOnce(60s); 482 } 483 // Not valid state, update sensors to Nan & not functional. 484 stateValid = false; 485 } 486 #else 487 // Before P10 state not checked, only used good file open. 488 stateValid = true; 489 #endif 490 } 491 } 492 file.close(); 493 494 // if failed to Read a state or not a valid state -> Attempt retry 495 // after 1 Second delay if allowed. 496 if ((!goodFile) || (!stateValid)) 497 { 498 if (!goodFile) 499 { 500 // If not able to read, OCC may be offline 501 log<level::ERR>( 502 fmt::format("Status::readOccState: open failed (errno={})", 503 openErrno) 504 .c_str()); 505 } 506 else 507 { 508 // else this failed due to state not valid. 509 if (state != lastState) 510 { 511 log<level::ERR>( 512 fmt::format( 513 "Status::readOccState: OCC{} Invalid state 0x{:02X} (last state: 0x{:02X})", 514 instance, state, lastState) 515 .c_str()); 516 } 517 } 518 519 #ifdef READ_OCC_SENSORS 520 manager.setSensorValueToNaN(instance); 521 #endif 522 523 // See occReadRetries for number of retry attempts. 524 if (currentOccReadRetriesCount > 0) 525 { 526 --currentOccReadRetriesCount; 527 #ifdef POWER10 528 using namespace std::chrono_literals; 529 occReadStateFailTimer.restartOnce(1s); 530 #endif 531 } 532 else 533 { 534 #ifdef POWER10 535 if (!stateValid && occActive()) 536 { 537 if (!safeStateDelayTimer.isEnabled()) 538 { 539 log<level::ERR>( 540 "Starting 60 sec delay timer before requesting a reset"); 541 // start safe delay timer (before requesting reset) 542 using namespace std::literals::chrono_literals; 543 safeStateDelayTimer.restartOnce(60s); 544 } 545 } 546 #else 547 // State could not be determined, set it to NO State. 548 lastState = 0; 549 550 // Disable the ability to send Failed actions until OCC is 551 // Active again. 552 stateValid = false; 553 554 // Disable and reset to try recovering 555 deviceError(); 556 #endif 557 } 558 } 559 } 560 561 } // namespace occ 562 } // namespace open_power 563