1 #include "occ_status.hpp" 2 3 #include "occ_manager.hpp" 4 #include "occ_sensor.hpp" 5 #include "powermode.hpp" 6 #include "utils.hpp" 7 8 #include <fmt/core.h> 9 10 #include <phosphor-logging/log.hpp> 11 12 #include <filesystem> 13 14 namespace open_power 15 { 16 namespace occ 17 { 18 19 using namespace phosphor::logging; 20 21 // Handles updates to occActive property 22 bool Status::occActive(bool value) 23 { 24 if (value != this->occActive()) 25 { 26 log<level::INFO>(fmt::format("Status::occActive OCC{} changed to {}", 27 instance, value) 28 .c_str()); 29 if (value) 30 { 31 // Set the device active 32 device.setActive(true); 33 34 // Update the OCC active sensor 35 Base::Status::occActive(value); 36 37 // Start watching for errors 38 addErrorWatch(); 39 40 // Reset last OCC state 41 lastState = 0; 42 43 if (device.master()) 44 { 45 // Update powercap bounds from OCC 46 manager.updatePcapBounds(); 47 } 48 49 // Call into Manager to let know that we have bound 50 if (this->managerCallBack) 51 { 52 this->managerCallBack(instance, value); 53 } 54 } 55 else 56 { 57 #ifdef POWER10 58 if (pmode && device.master()) 59 { 60 // Prevent mode changes 61 pmode->setMasterActive(false); 62 } 63 if (safeStateDelayTimer.isEnabled()) 64 { 65 // stop safe delay timer 66 safeStateDelayTimer.setEnabled(false); 67 } 68 #endif 69 // Call into Manager to let know that we will unbind. 70 if (this->managerCallBack) 71 { 72 this->managerCallBack(instance, value); 73 } 74 75 // Stop watching for errors 76 removeErrorWatch(); 77 78 // Set the device inactive 79 device.setActive(false); 80 } 81 } 82 else if (value && !device.active()) 83 { 84 // Existing error watch is on a dead file descriptor. 85 removeErrorWatch(); 86 87 /* 88 * In it's constructor, Status checks Device::bound() to see if OCC is 89 * active or not. 90 * Device::bound() checks for occX-dev0 directory. 91 * We will lose occX-dev0 directories during FSI rescan. 92 * So, if we start this application (and construct Status), and then 93 * later do FSI rescan, we will end up with occActive = true and device 94 * NOT bound. Lets correct that situation here. 95 */ 96 device.setActive(true); 97 98 // Add error watch again 99 addErrorWatch(); 100 } 101 else if (!value && device.active()) 102 { 103 removeErrorWatch(); 104 105 // In the event that the application never receives the active signal 106 // even though the OCC is active (this can occur if the BMC is rebooted 107 // with the host on, since the initial OCC driver probe will discover 108 // the OCCs), this application needs to be able to unbind the device 109 // when we get the OCC inactive signal. 110 device.setActive(false); 111 } 112 return Base::Status::occActive(value); 113 } 114 115 // Callback handler when a device error is reported. 116 void Status::deviceError(Error::Descriptor d) 117 { 118 #ifdef POWER10 119 if (pmode && device.master()) 120 { 121 // Prevent mode changes 122 pmode->setMasterActive(false); 123 } 124 #endif 125 126 if (d.log) 127 { 128 FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout); 129 } 130 131 // This would deem OCC inactive 132 this->occActive(false); 133 134 // Reset the OCC 135 this->resetOCC(); 136 } 137 138 // Sends message to host control command handler to reset OCC 139 void Status::resetOCC() 140 { 141 log<level::INFO>( 142 fmt::format(">>Status::resetOCC() - requesting reset for OCC{}", 143 instance) 144 .c_str()); 145 #ifdef PLDM 146 if (resetCallBack) 147 { 148 this->resetCallBack(instance); 149 } 150 #else 151 constexpr auto CONTROL_HOST_PATH = "/org/open_power/control/host0"; 152 constexpr auto CONTROL_HOST_INTF = "org.open_power.Control.Host"; 153 154 // This will throw exception on failure 155 auto service = utils::getService(CONTROL_HOST_PATH, CONTROL_HOST_INTF); 156 157 auto& bus = utils::getBus(); 158 auto method = bus.new_method_call(service.c_str(), CONTROL_HOST_PATH, 159 CONTROL_HOST_INTF, "Execute"); 160 // OCC Reset control command 161 method.append(convertForMessage(Control::Host::Command::OCCReset).c_str()); 162 163 // OCC Sensor ID for callout reasons 164 method.append(std::variant<uint8_t>(std::get<0>(sensorMap.at(instance)))); 165 bus.call_noreply(method); 166 return; 167 #endif 168 } 169 170 // Handler called by Host control command handler to convey the 171 // status of the executed command 172 void Status::hostControlEvent(sdbusplus::message_t& msg) 173 { 174 std::string cmdCompleted{}; 175 std::string cmdStatus{}; 176 177 msg.read(cmdCompleted, cmdStatus); 178 179 log<level::DEBUG>("Host control signal values", 180 entry("COMMAND=%s", cmdCompleted.c_str()), 181 entry("STATUS=%s", cmdStatus.c_str())); 182 183 if (Control::Host::convertResultFromString(cmdStatus) != 184 Control::Host::Result::Success) 185 { 186 if (Control::Host::convertCommandFromString(cmdCompleted) == 187 Control::Host::Command::OCCReset) 188 { 189 // Must be a Timeout. Log an Error trace 190 log<level::ERR>( 191 "Error resetting the OCC.", entry("PATH=%s", path.c_str()), 192 entry("SENSORID=0x%X", std::get<0>(sensorMap.at(instance)))); 193 } 194 } 195 return; 196 } 197 198 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC. 199 void Status::readOccState() 200 { 201 currentOccReadRetriesCount = occReadRetries; 202 occReadStateNow(); 203 } 204 205 #ifdef POWER10 206 // Special processing that needs to happen once the OCCs change to ACTIVE state 207 void Status::occsWentActive() 208 { 209 CmdStatus status = CmdStatus::SUCCESS; 210 211 status = pmode->sendModeChange(); 212 if (status != CmdStatus::SUCCESS) 213 { 214 log<level::ERR>( 215 fmt::format( 216 "Status::occsWentActive: OCC mode change failed with status {}", 217 status) 218 .c_str()); 219 220 // Disable and reset to try recovering 221 deviceError(); 222 } 223 224 status = pmode->sendIpsData(); 225 if (status != CmdStatus::SUCCESS) 226 { 227 log<level::ERR>( 228 fmt::format( 229 "Status::occsWentActive: Sending Idle Power Save Config data failed with status {}", 230 status) 231 .c_str()); 232 233 if (status == CmdStatus::COMM_FAILURE) 234 { 235 // Disable and reset to try recovering 236 deviceError(); 237 } 238 } 239 } 240 241 // Send Ambient and Altitude to the OCC 242 CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude) 243 { 244 CmdStatus status = CmdStatus::FAILURE; 245 bool ambientValid = true; 246 uint8_t ambientTemp = inTemp; 247 uint16_t altitude = inAltitude; 248 249 if (ambientTemp == 0xFF) 250 { 251 // Get latest readings from manager 252 manager.getAmbientData(ambientValid, ambientTemp, altitude); 253 log<level::DEBUG>( 254 fmt::format("sendAmbient: valid: {}, Ambient: {}C, altitude: {}m", 255 ambientValid, ambientTemp, altitude) 256 .c_str()); 257 } 258 259 std::vector<std::uint8_t> cmd, rsp; 260 cmd.reserve(11); 261 cmd.push_back(uint8_t(CmdType::SEND_AMBIENT)); 262 cmd.push_back(0x00); // Data Length (2 bytes) 263 cmd.push_back(0x08); // 264 cmd.push_back(0x00); // Version 265 cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status 266 cmd.push_back(ambientTemp); // Ambient Temperature 267 cmd.push_back(altitude >> 8); // Altitude in meters (2 bytes) 268 cmd.push_back(altitude & 0xFF); // 269 cmd.push_back(0x00); // Reserved (3 bytes) 270 cmd.push_back(0x00); 271 cmd.push_back(0x00); 272 log<level::DEBUG>(fmt::format("sendAmbient: SEND_AMBIENT " 273 "command to OCC{} ({} bytes)", 274 instance, cmd.size()) 275 .c_str()); 276 status = occCmd.send(cmd, rsp); 277 if (status == CmdStatus::SUCCESS) 278 { 279 if (rsp.size() == 5) 280 { 281 if (RspStatus::SUCCESS != RspStatus(rsp[2])) 282 { 283 log<level::ERR>( 284 fmt::format( 285 "sendAmbient: SEND_AMBIENT failed with rspStatus 0x{:02X}", 286 rsp[2]) 287 .c_str()); 288 dump_hex(rsp); 289 status = CmdStatus::FAILURE; 290 } 291 } 292 else 293 { 294 log<level::ERR>( 295 fmt::format( 296 "sendAmbient: INVALID SEND_AMBIENT response length:{}", 297 rsp.size()) 298 .c_str()); 299 dump_hex(rsp); 300 status = CmdStatus::FAILURE; 301 } 302 } 303 else 304 { 305 log<level::ERR>( 306 fmt::format( 307 "sendAmbient: SEND_AMBIENT FAILED! with status 0x{:02X}", 308 status) 309 .c_str()); 310 311 if (status == CmdStatus::COMM_FAILURE) 312 { 313 // Disable and reset to try recovering 314 deviceError(); 315 } 316 } 317 318 return status; 319 } 320 321 // Called when safe timer expires to determine if OCCs need to be reset 322 void Status::safeStateDelayExpired() 323 { 324 if (this->occActive()) 325 { 326 log<level::INFO>( 327 fmt::format( 328 "safeStateDelayExpired: OCC{} is in SAFE state, requesting reset", 329 instance) 330 .c_str()); 331 // Disable and reset to try recovering 332 deviceError(Error::Descriptor(SAFE_ERROR_PATH)); 333 } 334 } 335 #endif // POWER10 336 337 fs::path Status::getHwmonPath() 338 { 339 using namespace std::literals::string_literals; 340 341 if (!fs::exists(hwmonPath)) 342 { 343 static bool tracedFail[8] = {0}; 344 345 if (!hwmonPath.empty()) 346 { 347 log<level::ERR>( 348 fmt::format("Status::getHwmonPath(): path no longer exists: {}", 349 hwmonPath.c_str()) 350 .c_str()); 351 hwmonPath.clear(); 352 } 353 354 // Build the base HWMON path 355 fs::path prefixPath = fs::path{OCC_HWMON_PATH + "occ-hwmon."s + 356 std::to_string(instance + 1) + 357 "/hwmon/"s}; 358 359 // Get the hwmonXX directory name 360 try 361 { 362 // there should only be one directory 363 const int numDirs = std::distance( 364 fs::directory_iterator(prefixPath), fs::directory_iterator{}); 365 if (numDirs == 1) 366 { 367 hwmonPath = *fs::directory_iterator(prefixPath); 368 tracedFail[instance] = false; 369 } 370 else 371 { 372 if (!tracedFail[instance]) 373 { 374 log<level::ERR>( 375 fmt::format( 376 "Status::getHwmonPath(): Found multiple ({}) hwmon paths!", 377 numDirs) 378 .c_str()); 379 tracedFail[instance] = true; 380 } 381 } 382 } 383 catch (const fs::filesystem_error& e) 384 { 385 if (!tracedFail[instance]) 386 { 387 log<level::ERR>( 388 fmt::format( 389 "Status::getHwmonPath(): error accessing {}: {}", 390 prefixPath.c_str(), e.what()) 391 .c_str()); 392 tracedFail[instance] = true; 393 } 394 } 395 } 396 397 return hwmonPath; 398 } 399 400 // Called to read state and upon failure to read after occReadStateFailTimer. 401 void Status::occReadStateNow() 402 { 403 unsigned int state; 404 const fs::path filename = 405 fs::path(DEV_PATH) / 406 fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state"; 407 408 std::ifstream file; 409 bool goodFile = false; 410 411 // open file. 412 file.open(filename, std::ios::in); 413 const int openErrno = errno; 414 415 // File is open and state can be used. 416 if (file.is_open() && file.good()) 417 { 418 goodFile = true; 419 file >> state; 420 421 if (state != lastState) 422 { 423 // Trace OCC state changes 424 log<level::INFO>( 425 fmt::format( 426 "Status::readOccState: OCC{} state 0x{:02X} (lastState: 0x{:02X})", 427 instance, state, lastState) 428 .c_str()); 429 lastState = state; 430 #ifdef POWER10 431 if (OccState(state) == OccState::ACTIVE) 432 { 433 if (pmode && device.master()) 434 { 435 // Set the master OCC on the PowerMode object 436 pmode->setMasterOcc(path); 437 // Enable mode changes 438 pmode->setMasterActive(); 439 440 // Special processing by master OCC when it goes active 441 occsWentActive(); 442 } 443 444 CmdStatus status = sendAmbient(); 445 if (status != CmdStatus::SUCCESS) 446 { 447 log<level::ERR>( 448 fmt::format( 449 "readOccState: Sending Ambient failed with status {}", 450 status) 451 .c_str()); 452 } 453 } 454 455 // If OCC in known Good State. 456 if ((OccState(state) == OccState::ACTIVE) || 457 (OccState(state) == OccState::CHARACTERIZATION) || 458 (OccState(state) == OccState::OBSERVATION)) 459 { 460 // Good OCC State then sensors valid again 461 stateValid = true; 462 463 if (safeStateDelayTimer.isEnabled()) 464 { 465 // stop safe delay timer (no longer in SAFE state) 466 safeStateDelayTimer.setEnabled(false); 467 } 468 } 469 // Else not Valid state We would be in SAFE mode. 470 // This captures both SAFE mode, and 0x00, or other invalid 471 // state values. 472 else 473 { 474 if (!safeStateDelayTimer.isEnabled()) 475 { 476 // start safe delay timer (before requesting reset) 477 using namespace std::literals::chrono_literals; 478 safeStateDelayTimer.restartOnce(60s); 479 } 480 // Not valid state, update sensors to Nan & not functional. 481 stateValid = false; 482 } 483 #else 484 // Before P10 state not checked, only used good file open. 485 stateValid = true; 486 #endif 487 } 488 } 489 file.close(); 490 491 // if failed to Read a state or not a valid state -> Attempt retry 492 // after 1 Second delay if allowed. 493 if ((!goodFile) || (!stateValid)) 494 { 495 if (!goodFile) 496 { 497 // If not able to read, OCC may be offline 498 log<level::ERR>( 499 fmt::format("Status::readOccState: open failed (errno={})", 500 openErrno) 501 .c_str()); 502 } 503 else 504 { 505 // else this failed due to state not valid. 506 if (state != lastState) 507 { 508 log<level::ERR>( 509 fmt::format( 510 "Status::readOccState: OCC{} Invalid state 0x{:02X} (last state: 0x{:02X})", 511 instance, state, lastState) 512 .c_str()); 513 } 514 } 515 516 #ifdef READ_OCC_SENSORS 517 manager.setSensorValueToNaN(instance); 518 #endif 519 520 // See occReadRetries for number of retry attempts. 521 if (currentOccReadRetriesCount > 0) 522 { 523 --currentOccReadRetriesCount; 524 #ifdef POWER10 525 using namespace std::chrono_literals; 526 occReadStateFailTimer.restartOnce(1s); 527 #endif 528 } 529 else 530 { 531 #ifdef POWER10 532 if (!stateValid && occActive()) 533 { 534 if (!safeStateDelayTimer.isEnabled()) 535 { 536 log<level::ERR>( 537 "Starting 60 sec delay timer before requesting a reset"); 538 // start safe delay timer (before requesting reset) 539 using namespace std::literals::chrono_literals; 540 safeStateDelayTimer.restartOnce(60s); 541 } 542 } 543 #else 544 // State could not be determined, set it to NO State. 545 lastState = 0; 546 547 // Disable the ability to send Failed actions until OCC is 548 // Active again. 549 stateValid = false; 550 551 // Disable and reset to try recovering 552 deviceError(); 553 #endif 554 } 555 } 556 } 557 558 } // namespace occ 559 } // namespace open_power 560